From ac7efa2085ed5d94ce8541aab309ab10e49ff0f0 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Tue, 24 Mar 2026 05:22:33 +0100 Subject: [PATCH 1/5] index: test: secondary index target option serialization test Target option serialization must remain stable for backward compatibility. The index is restored from this property on startup, so unintentional changes to the serialization schema can break indexes after upgrade. --- docs/dev/secondary_index.md | 13 ++++- test/cqlpy/test_secondary_index.py | 77 ++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/docs/dev/secondary_index.md b/docs/dev/secondary_index.md index 972d4a3364..b7d9ca1ca1 100644 --- a/docs/dev/secondary_index.md +++ b/docs/dev/secondary_index.md @@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index - index on map, set or list values: VALUES(v) - index on map entries: ENTRIES(v) -Their serialization is just string representation, so: -"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets. +Their serialization uses lowercase type names as prefixes, except for `full` which is serialized +as just the column name (without any prefix): +`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection +index on column `v` is stored simply as `"v"` (same as a regular index). + +If the column name contains characters that could be confused with the above formats +(e.g., a name containing parentheses or braces), it is escaped using the CQL +quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the +name in double quotes and doubles any embedded double-quote characters. For example, +a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is +stored as `"keys(m)"`. ## Local index diff --git a/test/cqlpy/test_secondary_index.py b/test/cqlpy/test_secondary_index.py index a4f8990e24..09437209fc 100644 --- a/test/cqlpy/test_secondary_index.py +++ b/test/cqlpy/test_secondary_index.py @@ -4,6 +4,7 @@ # Tests for secondary indexes +import json import random import itertools import time @@ -2031,6 +2032,82 @@ def test_index_in_system_schema_indexes(cql, built_index): assert res[0].kind == 'COMPOSITES' assert res[0].options == {'target': 'v'} +# Test that the "target" option in system_schema.indexes is serialized +# correctly for secondary indexes on collection columns. +# This format is critical for backward compatibility, as it's read from +# disk on startup to rebuild indexes. An incompatible change would prevent +# existing indexes from being recreated after an upgrade. +def test_global_collection_index_target_serialization(cql, test_keyspace): + schema = "p int PRIMARY KEY, m map, fl frozen>" + with new_test_table(cql, test_keyspace, schema) as table: + # Scylla normalizes full(col) targets to just the column name; + # Cassandra keeps the full(col) prefix. + full_target = "fl" if is_scylla(cql) else "full(fl)" + cases = [ + ("keys(m)", "keys(m)"), + ("values(m)", "values(m)"), + ("entries(m)", "entries(m)"), + ("full(fl)", full_target), + ] + for index_expr, expected_target in cases: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}({index_expr})") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].kind == 'COMPOSITES' + assert res[0].options == {'target': expected_target}, \ + f"For index expression '{index_expr}': expected target '{expected_target}', got '{res[0].options}'" + +# Test that the "target" option in system_schema.indexes is serialized +# correctly when the indexed column name contains special characters +# (e.g., upper-case, spaces, braces, or keywords like "keys(m)"). +# The encoding uses the CQL quoted-identifier form, so e.g. column "hEllo" +# is stored as '"hEllo"'. An incompatible change here would break index +# lookup after an upgrade. +def test_global_index_target_serialization_quoted_names(cql, test_keyspace): + # Column names requiring quoting in CQL (mixed-case, space, characters + # that would otherwise be confused with target-format prefixes or JSON). + quoted_names = ['"hEllo"', '"x y"', '"keys(m)"'] + schema = 'p int PRIMARY KEY, ' + ', '.join(name + " int" for name in quoted_names) + with new_test_table(cql, test_keyspace, schema) as table: + for name in quoted_names: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}({name})") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].kind == 'COMPOSITES' + # The target is the CQL representation of the column name, + # i.e., quoted exactly as provided in the CREATE INDEX statement. + assert res[0].options == {'target': name}, \ + f"For column {name}: got target '{res[0].options}'" + +# Test that the "target" option in system_schema.indexes is serialized +# correctly for local secondary indexes. This format is critical for +# backward compatibility, as it's read from disk on startup to rebuild +# indexes. An incompatible change would prevent existing local indexes +# from being recreated after an upgrade. +def test_local_index_target_serialization(cql, test_keyspace, scylla_only): + schema = "a int, b int, c int, v int, PRIMARY KEY ((a, b), c)" + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}((a, b), v)") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + # The target for a local secondary index is stored as JSON. + assert json.loads(res[0].options['target']) == {"pk": ["a", "b"], "ck": ["v"]} + # Test index representation in REST API def test_index_in_API(cql, test_keyspace): with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, v int") as table: From 1cb9d0b245f917dc8de4f680c3213359f8763fc8 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Mon, 23 Mar 2026 17:36:44 +0100 Subject: [PATCH 2/5] index: test: vector index target option serialization test This test ensures that the serialization format for vector index target options remains stable. Maintaining backward compatibility is critical because the index is restored from this property on startup. Any unintended changes to the serialization schema could break existing indexes after an upgrade. This option is also an interface for the vector-store service, which uses it to identify the indexed column. --- test/cqlpy/test_vector_index.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 491b7a23b3..0015f30602 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -264,6 +264,25 @@ def test_vector_index_version_fail_given_as_option(cql, test_keyspace, scylla_on with pytest.raises(InvalidRequest, match="Cannot specify index_version as a CUSTOM option"): cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index' WITH OPTIONS = {{'index_version': '18ad2003-05ea-17d9-1855-0325ac0a755d'}}") +# Test that the "target" option in system_schema.indexes is serialized +# correctly for a vector index on a single vector column. This format is +# critical for backward compatibility, as it's read from disk on startup +# to rebuild indexes. An incompatible change would prevent existing vector +# indexes from being recreated after an upgrade. +# This is also an interface with the vector-store service, which relies on the "target" +# option to identify the target column. +def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].options['target'] == 'v' + def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets): schema = "p int primary key, v vector" if is_scylla(cql): From bcd320a82a423bfb69faed6bee255b6615138982 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Tue, 24 Mar 2026 05:40:04 +0100 Subject: [PATCH 3/5] vector_search: fix SELECT on local vector index Queries against local vector indexes were failing with the error: "ANN ordering by vector requires the column to be indexed using 'vector_index'" This was a regression introduced by 15788c3734, which incorrectly assumed the first column in the targets list is always the vector column. For local vector indexes, the first column is the partition key, causing the failure. Previously, serialization logic for the target index option was shared between vector and secondary indexes. This is no longer viable due to the introduction of local vector indexes and vector indexes with filtering columns, which have different target format. This commit introduces a dedicated JSON-based serialization format for vector index targets, identifying the target column (tc), filtering columns (fc), and partition key columns (pk). This ensures unambiguous serialization and deserialization for all vector index types. This change is backward compatible for regular vector indexes. However, it breaks compatibility for local vector indexes and vector indexes with filtering columns created in version 2026.1.0. To mitigate this, usage of these specific index types will be blocked in the 2026.1.0 release by failing ANN queries against them in vector-store service. Fixes: SCYLLADB-895 --- configure.py | 1 + cql3/statements/create_index_statement.cc | 17 +- docs/dev/vector_index.md | 10 + index/vector_index.cc | 80 +++++++- index/vector_index.hh | 3 + test/boost/CMakeLists.txt | 1 + test/boost/vector_index_test.cc | 189 ++++++++++++++++++ test/cqlpy/test_vector_index.py | 40 ++++ .../vector_search/vector_store_client_test.cc | 23 +++ 9 files changed, 353 insertions(+), 11 deletions(-) create mode 100644 docs/dev/vector_index.md create mode 100644 test/boost/vector_index_test.cc diff --git a/configure.py b/configure.py index 90537153d5..897733b97f 100755 --- a/configure.py +++ b/configure.py @@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [ 'test/boost/tracing_test.cc', 'test/boost/user_function_test.cc', 'test/boost/user_types_test.cc', + 'test/boost/vector_index_test.cc', 'test/boost/view_build_test.cc', 'test/boost/view_complex_test.cc', 'test/boost/view_schema_ckey_test.cc', diff --git a/cql3/statements/create_index_statement.cc b/cql3/statements/create_index_statement.cc index acf2407da6..e2a113dedd 100644 --- a/cql3/statements/create_index_statement.cc +++ b/cql3/statements/create_index_statement.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0) */ +#include #include #include "create_index_statement.hh" #include "db/config.hh" @@ -37,6 +38,7 @@ #include "types/concrete_types.hh" #include "db/tags/extension.hh" #include "tombstone_gc_extension.hh" +#include "index/secondary_index.hh" #include @@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target } } +static bool is_vector_capable_class(const sstring& class_name) { + return boost::iequals(class_name, "vector_index"); +} + +static bool is_vector_index(const index_options_map& options) { + auto class_it = options.find(db::index::secondary_index::custom_class_option_name); + return class_it != options.end() && is_vector_capable_class(class_it->second); +} + view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im, const data_dictionary::database& db) const { @@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat _idx_properties->validate(); // FIXME: This is ugly and can be improved. - const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index"; + const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class); const bool uses_view_properties = _view_properties.properties()->count() > 0 || _view_properties.use_compact_storage() || _view_properties.defined_ordering().size() > 0; @@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s const index_options_map& options) { index_options_map new_options = options; - auto target_option = secondary_index::target_parser::serialize_targets(targets); + auto target_option = is_vector_index(options) + ? secondary_index::vector_index::serialize_targets(targets) + : secondary_index::target_parser::serialize_targets(targets); new_options.emplace(index_target::target_option_name, target_option); const auto& first_target = targets.front()->value; diff --git a/docs/dev/vector_index.md b/docs/dev/vector_index.md new file mode 100644 index 0000000000..ba615d74f1 --- /dev/null +++ b/docs/dev/vector_index.md @@ -0,0 +1,10 @@ +# Vector index in Scylla + +Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format: + +- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v` +- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}` +- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}` +- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}` + +The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured. diff --git a/index/vector_index.cc b/index/vector_index.cc index 890e437152..c9d30e4b3f 100644 --- a/index/vector_index.cc +++ b/index/vector_index.cc @@ -105,17 +105,79 @@ const static std::unordered_map json_value = rjson::try_parse(targets); - if (!json_value || !json_value->IsObject()) { - return target_parser::get_target_column_name_from_string(targets); +static constexpr auto TC_TARGET_KEY = "tc"; +static constexpr auto PK_TARGET_KEY = "pk"; +static constexpr auto FC_TARGET_KEY = "fc"; + +// Serialize vector index targets into a format using: +// "tc" for the target (vector) column, +// "pk" for partition key columns (local index), +// "fc" for filtering columns. +// For a simple single-column vector index, returns just the column name. +// Examples: +// (v) -> "v" +// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]} +// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]} +// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]} +sstring vector_index::serialize_targets(const std::vector<::shared_ptr>& targets) { + using cql3::statements::index_target; + + if (targets.size() == 0) { + throw exceptions::invalid_request_exception("Vector index must have at least one target column"); } - rjson::value* pk = rjson::find(*json_value, "pk"); - if (pk && pk->IsArray() && !pk->Empty()) { - return sstring(rjson::to_string_view(pk->GetArray()[0])); + if (targets.size() == 1) { + auto tc = targets[0]->value; + if (!std::holds_alternative(tc)) { + throw exceptions::invalid_request_exception("Missing vector column target for local vector index"); + } + return index_target::escape_target_column(*std::get(tc)); } - return target_parser::get_target_column_name_from_string(targets); + + const bool has_pk = std::holds_alternative(targets.front()->value); + const size_t tc_idx = has_pk ? 1 : 0; + const size_t fc_count = targets.size() - tc_idx - 1; + + if (!std::holds_alternative(targets[tc_idx]->value)) { + throw exceptions::invalid_request_exception("Vector index target column must be a single column"); + } + + rjson::value json_map = rjson::empty_object(); + rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get(targets[tc_idx]->value)->text())); + + if (has_pk) { + rjson::value pk_json = rjson::empty_array(); + for (const auto& col : std::get(targets.front()->value)) { + rjson::push_back(pk_json, rjson::from_string(col->text())); + } + rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json)); + } + + if (fc_count > 0) { + rjson::value fc_json = rjson::empty_array(); + for (size_t i = tc_idx + 1; i < targets.size(); ++i) { + if (!std::holds_alternative(targets[i]->value)) { + throw exceptions::invalid_request_exception("Vector index filtering column must be a single column"); + } + rjson::push_back(fc_json, rjson::from_string(std::get(targets[i]->value)->text())); + } + rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json)); + } + + return rjson::print(json_map); +} + +sstring vector_index::get_target_column(const sstring& targets) { + std::optional json_value = rjson::try_parse(targets); + if (!json_value || !json_value->IsObject()) { + return cql3::statements::index_target::column_name_from_target_string(targets); + } + + rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + return sstring(rjson::to_string_view(*tc)); + } + return cql3::statements::index_target::column_name_from_target_string(targets); } bool vector_index::is_rescoring_enabled(const index_options_map& properties) { @@ -346,7 +408,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst auto target_it = im.options().find(cql3_parser::index_target::target_option_name); if (class_it != im.options().end() && target_it != im.options().end()) { auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second); - return custom_class && dynamic_cast((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name; + return custom_class && dynamic_cast((*custom_class)().get()) && get_target_column(target_it->second) == target_name; } return false; } diff --git a/index/vector_index.hh b/index/vector_index.hh index 5756c02ef1..7aedb3a1cf 100644 --- a/index/vector_index.hh +++ b/index/vector_index.hh @@ -37,6 +37,9 @@ public: static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name); static void check_cdc_options(const schema& schema); + static sstring serialize_targets(const std::vector<::shared_ptr>& targets); + static sstring get_target_column(const sstring& targets); + static bool is_rescoring_enabled(const index_options_map& properties); static float get_oversampling(const index_options_map& properties); static sstring get_cql_similarity_function_name(const index_options_map& properties); diff --git a/test/boost/CMakeLists.txt b/test/boost/CMakeLists.txt index 4d7e137f9a..452754883e 100644 --- a/test/boost/CMakeLists.txt +++ b/test/boost/CMakeLists.txt @@ -378,6 +378,7 @@ add_scylla_test(combined_tests tracing_test.cc user_function_test.cc user_types_test.cc + vector_index_test.cc view_build_test.cc view_complex_test.cc view_schema_ckey_test.cc diff --git a/test/boost/vector_index_test.cc b/test/boost/vector_index_test.cc new file mode 100644 index 0000000000..bb3cfb3c3f --- /dev/null +++ b/test/boost/vector_index_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include +#include "cql3/column_identifier.hh" +#include "cql3/statements/index_target.hh" +#include "exceptions/exceptions.hh" +#include "index/vector_index.hh" +#include "utils/rjson.hh" + +using namespace secondary_index; +using namespace cql3; + +using statements::index_target; + +BOOST_AUTO_TEST_SUITE(vector_index_test) + +namespace { + +::shared_ptr make_single(const sstring& name) { + auto col = ::make_shared(name, true); + return ::make_shared(col, index_target::target_type::regular_values); +} + +::shared_ptr make_multi(const std::vector& names) { + std::vector<::shared_ptr> cols; + cols.reserve(names.size()); + for (const auto& n : names) { + cols.push_back(::make_shared(n, true)); + } + return ::make_shared(std::move(cols), index_target::target_type::regular_values); +} + +} // namespace + +BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) { + std::vector<::shared_ptr> targets; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1", "p2"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + + +BOOST_AUTO_TEST_CASE(serialize_targets_single_column) { + std::vector<::shared_ptr> targets = { + make_single("v"), + }; + BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + BOOST_REQUIRE(!rjson::find(json, "pk")); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + BOOST_REQUIRE(!rjson::find(json, "fc")); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1"}), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) { + std::vector<::shared_ptr> targets = {make_single("v"), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("MyCol")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"MyCol\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("a\"b")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) { + std::vector<::shared_ptr> targets = { + make_single("MyCol"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) { + std::vector<::shared_ptr> targets = { + make_single("a\"b"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 0015f30602..2de0ab9804 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -7,6 +7,7 @@ ############################################################################### import pytest +import json from .util import new_test_table, is_scylla, unique_name from cassandra.protocol import InvalidRequest, ConfigurationException @@ -283,6 +284,45 @@ def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip assert len(res) == 1 assert res[0].options['target'] == 'v' +# Test "target" option serialization for vector index with filtering columns. +def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]} + +# Test "target" option serialization for local vector index. +def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]} + +# Test "target" option serialization for local vector index with filtering columns. +def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]} + def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets): schema = "p int primary key, v vector" if is_scylla(cql): diff --git a/test/vector_search/vector_store_client_test.cc b/test/vector_search/vector_store_client_test.cc index f8c10f4f28..1a494ecf82 100644 --- a/test/vector_search/vector_store_client_test.cc +++ b/test/vector_search/vector_store_client_test.cc @@ -1277,3 +1277,26 @@ SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_col co_await server->stop(); })); } + +SEASTAR_TEST_CASE(vector_store_client_local_vector_index) { + auto server = co_await make_vs_mock_server(); + + auto cfg = make_config(); + cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); + co_await do_with_cql_env( + [&](cql_test_env& env) -> future<> { + auto schema = co_await create_test_table(env, "ks", "test"); + auto& vs = env.local_qp().vector_store_client(); + configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); + vs.start_background_tasks(); + // Create a local vector index on the 'embedding' column. + auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); + + BOOST_CHECK_NO_THROW( + co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }, + cfg) + .finally(seastar::coroutine::lambda([&] -> future<> { + co_await server->stop(); + })); +} From dae73f47813f2dea113e6eff956a04590f189d51 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Wed, 11 Mar 2026 20:26:23 +0100 Subject: [PATCH 4/5] vector_search: test: refactor boilerplate setup The test boilerplate setup for some vector store client tests has been extracted to a common function. --- .../vector_search/vector_store_client_test.cc | 102 +++++++----------- 1 file changed, 41 insertions(+), 61 deletions(-) diff --git a/test/vector_search/vector_store_client_test.cc b/test/vector_search/vector_store_client_test.cc index 1a494ecf82..83e21f43fe 100644 --- a/test/vector_search/vector_store_client_test.cc +++ b/test/vector_search/vector_store_client_test.cc @@ -83,6 +83,25 @@ timeout_config make_query_timeout(std::chrono::seconds timeout) { return cfg; } +future<> do_with_vector_store_mock(std::function(cql_test_env&, vs_mock_server&)> func) { + auto server = co_await make_vs_mock_server(); + + auto cfg = make_config(); + cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); + co_await do_with_cql_env( + [&](cql_test_env& env) -> future<> { + co_await create_test_table(env, "ks", "test"); + auto& vs = env.local_qp().vector_store_client(); + configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); + vs.start_background_tasks(); + co_await func(env, *server); + }, + cfg) + .finally(seastar::coroutine::lambda([&] -> future<> { + co_await server->stop(); + })); +} + } // namespace BOOST_AUTO_TEST_CASE(vector_store_client_test_ctor) { @@ -1222,32 +1241,19 @@ SEASTAR_TEST_CASE(vector_store_client_abort_due_to_query_timeout) { /// Verify that the HTTP error description from the vector store is propagated /// through the CQL interface as part of the invalid_request_exception message. SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) { - auto server = co_await make_vs_mock_server(); + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server& server) -> future<> { + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'"); - auto cfg = make_config(); - cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - auto schema = co_await create_test_table(env, "ks", "test"); - auto& vs = env.local_qp().vector_store_client(); - configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); - vs.start_background_tasks(); - co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'"); + // Configure mock to return 404 with a specific error message + server.next_ann_response({status_type::not_found, "index does not exist"}); - // Configure mock to return 404 with a specific error message - server->next_ann_response({status_type::not_found, "index does not exist"}); - - BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"), - exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) { - auto msg = std::string(ex.what()); - // Verify the error message contains both the HTTP status and the error description - return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos; - }); - }, - cfg) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); + BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"), + exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) { + auto msg = std::string(ex.what()); + // Verify the error message contains both the HTTP status and the error description + return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos; + }); + }); } // Create a vector index with an additional filtering column. @@ -1257,46 +1263,20 @@ SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) // ANN ordering by vector requires the column to be indexed using 'vector_index'. // Reproduces SCYLLADB-635. SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_column) { - auto server = co_await make_vs_mock_server(); + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> { + // Create a vector index on the embedding column, including ck1 for filtered ANN search support. + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'"); - auto cfg = make_config(); - cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - auto schema = co_await create_test_table(env, "ks", "test"); - auto& vs = env.local_qp().vector_store_client(); - configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); - vs.start_background_tasks(); - // Create a vector index on the embedding column, including ck1 for filtered ANN search support. - auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'"); - - BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); - }, - cfg) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); + BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }); } SEASTAR_TEST_CASE(vector_store_client_local_vector_index) { - auto server = co_await make_vs_mock_server(); + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> { + // Create a local vector index on the 'embedding' column. + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); - auto cfg = make_config(); - cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - auto schema = co_await create_test_table(env, "ks", "test"); - auto& vs = env.local_qp().vector_store_client(); - configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); - vs.start_background_tasks(); - // Create a local vector index on the 'embedding' column. - auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); - - BOOST_CHECK_NO_THROW( - co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); - }, - cfg) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); + BOOST_CHECK_NO_THROW( + co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }); } From ec46a8a7d390cb3220808a74baa71471dc281e69 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Tue, 24 Mar 2026 09:04:04 +0100 Subject: [PATCH 5/5] index: fix DESC INDEX for vector index The `DESC INDEX` command returned incorrect results for local vector indexes and for vector indexes that included filtering columns. This patch corrects the implementation to ensure `DESCRIBE INDEX` accurately reflects the index configuration. This was a pre-existing issue, not a regression from recent serialization schema changes for vector index target options. --- index/vector_index.cc | 49 ++++++++++++++++++++++++++++++--- test/cqlpy/test_vector_index.py | 30 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/index/vector_index.cc b/index/vector_index.cc index c9d30e4b3f..f23955f884 100644 --- a/index/vector_index.cc +++ b/index/vector_index.cc @@ -20,6 +20,7 @@ #include "types/concrete_types.hh" #include "types/types.hh" #include "utils/managed_string.hh" +#include #include #include @@ -103,12 +104,53 @@ const static std::unordered_map). +// +// JSON examples: +// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2" +// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v" +// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1" +static sstring targets_to_cql(const sstring& targets) { + std::optional json_value = rjson::try_parse(targets); + if (!json_value || !json_value->IsObject()) { + return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets)); + } + + sstring result; + + const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY); + if (pk && pk->IsArray() && !pk->Empty()) { + result += "("; + auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) { + return cql3::util::maybe_quote(sstring(rjson::to_string_view(col))); + }) | std::ranges::to>(); + result += boost::algorithm::join(pk_cols, ", "); + result += "), "; + } + + const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc))); + } + + const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY); + if (fc && fc->IsArray()) { + for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) { + result += ", "; + result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i]))); + } + } + + return result; +} + // Serialize vector index targets into a format using: // "tc" for the target (vector) column, // "pk" for partition key columns (local index), @@ -209,9 +251,8 @@ bool vector_index::view_should_exist() const { std::optional vector_index::describe(const index_metadata& im, const schema& base_schema) const { fragmented_ostringstream os; - os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " - << cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name()) - << "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")" + os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "." + << cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")" << " USING 'vector_index'"; return cql3::description{ diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 2de0ab9804..ba0219cf05 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -200,6 +200,36 @@ def test_describe_custom_index(cql, test_keyspace, skip_without_tablets): assert f"CREATE CUSTOM INDEX custom ON {table}{maybe_space}(v1) USING '{custom_class}'" in a_desc assert f"CREATE CUSTOM INDEX custom1 ON {table}{maybe_space}(v2) USING '{custom_class}'" in b_desc +def test_describe_vector_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'" in desc + +def test_describe_vector_index_local(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c int, v vector, PRIMARY KEY ((p1, p2), c)' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'" in desc + +def test_describe_vector_index_local_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c)' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'" in desc + def test_vector_index_version_on_recreate(cql, test_keyspace, scylla_only, skip_without_tablets): schema = 'p int primary key, v vector'