From 6bc88e817f012331b463c47decf1320c8d4e7486 Mon Sep 17 00:00:00 2001 From: Karol Nowacki Date: Tue, 24 Mar 2026 05:40:04 +0100 Subject: [PATCH] vector_search: fix SELECT on local vector index Queries against local vector indexes were failing with the error: "ANN ordering by vector requires the column to be indexed using 'vector_index'" This was a regression introduced by 15788c3734, which incorrectly assumed the first column in the targets list is always the vector column. For local vector indexes, the first column is the partition key, causing the failure. Previously, serialization logic for the target index option was shared between vector and secondary indexes. This is no longer viable due to the introduction of local vector indexes and vector indexes with filtering columns, which have different target format. This commit introduces a dedicated JSON-based serialization format for vector index targets, identifying the target column (tc), filtering columns (fc), and partition key columns (pk). This ensures unambiguous serialization and deserialization for all vector index types. This change is backward compatible for regular vector indexes. However, it breaks compatibility for local vector indexes and vector indexes with filtering columns created in version 2026.1.0. To mitigate this, usage of these specific index types will be blocked in the 2026.1.0 release by failing ANN queries against them in vector-store service. Fixes: SCYLLADB-895 --- configure.py | 1 + cql3/statements/create_index_statement.cc | 17 +- docs/dev/vector_index.md | 10 + index/vector_index.cc | 80 +++++++- index/vector_index.hh | 3 + test/boost/CMakeLists.txt | 1 + test/boost/vector_index_test.cc | 189 ++++++++++++++++++ test/cqlpy/test_vector_index.py | 40 ++++ .../vector_search/vector_store_client_test.cc | 23 +++ 9 files changed, 353 insertions(+), 11 deletions(-) create mode 100644 docs/dev/vector_index.md create mode 100644 test/boost/vector_index_test.cc diff --git a/configure.py b/configure.py index 0d5acfe4f9..02a59ae124 100755 --- a/configure.py +++ b/configure.py @@ -1714,6 +1714,7 @@ deps['test/boost/combined_tests'] += [ 'test/boost/tracing_test.cc', 'test/boost/user_function_test.cc', 'test/boost/user_types_test.cc', + 'test/boost/vector_index_test.cc', 'test/boost/view_build_test.cc', 'test/boost/view_complex_test.cc', 'test/boost/view_schema_ckey_test.cc', diff --git a/cql3/statements/create_index_statement.cc b/cql3/statements/create_index_statement.cc index acf2407da6..e2a113dedd 100644 --- a/cql3/statements/create_index_statement.cc +++ b/cql3/statements/create_index_statement.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0) */ +#include #include #include "create_index_statement.hh" #include "db/config.hh" @@ -37,6 +38,7 @@ #include "types/concrete_types.hh" #include "db/tags/extension.hh" #include "tombstone_gc_extension.hh" +#include "index/secondary_index.hh" #include @@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target } } +static bool is_vector_capable_class(const sstring& class_name) { + return boost::iequals(class_name, "vector_index"); +} + +static bool is_vector_index(const index_options_map& options) { + auto class_it = options.find(db::index::secondary_index::custom_class_option_name); + return class_it != options.end() && is_vector_capable_class(class_it->second); +} + view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im, const data_dictionary::database& db) const { @@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat _idx_properties->validate(); // FIXME: This is ugly and can be improved. - const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index"; + const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class); const bool uses_view_properties = _view_properties.properties()->count() > 0 || _view_properties.use_compact_storage() || _view_properties.defined_ordering().size() > 0; @@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s const index_options_map& options) { index_options_map new_options = options; - auto target_option = secondary_index::target_parser::serialize_targets(targets); + auto target_option = is_vector_index(options) + ? secondary_index::vector_index::serialize_targets(targets) + : secondary_index::target_parser::serialize_targets(targets); new_options.emplace(index_target::target_option_name, target_option); const auto& first_target = targets.front()->value; diff --git a/docs/dev/vector_index.md b/docs/dev/vector_index.md new file mode 100644 index 0000000000..ba615d74f1 --- /dev/null +++ b/docs/dev/vector_index.md @@ -0,0 +1,10 @@ +# Vector index in Scylla + +Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format: + +- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v` +- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}` +- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}` +- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}` + +The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured. diff --git a/index/vector_index.cc b/index/vector_index.cc index 890e437152..c9d30e4b3f 100644 --- a/index/vector_index.cc +++ b/index/vector_index.cc @@ -105,17 +105,79 @@ const static std::unordered_map json_value = rjson::try_parse(targets); - if (!json_value || !json_value->IsObject()) { - return target_parser::get_target_column_name_from_string(targets); +static constexpr auto TC_TARGET_KEY = "tc"; +static constexpr auto PK_TARGET_KEY = "pk"; +static constexpr auto FC_TARGET_KEY = "fc"; + +// Serialize vector index targets into a format using: +// "tc" for the target (vector) column, +// "pk" for partition key columns (local index), +// "fc" for filtering columns. +// For a simple single-column vector index, returns just the column name. +// Examples: +// (v) -> "v" +// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]} +// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]} +// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]} +sstring vector_index::serialize_targets(const std::vector<::shared_ptr>& targets) { + using cql3::statements::index_target; + + if (targets.size() == 0) { + throw exceptions::invalid_request_exception("Vector index must have at least one target column"); } - rjson::value* pk = rjson::find(*json_value, "pk"); - if (pk && pk->IsArray() && !pk->Empty()) { - return sstring(rjson::to_string_view(pk->GetArray()[0])); + if (targets.size() == 1) { + auto tc = targets[0]->value; + if (!std::holds_alternative(tc)) { + throw exceptions::invalid_request_exception("Missing vector column target for local vector index"); + } + return index_target::escape_target_column(*std::get(tc)); } - return target_parser::get_target_column_name_from_string(targets); + + const bool has_pk = std::holds_alternative(targets.front()->value); + const size_t tc_idx = has_pk ? 1 : 0; + const size_t fc_count = targets.size() - tc_idx - 1; + + if (!std::holds_alternative(targets[tc_idx]->value)) { + throw exceptions::invalid_request_exception("Vector index target column must be a single column"); + } + + rjson::value json_map = rjson::empty_object(); + rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get(targets[tc_idx]->value)->text())); + + if (has_pk) { + rjson::value pk_json = rjson::empty_array(); + for (const auto& col : std::get(targets.front()->value)) { + rjson::push_back(pk_json, rjson::from_string(col->text())); + } + rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json)); + } + + if (fc_count > 0) { + rjson::value fc_json = rjson::empty_array(); + for (size_t i = tc_idx + 1; i < targets.size(); ++i) { + if (!std::holds_alternative(targets[i]->value)) { + throw exceptions::invalid_request_exception("Vector index filtering column must be a single column"); + } + rjson::push_back(fc_json, rjson::from_string(std::get(targets[i]->value)->text())); + } + rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json)); + } + + return rjson::print(json_map); +} + +sstring vector_index::get_target_column(const sstring& targets) { + std::optional json_value = rjson::try_parse(targets); + if (!json_value || !json_value->IsObject()) { + return cql3::statements::index_target::column_name_from_target_string(targets); + } + + rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + return sstring(rjson::to_string_view(*tc)); + } + return cql3::statements::index_target::column_name_from_target_string(targets); } bool vector_index::is_rescoring_enabled(const index_options_map& properties) { @@ -346,7 +408,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst auto target_it = im.options().find(cql3_parser::index_target::target_option_name); if (class_it != im.options().end() && target_it != im.options().end()) { auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second); - return custom_class && dynamic_cast((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name; + return custom_class && dynamic_cast((*custom_class)().get()) && get_target_column(target_it->second) == target_name; } return false; } diff --git a/index/vector_index.hh b/index/vector_index.hh index 5756c02ef1..7aedb3a1cf 100644 --- a/index/vector_index.hh +++ b/index/vector_index.hh @@ -37,6 +37,9 @@ public: static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name); static void check_cdc_options(const schema& schema); + static sstring serialize_targets(const std::vector<::shared_ptr>& targets); + static sstring get_target_column(const sstring& targets); + static bool is_rescoring_enabled(const index_options_map& properties); static float get_oversampling(const index_options_map& properties); static sstring get_cql_similarity_function_name(const index_options_map& properties); diff --git a/test/boost/CMakeLists.txt b/test/boost/CMakeLists.txt index 23f01b4fbb..f08fd34d3b 100644 --- a/test/boost/CMakeLists.txt +++ b/test/boost/CMakeLists.txt @@ -375,6 +375,7 @@ add_scylla_test(combined_tests tracing_test.cc user_function_test.cc user_types_test.cc + vector_index_test.cc view_build_test.cc view_complex_test.cc view_schema_ckey_test.cc diff --git a/test/boost/vector_index_test.cc b/test/boost/vector_index_test.cc new file mode 100644 index 0000000000..bb3cfb3c3f --- /dev/null +++ b/test/boost/vector_index_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include +#include "cql3/column_identifier.hh" +#include "cql3/statements/index_target.hh" +#include "exceptions/exceptions.hh" +#include "index/vector_index.hh" +#include "utils/rjson.hh" + +using namespace secondary_index; +using namespace cql3; + +using statements::index_target; + +BOOST_AUTO_TEST_SUITE(vector_index_test) + +namespace { + +::shared_ptr make_single(const sstring& name) { + auto col = ::make_shared(name, true); + return ::make_shared(col, index_target::target_type::regular_values); +} + +::shared_ptr make_multi(const std::vector& names) { + std::vector<::shared_ptr> cols; + cols.reserve(names.size()); + for (const auto& n : names) { + cols.push_back(::make_shared(n, true)); + } + return ::make_shared(std::move(cols), index_target::target_type::regular_values); +} + +} // namespace + +BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) { + std::vector<::shared_ptr> targets; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1", "p2"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + + +BOOST_AUTO_TEST_CASE(serialize_targets_single_column) { + std::vector<::shared_ptr> targets = { + make_single("v"), + }; + BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + BOOST_REQUIRE(!rjson::find(json, "pk")); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + BOOST_REQUIRE(!rjson::find(json, "fc")); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1"}), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) { + std::vector<::shared_ptr> targets = {make_single("v"), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("MyCol")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"MyCol\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("a\"b")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) { + std::vector<::shared_ptr> targets = { + make_single("MyCol"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) { + std::vector<::shared_ptr> targets = { + make_single("a\"b"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 0015f30602..2de0ab9804 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -7,6 +7,7 @@ ############################################################################### import pytest +import json from .util import new_test_table, is_scylla, unique_name from cassandra.protocol import InvalidRequest, ConfigurationException @@ -283,6 +284,45 @@ def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip assert len(res) == 1 assert res[0].options['target'] == 'v' +# Test "target" option serialization for vector index with filtering columns. +def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]} + +# Test "target" option serialization for local vector index. +def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]} + +# Test "target" option serialization for local vector index with filtering columns. +def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]} + def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets): schema = "p int primary key, v vector" if is_scylla(cql): diff --git a/test/vector_search/vector_store_client_test.cc b/test/vector_search/vector_store_client_test.cc index f8c10f4f28..1a494ecf82 100644 --- a/test/vector_search/vector_store_client_test.cc +++ b/test/vector_search/vector_store_client_test.cc @@ -1277,3 +1277,26 @@ SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_col co_await server->stop(); })); } + +SEASTAR_TEST_CASE(vector_store_client_local_vector_index) { + auto server = co_await make_vs_mock_server(); + + auto cfg = make_config(); + cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); + co_await do_with_cql_env( + [&](cql_test_env& env) -> future<> { + auto schema = co_await create_test_table(env, "ks", "test"); + auto& vs = env.local_qp().vector_store_client(); + configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); + vs.start_background_tasks(); + // Create a local vector index on the 'embedding' column. + auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); + + BOOST_CHECK_NO_THROW( + co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }, + cfg) + .finally(seastar::coroutine::lambda([&] -> future<> { + co_await server->stop(); + })); +}