diff --git a/configure.py b/configure.py index 0d5acfe4f9..02a59ae124 100755 --- a/configure.py +++ b/configure.py @@ -1714,6 +1714,7 @@ deps['test/boost/combined_tests'] += [ 'test/boost/tracing_test.cc', 'test/boost/user_function_test.cc', 'test/boost/user_types_test.cc', + 'test/boost/vector_index_test.cc', 'test/boost/view_build_test.cc', 'test/boost/view_complex_test.cc', 'test/boost/view_schema_ckey_test.cc', diff --git a/cql3/statements/create_index_statement.cc b/cql3/statements/create_index_statement.cc index acf2407da6..e2a113dedd 100644 --- a/cql3/statements/create_index_statement.cc +++ b/cql3/statements/create_index_statement.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0) */ +#include #include #include "create_index_statement.hh" #include "db/config.hh" @@ -37,6 +38,7 @@ #include "types/concrete_types.hh" #include "db/tags/extension.hh" #include "tombstone_gc_extension.hh" +#include "index/secondary_index.hh" #include @@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target } } +static bool is_vector_capable_class(const sstring& class_name) { + return boost::iequals(class_name, "vector_index"); +} + +static bool is_vector_index(const index_options_map& options) { + auto class_it = options.find(db::index::secondary_index::custom_class_option_name); + return class_it != options.end() && is_vector_capable_class(class_it->second); +} + view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im, const data_dictionary::database& db) const { @@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat _idx_properties->validate(); // FIXME: This is ugly and can be improved. - const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index"; + const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class); const bool uses_view_properties = _view_properties.properties()->count() > 0 || _view_properties.use_compact_storage() || _view_properties.defined_ordering().size() > 0; @@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s const index_options_map& options) { index_options_map new_options = options; - auto target_option = secondary_index::target_parser::serialize_targets(targets); + auto target_option = is_vector_index(options) + ? secondary_index::vector_index::serialize_targets(targets) + : secondary_index::target_parser::serialize_targets(targets); new_options.emplace(index_target::target_option_name, target_option); const auto& first_target = targets.front()->value; diff --git a/docs/dev/vector_index.md b/docs/dev/vector_index.md new file mode 100644 index 0000000000..ba615d74f1 --- /dev/null +++ b/docs/dev/vector_index.md @@ -0,0 +1,10 @@ +# Vector index in Scylla + +Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format: + +- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v` +- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}` +- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}` +- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}` + +The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured. diff --git a/index/vector_index.cc b/index/vector_index.cc index 890e437152..c9d30e4b3f 100644 --- a/index/vector_index.cc +++ b/index/vector_index.cc @@ -105,17 +105,79 @@ const static std::unordered_map json_value = rjson::try_parse(targets); - if (!json_value || !json_value->IsObject()) { - return target_parser::get_target_column_name_from_string(targets); +static constexpr auto TC_TARGET_KEY = "tc"; +static constexpr auto PK_TARGET_KEY = "pk"; +static constexpr auto FC_TARGET_KEY = "fc"; + +// Serialize vector index targets into a format using: +// "tc" for the target (vector) column, +// "pk" for partition key columns (local index), +// "fc" for filtering columns. +// For a simple single-column vector index, returns just the column name. +// Examples: +// (v) -> "v" +// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]} +// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]} +// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]} +sstring vector_index::serialize_targets(const std::vector<::shared_ptr>& targets) { + using cql3::statements::index_target; + + if (targets.size() == 0) { + throw exceptions::invalid_request_exception("Vector index must have at least one target column"); } - rjson::value* pk = rjson::find(*json_value, "pk"); - if (pk && pk->IsArray() && !pk->Empty()) { - return sstring(rjson::to_string_view(pk->GetArray()[0])); + if (targets.size() == 1) { + auto tc = targets[0]->value; + if (!std::holds_alternative(tc)) { + throw exceptions::invalid_request_exception("Missing vector column target for local vector index"); + } + return index_target::escape_target_column(*std::get(tc)); } - return target_parser::get_target_column_name_from_string(targets); + + const bool has_pk = std::holds_alternative(targets.front()->value); + const size_t tc_idx = has_pk ? 1 : 0; + const size_t fc_count = targets.size() - tc_idx - 1; + + if (!std::holds_alternative(targets[tc_idx]->value)) { + throw exceptions::invalid_request_exception("Vector index target column must be a single column"); + } + + rjson::value json_map = rjson::empty_object(); + rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get(targets[tc_idx]->value)->text())); + + if (has_pk) { + rjson::value pk_json = rjson::empty_array(); + for (const auto& col : std::get(targets.front()->value)) { + rjson::push_back(pk_json, rjson::from_string(col->text())); + } + rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json)); + } + + if (fc_count > 0) { + rjson::value fc_json = rjson::empty_array(); + for (size_t i = tc_idx + 1; i < targets.size(); ++i) { + if (!std::holds_alternative(targets[i]->value)) { + throw exceptions::invalid_request_exception("Vector index filtering column must be a single column"); + } + rjson::push_back(fc_json, rjson::from_string(std::get(targets[i]->value)->text())); + } + rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json)); + } + + return rjson::print(json_map); +} + +sstring vector_index::get_target_column(const sstring& targets) { + std::optional json_value = rjson::try_parse(targets); + if (!json_value || !json_value->IsObject()) { + return cql3::statements::index_target::column_name_from_target_string(targets); + } + + rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + return sstring(rjson::to_string_view(*tc)); + } + return cql3::statements::index_target::column_name_from_target_string(targets); } bool vector_index::is_rescoring_enabled(const index_options_map& properties) { @@ -346,7 +408,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst auto target_it = im.options().find(cql3_parser::index_target::target_option_name); if (class_it != im.options().end() && target_it != im.options().end()) { auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second); - return custom_class && dynamic_cast((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name; + return custom_class && dynamic_cast((*custom_class)().get()) && get_target_column(target_it->second) == target_name; } return false; } diff --git a/index/vector_index.hh b/index/vector_index.hh index 5756c02ef1..7aedb3a1cf 100644 --- a/index/vector_index.hh +++ b/index/vector_index.hh @@ -37,6 +37,9 @@ public: static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name); static void check_cdc_options(const schema& schema); + static sstring serialize_targets(const std::vector<::shared_ptr>& targets); + static sstring get_target_column(const sstring& targets); + static bool is_rescoring_enabled(const index_options_map& properties); static float get_oversampling(const index_options_map& properties); static sstring get_cql_similarity_function_name(const index_options_map& properties); diff --git a/test/boost/CMakeLists.txt b/test/boost/CMakeLists.txt index 23f01b4fbb..f08fd34d3b 100644 --- a/test/boost/CMakeLists.txt +++ b/test/boost/CMakeLists.txt @@ -375,6 +375,7 @@ add_scylla_test(combined_tests tracing_test.cc user_function_test.cc user_types_test.cc + vector_index_test.cc view_build_test.cc view_complex_test.cc view_schema_ckey_test.cc diff --git a/test/boost/vector_index_test.cc b/test/boost/vector_index_test.cc new file mode 100644 index 0000000000..bb3cfb3c3f --- /dev/null +++ b/test/boost/vector_index_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include +#include "cql3/column_identifier.hh" +#include "cql3/statements/index_target.hh" +#include "exceptions/exceptions.hh" +#include "index/vector_index.hh" +#include "utils/rjson.hh" + +using namespace secondary_index; +using namespace cql3; + +using statements::index_target; + +BOOST_AUTO_TEST_SUITE(vector_index_test) + +namespace { + +::shared_ptr make_single(const sstring& name) { + auto col = ::make_shared(name, true); + return ::make_shared(col, index_target::target_type::regular_values); +} + +::shared_ptr make_multi(const std::vector& names) { + std::vector<::shared_ptr> cols; + cols.reserve(names.size()); + for (const auto& n : names) { + cols.push_back(::make_shared(n, true)); + } + return ::make_shared(std::move(cols), index_target::target_type::regular_values); +} + +} // namespace + +BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) { + std::vector<::shared_ptr> targets; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1", "p2"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + + +BOOST_AUTO_TEST_CASE(serialize_targets_single_column) { + std::vector<::shared_ptr> targets = { + make_single("v"), + }; + BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + BOOST_REQUIRE(!rjson::find(json, "pk")); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + BOOST_REQUIRE(!rjson::find(json, "fc")); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1"}), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) { + std::vector<::shared_ptr> targets = {make_single("v"), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("MyCol")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"MyCol\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("a\"b")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) { + std::vector<::shared_ptr> targets = { + make_single("MyCol"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) { + std::vector<::shared_ptr> targets = { + make_single("a\"b"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 0015f30602..2de0ab9804 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -7,6 +7,7 @@ ############################################################################### import pytest +import json from .util import new_test_table, is_scylla, unique_name from cassandra.protocol import InvalidRequest, ConfigurationException @@ -283,6 +284,45 @@ def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip assert len(res) == 1 assert res[0].options['target'] == 'v' +# Test "target" option serialization for vector index with filtering columns. +def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]} + +# Test "target" option serialization for local vector index. +def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]} + +# Test "target" option serialization for local vector index with filtering columns. +def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]} + def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets): schema = "p int primary key, v vector" if is_scylla(cql): diff --git a/test/vector_search/vector_store_client_test.cc b/test/vector_search/vector_store_client_test.cc index f8c10f4f28..1a494ecf82 100644 --- a/test/vector_search/vector_store_client_test.cc +++ b/test/vector_search/vector_store_client_test.cc @@ -1277,3 +1277,26 @@ SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_col co_await server->stop(); })); } + +SEASTAR_TEST_CASE(vector_store_client_local_vector_index) { + auto server = co_await make_vs_mock_server(); + + auto cfg = make_config(); + cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); + co_await do_with_cql_env( + [&](cql_test_env& env) -> future<> { + auto schema = co_await create_test_table(env, "ks", "test"); + auto& vs = env.local_qp().vector_store_client(); + configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); + vs.start_background_tasks(); + // Create a local vector index on the 'embedding' column. + auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); + + BOOST_CHECK_NO_THROW( + co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }, + cfg) + .finally(seastar::coroutine::lambda([&] -> future<> { + co_await server->stop(); + })); +}