diff --git a/configure.py b/configure.py index 90537153d5..897733b97f 100755 --- a/configure.py +++ b/configure.py @@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [ 'test/boost/tracing_test.cc', 'test/boost/user_function_test.cc', 'test/boost/user_types_test.cc', + 'test/boost/vector_index_test.cc', 'test/boost/view_build_test.cc', 'test/boost/view_complex_test.cc', 'test/boost/view_schema_ckey_test.cc', diff --git a/cql3/statements/create_index_statement.cc b/cql3/statements/create_index_statement.cc index acf2407da6..e2a113dedd 100644 --- a/cql3/statements/create_index_statement.cc +++ b/cql3/statements/create_index_statement.cc @@ -8,6 +8,7 @@ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0) */ +#include #include #include "create_index_statement.hh" #include "db/config.hh" @@ -37,6 +38,7 @@ #include "types/concrete_types.hh" #include "db/tags/extension.hh" #include "tombstone_gc_extension.hh" +#include "index/secondary_index.hh" #include @@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target } } +static bool is_vector_capable_class(const sstring& class_name) { + return boost::iequals(class_name, "vector_index"); +} + +static bool is_vector_index(const index_options_map& options) { + auto class_it = options.find(db::index::secondary_index::custom_class_option_name); + return class_it != options.end() && is_vector_capable_class(class_it->second); +} + view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im, const data_dictionary::database& db) const { @@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat _idx_properties->validate(); // FIXME: This is ugly and can be improved. - const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index"; + const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class); const bool uses_view_properties = _view_properties.properties()->count() > 0 || _view_properties.use_compact_storage() || _view_properties.defined_ordering().size() > 0; @@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s const index_options_map& options) { index_options_map new_options = options; - auto target_option = secondary_index::target_parser::serialize_targets(targets); + auto target_option = is_vector_index(options) + ? secondary_index::vector_index::serialize_targets(targets) + : secondary_index::target_parser::serialize_targets(targets); new_options.emplace(index_target::target_option_name, target_option); const auto& first_target = targets.front()->value; diff --git a/docs/dev/secondary_index.md b/docs/dev/secondary_index.md index 972d4a3364..b7d9ca1ca1 100644 --- a/docs/dev/secondary_index.md +++ b/docs/dev/secondary_index.md @@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index - index on map, set or list values: VALUES(v) - index on map entries: ENTRIES(v) -Their serialization is just string representation, so: -"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets. +Their serialization uses lowercase type names as prefixes, except for `full` which is serialized +as just the column name (without any prefix): +`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection +index on column `v` is stored simply as `"v"` (same as a regular index). + +If the column name contains characters that could be confused with the above formats +(e.g., a name containing parentheses or braces), it is escaped using the CQL +quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the +name in double quotes and doubles any embedded double-quote characters. For example, +a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is +stored as `"keys(m)"`. ## Local index diff --git a/docs/dev/vector_index.md b/docs/dev/vector_index.md new file mode 100644 index 0000000000..ba615d74f1 --- /dev/null +++ b/docs/dev/vector_index.md @@ -0,0 +1,10 @@ +# Vector index in Scylla + +Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format: + +- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v` +- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}` +- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}` +- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}` + +The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured. diff --git a/index/vector_index.cc b/index/vector_index.cc index 890e437152..f23955f884 100644 --- a/index/vector_index.cc +++ b/index/vector_index.cc @@ -20,6 +20,7 @@ #include "types/concrete_types.hh" #include "types/types.hh" #include "utils/managed_string.hh" +#include #include #include @@ -103,19 +104,122 @@ const static std::unordered_map). +// +// JSON examples: +// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2" +// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v" +// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1" +static sstring targets_to_cql(const sstring& targets) { std::optional json_value = rjson::try_parse(targets); if (!json_value || !json_value->IsObject()) { - return target_parser::get_target_column_name_from_string(targets); + return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets)); } - rjson::value* pk = rjson::find(*json_value, "pk"); + sstring result; + + const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY); if (pk && pk->IsArray() && !pk->Empty()) { - return sstring(rjson::to_string_view(pk->GetArray()[0])); + result += "("; + auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) { + return cql3::util::maybe_quote(sstring(rjson::to_string_view(col))); + }) | std::ranges::to>(); + result += boost::algorithm::join(pk_cols, ", "); + result += "), "; } - return target_parser::get_target_column_name_from_string(targets); + + const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc))); + } + + const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY); + if (fc && fc->IsArray()) { + for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) { + result += ", "; + result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i]))); + } + } + + return result; +} + +// Serialize vector index targets into a format using: +// "tc" for the target (vector) column, +// "pk" for partition key columns (local index), +// "fc" for filtering columns. +// For a simple single-column vector index, returns just the column name. +// Examples: +// (v) -> "v" +// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]} +// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]} +// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]} +sstring vector_index::serialize_targets(const std::vector<::shared_ptr>& targets) { + using cql3::statements::index_target; + + if (targets.size() == 0) { + throw exceptions::invalid_request_exception("Vector index must have at least one target column"); + } + + if (targets.size() == 1) { + auto tc = targets[0]->value; + if (!std::holds_alternative(tc)) { + throw exceptions::invalid_request_exception("Missing vector column target for local vector index"); + } + return index_target::escape_target_column(*std::get(tc)); + } + + const bool has_pk = std::holds_alternative(targets.front()->value); + const size_t tc_idx = has_pk ? 1 : 0; + const size_t fc_count = targets.size() - tc_idx - 1; + + if (!std::holds_alternative(targets[tc_idx]->value)) { + throw exceptions::invalid_request_exception("Vector index target column must be a single column"); + } + + rjson::value json_map = rjson::empty_object(); + rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get(targets[tc_idx]->value)->text())); + + if (has_pk) { + rjson::value pk_json = rjson::empty_array(); + for (const auto& col : std::get(targets.front()->value)) { + rjson::push_back(pk_json, rjson::from_string(col->text())); + } + rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json)); + } + + if (fc_count > 0) { + rjson::value fc_json = rjson::empty_array(); + for (size_t i = tc_idx + 1; i < targets.size(); ++i) { + if (!std::holds_alternative(targets[i]->value)) { + throw exceptions::invalid_request_exception("Vector index filtering column must be a single column"); + } + rjson::push_back(fc_json, rjson::from_string(std::get(targets[i]->value)->text())); + } + rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json)); + } + + return rjson::print(json_map); +} + +sstring vector_index::get_target_column(const sstring& targets) { + std::optional json_value = rjson::try_parse(targets); + if (!json_value || !json_value->IsObject()) { + return cql3::statements::index_target::column_name_from_target_string(targets); + } + + rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY); + if (tc && tc->IsString()) { + return sstring(rjson::to_string_view(*tc)); + } + return cql3::statements::index_target::column_name_from_target_string(targets); } bool vector_index::is_rescoring_enabled(const index_options_map& properties) { @@ -147,9 +251,8 @@ bool vector_index::view_should_exist() const { std::optional vector_index::describe(const index_metadata& im, const schema& base_schema) const { fragmented_ostringstream os; - os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " - << cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name()) - << "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")" + os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "." + << cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")" << " USING 'vector_index'"; return cql3::description{ @@ -346,7 +449,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst auto target_it = im.options().find(cql3_parser::index_target::target_option_name); if (class_it != im.options().end() && target_it != im.options().end()) { auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second); - return custom_class && dynamic_cast((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name; + return custom_class && dynamic_cast((*custom_class)().get()) && get_target_column(target_it->second) == target_name; } return false; } diff --git a/index/vector_index.hh b/index/vector_index.hh index 5756c02ef1..7aedb3a1cf 100644 --- a/index/vector_index.hh +++ b/index/vector_index.hh @@ -37,6 +37,9 @@ public: static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name); static void check_cdc_options(const schema& schema); + static sstring serialize_targets(const std::vector<::shared_ptr>& targets); + static sstring get_target_column(const sstring& targets); + static bool is_rescoring_enabled(const index_options_map& properties); static float get_oversampling(const index_options_map& properties); static sstring get_cql_similarity_function_name(const index_options_map& properties); diff --git a/test/boost/CMakeLists.txt b/test/boost/CMakeLists.txt index 4d7e137f9a..452754883e 100644 --- a/test/boost/CMakeLists.txt +++ b/test/boost/CMakeLists.txt @@ -378,6 +378,7 @@ add_scylla_test(combined_tests tracing_test.cc user_function_test.cc user_types_test.cc + vector_index_test.cc view_build_test.cc view_complex_test.cc view_schema_ckey_test.cc diff --git a/test/boost/vector_index_test.cc b/test/boost/vector_index_test.cc new file mode 100644 index 0000000000..bb3cfb3c3f --- /dev/null +++ b/test/boost/vector_index_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2026-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include +#include "cql3/column_identifier.hh" +#include "cql3/statements/index_target.hh" +#include "exceptions/exceptions.hh" +#include "index/vector_index.hh" +#include "utils/rjson.hh" + +using namespace secondary_index; +using namespace cql3; + +using statements::index_target; + +BOOST_AUTO_TEST_SUITE(vector_index_test) + +namespace { + +::shared_ptr make_single(const sstring& name) { + auto col = ::make_shared(name, true); + return ::make_shared(col, index_target::target_type::regular_values); +} + +::shared_ptr make_multi(const std::vector& names) { + std::vector<::shared_ptr> cols; + cols.reserve(names.size()); + for (const auto& n : names) { + cols.push_back(::make_shared(n, true)); + } + return ::make_shared(std::move(cols), index_target::target_type::regular_values); +} + +} // namespace + +BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) { + std::vector<::shared_ptr> targets; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1", "p2"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + + +BOOST_AUTO_TEST_CASE(serialize_targets_single_column) { + std::vector<::shared_ptr> targets = { + make_single("v"), + }; + BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + BOOST_REQUIRE(!rjson::find(json, "pk")); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + BOOST_REQUIRE(!rjson::find(json, "fc")); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) { + std::vector<::shared_ptr> targets = { + make_multi({"p1", "p2"}), + make_single("v"), + make_single("f1"), + make_single("f2"), + }; + + const auto result = vector_index::serialize_targets(targets); + const auto json = rjson::parse(result); + + BOOST_REQUIRE(json.IsObject()); + + const auto* pk = rjson::find(json, "pk"); + BOOST_REQUIRE(pk); + BOOST_REQUIRE(pk->IsArray()); + BOOST_REQUIRE_EQUAL(pk->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2"); + + auto tc = rjson::find(json, "tc"); + BOOST_REQUIRE(tc); + BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v"); + + const auto* fc = rjson::find(json, "fc"); + BOOST_REQUIRE(fc); + BOOST_REQUIRE(fc->IsArray()); + BOOST_REQUIRE_EQUAL(fc->Size(), 2); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1"); + BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2"); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) { + std::vector<::shared_ptr> targets = {make_multi({"p1"}), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) { + std::vector<::shared_ptr> targets = {make_single("v"), make_multi({"c1"})}; + BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("MyCol")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"MyCol\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) { + std::vector<::shared_ptr> targets = {make_single("a\"b")}; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\""); + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) { + std::vector<::shared_ptr> targets = { + make_single("MyCol"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol"); +} + +BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) { + std::vector<::shared_ptr> targets = { + make_single("a\"b"), + make_single("f1"), + }; + const auto serialized = vector_index::serialize_targets(targets); + + BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b"); +} + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cqlpy/test_secondary_index.py b/test/cqlpy/test_secondary_index.py index a4f8990e24..09437209fc 100644 --- a/test/cqlpy/test_secondary_index.py +++ b/test/cqlpy/test_secondary_index.py @@ -4,6 +4,7 @@ # Tests for secondary indexes +import json import random import itertools import time @@ -2031,6 +2032,82 @@ def test_index_in_system_schema_indexes(cql, built_index): assert res[0].kind == 'COMPOSITES' assert res[0].options == {'target': 'v'} +# Test that the "target" option in system_schema.indexes is serialized +# correctly for secondary indexes on collection columns. +# This format is critical for backward compatibility, as it's read from +# disk on startup to rebuild indexes. An incompatible change would prevent +# existing indexes from being recreated after an upgrade. +def test_global_collection_index_target_serialization(cql, test_keyspace): + schema = "p int PRIMARY KEY, m map, fl frozen>" + with new_test_table(cql, test_keyspace, schema) as table: + # Scylla normalizes full(col) targets to just the column name; + # Cassandra keeps the full(col) prefix. + full_target = "fl" if is_scylla(cql) else "full(fl)" + cases = [ + ("keys(m)", "keys(m)"), + ("values(m)", "values(m)"), + ("entries(m)", "entries(m)"), + ("full(fl)", full_target), + ] + for index_expr, expected_target in cases: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}({index_expr})") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].kind == 'COMPOSITES' + assert res[0].options == {'target': expected_target}, \ + f"For index expression '{index_expr}': expected target '{expected_target}', got '{res[0].options}'" + +# Test that the "target" option in system_schema.indexes is serialized +# correctly when the indexed column name contains special characters +# (e.g., upper-case, spaces, braces, or keywords like "keys(m)"). +# The encoding uses the CQL quoted-identifier form, so e.g. column "hEllo" +# is stored as '"hEllo"'. An incompatible change here would break index +# lookup after an upgrade. +def test_global_index_target_serialization_quoted_names(cql, test_keyspace): + # Column names requiring quoting in CQL (mixed-case, space, characters + # that would otherwise be confused with target-format prefixes or JSON). + quoted_names = ['"hEllo"', '"x y"', '"keys(m)"'] + schema = 'p int PRIMARY KEY, ' + ', '.join(name + " int" for name in quoted_names) + with new_test_table(cql, test_keyspace, schema) as table: + for name in quoted_names: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}({name})") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].kind == 'COMPOSITES' + # The target is the CQL representation of the column name, + # i.e., quoted exactly as provided in the CREATE INDEX statement. + assert res[0].options == {'target': name}, \ + f"For column {name}: got target '{res[0].options}'" + +# Test that the "target" option in system_schema.indexes is serialized +# correctly for local secondary indexes. This format is critical for +# backward compatibility, as it's read from disk on startup to rebuild +# indexes. An incompatible change would prevent existing local indexes +# from being recreated after an upgrade. +def test_local_index_target_serialization(cql, test_keyspace, scylla_only): + schema = "a int, b int, c int, v int, PRIMARY KEY ((a, b), c)" + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE INDEX {index_name} ON {table}((a, b), v)") + wait_for_index(cql, test_keyspace, index_name) + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + # The target for a local secondary index is stored as JSON. + assert json.loads(res[0].options['target']) == {"pk": ["a", "b"], "ck": ["v"]} + # Test index representation in REST API def test_index_in_API(cql, test_keyspace): with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, v int") as table: diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 491b7a23b3..ba0219cf05 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -7,6 +7,7 @@ ############################################################################### import pytest +import json from .util import new_test_table, is_scylla, unique_name from cassandra.protocol import InvalidRequest, ConfigurationException @@ -199,6 +200,36 @@ def test_describe_custom_index(cql, test_keyspace, skip_without_tablets): assert f"CREATE CUSTOM INDEX custom ON {table}{maybe_space}(v1) USING '{custom_class}'" in a_desc assert f"CREATE CUSTOM INDEX custom1 ON {table}{maybe_space}(v2) USING '{custom_class}'" in b_desc +def test_describe_vector_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'" in desc + +def test_describe_vector_index_local(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c int, v vector, PRIMARY KEY ((p1, p2), c)' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'" in desc + +def test_describe_vector_index_local_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c)' + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement + + assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'" in desc + def test_vector_index_version_on_recreate(cql, test_keyspace, scylla_only, skip_without_tablets): schema = 'p int primary key, v vector' @@ -264,6 +295,64 @@ def test_vector_index_version_fail_given_as_option(cql, test_keyspace, scylla_on with pytest.raises(InvalidRequest, match="Cannot specify index_version as a CUSTOM option"): cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index' WITH OPTIONS = {{'index_version': '18ad2003-05ea-17d9-1855-0325ac0a755d'}}") +# Test that the "target" option in system_schema.indexes is serialized +# correctly for a vector index on a single vector column. This format is +# critical for backward compatibility, as it's read from disk on startup +# to rebuild indexes. An incompatible change would prevent existing vector +# indexes from being recreated after an upgrade. +# This is also an interface with the vector-store service, which relies on the "target" +# option to identify the target column. +def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert res[0].options['target'] == 'v' + +# Test "target" option serialization for vector index with filtering columns. +def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p int primary key, v vector, f1 int, f2 int' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]} + +# Test "target" option serialization for local vector index. +def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]} + +# Test "target" option serialization for local vector index with filtering columns. +def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets): + schema = 'p1 int, p2 int, c1 int, c2 int, v vector, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)' + with new_test_table(cql, test_keyspace, schema) as table: + index_name = unique_name() + cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'") + + res = [r for r in cql.execute('select * from system_schema.indexes') + if r.index_name == index_name] + + assert len(res) == 1 + assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]} + def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets): schema = "p int primary key, v vector" if is_scylla(cql): diff --git a/test/vector_search/vector_store_client_test.cc b/test/vector_search/vector_store_client_test.cc index f8c10f4f28..83e21f43fe 100644 --- a/test/vector_search/vector_store_client_test.cc +++ b/test/vector_search/vector_store_client_test.cc @@ -83,6 +83,25 @@ timeout_config make_query_timeout(std::chrono::seconds timeout) { return cfg; } +future<> do_with_vector_store_mock(std::function(cql_test_env&, vs_mock_server&)> func) { + auto server = co_await make_vs_mock_server(); + + auto cfg = make_config(); + cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); + co_await do_with_cql_env( + [&](cql_test_env& env) -> future<> { + co_await create_test_table(env, "ks", "test"); + auto& vs = env.local_qp().vector_store_client(); + configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); + vs.start_background_tasks(); + co_await func(env, *server); + }, + cfg) + .finally(seastar::coroutine::lambda([&] -> future<> { + co_await server->stop(); + })); +} + } // namespace BOOST_AUTO_TEST_CASE(vector_store_client_test_ctor) { @@ -1222,32 +1241,19 @@ SEASTAR_TEST_CASE(vector_store_client_abort_due_to_query_timeout) { /// Verify that the HTTP error description from the vector store is propagated /// through the CQL interface as part of the invalid_request_exception message. SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) { - auto server = co_await make_vs_mock_server(); + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server& server) -> future<> { + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'"); - auto cfg = make_config(); - cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - auto schema = co_await create_test_table(env, "ks", "test"); - auto& vs = env.local_qp().vector_store_client(); - configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); - vs.start_background_tasks(); - co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'"); + // Configure mock to return 404 with a specific error message + server.next_ann_response({status_type::not_found, "index does not exist"}); - // Configure mock to return 404 with a specific error message - server->next_ann_response({status_type::not_found, "index does not exist"}); - - BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"), - exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) { - auto msg = std::string(ex.what()); - // Verify the error message contains both the HTTP status and the error description - return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos; - }); - }, - cfg) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); + BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"), + exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) { + auto msg = std::string(ex.what()); + // Verify the error message contains both the HTTP status and the error description + return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos; + }); + }); } // Create a vector index with an additional filtering column. @@ -1257,23 +1263,20 @@ SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) // ANN ordering by vector requires the column to be indexed using 'vector_index'. // Reproduces SCYLLADB-635. SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_column) { - auto server = co_await make_vs_mock_server(); + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> { + // Create a vector index on the embedding column, including ck1 for filtered ANN search support. + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'"); - auto cfg = make_config(); - cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port())); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - auto schema = co_await create_test_table(env, "ks", "test"); - auto& vs = env.local_qp().vector_store_client(); - configure(vs).with_dns({{"server.node", std::vector{server->host()}}}); - vs.start_background_tasks(); - // Create a vector index on the embedding column, including ck1 for filtered ANN search support. - auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'"); - - BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); - }, - cfg) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); + BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }); +} + +SEASTAR_TEST_CASE(vector_store_client_local_vector_index) { + co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> { + // Create a local vector index on the 'embedding' column. + co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'"); + + BOOST_CHECK_NO_THROW( + co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;")); + }); }