Merge '[Backport 2026.1] vector_search: fix SELECT on local vector index' from Scylladb[bot]

Queries against local vector indexes were failing with the error:
```ANN ordering by vector requires the column to be indexed using 'vector_index'```

This was a regression introduced by 15788c3734, which incorrectly
assumed the first column in the targets list is always the vector column.
For local vector indexes, the first column is the partition key, causing
the failure.

Previously, serialization logic for the target index option was shared
between vector and secondary indexes. This is no longer viable due to
the introduction of local vector indexes and vector indexes with filtering
columns, which have  different target format.

This commit introduces a dedicated JSON-based serialization format for
vector index targets, identifying the target column (tc), filtering
columns (fc), and partition key columns (pk). This ensures unambiguous
serialization and deserialization for all vector index types.

This change is backward compatible for regular vector indexes. However,
it breaks compatibility for local vector indexes and vector indexes with
filtering columns created in version 2026.1.0. To mitigate this, usage
of these specific index types will be blocked in the 2026.1.0 release
by failing ANN queries against them in vector-store service.

Fixes: SCYLLADB-1424

Backport to 2026.1 is required as this issue occurs also on this branch.

- (cherry picked from commit 22e7ef46a7)

Parent PR: #28862

Closes scylladb/scylladb#29383

* github.com:scylladb/scylladb:
  index: fix DESC INDEX for vector index
  vector_search: test: refactor boilerplate setup
  vector_search: fix SELECT on local vector index
  index: test: vector index target option serialization test
  index: test: secondary index target option serialization test
This commit is contained in:
Marcin Maliszkiewicz
2026-04-20 12:34:30 +02:00
11 changed files with 554 additions and 56 deletions

View File

@@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [
'test/boost/tracing_test.cc',
'test/boost/user_function_test.cc',
'test/boost/user_types_test.cc',
'test/boost/vector_index_test.cc',
'test/boost/view_build_test.cc',
'test/boost/view_complex_test.cc',
'test/boost/view_schema_ckey_test.cc',

View File

@@ -8,6 +8,7 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include <boost/algorithm/string.hpp>
#include <seastar/core/coroutine.hh>
#include "create_index_statement.hh"
#include "db/config.hh"
@@ -37,6 +38,7 @@
#include "types/concrete_types.hh"
#include "db/tags/extension.hh"
#include "tombstone_gc_extension.hh"
#include "index/secondary_index.hh"
#include <stdexcept>
@@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
}
}
static bool is_vector_capable_class(const sstring& class_name) {
return boost::iequals(class_name, "vector_index");
}
static bool is_vector_index(const index_options_map& options) {
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
return class_it != options.end() && is_vector_capable_class(class_it->second);
}
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
const data_dictionary::database& db) const
{
@@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
_idx_properties->validate();
// FIXME: This is ugly and can be improved.
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
const bool uses_view_properties = _view_properties.properties()->count() > 0
|| _view_properties.use_compact_storage()
|| _view_properties.defined_ordering().size() > 0;
@@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
const index_options_map& options)
{
index_options_map new_options = options;
auto target_option = secondary_index::target_parser::serialize_targets(targets);
auto target_option = is_vector_index(options)
? secondary_index::vector_index::serialize_targets(targets)
: secondary_index::target_parser::serialize_targets(targets);
new_options.emplace(index_target::target_option_name, target_option);
const auto& first_target = targets.front()->value;

View File

@@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index
- index on map, set or list values: VALUES(v)
- index on map entries: ENTRIES(v)
Their serialization is just string representation, so:
"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets.
Their serialization uses lowercase type names as prefixes, except for `full` which is serialized
as just the column name (without any prefix):
`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection
index on column `v` is stored simply as `"v"` (same as a regular index).
If the column name contains characters that could be confused with the above formats
(e.g., a name containing parentheses or braces), it is escaped using the CQL
quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the
name in double quotes and doubles any embedded double-quote characters. For example,
a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is
stored as `"keys(m)"`.
## Local index

10
docs/dev/vector_index.md Normal file
View File

@@ -0,0 +1,10 @@
# Vector index in Scylla
Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format:
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.

View File

@@ -20,6 +20,7 @@
#include "types/concrete_types.hh"
#include "types/types.hh"
#include "utils/managed_string.hh"
#include <ranges>
#include <seastar/core/sstring.hh>
#include <boost/algorithm/string.hpp>
@@ -103,19 +104,122 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
};
};
sstring get_vector_index_target_column(const sstring& targets) {
static constexpr auto TC_TARGET_KEY = "tc";
static constexpr auto PK_TARGET_KEY = "pk";
static constexpr auto FC_TARGET_KEY = "fc";
// Convert a serialized targets string (as produced by serialize_targets())
// back into the CQL column list used inside CREATE INDEX ... ON table(<here>).
//
// JSON examples:
// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2"
// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v"
// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1"
static sstring targets_to_cql(const sstring& targets) {
std::optional<rjson::value> json_value = rjson::try_parse(targets);
if (!json_value || !json_value->IsObject()) {
return target_parser::get_target_column_name_from_string(targets);
return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets));
}
rjson::value* pk = rjson::find(*json_value, "pk");
sstring result;
const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY);
if (pk && pk->IsArray() && !pk->Empty()) {
return sstring(rjson::to_string_view(pk->GetArray()[0]));
result += "(";
auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) {
return cql3::util::maybe_quote(sstring(rjson::to_string_view(col)));
}) | std::ranges::to<std::vector<sstring>>();
result += boost::algorithm::join(pk_cols, ", ");
result += "), ";
}
return target_parser::get_target_column_name_from_string(targets);
const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
if (tc && tc->IsString()) {
result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc)));
}
const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY);
if (fc && fc->IsArray()) {
for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) {
result += ", ";
result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i])));
}
}
return result;
}
// Serialize vector index targets into a format using:
// "tc" for the target (vector) column,
// "pk" for partition key columns (local index),
// "fc" for filtering columns.
// For a simple single-column vector index, returns just the column name.
// Examples:
// (v) -> "v"
// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]}
// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]}
// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}
sstring vector_index::serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) {
using cql3::statements::index_target;
if (targets.size() == 0) {
throw exceptions::invalid_request_exception("Vector index must have at least one target column");
}
if (targets.size() == 1) {
auto tc = targets[0]->value;
if (!std::holds_alternative<index_target::single_column>(tc)) {
throw exceptions::invalid_request_exception("Missing vector column target for local vector index");
}
return index_target::escape_target_column(*std::get<index_target::single_column>(tc));
}
const bool has_pk = std::holds_alternative<index_target::multiple_columns>(targets.front()->value);
const size_t tc_idx = has_pk ? 1 : 0;
const size_t fc_count = targets.size() - tc_idx - 1;
if (!std::holds_alternative<index_target::single_column>(targets[tc_idx]->value)) {
throw exceptions::invalid_request_exception("Vector index target column must be a single column");
}
rjson::value json_map = rjson::empty_object();
rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get<index_target::single_column>(targets[tc_idx]->value)->text()));
if (has_pk) {
rjson::value pk_json = rjson::empty_array();
for (const auto& col : std::get<index_target::multiple_columns>(targets.front()->value)) {
rjson::push_back(pk_json, rjson::from_string(col->text()));
}
rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json));
}
if (fc_count > 0) {
rjson::value fc_json = rjson::empty_array();
for (size_t i = tc_idx + 1; i < targets.size(); ++i) {
if (!std::holds_alternative<index_target::single_column>(targets[i]->value)) {
throw exceptions::invalid_request_exception("Vector index filtering column must be a single column");
}
rjson::push_back(fc_json, rjson::from_string(std::get<index_target::single_column>(targets[i]->value)->text()));
}
rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json));
}
return rjson::print(json_map);
}
sstring vector_index::get_target_column(const sstring& targets) {
std::optional<rjson::value> json_value = rjson::try_parse(targets);
if (!json_value || !json_value->IsObject()) {
return cql3::statements::index_target::column_name_from_target_string(targets);
}
rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
if (tc && tc->IsString()) {
return sstring(rjson::to_string_view(*tc));
}
return cql3::statements::index_target::column_name_from_target_string(targets);
}
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
@@ -147,9 +251,8 @@ bool vector_index::view_should_exist() const {
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
fragmented_ostringstream os;
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
<< cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name())
<< "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
<< " USING 'vector_index'";
return cql3::description{
@@ -346,7 +449,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
if (class_it != im.options().end() && target_it != im.options().end()) {
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name;
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_target_column(target_it->second) == target_name;
}
return false;
}

View File

@@ -37,6 +37,9 @@ public:
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
static void check_cdc_options(const schema& schema);
static sstring serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets);
static sstring get_target_column(const sstring& targets);
static bool is_rescoring_enabled(const index_options_map& properties);
static float get_oversampling(const index_options_map& properties);
static sstring get_cql_similarity_function_name(const index_options_map& properties);

View File

@@ -378,6 +378,7 @@ add_scylla_test(combined_tests
tracing_test.cc
user_function_test.cc
user_types_test.cc
vector_index_test.cc
view_build_test.cc
view_complex_test.cc
view_schema_ckey_test.cc

View File

@@ -0,0 +1,189 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <boost/test/unit_test.hpp>
#include "cql3/column_identifier.hh"
#include "cql3/statements/index_target.hh"
#include "exceptions/exceptions.hh"
#include "index/vector_index.hh"
#include "utils/rjson.hh"
using namespace secondary_index;
using namespace cql3;
using statements::index_target;
BOOST_AUTO_TEST_SUITE(vector_index_test)
namespace {
::shared_ptr<index_target> make_single(const sstring& name) {
auto col = ::make_shared<cql3::column_identifier>(name, true);
return ::make_shared<index_target>(col, index_target::target_type::regular_values);
}
::shared_ptr<index_target> make_multi(const std::vector<sstring>& names) {
std::vector<::shared_ptr<column_identifier>> cols;
cols.reserve(names.size());
for (const auto& n : names) {
cols.push_back(::make_shared<column_identifier>(n, true));
}
return ::make_shared<index_target>(std::move(cols), index_target::target_type::regular_values);
}
} // namespace
BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) {
std::vector<::shared_ptr<index_target>> targets;
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1", "p2"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_single_column) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("v"),
};
BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v");
}
BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("v"),
make_single("f1"),
make_single("f2"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
BOOST_REQUIRE(!rjson::find(json, "pk"));
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
const auto* fc = rjson::find(json, "fc");
BOOST_REQUIRE(fc);
BOOST_REQUIRE(fc->IsArray());
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
}
BOOST_AUTO_TEST_CASE(serialize_targets_local_index) {
std::vector<::shared_ptr<index_target>> targets = {
make_multi({"p1", "p2"}),
make_single("v"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
const auto* pk = rjson::find(json, "pk");
BOOST_REQUIRE(pk);
BOOST_REQUIRE(pk->IsArray());
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
BOOST_REQUIRE(!rjson::find(json, "fc"));
}
BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) {
std::vector<::shared_ptr<index_target>> targets = {
make_multi({"p1", "p2"}),
make_single("v"),
make_single("f1"),
make_single("f2"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
const auto* pk = rjson::find(json, "pk");
BOOST_REQUIRE(pk);
BOOST_REQUIRE(pk->IsArray());
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
const auto* fc = rjson::find(json, "fc");
BOOST_REQUIRE(fc);
BOOST_REQUIRE(fc->IsArray());
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
}
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1"}), make_multi({"c1"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_single("v"), make_multi({"c1"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) {
std::vector<::shared_ptr<index_target>> targets = {make_single("MyCol")};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(serialized, "\"MyCol\"");
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) {
std::vector<::shared_ptr<index_target>> targets = {make_single("a\"b")};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\"");
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("MyCol"),
make_single("f1"),
};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("a\"b"),
make_single("f1"),
};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -4,6 +4,7 @@
# Tests for secondary indexes
import json
import random
import itertools
import time
@@ -2031,6 +2032,82 @@ def test_index_in_system_schema_indexes(cql, built_index):
assert res[0].kind == 'COMPOSITES'
assert res[0].options == {'target': 'v'}
# Test that the "target" option in system_schema.indexes is serialized
# correctly for secondary indexes on collection columns.
# This format is critical for backward compatibility, as it's read from
# disk on startup to rebuild indexes. An incompatible change would prevent
# existing indexes from being recreated after an upgrade.
def test_global_collection_index_target_serialization(cql, test_keyspace):
schema = "p int PRIMARY KEY, m map<int,int>, fl frozen<list<int>>"
with new_test_table(cql, test_keyspace, schema) as table:
# Scylla normalizes full(col) targets to just the column name;
# Cassandra keeps the full(col) prefix.
full_target = "fl" if is_scylla(cql) else "full(fl)"
cases = [
("keys(m)", "keys(m)"),
("values(m)", "values(m)"),
("entries(m)", "entries(m)"),
("full(fl)", full_target),
]
for index_expr, expected_target in cases:
index_name = unique_name()
cql.execute(f"CREATE INDEX {index_name} ON {table}({index_expr})")
wait_for_index(cql, test_keyspace, index_name)
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert res[0].kind == 'COMPOSITES'
assert res[0].options == {'target': expected_target}, \
f"For index expression '{index_expr}': expected target '{expected_target}', got '{res[0].options}'"
# Test that the "target" option in system_schema.indexes is serialized
# correctly when the indexed column name contains special characters
# (e.g., upper-case, spaces, braces, or keywords like "keys(m)").
# The encoding uses the CQL quoted-identifier form, so e.g. column "hEllo"
# is stored as '"hEllo"'. An incompatible change here would break index
# lookup after an upgrade.
def test_global_index_target_serialization_quoted_names(cql, test_keyspace):
# Column names requiring quoting in CQL (mixed-case, space, characters
# that would otherwise be confused with target-format prefixes or JSON).
quoted_names = ['"hEllo"', '"x y"', '"keys(m)"']
schema = 'p int PRIMARY KEY, ' + ', '.join(name + " int" for name in quoted_names)
with new_test_table(cql, test_keyspace, schema) as table:
for name in quoted_names:
index_name = unique_name()
cql.execute(f"CREATE INDEX {index_name} ON {table}({name})")
wait_for_index(cql, test_keyspace, index_name)
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert res[0].kind == 'COMPOSITES'
# The target is the CQL representation of the column name,
# i.e., quoted exactly as provided in the CREATE INDEX statement.
assert res[0].options == {'target': name}, \
f"For column {name}: got target '{res[0].options}'"
# Test that the "target" option in system_schema.indexes is serialized
# correctly for local secondary indexes. This format is critical for
# backward compatibility, as it's read from disk on startup to rebuild
# indexes. An incompatible change would prevent existing local indexes
# from being recreated after an upgrade.
def test_local_index_target_serialization(cql, test_keyspace, scylla_only):
schema = "a int, b int, c int, v int, PRIMARY KEY ((a, b), c)"
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE INDEX {index_name} ON {table}((a, b), v)")
wait_for_index(cql, test_keyspace, index_name)
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
# The target for a local secondary index is stored as JSON.
assert json.loads(res[0].options['target']) == {"pk": ["a", "b"], "ck": ["v"]}
# Test index representation in REST API
def test_index_in_API(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, v int") as table:

View File

@@ -7,6 +7,7 @@
###############################################################################
import pytest
import json
from .util import new_test_table, is_scylla, unique_name
from cassandra.protocol import InvalidRequest, ConfigurationException
@@ -199,6 +200,36 @@ def test_describe_custom_index(cql, test_keyspace, skip_without_tablets):
assert f"CREATE CUSTOM INDEX custom ON {table}{maybe_space}(v1) USING '{custom_class}'" in a_desc
assert f"CREATE CUSTOM INDEX custom1 ON {table}{maybe_space}(v2) USING '{custom_class}'" in b_desc
def test_describe_vector_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p int primary key, v vector<float, 3>, f1 int, f2 int'
with new_test_table(cql, test_keyspace, schema) as table:
idx = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'")
desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement
assert f"CREATE CUSTOM INDEX {idx} ON {table}(v, f1, f2) USING 'vector_index'" in desc
def test_describe_vector_index_local(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c int, v vector<float, 3>, PRIMARY KEY ((p1, p2), c)'
with new_test_table(cql, test_keyspace, schema) as table:
idx = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'")
desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement
assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v) USING 'vector_index'" in desc
def test_describe_vector_index_local_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c int, v vector<float, 3>, f1 text, f2 text, PRIMARY KEY ((p1, p2), c)'
with new_test_table(cql, test_keyspace, schema) as table:
idx = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'")
desc = cql.execute(f"DESC INDEX {test_keyspace}.{idx}").one().create_statement
assert f"CREATE CUSTOM INDEX {idx} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'" in desc
def test_vector_index_version_on_recreate(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p int primary key, v vector<float, 3>'
@@ -264,6 +295,64 @@ def test_vector_index_version_fail_given_as_option(cql, test_keyspace, scylla_on
with pytest.raises(InvalidRequest, match="Cannot specify index_version as a CUSTOM option"):
cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index' WITH OPTIONS = {{'index_version': '18ad2003-05ea-17d9-1855-0325ac0a755d'}}")
# Test that the "target" option in system_schema.indexes is serialized
# correctly for a vector index on a single vector column. This format is
# critical for backward compatibility, as it's read from disk on startup
# to rebuild indexes. An incompatible change would prevent existing vector
# indexes from being recreated after an upgrade.
# This is also an interface with the vector-store service, which relies on the "target"
# option to identify the target column.
def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p int primary key, v vector<float, 3>'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert res[0].options['target'] == 'v'
# Test "target" option serialization for vector index with filtering columns.
def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p int primary key, v vector<float, 3>, f1 int, f2 int'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]}
# Test "target" option serialization for local vector index.
def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, PRIMARY KEY ((p1, p2), c1, c2)'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]}
# Test "target" option serialization for local vector index with filtering columns.
def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]}
def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets):
schema = "p int primary key, v vector<float, 3>"
if is_scylla(cql):

View File

@@ -83,6 +83,25 @@ timeout_config make_query_timeout(std::chrono::seconds timeout) {
return cfg;
}
future<> do_with_vector_store_mock(std::function<future<>(cql_test_env&, vs_mock_server&)> func) {
auto server = co_await make_vs_mock_server();
auto cfg = make_config();
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
co_await do_with_cql_env(
[&](cql_test_env& env) -> future<> {
co_await create_test_table(env, "ks", "test");
auto& vs = env.local_qp().vector_store_client();
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
vs.start_background_tasks();
co_await func(env, *server);
},
cfg)
.finally(seastar::coroutine::lambda([&] -> future<> {
co_await server->stop();
}));
}
} // namespace
BOOST_AUTO_TEST_CASE(vector_store_client_test_ctor) {
@@ -1222,32 +1241,19 @@ SEASTAR_TEST_CASE(vector_store_client_abort_due_to_query_timeout) {
/// Verify that the HTTP error description from the vector store is propagated
/// through the CQL interface as part of the invalid_request_exception message.
SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) {
auto server = co_await make_vs_mock_server();
co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server& server) -> future<> {
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'");
auto cfg = make_config();
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
co_await do_with_cql_env(
[&](cql_test_env& env) -> future<> {
auto schema = co_await create_test_table(env, "ks", "test");
auto& vs = env.local_qp().vector_store_client();
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
vs.start_background_tasks();
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'");
// Configure mock to return 404 with a specific error message
server.next_ann_response({status_type::not_found, "index does not exist"});
// Configure mock to return 404 with a specific error message
server->next_ann_response({status_type::not_found, "index does not exist"});
BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"),
exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) {
auto msg = std::string(ex.what());
// Verify the error message contains both the HTTP status and the error description
return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos;
});
},
cfg)
.finally(seastar::coroutine::lambda([&] -> future<> {
co_await server->stop();
}));
BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"),
exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) {
auto msg = std::string(ex.what());
// Verify the error message contains both the HTTP status and the error description
return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos;
});
});
}
// Create a vector index with an additional filtering column.
@@ -1257,23 +1263,20 @@ SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description)
// ANN ordering by vector requires the column to be indexed using 'vector_index'.
// Reproduces SCYLLADB-635.
SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_column) {
auto server = co_await make_vs_mock_server();
co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> {
// Create a vector index on the embedding column, including ck1 for filtered ANN search support.
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'");
auto cfg = make_config();
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
co_await do_with_cql_env(
[&](cql_test_env& env) -> future<> {
auto schema = co_await create_test_table(env, "ks", "test");
auto& vs = env.local_qp().vector_store_client();
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
vs.start_background_tasks();
// Create a vector index on the embedding column, including ck1 for filtered ANN search support.
auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'");
BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
},
cfg)
.finally(seastar::coroutine::lambda([&] -> future<> {
co_await server->stop();
}));
BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
});
}
SEASTAR_TEST_CASE(vector_store_client_local_vector_index) {
co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> {
// Create a local vector index on the 'embedding' column.
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'");
BOOST_CHECK_NO_THROW(
co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
});
}