vector_search: fix SELECT on local vector index

Queries against local vector indexes were failing with the error:
"ANN ordering by vector requires the column to be indexed using 'vector_index'"

This was a regression introduced by 15788c3734, which incorrectly
assumed the first column in the targets list is always the vector column.
For local vector indexes, the first column is the partition key, causing
the failure.

Previously, serialization logic for the target index option was shared
between vector and secondary indexes. This is no longer viable due to
the introduction of local vector indexes and vector indexes with filtering
columns, which have  different target format.

This commit introduces a dedicated JSON-based serialization format for
vector index targets, identifying the target column (tc), filtering
columns (fc), and partition key columns (pk). This ensures unambiguous
serialization and deserialization for all vector index types.

This change is backward compatible for regular vector indexes. However,
it breaks compatibility for local vector indexes and vector indexes with
filtering columns created in version 2026.1.0. To mitigate this, usage
of these specific index types will be blocked in the 2026.1.0 release
by failing ANN queries against them in vector-store service.

Fixes: SCYLLADB-895
This commit is contained in:
Karol Nowacki
2026-03-24 05:40:04 +01:00
parent c0b78477a5
commit 6bc88e817f
9 changed files with 353 additions and 11 deletions

View File

@@ -1714,6 +1714,7 @@ deps['test/boost/combined_tests'] += [
'test/boost/tracing_test.cc',
'test/boost/user_function_test.cc',
'test/boost/user_types_test.cc',
'test/boost/vector_index_test.cc',
'test/boost/view_build_test.cc',
'test/boost/view_complex_test.cc',
'test/boost/view_schema_ckey_test.cc',

View File

@@ -8,6 +8,7 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include <boost/algorithm/string.hpp>
#include <seastar/core/coroutine.hh>
#include "create_index_statement.hh"
#include "db/config.hh"
@@ -37,6 +38,7 @@
#include "types/concrete_types.hh"
#include "db/tags/extension.hh"
#include "tombstone_gc_extension.hh"
#include "index/secondary_index.hh"
#include <stdexcept>
@@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
}
}
static bool is_vector_capable_class(const sstring& class_name) {
return boost::iequals(class_name, "vector_index");
}
static bool is_vector_index(const index_options_map& options) {
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
return class_it != options.end() && is_vector_capable_class(class_it->second);
}
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
const data_dictionary::database& db) const
{
@@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
_idx_properties->validate();
// FIXME: This is ugly and can be improved.
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
const bool uses_view_properties = _view_properties.properties()->count() > 0
|| _view_properties.use_compact_storage()
|| _view_properties.defined_ordering().size() > 0;
@@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
const index_options_map& options)
{
index_options_map new_options = options;
auto target_option = secondary_index::target_parser::serialize_targets(targets);
auto target_option = is_vector_index(options)
? secondary_index::vector_index::serialize_targets(targets)
: secondary_index::target_parser::serialize_targets(targets);
new_options.emplace(index_target::target_option_name, target_option);
const auto& first_target = targets.front()->value;

10
docs/dev/vector_index.md Normal file
View File

@@ -0,0 +1,10 @@
# Vector index in Scylla
Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses following format:
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.

View File

@@ -105,17 +105,79 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
};
sstring get_vector_index_target_column(const sstring& targets) {
std::optional<rjson::value> json_value = rjson::try_parse(targets);
if (!json_value || !json_value->IsObject()) {
return target_parser::get_target_column_name_from_string(targets);
static constexpr auto TC_TARGET_KEY = "tc";
static constexpr auto PK_TARGET_KEY = "pk";
static constexpr auto FC_TARGET_KEY = "fc";
// Serialize vector index targets into a format using:
// "tc" for the target (vector) column,
// "pk" for partition key columns (local index),
// "fc" for filtering columns.
// For a simple single-column vector index, returns just the column name.
// Examples:
// (v) -> "v"
// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]}
// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]}
// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}
sstring vector_index::serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) {
using cql3::statements::index_target;
if (targets.size() == 0) {
throw exceptions::invalid_request_exception("Vector index must have at least one target column");
}
rjson::value* pk = rjson::find(*json_value, "pk");
if (pk && pk->IsArray() && !pk->Empty()) {
return sstring(rjson::to_string_view(pk->GetArray()[0]));
if (targets.size() == 1) {
auto tc = targets[0]->value;
if (!std::holds_alternative<index_target::single_column>(tc)) {
throw exceptions::invalid_request_exception("Missing vector column target for local vector index");
}
return index_target::escape_target_column(*std::get<index_target::single_column>(tc));
}
return target_parser::get_target_column_name_from_string(targets);
const bool has_pk = std::holds_alternative<index_target::multiple_columns>(targets.front()->value);
const size_t tc_idx = has_pk ? 1 : 0;
const size_t fc_count = targets.size() - tc_idx - 1;
if (!std::holds_alternative<index_target::single_column>(targets[tc_idx]->value)) {
throw exceptions::invalid_request_exception("Vector index target column must be a single column");
}
rjson::value json_map = rjson::empty_object();
rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get<index_target::single_column>(targets[tc_idx]->value)->text()));
if (has_pk) {
rjson::value pk_json = rjson::empty_array();
for (const auto& col : std::get<index_target::multiple_columns>(targets.front()->value)) {
rjson::push_back(pk_json, rjson::from_string(col->text()));
}
rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json));
}
if (fc_count > 0) {
rjson::value fc_json = rjson::empty_array();
for (size_t i = tc_idx + 1; i < targets.size(); ++i) {
if (!std::holds_alternative<index_target::single_column>(targets[i]->value)) {
throw exceptions::invalid_request_exception("Vector index filtering column must be a single column");
}
rjson::push_back(fc_json, rjson::from_string(std::get<index_target::single_column>(targets[i]->value)->text()));
}
rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json));
}
return rjson::print(json_map);
}
sstring vector_index::get_target_column(const sstring& targets) {
std::optional<rjson::value> json_value = rjson::try_parse(targets);
if (!json_value || !json_value->IsObject()) {
return cql3::statements::index_target::column_name_from_target_string(targets);
}
rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
if (tc && tc->IsString()) {
return sstring(rjson::to_string_view(*tc));
}
return cql3::statements::index_target::column_name_from_target_string(targets);
}
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
@@ -346,7 +408,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
if (class_it != im.options().end() && target_it != im.options().end()) {
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name;
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_target_column(target_it->second) == target_name;
}
return false;
}

View File

@@ -37,6 +37,9 @@ public:
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
static void check_cdc_options(const schema& schema);
static sstring serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets);
static sstring get_target_column(const sstring& targets);
static bool is_rescoring_enabled(const index_options_map& properties);
static float get_oversampling(const index_options_map& properties);
static sstring get_cql_similarity_function_name(const index_options_map& properties);

View File

@@ -375,6 +375,7 @@ add_scylla_test(combined_tests
tracing_test.cc
user_function_test.cc
user_types_test.cc
vector_index_test.cc
view_build_test.cc
view_complex_test.cc
view_schema_ckey_test.cc

View File

@@ -0,0 +1,189 @@
/*
* Copyright (C) 2026-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <boost/test/unit_test.hpp>
#include "cql3/column_identifier.hh"
#include "cql3/statements/index_target.hh"
#include "exceptions/exceptions.hh"
#include "index/vector_index.hh"
#include "utils/rjson.hh"
using namespace secondary_index;
using namespace cql3;
using statements::index_target;
BOOST_AUTO_TEST_SUITE(vector_index_test)
namespace {
::shared_ptr<index_target> make_single(const sstring& name) {
auto col = ::make_shared<cql3::column_identifier>(name, true);
return ::make_shared<index_target>(col, index_target::target_type::regular_values);
}
::shared_ptr<index_target> make_multi(const std::vector<sstring>& names) {
std::vector<::shared_ptr<column_identifier>> cols;
cols.reserve(names.size());
for (const auto& n : names) {
cols.push_back(::make_shared<column_identifier>(n, true));
}
return ::make_shared<index_target>(std::move(cols), index_target::target_type::regular_values);
}
} // namespace
BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) {
std::vector<::shared_ptr<index_target>> targets;
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1", "p2"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_single_column) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("v"),
};
BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v");
}
BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("v"),
make_single("f1"),
make_single("f2"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
BOOST_REQUIRE(!rjson::find(json, "pk"));
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
const auto* fc = rjson::find(json, "fc");
BOOST_REQUIRE(fc);
BOOST_REQUIRE(fc->IsArray());
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
}
BOOST_AUTO_TEST_CASE(serialize_targets_local_index) {
std::vector<::shared_ptr<index_target>> targets = {
make_multi({"p1", "p2"}),
make_single("v"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
const auto* pk = rjson::find(json, "pk");
BOOST_REQUIRE(pk);
BOOST_REQUIRE(pk->IsArray());
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
BOOST_REQUIRE(!rjson::find(json, "fc"));
}
BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) {
std::vector<::shared_ptr<index_target>> targets = {
make_multi({"p1", "p2"}),
make_single("v"),
make_single("f1"),
make_single("f2"),
};
const auto result = vector_index::serialize_targets(targets);
const auto json = rjson::parse(result);
BOOST_REQUIRE(json.IsObject());
const auto* pk = rjson::find(json, "pk");
BOOST_REQUIRE(pk);
BOOST_REQUIRE(pk->IsArray());
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
auto tc = rjson::find(json, "tc");
BOOST_REQUIRE(tc);
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
const auto* fc = rjson::find(json, "fc");
BOOST_REQUIRE(fc);
BOOST_REQUIRE(fc->IsArray());
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
}
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1"}), make_multi({"c1"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) {
std::vector<::shared_ptr<index_target>> targets = {make_single("v"), make_multi({"c1"})};
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) {
std::vector<::shared_ptr<index_target>> targets = {make_single("MyCol")};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(serialized, "\"MyCol\"");
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) {
std::vector<::shared_ptr<index_target>> targets = {make_single("a\"b")};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\"");
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("MyCol"),
make_single("f1"),
};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
}
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) {
std::vector<::shared_ptr<index_target>> targets = {
make_single("a\"b"),
make_single("f1"),
};
const auto serialized = vector_index::serialize_targets(targets);
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -7,6 +7,7 @@
###############################################################################
import pytest
import json
from .util import new_test_table, is_scylla, unique_name
from cassandra.protocol import InvalidRequest, ConfigurationException
@@ -283,6 +284,45 @@ def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip
assert len(res) == 1
assert res[0].options['target'] == 'v'
# Test "target" option serialization for vector index with filtering columns.
def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p int primary key, v vector<float, 3>, f1 int, f2 int'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "fc": ["f1", "f2"]}
# Test "target" option serialization for local vector index.
def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, PRIMARY KEY ((p1, p2), c1, c2)'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"]}
# Test "target" option serialization for local vector index with filtering columns.
def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
schema = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)'
with new_test_table(cql, test_keyspace, schema) as table:
index_name = unique_name()
cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'")
res = [r for r in cql.execute('select * from system_schema.indexes')
if r.index_name == index_name]
assert len(res) == 1
assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]}
def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets):
schema = "p int primary key, v vector<float, 3>"
if is_scylla(cql):

View File

@@ -1277,3 +1277,26 @@ SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_col
co_await server->stop();
}));
}
SEASTAR_TEST_CASE(vector_store_client_local_vector_index) {
auto server = co_await make_vs_mock_server();
auto cfg = make_config();
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
co_await do_with_cql_env(
[&](cql_test_env& env) -> future<> {
auto schema = co_await create_test_table(env, "ks", "test");
auto& vs = env.local_qp().vector_store_client();
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
vs.start_background_tasks();
// Create a local vector index on the 'embedding' column.
auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'");
BOOST_CHECK_NO_THROW(
co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
},
cfg)
.finally(seastar::coroutine::lambda([&] -> future<> {
co_await server->stop();
}));
}