mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-22 09:30:45 +00:00
Merge '[Backport 2026.1] vector_search: fix SELECT on local vector index' from Scylladb[bot]
Queries against local vector indexes were failing with the error: ```ANN ordering by vector requires the column to be indexed using 'vector_index'``` This was a regression introduced by 15788c3734, which incorrectly assumed the first column in the targets list is always the vector column. For local vector indexes, the first column is the partition key, causing the failure. Previously, serialization logic for the target index option was shared between vector and secondary indexes. This is no longer viable due to the introduction of local vector indexes and vector indexes with filtering columns, which have different target formats. This commit introduces a dedicated JSON-based serialization format for vector index targets, identifying the target column (tc), filtering columns (fc), and partition key columns (pk). This ensures unambiguous serialization and deserialization for all vector index types. This change is backward compatible for regular vector indexes. However, it breaks compatibility for local vector indexes and vector indexes with filtering columns created in version 2026.1.0. To mitigate this, usage of these specific index types will be blocked in the 2026.1.0 release by failing ANN queries against them in the vector-store service. Fixes: SCYLLADB-1424 Backport to 2026.1 is required as this issue also occurs on this branch. - (cherry picked from commit 22e7ef46a7) Parent PR: #28862 Closes scylladb/scylladb#29383 * github.com:scylladb/scylladb: index: fix DESC INDEX for vector index vector_search: test: refactor boilerplate setup vector_search: fix SELECT on local vector index index: test: vector index target option serialization test index: test: secondary index target option serialization test
This commit is contained in:
@@ -1701,6 +1701,7 @@ deps['test/boost/combined_tests'] += [
|
||||
'test/boost/tracing_test.cc',
|
||||
'test/boost/user_function_test.cc',
|
||||
'test/boost/user_types_test.cc',
|
||||
'test/boost/vector_index_test.cc',
|
||||
'test/boost/view_build_test.cc',
|
||||
'test/boost/view_complex_test.cc',
|
||||
'test/boost/view_schema_ckey_test.cc',
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
|
||||
*/
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include "create_index_statement.hh"
|
||||
#include "db/config.hh"
|
||||
@@ -37,6 +38,7 @@
|
||||
#include "types/concrete_types.hh"
|
||||
#include "db/tags/extension.hh"
|
||||
#include "tombstone_gc_extension.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
@@ -116,6 +118,15 @@ static data_type type_for_computed_column(cql3::statements::index_target::target
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_vector_capable_class(const sstring& class_name) {
|
||||
return boost::iequals(class_name, "vector_index");
|
||||
}
|
||||
|
||||
static bool is_vector_index(const index_options_map& options) {
|
||||
auto class_it = options.find(db::index::secondary_index::custom_class_option_name);
|
||||
return class_it != options.end() && is_vector_capable_class(class_it->second);
|
||||
}
|
||||
|
||||
view_ptr create_index_statement::create_view_for_index(const schema_ptr schema, const index_metadata& im,
|
||||
const data_dictionary::database& db) const
|
||||
{
|
||||
@@ -266,7 +277,7 @@ create_index_statement::validate(query_processor& qp, const service::client_stat
|
||||
_idx_properties->validate();
|
||||
|
||||
// FIXME: This is ugly and can be improved.
|
||||
const bool is_vector_index = _idx_properties->custom_class && *_idx_properties->custom_class == "vector_index";
|
||||
const bool is_vector_index = _idx_properties->custom_class && is_vector_capable_class(*_idx_properties->custom_class);
|
||||
const bool uses_view_properties = _view_properties.properties()->count() > 0
|
||||
|| _view_properties.use_compact_storage()
|
||||
|| _view_properties.defined_ordering().size() > 0;
|
||||
@@ -697,7 +708,9 @@ index_metadata create_index_statement::make_index_metadata(const std::vector<::s
|
||||
const index_options_map& options)
|
||||
{
|
||||
index_options_map new_options = options;
|
||||
auto target_option = secondary_index::target_parser::serialize_targets(targets);
|
||||
auto target_option = is_vector_index(options)
|
||||
? secondary_index::vector_index::serialize_targets(targets)
|
||||
: secondary_index::target_parser::serialize_targets(targets);
|
||||
new_options.emplace(index_target::target_option_name, target_option);
|
||||
|
||||
const auto& first_target = targets.front()->value;
|
||||
|
||||
@@ -37,8 +37,17 @@ Global index's target is usually just the indexed column name, unless the index
|
||||
- index on map, set or list values: VALUES(v)
|
||||
- index on map entries: ENTRIES(v)
|
||||
|
||||
Their serialization is just string representation, so:
|
||||
"v", "FULL(v)", "KEYS(v)", "VALUES(v)", "ENTRIES(v)" are all valid targets.
|
||||
Their serialization uses lowercase type names as prefixes, except for `full` which is serialized
|
||||
as just the column name (without any prefix):
|
||||
`"v"`, `"keys(v)"`, `"values(v)"`, `"entries(v)"` are valid targets; a frozen full collection
|
||||
index on column `v` is stored simply as `"v"` (same as a regular index).
|
||||
|
||||
If the column name contains characters that could be confused with the above formats
|
||||
(e.g., a name containing parentheses or braces), it is escaped using the CQL
|
||||
quoted-identifier syntax (column_identifier::to_cql_string()), which wraps the
|
||||
name in double quotes and doubles any embedded double-quote characters. For example,
|
||||
a column named `hEllo` is stored as `"hEllo"`, and a column named `keys(m)` is
|
||||
stored as `"keys(m)"`.
|
||||
|
||||
## Local index
|
||||
|
||||
|
||||
10
docs/dev/vector_index.md
Normal file
10
docs/dev/vector_index.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Vector index in Scylla
|
||||
|
||||
Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option in `system_schema.indexes` uses the following format:
|
||||
|
||||
- Simple single-column vector index `(v)`: just the (escaped) column name, e.g. `v`
|
||||
- Vector index with filtering columns `(v, f1, f2)`: JSON with `tc` (target column) and `fc` (filtering columns): `{"tc":"v","fc":["f1","f2"]}`
|
||||
- Local vector index `((p1, p2), v)`: JSON with `tc` and `pk` (partition key columns): `{"tc":"v","pk":["p1","p2"]}`
|
||||
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
|
||||
|
||||
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "types/concrete_types.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
@@ -103,19 +104,122 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
|
||||
{"oversampling", std::bind_front(validate_factor_option, 1.0f, 100.0f)},
|
||||
// 'rescoring' enables recalculating of similarity scores of candidates retrieved from vector store when quantization is used.
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
};
|
||||
};
|
||||
|
||||
sstring get_vector_index_target_column(const sstring& targets) {
|
||||
static constexpr auto TC_TARGET_KEY = "tc";
|
||||
static constexpr auto PK_TARGET_KEY = "pk";
|
||||
static constexpr auto FC_TARGET_KEY = "fc";
|
||||
|
||||
// Convert a serialized targets string (as produced by serialize_targets())
|
||||
// back into the CQL column list used inside CREATE INDEX ... ON table(<here>).
|
||||
//
|
||||
// JSON examples:
|
||||
// {"tc":"v","fc":["f1","f2"]} -> "v, f1, f2"
|
||||
// {"tc":"v","pk":["p1","p2"]} -> "(p1, p2), v"
|
||||
// {"tc":"v","pk":["p1","p2"],"fc":["f1"]} -> "(p1, p2), v, f1"
|
||||
static sstring targets_to_cql(const sstring& targets) {
|
||||
std::optional<rjson::value> json_value = rjson::try_parse(targets);
|
||||
if (!json_value || !json_value->IsObject()) {
|
||||
return target_parser::get_target_column_name_from_string(targets);
|
||||
return cql3::util::maybe_quote(cql3::statements::index_target::column_name_from_target_string(targets));
|
||||
}
|
||||
|
||||
rjson::value* pk = rjson::find(*json_value, "pk");
|
||||
sstring result;
|
||||
|
||||
const rjson::value* pk = rjson::find(*json_value, PK_TARGET_KEY);
|
||||
if (pk && pk->IsArray() && !pk->Empty()) {
|
||||
return sstring(rjson::to_string_view(pk->GetArray()[0]));
|
||||
result += "(";
|
||||
auto pk_cols = std::views::all(pk->GetArray()) | std::views::transform([&](const rjson::value& col) {
|
||||
return cql3::util::maybe_quote(sstring(rjson::to_string_view(col)));
|
||||
}) | std::ranges::to<std::vector<sstring>>();
|
||||
result += boost::algorithm::join(pk_cols, ", ");
|
||||
result += "), ";
|
||||
}
|
||||
return target_parser::get_target_column_name_from_string(targets);
|
||||
|
||||
const rjson::value* tc = rjson::find(*json_value, TC_TARGET_KEY);
|
||||
if (tc && tc->IsString()) {
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view(*tc)));
|
||||
}
|
||||
|
||||
const rjson::value* fc = rjson::find(*json_value, FC_TARGET_KEY);
|
||||
if (fc && fc->IsArray()) {
|
||||
for (rapidjson::SizeType i = 0; i < fc->Size(); ++i) {
|
||||
result += ", ";
|
||||
result += cql3::util::maybe_quote(sstring(rjson::to_string_view((*fc)[i])));
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Serialize vector index targets into a format using:
|
||||
// "tc" for the target (vector) column,
|
||||
// "pk" for partition key columns (local index),
|
||||
// "fc" for filtering columns.
|
||||
// For a simple single-column vector index, returns just the column name.
|
||||
// Examples:
|
||||
// (v) -> "v"
|
||||
// (v, f1, f2) -> {"tc":"v","fc":["f1","f2"]}
|
||||
// ((p1, p2), v) -> {"tc":"v","pk":["p1","p2"]}
|
||||
// ((p1, p2), v, f1, f2) -> {"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}
|
||||
sstring vector_index::serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets) {
|
||||
using cql3::statements::index_target;
|
||||
|
||||
if (targets.size() == 0) {
|
||||
throw exceptions::invalid_request_exception("Vector index must have at least one target column");
|
||||
}
|
||||
|
||||
if (targets.size() == 1) {
|
||||
auto tc = targets[0]->value;
|
||||
if (!std::holds_alternative<index_target::single_column>(tc)) {
|
||||
throw exceptions::invalid_request_exception("Missing vector column target for local vector index");
|
||||
}
|
||||
return index_target::escape_target_column(*std::get<index_target::single_column>(tc));
|
||||
}
|
||||
|
||||
const bool has_pk = std::holds_alternative<index_target::multiple_columns>(targets.front()->value);
|
||||
const size_t tc_idx = has_pk ? 1 : 0;
|
||||
const size_t fc_count = targets.size() - tc_idx - 1;
|
||||
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[tc_idx]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index target column must be a single column");
|
||||
}
|
||||
|
||||
rjson::value json_map = rjson::empty_object();
|
||||
rjson::add_with_string_name(json_map, TC_TARGET_KEY, rjson::from_string(std::get<index_target::single_column>(targets[tc_idx]->value)->text()));
|
||||
|
||||
if (has_pk) {
|
||||
rjson::value pk_json = rjson::empty_array();
|
||||
for (const auto& col : std::get<index_target::multiple_columns>(targets.front()->value)) {
|
||||
rjson::push_back(pk_json, rjson::from_string(col->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, PK_TARGET_KEY, std::move(pk_json));
|
||||
}
|
||||
|
||||
if (fc_count > 0) {
|
||||
rjson::value fc_json = rjson::empty_array();
|
||||
for (size_t i = tc_idx + 1; i < targets.size(); ++i) {
|
||||
if (!std::holds_alternative<index_target::single_column>(targets[i]->value)) {
|
||||
throw exceptions::invalid_request_exception("Vector index filtering column must be a single column");
|
||||
}
|
||||
rjson::push_back(fc_json, rjson::from_string(std::get<index_target::single_column>(targets[i]->value)->text()));
|
||||
}
|
||||
rjson::add_with_string_name(json_map, FC_TARGET_KEY, std::move(fc_json));
|
||||
}
|
||||
|
||||
return rjson::print(json_map);
|
||||
}
|
||||
|
||||
// Extract the target (vector) column name from a serialized `target` option.
//
// When `targets` parses as a JSON object holding a string under the "tc" key,
// that string is returned. In every other case (not JSON, not an object, or no
// string "tc" member) the value is handed to
// index_target::column_name_from_target_string() and its result is returned.
sstring vector_index::get_target_column(const sstring& targets) {
    if (auto parsed = rjson::try_parse(targets); parsed && parsed->IsObject()) {
        const rjson::value* target_column = rjson::find(*parsed, TC_TARGET_KEY);
        if (target_column && target_column->IsString()) {
            return sstring(rjson::to_string_view(*target_column));
        }
    }
    return cql3::statements::index_target::column_name_from_target_string(targets);
}
|
||||
|
||||
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
|
||||
@@ -147,9 +251,8 @@ bool vector_index::view_should_exist() const {
|
||||
|
||||
std::optional<cql3::description> vector_index::describe(const index_metadata& im, const schema& base_schema) const {
|
||||
fragmented_ostringstream os;
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON "
|
||||
<< cql3::util::maybe_quote(base_schema.ks_name()) << "." << cql3::util::maybe_quote(base_schema.cf_name())
|
||||
<< "(" << cql3::util::maybe_quote(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
os << "CREATE CUSTOM INDEX " << cql3::util::maybe_quote(im.name()) << " ON " << cql3::util::maybe_quote(base_schema.ks_name()) << "."
|
||||
<< cql3::util::maybe_quote(base_schema.cf_name()) << "(" << targets_to_cql(im.options().at(cql3::statements::index_target::target_option_name)) << ")"
|
||||
<< " USING 'vector_index'";
|
||||
|
||||
return cql3::description{
|
||||
@@ -346,7 +449,7 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst
|
||||
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != im.options().end() && target_it != im.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name;
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_target_column(target_it->second) == target_name;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -37,6 +37,9 @@ public:
|
||||
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
|
||||
static void check_cdc_options(const schema& schema);
|
||||
|
||||
static sstring serialize_targets(const std::vector<::shared_ptr<cql3::statements::index_target>>& targets);
|
||||
static sstring get_target_column(const sstring& targets);
|
||||
|
||||
static bool is_rescoring_enabled(const index_options_map& properties);
|
||||
static float get_oversampling(const index_options_map& properties);
|
||||
static sstring get_cql_similarity_function_name(const index_options_map& properties);
|
||||
|
||||
@@ -378,6 +378,7 @@ add_scylla_test(combined_tests
|
||||
tracing_test.cc
|
||||
user_function_test.cc
|
||||
user_types_test.cc
|
||||
vector_index_test.cc
|
||||
view_build_test.cc
|
||||
view_complex_test.cc
|
||||
view_schema_ckey_test.cc
|
||||
|
||||
189
test/boost/vector_index_test.cc
Normal file
189
test/boost/vector_index_test.cc
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include "cql3/column_identifier.hh"
|
||||
#include "cql3/statements/index_target.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "index/vector_index.hh"
|
||||
#include "utils/rjson.hh"
|
||||
|
||||
using namespace secondary_index;
|
||||
using namespace cql3;
|
||||
|
||||
using statements::index_target;
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(vector_index_test)
|
||||
|
||||
namespace {
|
||||
|
||||
::shared_ptr<index_target> make_single(const sstring& name) {
|
||||
auto col = ::make_shared<cql3::column_identifier>(name, true);
|
||||
return ::make_shared<index_target>(col, index_target::target_type::regular_values);
|
||||
}
|
||||
|
||||
::shared_ptr<index_target> make_multi(const std::vector<sstring>& names) {
|
||||
std::vector<::shared_ptr<column_identifier>> cols;
|
||||
cols.reserve(names.size());
|
||||
for (const auto& n : names) {
|
||||
cols.push_back(::make_shared<column_identifier>(n, true));
|
||||
}
|
||||
return ::make_shared<index_target>(std::move(cols), index_target::target_type::regular_values);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_empty_targets_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_single_pk_only_target_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1", "p2"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_single_column) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("v"),
|
||||
};
|
||||
BOOST_CHECK_EQUAL(vector_index::serialize_targets(targets), "v");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_with_filtering_columns) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("v"),
|
||||
make_single("f1"),
|
||||
make_single("f2"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
BOOST_REQUIRE(!rjson::find(json, "pk"));
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
const auto* fc = rjson::find(json, "fc");
|
||||
BOOST_REQUIRE(fc);
|
||||
BOOST_REQUIRE(fc->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_local_index) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_multi({"p1", "p2"}),
|
||||
make_single("v"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
const auto* pk = rjson::find(json, "pk");
|
||||
BOOST_REQUIRE(pk);
|
||||
BOOST_REQUIRE(pk->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
BOOST_REQUIRE(!rjson::find(json, "fc"));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_local_index_with_filtering_columns) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_multi({"p1", "p2"}),
|
||||
make_single("v"),
|
||||
make_single("f1"),
|
||||
make_single("f2"),
|
||||
};
|
||||
|
||||
const auto result = vector_index::serialize_targets(targets);
|
||||
const auto json = rjson::parse(result);
|
||||
|
||||
BOOST_REQUIRE(json.IsObject());
|
||||
|
||||
const auto* pk = rjson::find(json, "pk");
|
||||
BOOST_REQUIRE(pk);
|
||||
BOOST_REQUIRE(pk->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(pk->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[0])), "p1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(pk->GetArray()[1])), "p2");
|
||||
|
||||
auto tc = rjson::find(json, "tc");
|
||||
BOOST_REQUIRE(tc);
|
||||
BOOST_CHECK_EQUAL(rjson::to_string_view(*tc), "v");
|
||||
|
||||
const auto* fc = rjson::find(json, "fc");
|
||||
BOOST_REQUIRE(fc);
|
||||
BOOST_REQUIRE(fc->IsArray());
|
||||
BOOST_REQUIRE_EQUAL(fc->Size(), 2);
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[0])), "f1");
|
||||
BOOST_CHECK_EQUAL(sstring(rjson::to_string_view(fc->GetArray()[1])), "f2");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_target_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_multi({"p1"}), make_multi({"c1"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(serialize_targets_multi_column_filtering_throws) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("v"), make_multi({"c1"})};
|
||||
BOOST_CHECK_THROW(vector_index::serialize_targets(targets), exceptions::invalid_request_exception);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_escaped_string) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("MyCol")};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(serialized, "\"MyCol\"");
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_escaped_string) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {make_single("a\"b")};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(serialized, "\"a\"\"b\"");
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_uppercase_letters_from_json) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("MyCol"),
|
||||
make_single("f1"),
|
||||
};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "MyCol");
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(get_target_column_returns_column_with_quotes_from_json) {
|
||||
std::vector<::shared_ptr<index_target>> targets = {
|
||||
make_single("a\"b"),
|
||||
make_single("f1"),
|
||||
};
|
||||
const auto serialized = vector_index::serialize_targets(targets);
|
||||
|
||||
BOOST_CHECK_EQUAL(vector_index::get_target_column(serialized), "a\"b");
|
||||
}
|
||||
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
# Tests for secondary indexes
|
||||
|
||||
import json
|
||||
import random
|
||||
import itertools
|
||||
import time
|
||||
@@ -2031,6 +2032,82 @@ def test_index_in_system_schema_indexes(cql, built_index):
|
||||
assert res[0].kind == 'COMPOSITES'
|
||||
assert res[0].options == {'target': 'v'}
|
||||
|
||||
# Check the "target" option stored in system_schema.indexes for secondary
# indexes created on collection columns (keys/values/entries/full).
# The stored format matters for backward compatibility: it is read from
# disk on startup to rebuild indexes, so an incompatible change would stop
# existing indexes from being recreated after an upgrade.
def test_global_collection_index_target_serialization(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, m map<int,int>, fl frozen<list<int>>") as table:
        # Maps the expression used in CREATE INDEX to the expected stored target.
        # Scylla stores full(col) as just the column name, while Cassandra
        # keeps the full(col) prefix.
        expected_targets = {
            "keys(m)": "keys(m)",
            "values(m)": "values(m)",
            "entries(m)": "entries(m)",
            "full(fl)": "fl" if is_scylla(cql) else "full(fl)",
        }
        for index_expr, expected_target in expected_targets.items():
            index_name = unique_name()
            cql.execute(f"CREATE INDEX {index_name} ON {table}({index_expr})")
            wait_for_index(cql, test_keyspace, index_name)

            res = list(filter(lambda row: row.index_name == index_name,
                              cql.execute('select * from system_schema.indexes')))

            assert len(res) == 1
            assert res[0].kind == 'COMPOSITES'
            assert res[0].options == {'target': expected_target}, \
                f"For index expression '{index_expr}': expected target '{expected_target}', got '{res[0].options}'"
|
||||
|
||||
# Check the "target" option stored in system_schema.indexes when the indexed
# column name needs CQL quoting (mixed case, a space, or text that looks like
# a target prefix such as "keys(m)").
# The stored value is the CQL quoted-identifier form, e.g. column "hEllo"
# is stored as '"hEllo"'. An incompatible change here would break index
# lookup after an upgrade.
def test_global_index_target_serialization_quoted_names(cql, test_keyspace):
    quoted_names = ['"hEllo"', '"x y"', '"keys(m)"']
    column_defs = [name + " int" for name in quoted_names]
    schema = ', '.join(['p int PRIMARY KEY'] + column_defs)
    with new_test_table(cql, test_keyspace, schema) as table:
        for name in quoted_names:
            index_name = unique_name()
            cql.execute(f"CREATE INDEX {index_name} ON {table}({name})")
            wait_for_index(cql, test_keyspace, index_name)

            res = list(filter(lambda row: row.index_name == index_name,
                              cql.execute('select * from system_schema.indexes')))

            assert len(res) == 1
            assert res[0].kind == 'COMPOSITES'
            # The stored target equals the quoted name exactly as it was
            # written in the CREATE INDEX statement.
            assert res[0].options == {'target': name}, \
                f"For column {name}: got target '{res[0].options}'"
|
||||
|
||||
# Check the "target" option stored in system_schema.indexes for a local
# secondary index. The stored format matters for backward compatibility:
# it is read from disk on startup to rebuild indexes, so an incompatible
# change would stop existing local indexes from being recreated after an
# upgrade.
def test_local_index_target_serialization(cql, test_keyspace, scylla_only):
    with new_test_table(cql, test_keyspace, "a int, b int, c int, v int, PRIMARY KEY ((a, b), c)") as table:
        index_name = unique_name()
        cql.execute(f"CREATE INDEX {index_name} ON {table}((a, b), v)")
        wait_for_index(cql, test_keyspace, index_name)

        matching = list(filter(lambda row: row.index_name == index_name,
                               cql.execute('select * from system_schema.indexes')))

        assert len(matching) == 1
        # A local secondary index stores its target as a JSON document.
        stored_target = json.loads(matching[0].options['target'])
        assert stored_target == {"pk": ["a", "b"], "ck": ["v"]}
|
||||
|
||||
# Test index representation in REST API
|
||||
def test_index_in_API(cql, test_keyspace):
|
||||
with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, v int") as table:
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
###############################################################################
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from .util import new_test_table, is_scylla, unique_name
|
||||
from cassandra.protocol import InvalidRequest, ConfigurationException
|
||||
|
||||
@@ -199,6 +200,36 @@ def test_describe_custom_index(cql, test_keyspace, skip_without_tablets):
|
||||
assert f"CREATE CUSTOM INDEX custom ON {table}{maybe_space}(v1) USING '{custom_class}'" in a_desc
|
||||
assert f"CREATE CUSTOM INDEX custom1 ON {table}{maybe_space}(v2) USING '{custom_class}'" in b_desc
|
||||
|
||||
# DESC INDEX of a vector index with filtering columns must list the vector
# column followed by the filtering columns.
def test_describe_vector_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
    with new_test_table(cql, test_keyspace, 'p int primary key, v vector<float, 3>, f1 int, f2 int') as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'")

        described = cql.execute(f"DESC INDEX {test_keyspace}.{index_name}").one()
        expected = f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'"

        assert expected in described.create_statement
|
||||
|
||||
# DESC INDEX of a local vector index must list the parenthesized partition
# key columns followed by the vector column.
def test_describe_vector_index_local(cql, test_keyspace, scylla_only, skip_without_tablets):
    with new_test_table(cql, test_keyspace, 'p1 int, p2 int, c int, v vector<float, 3>, PRIMARY KEY ((p1, p2), c)') as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'")

        described = cql.execute(f"DESC INDEX {test_keyspace}.{index_name}").one()
        expected = f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'"

        assert expected in described.create_statement
|
||||
|
||||
# DESC INDEX of a local vector index with filtering columns must list the
# parenthesized partition key columns, the vector column, then the
# filtering columns.
def test_describe_vector_index_local_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
    table_schema = 'p1 int, p2 int, c int, v vector<float, 3>, f1 text, f2 text, PRIMARY KEY ((p1, p2), c)'
    with new_test_table(cql, test_keyspace, table_schema) as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'")

        described = cql.execute(f"DESC INDEX {test_keyspace}.{index_name}").one()
        expected = f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'"

        assert expected in described.create_statement
|
||||
|
||||
|
||||
def test_vector_index_version_on_recreate(cql, test_keyspace, scylla_only, skip_without_tablets):
|
||||
schema = 'p int primary key, v vector<float, 3>'
|
||||
@@ -264,6 +295,64 @@ def test_vector_index_version_fail_given_as_option(cql, test_keyspace, scylla_on
|
||||
with pytest.raises(InvalidRequest, match="Cannot specify index_version as a CUSTOM option"):
|
||||
cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index' WITH OPTIONS = {{'index_version': '18ad2003-05ea-17d9-1855-0325ac0a755d'}}")
|
||||
|
||||
# Check the "target" option stored in system_schema.indexes for a vector
# index on a single vector column. The stored format matters for backward
# compatibility: it is read from disk on startup to rebuild indexes, so an
# incompatible change would stop existing vector indexes from being
# recreated after an upgrade.
# The option is also an interface with the vector-store service, which uses
# it to identify the target column.
def test_vector_index_target_serialization(cql, test_keyspace, scylla_only, skip_without_tablets):
    with new_test_table(cql, test_keyspace, 'p int primary key, v vector<float, 3>') as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v) USING 'vector_index'")

        matching = list(filter(lambda row: row.index_name == index_name,
                               cql.execute('select * from system_schema.indexes')))

        assert len(matching) == 1
        assert matching[0].options['target'] == 'v'
|
||||
|
||||
# Check the serialized "target" option of a vector index that lists extra
# filtering columns: it is a JSON document naming the target column ("tc")
# and the filtering columns ("fc").
def test_vector_index_target_serialization_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
    table_columns = 'p int primary key, v vector<float, 3>, f1 int, f2 int'
    with new_test_table(cql, test_keyspace, table_columns) as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}(v, f1, f2) USING 'vector_index'")

        # Scan every index row and keep only the ones for the index
        # created above.
        matching_rows = []
        for row in cql.execute('select * from system_schema.indexes'):
            if row.index_name == index_name:
                matching_rows.append(row)

        assert len(matching_rows) == 1
        # Compare parsed JSON rather than raw text, so the check does not
        # depend on key order or whitespace in the stored string.
        stored_target = json.loads(matching_rows[0].options['target'])
        assert stored_target == {"tc": "v", "fc": ["f1", "f2"]}
|
||||
|
||||
# Check the serialized "target" option of a local vector index: it is a
# JSON document naming the target column ("tc") and the partition key
# columns ("pk").
def test_vector_index_target_serialization_local_index(cql, test_keyspace, scylla_only, skip_without_tablets):
    table_columns = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, PRIMARY KEY ((p1, p2), c1, c2)'
    with new_test_table(cql, test_keyspace, table_columns) as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v) USING 'vector_index'")

        # Scan every index row and keep only the ones for the index
        # created above.
        matching_rows = []
        for row in cql.execute('select * from system_schema.indexes'):
            if row.index_name == index_name:
                matching_rows.append(row)

        assert len(matching_rows) == 1
        # Compare parsed JSON rather than raw text, so the check does not
        # depend on key order or whitespace in the stored string.
        stored_target = json.loads(matching_rows[0].options['target'])
        assert stored_target == {"tc": "v", "pk": ["p1", "p2"]}
|
||||
|
||||
# Check the serialized "target" option of a local vector index that also
# lists filtering columns: the JSON document names the target column
# ("tc"), the partition key columns ("pk") and the filtering columns ("fc").
def test_vector_index_target_serialization_local_index_with_filtering_columns(cql, test_keyspace, scylla_only, skip_without_tablets):
    table_columns = 'p1 int, p2 int, c1 int, c2 int, v vector<float, 3>, f1 text, f2 text, PRIMARY KEY ((p1, p2), c1, c2)'
    with new_test_table(cql, test_keyspace, table_columns) as table:
        index_name = unique_name()
        cql.execute(f"CREATE CUSTOM INDEX {index_name} ON {table}((p1, p2), v, f1, f2) USING 'vector_index'")

        # Scan every index row and keep only the ones for the index
        # created above.
        matching_rows = []
        for row in cql.execute('select * from system_schema.indexes'):
            if row.index_name == index_name:
                matching_rows.append(row)

        assert len(matching_rows) == 1
        # Compare parsed JSON rather than raw text, so the check does not
        # depend on key order or whitespace in the stored string.
        stored_target = json.loads(matching_rows[0].options['target'])
        assert stored_target == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]}
|
||||
|
||||
def test_one_vector_index_on_column(cql, test_keyspace, skip_without_tablets):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
if is_scylla(cql):
|
||||
|
||||
@@ -83,6 +83,25 @@ timeout_config make_query_timeout(std::chrono::seconds timeout) {
|
||||
return cfg;
|
||||
}
|
||||
|
||||
/// Run `func` inside a CQL test environment that is wired to a freshly
/// started vector-store mock server.
///
/// Before `func` is invoked the helper points `vector_store_primary_uri` at
/// the mock, creates the `ks.test` table, maps the "server.node" host name to
/// the mock's host and starts the vector store client's background tasks.
/// The mock server is stopped once the environment future resolves.
future<> do_with_vector_store_mock(std::function<future<>(cql_test_env&, vs_mock_server&)> func) {
    auto server = co_await make_vs_mock_server();

    auto config = make_config();
    config.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));

    // Keep the environment run and the cleanup in one co_await expression so
    // that the temporary lambdas stay alive until the whole chain completes.
    co_await do_with_cql_env(
            [&](cql_test_env& env) -> future<> {
                co_await create_test_table(env, "ks", "test");

                // Route the URI's host name to the mock before the client starts working.
                auto& client = env.local_qp().vector_store_client();
                configure(client).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
                client.start_background_tasks();

                co_await func(env, *server);
            },
            config)
            .finally(seastar::coroutine::lambda([&] -> future<> {
                co_await server->stop();
            }));
}
|
||||
|
||||
} // namespace
|
||||
|
||||
BOOST_AUTO_TEST_CASE(vector_store_client_test_ctor) {
|
||||
@@ -1222,32 +1241,19 @@ SEASTAR_TEST_CASE(vector_store_client_abort_due_to_query_timeout) {
|
||||
/// Verify that the HTTP error description from the vector store is propagated
|
||||
/// through the CQL interface as part of the invalid_request_exception message.
|
||||
SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description) {
|
||||
auto server = co_await make_vs_mock_server();
|
||||
co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server& server) -> future<> {
|
||||
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'");
|
||||
|
||||
auto cfg = make_config();
|
||||
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
|
||||
co_await do_with_cql_env(
|
||||
[&](cql_test_env& env) -> future<> {
|
||||
auto schema = co_await create_test_table(env, "ks", "test");
|
||||
auto& vs = env.local_qp().vector_store_client();
|
||||
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
|
||||
vs.start_background_tasks();
|
||||
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding) USING 'vector_index'");
|
||||
// Configure mock to return 404 with a specific error message
|
||||
server.next_ann_response({status_type::not_found, "index does not exist"});
|
||||
|
||||
// Configure mock to return 404 with a specific error message
|
||||
server->next_ann_response({status_type::not_found, "index does not exist"});
|
||||
|
||||
BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"),
|
||||
exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) {
|
||||
auto msg = std::string(ex.what());
|
||||
// Verify the error message contains both the HTTP status and the error description
|
||||
return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos;
|
||||
});
|
||||
},
|
||||
cfg)
|
||||
.finally(seastar::coroutine::lambda([&] -> future<> {
|
||||
co_await server->stop();
|
||||
}));
|
||||
BOOST_CHECK_EXCEPTION(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"),
|
||||
exceptions::invalid_request_exception, [](const exceptions::invalid_request_exception& ex) {
|
||||
auto msg = std::string(ex.what());
|
||||
// Verify the error message contains both the HTTP status and the error description
|
||||
return msg.find("404") != std::string::npos && msg.find("index does not exist") != std::string::npos;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Create a vector index with an additional filtering column.
|
||||
@@ -1257,23 +1263,20 @@ SEASTAR_TEST_CASE(vector_store_client_cql_error_contains_http_error_description)
|
||||
// ANN ordering by vector requires the column to be indexed using 'vector_index'.
|
||||
// Reproduces SCYLLADB-635.
|
||||
SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_column) {
|
||||
auto server = co_await make_vs_mock_server();
|
||||
co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> {
|
||||
// Create a vector index on the embedding column, including ck1 for filtered ANN search support.
|
||||
co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'");
|
||||
|
||||
auto cfg = make_config();
|
||||
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
|
||||
co_await do_with_cql_env(
|
||||
[&](cql_test_env& env) -> future<> {
|
||||
auto schema = co_await create_test_table(env, "ks", "test");
|
||||
auto& vs = env.local_qp().vector_store_client();
|
||||
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
|
||||
vs.start_background_tasks();
|
||||
// Create a vector index on the embedding column, including ck1 for filtered ANN search support.
|
||||
auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'");
|
||||
|
||||
BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
|
||||
},
|
||||
cfg)
|
||||
.finally(seastar::coroutine::lambda([&] -> future<> {
|
||||
co_await server->stop();
|
||||
}));
|
||||
BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
|
||||
});
|
||||
}
|
||||
|
||||
/// Verify that an ANN query restricted to one partition does not throw when
/// the table has a local vector index, i.e. an index whose column list starts
/// with the partition key tuple followed by the vector column.
SEASTAR_TEST_CASE(vector_store_client_local_vector_index) {
    co_await do_with_vector_store_mock([](cql_test_env& env, vs_mock_server&) -> future<> {
        // Local vector index on 'embedding': the partition key columns come first.
        const auto create_local_index = "CREATE CUSTOM INDEX idx ON ks.test ((pk1, pk2), embedding) USING 'vector_index'";
        co_await env.execute_cql(create_local_index);

        // The query pins both partition key columns and orders by the indexed vector.
        BOOST_CHECK_NO_THROW(
                co_await env.execute_cql("SELECT * FROM ks.test WHERE pk1 = 1 AND pk2 = 2 ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
    });
}
|
||||
|
||||
Reference in New Issue
Block a user