mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 00:20:47 +00:00
index: fix vector index with filtering target column
The secondary index mechanism is currently used to determine the target column. This mechanism works incorrectly for vector indexes with filtering columns because it returns the last specified column as the target (vectors) column. However, the syntax for a vector index requires the first column to be the target: ``` CREATE CUSTOM INDEX ON t(vectors, users) USING 'vector_index'; ``` This discrepancy eventually leads to the following exception when performing an ANN search on a vector index with filtering columns: ``` ANN ordering by vector requires the column to be indexed using 'vector_index' ``` This commit fixes the issue by introducing dedicated logic for vector indexes to correctly identify the target (vectors) column. Fixes: SCYLLADB-635
This commit is contained in:
@@ -2004,9 +2004,7 @@ static std::optional<ann_ordering_info> get_ann_ordering_info(
|
||||
|
||||
auto indexes = sim.list_indexes();
|
||||
auto it = std::find_if(indexes.begin(), indexes.end(), [&prepared_ann_ordering](const auto& ind) {
|
||||
return (ind.metadata().options().contains(db::index::secondary_index::custom_class_option_name) &&
|
||||
ind.metadata().options().at(db::index::secondary_index::custom_class_option_name) == ANN_CUSTOM_INDEX_OPTION) &&
|
||||
(ind.target_column() == prepared_ann_ordering.first->name_as_text());
|
||||
return secondary_index::vector_index::is_vector_index_on_column(ind.metadata(), prepared_ann_ordering.first->name_as_text());
|
||||
});
|
||||
|
||||
if (it == indexes.end()) {
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "index/vector_index.hh"
|
||||
#include "index/secondary_index.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
@@ -104,6 +105,19 @@ const static std::unordered_map<sstring, std::function<void(const sstring&, cons
|
||||
{"rescoring", std::bind_front(validate_enumerated_option, boolean_values)},
|
||||
};
|
||||
|
||||
/// Extracts the name of the indexed (vectors) column from a serialized index target.
///
/// When `targets` parses as a JSON object whose "pk" member is a non-empty
/// array starting with a string, that first element is returned. Every other
/// input (not JSON, not an object, missing/empty "pk", non-string first
/// element) is delegated to target_parser::get_target_column_name_from_string().
sstring get_vector_index_target_column(const sstring& targets) {
    std::optional<rjson::value> json_value = rjson::try_parse(targets);
    if (json_value && json_value->IsObject()) {
        rjson::value* pk = rjson::find(*json_value, "pk");
        // Only read the first element as text once it is known to be a string;
        // otherwise fall through to the generic parser like other malformed inputs.
        if (pk && pk->IsArray() && !pk->Empty() && pk->GetArray()[0].IsString()) {
            return sstring(rjson::to_string_view(pk->GetArray()[0]));
        }
    }
    return target_parser::get_target_column_name_from_string(targets);
}
|
||||
|
||||
bool vector_index::is_rescoring_enabled(const index_options_map& properties) {
|
||||
auto q = properties.find("quantization");
|
||||
auto r = properties.find("rescoring");
|
||||
@@ -320,16 +334,23 @@ bool vector_index::has_vector_index(const schema& s) {
|
||||
|
||||
bool vector_index::has_vector_index_on_column(const schema& s, const sstring& target_name) {
|
||||
for (const auto& index : s.indices()) {
|
||||
auto class_it = index.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = index.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != index.options().end() && target_it != index.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && target_it->second == target_name;
|
||||
if (is_vector_index_on_column(index, target_name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool vector_index::is_vector_index_on_column(const index_metadata& im, const sstring& target_name) {
|
||||
auto class_it = im.options().find(db::index::secondary_index::custom_class_option_name);
|
||||
auto target_it = im.options().find(cql3_parser::index_target::target_option_name);
|
||||
if (class_it != im.options().end() && target_it != im.options().end()) {
|
||||
auto custom_class = secondary_index_manager::get_custom_class_factory(class_it->second);
|
||||
return custom_class && dynamic_cast<vector_index*>((*custom_class)().get()) && get_vector_index_target_column(target_it->second) == target_name;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns the schema version of the base table at which the index was created.
|
||||
/// This is used to determine if the index needs to be rebuilt after a schema change.
|
||||
/// The CREATE INDEX and DROP INDEX statements do change the schema version.
|
||||
|
||||
@@ -34,6 +34,7 @@ public:
|
||||
table_schema_version index_version(const schema& schema) override;
|
||||
static bool has_vector_index(const schema& s);
|
||||
/// Returns true if some index of schema `s` is a vector index on column `target_name`.
static bool has_vector_index_on_column(const schema& s, const sstring& target_name);
|
||||
/// Returns true if the index described by `im` is a vector index whose target
/// (vectors) column is `target_name`.
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
|
||||
static void check_cdc_options(const schema& schema);
|
||||
|
||||
static bool is_rescoring_enabled(const index_options_map& properties);
|
||||
|
||||
@@ -1184,3 +1184,31 @@ SEASTAR_TEST_CASE(vector_store_client_abort_due_to_query_timeout) {
|
||||
co_await server->stop();
|
||||
}));
|
||||
}
|
||||
|
||||
// Create a vector index with an additional filtering column.
|
||||
// Because the local secondary index logic was used to determine the index target column,
|
||||
// the implementation wrongly selects last column as the target(vectors) column, leading to an exception
|
||||
// on the SELECT query:
|
||||
// ANN ordering by vector requires the column to be indexed using 'vector_index'.
|
||||
// Reproduces SCYLLADB-635.
|
||||
SEASTAR_TEST_CASE(vector_store_client_vector_index_with_additional_filtering_column) {
|
||||
auto server = co_await make_vs_mock_server();
|
||||
|
||||
auto cfg = make_config();
|
||||
cfg.db_config->vector_store_primary_uri.set(format("http://server.node:{}", server->port()));
|
||||
co_await do_with_cql_env(
|
||||
[&](cql_test_env& env) -> future<> {
|
||||
auto schema = co_await create_test_table(env, "ks", "test");
|
||||
auto& vs = env.local_qp().vector_store_client();
|
||||
configure(vs).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
|
||||
vs.start_background_tasks();
|
||||
// Create a vector index on the embedding column, including ck1 for filtered ANN search support.
|
||||
auto result = co_await env.execute_cql("CREATE CUSTOM INDEX idx ON ks.test (embedding, ck1) USING 'vector_index'");
|
||||
|
||||
BOOST_CHECK_NO_THROW(co_await env.execute_cql("SELECT * FROM ks.test ORDER BY embedding ANN OF [0.1, 0.2, 0.3] LIMIT 5;"));
|
||||
},
|
||||
cfg)
|
||||
.finally(seastar::coroutine::lambda([&] -> future<> {
|
||||
co_await server->stop();
|
||||
}));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user