Merge 'index: implement schema management layer for vector search indexes' from null

This pull request adds support for creating custom indexes (at a metadata level) as long as a supported custom class is provided (currently only vector search).

The patch contains:

- a change to the CREATE INDEX statement that allows the USING keyword to be present as long as one of the supported classes is used
- support for describing custom indexes in the DESCRIBE statement
- unit tests

Co-authored by: @Balwancia

Closes scylladb/scylladb#23720

* github.com:scylladb/scylladb:
  test/cqlpy: add custom index tests
  index: support storing metadata for custom indices
This commit is contained in:
Nadav Har'El
2025-05-22 12:19:36 +03:00
9 changed files with 139 additions and 20 deletions

View File

@@ -12,6 +12,7 @@
#include "create_index_statement.hh"
#include "exceptions/exceptions.hh"
#include "prepared_statement.hh"
#include "types/types.hh"
#include "validation.hh"
#include "service/storage_proxy.hh"
#include "service/migration_manager.hh"
@@ -101,6 +102,15 @@ std::vector<::shared_ptr<index_target>> create_index_statement::validate_while_e
targets.emplace_back(raw_target->prepare(*schema));
}
if (_properties && _properties->custom_class) {
if (!secondary_index::secondary_index_manager::is_custom_class_supported(db.features(), *_properties->custom_class)) {
throw exceptions::invalid_request_exception(format("Non-supported custom class \'{}\' provided", *(_properties->custom_class)));
}
//TODO: run custom index class validation once added
//See https://github.com/scylladb/vector-store/issues/115
}
if (targets.size() > 1) {
validate_targets_for_multi_column_index(targets);
}
@@ -348,9 +358,9 @@ std::optional<create_index_statement::base_schema_with_new_index> create_index_s
}
index_metadata_kind kind;
index_options_map index_options;
if (_properties->is_custom) {
kind = index_metadata_kind::custom;
if (_properties->custom_class) {
index_options = _properties->get_options();
kind = index_metadata_kind::custom;
} else {
kind = schema->is_compound() ? index_metadata_kind::composites : index_metadata_kind::keys;
}

View File

@@ -22,10 +22,7 @@ void cql3::statements::index_prop_defs::validate() {
if (is_custom && !custom_class) {
throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
}
if (!is_custom && custom_class) {
throw exceptions::invalid_request_exception("Cannot specify index class for a non-CUSTOM index");
}
if (!is_custom && !_properties.empty()) {
throw exceptions::invalid_request_exception("Cannot specify options for a non-CUSTOM index");
}
@@ -36,15 +33,6 @@ void cql3::statements::index_prop_defs::validate() {
db::index::secondary_index::custom_index_option_name));
}
// Currently, Scylla does not support *any* class of custom index
// implementation. If in the future we do (e.g., SASI, or something
// new), we'll need to check for valid values here.
if (is_custom && custom_class) {
throw exceptions::invalid_request_exception(
format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
*custom_class));
}
}
index_options_map

View File

@@ -8,13 +8,18 @@
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
*/
#include <optional>
#include <ranges>
#include <seastar/core/shared_ptr.hh>
#include <string_view>
#include <unordered_set>
#include "index/secondary_index_manager.hh"
#include "cql3/statements/index_target.hh"
#include "cql3/expr/expression.hh"
#include "index/target_parser.hh"
#include "schema/schema.hh"
#include "schema/schema_builder.hh"
#include "db/view/view.hh"
#include "concrete_types.hh"
@@ -341,4 +346,27 @@ bool secondary_index_manager::is_global_index(const schema& s) const {
});
}
// Returns the custom index class stored in the metadata options of the index
// whose name is derived from the table name of schema `s`.
// Yields std::nullopt when no such index is registered in this manager, or
// when its options carry no custom-index-class entry.
std::optional<sstring> secondary_index_manager::custom_index_class(const schema& s) const {
    const auto index_it = _indices.find(index_name_from_table_name(s.cf_name()));
    if (index_it == _indices.end()) {
        return std::nullopt;
    }
    const auto& options = index_it->second.metadata().options();
    // Single lookup: the same iterator answers both "is it present" and "what is it".
    const auto class_it = options.find(cql3::statements::index_target::custom_index_option_name);
    if (class_it == options.end()) {
        return std::nullopt;
    }
    return class_it->second;
}
// Tells whether `class_name` is one of the custom index classes that may be
// used when creating an index.
// The feature_service is part of the signature because the set of supported
// custom classes will depend on the features; it is not consulted yet.
bool secondary_index_manager::is_custom_class_supported(const gms::feature_service& fs, const sstring& class_name) {
    // TODO: Change this set to a map to implementation
    // when https://github.com/scylladb/vector-store/issues/115 is done
    static const std::unordered_set<std::string_view> supported_classes{
        "vector_index",
    };
    return supported_classes.find(class_name) != supported_classes.end();
}
}

View File

@@ -10,11 +10,13 @@
#pragma once
#include "gms/feature_service.hh"
#include "schema/schema.hh"
#include "data_dictionary/data_dictionary.hh"
#include "cql3/statements/index_target.hh"
#include <string_view>
#include <vector>
namespace cql3::expr {
@@ -99,6 +101,8 @@ public:
bool is_index(view_ptr) const;
bool is_index(const schema& s) const;
bool is_global_index(const schema& s) const;
// Returns the custom index class recorded in the options of the index
// backing the view schema `s`, or std::nullopt if `s` does not name a known
// index or that index has no custom class option.
std::optional<sstring> custom_index_class(const schema& s) const;
// Returns true if `class_name` is a supported custom index class.
// `fs` is taken because the supported classes will depend on the features.
static bool is_custom_class_supported(const gms::feature_service& fs, const sstring& class_name);
private:
void add_index(const index_metadata& im);
};

View File

@@ -11,6 +11,8 @@
#include "data_dictionary/data_dictionary.hh"
#include "index/secondary_index_manager.hh"
#include "schema/schema.hh"
#include <seastar/core/sstring.hh>
#include <optional>
namespace replica {
@@ -27,6 +29,10 @@ public:
return _db.find_column_family(base_id).get_index_manager().is_index(view_s);
}
// Asks the index manager of the base table `base_id` for the custom index
// class of the index backed by `view_s`; std::nullopt when it has none.
virtual std::optional<sstring> custom_index_class(const table_id& base_id, const schema& view_s) const override {
    const auto& index_manager = _db.find_column_family(base_id).get_index_manager();
    return index_manager.custom_index_class(view_s);
}
// Resolves a table id to its schema by delegating to the wrapped database.
virtual schema_ptr find_schema(const table_id& id) const override {
    schema_ptr found = _db.find_schema(id);
    return found;
}

View File

@@ -1010,11 +1010,21 @@ sstring schema::get_create_statement(const schema_describe_helper& helper, bool
if (is_view()) {
if (helper.is_index(view_info()->base_id(), *this)) {
auto is_local = !helper.is_global_index(view_info()->base_id(), *this);
auto custom_index_class = helper.custom_index_class(view_info()->base_id(), *this);
if (custom_index_class) {
os << "CUSTOM ";
}
os << "INDEX " << cql3::util::maybe_quote(secondary_index::index_name_from_table_name(cf_name())) << " ON "
<< cql3::util::maybe_quote(ks_name()) << "." << cql3::util::maybe_quote(view_info()->base_name());
describe_index_columns(os, is_local, *this, helper.find_schema(view_info()->base_id()));
if (custom_index_class) {
os << " USING '" << *custom_index_class << "'";
}
os << ";\n";
return std::move(os).str();

View File

@@ -506,6 +506,7 @@ class schema_describe_helper {
public:
// Whether the index backed by view `view_s` on base table `base_id` is a
// global index (callers treat "not global" as local).
virtual bool is_global_index(const table_id& base_id, const schema& view_s) const = 0;
// Whether view `view_s` on base table `base_id` backs a secondary index.
virtual bool is_index(const table_id& base_id, const schema& view_s) const = 0;
// The custom index class of the index backed by `view_s`, or std::nullopt
// when the index has no custom class.
virtual std::optional<sstring> custom_index_class(const table_id& base_id, const schema& view_s) const = 0;
// Looks up the schema for the given table id.
virtual schema_ptr find_schema(const table_id& id) const = 0;
virtual ~schema_describe_helper() = default;
};

View File

@@ -711,11 +711,6 @@ SEASTAR_TEST_CASE(test_secondary_index_create_custom_index) {
// "exceptions::invalid_request_exception: CUSTOM index requires
// specifying the index class"
assert_that_failed(e.execute_cql("create custom index on cf (a)"));
// It's also a syntax error to try to specify a "USING" without
// specifying CUSTOM. We expect the exception:
// "exceptions::invalid_request_exception: Cannot specify index class
// for a non-CUSTOM index"
assert_that_failed(e.execute_cql("create index on cf (a) using 'org.apache.cassandra.index.sasi.SASIIndex'"));
});
}

View File

@@ -0,0 +1,77 @@
# Copyright 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
###############################################################################
# Tests for vector indexes
###############################################################################
import pytest
from .util import new_test_table, is_scylla
from cassandra.protocol import InvalidRequest, ConfigurationException
# Creating a CUSTOM index with the 'vector_index' class on a vector column
# is expected to be accepted (the test passes when no exception is raised).
def test_create_vector_search_index(cql, test_keyspace, scylla_only):
    columns = 'p int primary key, v vector<float, 3>'
    with new_test_table(cql, test_keyspace, columns) as tbl:
        statement = f"CREATE CUSTOM INDEX ON {tbl}(v) USING 'vector_index'"
        cql.execute(statement)
# A USING clause is expected to be accepted even when the CUSTOM keyword is
# omitted. The class name differs by backend: 'vector_index' when is_scylla()
# is true, 'sai' otherwise.
def test_create_vector_search_index_without_custom_keyword(cql, test_keyspace):
    columns = 'p int primary key, v vector<float, 3>'
    with new_test_table(cql, test_keyspace, columns) as tbl:
        index_class = 'vector_index' if is_scylla(cql) else 'sai'
        cql.execute(f"CREATE INDEX ON {tbl}(v) USING '{index_class}'")
# An unknown custom class must be rejected. Either InvalidRequest or
# ConfigurationException is accepted, with either of the two messages below.
def test_create_custom_index_with_invalid_class(cql, test_keyspace):
    columns = 'p int primary key, v vector<float, 3>'
    expected_errors = (InvalidRequest, ConfigurationException)
    with new_test_table(cql, test_keyspace, columns) as tbl:
        bogus_class = "invalid.custom.class"
        statement = f"CREATE CUSTOM INDEX ON {tbl}(v) USING '{bogus_class}'"
        with pytest.raises(expected_errors, match=r"Non-supported custom class|Unable to find"):
            cql.execute(statement)
# CREATE CUSTOM INDEX without a USING clause must be rejected. Either
# InvalidRequest or ConfigurationException is accepted, with either of the
# two messages below.
def test_create_custom_index_without_custom_class(cql, test_keyspace):
    columns = 'p int primary key, v vector<float, 3>'
    expected_errors = (InvalidRequest, ConfigurationException)
    with new_test_table(cql, test_keyspace, columns) as tbl:
        statement = f"CREATE CUSTOM INDEX ON {tbl}(v)"
        with pytest.raises(expected_errors, match=r"CUSTOM index requires specifying|Unable to find"):
            cql.execute(statement)
# A vector index on a non-vector (int) column should be refused with
# InvalidRequest. Marked xfail for the reason given in the decorator.
@pytest.mark.xfail(reason="Scylla doesn't validate vector indexes, as they are not implemented yet.")
def test_create_vector_search_index_on_nonvector_column(cql, test_keyspace, scylla_only):
    columns = 'p int primary key, v int'
    with new_test_table(cql, test_keyspace, columns) as tbl:
        statement = f"CREATE CUSTOM INDEX ON {tbl}(v) USING 'vector_index'"
        with pytest.raises(InvalidRequest):
            cql.execute(statement)
# DESC INDEX must print a custom index as "CREATE CUSTOM INDEX ... USING
# '<class>'", whether or not the CUSTOM keyword was used when creating it.
def test_describe_custom_index(cql, test_keyspace):
    columns = 'p int primary key, v1 vector<float, 3>, v2 vector<float, 3>'
    with new_test_table(cql, test_keyspace, columns) as tbl:
        # Cassandra inserts a space between the table name and parentheses,
        # Scylla doesn't. This difference doesn't matter because both are
        # valid CQL commands
        # Scylla doesn't support sai custom class.
        if is_scylla(cql):
            separator, index_class = '', 'vector_index'
        else:
            separator, index_class = ' ', 'sai'

        # One index created without the CUSTOM keyword, one with it.
        cql.execute(f"CREATE INDEX custom ON {tbl}(v1) USING '{index_class}'")
        cql.execute(f"CREATE CUSTOM INDEX custom1 ON {tbl}(v2) USING '{index_class}'")

        described_plain = cql.execute(f"DESC INDEX {test_keyspace}.custom").one().create_statement
        described_custom = cql.execute(f"DESC INDEX {test_keyspace}.custom1").one().create_statement

        assert f"CREATE CUSTOM INDEX custom ON {tbl}{separator}(v1) USING '{index_class}'" in described_plain
        assert f"CREATE CUSTOM INDEX custom1 ON {tbl}{separator}(v2) USING '{index_class}'" in described_custom