mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-28 10:41:12 +00:00
cql: rewrite CassIO SAI metadata index to regular secondary index
When CassIO creates a SAI ENTRIES index on a map column,
ScyllaDB now rewrites it to a regular secondary index and emits
a CQL warning. This allows LangChain/CassIO applications to work
without DDL errors.
The rewrite is gated behind the enable_cassio_compatibility flag
(disabled by default).
Refs: SCYLLADB-2113
(cherry picked from commit 5ee339b11d)
This commit is contained in:
committed by
scylladbbot
parent
4251357118
commit
c30c9f3a82
@@ -133,17 +133,30 @@ static bool is_vector_capable_class(const sstring& class_name) {
|
||||
return class_name == "vector_index" || is_sai_class_name(class_name);
|
||||
}
|
||||
|
||||
// When the custom class is SAI, verify that at least one target is a
|
||||
// vector column and rewrite the class to ScyllaDB's native "vector_index".
|
||||
// Non-vector single-column targets and multi-column (local-index partition
|
||||
// key) targets are skipped — they are treated as filtering columns by
|
||||
// vector_index::check_target().
|
||||
static void maybe_rewrite_sai_to_vector_index(
|
||||
// When the custom class is SAI, attempt to rewrite the index to a
|
||||
// ScyllaDB-native equivalent. Returns false when the rewrite is fully
|
||||
// transparent (vector_index) or when there is no SAI custom class and nothing
// is rewritten, or true when the resulting index is only
|
||||
// partially compatible with Cassandra SAI semantics and the caller should
|
||||
// emit a warning to the user.
|
||||
//
|
||||
// 1. Vector columns: rewrite to ScyllaDB's native "vector_index".
|
||||
// Non-vector single-column targets and multi-column (local-index
|
||||
// partition key) targets are skipped — they are treated as filtering
|
||||
// columns by vector_index::check_target().
|
||||
//
|
||||
// 2. CassIO metadata pattern (ENTRIES on a non-frozen map): clear the
|
||||
// custom class so the index is created as a standard secondary index.
|
||||
// This rewrite is only performed when the enable_cassio_compatibility
|
||||
// configuration option is enabled (disabled by default).
|
||||
//
|
||||
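// If neither pattern matches, throws invalid_request_exception. The same
// exception is thrown when the CassIO pattern matches but WITH OPTIONS is
// specified or enable_cassio_compatibility is disabled.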
|
||||
static bool maybe_rewrite_sai_index(
|
||||
const schema& schema,
|
||||
const std::vector<::shared_ptr<index_target>>& targets,
|
||||
index_specific_prop_defs& props) {
|
||||
index_specific_prop_defs& props,
|
||||
bool cassio_compat) {
|
||||
if (!props.custom_class || !is_sai_class_name(*props.custom_class)) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
for (const auto& target : targets) {
|
||||
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
|
||||
@@ -158,7 +171,38 @@ static void maybe_rewrite_sai_to_vector_index(
|
||||
}
|
||||
if (dynamic_cast<const vector_type_impl*>(cd->type.get())) {
|
||||
props.custom_class = "vector_index";
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// No vector column found. Check if this is the CassIO metadata-map
|
||||
// pattern: a single ENTRIES target on a non-frozen map column with no
|
||||
// custom options (CassIO does not pass WITH OPTIONS for this index).
|
||||
if (targets.size() == 1) {
|
||||
const auto& target = targets.front();
|
||||
if (target->type == index_target::target_type::keys_and_values) {
|
||||
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
|
||||
if (ident) {
|
||||
auto cd = schema.get_column_definition((*ident)->name());
|
||||
if (cd && cd->type->is_multi_cell() && cd->type->is_map()) {
|
||||
if (props.has_property(index_specific_prop_defs::KW_OPTIONS)) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"StorageAttachedIndex (SAI) on map entries cannot be "
|
||||
"rewritten to a regular secondary index when WITH OPTIONS "
|
||||
"are specified; remove the options or use a secondary index directly");
|
||||
}
|
||||
if (!cassio_compat) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"StorageAttachedIndex (SAI) on ENTRIES of a map column is not "
|
||||
"supported; if this is a CassIO/LangChain workload, enable the "
|
||||
"enable_cassio_compatibility configuration option to rewrite "
|
||||
"this index as a regular secondary index");
|
||||
}
|
||||
// Rewrite to a regular secondary index on map entries.
|
||||
props.custom_class.reset();
|
||||
props.is_custom = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
throw exceptions::invalid_request_exception(
|
||||
@@ -380,6 +424,21 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
|
||||
throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str()));
|
||||
}
|
||||
|
||||
validate_for_local_index(*schema);
|
||||
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
for (auto& raw_target : _raw_targets) {
|
||||
targets.emplace_back(raw_target->prepare(*schema));
|
||||
}
|
||||
|
||||
bool cassio_compat = db.get_config().enable_cassio_compatibility();
|
||||
if (maybe_rewrite_sai_index(*schema, targets, *_idx_properties, cassio_compat)) {
|
||||
warnings.emplace_back(
|
||||
"SAI (StorageAttachedIndex) is not supported by ScyllaDB. "
|
||||
"This statement was rewritten to use a regular secondary index. "
|
||||
"Metadata filtering behavior may differ from Cassandra SAI.");
|
||||
}
|
||||
|
||||
// Regular secondary indexes require rf-rack-validity.
|
||||
// Custom indexes need to validate this property themselves, if they need it.
|
||||
if (!_idx_properties || !_idx_properties->custom_class) {
|
||||
@@ -400,15 +459,6 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
|
||||
}
|
||||
}
|
||||
|
||||
validate_for_local_index(*schema);
|
||||
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
for (auto& raw_target : _raw_targets) {
|
||||
targets.emplace_back(raw_target->prepare(*schema));
|
||||
}
|
||||
|
||||
maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties);
|
||||
|
||||
if (_idx_properties && _idx_properties->custom_class) {
|
||||
auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class);
|
||||
if (!custom_index_factory) {
|
||||
|
||||
@@ -307,11 +307,28 @@ The ``similarity_function`` option is supported by both Cassandra SAI and Scylla
|
||||
|
||||
.. note::
|
||||
|
||||
SAI class names are only supported on **vector columns**. Using an SAI class name on a
|
||||
non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI
|
||||
indexing of non-vector columns is not supported by ScyllaDB; use a
|
||||
SAI class names are supported on **vector columns** and on **ENTRIES of non-frozen map
|
||||
columns** (the CassIO metadata-map pattern).
|
||||
|
||||
* For vector columns, the index is rewritten to a native ``vector_index``.
|
||||
* For ``ENTRIES(map_column)``, the SAI class is stripped and a standard secondary index
|
||||
is created instead. A CQL warning is emitted noting possible behavioral differences
|
||||
with Cassandra SAI metadata filtering. This rewrite requires the
|
||||
``enable_cassio_compatibility`` configuration option to be set to ``true``, and the
statement must not specify ``WITH OPTIONS`` (it is rejected otherwise).
|
||||
|
||||
Using an SAI class name on any other non-vector column (e.g., ``text`` or ``int``) will
|
||||
result in an error. General SAI indexing is not supported by ScyllaDB; use a
|
||||
:doc:`secondary index </cql/secondary-indexes>` instead.
|
||||
|
||||
Example of the metadata-map rewrite::
|
||||
|
||||
-- CassIO issues this during schema setup:
|
||||
CREATE CUSTOM INDEX ON my_table (ENTRIES(metadata_s))
|
||||
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';
|
||||
|
||||
-- ScyllaDB creates the equivalent of:
|
||||
CREATE INDEX ON my_table (ENTRIES(metadata_s));
|
||||
|
||||
.. _drop-index-statement:
|
||||
|
||||
DROP INDEX
|
||||
|
||||
@@ -239,7 +239,9 @@ Indexing and Caching
|
||||
|:doc:`Materialized Views </features/materialized-views>` | |v| |
|
||||
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|
||||
|
||||
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``
|
||||
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``;
|
||||
SAI on ``ENTRIES(map)`` is rewritten to a regular secondary index
|
||||
(requires ``enable_cassio_compatibility: true``)
|
||||
|
||||
|
||||
Additional Features
|
||||
|
||||
@@ -204,8 +204,8 @@ std::optional<sstring> secondary_index_manager::custom_index_class(const schema&
|
||||
// We prefer this over a static custom index class instance, as it allows us to avoid any issues with thread safety.
|
||||
//
|
||||
// Note: SAI class names (StorageAttachedIndex, sai) are not listed here
|
||||
// because maybe_rewrite_sai_to_vector_index() in create_index_statement.cc
|
||||
// rewrites them to "vector_index" before the index metadata is persisted.
|
||||
// because maybe_rewrite_sai_index() in create_index_statement.cc rewrites
|
||||
// them to "vector_index" (or a regular index) before the metadata is persisted.
|
||||
std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_manager::get_custom_class_factory(const sstring& class_name) {
|
||||
sstring lower_class_name = class_name;
|
||||
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from .util import new_test_table, is_scylla, unique_name
|
||||
from .util import new_test_table, is_scylla, unique_name, config_value_context
|
||||
from cassandra.protocol import InvalidRequest, ConfigurationException
|
||||
|
||||
supported_filtering_types = [
|
||||
@@ -619,18 +619,121 @@ def test_sai_on_regular_column_rejected(cql, test_keyspace, scylla_only):
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(x) USING 'sai'")
|
||||
|
||||
|
||||
def test_sai_entries_on_map_rejected(cql, test_keyspace, scylla_only):
|
||||
"""SAI ENTRIES index on a MAP column is rejected by ScyllaDB.
|
||||
On Cassandra this is a common pattern for metadata filtering."""
|
||||
schema = 'p int PRIMARY KEY, metadata_s map<text, text>'
|
||||
def test_sai_entries_on_map_rejected_without_flag(cql, test_keyspace, scylla_only):
|
||||
"""Without enable_cassio_compatibility, SAI ENTRIES on map is rejected
|
||||
with a helpful message suggesting to enable the flag."""
|
||||
schema = 'p int PRIMARY KEY, m map<text, text>'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
with pytest.raises(InvalidRequest, match='SAI.*only supported on vector columns'):
|
||||
with pytest.raises(InvalidRequest, match='enable_cassio_compatibility'):
|
||||
cql.execute(
|
||||
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(metadata_s)) "
|
||||
f"USING 'sai'"
|
||||
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'"
|
||||
)
|
||||
|
||||
|
||||
def test_sai_entries_on_map_creates_regular_index(cql, test_keyspace, scylla_only):
    """With enable_cassio_compatibility on, a SAI ENTRIES index on a MAP column
    (the CassIO metadata pattern) is accepted, a compatibility warning is
    returned, and the index shows up in system_schema.indexes."""
    columns = 'p int PRIMARY KEY, metadata_s map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, columns) as table:
        index_name = unique_name()
        create_stmt = (
            f"CREATE CUSTOM INDEX {index_name} ON {table}(ENTRIES(metadata_s)) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
        )
        reply = cql.execute(create_stmt)

        # The rewrite must be announced to the client through a CQL warning.
        emitted = reply.response_future.warnings
        assert emitted is not None
        combined = '\n'.join(emitted)
        assert 'SAI' in combined
        assert 'regular secondary index' in combined

        # The index must be visible in the schema tables. The table name is
        # "ks.tbl", possibly quoted, so strip the quotes before the lookup.
        keyspace, table_name = (part.replace('"', '') for part in table.split('.'))
        lookup = (
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s AND index_name = %s"
        )
        found = list(cql.execute(lookup, (keyspace, table_name, index_name)))
        assert len(found) == 1
|
||||
|
||||
|
||||
def test_sai_entries_on_map_short_name(cql, test_keyspace, scylla_only):
    """The short 'sai' class alias on ENTRIES of a map column is accepted when
    enable_cassio_compatibility is on, and a warning mentioning SAI is returned."""
    columns = 'p int PRIMARY KEY, m map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, columns) as table:
        reply = cql.execute(f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'")
        emitted = reply.response_future.warnings
        assert emitted is not None
        # At least one of the returned warnings has to mention SAI.
        mentioning_sai = [text for text in emitted if 'SAI' in text]
        assert mentioning_sai
|
||||
|
||||
|
||||
def test_sai_entries_on_map_with_options_rejected(cql, test_keyspace, scylla_only):
    """A SAI ENTRIES index on a map column is rejected whenever WITH OPTIONS is
    given: regular secondary indexes take no custom options, and CassIO never
    sends any for this index."""
    columns = 'p int PRIMARY KEY, m map<text, text>'
    # Both a populated and an empty options map must fail: the mere presence
    # of the OPTIONS property is what triggers the rejection.
    option_maps = ("{'some_option': 'value'}", "{}")
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, columns) as table:
        for options in option_maps:
            with pytest.raises(InvalidRequest, match='cannot be rewritten.*WITH OPTIONS'):
                cql.execute(
                    f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) "
                    f"USING 'sai' WITH OPTIONS = {options}"
                )
|
||||
|
||||
|
||||
def test_cassio_setup_mode_sync_simulation(cql, test_keyspace, scylla_only):
    """Replays the DDL sequence CassIO issues in SetupMode.SYNC for LangChain
    vector stores (SCYLLADB-2113): a table with a metadata map and a vector
    column, a SAI index on the vector, then a SAI ENTRIES index on the map.
    Both statements must succeed and leave two indexes in the schema."""
    sai_class = 'org.apache.cassandra.index.sai.StorageAttachedIndex'
    columns = ', '.join([
        'row_id TEXT PRIMARY KEY',
        'body_blob TEXT',
        'attributes_blob TEXT',
        'metadata_s map<text, text>',
        'vector vector<float, 384>',
    ])
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, columns) as table:
        # Step 1: the vector SAI index, which is rewritten to vector_index.
        vector_stmt = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(vector) "
            f"USING '{sai_class}' "
            f"WITH OPTIONS = {{'similarity_function': 'cosine'}}"
        )
        cql.execute(vector_stmt)

        # Step 2: the metadata SAI ENTRIES index, rewritten to a regular index.
        metadata_index = unique_name()
        metadata_stmt = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS {metadata_index} ON {table}"
            f"(ENTRIES(metadata_s)) "
            f"USING '{sai_class}'"
        )
        reply = cql.execute(metadata_stmt)

        # A single warning must mention both SAI and the regular-index rewrite.
        emitted = reply.response_future.warnings
        assert emitted is not None
        rewrite_notices = [
            text for text in emitted
            if 'SAI' in text and 'regular secondary index' in text
        ]
        assert rewrite_notices

        # Both indexes must be present in the schema tables.
        keyspace, table_name = (part.replace('"', '') for part in table.split('.'))
        listing = (
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s"
        )
        rows = list(cql.execute(listing, (keyspace, table_name)))
        assert len(rows) == 2, f"Expected 2 indexes, got {len(rows)}: {rows}"
|
||||
|
||||
|
||||
def test_sai_on_nonexistent_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
"""SAI on a non-existent column should fail with an appropriate error."""
|
||||
schema = 'p int PRIMARY KEY, v vector<float, 3>'
|
||||
|
||||
Reference in New Issue
Block a user