cql: rewrite CassIO SAI metadata index to regular secondary index

When CassIO creates an SAI ENTRIES index on a non-frozen map column,
ScyllaDB now rewrites it to a regular secondary index and emits
a CQL warning. This allows LangChain/CassIO applications to work
without DDL errors.

The rewrite is gated behind the enable_cassio_compatibility flag
(disabled by default).

Refs: SCYLLADB-2113
(cherry picked from commit 5ee339b11d)
This commit is contained in:
Szymon Wasik
2026-05-21 12:19:15 +02:00
committed by scylladbbot
parent 4251357118
commit c30c9f3a82
5 changed files with 204 additions and 32 deletions

View File

@@ -133,17 +133,30 @@ static bool is_vector_capable_class(const sstring& class_name) {
return class_name == "vector_index" || is_sai_class_name(class_name);
}
// When the custom class is SAI, verify that at least one target is a
// vector column and rewrite the class to ScyllaDB's native "vector_index".
// Non-vector single-column targets and multi-column (local-index partition
// key) targets are skipped — they are treated as filtering columns by
// vector_index::check_target().
static void maybe_rewrite_sai_to_vector_index(
// When the custom class is SAI, attempt to rewrite the index to a
// ScyllaDB-native equivalent. Returns true when the resulting index is only
// partially compatible with Cassandra SAI semantics and the caller should
// emit a warning to the user. Returns false when no warning is needed:
// either the custom class is not SAI (nothing is rewritten) or the rewrite
// is fully transparent (vector_index).
//
// 1. Vector columns: rewrite to ScyllaDB's native "vector_index".
// Non-vector single-column targets and multi-column (local-index
// partition key) targets are skipped — they are treated as filtering
// columns by vector_index::check_target().
//
// 2. CassIO metadata pattern (ENTRIES on a non-frozen map): clear the
// custom class so the index is created as a standard secondary index.
// This rewrite is only performed when the enable_cassio_compatibility
// configuration option is enabled (disabled by default).
//
// If neither pattern matches, throws invalid_request_exception.
static bool maybe_rewrite_sai_index(
const schema& schema,
const std::vector<::shared_ptr<index_target>>& targets,
index_specific_prop_defs& props) {
index_specific_prop_defs& props,
bool cassio_compat) {
if (!props.custom_class || !is_sai_class_name(*props.custom_class)) {
return;
return false;
}
for (const auto& target : targets) {
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
@@ -158,7 +171,38 @@ static void maybe_rewrite_sai_to_vector_index(
}
if (dynamic_cast<const vector_type_impl*>(cd->type.get())) {
props.custom_class = "vector_index";
return;
return false;
}
}
// No vector column found. Check if this is the CassIO metadata-map
// pattern: a single ENTRIES target on a non-frozen map column with no
// custom options (CassIO does not pass WITH OPTIONS for this index).
if (targets.size() == 1) {
const auto& target = targets.front();
if (target->type == index_target::target_type::keys_and_values) {
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
if (ident) {
auto cd = schema.get_column_definition((*ident)->name());
if (cd && cd->type->is_multi_cell() && cd->type->is_map()) {
if (props.has_property(index_specific_prop_defs::KW_OPTIONS)) {
throw exceptions::invalid_request_exception(
"StorageAttachedIndex (SAI) on map entries cannot be "
"rewritten to a regular secondary index when WITH OPTIONS "
"are specified; remove the options or use a secondary index directly");
}
if (!cassio_compat) {
throw exceptions::invalid_request_exception(
"StorageAttachedIndex (SAI) on ENTRIES of a map column is not "
"supported; if this is a CassIO/LangChain workload, enable the "
"enable_cassio_compatibility configuration option to rewrite "
"this index as a regular secondary index");
}
// Rewrite to a regular secondary index on map entries.
props.custom_class.reset();
props.is_custom = false;
return true;
}
}
}
}
throw exceptions::invalid_request_exception(
@@ -380,6 +424,21 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str()));
}
validate_for_local_index(*schema);
std::vector<::shared_ptr<index_target>> targets;
for (auto& raw_target : _raw_targets) {
targets.emplace_back(raw_target->prepare(*schema));
}
bool cassio_compat = db.get_config().enable_cassio_compatibility();
if (maybe_rewrite_sai_index(*schema, targets, *_idx_properties, cassio_compat)) {
warnings.emplace_back(
"SAI (StorageAttachedIndex) is not supported by ScyllaDB. "
"This statement was rewritten to use a regular secondary index. "
"Metadata filtering behavior may differ from Cassandra SAI.");
}
// Regular secondary indexes require rf-rack-validity.
// Custom indexes need to validate this property themselves, if they need it.
if (!_idx_properties || !_idx_properties->custom_class) {
@@ -400,15 +459,6 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
}
}
validate_for_local_index(*schema);
std::vector<::shared_ptr<index_target>> targets;
for (auto& raw_target : _raw_targets) {
targets.emplace_back(raw_target->prepare(*schema));
}
maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties);
if (_idx_properties && _idx_properties->custom_class) {
auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class);
if (!custom_index_factory) {

View File

@@ -307,11 +307,28 @@ The ``similarity_function`` option is supported by both Cassandra SAI and Scylla
.. note::
SAI class names are only supported on **vector columns**. Using an SAI class name on a
non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI
indexing of non-vector columns is not supported by ScyllaDB; use a
SAI class names are supported on **vector columns** and on **ENTRIES of non-frozen map
columns** (the CassIO metadata-map pattern).
* For vector columns, the index is rewritten to a native ``vector_index``.
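* For ``ENTRIES(map_column)``, the SAI class is stripped and a standard secondary index
is created instead. A CQL warning is emitted noting possible behavioral differences
with Cassandra SAI metadata filtering. This rewrite requires the
``enable_cassio_compatibility`` configuration option to be set to ``true``.
The statement is rejected with an error if it specifies ``WITH OPTIONS``
(even an empty map), because the options cannot be carried over to a regular
secondary index.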
Using an SAI class name on any other non-vector column (e.g., ``text`` or ``int``) will
result in an error. General SAI indexing is not supported by ScyllaDB; use a
:doc:`secondary index </cql/secondary-indexes>` instead.
Example of the metadata-map rewrite::
-- CassIO issues this during schema setup:
CREATE CUSTOM INDEX ON my_table (ENTRIES(metadata_s))
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';
-- ScyllaDB creates the equivalent of:
CREATE INDEX ON my_table (ENTRIES(metadata_s));
.. _drop-index-statement:
DROP INDEX

View File

@@ -239,7 +239,9 @@ Indexing and Caching
|:doc:`Materialized Views </features/materialized-views>` | |v| |
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``;
SAI on ``ENTRIES(map)`` is rewritten to a regular secondary index
(requires ``enable_cassio_compatibility: true``)
Additional Features

View File

@@ -204,8 +204,8 @@ std::optional<sstring> secondary_index_manager::custom_index_class(const schema&
// We prefer this over a static custom index class instance, as it allows us to avoid any issues with thread safety.
//
// Note: SAI class names (StorageAttachedIndex, sai) are not listed here
// because maybe_rewrite_sai_to_vector_index() in create_index_statement.cc
// rewrites them to "vector_index" before the index metadata is persisted.
// because maybe_rewrite_sai_index() in create_index_statement.cc rewrites
// them to "vector_index" (or a regular index) before the metadata is persisted.
std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_manager::get_custom_class_factory(const sstring& class_name) {
sstring lower_class_name = class_name;
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);

View File

@@ -22,7 +22,7 @@
import pytest
import json
from .util import new_test_table, is_scylla, unique_name
from .util import new_test_table, is_scylla, unique_name, config_value_context
from cassandra.protocol import InvalidRequest, ConfigurationException
supported_filtering_types = [
@@ -619,18 +619,121 @@ def test_sai_on_regular_column_rejected(cql, test_keyspace, scylla_only):
cql.execute(f"CREATE CUSTOM INDEX ON {table}(x) USING 'sai'")
def test_sai_entries_on_map_rejected(cql, test_keyspace, scylla_only):
"""SAI ENTRIES index on a MAP column is rejected by ScyllaDB.
On Cassandra this is a common pattern for metadata filtering."""
schema = 'p int PRIMARY KEY, metadata_s map<text, text>'
def test_sai_entries_on_map_rejected_without_flag(cql, test_keyspace, scylla_only):
"""Without enable_cassio_compatibility, SAI ENTRIES on map is rejected
with a helpful message suggesting to enable the flag."""
schema = 'p int PRIMARY KEY, m map<text, text>'
with new_test_table(cql, test_keyspace, schema) as table:
with pytest.raises(InvalidRequest, match='SAI.*only supported on vector columns'):
with pytest.raises(InvalidRequest, match='enable_cassio_compatibility'):
cql.execute(
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(metadata_s)) "
f"USING 'sai'"
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'"
)
def test_sai_entries_on_map_creates_regular_index(cql, test_keyspace, scylla_only):
    """With enable_cassio_compatibility turned on, a CREATE CUSTOM INDEX using
    the full SAI class name on ENTRIES of a map column (the CassIO metadata
    pattern) must succeed, return a compatibility warning, and leave exactly
    one matching row in system_schema.indexes."""
    schema = 'p int PRIMARY KEY, metadata_s map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        index_name = unique_name()
        create_stmt = (
            f"CREATE CUSTOM INDEX {index_name} ON {table}(ENTRIES(metadata_s)) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
        )
        reported = cql.execute(create_stmt).response_future.warnings

        # The rewrite must be announced to the client.
        assert reported is not None
        combined = '\n'.join(reported)
        assert 'SAI' in combined
        assert 'regular secondary index' in combined

        # The index must be visible in the schema tables. The table name
        # returned by new_test_table may carry double quotes, which the
        # schema tables do not store.
        keyspace, table_name = (part.replace('"', '') for part in table.split('.'))
        lookup = (
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s AND index_name = %s"
        )
        matches = list(cql.execute(lookup, (keyspace, table_name, index_name)))
        assert len(matches) == 1
def test_sai_entries_on_map_short_name(cql, test_keyspace, scylla_only):
    """The short 'sai' class-name alias on ENTRIES of a map column is accepted
    when enable_cassio_compatibility is on, and a warning mentioning SAI is
    returned."""
    schema = 'p int PRIMARY KEY, m map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        outcome = cql.execute(
            f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'"
        )
        reported = outcome.response_future.warnings
        assert reported is not None
        # At least one of the returned warnings has to mention SAI.
        sai_mentions = [text for text in reported if 'SAI' in text]
        assert sai_mentions
def test_sai_entries_on_map_with_options_rejected(cql, test_keyspace, scylla_only):
    """SAI ENTRIES on a map combined with WITH OPTIONS must be rejected even
    when enable_cassio_compatibility is on: regular secondary indexes do not
    support custom options and CassIO never passes them."""
    schema = 'p int PRIMARY KEY, m map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        statements = [
            # Non-empty options map.
            f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) "
            f"USING 'sai' WITH OPTIONS = {{'some_option': 'value'}}",
            # Empty options map: still rejected, because the OPTIONS
            # property is present in the statement.
            f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) "
            f"USING 'sai' WITH OPTIONS = {{}}",
        ]
        for statement in statements:
            with pytest.raises(InvalidRequest, match='cannot be rewritten.*WITH OPTIONS'):
                cql.execute(statement)
def test_cassio_setup_mode_sync_simulation(cql, test_keyspace, scylla_only):
    """End-to-end replay of the DDL sequence CassIO runs during SetupMode.SYNC
    for LangChain vector stores (SCYLLADB-2113): a table with a metadata map
    and a vector column, an SAI index on the vector, and an SAI ENTRIES index
    on the metadata map. Both CREATE CUSTOM INDEX statements must succeed with
    enable_cassio_compatibility turned on."""
    schema = (
        'row_id TEXT PRIMARY KEY, '
        'body_blob TEXT, '
        'attributes_blob TEXT, '
        'metadata_s map<text, text>, '
        'vector vector<float, 384>'
    )
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        # Step 1: SAI index on the vector column, rewritten to vector_index.
        vector_ddl = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(vector) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' "
            f"WITH OPTIONS = {{'similarity_function': 'cosine'}}"
        )
        cql.execute(vector_ddl)

        # Step 2: SAI ENTRIES index on the metadata map, rewritten to a
        # regular secondary index.
        idx_name = unique_name()
        metadata_ddl = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS {idx_name} ON {table}"
            f"(ENTRIES(metadata_s)) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
        )
        reported = cql.execute(metadata_ddl).response_future.warnings

        # A single warning has to mention both SAI and the regular-index rewrite.
        assert reported is not None
        assert any('SAI' in w and 'regular secondary index' in w for w in reported)

        # Both indexes must be listed for the table in the schema tables.
        ks, tbl = (part.replace('"', '') for part in table.split('.'))
        rows = list(cql.execute(
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s",
            (ks, tbl)
        ))
        assert len(rows) == 2, f"Expected 2 indexes, got {len(rows)}: {rows}"
def test_sai_on_nonexistent_column(cql, test_keyspace, skip_on_scylla_vnodes):
"""SAI on a non-existent column should fail with an appropriate error."""
schema = 'p int PRIMARY KEY, v vector<float, 3>'