mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-31 03:56:42 +00:00
Merge 'cql: rewrite CassIO SAI metadata index to regular secondary index' from Szymon Wasik
CassIO (the library backing LangChain's `langchain_community.vectorstores.Cassandra` integration) issues the following DDL during schema setup to create a metadata index:

```sql
CREATE CUSTOM INDEX IF NOT EXISTS eidx_metadata_s_<table>
ON <keyspace>.<table> (ENTRIES(metadata_s))
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';
```

ScyllaDB does not support Cassandra's StorageAttachedIndex (SAI) for non-vector columns and previously rejected this statement with:

```
StorageAttachedIndex (SAI) is only supported on vector columns; use a secondary index for non-vector columns
```

This blocks seamless migration of existing LangChain/CassIO applications from Cassandra to ScyllaDB — applications fail during initialization before any application-level workaround can run, even when metadata filtering is not used (`metadata_indexing="none"`). CassIO is no longer actively maintained but remains the only official LangChain integration path for Apache Cassandra over CQL, meaning existing applications will continue using this setup pattern.

Instead of rejecting the CassIO metadata-map SAI DDL, detect the pattern and rewrite it to a standard ScyllaDB secondary index on collection entries:

- **Detection**: SAI class name + single `ENTRIES` target on a non-frozen `map` column
- **Rewrite**: Clear the custom class so the index is created through the standard secondary index path (which already fully supports indexing map entries)
- **Warning**: Emit a CQL warning informing the user that SAI is not supported by ScyllaDB, a regular secondary index was created instead, and metadata filtering behavior may differ from Cassandra SAI

The rewrite is performed only when the new `enable_cassio_compatibility` configuration option is enabled (it is disabled by default); with the option off, the statement is still rejected, with an error message that points to the option.

The rewrite is placed early in `validate_while_executing()`, before the rf-rack-validity check, so the standard secondary index code path handles all subsequent validation naturally — no code duplication.

After this change, with `enable_cassio_compatibility` enabled, the CassIO schema setup succeeds on ScyllaDB:

- `CREATE CUSTOM INDEX ... USING 'sai'` on `ENTRIES(metadata_s)` creates a real secondary index
- The index is functional and can accelerate metadata filtering queries
- A CQL warning makes the rewrite transparent to operators
- SAI on non-vector, non-map-entries columns is still rejected as before
- Vector SAI indexes continue to be rewritten to `vector_index` as before

Tests:

- `test_sai_entries_on_map_creates_regular_index` — verifies the index is created and the warning is emitted (fully-qualified SAI class name)
- `test_sai_entries_on_map_short_name` — same with the `'sai'` short alias
- `test_sai_on_regular_column_rejected` — confirms SAI on regular scalar columns is still rejected

All 148 tests in `test_vector_index.py` and `test_secondary_index.py` pass with no regressions (125 passed, 22 xfailed, 1 skipped).

Fixes: SCYLLADB-2113

Backport: 2026.2 as this is the version where the support for SAI class needed by LangChain was added.

Closes scylladb/scylladb#29981

* github.com:scylladb/scylladb:
  cql: rewrite CassIO SAI metadata index to regular secondary index
  db/config: add enable_cassio_compatibility flag
This commit is contained in:
@@ -133,17 +133,30 @@ static bool is_vector_capable_class(const sstring& class_name) {
|
||||
return class_name == "vector_index" || is_sai_class_name(class_name);
|
||||
}
|
||||
|
||||
// When the custom class is SAI, verify that at least one target is a
|
||||
// vector column and rewrite the class to ScyllaDB's native "vector_index".
|
||||
// Non-vector single-column targets and multi-column (local-index partition
|
||||
// key) targets are skipped — they are treated as filtering columns by
|
||||
// vector_index::check_target().
|
||||
static void maybe_rewrite_sai_to_vector_index(
|
||||
// When the custom class is SAI, attempt to rewrite the index to a
|
||||
// ScyllaDB-native equivalent. Returns false when no warning is needed
|
||||
// (the custom class is not SAI, or the rewrite to vector_index is fully
|
||||
// transparent), or true when the resulting index is only partially
|
||||
// compatible with Cassandra SAI semantics and the caller should emit a
|
||||
// warning to the user.
|
||||
//
|
||||
// 1. Vector columns: rewrite to ScyllaDB's native "vector_index".
|
||||
// Non-vector single-column targets and multi-column (local-index
|
||||
// partition key) targets are skipped — they are treated as filtering
|
||||
// columns by vector_index::check_target().
|
||||
//
|
||||
// 2. CassIO metadata pattern (ENTRIES on a non-frozen map): clear the
|
||||
// custom class so the index is created as a standard secondary index.
|
||||
// This rewrite is only performed when the enable_cassio_compatibility
|
||||
// configuration option is enabled (disabled by default).
|
||||
//
|
||||
// If neither pattern matches, throws invalid_request_exception.
|
||||
static bool maybe_rewrite_sai_index(
|
||||
const schema& schema,
|
||||
const std::vector<::shared_ptr<index_target>>& targets,
|
||||
index_specific_prop_defs& props) {
|
||||
index_specific_prop_defs& props,
|
||||
bool cassio_compat) {
|
||||
if (!props.custom_class || !is_sai_class_name(*props.custom_class)) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
for (const auto& target : targets) {
|
||||
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
|
||||
@@ -158,7 +171,38 @@ static void maybe_rewrite_sai_to_vector_index(
|
||||
}
|
||||
if (dynamic_cast<const vector_type_impl*>(cd->type.get())) {
|
||||
props.custom_class = "vector_index";
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// No vector column found. Check if this is the CassIO metadata-map
|
||||
// pattern: a single ENTRIES target on a non-frozen map column with no
|
||||
// custom options (CassIO does not pass WITH OPTIONS for this index).
|
||||
if (targets.size() == 1) {
|
||||
const auto& target = targets.front();
|
||||
if (target->type == index_target::target_type::keys_and_values) {
|
||||
auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value);
|
||||
if (ident) {
|
||||
auto cd = schema.get_column_definition((*ident)->name());
|
||||
if (cd && cd->type->is_multi_cell() && cd->type->is_map()) {
|
||||
if (props.has_property(index_specific_prop_defs::KW_OPTIONS)) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"StorageAttachedIndex (SAI) on map entries cannot be "
|
||||
"rewritten to a regular secondary index when WITH OPTIONS "
|
||||
"are specified; remove the options or use a secondary index directly");
|
||||
}
|
||||
if (!cassio_compat) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"StorageAttachedIndex (SAI) on ENTRIES of a map column is not "
|
||||
"supported; if this is a CassIO/LangChain workload, enable the "
|
||||
"enable_cassio_compatibility configuration option to rewrite "
|
||||
"this index as a regular secondary index");
|
||||
}
|
||||
// Rewrite to a regular secondary index on map entries.
|
||||
props.custom_class.reset();
|
||||
props.is_custom = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
throw exceptions::invalid_request_exception(
|
||||
@@ -388,6 +432,21 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
|
||||
throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str()));
|
||||
}
|
||||
|
||||
validate_for_local_index(*schema);
|
||||
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
for (auto& raw_target : _raw_targets) {
|
||||
targets.emplace_back(raw_target->prepare(*schema));
|
||||
}
|
||||
|
||||
bool cassio_compat = db.get_config().enable_cassio_compatibility();
|
||||
if (maybe_rewrite_sai_index(*schema, targets, *_idx_properties, cassio_compat)) {
|
||||
warnings.emplace_back(
|
||||
"SAI (StorageAttachedIndex) is not supported by ScyllaDB. "
|
||||
"This statement was rewritten to use a regular secondary index. "
|
||||
"Metadata filtering behavior may differ from Cassandra SAI.");
|
||||
}
|
||||
|
||||
// Regular secondary indexes require rf-rack-validity.
|
||||
// Custom indexes need to validate this property themselves, if they need it.
|
||||
if (!_idx_properties || !_idx_properties->custom_class) {
|
||||
@@ -408,15 +467,6 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l
|
||||
}
|
||||
}
|
||||
|
||||
validate_for_local_index(*schema);
|
||||
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
for (auto& raw_target : _raw_targets) {
|
||||
targets.emplace_back(raw_target->prepare(*schema));
|
||||
}
|
||||
|
||||
maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties);
|
||||
|
||||
if (_idx_properties && _idx_properties->custom_class) {
|
||||
auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class);
|
||||
if (!custom_index_factory) {
|
||||
|
||||
@@ -1296,6 +1296,11 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, vector_store_encryption_options(this, "vector_store_encryption_options", value_status::Used, {},
|
||||
"Options for encrypted connections to the vector store. These options are used for HTTPS URIs in `vector_store_primary_uri` and `vector_store_secondary_uri`. The available options are:\n"
|
||||
"* truststore: (Default: <not set, use system truststore>) Location of the truststore containing the trusted certificate for authenticating remote servers.")
|
||||
, enable_cassio_compatibility(this, "enable_cassio_compatibility", liveness::LiveUpdate, value_status::Used, false,
|
||||
"When enabled, ScyllaDB rewrites CassIO's SAI index DDL on map entries "
|
||||
"(e.g. CREATE CUSTOM INDEX ... ON table(ENTRIES(col)) USING 'StorageAttachedIndex') "
|
||||
"to a regular secondary index, allowing LangChain/CassIO applications to run "
|
||||
"without DDL errors. Disabled by default.")
|
||||
/**
|
||||
* @Group Security properties
|
||||
* @GroupDescription Server and client security settings.
|
||||
|
||||
@@ -375,6 +375,7 @@ public:
|
||||
named_value<sstring> vector_store_secondary_uri;
|
||||
named_value<uint32_t> vector_store_unreachable_node_detection_time_in_ms;
|
||||
named_value<string_map> vector_store_encryption_options;
|
||||
named_value<bool> enable_cassio_compatibility;
|
||||
named_value<sstring> authenticator;
|
||||
named_value<sstring> internode_authenticator;
|
||||
named_value<sstring> authorizer;
|
||||
|
||||
@@ -307,11 +307,28 @@ The ``similarity_function`` option is supported by both Cassandra SAI and Scylla
|
||||
|
||||
.. note::
|
||||
|
||||
SAI class names are only supported on **vector columns**. Using an SAI class name on a
|
||||
non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI
|
||||
indexing of non-vector columns is not supported by ScyllaDB; use a
|
||||
SAI class names are supported on **vector columns** and on **ENTRIES of non-frozen map
|
||||
columns** (the CassIO metadata-map pattern).
|
||||
|
||||
* For vector columns, the index is rewritten to a native ``vector_index``.
|
||||
* For ``ENTRIES(map_column)``, the SAI class is stripped and a standard secondary index
|
||||
is created instead. A CQL warning is emitted noting possible behavioral differences
|
||||
with Cassandra SAI metadata filtering. This rewrite requires the
|
||||
``enable_cassio_compatibility`` configuration option to be set to ``true``.
|
||||
|
||||
Using an SAI class name on any other non-vector column (e.g., ``text`` or ``int``) will
|
||||
result in an error. General SAI indexing is not supported by ScyllaDB; use a
|
||||
:doc:`secondary index </cql/secondary-indexes>` instead.
|
||||
|
||||
Example of the metadata-map rewrite::
|
||||
|
||||
-- CassIO issues this during schema setup:
|
||||
CREATE CUSTOM INDEX ON my_table (ENTRIES(metadata_s))
|
||||
USING 'org.apache.cassandra.index.sai.StorageAttachedIndex';
|
||||
|
||||
-- ScyllaDB creates the equivalent of:
|
||||
CREATE INDEX ON my_table (ENTRIES(metadata_s));
|
||||
|
||||
.. _drop-index-statement:
|
||||
|
||||
DROP INDEX
|
||||
|
||||
@@ -239,7 +239,9 @@ Indexing and Caching
|
||||
|:doc:`Materialized Views </features/materialized-views>` | |v| |
|
||||
+----------------------------------------------------------------+--------------------------------------------------------------------------------------+
|
||||
|
||||
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``
|
||||
:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``;
|
||||
SAI on ``ENTRIES(map)`` is rewritten to a regular secondary index
|
||||
(requires ``enable_cassio_compatibility: true``)
|
||||
|
||||
|
||||
Additional Features
|
||||
|
||||
@@ -208,8 +208,8 @@ std::optional<sstring> secondary_index_manager::custom_index_class(const schema&
|
||||
// We prefer this over a static custom index class instance, as it allows us to avoid any issues with thread safety.
|
||||
//
|
||||
// Note: SAI class names (StorageAttachedIndex, sai) are not listed here
|
||||
// because maybe_rewrite_sai_to_vector_index() in create_index_statement.cc
|
||||
// rewrites them to "vector_index" before the index metadata is persisted.
|
||||
// because maybe_rewrite_sai_index() in create_index_statement.cc rewrites
|
||||
// them to "vector_index" (or a regular index) before the metadata is persisted.
|
||||
std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_manager::get_custom_class_factory(const sstring& class_name) {
|
||||
sstring lower_class_name = class_name;
|
||||
std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower);
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from .util import new_test_table, is_scylla, unique_name
|
||||
from .util import new_test_table, is_scylla, unique_name, config_value_context
|
||||
from cassandra.protocol import InvalidRequest, ConfigurationException
|
||||
|
||||
supported_filtering_types = [
|
||||
@@ -619,18 +619,121 @@ def test_sai_on_regular_column_rejected(cql, test_keyspace, scylla_only):
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(x) USING 'sai'")
|
||||
|
||||
|
||||
def test_sai_entries_on_map_rejected(cql, test_keyspace, scylla_only):
|
||||
"""SAI ENTRIES index on a MAP column is rejected by ScyllaDB.
|
||||
On Cassandra this is a common pattern for metadata filtering."""
|
||||
schema = 'p int PRIMARY KEY, metadata_s map<text, text>'
|
||||
def test_sai_entries_on_map_rejected_without_flag(cql, test_keyspace, scylla_only):
|
||||
"""Without enable_cassio_compatibility, SAI ENTRIES on map is rejected
|
||||
with a helpful message suggesting to enable the flag."""
|
||||
schema = 'p int PRIMARY KEY, m map<text, text>'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
with pytest.raises(InvalidRequest, match='SAI.*only supported on vector columns'):
|
||||
with pytest.raises(InvalidRequest, match='enable_cassio_compatibility'):
|
||||
cql.execute(
|
||||
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(metadata_s)) "
|
||||
f"USING 'sai'"
|
||||
f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'"
|
||||
)
|
||||
|
||||
|
||||
def test_sai_entries_on_map_creates_regular_index(cql, test_keyspace, scylla_only):
    """With enable_cassio_compatibility turned on, a fully-qualified SAI
    ENTRIES index on a map column (the CassIO metadata pattern) must be
    accepted, must produce a compatibility warning, and must show up in
    system_schema.indexes."""
    schema = 'p int PRIMARY KEY, metadata_s map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        idx = unique_name()
        create_stmt = (
            f"CREATE CUSTOM INDEX {idx} ON {table}(ENTRIES(metadata_s)) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
        )
        result = cql.execute(create_stmt)

        # The server has to tell the client that the statement was rewritten.
        warnings = result.response_future.warnings
        assert warnings is not None
        warning_text = '\n'.join(warnings)
        assert 'SAI' in warning_text
        assert 'regular secondary index' in warning_text

        # The index must be visible in the schema tables under the requested
        # name; the table name may be quoted, the schema tables are not.
        ks, tbl = table.split('.')
        lookup = (
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s AND index_name = %s"
        )
        params = (ks.replace('"', ''), tbl.replace('"', ''), idx)
        rows = list(cql.execute(lookup, params))
        assert len(rows) == 1
|
||||
|
||||
|
||||
def test_sai_entries_on_map_short_name(cql, test_keyspace, scylla_only):
    """With enable_cassio_compatibility turned on, the short 'sai' class
    alias on ENTRIES of a map column is accepted and yields an SAI
    compatibility warning."""
    schema = 'p int PRIMARY KEY, m map<text, text>'
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        create_stmt = f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'"
        result = cql.execute(create_stmt)
        warnings = result.response_future.warnings
        assert warnings is not None
        # At least one of the returned warnings has to mention SAI.
        assert any('SAI' in w for w in warnings)
|
||||
|
||||
|
||||
def test_sai_entries_on_map_with_options_rejected(cql, test_keyspace, scylla_only):
    """Even with enable_cassio_compatibility turned on, SAI ENTRIES on a map
    is refused as soon as WITH OPTIONS is present: regular secondary indexes
    do not support custom options and CassIO never passes them."""
    schema = 'p int PRIMARY KEY, m map<text, text>'
    # First a non-empty options map, then an empty one. The empty map is
    # expected to be rejected too, because the OPTIONS property is present.
    option_literals = ("{'some_option': 'value'}", "{}")
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        for options in option_literals:
            with pytest.raises(InvalidRequest, match='cannot be rewritten.*WITH OPTIONS'):
                cql.execute(
                    f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) "
                    f"USING 'sai' WITH OPTIONS = {options}"
                )
|
||||
|
||||
|
||||
def test_cassio_setup_mode_sync_simulation(cql, test_keyspace, scylla_only):
    """End-to-end replay of the DDL sequence CassIO runs during
    SetupMode.SYNC for LangChain vector stores (SCYLLADB-2113): with
    enable_cassio_compatibility turned on, both SAI statements must be
    accepted and both indexes must exist afterwards."""
    # The table mirrors what CassIO creates: a metadata_s MAP<TEXT,TEXT>
    # column plus a vector column. CassIO then issues (1) an SAI index on
    # the vector and (2) an SAI ENTRIES index on metadata_s.
    columns = [
        'row_id TEXT PRIMARY KEY',
        'body_blob TEXT',
        'attributes_blob TEXT',
        'metadata_s map<text, text>',
        'vector vector<float, 384>',
    ]
    schema = ', '.join(columns)
    with config_value_context(cql, 'enable_cassio_compatibility', 'true'), \
            new_test_table(cql, test_keyspace, schema) as table:
        # (1) Vector SAI index; expected to be rewritten to vector_index.
        vector_ddl = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(vector) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' "
            f"WITH OPTIONS = {{'similarity_function': 'cosine'}}"
        )
        cql.execute(vector_ddl)

        # (2) Metadata SAI ENTRIES index; expected to be rewritten to a
        # regular secondary index.
        idx_name = unique_name()
        metadata_ddl = (
            f"CREATE CUSTOM INDEX IF NOT EXISTS {idx_name} ON {table}"
            f"(ENTRIES(metadata_s)) "
            f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
        )
        result = cql.execute(metadata_ddl)

        # The metadata statement must come back with the rewrite warning.
        warnings = result.response_future.warnings
        assert warnings is not None
        assert any('SAI' in w and 'regular secondary index' in w for w in warnings)

        # Both indexes must be listed for the table; strip any quoting from
        # the table name before querying the schema tables.
        ks, tbl = (part.replace('"', '') for part in table.split('.'))
        rows = list(cql.execute(
            "SELECT index_name FROM system_schema.indexes "
            "WHERE keyspace_name = %s AND table_name = %s",
            (ks, tbl)
        ))
        assert len(rows) == 2, f"Expected 2 indexes, got {len(rows)}: {rows}"
|
||||
|
||||
|
||||
def test_sai_on_nonexistent_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
"""SAI on a non-existent column should fail with an appropriate error."""
|
||||
schema = 'p int PRIMARY KEY, v vector<float, 3>'
|
||||
|
||||
Reference in New Issue
Block a user