From c30c9f3a82bec3db4103dab14bf893e80be1bf7f Mon Sep 17 00:00:00 2001 From: Szymon Wasik Date: Thu, 21 May 2026 12:19:15 +0200 Subject: [PATCH] cql: rewrite CassIO SAI metadata index to regular secondary index When CassIO creates a SAI ENTRIES index on a map column, ScyllaDB now rewrites it to a regular secondary index and emits a CQL warning. This allows LangChain/CassIO applications to work without DDL errors. The rewrite is gated behind the enable_cassio_compatibility flag (disabled by default). Refs: SCYLLADB-2113 (cherry picked from commit 5ee339b11d5169257cfaee1bf2123461a48b8252) --- cql3/statements/create_index_statement.cc | 86 ++++++++++--- docs/cql/secondary-indexes.rst | 23 +++- docs/using-scylla/cassandra-compatibility.rst | 4 +- index/secondary_index_manager.cc | 4 +- test/cqlpy/test_vector_index.py | 119 ++++++++++++++++-- 5 files changed, 204 insertions(+), 32 deletions(-) diff --git a/cql3/statements/create_index_statement.cc b/cql3/statements/create_index_statement.cc index 8ebac074c9..8ae703abed 100644 --- a/cql3/statements/create_index_statement.cc +++ b/cql3/statements/create_index_statement.cc @@ -133,17 +133,30 @@ static bool is_vector_capable_class(const sstring& class_name) { return class_name == "vector_index" || is_sai_class_name(class_name); } -// When the custom class is SAI, verify that at least one target is a -// vector column and rewrite the class to ScyllaDB's native "vector_index". -// Non-vector single-column targets and multi-column (local-index partition -// key) targets are skipped — they are treated as filtering columns by -// vector_index::check_target(). -static void maybe_rewrite_sai_to_vector_index( +// When the custom class is SAI, attempt to rewrite the index to a +// ScyllaDB-native equivalent. Returns false when the rewrite is fully +// transparent (vector_index), or true when the resulting index is only +// partially compatible with Cassandra SAI semantics and the caller should +// emit a warning to the user. 
+// +// 1. Vector columns: rewrite to ScyllaDB's native "vector_index". +// Non-vector single-column targets and multi-column (local-index +// partition key) targets are skipped — they are treated as filtering +// columns by vector_index::check_target(). +// +// 2. CassIO metadata pattern (ENTRIES on a non-frozen map): clear the +// custom class so the index is created as a standard secondary index. +// This rewrite is only performed when the enable_cassio_compatibility +// configuration option is enabled (disabled by default). +// +// If neither pattern matches, throws invalid_request_exception. +static bool maybe_rewrite_sai_index( const schema& schema, const std::vector<::shared_ptr<index_target>>& targets, - index_specific_prop_defs& props) { + index_specific_prop_defs& props, + bool cassio_compat) { if (!props.custom_class || !is_sai_class_name(*props.custom_class)) { - return; + return false; } for (const auto& target : targets) { auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value); @@ -158,7 +171,38 @@ static void maybe_rewrite_sai_to_vector_index( } if (dynamic_cast<const vector_type_impl*>(cd->type.get())) { props.custom_class = "vector_index"; - return; + return false; + } + } + // No vector column found. Check if this is the CassIO metadata-map + // pattern: a single ENTRIES target on a non-frozen map column with no + // custom options (CassIO does not pass WITH OPTIONS for this index). 
+ if (targets.size() == 1) { + const auto& target = targets.front(); + if (target->type == index_target::target_type::keys_and_values) { + auto* ident = std::get_if<::shared_ptr<column_identifier>>(&target->value); + if (ident) { + auto cd = schema.get_column_definition((*ident)->name()); + if (cd && cd->type->is_multi_cell() && cd->type->is_map()) { + if (props.has_property(index_specific_prop_defs::KW_OPTIONS)) { + throw exceptions::invalid_request_exception( + "StorageAttachedIndex (SAI) on map entries cannot be " + "rewritten to a regular secondary index when WITH OPTIONS " + "are specified; remove the options or use a secondary index directly"); + } + if (!cassio_compat) { + throw exceptions::invalid_request_exception( + "StorageAttachedIndex (SAI) on ENTRIES of a map column is not " + "supported; if this is a CassIO/LangChain workload, enable the " + "enable_cassio_compatibility configuration option to rewrite " + "this index as a regular secondary index"); + } + // Rewrite to a regular secondary index on map entries. + props.custom_class.reset(); + props.is_custom = false; + return true; + } + } } } throw exceptions::invalid_request_exception( @@ -380,6 +424,21 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l throw exceptions::invalid_request_exception(format("index names shouldn't be more than {:d} characters long (got \"{}\")", schema::NAME_LENGTH, _index_name.c_str())); } + validate_for_local_index(*schema); + + std::vector<::shared_ptr<index_target>> targets; + for (auto& raw_target : _raw_targets) { + targets.emplace_back(raw_target->prepare(*schema)); + } + + bool cassio_compat = db.get_config().enable_cassio_compatibility(); + if (maybe_rewrite_sai_index(*schema, targets, *_idx_properties, cassio_compat)) { + warnings.emplace_back( + "SAI (StorageAttachedIndex) is not supported by ScyllaDB. " + "This statement was rewritten to use a regular secondary index. 
" + "Metadata filtering behavior may differ from Cassandra SAI."); + } + // Regular secondary indexes require rf-rack-validity. // Custom indexes need to validate this property themselves, if they need it. if (!_idx_properties || !_idx_properties->custom_class) { @@ -400,15 +459,6 @@ create_index_statement::validate_while_executing(data_dictionary::database db, l } } - validate_for_local_index(*schema); - - std::vector<::shared_ptr> targets; - for (auto& raw_target : _raw_targets) { - targets.emplace_back(raw_target->prepare(*schema)); - } - - maybe_rewrite_sai_to_vector_index(*schema, targets, *_idx_properties); - if (_idx_properties && _idx_properties->custom_class) { auto custom_index_factory = secondary_index::secondary_index_manager::get_custom_class_factory(*_idx_properties->custom_class); if (!custom_index_factory) { diff --git a/docs/cql/secondary-indexes.rst b/docs/cql/secondary-indexes.rst index db5f3e3553..aa94d00c1a 100644 --- a/docs/cql/secondary-indexes.rst +++ b/docs/cql/secondary-indexes.rst @@ -307,11 +307,28 @@ The ``similarity_function`` option is supported by both Cassandra SAI and Scylla .. note:: - SAI class names are only supported on **vector columns**. Using an SAI class name on a - non-vector column (e.g., ``text`` or ``int``) will result in an error. General SAI - indexing of non-vector columns is not supported by ScyllaDB; use a + SAI class names are supported on **vector columns** and on **ENTRIES of non-frozen map + columns** (the CassIO metadata-map pattern). + + * For vector columns, the index is rewritten to a native ``vector_index``. + * For ``ENTRIES(map_column)``, the SAI class is stripped and a standard secondary index + is created instead. A CQL warning is emitted noting possible behavioral differences + with Cassandra SAI metadata filtering. This rewrite requires the + ``enable_cassio_compatibility`` configuration option to be set to ``true``. 
+ + Using an SAI class name on any other non-vector column (e.g., ``text`` or ``int``) will + result in an error. General SAI indexing is not supported by ScyllaDB; use a :doc:`secondary index ` instead. + Example of the metadata-map rewrite:: + + -- CassIO issues this during schema setup: + CREATE CUSTOM INDEX ON my_table (ENTRIES(metadata_s)) + USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'; + + -- ScyllaDB creates the equivalent of: + CREATE INDEX ON my_table (ENTRIES(metadata_s)); + .. _drop-index-statement: DROP INDEX diff --git a/docs/using-scylla/cassandra-compatibility.rst b/docs/using-scylla/cassandra-compatibility.rst index a6626f6df6..5b88357032 100644 --- a/docs/using-scylla/cassandra-compatibility.rst +++ b/docs/using-scylla/cassandra-compatibility.rst @@ -239,7 +239,9 @@ Indexing and Caching |:doc:`Materialized Views ` | |v| | +----------------------------------------------------------------+--------------------------------------------------------------------------------------+ -:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index`` +:sup:`*` SAI class name on vector columns is rewritten to native ``vector_index``; +SAI on ``ENTRIES(map)`` is rewritten to a regular secondary index +(requires ``enable_cassio_compatibility: true``) Additional Features diff --git a/index/secondary_index_manager.cc b/index/secondary_index_manager.cc index 1483c0bef1..ed84e1edcf 100644 --- a/index/secondary_index_manager.cc +++ b/index/secondary_index_manager.cc @@ -204,8 +204,8 @@ std::optional secondary_index_manager::custom_index_class(const schema& // We prefer this over a static custom index class instance, as it allows us to avoid any issues with thread safety. // // Note: SAI class names (StorageAttachedIndex, sai) are not listed here -// because maybe_rewrite_sai_to_vector_index() in create_index_statement.cc -// rewrites them to "vector_index" before the index metadata is persisted. 
+// because maybe_rewrite_sai_index() in create_index_statement.cc rewrites +// them to "vector_index" (or a regular index) before the metadata is persisted. std::optional<std::function<std::unique_ptr<custom_index>()>> secondary_index_manager::get_custom_class_factory(const sstring& class_name) { sstring lower_class_name = class_name; std::transform(lower_class_name.begin(), lower_class_name.end(), lower_class_name.begin(), ::tolower); diff --git a/test/cqlpy/test_vector_index.py b/test/cqlpy/test_vector_index.py index 17d664fe34..1fbde03acb 100644 --- a/test/cqlpy/test_vector_index.py +++ b/test/cqlpy/test_vector_index.py @@ -22,7 +22,7 @@ import pytest import json -from .util import new_test_table, is_scylla, unique_name +from .util import new_test_table, is_scylla, unique_name, config_value_context from cassandra.protocol import InvalidRequest, ConfigurationException supported_filtering_types = [ @@ -619,18 +619,121 @@ def test_sai_on_regular_column_rejected(cql, test_keyspace, scylla_only): cql.execute(f"CREATE CUSTOM INDEX ON {table}(x) USING 'sai'") -def test_sai_entries_on_map_rejected(cql, test_keyspace, scylla_only): - """SAI ENTRIES index on a MAP column is rejected by ScyllaDB. 
- On Cassandra this is a common pattern for metadata filtering.""" - schema = 'p int PRIMARY KEY, metadata_s map<text, text>' +def test_sai_entries_on_map_rejected_without_flag(cql, test_keyspace, scylla_only): + """Without enable_cassio_compatibility, SAI ENTRIES on map is rejected + with a helpful message suggesting to enable the flag.""" + schema = 'p int PRIMARY KEY, m map<text, text>' with new_test_table(cql, test_keyspace, schema) as table: - with pytest.raises(InvalidRequest, match='SAI.*only supported on vector columns'): + with pytest.raises(InvalidRequest, match='enable_cassio_compatibility'): cql.execute( - f"CREATE CUSTOM INDEX ON {table}(ENTRIES(metadata_s)) " - f"USING 'sai'" + f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'" ) +def test_sai_entries_on_map_creates_regular_index(cql, test_keyspace, scylla_only): + """SAI ENTRIES index on a MAP column (CassIO metadata pattern) is rewritten + to a regular secondary index on ScyllaDB with a compatibility warning.""" + schema = 'p int PRIMARY KEY, metadata_s map<text, text>' + with config_value_context(cql, 'enable_cassio_compatibility', 'true'): + with new_test_table(cql, test_keyspace, schema) as table: + idx = unique_name() + result = cql.execute( + f"CREATE CUSTOM INDEX {idx} ON {table}(ENTRIES(metadata_s)) " + f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'" + ) + # Verify a compatibility warning was returned. + warnings = result.response_future.warnings + assert warnings is not None + warning_text = '\n'.join(warnings) + assert 'SAI' in warning_text + assert 'regular secondary index' in warning_text + # Verify the index was actually created. 
+ ks, tbl = table.split('.') + rows = list(cql.execute( + "SELECT index_name FROM system_schema.indexes " + "WHERE keyspace_name = %s AND table_name = %s AND index_name = %s", + (ks.replace('"', ''), tbl.replace('"', ''), idx) + )) + assert len(rows) == 1 + + +def test_sai_entries_on_map_short_name(cql, test_keyspace, scylla_only): + """Same as above but using the short 'sai' class name alias.""" + schema = 'p int PRIMARY KEY, m map<text, text>' + with config_value_context(cql, 'enable_cassio_compatibility', 'true'): + with new_test_table(cql, test_keyspace, schema) as table: + result = cql.execute( + f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) USING 'sai'" + ) + warnings = result.response_future.warnings + assert warnings is not None + assert any('SAI' in w for w in warnings) + + +def test_sai_entries_on_map_with_options_rejected(cql, test_keyspace, scylla_only): + """SAI ENTRIES on map with WITH OPTIONS should be rejected — regular secondary + indexes do not support custom options and CassIO never passes them.""" + schema = 'p int PRIMARY KEY, m map<text, text>' + with config_value_context(cql, 'enable_cassio_compatibility', 'true'): + with new_test_table(cql, test_keyspace, schema) as table: + # Non-empty options + with pytest.raises(InvalidRequest, match='cannot be rewritten.*WITH OPTIONS'): + cql.execute( + f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) " + f"USING 'sai' WITH OPTIONS = {{'some_option': 'value'}}" + ) + # Empty options map — still rejected because OPTIONS property is present + with pytest.raises(InvalidRequest, match='cannot be rewritten.*WITH OPTIONS'): + cql.execute( + f"CREATE CUSTOM INDEX ON {table}(ENTRIES(m)) " + f"USING 'sai' WITH OPTIONS = {{}}" + ) + + +def test_cassio_setup_mode_sync_simulation(cql, test_keyspace, scylla_only): + """Replay the exact DDL sequence that CassIO runs during SetupMode.SYNC + for LangChain vector stores (SCYLLADB-2113). 
This validates end-to-end + that the SAI rewrite allows the full CassIO setup flow to succeed.""" + # CassIO creates a table with metadata_s MAP<TEXT,TEXT> and a vector column, + # then creates: (1) SAI index on vector, (2) SAI ENTRIES index on metadata_s. + schema = ( + 'row_id TEXT PRIMARY KEY, ' + 'body_blob TEXT, ' + 'attributes_blob TEXT, ' + 'metadata_s map<text, text>, ' + 'vector vector<float, 3>' + ) + with config_value_context(cql, 'enable_cassio_compatibility', 'true'): + with new_test_table(cql, test_keyspace, schema) as table: + # (1) CassIO vector SAI index — rewritten to vector_index + cql.execute( + f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(vector) " + f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " + f"WITH OPTIONS = {{'similarity_function': 'cosine'}}" + ) + # (2) CassIO metadata SAI ENTRIES index — rewritten to regular index + idx_name = unique_name() + result = cql.execute( + f"CREATE CUSTOM INDEX IF NOT EXISTS {idx_name} ON {table}" + f"(ENTRIES(metadata_s)) " + f"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'" + ) + # Verify warning about SAI rewrite + warnings = result.response_future.warnings + assert warnings is not None + assert any('SAI' in w and 'regular secondary index' in w for w in warnings) + # Verify both indexes exist in schema + ks, tbl = table.split('.') + ks = ks.replace('"', '') + tbl = tbl.replace('"', '') + rows = list(cql.execute( + "SELECT index_name FROM system_schema.indexes " + "WHERE keyspace_name = %s AND table_name = %s", + (ks, tbl) + )) + assert len(rows) == 2, f"Expected 2 indexes, got {len(rows)}: {rows}" + + +def test_sai_on_nonexistent_column(cql, test_keyspace, skip_on_scylla_vnodes): + """SAI on a non-existent column should fail with an appropriate error.""" + schema = 'p int PRIMARY KEY, v vector<float, 3>'