Files
scylladb/test/cqlpy/test_fulltext_index.py
Dawid Pawlik c2d27d1a50 index: remove Chinese, Japanese, and Korean language analyzers
Remove "chinese", "japanese", and "korean" from the list of accepted
full-text search analyzer options. Exposing these options commits
ScyllaDB to supporting them long-term — if we ever switch from one
backend search engine to another, CJK analyzers are the most likely
to lose out-of-the-box support, unlike the popular European languages
that are broadly available across text analysis libraries.

Restrict the accepted set now, while FTS is still new, to avoid a
future compatibility burden.

Add a test to check if the CJK language analyzer options are rejected.

Fixes: VECTOR-672

Closes scylladb/scylladb#29877
2026-05-18 18:20:47 +03:00

220 lines
10 KiB
Python

# Copyright 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
###############################################################################
# Tests for fulltext indexes
#
# This file tests the fulltext_index custom index class, mainly covering
# schema and options validation. It does not test the actual indexing or
# querying behavior.
###############################################################################
import pytest
import re
from .util import new_test_table, unique_name
from cassandra.protocol import InvalidRequest
@pytest.mark.parametrize("column_type", ["text", "varchar", "ascii"])
def test_create_fulltext_index_on_supported_text_column(cql, test_keyspace, scylla_only, column_type):
    """Creating a fulltext index on a text, varchar, or ascii column succeeds.

    The test passes as long as the CREATE CUSTOM INDEX statement does not
    raise; nothing about the resulting index is inspected.
    """
    with new_test_table(cql, test_keyspace, f'p int primary key, content {column_type}') as table:
        create_stmt = f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index'"
        cql.execute(create_stmt)
def test_create_fulltext_index_uppercase_class(cql, test_keyspace, scylla_only):
    """The index class name is accepted when spelled in upper case.

    The statement uses 'FULLTEXT_INDEX' instead of 'fulltext_index' and is
    expected to execute without raising.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = f"CREATE CUSTOM INDEX ON {table}(content) USING 'FULLTEXT_INDEX'"
        cql.execute(create_stmt)
@pytest.mark.parametrize("column_type", ["int", "blob", "vector<float, 3>"])
def test_create_fulltext_index_on_unsupported_column_fails(cql, test_keyspace, scylla_only, column_type):
    """Creating a fulltext index on a non-text column raises InvalidRequest.

    The error message must mention that only text, varchar, or ascii columns
    are supported.
    """
    with new_test_table(cql, test_keyspace, f'p int primary key, v {column_type}') as table:
        create_stmt = f"CREATE CUSTOM INDEX ON {table}(v) USING 'fulltext_index'"
        with pytest.raises(InvalidRequest, match="Fulltext index is only supported on text, varchar, or ascii columns"):
            cql.execute(create_stmt)
def test_create_fulltext_index_with_analyzer_option(cql, test_keyspace, scylla_only):
    """Every analyzer name in the supported list is accepted.

    A single table is reused: for each analyzer an index is created under a
    fresh name and dropped again before the next analyzer is tried.
    """
    supported_analyzers = (
        'standard', 'english', 'german', 'french', 'spanish', 'italian',
        'portuguese', 'russian', 'simple', 'whitespace',
    )
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        for analyzer in supported_analyzers:
            index_name = unique_name()
            create_stmt = (
                f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index' "
                f"WITH OPTIONS = {{'analyzer': '{analyzer}'}}"
            )
            cql.execute(create_stmt)
            # Drop the index so the next iteration starts from a clean column.
            cql.execute(f"DROP INDEX {test_keyspace}.{index_name}")
def test_create_fulltext_index_with_cjk_analyzers_fails(cql, test_keyspace, scylla_only):
    """The 'chinese', 'japanese', and 'korean' analyzers are rejected. (VECTOR-672)

    Each attempt must raise InvalidRequest with a message naming the
    'analyzer' option as invalid.
    """
    rejected_analyzers = ('chinese', 'japanese', 'korean')
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        for analyzer in rejected_analyzers:
            index_name = unique_name()
            create_stmt = (
                f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index' "
                f"WITH OPTIONS = {{'analyzer': '{analyzer}'}}"
            )
            with pytest.raises(InvalidRequest, match="Invalid value in option 'analyzer'"):
                cql.execute(create_stmt)
def test_create_fulltext_index_with_analyzer_case_insensitive(cql, test_keyspace, scylla_only):
    """An analyzer value written in upper case ('ENGLISH') is accepted.

    The test passes as long as the CREATE CUSTOM INDEX statement does not raise.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = (
            f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'analyzer': 'ENGLISH'}}"
        )
        cql.execute(create_stmt)
def test_create_fulltext_index_with_bad_analyzer_fails(cql, test_keyspace, scylla_only):
    """An unknown analyzer name is rejected with InvalidRequest.

    The error message must name the 'analyzer' option as invalid.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = (
            f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'analyzer': 'nonexistent_analyzer'}}"
        )
        with pytest.raises(InvalidRequest, match="Invalid value in option 'analyzer'"):
            cql.execute(create_stmt)
def test_create_fulltext_index_with_positions_option(cql, test_keyspace, scylla_only):
    """The 'positions' option accepts both 'true' and 'false'.

    The same table is reused; each index is created under a fresh name and
    dropped before the next value is tried.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        for val in ('true', 'false'):
            index_name = unique_name()
            create_stmt = (
                f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index' "
                f"WITH OPTIONS = {{'positions': '{val}'}}"
            )
            cql.execute(create_stmt)
            cql.execute(f"DROP INDEX {test_keyspace}.{index_name}")
def test_create_fulltext_index_with_bad_positions_fails(cql, test_keyspace, scylla_only):
    """A 'positions' value other than 'true'/'false' ('maybe') is rejected.

    The statement must raise InvalidRequest with a message naming the
    'positions' option as invalid.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = (
            f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'positions': 'maybe'}}"
        )
        with pytest.raises(InvalidRequest, match="Invalid value in option 'positions'"):
            cql.execute(create_stmt)
def test_create_fulltext_index_with_unsupported_option_fails(cql, test_keyspace, scylla_only):
    """An unrecognised key in WITH OPTIONS is rejected with InvalidRequest.

    The error message must report 'bad_option' as unsupported for a fulltext
    index.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = (
            f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'bad_option': 'bad_value'}}"
        )
        with pytest.raises(InvalidRequest, match="Unsupported option bad_option for fulltext index"):
            cql.execute(create_stmt)
def test_create_fulltext_index_with_multiple_options(cql, test_keyspace, scylla_only):
    """'analyzer' and 'positions' can be specified in the same WITH OPTIONS map.

    The test passes as long as the CREATE CUSTOM INDEX statement does not raise.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        create_stmt = (
            f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'analyzer': 'english', 'positions': 'false'}}"
        )
        cql.execute(create_stmt)
def test_no_view_for_fulltext_index(cql, test_keyspace, scylla_only):
    """Creating a fulltext index adds no row to system_schema.views.

    After the index is created, system_schema.views is queried for the test
    keyspace and base table, and the result is expected to be empty.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        # `table` is "<keyspace>.<name>"; the views query needs the bare name.
        base_table = table.split('.')[1]
        cql.execute(f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index'")
        views_query = (
            f"SELECT * FROM system_schema.views "
            f"WHERE keyspace_name = '{test_keyspace}' AND base_table_name = '{base_table}' "
            f"ALLOW FILTERING"
        )
        view_rows = cql.execute(views_query).current_rows
        assert len(view_rows) == 0, \
            "Fulltext index should not create a view in system_schema.views"
def test_describe_fulltext_index(cql, test_keyspace, scylla_only):
    """DESCRIBE INDEX reproduces the CREATE statement, including user options.

    The returned create_statement must match, in full, a CREATE CUSTOM INDEX
    statement with USING 'fulltext_index' and the 'analyzer' option; a
    trailing semicolon is optional.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        index_name = unique_name()
        create_stmt = (
            f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'analyzer': 'english'}}"
        )
        cql.execute(create_stmt)

        # Names are escaped because the qualified table name contains a dot.
        expected_desc_pattern = (
            rf"CREATE CUSTOM INDEX {re.escape(index_name)} "
            rf"ON {re.escape(table)}\(content\) "
            r"USING 'fulltext_index' "
            r"WITH OPTIONS = "
            r"\{'analyzer': 'english'\};?"
        )
        described = cql.execute(f"DESCRIBE INDEX {test_keyspace}.{index_name}").one().create_statement
        assert re.fullmatch(expected_desc_pattern, described)
def test_describe_fulltext_index_no_options(cql, test_keyspace, scylla_only):
    """DESCRIBE INDEX omits WITH OPTIONS when the index was created without any.

    The create_statement must still contain USING 'fulltext_index'.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        index_name = unique_name()
        create_stmt = f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index'"
        cql.execute(create_stmt)
        described = cql.execute(f"DESCRIBE INDEX {test_keyspace}.{index_name}").one().create_statement
        assert "USING 'fulltext_index'" in described
        assert "WITH OPTIONS" not in described
def test_create_fulltext_index_if_not_exists(cql, test_keyspace, scylla_only):
    """Re-creating an existing fulltext index with IF NOT EXISTS does not raise."""
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        first_create = f"CREATE CUSTOM INDEX ON {table}(content) USING 'fulltext_index'"
        repeat_create = f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(content) USING 'fulltext_index'"
        cql.execute(first_create)
        # The index now exists; the IF NOT EXISTS form must complete without an error.
        cql.execute(repeat_create)
def test_fulltext_index_in_system_schema(cql, test_keyspace, scylla_only):
    """A created fulltext index appears in system_schema.indexes with its options.

    Exactly one row must exist for the index, and its options map must carry
    class_name 'fulltext_index' and the analyzer that was requested.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, content text') as table:
        index_name = unique_name()
        create_stmt = (
            f"CREATE CUSTOM INDEX {index_name} ON {table}(content) USING 'fulltext_index' "
            f"WITH OPTIONS = {{'analyzer': 'standard'}}"
        )
        cql.execute(create_stmt)

        lookup_query = (
            f"SELECT * FROM system_schema.indexes "
            f"WHERE keyspace_name = '{test_keyspace}' AND table_name = '{table.split('.')[1]}' "
            f"AND index_name = '{index_name}'"
        )
        matching_rows = cql.execute(lookup_query).current_rows
        assert len(matching_rows) == 1

        index_options = matching_rows[0].options
        assert index_options['class_name'] == 'fulltext_index'
        assert index_options.get('analyzer') == 'standard'