This commit increases the maximum length of names for keyspaces, tables, materialized views, and indexes from 48 to 192 bytes. The previous 48-byte limit was inherited from Cassandra 3 for compatibility. However, this validation was removed in Cassandra 4 and 5 (see CASSANDRA-20389) and some usage scenarios (such as some feature store workflows generating long table names) now depend on this relaxed constraint. This change brings ScyllaDB's behavior in line with modern Cassandra versions and better supports these use cases. The new limit of 192 bytes is derived from underlying filesystem limitations to prevent runtime errors when creating directories for table data. When a new table is created, ScyllaDB generates a directory for its SSTables. The directory name is constructed from the table name, a dash, and a 32-character UUID. For a CDC-enabled table, an associated log table is also created, which has the suffix `_scylla_cdc_log` appended to its name. The directory name for this log table becomes the longest possible representation. Additionally, we reserve 15 bytes for future use, allowing for potential future extensions without breaking existing schemas. To guarantee that directory creation never fails due to exceeding filesystem name limits, the maximum name length is calculated as follows: 255 bytes (common filesystem limit for a path component) - 32 bytes (for the 32-character UUID string) - 1 byte (for the '-' separator) - 15 bytes (for the '_scylla_cdc_log' suffix) - 15 bytes (reserved for future use) ---------- = 192 bytes (Maximum allowed name length) This calculation is similar in principle to the one proposed for Cassandra to fix related directory creation failures (see apache/cassandra/pull/4038). This patch also updates/adds all associated tests to validate the new 192-byte limit. The documentation has been updated accordingly.
103 lines
5.2 KiB
Python
103 lines
5.2 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright 2020-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
|
|
#############################################################################
|
|
# Tests for finer points of UTF-8 support. The issue of *invalid* UTF-8 input
|
|
# is tested in a separate test file - test_validation.py
|
|
#############################################################################
|
|
|
|
import pytest
|
|
import unicodedata
|
|
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
|
|
from .util import unique_name, unique_key_string
|
|
|
|
|
|
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
    # Module-scoped test table with a text partition key "k" and a text
    # clustering key "c", dropped again when the module's tests finish.
    name = f"{test_keyspace}.{unique_name()}"
    cql.execute(f"CREATE TABLE {name} (k text, c text, primary key (k, c))")
    yield name
    cql.execute(f"DROP TABLE {name}")
|
|
|
|
# Demonstrate that Scylla, like Cassandra, does NOT support the notion of
|
|
# "Unicode equivalence" (a.k.a. Unicode normalization). Consider the Spanish
|
|
# letter ñ - it can be represented by a single Unicode character 00F1, but
|
|
# can also be represented as a 006E (lowercase "n") followed by a 0303
|
|
# ("combining tilde"). But if you write one of these representations, and
|
|
# then look up the other, Scylla will not find the item. So Scylla does
|
|
# not support unicode equivalence.
|
|
# See https://en.wikipedia.org/wiki/Unicode_equivalence for more information
|
|
# on the issue of Unicode equivalence.
|
|
def test_unicode_equivalence(cql, table1):
    # Two spellings of the Spanish letter ñ: precomposed (U+00F1) and
    # decomposed (n + U+0303 combining tilde). They normalize to the same
    # NFC form, yet Scylla/Cassandra compare keys byte-wise, so storing one
    # spelling and looking up the other finds nothing.
    composed = "\u00F1"        # ñ as a single code point
    decomposed = "\u006E\u0303"  # "n" followed by a combining tilde
    # Sanity check: distinct strings, identical NFC normalization.
    assert composed != decomposed
    assert unicodedata.normalize('NFC', composed) == unicodedata.normalize('NFC', decomposed)

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? and c=?")

    def hits(k, c):
        # Number of rows returned for an exact (k, c) lookup.
        return sum(1 for _ in cql.execute(search, [k, c]))

    key = unique_key_string()
    # Writing the composed form as a *clustering key*: only the exact same
    # byte sequence is found, not its normalized equivalent.
    cql.execute(insert, [key, composed])
    assert hits(key, composed) == 1
    assert hits(key, decomposed) == 0
    # Same check with the composed form as a *partition key*.
    cql.execute(insert, [composed, key])
    assert hits(composed, key) == 1
    assert hits(decomposed, key) == 0
|
|
|
|
# Demonstrate that the LIKE operation is also not aware of Unicode
|
|
# equivalence: a 'n%' pattern can match one representation of ñ but not
|
|
# another. This is a Scylla-only test, because the LIKE operator doesn't
|
|
# exist in Cassandra.
|
|
def test_unicode_equivalence_like(scylla_only, cql, table1):
    # Precomposed ñ vs. decomposed n + combining tilde (same NFC form).
    composed = "\u00F1"
    decomposed = "\u006E\u0303"
    # Sanity check: distinct strings, identical NFC normalization.
    assert composed != decomposed
    assert unicodedata.normalize('NFC', composed) == unicodedata.normalize('NFC', decomposed)

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? AND c LIKE ? ALLOW FILTERING")

    def rows_matching(k, pattern):
        # All (k, c) rows whose clustering key matches the LIKE pattern.
        return set(cql.execute(search, [k, pattern]))

    key = unique_key_string()
    # The single-character composed ñ does not match 'n%', but does match
    # the single-character wildcard '_'.
    cql.execute(insert, [key, composed])
    assert rows_matching(key, 'n%') == set()
    assert rows_matching(key, '_') == {(key, composed)}
    # The decomposed form starts with a literal "n", so it matches 'n%',
    # but being two code points it does not match '_'.
    cql.execute(insert, [key, decomposed])
    assert rows_matching(key, 'n%') == {(key, decomposed)}
    assert rows_matching(key, '_') == {(key, composed)}
|
|
|
|
# The CREATE TABLE documentation on Datastax's site says that "The name of a
|
|
# table can be a string of alphanumeric characters and underscores, but it
|
|
# must begin with a letter.". The words "alphanumeric" and "letter" are
|
|
# ambiguous - are various Unicode letters (e.g., Hebrew letters) allowed, or
|
|
# not, in table names? The Apache Cassandra and Scylla DDL documentation is
|
|
# more explicit - table names must match the regular expression
|
|
# [a-zA-Z_0-9]{1, 192} - so must use the Latin alphabet and nothing else.
|
|
# Let's confirm this in a test.
|
|
def test_unicode_in_table_names(cql, test_keyspace):
    # Table names must match [a-zA-Z_0-9]+, so names containing non-Latin
    # letters must be rejected whether quoted or not. Unquoted, the invalid
    # characters are caught by the parser (SyntaxException); quoted, they are
    # detected later and yield InvalidRequest in Scylla or
    # ConfigurationException in Cassandra — we accept either.
    prefix = unique_name()
    for word in ['עברית', 'Français']:
        unquoted = f'{test_keyspace}.{prefix}{word}'
        quoted = f'{test_keyspace}."{prefix}{word}"'
        with pytest.raises(SyntaxException):
            cql.execute(f'CREATE TABLE {unquoted} (p int PRIMARY KEY)')
            # Only reached if CREATE unexpectedly succeeded: clean up the
            # table (pytest.raises still fails the test for not raising).
            cql.execute(f'DROP TABLE {unquoted}')
        with pytest.raises((InvalidRequest, ConfigurationException)):
            cql.execute(f'CREATE TABLE {quoted} (p int PRIMARY KEY)')
            cql.execute(f'DROP TABLE {quoted}')
|