Remove many unused "import" statements or parts of import statement. All of them were detected by Copilot, but I verified each one manually and prepared this patch. Signed-off-by: Nadav Har'El <nyh@scylladb.com> Closes scylladb/scylladb#27675
103 lines
5.1 KiB
Python
103 lines
5.1 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright 2020-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
|
|
#############################################################################
|
|
# Tests for finer points of UTF-8 support. The issue of *invalid* UTF-8 input
|
|
# is tested in a separate test file - test_validation.py
|
|
#############################################################################
|
|
|
|
import pytest
|
|
import unicodedata
|
|
from cassandra.protocol import SyntaxException, InvalidRequest, ConfigurationException
|
|
from .util import unique_name, unique_key_string
|
|
|
|
|
|
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
    """Yield the name of a table shared by the tests in this module.

    The table lives in ``test_keyspace`` under a name obtained from
    ``unique_name()``, and has a text partition key ``k`` and a text
    clustering key ``c``. It is dropped once the fixture is torn down.
    """
    qualified_name = f"{test_keyspace}.{unique_name()}"
    cql.execute(f"CREATE TABLE {qualified_name} (k text, c text, primary key (k, c))")
    yield qualified_name
    cql.execute(f"DROP TABLE {qualified_name}")
|
|
|
|
# Demonstrate that Scylla, like Cassandra, does NOT support the notion of
|
|
# "Unicode equivalence" (a.k.a. Unicode normalization). Consider the Spanish
|
|
# letter ñ - it can be represented by a single Unicode character 00F1, but
|
|
# can also be represented as a 006E (lowercase "n") followed by a 0303
|
|
# ("combining tilde"). But if you write one of these representations, and
|
|
# then look up the other, Scylla will not find the item. So Scylla does
|
|
# not support unicode equivalence.
|
|
# See https://en.wikipedia.org/wiki/Unicode_equivalence for more information
|
|
# on the issue of Unicode equivalence.
|
|
def test_unicode_equivalence(cql, table1):
    """Check that equivalent-but-different Unicode strings are distinct keys.

    A row written with the single-character form of ñ must not be found
    when looked up with the two-character (n + combining tilde) form,
    both when the string is the clustering key and when it is the
    partition key.
    """
    composed = "\u00F1"  # Spanish ñ as a single character
    decomposed = "\u006E\u0303"  # "n" followed by a combining tilde
    # Sanity check of the premise: the two strings differ as written, yet
    # normalize (NFC) to the same value, i.e., they are "equivalent".
    assert composed != decomposed
    nfc_composed = unicodedata.normalize('NFC', composed)
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    assert nfc_composed == nfc_decomposed

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? and c=?")

    def count_rows(k, c):
        # Number of rows matching exactly this (k, c) pair.
        return len(list(cql.execute(search, [k, c])))

    other_key = unique_key_string()

    # ñ as the *clustering key*: only the representation that was written
    # is found, the equivalent one is not.
    cql.execute(insert, [other_key, composed])
    assert count_rows(other_key, composed) == 1
    assert count_rows(other_key, decomposed) == 0

    # ñ as the *partition key*: same expectation.
    cql.execute(insert, [composed, other_key])
    assert count_rows(composed, other_key) == 1
    assert count_rows(decomposed, other_key) == 0
|
|
|
|
# Demonstrate that the LIKE operation is also not aware of Unicode
|
|
# equivalence: a 'n%' pattern can match one representation of ñ but not
|
|
# another. This is a Scylla-only test, because the LIKE operator doesn't
|
|
# exist in Cassandra.
|
|
def test_unicode_equivalence_like(scylla_only, cql, table1):
    """Check that LIKE treats equivalent Unicode representations differently.

    The single-character ñ matches '_' but not 'n%', whereas the
    two-character form (n + combining tilde) matches 'n%' but not '_'.
    The ``scylla_only`` fixture is requested but not otherwise used here.
    """
    composed = "\u00F1"  # Spanish ñ as a single character
    decomposed = "\u006E\u0303"  # "n" followed by a combining tilde
    # Sanity check of the premise: different strings, same NFC-normalized
    # value.
    assert composed != decomposed
    assert unicodedata.normalize('NFC', composed) == unicodedata.normalize('NFC', decomposed)

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? AND c LIKE ? ALLOW FILTERING")
    pk = unique_key_string()

    def like(pattern):
        # Set of rows in partition pk whose clustering key matches pattern.
        return set(cql.execute(search, [pk, pattern]))

    # With only the composed form stored: it doesn't start with "n", but
    # it is a single character (even though it is not a single byte).
    cql.execute(insert, [pk, composed])
    assert like('n%') == set()
    assert like('_') == {(pk, composed)}

    # After also storing the decomposed form: that one does start with
    # "n", but it is two characters so '_' still matches only the
    # composed form.
    cql.execute(insert, [pk, decomposed])
    assert like('n%') == {(pk, decomposed)}
    assert like('_') == {(pk, composed)}
|
|
|
|
# The CREATE TABLE documentation on Datastax's site says that "The name of a
|
|
# table can be a string of alphanumeric characters and underscores, but it
|
|
# must begin with a letter.". The words "alphanumeric" and "letter" are
|
|
# ambiguous - are various Unicode letters (e.g., Hebrew letters) allowed, or
|
|
# not, in table names? The Apache Cassandra and Scylla DDL documentation is
|
|
# more explicit - table names must match the regular expression
|
|
# [a-zA-Z_0-9]{1, 192} - so they may use only unaccented (ASCII) Latin letters, digits and underscores.
|
|
# Let's confirm this in a test.
|
|
def test_unicode_in_table_names(cql, test_keyspace):
    """Check that non-ASCII letters are rejected in table names.

    Each candidate name is tried both unquoted and double-quoted, and
    creating the table is expected to fail in both cases.
    """
    base = unique_name()
    for suffix in ('עברית', 'Français'):
        unquoted = f'{test_keyspace}.{base}{suffix}'
        quoted = f'{test_keyspace}."{base}{suffix}"'
        # The expected error differs between the two forms: an invalid
        # character in an unquoted name is expected to be a
        # SyntaxException, while inside quotes the problem is detected
        # later and is reported as InvalidRequest in Scylla or
        # ConfigurationException in Cassandra (both are accepted).
        #
        # In each block the DROP is reached only if the CREATE
        # unexpectedly succeeded, so that the table is not left behind.
        with pytest.raises(SyntaxException):
            cql.execute(f'CREATE TABLE {unquoted} (p int PRIMARY KEY)')
            cql.execute(f'DROP TABLE {unquoted}')
        with pytest.raises((InvalidRequest, ConfigurationException)):
            cql.execute(f'CREATE TABLE {quoted} (p int PRIMARY KEY)')
            cql.execute(f'DROP TABLE {quoted}')
|