Files
scylladb/test/cql-pytest/test_utf8.py
Nadav Har'El 05d6eff850 cql-pytest: add tests for non-support of unicode equivalence`
In issue #7843, questions were raised about how much Scylla supports
the notion of Unicode Equivalence, a.k.a. Unicode normalization.

Consider the Spanish letter ñ - it can be represented by a single Unicode
character 00F1, but can also be represented as a 006E (lowercase "n")
followed by a 0303 ("combining tilde"). Unicode specifies that these
two representations should be considered "equivalent" for purposes of
sorting or searching. But the following tests demonstrate that this
is not, in fact, supported in Scylla or Cassandra:

1. If you use one representation as the key, then looking up the other one
   will not find the row. Scylla (and Cassandra) do *not* consider
   the two strings equivalent.

2. The LIKE operator (a Scylla-only extension) doesn't know that
   the single-character ñ begins with an n, or that the two-character
   ñ is just a single character.
   This is despite the thinking on #7843 that, by using ICU in the
   implementation of LIKE, we somehow got support for this. We didn't.

Refs #7843

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20201229125330.3401954-1-nyh@scylladb.com>
2021-01-04 18:25:28 +01:00

94 lines
4.3 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2020 ScyllaDB
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
#############################################################################
# Tests for finer points of UTF-8 support. The issue of *invalid* UTF-8 input
# is tested in a separate test file - test_validation.py
#############################################################################
import pytest
import random
import unicodedata
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
from util import unique_name, random_string
@pytest.fixture(scope="session")
def table1(cql, test_keyspace):
    """Session-scoped fixture providing a scratch table with two text keys.

    A uniquely named table is created inside ``test_keyspace`` with the
    primary key (k, c), both columns being of type text. The table's
    fully-qualified name is yielded to the tests, and the table is dropped
    when the fixture is torn down.
    """
    # Fully-qualified name: <keyspace>.<unique table name>
    table = ".".join((test_keyspace, unique_name()))
    cql.execute(f"CREATE TABLE {table} (k text, c text, primary key (k, c))")
    yield table
    # Teardown: runs after the last test using this fixture has finished.
    cql.execute("DROP TABLE " + table)
# Demonstrate that Scylla, like Cassandra, does NOT support the notion of
# "Unicode equivalence" (a.k.a. Unicode normalization). Consider the Spanish
# letter ñ - it can be represented by a single Unicode character 00F1, but
# can also be represented as a 006E (lowercase "n") followed by a 0303
# ("combining tilde"). But if you write one of these representations, and
# then look up the other, Scylla will not find the item. So Scylla does
# not support unicode equivalence.
# See https://en.wikipedia.org/wiki/Unicode_equivalence for more information
# on the issue of Unicode equivalence.
def test_unicode_equivalence(cql, table1):
    """Verify that Unicode-equivalent strings are treated as distinct keys.

    The same letter is written once as a single code point and once as a
    base letter followed by a combining mark. A row is inserted using the
    first form, and a lookup using the second form is expected to return
    nothing - both when the string is the clustering key and when it is
    the partition key.
    """
    composed = "\u00F1"  # Spanish ñ as one character
    decomposed = "\u006E\u0303"  # Two characters: n followed by combining tilde.
    # Sanity check of the premise: the two strings differ as sequences of
    # code points, yet they normalize (NFC) to the same value, i.e., they
    # are equivalent as far as Unicode is concerned.
    assert composed != decomposed
    nfc_composed = unicodedata.normalize('NFC', composed)
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    assert nfc_composed == nfc_decomposed

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? and c=?")

    def count_rows(k, c):
        # Number of rows found for the exact (partition, clustering) pair.
        return len(list(cql.execute(search, [k, c])))

    s = random_string()

    # Clustering key case: the row written with the one-character form is
    # found by that form, but not by the equivalent two-character form.
    cql.execute(insert, [s, composed])
    assert count_rows(s, composed) == 1
    assert count_rows(s, decomposed) == 0

    # Partition key case: same expectation when the Unicode string is the
    # partition key instead.
    cql.execute(insert, [composed, s])
    assert count_rows(composed, s) == 1
    assert count_rows(decomposed, s) == 0
# Demonstrate that the LIKE operation is also not aware of Unicode
# equivalence: a 'n%' pattern can match one representation of ñ but not
# another. This is a Scylla-only test, because the LIKE operator doesn't
# exist in Cassandra.
def test_unicode_equivalence_like(scylla_only, cql, table1):
    """Verify that LIKE does not treat Unicode-equivalent strings alike.

    The same letter is stored once as a single code point and once as a
    base letter "n" followed by a combining mark. The patterns 'n%' and
    '_' are expected to match different representations, even though the
    two strings are equivalent according to Unicode normalization.
    """
    composed = "\u00F1"  # Spanish ñ as one character
    decomposed = "\u006E\u0303"  # Two characters: n followed by combining tilde.
    # Sanity check of the premise: the two strings differ as sequences of
    # code points, yet they normalize (NFC) to the same value, i.e., they
    # are equivalent as far as Unicode is concerned.
    assert composed != decomposed
    nfc_composed = unicodedata.normalize('NFC', composed)
    nfc_decomposed = unicodedata.normalize('NFC', decomposed)
    assert nfc_composed == nfc_decomposed

    insert = cql.prepare(f"INSERT INTO {table1} (k, c) VALUES (?, ?)")
    search = cql.prepare(f"SELECT k, c FROM {table1} WHERE k=? AND c LIKE ? ALLOW FILTERING")
    s = random_string()

    def matching(pattern):
        # All rows of partition s whose clustering key matches the pattern.
        return set(cql.execute(search, [s, pattern]))

    # The one-character form does not match the pattern 'n%'...
    cql.execute(insert, [s, composed])
    assert matching('n%') == set()
    # ...but it does match '_' (a single character, though not a single byte).
    assert matching('_') == {(s, composed)}

    # The two-character form, on the other hand, matches 'n%' but not '_'
    # (so '_' still returns only the one-character row):
    cql.execute(insert, [s, decomposed])
    assert matching('n%') == {(s, decomposed)}
    assert matching('_') == {(s, composed)}