Remove many unused "import" statements or parts of import statement. All of them were detected by Copilot, but I verified each one manually and prepared this patch. Signed-off-by: Nadav Har'El <nyh@scylladb.com> Closes scylladb/scylladb#27675
380 lines
18 KiB
Python
380 lines
18 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright 2020-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
|
|
#############################################################################
|
|
# Tests for the type validation in CQL. For example, it should not be
|
|
# possible to insert a non-ASCII string into a column of type "ascii", or an
|
|
# invalid UTF-8 string into a column of type "text".
|
|
#############################################################################
|
|
|
|
import pytest
|
|
import re
|
|
import random
|
|
from cassandra.protocol import InvalidRequest
|
|
from .util import unique_name, unique_key_int, new_test_table
|
|
|
|
# Module-scoped table with an "ascii" column and a "text" column, shared by
# all tests in this file and dropped once the module's tests are done.
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
    name = f"{test_keyspace}.{unique_name()}"
    cql.execute(f"CREATE TABLE {name} (k int primary key, a ascii, t text)")
    yield name
    cql.execute(f"DROP TABLE {name}")
|
|
|
|
#############################################################################
|
|
# The following tests verify that inserting an invalid UTF-8 string into a
|
|
# "text" column is forbidden. There are multiple ways in which we can try to
|
|
# inject an invalid UTF-8 into a request, and each of them exercises a
|
|
# different code path so should check all of them below.
|
|
|
|
# Examples of invalid UTF-8 strings, with comments on why they are invalid.
|
|
# Note that currently, Scylla's UTF-8 parser is stricter than Cassandra's,
|
|
# and rejects the following cases which Cassandra does *not* reject:
|
|
# 1. \xC0\x80 as another non-minimal representation of null (other non-
|
|
# minimal encodings are rejected as expected)
|
|
# 2. Characters beyond the official Unicode range.
|
|
# 3. UTF-16 surrogates (which are not valid UTF-8).
|
|
bad_utf8 = [
    # Non-minimal representations (in this case of 0x00) are not valid UTF-8
    b'\xC0\x80', # NOTE: not recognized invalid by Cassandra
    b'\xE0\x80\x80',
    b'\xF0\x80\x80\x80',
    # 0x80-0xBF are continuation bytes - can never be the first byte
    b'\x80',
    b'\xBF',
    # 0xC0-0xDF indicate the first byte of a 2-byte sequence. The next byte
    # must be a continuation byte (0x80-0xBF).
    # Above we also checked that \xC0\x80 is also invalid (non-minimal
    # representation), but more generally any sequence starting in C0 or
    # C1 will be non-minimal (they can only encode characters between
    # 00-7F), so the characters C0 or C1 can never appear in valid UTF8.
    b'\xC0',
    b'\xC0\x7F',
    b'\xC0\xC0',
    b'\xC0\x81',
    b'\xC0\xBE',
    b'\xC1\x81',
    b'\xC1\xBE',
    b'\xC2',
    b'\xC2\x7F',
    b'\xC2\xC0',
    # 0xE0-0xEF indicate the first byte of a 3-byte sequence. The next two
    # bytes must be continuation bytes (0x80-0xBF).
    b'\xE0',
    b'\xE0\xA0',
    b'\xE0\xA0\x79',
    b'\xE0\xA0\xC0',
    # 0xF0-0xF4 indicate the first byte of a 4-byte sequence. Note that 0xF4
    # is the last possible byte in this range because unicode ends at 0x10FFFF.
    # The next three bytes must be continuation bytes (0x80-0xBF).
    b'\xF0',
    b'\xF0\x90',
    b'\xF0\x90\x81',
    b'\xF0\x90\x81\x79',
    b'\xF0\x90\x81\xC0',
    # Actually, the unicode range ends in the middle of 0xF4.
    b'\xF4\x90\x80\x80', # NOTE: not recognized invalid by Cassandra
    # 0xF5-0xFF cannot be the first byte because of Unicode's range as
    # explained above. These bytes can't appear inside any UTF8.
    b'\xF5',
    b'\xF5\x81',
    b'\xF5\x81\x81',
    #b'\xF5\x81\x81\x81',
    # UTF-16 surrogates are not valid UTF-8
    b'\xED\xA0\x80', # NOTE: not recognized invalid by Cassandra
    b'\xED\xAF\xBF', # NOTE: not recognized invalid by Cassandra
    b'\xED\xB0\x80', # NOTE: not recognized invalid by Cassandra
    b'\xED\xBF\xBF', # NOTE: not recognized invalid by Cassandra
]
|
|
# Some examples of good UTF-8 strings, as byte strings. It is important that
|
|
# all the tests below check that good UTF-8 works, not just that bad UTF-8
|
|
# fails. That confirms that the tests can actually tell apart good and bad
|
|
# results - and aren't just buggy and always fail! (we had such a bug in the
|
|
# first version of this test...)
|
|
good_utf8 = [
    # (Each entry is a bytes object holding a valid UTF-8 encoding; tests
    # decode it as needed.)
    # ASCII
    b'hello',
    # Null is fine
    b'\x00',
    # Some Hebrew :-)
    'שלום'.encode('utf-8'),
    # 0xC0-0xDF indicate the first byte of a 2-byte sequence. The next byte
    # must be a continuation byte (0x80-0xBF). As explained above, because of
    # minimal encoding requirement, 0xC0 and 0xC1 aren't actually legal, but
    # 0xC2 is:
    b'\xC2\x80',
    b'\xC2\xBF',
    # 0xE0-0xEF indicate the first byte of a 3-byte sequence. The next two
    # bytes must be continuation bytes (0x80-0xBF), but again for the encoding
    # to be minimal, when the first byte is the first possible one (E0) the
    # second byte needs to be A0 or above.
    b'\xE0\xA0\x80',
    b'\xE0\xA0\xBF',
    b'\xE0\xBF\x80',
    b'\xE0\xBF\xBF',
    b'\xEF\x80\x80',
    b'\xEF\x80\xBF',
    b'\xEF\xBF\x80',
    b'\xEF\xBF\xBF',
    # 0xF0-0xF4 indicate the first byte of a 4-byte sequence.
    # The next three bytes must be continuation bytes (0x80-0xBF).
    # Again, because of minimal encoding, the earlier sequences with these
    # bytes aren't actually allowed.
    b'\xF0\x90\x80\x80', # the lowest sequence allowed because of minimalism
    b'\xF0\x90\x80\xBF',
    b'\xF0\x90\xBF\x80',
    b'\xF0\x90\xBF\xBF',
    b'\xF0\xBF\x80\x80',
    b'\xF0\xBF\x80\xBF',
    b'\xF0\xBF\xBF\x80',
    b'\xF0\xBF\xBF\xBF',
    b'\xF4\x80\x80\x80',
    b'\xF4\x80\x80\xBF',
    b'\xF4\x80\xBF\x80',
    b'\xF4\x80\xBF\xBF',
    b'\xF4\x8F\xBF\xBF', # the highest allowed sequence because of unicode range
]
|
|
|
|
# 1. We can pass a string using the blob representation of its bytes (0x...)
|
|
# and the builtin blobAsText function. This function converts the blob into
|
|
# a string assuming it has UTF-8 encoding, and should complain when it's
|
|
# invalid. The error Cassandra and Scylla print in this case looks like
|
|
# "In call to function blobastext [or system.blobastext], value 0xc0 is
|
|
# not a valid binary representation for type text".
|
|
# Note that currently, Scylla's UTF-8 parser is stricter than Cassandra's
|
|
# (see comment above listing the relevant cases), so this test, as all tests
|
|
# using the bad_utf8 array, will fail on Cassandra.
|
|
def test_validation_utf8_as_blob(scylla_only, cql, table1):
    """blobAsText() must accept valid UTF-8 and reject invalid UTF-8."""
    cmd = "INSERT INTO {} (k, t) VALUES (1, blobAsText(0x{}))"
    for raw in good_utf8:
        print(raw)
        cql.execute(cmd.format(table1, raw.hex()))
        # Read the row back to confirm the written value round-trips intact.
        rows = list(cql.execute(f"SELECT k, t FROM {table1} WHERE k=1"))
        assert len(rows) == 1
        assert rows[0].k == 1
        assert rows[0].t.encode('utf-8') == raw
    for raw in bad_utf8:
        print(raw)
        with pytest.raises(InvalidRequest, match='not a valid binary representation for type text'):
            cql.execute(cmd.format(table1, raw.hex()))
|
|
|
|
# 2. We can pass the string as a bound argument to a prepared statement.
|
|
# Convincing Python to put an invalid UTF-8 here is not trivial, because
|
|
# the driver outputs strings, which are not supposed to be able to contain
|
|
# invalid UTF-8. We use a rather funky workaround here: use a wrapped
|
|
# version ("surrogateescape") of bad UTF-8, and monkey-patch the driver to
|
|
# unescape it when converting it back to bytes.
|
|
# Note that currently, Scylla's UTF-8 parser is stricter than Cassandra's
|
|
# (see comment above listing the relevant cases), so this test, as all tests
|
|
# using the bad_utf8 array, will fail on Cassandra.
|
|
def test_validation_utf8_bound_column(scylla_only, cql, table1):
    """Invalid UTF-8 passed as a bound value must be rejected server-side."""
    import cassandra.cqltypes
    # Python strings cannot normally hold invalid UTF-8, so the bad byte
    # sequences are wrapped with 'surrogateescape' and the driver's UTF8Type
    # serializer is monkey-patched to unwrap them back to raw bytes on the
    # way out, letting the server (not the driver) see the invalid data.
    saved = cassandra.cqltypes.UTF8Type.serialize
    cassandra.cqltypes.UTF8Type.serialize = \
        lambda ustr, protocol_version: ustr.encode('utf-8', errors='surrogateescape')
    try:
        stmt = cql.prepare(f'INSERT INTO {table1} (k, t) VALUES (1, ?)')
        for raw in good_utf8:
            print(raw)
            cql.execute(stmt, [raw.decode()])
            rows = list(cql.execute(f"SELECT k, t FROM {table1} WHERE k=1"))
            assert len(rows) == 1
            assert rows[0].k == 1
            assert rows[0].t.encode('utf-8') == raw
        for raw in bad_utf8:
            print(raw)
            # Scylla reports "Exception while binding column t: marshaling
            # error: Validation failed - non-UTF8 character in a UTF8 string,
            # at byte offset 0" while Cassandra reports "String didn't
            # validate." - match on their only common part, 'validat' :-)
            with pytest.raises(InvalidRequest, match=re.compile('validat', re.IGNORECASE)):
                cql.execute(stmt, [raw.decode(errors='surrogateescape')])
    finally:
        # Always restore the driver's real serializer.
        cassandra.cqltypes.UTF8Type.serialize = saved
|
|
|
|
# 3. We can also insert the bad UTF-8 as part of the request string itself.
|
|
# This will make the entire request string invalid UTF-8, not just the
|
|
# value to be inserted, so Scylla and Cassandra should complain that the
|
|
# entire request is bad - not just the inserted value.
|
|
# FIXME: this test is INCOMPLETE! It's very hard to get Python to output an
|
|
# illegal UTF-8 string in this case, and I gave up. Unlike the prepared-
|
|
# statement case above where it was easy to find and monkey-patch the
|
|
# function responsible for converting the string to bytes, in this case it
|
|
# was harder to find this function and I gave up.
|
|
def test_validation_utf8_query(cql, table1):
    """Good UTF-8 embedded directly in the query text round-trips correctly."""
    for raw in good_utf8:
        s = raw.decode('utf-8')
        print(s)
        cql.execute(f"INSERT INTO {table1} (k, t) VALUES (1, '{s}')")
        rows = list(cql.execute(f"SELECT k, t FROM {table1} WHERE k=1"))
        assert len(rows) == 1
        assert rows[0].k == 1
        assert rows[0].t == s
    # FIXME: Need to figure out the appropriate monkey-patching or other
    # trick to make the following work (i.e., pass the invalid string to
    # the server, and let the server - not the driver - fail).
    # for raw in bad_utf8:
    #     print(raw)
    #     cql.execute("INSERT INTO {} (k, t) VALUES (1, '{}')".format(table1, raw.decode(errors='surrogateescape')))
|
|
|
|
# 4. The invalid UTF-8 can be the result of a user-defined function in Lua,
|
|
# which can easily produce invalid UTF-8. This is a Scylla-only test,
|
|
# because Cassandra does not have user-defined functions in Lua.
|
|
# Notes:
|
|
# * This test doesn't try to insert data like other tests - the UTF-8
|
|
# conversion attempt is done during a select.
|
|
def test_validation_utf8_from_lua(scylla_only, cql, test_keyspace, table1):
    """A Lua UDF whose "text" return value is invalid UTF-8 must fail.

    Unlike the other tests, the UTF-8 validation here happens while
    evaluating a SELECT, not an INSERT.
    """
    # Create one row that the Lua functions below will run on.
    cql.execute(f"INSERT INTO {table1} (k, a) VALUES (1, 'hello')")

    def lua_escape(raw):
        # Render byte 0xAB as the Lua string escape "\xab".
        return ''.join(f'\\x{c:02x}' for c in raw)

    # CREATE FUNCTION is executed separately per tested string and is very
    # slow, so only a random sample of each list is tried.
    # TODO: can we have a faster Lua test, which uses a single function only
    # with different parameters, instead of multiple functions?
    for raw in random.sample(good_utf8, 3):
        fname = unique_name()
        escaped = lua_escape(raw)
        print(escaped)
        cql.execute(f"CREATE FUNCTION {test_keyspace}.{fname}(k int) CALLED ON NULL INPUT RETURNS text LANGUAGE Lua AS 'return \"{escaped}\"';")
        rows = list(cql.execute(f"SELECT {fname}(k) FROM {table1} WHERE k=1"))
        assert len(rows) == 1
        assert len(rows[0]) == 1
        assert rows[0][0].encode('utf-8') == raw
        cql.execute(f"DROP FUNCTION {test_keyspace}.{fname}")
    for raw in random.sample(bad_utf8, 3):
        fname = unique_name()
        escaped = lua_escape(raw)
        print(escaped)
        cql.execute(f"CREATE FUNCTION {test_keyspace}.{fname}(k int) CALLED ON NULL INPUT RETURNS text LANGUAGE Lua AS 'return \"{escaped}\"';")
        with pytest.raises(InvalidRequest, match='value is not valid utf8'):
            cql.execute(f"SELECT {fname}(k) FROM {table1} WHERE k=1")
        cql.execute(f"DROP FUNCTION {test_keyspace}.{fname}")
|
|
|
|
#############################################################################
|
|
# The following tests verify that inserting a non-ASCII string into an
|
|
# "ascii" column should be forbidden. There are multiple ways in which we
|
|
# can try to inject non-ASCII into a request, and each of them exercises
|
|
# a different code path so should check all of them below.
|
|
|
|
# Examples of non-ASCII and ASCII strings (note: unlike the UTF-8 lists
# above, these are str objects, not byte strings):
|
|
bad_ascii = [
    # Hebrew - not representable in ASCII
    'שלום',
]
good_ascii = [
    'hello',
    # A null is considered valid ASCII
    '\x00',
]
|
|
|
|
# 1. We can pass a string using the blob representation of its bytes (0x...)
|
|
# and the builtin blobAsAscii function. This function converts the blob into
|
|
# a string assuming it has ASCII encoding, and should complain when it's
|
|
# invalid. The error Cassandra and Scylla print in this case looks like
|
|
# "In call to function blobastext [or system.blobastext], value 0xc0 is
|
|
# not a valid binary representation for type ascii".
|
|
def test_validation_ascii_as_blob(cql, table1):
    """blobAsAscii() must accept pure-ASCII blobs and reject non-ASCII ones."""
    # NOTE(review): this inserts the blobAsAscii() result into the *text*
    # column t rather than the ascii column a. The error under test is
    # raised by blobAsAscii itself, so the target column does not affect
    # the failure path - but confirm the choice of column was intentional.
    cmd = "INSERT INTO {} (k, t) VALUES (1, blobAsAscii(0x{}))"
    for s in good_ascii:
        print(s)
        cql.execute(cmd.format(table1, s.encode().hex()))
        # Read the row back to confirm the value round-trips intact.
        rows = list(cql.execute(f"SELECT k, t FROM {table1} WHERE k=1"))
        assert len(rows) == 1
        assert rows[0].k == 1
        assert rows[0].t == s
    for s in bad_ascii:
        print(s)
        with pytest.raises(InvalidRequest, match='not a valid binary representation for type ascii'):
            cql.execute(cmd.format(table1, s.encode().hex()))
|
|
|
|
# 2. We can pass the string as a bound argument to a prepared statement.
|
|
# Again, a non-ASCII one should produce an error
|
|
def test_validation_ascii_bound_column(cql, table1):
    """A non-ASCII bound value for an ascii column must be rejected server-side."""
    # The Python CQL driver checks the ASCII encoding itself in
    # cassandra.cqltypes.AsciiType.serialize (which does encode('ascii')),
    # so monkey-patch it with a UTF-8 encoder to bypass the client-side
    # check and let the server see the non-ASCII value.
    import cassandra.cqltypes
    saved = cassandra.cqltypes.AsciiType.serialize
    cassandra.cqltypes.AsciiType.serialize = \
        lambda ustr, protocol_version: ustr.encode('utf-8')
    try:
        stmt = cql.prepare(f'INSERT INTO {table1} (k, a) VALUES (1, ?)')
        for s in good_ascii:
            print(s)
            cql.execute(stmt, [s])
            rows = list(cql.execute(f"SELECT k, a FROM {table1} WHERE k=1"))
            assert len(rows) == 1
            assert rows[0].k == 1
            assert rows[0].a == s
        for s in bad_ascii:
            print(s)
            # Scylla prints "Exception while binding column t: marshaling
            # error: Validation failed - non-ASCII character in an ASCII
            # string"; Cassandra prints "Invalid byte for ascii: -41". The
            # word 'ascii' (in differing case) is their only common part.
            with pytest.raises(InvalidRequest, match=re.compile('ascii', re.IGNORECASE)):
                cql.execute(stmt, [s])
    finally:
        # Always restore the driver's real serializer.
        cassandra.cqltypes.AsciiType.serialize = saved
|
|
|
|
# 3. Insert the non-ASCII string as an integral part of the request string
|
|
# itself. The request itself is valid (it just needs to be UTF-8), but
|
|
# the non-ASCII insertion should be refused.
|
|
# Reproduces issue #5421.
|
|
# Reproduces issue #14320.
|
|
def test_validation_ascii_query(cql, table1):
    """Non-ASCII literals inside an otherwise-valid query must be refused."""
    for s in good_ascii:
        print(s)
        cql.execute(f"INSERT INTO {table1} (k, a) VALUES (1, '{s}')")
        # Confirm the ASCII value round-trips intact.
        rows = list(cql.execute(f"SELECT k, a FROM {table1} WHERE k=1"))
        assert len(rows) == 1
        assert rows[0].k == 1
        assert rows[0].a == s
    for s in bad_ascii:
        print(s)
        # The request string itself is valid UTF-8, but the value being
        # inserted into the ascii column is not ASCII.
        with pytest.raises(InvalidRequest, match='Invalid ASCII character'):
            cql.execute(f"INSERT INTO {table1} (k, a) VALUES (1, '{s}')")
|
|
|
|
# 4. The invalid ASCII can be the result of a user-defined function in Lua,
|
|
# which can easily produce invalid ASCII. This is a Scylla-only test,
|
|
# because Cassandra does not have user-defined functions in Lua.
|
|
def test_validation_ascii_from_lua(scylla_only, cql, test_keyspace, table1):
    """A Lua UDF returning non-ASCII for an "ascii" return type must fail."""
    # Create one row that the Lua function below will run on.
    cql.execute(f"INSERT INTO {table1} (k, a) VALUES (1, 'hello')")
    fname = unique_name()
    # Creating the function succeeds; calling it fails validation.
    cql.execute(f"CREATE FUNCTION {test_keyspace}.{fname}(k int) CALLED ON NULL INPUT RETURNS ascii LANGUAGE Lua AS 'return \"שלום\"';")
    with pytest.raises(InvalidRequest, match='value is not valid ascii'):
        cql.execute(f"SELECT {fname}(k) FROM {table1} WHERE k=1")
    cql.execute(f"DROP FUNCTION {test_keyspace}.{fname}")
    # Sanity check: an all-ASCII return value works fine.
    cql.execute(f"CREATE FUNCTION {test_keyspace}.{fname}(k int) CALLED ON NULL INPUT RETURNS ascii LANGUAGE Lua AS 'return \"hello\"';")
    rows = list(cql.execute(f"SELECT {fname}(k) FROM {table1} WHERE k=1"))
    assert len(rows) == 1
    assert len(rows[0]) == 1
    assert rows[0][0] == 'hello'
|
|
|
|
#############################################################################
|
|
# The functions like blobAsInt() take a byte array and read an integer from
|
|
# it. The byte array must have a specific number of bytes (for blobAsInt, it
|
|
# is 4) - trying to pass less or more is a validation error.
|
|
# Note that an *empty* blob is actually allowed and results in a so-called
|
|
# empty value (which is distinct from a null value). We have a separate test
|
|
# for that - test_empty.py::test_empty_int.
|
|
def test_validation_blob_as_int_len(cql, test_keyspace):
    """blobAsInt() and friends require a blob of exactly the type's width.

    A blob with exactly the right number of bytes is accepted; one byte
    more or one byte fewer is an invalid request. An *empty* blob is a
    special legal case (the so-called "empty value", distinct from null)
    tested separately in test_empty.py::test_empty_int, so the
    one-byte-fewer check is skipped for tinyint where it would produce
    the empty blob.
    """
    # (column name, CQL type name, required width in bytes)
    types = [('i', 'int', 4),
             ('b', 'bigint', 8),
             ('s', 'smallint', 2),
             ('t', 'tinyint', 1),
             ]
    types_def = ','.join(f'{var} {typ}' for var, typ, _ in types)
    with new_test_table(cql, test_keyspace, f'k int primary key, {types_def}') as table:
        k = unique_key_int()
        for var, typ, length in types:
            # Exactly `length` zero bytes is accepted, and reads back as 0.
            cql.execute(f"INSERT INTO {table} (k, {var}) VALUES ({k}, blobAs{typ}(0x{'00'*length}))")
            assert 0 == getattr(cql.execute(f"SELECT {var} FROM {table} WHERE k = {k}").one(), var)
            # One byte too many is rejected.
            with pytest.raises(InvalidRequest, match='is not a valid binary'):
                cql.execute(f"INSERT INTO {table} (k, {var}) VALUES ({k}, blobAs{typ}(0x{'00'*(length+1)}))")
            # One byte too few is rejected - unless that would be the empty
            # blob, which is legal (see docstring).
            if length - 1 != 0:
                with pytest.raises(InvalidRequest, match='is not a valid binary'):
                    cql.execute(f"INSERT INTO {table} (k, {var}) VALUES ({k}, blobAs{typ}(0x{'00'*(length-1)}))")