Compare commits
7 Commits
master
...
copilot/do
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6ce6c33b65 | ||
|
|
fe77675455 | ||
|
|
2fd5383bd0 | ||
|
|
c7969f7a46 | ||
|
|
2c17c90825 | ||
|
|
e00dbfa334 | ||
|
|
744034eec6 |
310
test/alternator/test_encoding.py
Normal file
310
test/alternator/test_encoding.py
Normal file
@@ -0,0 +1,310 @@
|
||||
# Copyright 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
|
||||
# Tests for the on-disk encoding of Alternator data. Specifically, these
|
||||
# tests verify that the internal format used to store DynamoDB attribute
|
||||
# values in the underlying Scylla table hasn't accidentally changed. If
|
||||
# Alternator's encoding were to change, sstables written by an older version
|
||||
# would become unreadable by a newer version - an unacceptable compatibility
|
||||
# breakage. So if any of these tests fail, the reason should be carefully
|
||||
# analyzed, and the test should only be updated if the encoding change was
|
||||
# intentional and backward compatibility was handled.
|
||||
#
|
||||
# Background on the encoding (see also issue #19770):
|
||||
# Alternator stores each DynamoDB table in keyspace "alternator_{table_name}",
|
||||
# table "{table_name}". The key attributes (hash key and optional range key)
|
||||
# are stored as regular CQL columns with their native CQL types (text for S,
|
||||
# blob for B, decimal for N). All other (non-key) attributes are stored
|
||||
# together in a single CQL column named ":attrs" of type map<text, blob>.
|
||||
# The map key is the attribute name; the map value encodes the type and value
|
||||
# of the attribute:
|
||||
# - "Optimized" types S, B, BOOL, N are encoded as one type byte followed by
|
||||
# Scylla's native serialization of the value. The type bytes are defined by
|
||||
# enum class alternator_type in alternator/serialization.hh:
|
||||
# S = 0
|
||||
# B = 1
|
||||
# BOOL = 2
|
||||
# N = 3
|
||||
# - All other DynamoDB types (NULL, L, M, SS, NS, BS) are stored as type byte
|
||||
# 4 (NOT_SUPPORTED_YET) followed by the JSON encoding of the full typed
|
||||
# DynamoDB value (e.g., {"NULL":true} or {"L":[...]}).
|
||||
#
|
||||
# The order of entries in the alternator_type enum is critical: the numeric
|
||||
# value of each type is written to disk, so it must not change.
|
||||
#
|
||||
# This file is related to issue #19770.
|
||||
|
||||
import json
|
||||
from decimal import Decimal
|
||||
|
||||
import pytest
|
||||
|
||||
from .util import new_test_table, random_string
|
||||
|
||||
# All tests in this file are scylla-only (they access CQL internals)
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def all_tests_are_scylla_only(scylla_only):
|
||||
pass
|
||||
|
||||
# A module-scoped table with both a GSI and an LSI, where "x" is the
|
||||
# GSI hash key and "y" is the LSI range key - both are non-base-table-key
|
||||
# attributes. Used by test_index_naming and test_index_key_not_schema_column.
|
||||
@pytest.fixture(scope='module')
|
||||
def table_with_indexes(dynamodb):
|
||||
with new_test_table(dynamodb,
|
||||
KeySchema=[
|
||||
{'AttributeName': 'p', 'KeyType': 'HASH'},
|
||||
{'AttributeName': 'c', 'KeyType': 'RANGE'},
|
||||
],
|
||||
AttributeDefinitions=[
|
||||
{'AttributeName': 'p', 'AttributeType': 'S'},
|
||||
{'AttributeName': 'c', 'AttributeType': 'S'},
|
||||
{'AttributeName': 'x', 'AttributeType': 'S'},
|
||||
{'AttributeName': 'y', 'AttributeType': 'S'},
|
||||
],
|
||||
LocalSecondaryIndexes=[
|
||||
{ 'IndexName': 'lsi1',
|
||||
'KeySchema': [
|
||||
{'AttributeName': 'p', 'KeyType': 'HASH'},
|
||||
{'AttributeName': 'y', 'KeyType': 'RANGE'},
|
||||
],
|
||||
'Projection': {'ProjectionType': 'ALL'}
|
||||
}
|
||||
],
|
||||
GlobalSecondaryIndexes=[
|
||||
{ 'IndexName': 'gsi1',
|
||||
'KeySchema': [{'AttributeName': 'x', 'KeyType': 'HASH'}],
|
||||
'Projection': {'ProjectionType': 'ALL'}
|
||||
}
|
||||
],
|
||||
) as table:
|
||||
yield table
|
||||
|
||||
# Serialize a DynamoDB number string as Scylla's decimal_type binary format:
|
||||
# 4-byte big-endian signed scale followed by a big-endian two's-complement
|
||||
# varint (minimum bytes) for the unscaled value.
|
||||
# This matches how big_decimal::big_decimal(string_view) in Scylla parses
|
||||
# the number string and how decimal_type serializes it.
|
||||
def _serialize_number(s):
|
||||
d = Decimal(s)
|
||||
sign, digits, exp = d.as_tuple()
|
||||
# Scylla's big_decimal sets scale = -exp and unscaled = int(digits)
|
||||
cql_scale = -exp
|
||||
unscaled = int(''.join(str(x) for x in digits)) if digits else 0
|
||||
if sign:
|
||||
unscaled = -unscaled
|
||||
scale_bytes = cql_scale.to_bytes(4, 'big', signed=True)
|
||||
# Encode unscaled as a Cassandra varint (big-endian two's complement, minimum bytes)
|
||||
if unscaled == 0:
|
||||
varint_bytes = b'\x00'
|
||||
elif unscaled > 0:
|
||||
# Need enough bytes so the most-significant bit is 0 (positive sign)
|
||||
num_bytes = (unscaled.bit_length() + 8) // 8
|
||||
varint_bytes = unscaled.to_bytes(num_bytes, 'big')
|
||||
else:
|
||||
num_bytes = ((-unscaled).bit_length() + 7) // 8
|
||||
try:
|
||||
varint_bytes = unscaled.to_bytes(num_bytes, 'big', signed=True)
|
||||
except OverflowError:
|
||||
varint_bytes = unscaled.to_bytes(num_bytes + 1, 'big', signed=True)
|
||||
return scale_bytes + varint_bytes
|
||||
|
||||
# Test that the encoding of all DynamoDB attribute types in the ":attrs"
|
||||
# map column of the underlying CQL table is as documented in the header
|
||||
# comment of this file, and has not accidentally changed. Specifically:
|
||||
# - For S, B, BOOL, N (optimized types): check the exact binary encoding.
|
||||
# - For NULL, L, M, SS, NS, BS: check the type byte (0x04) and verify the
|
||||
# remaining bytes decode to the expected JSON structure.
|
||||
# Multiple N values are stored to exercise _serialize_number() with diverse
|
||||
# inputs: positive scale, zero, negative unscaled, negative scale, and a
|
||||
# large number whose unscaled value requires multiple bytes.
|
||||
def test_attrs_encoding(dynamodb, cql, test_table_ss):
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
test_table_ss.put_item(Item={
|
||||
'p': p,
|
||||
'c': c,
|
||||
'a_s': 'hello', # S
|
||||
'a_n': Decimal('3.14'), # N: positive scale=2, positive unscaled=314
|
||||
'a_n_zero': Decimal('0'), # N: zero
|
||||
'a_n_neg': Decimal('-5'), # N: negative unscaled, zero scale
|
||||
'a_n_negscale': Decimal('1e10'), # N: negative scale (stored as 1E+10)
|
||||
'a_n_large': Decimal('12345678901234567890'), # N: large multi-byte unscaled
|
||||
'a_b': b'\x01\x02\x03', # B
|
||||
'a_bool_t': True, # BOOL true
|
||||
'a_bool_f': False, # BOOL false
|
||||
'a_null': None, # NULL
|
||||
'a_l': ['x'], # L (list with one string)
|
||||
'a_m': {'k': 'v'}, # M (map with one string value)
|
||||
'a_ss': {'hello'}, # SS (single-element string set)
|
||||
'a_ns': {Decimal('1')}, # NS (single-element number set)
|
||||
'a_bs': {b'\x01'}, # BS (single-element binary set)
|
||||
})
|
||||
|
||||
ks = 'alternator_' + test_table_ss.name
|
||||
rows = list(cql.execute(
|
||||
f'SELECT ":attrs" FROM "{ks}"."{test_table_ss.name}" WHERE p = %s AND c = %s',
|
||||
[p, c]
|
||||
))
|
||||
assert len(rows) == 1
|
||||
attrs = rows[0][0]
|
||||
|
||||
# S (alternator_type::S = 0): type byte 0x00 followed by raw UTF-8 bytes
|
||||
assert attrs['a_s'] == b'\x00' + b'hello'
|
||||
|
||||
# N (alternator_type::N = 3): type byte 0x03 followed by decimal_type serialization
|
||||
# (4-byte big-endian scale + varint unscaled value)
|
||||
assert attrs['a_n'] == b'\x03' + _serialize_number('3.14')
|
||||
assert attrs['a_n_zero'] == b'\x03' + _serialize_number('0')
|
||||
assert attrs['a_n_neg'] == b'\x03' + _serialize_number('-5')
|
||||
assert attrs['a_n_negscale'] == b'\x03' + _serialize_number('1e10')
|
||||
assert attrs['a_n_large'] == b'\x03' + _serialize_number('12345678901234567890')
|
||||
|
||||
# B (alternator_type::B = 1): type byte 0x01 followed by raw bytes
|
||||
assert attrs['a_b'] == b'\x01' + b'\x01\x02\x03'
|
||||
|
||||
# BOOL true (alternator_type::BOOL = 2): type byte 0x02 followed by 0x01
|
||||
assert attrs['a_bool_t'] == b'\x02\x01'
|
||||
|
||||
# BOOL false: type byte 0x02 followed by 0x00
|
||||
assert attrs['a_bool_f'] == b'\x02\x00'
|
||||
|
||||
# For the following types (NOT_SUPPORTED_YET = 4), the encoding is:
|
||||
# type byte 0x04 followed by the compact JSON of the full typed value.
|
||||
# We check the type byte and that the JSON decodes to the expected structure.
|
||||
|
||||
# NULL
|
||||
assert attrs['a_null'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_null'][1:]) == {'NULL': True}
|
||||
|
||||
# L (list)
|
||||
assert attrs['a_l'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_l'][1:]) == {'L': [{'S': 'x'}]}
|
||||
|
||||
# M (map)
|
||||
assert attrs['a_m'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_m'][1:]) == {'M': {'k': {'S': 'v'}}}
|
||||
|
||||
# SS (string set, single element so no ordering ambiguity)
|
||||
assert attrs['a_ss'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_ss'][1:]) == {'SS': ['hello']}
|
||||
|
||||
# NS (number set, single element so no ordering ambiguity)
|
||||
assert attrs['a_ns'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_ns'][1:]) == {'NS': ['1']}
|
||||
|
||||
# BS (binary set, binary values are base64-encoded in the JSON)
|
||||
assert attrs['a_bs'][0:1] == b'\x04'
|
||||
assert json.loads(attrs['a_bs'][1:]) == {'BS': ['AQ==']}
|
||||
|
||||
# Test that both hash keys and range keys of all three DynamoDB key types
|
||||
# (S, B, N) are stored as the correct native CQL types. Specifically:
|
||||
# S (string) -> CQL text
|
||||
# B (binary) -> CQL blob
|
||||
# N (number) -> CQL decimal
|
||||
# These type mappings are part of the on-disk format and must not change.
|
||||
def test_key_column_types(dynamodb, cql, test_table_sn, test_table_b, test_table_ss, test_table_sb):
|
||||
def get_col_type(table, col_name):
|
||||
ks = 'alternator_' + table.name
|
||||
rows = list(cql.execute(
|
||||
"SELECT type FROM system_schema.columns "
|
||||
f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
|
||||
f"AND column_name = '{col_name}'"
|
||||
))
|
||||
return rows[0].type if rows else None
|
||||
|
||||
# Hash key type S -> CQL text
|
||||
assert get_col_type(test_table_sn, 'p') == 'text'
|
||||
|
||||
# Hash key type B -> CQL blob
|
||||
assert get_col_type(test_table_b, 'p') == 'blob'
|
||||
|
||||
# Hash key type N -> CQL decimal (no shared fixture for N-hash tables)
|
||||
with new_test_table(dynamodb,
|
||||
KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
|
||||
AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'N'}]) as table:
|
||||
assert get_col_type(table, 'p') == 'decimal'
|
||||
|
||||
# Range key type S -> CQL text
|
||||
assert get_col_type(test_table_ss, 'c') == 'text'
|
||||
|
||||
# Range key type B -> CQL blob
|
||||
assert get_col_type(test_table_sb, 'c') == 'blob'
|
||||
|
||||
# Range key type N -> CQL decimal
|
||||
assert get_col_type(test_table_sn, 'c') == 'decimal'
|
||||
|
||||
# Test that Alternator tables use the expected keyspace and table names in
|
||||
# the underlying CQL schema. This naming convention must remain stable, as
|
||||
# changing it would break access to existing data.
|
||||
# See also test_cql_schema.py::test_cql_keyspace_and_table which tests this
|
||||
# in a different way.
|
||||
def test_table_naming(cql, test_table_s):
|
||||
table_name = test_table_s.name
|
||||
ks = 'alternator_' + table_name
|
||||
# Verify we can query the underlying CQL table using the expected keyspace
|
||||
# and table name. The query will succeed only if naming is as expected.
|
||||
rows = list(cql.execute(f'SELECT p FROM "{ks}"."{table_name}" LIMIT 0'))
|
||||
assert rows == []
|
||||
|
||||
# Test that GSI and LSI view names in the underlying CQL schema follow the
|
||||
# expected naming convention:
|
||||
# GSI named "gsi1" -> CQL view "{table_name}:gsi1"
|
||||
# LSI named "lsi1" -> CQL view "{table_name}!:lsi1"
|
||||
# These naming conventions must remain stable, as changing them would break
|
||||
# access to existing data. See also test_cql_schema.py::test_alternator_aux_tables
|
||||
# which tests related properties in a different way.
|
||||
def test_index_naming(cql, table_with_indexes):
|
||||
table_name = table_with_indexes.name
|
||||
ks = 'alternator_' + table_name
|
||||
gsi_view = table_name + ':gsi1'
|
||||
lsi_view = table_name + '!:lsi1'
|
||||
# Check the view names in system_schema.views (the view names contain
|
||||
# special characters such as '!' and ':', so we look them up in the
|
||||
# system table rather than attempting to query the views directly).
|
||||
rows = list(cql.execute(
|
||||
"SELECT view_name FROM system_schema.views "
|
||||
f"WHERE keyspace_name = '{ks}' AND view_name = '{gsi_view}'"
|
||||
))
|
||||
assert len(rows) == 1, f"Expected GSI view '{gsi_view}' not found in system_schema.views"
|
||||
rows = list(cql.execute(
|
||||
"SELECT view_name FROM system_schema.views "
|
||||
f"WHERE keyspace_name = '{ks}' AND view_name = '{lsi_view}'"
|
||||
))
|
||||
assert len(rows) == 1, f"Expected LSI view '{lsi_view}' not found in system_schema.views"
|
||||
|
||||
# Test that GSI/LSI key attributes that are not base-table key attributes
|
||||
# are NOT stored as separate CQL schema columns in the base table - they are
|
||||
# stored in ":attrs" instead. In the table_with_indexes fixture, "x" is the
|
||||
# GSI hash key and "y" is the LSI range key, but neither is a base-table key.
|
||||
# Note: this behavior changed recently. Before https://github.com/scylladb/scylladb/pull/24991,
|
||||
# LSI key columns (and before an even earlier change, also GSI key columns) were
|
||||
# added as real schema columns. Now all non-base-table-key attributes, including
|
||||
# GSI and LSI key attributes, are stored in ":attrs".
|
||||
def test_index_key_not_schema_column(dynamodb, cql, table_with_indexes):
|
||||
table = table_with_indexes
|
||||
ks = 'alternator_' + table.name
|
||||
# Neither "x" (GSI key) nor "y" (LSI key) must appear as columns in the
|
||||
# base table's CQL schema.
|
||||
for attr in ('x', 'y'):
|
||||
rows = list(cql.execute(
|
||||
"SELECT column_name FROM system_schema.columns "
|
||||
f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
|
||||
f"AND column_name = '{attr}'"
|
||||
))
|
||||
assert rows == [], f"Attribute '{attr}' (GSI/LSI key) should not be a schema column, but found: {rows}"
|
||||
# Write an item with both "x" and "y" set, then confirm they are stored in ":attrs".
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
table.put_item(Item={'p': p, 'c': c, 'x': 'hello', 'y': 'world'})
|
||||
rows = list(cql.execute(
|
||||
f'SELECT ":attrs" FROM "{ks}"."{table.name}" WHERE p = %s AND c = %s',
|
||||
[p, c]
|
||||
))
|
||||
assert len(rows) == 1
|
||||
attrs = rows[0][0]
|
||||
# Both "x" and "y" should be stored in ":attrs" with S-type encoding
|
||||
# (type byte 0x00 followed by raw UTF-8 bytes).
|
||||
assert attrs['x'] == b'\x00' + b'hello'
|
||||
assert attrs['y'] == b'\x00' + b'world'
|
||||
Reference in New Issue
Block a user