Files
scylladb/test/alternator/test_encoding.py
Nadav Har'El 47e8206482 test/alternator: test, and document, Alternator's data encoding
This patch adds a test file, test/alternator/test_encoding.py, testing
how Alternator stores its data in the underlying CQL database. We test
how tables are named, and how attributes of different types are encoded
into CQL.

The test, which begins with a long comment, also doubles as developer-
oriented *documentation* on how Alternator encodes its data in the CQL
database. This documentation is not intended for end-users - we do
not want to officially support reading or writing Alternator tables
through CQL. But once in a while, this information can come in handy
for developers.

More importantly, this test will also serve as a regression test,
verifying that Alternator's encoding doesn't change unintentionally.
If we make an unintentional change to the way that Alternator stores
its data, this can break upgrades: The new code might not be able to
read or write the old table with its old encoding. So it's important
to make sure we never make such unintentional changes to the encoding
of Alternator's data. If we ever do make *intentional* changes to
Alternator's data encoding, we will need to fix the relevant test;
But also not forget to make sure that the new code is able to read
the old encoding as well.

The new tests use both "dynamodb" (Alternator) and "cql" fixtures,
to test how CQL sees the Alternator tables. So naturally, these
tests are marked "scylla_only" and skipped on DynamoDB.

Fixes #19770.

Closes scylladb/scylladb#28866
2026-03-09 10:50:09 +01:00

311 lines
14 KiB
Python

# Copyright 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
# Tests for the on-disk encoding of Alternator data. Specifically, these
# tests verify that the internal format used to store DynamoDB attribute
# values in the underlying Scylla table hasn't accidentally changed. If
# Alternator's encoding were to change, sstables written by an older version
# would become unreadable by a newer version - an unacceptable compatibility
# breakage. So if any of these tests fail, the reason should be carefully
# analyzed, and the test should only be updated if the encoding change was
# intentional and backward compatibility was handled.
#
# Background on the encoding (see also issue #19770):
# Alternator stores each DynamoDB table in keyspace "alternator_{table_name}",
# table "{table_name}". The key attributes (hash key and optional range key)
# are stored as regular CQL columns with their native CQL types (text for S,
# blob for B, decimal for N). All other (non-key) attributes are stored
# together in a single CQL column named ":attrs" of type map<text, blob>.
# The map key is the attribute name; the map value encodes the type and value
# of the attribute:
# - "Optimized" types S, B, BOOL, N are encoded as one type byte followed by
# Scylla's native serialization of the value. The type bytes are defined by
# enum class alternator_type in alternator/serialization.hh:
# S = 0
# B = 1
# BOOL = 2
# N = 3
# - All other DynamoDB types (NULL, L, M, SS, NS, BS) are stored as type byte
# 4 (NOT_SUPPORTED_YET) followed by the JSON encoding of the full typed
# DynamoDB value (e.g., {"NULL":true} or {"L":[...]}).
#
# The order of entries in the alternator_type enum is critical: the numeric
# value of each type is written to disk, so it must not change.
#
# This file is related to issue #19770.
import json
from decimal import Decimal
import pytest
from .util import new_test_table, random_string
# All tests in this file are scylla-only (they access CQL internals)
@pytest.fixture(scope="function", autouse=True)
def all_tests_are_scylla_only(scylla_only):
    """Autouse fixture that pulls in scylla_only, so every test in this
    file is skipped when running against DynamoDB."""
# A module-scoped table with both a GSI and an LSI, where "x" is the
# GSI hash key and "y" is the LSI range key - both are non-base-table-key
# attributes. Used by test_index_naming and test_index_key_not_schema_column.
@pytest.fixture(scope='module')
def table_with_indexes(dynamodb):
    base_schema = [
        {'AttributeName': 'p', 'KeyType': 'HASH'},
        {'AttributeName': 'c', 'KeyType': 'RANGE'},
    ]
    # All four attributes (base keys plus the two index keys) are strings.
    attribute_definitions = [
        {'AttributeName': name, 'AttributeType': 'S'}
        for name in ('p', 'c', 'x', 'y')
    ]
    lsis = [{
        'IndexName': 'lsi1',
        'KeySchema': [
            {'AttributeName': 'p', 'KeyType': 'HASH'},
            {'AttributeName': 'y', 'KeyType': 'RANGE'},
        ],
        'Projection': {'ProjectionType': 'ALL'},
    }]
    gsis = [{
        'IndexName': 'gsi1',
        'KeySchema': [{'AttributeName': 'x', 'KeyType': 'HASH'}],
        'Projection': {'ProjectionType': 'ALL'},
    }]
    with new_test_table(dynamodb,
                        KeySchema=base_schema,
                        AttributeDefinitions=attribute_definitions,
                        LocalSecondaryIndexes=lsis,
                        GlobalSecondaryIndexes=gsis) as table:
        yield table
# Serialize a DynamoDB number string as Scylla's decimal_type binary format:
# 4-byte big-endian signed scale followed by a big-endian two's-complement
# varint (minimum bytes) for the unscaled value.
# This matches how big_decimal::big_decimal(string_view) in Scylla parses
# the number string and how decimal_type serializes it.
def _serialize_number(s):
d = Decimal(s)
sign, digits, exp = d.as_tuple()
# Scylla's big_decimal sets scale = -exp and unscaled = int(digits)
cql_scale = -exp
unscaled = int(''.join(str(x) for x in digits)) if digits else 0
if sign:
unscaled = -unscaled
scale_bytes = cql_scale.to_bytes(4, 'big', signed=True)
# Encode unscaled as a Cassandra varint (big-endian two's complement, minimum bytes)
if unscaled == 0:
varint_bytes = b'\x00'
elif unscaled > 0:
# Need enough bytes so the most-significant bit is 0 (positive sign)
num_bytes = (unscaled.bit_length() + 8) // 8
varint_bytes = unscaled.to_bytes(num_bytes, 'big')
else:
num_bytes = ((-unscaled).bit_length() + 7) // 8
try:
varint_bytes = unscaled.to_bytes(num_bytes, 'big', signed=True)
except OverflowError:
varint_bytes = unscaled.to_bytes(num_bytes + 1, 'big', signed=True)
return scale_bytes + varint_bytes
# Test that the encoding of all DynamoDB attribute types in the ":attrs"
# map column of the underlying CQL table is as documented in the header
# comment of this file, and has not accidentally changed. Specifically:
# - For S, B, BOOL, N (optimized types): check the exact binary encoding.
# - For NULL, L, M, SS, NS, BS: check the type byte (0x04) and verify the
#   remaining bytes decode to the expected JSON structure.
# Multiple N values are stored to exercise _serialize_number() with diverse
# inputs: positive scale, zero, negative unscaled, negative scale, and a
# large number whose unscaled value requires multiple bytes.
def test_attrs_encoding(dynamodb, cql, test_table_ss):
    p = random_string()
    c = random_string()
    test_table_ss.put_item(Item={
        'p': p,
        'c': c,
        'a_s': 'hello',                  # S
        'a_n': Decimal('3.14'),          # N: positive scale=2, positive unscaled=314
        'a_n_zero': Decimal('0'),        # N: zero
        'a_n_neg': Decimal('-5'),        # N: negative unscaled, zero scale
        'a_n_negscale': Decimal('1e10'), # N: negative scale (stored as 1E+10)
        'a_n_large': Decimal('12345678901234567890'), # N: large multi-byte unscaled
        'a_b': b'\x01\x02\x03',          # B
        'a_bool_t': True,                # BOOL true
        'a_bool_f': False,               # BOOL false
        'a_null': None,                  # NULL
        'a_l': ['x'],                    # L (list with one string)
        'a_m': {'k': 'v'},               # M (map with one string value)
        'a_ss': {'hello'},               # SS (single-element string set)
        'a_ns': {Decimal('1')},          # NS (single-element number set)
        'a_bs': {b'\x01'},               # BS (single-element binary set)
    })
    ks = 'alternator_' + test_table_ss.name
    rows = list(cql.execute(
        f'SELECT ":attrs" FROM "{ks}"."{test_table_ss.name}" WHERE p = %s AND c = %s',
        [p, c]
    ))
    assert len(rows) == 1
    attrs = rows[0][0]
    # "Optimized" types - S (type byte 0x00), B (0x01), BOOL (0x02) and
    # N (0x03) - are encoded as the type byte followed by Scylla's native
    # serialization of the value (raw UTF-8 for S, raw bytes for B, one
    # 0x00/0x01 byte for BOOL, decimal_type serialization for N). Check
    # the exact binary encoding of each:
    optimized_expected = {
        'a_s': b'\x00' + b'hello',
        'a_b': b'\x01' + b'\x01\x02\x03',
        'a_bool_t': b'\x02\x01',
        'a_bool_f': b'\x02\x00',
        'a_n': b'\x03' + _serialize_number('3.14'),
        'a_n_zero': b'\x03' + _serialize_number('0'),
        'a_n_neg': b'\x03' + _serialize_number('-5'),
        'a_n_negscale': b'\x03' + _serialize_number('1e10'),
        'a_n_large': b'\x03' + _serialize_number('12345678901234567890'),
    }
    for name, expected in optimized_expected.items():
        assert attrs[name] == expected, name
    # All remaining types (NOT_SUPPORTED_YET = 4) are encoded as type byte
    # 0x04 followed by the compact JSON of the full typed value. Check the
    # type byte, and that the JSON decodes to the expected structure. All
    # sets have a single element, so there is no ordering ambiguity; binary
    # set values are base64-encoded in the JSON.
    json_expected = {
        'a_null': {'NULL': True},
        'a_l': {'L': [{'S': 'x'}]},
        'a_m': {'M': {'k': {'S': 'v'}}},
        'a_ss': {'SS': ['hello']},
        'a_ns': {'NS': ['1']},
        'a_bs': {'BS': ['AQ==']},
    }
    for name, expected in json_expected.items():
        assert attrs[name][0:1] == b'\x04', name
        assert json.loads(attrs[name][1:]) == expected, name
# Test that both hash keys and range keys of all three DynamoDB key types
# (S, B, N) are stored as the correct native CQL types. Specifically:
#   S (string) -> CQL text
#   B (binary) -> CQL blob
#   N (number) -> CQL decimal
# These type mappings are part of the on-disk format and must not change.
def test_key_column_types(dynamodb, cql, test_table_sn, test_table_b, test_table_ss, test_table_sb):
    # Return the CQL type (as a string) of a column of the given Alternator
    # table's underlying CQL table, or None if no such column exists.
    def column_cql_type(table, col_name):
        ks = 'alternator_' + table.name
        rows = list(cql.execute(
            "SELECT type FROM system_schema.columns "
            f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
            f"AND column_name = '{col_name}'"
        ))
        return rows[0].type if rows else None
    # Hash keys: S -> text, B -> blob (covered by shared fixtures).
    assert column_cql_type(test_table_sn, 'p') == 'text'
    assert column_cql_type(test_table_b, 'p') == 'blob'
    # Hash key N -> decimal. No shared fixture has an N hash key, so create
    # a table just for this check.
    with new_test_table(dynamodb,
            KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
            AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'N'}]) as table:
        assert column_cql_type(table, 'p') == 'decimal'
    # Range keys: S -> text, B -> blob, N -> decimal.
    for table, expected in [(test_table_ss, 'text'),
                            (test_table_sb, 'blob'),
                            (test_table_sn, 'decimal')]:
        assert column_cql_type(table, 'c') == expected
# Test that Alternator tables use the expected keyspace and table names in
# the underlying CQL schema. This naming convention must remain stable, as
# changing it would break access to existing data.
# See also test_cql_schema.py::test_cql_keyspace_and_table which tests this
# in a different way.
def test_table_naming(cql, test_table_s):
    keyspace = 'alternator_' + test_table_s.name
    # If the keyspace/table naming convention is as expected, this CQL
    # query succeeds (and returns no rows, the table being empty); with
    # any other naming it would fail.
    result = cql.execute(f'SELECT p FROM "{keyspace}"."{test_table_s.name}"')
    assert list(result) == []
# Test that GSI and LSI view names in the underlying CQL schema follow the
# expected naming convention:
#   GSI named "gsi1" -> CQL view "{table_name}:gsi1"
#   LSI named "lsi1" -> CQL view "{table_name}!:lsi1"
# These naming conventions must remain stable, as changing them would break
# access to existing data. See also test_cql_schema.py::test_alternator_aux_tables
# which tests related properties in a different way.
def test_index_naming(cql, table_with_indexes):
    base = table_with_indexes.name
    ks = 'alternator_' + base
    # Look the view names up in system_schema.views rather than querying the
    # views directly, because the view names contain special characters such
    # as '!' and ':'.
    for kind, view in [('GSI', base + ':gsi1'), ('LSI', base + '!:lsi1')]:
        rows = list(cql.execute(
            "SELECT view_name FROM system_schema.views "
            f"WHERE keyspace_name = '{ks}' AND view_name = '{view}'"
        ))
        assert len(rows) == 1, f"Expected {kind} view '{view}' not found in system_schema.views"
# Test that GSI/LSI key attributes that are not base-table key attributes
# are NOT stored as separate CQL schema columns in the base table - they are
# stored in ":attrs" instead. In the table_with_indexes fixture, "x" is the
# GSI hash key and "y" is the LSI range key, but neither is a base-table key.
# Note: this behavior changed recently. Before https://github.com/scylladb/scylladb/pull/24991,
# LSI key columns (and before an even earlier change, also GSI key columns) were
# added as real schema columns. Now all non-base-table-key attributes, including
# GSI and LSI key attributes, are stored in ":attrs".
def test_index_key_not_schema_column(dynamodb, cql, table_with_indexes):
    table = table_with_indexes
    ks = 'alternator_' + table.name
    # "x" (the GSI key) and "y" (the LSI key) are not base-table keys, so
    # neither may appear as a real column in the base table's CQL schema.
    for attr in ('x', 'y'):
        rows = list(cql.execute(
            "SELECT column_name FROM system_schema.columns "
            f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
            f"AND column_name = '{attr}'"
        ))
        assert rows == [], f"Attribute '{attr}' (GSI/LSI key) should not be a schema column, but found: {rows}"
    # Write an item with both "x" and "y" set, then confirm both are stored
    # inside the ":attrs" map.
    p = random_string()
    c = random_string()
    table.put_item(Item={'p': p, 'c': c, 'x': 'hello', 'y': 'world'})
    rows = list(cql.execute(
        f'SELECT ":attrs" FROM "{ks}"."{table.name}" WHERE p = %s AND c = %s',
        [p, c]
    ))
    assert len(rows) == 1
    attrs = rows[0][0]
    # Both index keys end up in ":attrs" with the S-type encoding: type
    # byte 0x00 followed by the raw UTF-8 bytes of the string value.
    for attr, value in [('x', 'hello'), ('y', 'world')]:
        assert attrs[attr] == b'\x00' + value.encode()