Compare commits

...

7 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
6ce6c33b65 test/alternator: fix copyright year to 2026
Co-authored-by: mykaul <4655593+mykaul@users.noreply.github.com>
2026-03-03 16:28:02 +00:00
copilot-swe-agent[bot]
fe77675455 test/alternator: rename fixture, split x/y attrs, reorder tests, fix index naming check
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-03 15:46:23 +00:00
copilot-swe-agent[bot]
2fd5383bd0 test/alternator: add GSI/LSI naming and index key encoding tests
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-03 15:28:34 +00:00
copilot-swe-agent[bot]
c7969f7a46 test/alternator: minor cleanup in test_encoding.py per review
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-03 15:16:11 +00:00
copilot-swe-agent[bot]
2c17c90825 test/alternator: address review feedback on test_encoding.py
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-03 15:00:56 +00:00
copilot-swe-agent[bot]
e00dbfa334 test/alternator: add test_encoding.py to test Alternator's on-disk data encoding
Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
2026-03-03 11:07:10 +00:00
copilot-swe-agent[bot]
744034eec6 Initial plan 2026-03-03 10:51:13 +00:00

View File

@@ -0,0 +1,310 @@
# Copyright 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
# Tests for the on-disk encoding of Alternator data. Specifically, these
# tests verify that the internal format used to store DynamoDB attribute
# values in the underlying Scylla table hasn't accidentally changed. If
# Alternator's encoding were to change, sstables written by an older version
# would become unreadable by a newer version - an unacceptable compatibility
# breakage. So if any of these tests fail, the reason should be carefully
# analyzed, and the test should only be updated if the encoding change was
# intentional and backward compatibility was handled.
#
# Background on the encoding (see also issue #19770):
# Alternator stores each DynamoDB table in keyspace "alternator_{table_name}",
# table "{table_name}". The key attributes (hash key and optional range key)
# are stored as regular CQL columns with their native CQL types (text for S,
# blob for B, decimal for N). All other (non-key) attributes are stored
# together in a single CQL column named ":attrs" of type map<text, blob>.
# The map key is the attribute name; the map value encodes the type and value
# of the attribute:
# - "Optimized" types S, B, BOOL, N are encoded as one type byte followed by
# Scylla's native serialization of the value. The type bytes are defined by
# enum class alternator_type in alternator/serialization.hh:
# S = 0
# B = 1
# BOOL = 2
# N = 3
# - All other DynamoDB types (NULL, L, M, SS, NS, BS) are stored as type byte
# 4 (NOT_SUPPORTED_YET) followed by the JSON encoding of the full typed
# DynamoDB value (e.g., {"NULL":true} or {"L":[...]}).
#
# The order of entries in the alternator_type enum is critical: the numeric
# value of each type is written to disk, so it must not change.
#
# This file is related to issue #19770.
import json
from decimal import Decimal
import pytest
from .util import new_test_table, random_string
@pytest.fixture(scope="function", autouse=True)
def all_tests_are_scylla_only(scylla_only):
    """Autouse fixture that makes every test in this file depend on the
    scylla_only fixture: the tests below inspect Scylla's internal CQL
    schema and data, so they cannot run against other implementations."""
@pytest.fixture(scope='module')
def table_with_indexes(dynamodb):
    """Module-scoped table with both a GSI and an LSI, where "x" is the
    GSI hash key and "y" is the LSI range key - both are non-base-table-key
    attributes. Used by test_index_naming and test_index_key_not_schema_column.
    """
    base_keys = [
        {'AttributeName': 'p', 'KeyType': 'HASH'},
        {'AttributeName': 'c', 'KeyType': 'RANGE'},
    ]
    # All four attributes (base keys plus the two index keys) are strings.
    attribute_definitions = [
        {'AttributeName': name, 'AttributeType': 'S'}
        for name in ('p', 'c', 'x', 'y')
    ]
    lsi = {
        'IndexName': 'lsi1',
        'KeySchema': [
            {'AttributeName': 'p', 'KeyType': 'HASH'},
            {'AttributeName': 'y', 'KeyType': 'RANGE'},
        ],
        'Projection': {'ProjectionType': 'ALL'},
    }
    gsi = {
        'IndexName': 'gsi1',
        'KeySchema': [{'AttributeName': 'x', 'KeyType': 'HASH'}],
        'Projection': {'ProjectionType': 'ALL'},
    }
    with new_test_table(dynamodb,
                        KeySchema=base_keys,
                        AttributeDefinitions=attribute_definitions,
                        LocalSecondaryIndexes=[lsi],
                        GlobalSecondaryIndexes=[gsi]) as table:
        yield table
# Serialize a DynamoDB number string as Scylla's decimal_type binary format:
# 4-byte big-endian signed scale followed by a big-endian two's-complement
# varint (minimum bytes) for the unscaled value.
# This matches how big_decimal::big_decimal(string_view) in Scylla parses
# the number string and how decimal_type serializes it.
def _serialize_number(s):
d = Decimal(s)
sign, digits, exp = d.as_tuple()
# Scylla's big_decimal sets scale = -exp and unscaled = int(digits)
cql_scale = -exp
unscaled = int(''.join(str(x) for x in digits)) if digits else 0
if sign:
unscaled = -unscaled
scale_bytes = cql_scale.to_bytes(4, 'big', signed=True)
# Encode unscaled as a Cassandra varint (big-endian two's complement, minimum bytes)
if unscaled == 0:
varint_bytes = b'\x00'
elif unscaled > 0:
# Need enough bytes so the most-significant bit is 0 (positive sign)
num_bytes = (unscaled.bit_length() + 8) // 8
varint_bytes = unscaled.to_bytes(num_bytes, 'big')
else:
num_bytes = ((-unscaled).bit_length() + 7) // 8
try:
varint_bytes = unscaled.to_bytes(num_bytes, 'big', signed=True)
except OverflowError:
varint_bytes = unscaled.to_bytes(num_bytes + 1, 'big', signed=True)
return scale_bytes + varint_bytes
def test_attrs_encoding(dynamodb, cql, test_table_ss):
    """Test that the encoding of all DynamoDB attribute types in the ":attrs"
    map column of the underlying CQL table is as documented in the header
    comment of this file, and has not accidentally changed. Specifically:
    - For S, B, BOOL, N (optimized types): check the exact binary encoding.
    - For NULL, L, M, SS, NS, BS: check the type byte (0x04) and verify the
      remaining bytes decode to the expected JSON structure.
    Multiple N values are stored to exercise _serialize_number() with diverse
    inputs: positive scale, zero, negative unscaled, negative scale, and a
    large number whose unscaled value requires multiple bytes.
    """
    p = random_string()
    c = random_string()
    test_table_ss.put_item(Item={
        'p': p,
        'c': c,
        'a_s': 'hello', # S
        'a_n': Decimal('3.14'), # N: positive scale=2, positive unscaled=314
        'a_n_zero': Decimal('0'), # N: zero
        'a_n_neg': Decimal('-5'), # N: negative unscaled, zero scale
        'a_n_negscale': Decimal('1e10'), # N: negative scale (stored as 1E+10)
        'a_n_large': Decimal('12345678901234567890'), # N: large multi-byte unscaled
        'a_b': b'\x01\x02\x03', # B
        'a_bool_t': True, # BOOL true
        'a_bool_f': False, # BOOL false
        'a_null': None, # NULL
        'a_l': ['x'], # L (list with one string)
        'a_m': {'k': 'v'}, # M (map with one string value)
        'a_ss': {'hello'}, # SS (single-element string set)
        'a_ns': {Decimal('1')}, # NS (single-element number set)
        'a_bs': {b'\x01'}, # BS (single-element binary set)
    })
    ks = 'alternator_' + test_table_ss.name
    rows = list(cql.execute(
        f'SELECT ":attrs" FROM "{ks}"."{test_table_ss.name}" WHERE p = %s AND c = %s',
        [p, c]
    ))
    assert len(rows) == 1
    attrs = rows[0][0]
    # The optimized types have an exact byte-for-byte expected encoding:
    # one type byte (S=0, B=1, BOOL=2, N=3) followed by Scylla's native
    # serialization of the value (decimal_type serialization for N).
    expected_exact = {
        'a_s': b'\x00' + b'hello',
        'a_n': b'\x03' + _serialize_number('3.14'),
        'a_n_zero': b'\x03' + _serialize_number('0'),
        'a_n_neg': b'\x03' + _serialize_number('-5'),
        'a_n_negscale': b'\x03' + _serialize_number('1e10'),
        'a_n_large': b'\x03' + _serialize_number('12345678901234567890'),
        'a_b': b'\x01' + b'\x01\x02\x03',
        'a_bool_t': b'\x02\x01',
        'a_bool_f': b'\x02\x00',
    }
    for name, encoding in expected_exact.items():
        assert attrs[name] == encoding, name
    # The remaining types use NOT_SUPPORTED_YET (4): type byte 0x04 followed
    # by the compact JSON of the full typed value. Check the type byte and
    # that the JSON decodes to the expected structure. All sets hold a
    # single element so there is no ordering ambiguity; BS values are
    # base64-encoded in the JSON.
    expected_json = {
        'a_null': {'NULL': True},
        'a_l': {'L': [{'S': 'x'}]},
        'a_m': {'M': {'k': {'S': 'v'}}},
        'a_ss': {'SS': ['hello']},
        'a_ns': {'NS': ['1']},
        'a_bs': {'BS': ['AQ==']},
    }
    for name, value in expected_json.items():
        assert attrs[name][0:1] == b'\x04', name
        assert json.loads(attrs[name][1:]) == value, name
def test_key_column_types(dynamodb, cql, test_table_sn, test_table_b, test_table_ss, test_table_sb):
    """Test that both hash keys and range keys of all three DynamoDB key types
    (S, B, N) are stored as the correct native CQL types. Specifically:
        S (string) -> CQL text
        B (binary) -> CQL blob
        N (number) -> CQL decimal
    These type mappings are part of the on-disk format and must not change.
    """
    def column_type(table, column):
        # Look up one column's CQL type in the backing table's schema.
        ks = 'alternator_' + table.name
        result = list(cql.execute(
            "SELECT type FROM system_schema.columns "
            f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
            f"AND column_name = '{column}'"
        ))
        return result[0].type if result else None
    # Hash keys: S -> text, B -> blob.
    assert column_type(test_table_sn, 'p') == 'text'
    assert column_type(test_table_b, 'p') == 'blob'
    # Hash key type N -> decimal. No shared fixture has an N-typed hash
    # key, so create a dedicated table for this check.
    with new_test_table(dynamodb,
        KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'N'}]) as table:
        assert column_type(table, 'p') == 'decimal'
    # Range keys: S -> text, B -> blob, N -> decimal.
    assert column_type(test_table_ss, 'c') == 'text'
    assert column_type(test_table_sb, 'c') == 'blob'
    assert column_type(test_table_sn, 'c') == 'decimal'
def test_table_naming(cql, test_table_s):
    """Test that Alternator tables use the expected keyspace and table names
    in the underlying CQL schema ("alternator_{table_name}"."{table_name}").
    This naming convention must remain stable, as changing it would break
    access to existing data. See also
    test_cql_schema.py::test_cql_keyspace_and_table which tests this in a
    different way.
    """
    name = test_table_s.name
    keyspace = 'alternator_' + name
    # Querying through the expected keyspace and table name succeeds only
    # if the naming convention holds; LIMIT 0 returns no rows either way.
    assert list(cql.execute(f'SELECT p FROM "{keyspace}"."{name}" LIMIT 0')) == []
def test_index_naming(cql, table_with_indexes):
    """Test that GSI and LSI view names in the underlying CQL schema follow
    the expected naming convention:
        GSI named "gsi1" -> CQL view "{table_name}:gsi1"
        LSI named "lsi1" -> CQL view "{table_name}!:lsi1"
    These naming conventions must remain stable, as changing them would break
    access to existing data. See also
    test_cql_schema.py::test_alternator_aux_tables which tests related
    properties in a different way.
    """
    base = table_with_indexes.name
    ks = 'alternator_' + base
    # Check the view names in system_schema.views (the view names contain
    # special characters such as '!' and ':', so we look them up in the
    # system table rather than attempting to query the views directly).
    for kind, view in [('GSI', base + ':gsi1'), ('LSI', base + '!:lsi1')]:
        rows = list(cql.execute(
            "SELECT view_name FROM system_schema.views "
            f"WHERE keyspace_name = '{ks}' AND view_name = '{view}'"
        ))
        assert len(rows) == 1, f"Expected {kind} view '{view}' not found in system_schema.views"
def test_index_key_not_schema_column(dynamodb, cql, table_with_indexes):
    """Test that GSI/LSI key attributes that are not base-table key attributes
    are NOT stored as separate CQL schema columns in the base table - they
    are stored in ":attrs" instead. In the table_with_indexes fixture, "x"
    is the GSI hash key and "y" is the LSI range key, but neither is a
    base-table key.
    Note: this behavior changed recently. Before
    https://github.com/scylladb/scylladb/pull/24991, LSI key columns (and
    before an even earlier change, also GSI key columns) were added as real
    schema columns. Now all non-base-table-key attributes, including GSI and
    LSI key attributes, are stored in ":attrs".
    """
    table = table_with_indexes
    ks = 'alternator_' + table.name
    # Neither "x" (GSI key) nor "y" (LSI key) must appear as columns in the
    # base table's CQL schema.
    for attr in ('x', 'y'):
        rows = list(cql.execute(
            "SELECT column_name FROM system_schema.columns "
            f"WHERE keyspace_name = '{ks}' AND table_name = '{table.name}' "
            f"AND column_name = '{attr}'"
        ))
        assert rows == [], f"Attribute '{attr}' (GSI/LSI key) should not be a schema column, but found: {rows}"
    # Write an item with both "x" and "y" set, then confirm they are stored
    # in ":attrs".
    p = random_string()
    c = random_string()
    table.put_item(Item={'p': p, 'c': c, 'x': 'hello', 'y': 'world'})
    result = list(cql.execute(
        f'SELECT ":attrs" FROM "{ks}"."{table.name}" WHERE p = %s AND c = %s',
        [p, c]
    ))
    assert len(result) == 1
    attrs = result[0][0]
    # Both "x" and "y" should be stored in ":attrs" with S-type encoding
    # (type byte 0x00 followed by raw UTF-8 bytes).
    assert attrs['x'] == b'\x00' + b'hello'
    assert attrs['y'] == b'\x00' + b'world'