From 32afcdbaf0bc2d1aa2db6b82d7eaecad70fe6ba5 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Mon, 17 Nov 2025 19:38:28 +0200 Subject: [PATCH] test/alternator: enable, and add, tests for gzip'ed requests After the previous patch implemented support in Alternator for gzip-compressed requests ("Content-Encoding: gzip"), here we enable an existing xfail-ing test for this feature, and also add more tests for more cases: * A test for longer compressed requests, or a short compressed request which expands to a longer request. Since the decompression uses small buffers, this test reaches additional code paths. * Check for various cases of a malformed gzip'ed request, and also an attempt to use an unsupported Content-Encoding. DynamoDB returns error 500 for both cases, so we want to test that we do too - and not silently ignore such errors. * Check that two concatenated gzip'ed streams are a valid request, and check that garbage at the end of the gzip - or a missing character at the end of the gzip - is recognized as an error. Signed-off-by: Nadav Har'El --- test/alternator/test_compressed_request.py | 197 ++++++++++++++++++++- test/alternator/test_manual_requests.py | 1 - 2 files changed, 196 insertions(+), 2 deletions(-) diff --git a/test/alternator/test_compressed_request.py b/test/alternator/test_compressed_request.py index 75cf8742f3..68d82bf40d 100644 --- a/test/alternator/test_compressed_request.py +++ b/test/alternator/test_compressed_request.py @@ -30,9 +30,13 @@ import boto3 import botocore +import gzip +import requests import pytest +from contextlib import contextmanager from .util import random_string +from .test_manual_requests import get_signed_request # The compressed_req fixture is like the dynamodb fixture - providing a # connection to a DynamDB-API server. 
But the unique feature of compressed_req @@ -72,7 +76,6 @@ def compressed_req(dynamodb): ret.meta.client.close() # A basic test for a compressed request, using PutItem and GetItem -@pytest.mark.xfail(reason='issue #5041') def test_compressed_request(test_table_s, compressed_req): tab = compressed_req.Table(test_table_s.name) p = random_string() @@ -81,3 +84,195 @@ tab.put_item(Item=item) got_item = tab.get_item(Key={'p': p}, ConsistentRead=True)['Item'] assert got_item == item + +# Test a longer PutItem request. Our decompression implementation wants to +# decompress it in pieces, to avoid one long contiguous allocation of the +# output. So this test will check this code path. +def test_long_compressed_request(test_table_s, compressed_req): + tab = compressed_req.Table(test_table_s.name) + p = random_string() + x = random_string() + # First, make the request compress very well so the compressed request + # will be very short, but the uncompressed output is long and may + # be split across multiple output buffers. + long = p + 'x'*10000 + item = {'p': p, 'x': x, 'long': long} + tab.put_item(Item=item) + got_item = tab.get_item(Key={'p': p}, ConsistentRead=True)['Item'] + assert got_item == item + # Now try a request that doesn't compress as well. Our implementation + # may need to split both input and output buffer boundaries. + long = random_string(5000)*2 + item = {'p': p, 'x': x, 'long': long} + tab.put_item(Item=item) + got_item = tab.get_item(Key={'p': p}, ConsistentRead=True)['Item'] + assert got_item == item + +# The tests above configured boto3 to compress its requests so we could +# test them. We now want to test unusual scenarios - including corrupt +# compressed requests that should fail, or requests compressed in a non- +# traditional way but still should work. 
We can't test these scenarios +# using boto3, and need to construct the requests on our own using functions +# from test_manual_requests.py. + +# Test the error when we send a wrong Content-Encoding header. The only +# Content-Encoding supported by DynamoDB is "gzip" - see the +# tests earlier in this file for tests of successful compression with +# a valid Content-Encoding. +def test_garbage_content_encoding(dynamodb, test_table): + p = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}}}' + req = get_signed_request(dynamodb, 'PutItem', payload) + # Add a bad Content-Encoding header. The request signature is still valid, + # but the server will not know how to decode the request. + headers = dict(req.headers) + headers.update({'Content-Encoding': 'garbage'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 500 + # Check the PutItem request really wasn't done + assert 'Item' not in test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True) + +# If Content-Encoding is "gzip" but the content is *not* valid gzip encoded, +# DynamoDB returns an InternalServerError. I'm not sure this is the most +# appropriate error to return (among other things it suggests that the +# broken request is retryable), but this is what DynamoDB does so Alternator +# should too. +def test_broken_gzip_content(dynamodb, test_table): + p = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}}}' + req = get_signed_request(dynamodb, 'PutItem', payload) + # Add a Content-Encoding header suggesting this is gzipped content. + # Of course it isn't - it's a valid uncompressed request. 
The server + # should fail decompressing it and return a 500 error + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 500 + # Check the PutItem request really wasn't done + assert 'Item' not in test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True) + +# Test a valid gziped request created manually (without boto3's help). The +# fact that this test passes a sanity test preparing us for the next tests +# where we change the compressed stream and see what happens. +def test_gzip_request_valid(dynamodb, test_table): + p = random_string() + v = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}, "v": {"S": "' + v + '"}}}' + # Compress the payload. The new "payload" will be bytes instead of a + # string - this is perfectly fine. + payload = gzip.compress(payload.encode('utf-8')) + req = get_signed_request(dynamodb, 'PutItem', payload) + # Need to tell the server with a Content-Encoding header that the + # payload is compressed: + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 200 + got = test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True)['Item'] + assert got == {'p': p, 'c': 'x', 'v': v} + +# Same test as test_gzip_request_valid() but compress the payload in two +# pieces, concatenating the two gzip outputs. This isn't something users +# will typically do, but is allowed according to the gzip standard so we +# want to support it. 
+def test_gzip_request_two_gzips(dynamodb, test_table): + p = random_string() + v = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}, "v": {"S": "' + v + '"}}}' + # Compress the payload in two halves - first compress the first 10 + # characters, then the rest, and concatenate the two resulting gzips. + payload = gzip.compress(payload[:10].encode('utf-8')) + gzip.compress(payload[10:].encode('utf-8')) + req = get_signed_request(dynamodb, 'PutItem', payload) + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 200 + got = test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True)['Item'] + assert got == {'p': p, 'c': 'x', 'v': v} + +# Same test as test_gzip_request_valid() but add extra junk - not another +# valid gzip - following the valid gzip string. This should be an error, +# the extra junk should not be just silently ignored. +# Strangely, although we see in other tests for bad gzip that DynamoDB +# returns an error, in this specific case it doesn't. I consider this a +# bug so this is one of the rare tests with the dynamodb_bug marker. 
+def test_gzip_request_with_extra_junk(dynamodb, test_table, dynamodb_bug): + p = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}}}' + payload = gzip.compress(payload.encode('utf-8')) + # Add junk - which isn't a second valid gzip - at the end of the payload + payload += b'junk' + req = get_signed_request(dynamodb, 'PutItem', payload) + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 500 + assert 'Item' not in test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True) + +# Same test as test_gzip_request_valid() but remove one character from the +# end of compressed payload, so it is missing the proper ending marker. +# Decompression should fail and generate an error. +def test_gzip_request_with_missing_character(dynamodb, test_table): + p = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}}}' + payload = gzip.compress(payload.encode('utf-8')) + # Remove the one last character from the compressed payload + payload = payload[:-1] + req = get_signed_request(dynamodb, 'PutItem', payload) + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 500 + assert 'Item' not in test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True) + +# We put a limit (request_content_length_limit = 16 MB) on the size of the +# request. If the request is compressed, even if the compressed request +# is tiny we should still limit the size of the uncompressed request. +# Let's try a 20 MB that compresses extremely well to a tiny string, but +# should still be rejected as an oversized request. 
+# +# This test is marked scylla_only because DynamoDB does NOT limit the +# uncompressed size of a compressed request: +# DynamoDB does (as we check in test_manual_requests.py) limit non-compressed +# request sizes, but these limits are not enforced for compressed requests - +# I checked that even a 1 GB (!) pre-compression-size request is accepted +# (but a 10 GB pre-compression-size request is silently rejected with 500 +# error). It is debatable whether this should be considered a DynamoDB bug - +# or a "feature". I would claim it's a bug - there is no reason why a huge +# well-compressible request should be allowed while not allowed when not +# compressed. A counter-argument is that the biggest reason why a large +# request is not allowed is the signature algorithm - we need to read +# the entire request to verify its validity and only then can we start acting +# on it. But with a compressed request we validate the small compressed +# version's validity, so we can then - at least in theory - read the +# uncompressed request in a streaming fashion, without ever reading the +# entire request into memory. +def test_gzip_request_oversized(dynamodb, test_table, scylla_only): + # Take a legal PutItem payload and add a lot of spaces to make it very + # long, but it's highly compressible so the compressed payload will be + # very small. The server should still reject the oversized uncompressed + # content. 
+ long_len = 20*1024*1024 + p = random_string() + payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "' + p + '"}, "c": {"S": "x"}}}' + payload = payload[:-1] + ' '*long_len + payload[-1] + payload = gzip.compress(payload.encode('utf-8')) + # the compressed payload is very small, it won't cause the 413 error + assert len(payload) < 16*1024*1024 + req = get_signed_request(dynamodb, 'PutItem', payload) + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 413 + assert 'Item' not in test_table.get_item(Key={'p': p, 'c': 'x'}, ConsistentRead=True) + +# An empty string is NOT a valid gzip, so if we try to pass it off as +# a gzip'ed request, the result should be a 500 error like all other +# invalid gzip content. +def test_gzip_request_empty(dynamodb): + # pass the empty string '' as an (incorrect) compressed payload + req = get_signed_request(dynamodb, 'PutItem', '') + headers = dict(req.headers) + headers.update({'Content-Encoding': 'gzip'}) + r = requests.post(req.url, headers=headers, data=req.body, verify=False) + assert r.status_code == 500 diff --git a/test/alternator/test_manual_requests.py b/test/alternator/test_manual_requests.py index 1e6b4fd081..cb25f93e56 100644 --- a/test/alternator/test_manual_requests.py +++ b/test/alternator/test_manual_requests.py @@ -7,7 +7,6 @@ import base64 import json - import pytest import requests import urllib3