Simplify ungzip implementation per review feedback

- Remove manual gzip header parsing; libdeflate handles all format details
- Rename linearize_chunked_content to build_input_buffer and free chunks as we copy
- Add output chunking to split large decompressed data into 1MB chunks
- Add comment explaining libdeflate's whole-buffer requirement
- Use better initial size heuristic based on compression ratio

Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Add edge case check for length limit in ungzip
2025-11-19 12:47:02 +00:00 · 2025-11-19 11:50:31 +00:00 · 2025-11-19 11:48:35 +00:00 · 2025-11-19 11:46:29 +00:00 · 2025-11-19 11:32:38 +00:00
5 changed files with 540 additions and 0 deletions
--- a/test/boost/CMakeLists.txt
+++ b/test/boost/CMakeLists.txt
@@ -61,6 +61,8 @@ add_scylla_test(compound_test
  KIND SEASTAR)
 add_scylla_test(compress_test
  KIND BOOST)
 add_scylla_test(gzip_test
  KIND SEASTAR)
 add_scylla_test(config_test
  KIND SEASTAR)
 add_scylla_test(continuous_data_consumer_test
--- a/test/boost/gzip_test.cc
+++ b/test/boost/gzip_test.cc
@@ -0,0 +1,317 @@
 /*
 * Copyright 2025-present ScyllaDB
 */
 /*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
 #include <boost/test/unit_test.hpp>
 #include <seastar/core/thread.hh>
 #include <seastar/testing/test_case.hh>
 #include "utils/gzip.hh"
 #include <libdeflate.h>
 #include <cstring>
 using namespace seastar;
 namespace {
 // Helper function to compress data with gzip
 std::vector<char> gzip_compress(const std::vector<char>& data) {
    auto* compressor = libdeflate_alloc_compressor(6);
    if (!compressor) {
        throw std::bad_alloc();
    }
    size_t max_compressed_size = libdeflate_gzip_compress_bound(compressor, data.size());
    std::vector<char> compressed(max_compressed_size);
    size_t actual_size = libdeflate_gzip_compress(
        compressor,
        data.data(),
        data.size(),
        compressed.data(),
        compressed.size()
    );
    libdeflate_free_compressor(compressor);
    compressed.resize(actual_size);
    return compressed;
 }
 // Wrap the bytes of `data` in a chunked_content that holds a single chunk
 rjson::chunked_content to_chunked_content(const std::vector<char>& data) {
    const size_t byte_count = data.size();
    temporary_buffer<char> single_chunk(byte_count);
    std::memcpy(single_chunk.get_write(), data.data(), byte_count);
    rjson::chunked_content wrapped;
    wrapped.push_back(std::move(single_chunk));
    return wrapped;
 }
 // Flatten a chunked_content into one contiguous vector, keeping chunk order
 std::vector<char> from_chunked_content(const rjson::chunked_content& chunks) {
    std::vector<char> flat;
    for (auto it = chunks.begin(); it != chunks.end(); ++it) {
        flat.insert(flat.end(), it->begin(), it->end());
    }
    return flat;
 }
 } // anonymous namespace
 SEASTAR_TEST_CASE(test_ungzip_simple) {
    return async([] {
        // Round-trip a short message through gzip_compress() and ungzip()
        const std::vector<char> message = {'H', 'e', 'l', 'l', 'o', ',', ' ', 'W', 'o', 'r', 'l', 'd', '!'};
        auto gz_chunks = to_chunked_content(gzip_compress(message));
        auto inflated_chunks = utils::ungzip(std::move(gz_chunks), 1024).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        // Sizes must match before the element-wise comparison is meaningful
        BOOST_REQUIRE_EQUAL(inflated.size(), message.size());
        BOOST_CHECK(std::equal(inflated.begin(), inflated.end(), message.begin()));
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_empty) {
    return async([] {
        // A gzip stream that encodes zero bytes must decompress to zero bytes
        const std::vector<char> nothing;
        auto gz_chunks = to_chunked_content(gzip_compress(nothing));
        auto inflated_chunks = utils::ungzip(std::move(gz_chunks), 1024).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_CHECK_EQUAL(inflated.size(), 0);
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_large_data) {
    return async([] {
        // 30000 bytes made of three long runs, so the payload compresses well
        std::vector<char> payload;
        for (char fill : {'A', 'B', 'C'}) {
            payload.insert(payload.end(), 10000, fill);
        }
        auto gz_chunks = to_chunked_content(gzip_compress(payload));
        auto inflated_chunks = utils::ungzip(std::move(gz_chunks), 100000).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_REQUIRE_EQUAL(inflated.size(), payload.size());
        BOOST_CHECK(std::equal(inflated.begin(), inflated.end(), payload.begin()));
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_concatenated) {
    return async([] {
        // Two gzip members back to back must decompress to "HelloWorld"
        const std::vector<char> first = {'H', 'e', 'l', 'l', 'o'};
        const std::vector<char> second = {'W', 'o', 'r', 'l', 'd'};
        // Compressed stream: member for `first` followed by member for `second`
        std::vector<char> gz_stream = gzip_compress(first);
        const auto gz_second = gzip_compress(second);
        gz_stream.insert(gz_stream.end(), gz_second.begin(), gz_second.end());
        // Expected output: the two payloads in the same order
        std::vector<char> want(first);
        want.insert(want.end(), second.begin(), second.end());
        auto gz_chunks = to_chunked_content(gz_stream);
        auto inflated_chunks = utils::ungzip(std::move(gz_chunks), 1024).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_REQUIRE_EQUAL(inflated.size(), want.size());
        BOOST_CHECK(std::equal(inflated.begin(), inflated.end(), want.begin()));
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_multiple_concatenated) {
    return async([] {
        // Four gzip members back to back must yield the four payloads in order
        const std::vector<std::vector<char>> members = {
            {'A', 'B', 'C'},
            {'D', 'E', 'F'},
            {'G', 'H', 'I'},
            {'J', 'K', 'L'}
        };
        std::vector<char> gz_stream;
        std::vector<char> want;
        for (auto it = members.begin(); it != members.end(); ++it) {
            const auto gz_member = gzip_compress(*it);
            gz_stream.insert(gz_stream.end(), gz_member.begin(), gz_member.end());
            want.insert(want.end(), it->begin(), it->end());
        }
        auto gz_chunks = to_chunked_content(gz_stream);
        auto inflated_chunks = utils::ungzip(std::move(gz_chunks), 1024).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_REQUIRE_EQUAL(inflated.size(), want.size());
        BOOST_CHECK(std::equal(inflated.begin(), inflated.end(), want.begin()));
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_invalid_magic) {
    return async([] {
        // Ten bytes that do not start with the gzip magic: all zero except
        // the third byte, which is 0x08
        std::vector<char> not_gzip(10, 0x00);
        not_gzip[2] = 0x08;
        auto input = to_chunked_content(not_gzip);
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(input), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_truncated) {
    return async([] {
        // Keep only the first half of a valid gzip stream
        const std::vector<char> payload = {'H', 'e', 'l', 'l', 'o'};
        auto gz = gzip_compress(payload);
        const auto kept = gz.size() / 2;
        gz.erase(gz.begin() + kept, gz.end());
        auto input = to_chunked_content(gz);
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(input), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_corrupted) {
    return async([] {
        // Test corrupted gzip data
        std::vector<char> original_data = {'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd'};
        auto compressed = gzip_compress(original_data);
        // The bytes flipped below must exist. Make that a hard precondition:
        // silently skipping the corruption would feed valid data to ungzip()
        // and the failure would be reported at the wrong place.
        BOOST_REQUIRE(compressed.size() > 20);
        // Corrupt two bytes in the middle of the stream
        compressed[15] ^= 0xFF;
        compressed[16] ^= 0xFF;
        auto chunked = to_chunked_content(compressed);
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(chunked), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_junk_appended) {
    return async([] {
        // A valid gzip stream followed by four non-gzip bytes must be rejected
        const std::vector<char> payload = {'H', 'e', 'l', 'l', 'o'};
        auto gz = gzip_compress(payload);
        for (char junk_byte : {'J', 'U', 'N', 'K'}) {
            gz.push_back(junk_byte);
        }
        auto input = to_chunked_content(gz);
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(input), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_length_limit_exceeded) {
    return async([] {
        // 1000 bytes of output do not fit under a 500-byte limit
        const std::vector<char> payload(1000, 'A');
        auto input = to_chunked_content(gzip_compress(payload));
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(input), 500).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_length_limit_exact) {
    return async([] {
        // Output whose size equals the limit is still accepted
        const std::vector<char> payload(1000, 'B');
        auto input = to_chunked_content(gzip_compress(payload));
        auto inflated_chunks = utils::ungzip(std::move(input), 1000).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_CHECK_EQUAL(inflated.size(), 1000);
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_very_short_input) {
    return async([] {
        // Test with input too short to be valid gzip: just the two gzip
        // magic bytes (0x1f 0x8b) and nothing else.
        // The bytes are written as character literals because 0x8b (139)
        // does not fit in a signed char, so `{0x1f, 0x8b}` would be a
        // narrowing conversion inside the braced initializer.
        std::vector<char> bad_data = {'\x1f', '\x8b'};
        auto chunked = to_chunked_content(bad_data);
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(chunked), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_empty_input) {
    return async([] {
        // An input with no chunks at all is not a gzip stream
        rjson::chunked_content no_chunks;
        BOOST_CHECK_THROW(
            utils::ungzip(std::move(no_chunks), 1024).get(),
            std::runtime_error
        );
    });
 }
 SEASTAR_TEST_CASE(test_ungzip_chunked_input) {
    return async([] {
        // The compressed stream arrives split across several input chunks
        const std::vector<char> message = {'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', '!'};
        const auto gz = gzip_compress(message);
        // Cut the stream into pieces of roughly a third of its length
        const size_t piece_len = gz.size() / 3 + 1;
        rjson::chunked_content pieces;
        size_t pos = 0;
        while (pos < gz.size()) {
            const size_t len = std::min(piece_len, gz.size() - pos);
            temporary_buffer<char> piece(len);
            std::memcpy(piece.get_write(), gz.data() + pos, len);
            pieces.push_back(std::move(piece));
            pos += piece_len;
        }
        auto inflated_chunks = utils::ungzip(std::move(pieces), 1024).get();
        const auto inflated = from_chunked_content(inflated_chunks);
        BOOST_REQUIRE_EQUAL(inflated.size(), message.size());
        BOOST_CHECK(std::equal(inflated.begin(), inflated.end(), message.begin()));
    });
 }
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -22,6 +22,7 @@ target_sources(utils
    error_injection.cc
    exceptions.cc
    file_lock.cc
    gzip.cc
    gz/crc_combine.cc
    gz/crc_combine_table.cc
    hashers.cc
@@ -79,6 +80,7 @@ target_link_libraries(utils
    Boost::regex
 	crypto
    cryptopp::cryptopp
    libdeflate::libdeflate
    rapidxml::rapidxml
    yaml-cpp::yaml-cpp
    GnuTLS::gnutls)
--- a/utils/gzip.cc
+++ b/utils/gzip.cc
@@ -0,0 +1,187 @@
 /*
 * Copyright 2025-present ScyllaDB
 */
 /*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
 #include "utils/gzip.hh"
 #include <libdeflate.h>
 #include <seastar/core/coroutine.hh>
 #include <seastar/core/temporary_buffer.hh>
 #include <seastar/core/thread.hh>
 #include <algorithm>
 #include <cstring>
 #include <memory>
 #include <new>
 #include <stdexcept>
 #include <utility>
 #include <vector>
 using namespace seastar;
 namespace utils {
 namespace {
 // Maximum size, in bytes, of a single output chunk (1 MB).
 // ungzip() also uses it as an upper bound on the initial size of the
 // decompression output buffer.
 constexpr size_t MAX_OUTPUT_CHUNK_SIZE = 1024 * 1024;
 // Concatenate all chunks into one contiguous vector, in order.
 // The decompression code below hands libdeflate the whole compressed input
 // as a single buffer, hence the copy.
 // `chunks` is consumed: every chunk is released right after it is copied and
 // the container is left empty.
 std::vector<char> build_input_buffer(rjson::chunked_content& chunks) {
    // First pass: find the final size so the vector is allocated only once
    size_t bytes_needed = 0;
    for (auto it = chunks.begin(); it != chunks.end(); ++it) {
        bytes_needed += it->size();
    }
    std::vector<char> flat;
    flat.reserve(bytes_needed);
    // Second pass: append each chunk, then drop it to limit peak memory use
    for (auto& piece : chunks) {
        flat.insert(flat.end(), piece.begin(), piece.end());
        piece = temporary_buffer<char>();
    }
    chunks.clear();
    return flat;
 }
 } // anonymous namespace
 // Decompress the gzip stream held in `compressed_body` and return the
 // decompressed bytes as chunks of at most MAX_OUTPUT_CHUNK_SIZE bytes each.
 //
 // The input may consist of several gzip members concatenated back to back;
 // their outputs are appended in order. `compressed_body` is consumed.
 //
 // The work runs inside seastar::async(). The following are thrown from there:
 //  - std::runtime_error if the input is empty, is rejected by libdeflate
 //    (not gzip, truncated, corrupt, or followed by non-gzip bytes), or
 //    decompresses to more than `length_limit` bytes in total;
 //  - std::bad_alloc if the libdeflate decompressor cannot be allocated.
 future<rjson::chunked_content> ungzip(rjson::chunked_content&& compressed_body, size_t length_limit) {
    // Run in a seastar thread so that maybe_yield() can be called between members
    return seastar::async([compressed_body = std::move(compressed_body), length_limit] () mutable {
        if (compressed_body.empty()) {
            throw std::runtime_error("Invalid gzip data: empty input");
        }
        // libdeflate is given the complete compressed input as one contiguous
        // buffer; build_input_buffer() frees the input chunks as it copies them.
        std::vector<char> compressed_data = build_input_buffer(compressed_body);
        if (compressed_data.empty()) {
            // There were chunks, but all of them were zero-length
            throw std::runtime_error("Invalid gzip data: empty input");
        }
        // Own the decompressor through unique_ptr so it is freed on every exit
        // path. The deleter is not invoked for a null pointer.
        auto decompressor_deleter = [](libdeflate_decompressor* d) {
            libdeflate_free_decompressor(d);
        };
        std::unique_ptr<libdeflate_decompressor, decltype(decompressor_deleter)> decompressor(
            libdeflate_alloc_decompressor(), decompressor_deleter);
        if (!decompressor) {
            throw std::bad_alloc();
        }
        rjson::chunked_content result;
        size_t total_decompressed = 0;
        size_t input_offset = 0;
        // Each iteration decompresses one gzip member starting at input_offset.
        // The loop ends only when every input byte has been consumed, so bytes
        // after the last member are parsed as another member and must
        // themselves be valid gzip.
        while (input_offset < compressed_data.size()) {
            const char* current_input = compressed_data.data() + input_offset;
            const size_t remaining_input = compressed_data.size() - input_offset;
            // Output budget left for this member. total_decompressed is checked
            // against length_limit after every member, so this cannot underflow.
            const size_t max_output_size = length_limit - total_decompressed;
            // Heuristic initial size: 10x the remaining input, capped at
            // MAX_OUTPUT_CHUNK_SIZE (written so the multiplication cannot overflow).
            const size_t size_guess = remaining_input <= MAX_OUTPUT_CHUNK_SIZE / 10
                ? remaining_input * 10
                : MAX_OUTPUT_CHUNK_SIZE;
            // Always allocate at least one byte: when the budget is zero, a
            // member with empty output is still within the limit, while a member
            // that produces any output is caught by the total check below.
            std::vector<char> output_buffer(std::max<size_t>(1, std::min(size_guess, max_output_size)));
            size_t actual_in_bytes = 0;
            size_t actual_out_bytes = 0;
            // Retry with a larger buffer until the member fits or the budget is
            // exhausted. The buffer strictly grows on every retry and is bounded
            // by max_output_size, so the loop terminates.
            for (;;) {
                // The _ex variant reports how many input bytes the member
                // occupied, which is needed to locate the next member.
                const libdeflate_result res = libdeflate_gzip_decompress_ex(
                    decompressor.get(),
                    current_input,
                    remaining_input,
                    output_buffer.data(),
                    output_buffer.size(),
                    &actual_in_bytes,
                    &actual_out_bytes
                );
                if (res == LIBDEFLATE_SUCCESS) {
                    break;
                }
                if (res == LIBDEFLATE_BAD_DATA) {
                    throw std::runtime_error("Invalid gzip data: corrupt or truncated");
                }
                if (res != LIBDEFLATE_INSUFFICIENT_SPACE) {
                    throw std::runtime_error("Gzip decompression failed");
                }
                const size_t current_size = output_buffer.size();
                if (current_size >= max_output_size) {
                    // Even the whole remaining budget was not enough
                    throw std::runtime_error("Decompressed data exceeds length limit");
                }
                // Double the buffer, clamped to the budget, without overflowing
                output_buffer.resize(current_size > max_output_size / 2
                    ? max_output_size
                    : current_size * 2);
            }
            // The consumed byte count covers the whole member; zero would mean
            // no progress and an endless loop.
            if (actual_in_bytes == 0) {
                throw std::runtime_error("Invalid gzip data: no bytes consumed");
            }
            // Check total size limit
            total_decompressed += actual_out_bytes;
            if (total_decompressed > length_limit) {
                throw std::runtime_error("Decompressed data exceeds length limit");
            }
            // Copy the member's output into chunks of at most
            // MAX_OUTPUT_CHUNK_SIZE bytes to avoid large contiguous buffers in
            // the result.
            size_t offset = 0;
            while (offset < actual_out_bytes) {
                const size_t chunk_size = std::min(MAX_OUTPUT_CHUNK_SIZE, actual_out_bytes - offset);
                temporary_buffer<char> chunk(chunk_size);
                std::memcpy(chunk.get_write(), output_buffer.data() + offset, chunk_size);
                result.push_back(std::move(chunk));
                offset += chunk_size;
            }
            // Move to the next gzip member
            input_offset += actual_in_bytes;
            // Give the reactor a chance to run between members
            seastar::thread::maybe_yield();
        }
        return result;
    });
 }
 } // namespace utils
--- a/utils/gzip.hh
+++ b/utils/gzip.hh
@@ -0,0 +1,32 @@
 /*
 * Copyright 2025-present ScyllaDB
 */
 /*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
 #pragma once
 #include <seastar/core/future.hh>
 #include "utils/rjson.hh"
 namespace utils {
 /**
 * Decompresses gzip-compressed data stored in chunked_content format.
 *
 * @param compressed_body The gzip-compressed data in chunked_content format.
 *        Passed as an rvalue reference; the implementation takes over the chunks.
 * @param length_limit Maximum allowed total size, in bytes, of the uncompressed data
 * @return A future containing the uncompressed data in chunked_content format
 *
 * Features:
 * - Supports concatenated gzip files (multiple gzip files appended together);
 *   their uncompressed contents are returned one after another
 * - Throws exception if input is empty
 * - Throws exception if input is not valid gzip
 * - Throws exception if input is truncated
 * - Throws exception if non-gzip junk is appended
 * - Throws exception if uncompressed size exceeds length_limit
 *
 * The data-related failures above are reported as std::runtime_error. The
 * exception is raised while the returned future is being produced, so callers
 * should expect it when they wait on that future.
 */
 seastar::future<rjson::chunked_content> ungzip(rjson::chunked_content&& compressed_body, size_t length_limit);
 } // namespace utils
Author	SHA1	Message	Date
copilot-swe-agent[bot]	8f6296b905	Simplify ungzip implementation per review feedback - Remove manual gzip header parsing - libdeflate handles all format details - Rename linearize_chunked_content to build_input_buffer and free chunks as we copy - Add output chunking to split large decompressed data into 1MB chunks - Add comment explaining libdeflate's whole-buffer requirement - Use better initial size heuristic based on compression ratio Co-authored-by: nyh <584227+nyh@users.noreply.github.com>	2025-11-19 12:47:02 +00:00
copilot-swe-agent[bot]	4f44a61b3a	Add edge case check for length limit in ungzip - Check if total_decompressed >= length_limit before allocating output buffer - Prevents allocating a zero-sized buffer when limit is already reached - Ensures clear error message when limit is exceeded Co-authored-by: nyh <584227+nyh@users.noreply.github.com>	2025-11-19 11:50:31 +00:00
copilot-swe-agent[bot]	362491a650	Fix ungzip implementation to properly handle concatenated gzip files - Removed unused get_gzip_member_size function - Rely on libdeflate_gzip_decompress to tell us how many input bytes were consumed - Added check for zero bytes consumed to detect invalid state - Simplified the logic by removing unnecessary header size tracking Co-authored-by: nyh <584227+nyh@users.noreply.github.com>	2025-11-19 11:48:35 +00:00
copilot-swe-agent[bot]	b818331420	Add ungzip function implementation with libdeflate - Created utils/gzip.hh header with ungzip function declaration - Created utils/gzip.cc implementation using libdeflate - Updated utils/CMakeLists.txt to include gzip.cc and link libdeflate - Created comprehensive test suite in test/boost/gzip_test.cc - Added gzip_test to test/boost/CMakeLists.txt The implementation: - Uses libdeflate for high-performance gzip decompression - Handles chunked_content input/output (vector of temporary_buffer) - Supports concatenated gzip files - Validates gzip headers and detects invalid/truncated/corrupted data - Enforces size limits to prevent memory exhaustion - Runs in async context to avoid blocking the reactor Co-authored-by: nyh <584227+nyh@users.noreply.github.com>	2025-11-19 11:46:29 +00:00
copilot-swe-agent[bot]	c714159d5c	Initial plan	2025-11-19 11:32:38 +00:00