scylladb/test/boost/comparable_bytes_test.cc

/*
 * Copyright (C) 2024-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
#include "test/lib/scylla_test_case.hh"

#include <seastar/net/inet_address.hh>
#include <seastar/net/ipv4_address.hh>
#include <seastar/util/lazy.hh>
#include <vector>

#include "bytes_ostream.hh"
#include "db/marshal/type_parser.hh"
#include "test/lib/log.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/sstable_test_env.hh"
#include "types/comparable_bytes.hh"
#include "types/json_utils.hh"
#include "types/list.hh"
#include "types/map.hh"
#include "types/set.hh"
#include "types/types.hh"
#include "types/vector.hh"
#include "utils/big_decimal.hh"
#include "utils/fragment_range.hh"
#include "utils/managed_bytes.hh"
#include "utils/multiprecision_int.hh"
#include "utils/UUID.hh"
#include "utils/UUID_gen.hh"
#include "utils/rjson.hh"

// A null input - given either as a null data_value or as an empty managed_bytes_opt -
// must compare equal to a default constructed comparable_bytes_opt.
BOOST_AUTO_TEST_CASE(test_comparable_bytes_opt) {
    const comparable_bytes_opt expected_null{};

    const auto from_null_value = comparable_bytes::from_data_value(data_value::make_null(int32_type));
    BOOST_REQUIRE(from_null_value == expected_null);

    const auto from_null_bytes = comparable_bytes::from_serialized_bytes(*int32_type, managed_bytes_opt());
    BOOST_REQUIRE(from_null_bytes == expected_null);
}

// Checks the comparable bytes form of both boolean values and their relative order.
BOOST_AUTO_TEST_CASE(test_bool) {
    // Verifies that `encoded` is exactly one byte equal to uint8_t(expected)
    // and that it decodes back to `expected`.
    const auto check_bool = [] (comparable_bytes_opt& encoded, bool expected) {
        BOOST_REQUIRE_EQUAL(encoded->size(), 1);

        const auto first_byte = encoded->as_managed_bytes_view().front();
        BOOST_REQUIRE_MESSAGE(first_byte == uint8_t(expected),
                              fmt::format("comparable bytes encode failed for bool value : {}", expected));

        const auto decoded = encoded->to_data_value(boolean_type);
        BOOST_REQUIRE_MESSAGE(expected == decoded,
                              fmt::format("comparable bytes decode failed for bool value : {}", expected));
    };

    auto encoded_false = comparable_bytes::from_data_value(false);
    auto encoded_true = comparable_bytes::from_data_value(true);
    check_bool(encoded_false, false);
    check_bool(encoded_true, true);

    // false must sort before true
    BOOST_REQUIRE(encoded_false < encoded_true);
}

void byte_comparable_test(std::vector<data_value>&& test_data, bool test_reversed_type = false) {
    struct test_item {
        managed_bytes serialized_bytes;
        comparable_bytes comparable_bytes;
    };
    std::vector<test_item> test_items;

    // test encode/decode
    const auto test_data_type = test_reversed_type ? reversed(test_data.at(0).type()) : test_data.at(0).type();
    testlog.info("testing type '{}' with {} items...",
        test_reversed_type ? format("reversed<{}>", test_data_type.get()->cql3_type_name()) : test_data_type.get()->cql3_type_name(),
        test_data.size());
    testlog.trace("test data : {}", test_data);
    for (const data_value& value : test_data) {
        // verify comparable bytes encode/decode
        auto original_serialized_bytes = managed_bytes(value.serialize_nonnull());
        comparable_bytes comparable_bytes(*test_data_type, original_serialized_bytes);
        auto decoded_serialized_bytes = comparable_bytes.to_serialized_bytes(*test_data_type).value();
        if (test_data_type == decimal_type || test_data_type->is_tuple()) {
            // 1. The `decimal_type` requires special handling because its comparable byte representation
            // normalizes the scale and unscaled value. This means the serialized bytes after
            // decoding from comparable bytes might not be identical to the original serialized bytes,
            // despite them representing the same decimal value.
            // For instance, 2e-1 (scale=1, unscaled_value=2) and 20e-2 (scale=2, unscaled_value=20)
            // are equivalent decimals but have different serialized forms. Comparable byte encoding
            // will normalize them. So, instead of directly comparing serialized bytes, compare the
            // deserialized decoded value against the original decimal value.
            // 2. When encoding `tuple_type`, any trailing nulls are trimmed, so the serialized bytes
            // cannot be compared directly.
            auto decoded_value = test_data_type->deserialize_value(managed_bytes_view(decoded_serialized_bytes));
            BOOST_REQUIRE_MESSAGE(value == decoded_value, seastar::value_of([&] () {
                return fmt::format("comparable bytes encode/decode failed for value : {}", value);
            }));
        } else {
            // Compare the serialized bytes directly
            BOOST_REQUIRE_MESSAGE(original_serialized_bytes == decoded_serialized_bytes, seastar::value_of([&] () {
                return fmt::format("comparable bytes encode/decode failed for value : {}", value);
            }));
        }

        // collect the data in a vector to verify ordering later
        test_items.emplace_back(original_serialized_bytes, comparable_bytes);
    };

    // Verify that decoding succeeds even when the comparable bytes contain
    // extra data appended after the value to be converted.
    // This required for decode to work on composite types.
    bytes_ostream bos;
    // Select an item from the middle to test this case as front and back items
    // are often edge cases (e.g. min/max values).
    const auto item_id = test_items.size() / 2;
    auto test_value = test_items.at(item_id);
    auto cb_view = test_value.comparable_bytes.as_managed_bytes_view();
    bos.write(cb_view);
    bos.write(bytes("this-still-should-work"));
    auto cb = comparable_bytes(std::move(bos).to_managed_bytes());
    auto decoded_value = cb.to_data_value(test_data_type);
    BOOST_REQUIRE_MESSAGE(test_data.at(item_id) == decoded_value, seastar::value_of([&] () {
        return fmt::format("comparable bytes decode failed with appended bytes; expected : {}; actual : {}", test_data.at(0), decoded_value);
    }));

    // Sort the items based on comparable bytes
    std::ranges::sort(test_items, [] (const test_item& a, const test_item& b) {
        return a.comparable_bytes < b.comparable_bytes;
    });

    // Verify that ordering them based on comparable bytes, sorts the values as expected
    BOOST_REQUIRE_MESSAGE(std::ranges::is_sorted(test_items, [&test_data_type] (const test_item& a, const test_item& b) {
        return test_data_type->compare(a.serialized_bytes, b.serialized_bytes) == std::strong_ordering::less;
    }), "sorting items based on comparable bytes failed");
}

template <std::integral int_type>
static std::vector<data_value> generate_integer_test_data(
        // Function to create a data_value from the underlying integer type.
        std::function<data_value(int_type)> create_data_value_func = {},
        // Function to filter out values that should not be included in the test data.
        std::function<bool(int_type)> filter_func = {}) {
    if (!create_data_value_func) {
        if constexpr (std::is_signed_v<int_type>) {
            // If a custom create_data_value_fn is not provided, create data_value
            // directly from the underlying integer type.
            create_data_value_func = [](int_type num) {
                return data_value(num);
            };
        } else {
            // For unsigned integer types, the caller must provide a custom create_data_value_fn,
            // as the data_value class doesn't have an unambiguous constructor for unsigned values.
            SCYLLA_ASSERT(false);
        }
    }

    std::vector<data_value> test_data;
    auto push_to_test_data = [&] (int_type num) {
        for (int_type n : std::initializer_list<int_type>{num, ~num}) {
            if (!filter_func || filter_func(n)) {
                test_data.push_back(create_data_value_func(n));
            }
        }
    };

    // Generates test values by shifting bit(1) through all possible positions and then deriving
    // multiple test cases from each value. This helps test edge cases and boundary conditions
    // by covering values with different bit patterns across the entire range of the type.
    auto num = int_type(1);
    auto num_bits = sizeof(int_type) * 8;
    test_data.reserve(num_bits * 4);
    while (num_bits-- > 0) {
        // for every num, we push [num, ~num, num - 1, ~(num - 1)] to the test data.
        push_to_test_data(num);
        if (num != std::numeric_limits<int_type>::min()) {
            push_to_test_data(num - 1);
        }

        num <<= 1;
    }

    return test_data;
}

// Runs the common comparable bytes checks on values generated from int8_t.
BOOST_AUTO_TEST_CASE(test_tinyint) {
    auto tinyint_values = generate_integer_test_data<int8_t>();
    byte_comparable_test(std::move(tinyint_values));
}

// Runs the common comparable bytes checks on values generated from int16_t.
BOOST_AUTO_TEST_CASE(test_smallint) {
    auto smallint_values = generate_integer_test_data<int16_t>();
    byte_comparable_test(std::move(smallint_values));
}

// Runs the common comparable bytes checks on values generated from int32_t.
BOOST_AUTO_TEST_CASE(test_int) {
    auto int_values = generate_integer_test_data<int32_t>();
    byte_comparable_test(std::move(int_values));
}

// Runs the common comparable bytes checks on values generated from int64_t.
BOOST_AUTO_TEST_CASE(test_bigint) {
    auto bigint_values = generate_integer_test_data<int64_t>();
    byte_comparable_test(std::move(bigint_values));
}

// Runs the common comparable bytes checks on simple_date values built from uint32_t day counts.
BOOST_AUTO_TEST_CASE(test_simple_date) {
    const auto make_simple_date = [] (uint32_t days) {
        return data_value(simple_date_native_type{days});
    };
    auto simple_date_values = generate_integer_test_data<uint32_t>(make_simple_date);
    byte_comparable_test(std::move(simple_date_values));
}

// Runs the common comparable bytes checks on time values built from int64_t nanosecond counts.
BOOST_AUTO_TEST_CASE(test_time) {
    constexpr int64_t max_ns_in_a_day = 24L * 60 * 60 * 1000 * 1000 * 1000;

    const auto make_time = [] (int64_t nanoseconds) {
        return data_value(time_native_type{nanoseconds});
    };
    // allow only valid nanosecond values
    const auto is_valid_time = [] (int64_t ns_candidate) {
        return !(ns_candidate < 0 || ns_candidate > max_ns_in_a_day);
    };

    auto time_values = generate_integer_test_data<int64_t>(make_time, is_valid_time);
    byte_comparable_test(std::move(time_values));
}

// Runs the common comparable bytes checks on db_clock::time_point values
// built from db_clock::rep tick counts.
BOOST_AUTO_TEST_CASE(test_timestamp) {
    const auto make_timestamp = [] (db_clock::rep milliseconds) {
        const db_clock::duration since_epoch(milliseconds);
        return data_value(db_clock::time_point(since_epoch));
    };
    auto timestamp_values = generate_integer_test_data<db_clock::rep>(make_timestamp);
    byte_comparable_test(std::move(timestamp_values));
}

// Runs the common comparable bytes checks on date_type_native_type values
// built from db_clock::rep tick counts.
BOOST_AUTO_TEST_CASE(test_date) {
    const auto make_date = [] (db_clock::rep milliseconds) {
        const db_clock::duration since_epoch(milliseconds);
        return data_value(date_type_native_type{db_clock::time_point{since_epoch}});
    };
    auto date_values = generate_integer_test_data<db_clock::rep>(make_date);
    byte_comparable_test(std::move(date_values));
}

template <std::floating_point fp_type>
static std::vector<data_value> generate_floating_point_test_data() {
    std::vector<data_value> test_data;
    for (fp_type n : {-1e30f, -1e3f, -1.0f, -0.001f, -1e-30f, -0.0f, 0.0f, 1e-30f, 0.001f, 1.0f, 1e3f, 1e30f,
                -std::numeric_limits<float>::min(), std::numeric_limits<float>::min(),
                -std::numeric_limits<float>::max(), std::numeric_limits<float>::max(),
                -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
                std::numeric_limits<float>::quiet_NaN()}) {
        test_data.emplace_back(n);
    }

    // double has a few more test items
    int random_exponent_min = -30, random_exponent_max = 30;
    if constexpr (std::is_same_v<fp_type, double>) {
        for (fp_type n : std::vector<double>{-1e200, -1e100, 1e100, 1e200,
                    -std::numeric_limits<double>::min(), std::numeric_limits<double>::min(),
                    -std::numeric_limits<double>::max(), std::numeric_limits<double>::max()}) {
            test_data.emplace_back(n);
        }
        random_exponent_min = -300;
        random_exponent_max = 300;
    }

    // generate some random test data
    for (int i = 0; i < 100; i++) {
        const auto significand = tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
        const auto scale = std::pow(10, tests::random::get_int<int>(random_exponent_min, random_exponent_max));
        test_data.push_back(fp_type(significand * scale));
    }

    return test_data;
}

// Runs the common comparable bytes checks on float values.
BOOST_AUTO_TEST_CASE(test_float) {
    auto float_values = generate_floating_point_test_data<float>();
    byte_comparable_test(std::move(float_values));
}

// Runs the common comparable bytes checks on double values.
BOOST_AUTO_TEST_CASE(test_double) {
    auto double_values = generate_floating_point_test_data<double>();
    byte_comparable_test(std::move(double_values));
}

// Helpers under test; they are defined outside this file.
void encode_varint_length(uint64_t length, int64_t sign_mask, bytes_ostream& out);
uint64_t decode_varint_length(managed_bytes_view& src, int64_t sign_only_byte);

// Round trips lengths of the form (2^shift - 1), for shift in [0, 63], through
// encode_varint_length() and decode_varint_length(), with sign masks 0 and -1.
BOOST_AUTO_TEST_CASE(test_varint_length_encoding) {
    constexpr int max_shift = 64;
    for (int shift = 0; shift < max_shift; shift++) {
        const uint64_t length = (uint64_t(1) << shift) - 1;
        for (const int64_t sign_mask : {0, -1}) {
            bytes_ostream encoded_out;
            encode_varint_length(length, sign_mask, encoded_out);

            auto encoded = std::move(encoded_out).to_managed_bytes();
            auto encoded_view = managed_bytes_view(encoded);
            BOOST_REQUIRE_EQUAL(length, decode_varint_length(encoded_view, sign_mask));
        }
    }
}

// Runs the common comparable bytes checks on multiprecision integers : values that fit
// in int64_t, plus large powers of two along with their neighbours and negations.
BOOST_AUTO_TEST_CASE(test_varint) {
    // Generate small integers
    const auto make_varint = [] (int64_t small_number) {
        return data_value(utils::multiprecision_int(small_number));
    };
    std::vector<data_value> test_data = generate_integer_test_data<int64_t>(make_varint);

    // Generate more large numbers : 20 shifts x 4 multipliers x 4 variants
    test_data.reserve(test_data.size() + (20 * 4 * 4));
    auto one = utils::multiprecision_int(1);
    for (int shift = 1; shift <= 20; shift++) {
        for (auto shift_multiplier : {64, 100, 256, 512}) {
            auto power_of_two = one << (shift * shift_multiplier);
            const utils::multiprecision_int variants[] = {
                power_of_two, power_of_two - 1, -power_of_two, -(power_of_two - 1)};
            for (const auto& variant : variants) {
                test_data.emplace_back(variant);
            }
        }
    }

    byte_comparable_test(std::move(test_data));
}

// Returns `msb` with bits 12..15 replaced by `version`.
// Callers in this file use it to stamp the UUID version into the most
// significant 64 bits of a UUID.
static int64_t msb_with_version(int64_t msb, int version) {
    constexpr int version_shift = 12;
    // Mask with every bit set except for the four version bits.
    const int64_t keep_mask = ~(0xF << version_shift);
    const int64_t version_bits = version << version_shift;
    return (msb & keep_mask) | version_bits;
}

// Checks a uuid against its negated ("flipped") counterpart and records both as test data.
//
// uuid              : the uuid to test.
// test_data         : receives the data_values created for `uuid` and for its flipped version.
// create_data_value : builds the data_value to be tested from a uuid.
//
// The three-way comparison of the two uuids must match the three-way comparison of
// their comparable bytes.
static void test_uuid_and_flipped_uuid(utils::UUID&& uuid, std::vector<data_value>& test_data,
        std::function<data_value(utils::UUID&&)>& create_data_value) {
    // negate the uuid to create a flipped version
    const utils::UUID flipped_uuid = utils::UUID_gen::negate(uuid);
    // Hand copies to the factory : both uuids are still needed for the comparison below,
    // so they must not be read after having been moved from.
    auto uuid_dv = create_data_value(utils::UUID(uuid));
    auto flipped_uuid_dv = create_data_value(utils::UUID(flipped_uuid));
    // verify that the original and flipped uuids compare correctly in byte-comparable format
    BOOST_REQUIRE(uuid <=> flipped_uuid == comparable_bytes::from_data_value(uuid_dv) <=> comparable_bytes::from_data_value(flipped_uuid_dv));
    // add both original and flipped uuids to the test data
    test_data.push_back(std::move(uuid_dv));
    test_data.push_back(std::move(flipped_uuid_dv));
}

static std::vector<data_value> generate_timeuuid_test_data(bool create_timeuuid_native_type) {
    std::function<data_value(utils::UUID&&)> create_data_value;
    if (create_timeuuid_native_type) {
        // create data_value for timeuuid data type
        create_data_value = [] (utils::UUID&& time_uuid) {
            return data_value(timeuuid_native_type(std::move(time_uuid)));
        };
    } else {
        // create data_value for uuid data type
        create_data_value = [] (utils::UUID&& time_uuid) {
            return data_value(std::move(time_uuid));
        };
    }

    std::vector<data_value> test_data;
    for (auto [msb, lsb] : std::initializer_list<std::pair<int64_t, int64_t>>{
                 {0, 0},
                 {std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::min()},
                 {std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max()},
         }) {
        test_uuid_and_flipped_uuid(utils::UUID(msb_with_version(msb, 1), lsb), test_data, create_data_value);
    }

    for (int i = 0; i < 500; i++) {
        // Generate a random msb with version set to 1 (time-based UUID)
        test_uuid_and_flipped_uuid(
                utils::UUID(msb_with_version(tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max()), 1),
                        tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max())),
                test_data, create_data_value);
    }

    return test_data;
}

// Runs the common comparable bytes checks on time based uuids wrapped in timeuuid_native_type.
BOOST_AUTO_TEST_CASE(test_timeuuid) {
    auto timeuuid_values = generate_timeuuid_test_data(true);
    byte_comparable_test(std::move(timeuuid_values));
}

// Runs the common comparable bytes checks on uuids : time based uuids, a few edge cases,
// name based uuids and random uuids with the version set to 4.
BOOST_AUTO_TEST_CASE(test_uuid) {
    using int64_limits = std::numeric_limits<int64_t>;

    // generate time uuids
    auto test_data = generate_timeuuid_test_data(false);

    const utils::UUID fixed_uuids[] = {
        // test few edge cases
        utils::null_uuid(),
        utils::UUID(int64_limits::max(), int64_limits::max()),
        utils::UUID(int64_limits::min(), int64_limits::min()),
        utils::UUID("ffffffff-ffff-ffff-ffff-ffffffffffff"),
        // test name based, type 3 uuids
        utils::UUID_gen::get_name_UUID("scylladb"),
        utils::UUID_gen::get_name_UUID("lakshminarayanansreethar"),
    };
    for (const auto& fixed_uuid : fixed_uuids) {
        test_data.emplace_back(fixed_uuid);
    }

    // generate few random uuids
    std::function<data_value(utils::UUID&&)> create_data_value = [] (utils::UUID&& random_uuid) {
        return data_value(std::move(random_uuid));
    };
    const auto random_int64 = [] {
        return tests::random::get_int<int64_t>(int64_limits::min(), int64_limits::max());
    };
    for (auto i = 0; i < 500; i++) {
        // Generate a random msb with version set to 4
        test_uuid_and_flipped_uuid(
                utils::UUID(msb_with_version(random_int64(), 4), random_int64()),
                test_data, create_data_value);
    }

    byte_comparable_test(std::move(test_data));
}

// Helper under test; it is defined outside this file.
extern std::size_t count_digits(const boost::multiprecision::cpp_int& value);

// count_digits() must return the number of decimal digits of a number, which is
// expected to be the same for the number and for its negation.
BOOST_AUTO_TEST_CASE(test_count_digits) {
    for (const char* decimal_text : {
             "0",
             "123",
             "123456",
             "12345600",
             "9999999",
             "123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890",
         }) {
        const boost::multiprecision::cpp_int num(decimal_text);
        // The expected digit count is the length of the number's own string form.
        const auto expected_length = num.str().length();
        BOOST_REQUIRE_EQUAL(count_digits(num), expected_length);
        BOOST_REQUIRE_EQUAL(count_digits(-num), expected_length);
    }
}

// Runs the common comparable bytes checks on big_decimal values built by combining
// a set of multiprecision unscaled values with a set of scales.
BOOST_AUTO_TEST_CASE(test_decimal) {
    // generate few multiprecision ints to be used as unscaled_values in the big_decimal
    std::vector<boost::multiprecision::cpp_int> unscaled_values;
    // 10 shifts x 9 shift products x 4 variants
    unscaled_values.reserve(10 * 9 * 4);
    auto multiprecision_one = utils::multiprecision_int(1);
    for (int shift = 1; shift <= 10; shift++) {
        for (auto shift_prod : {1, 2, 4, 8, 10, 32, 64, 100, 256}) {
            auto mp_num = multiprecision_one << shift * shift_prod;
            for (auto n : std::initializer_list<utils::multiprecision_int>{mp_num, mp_num - 1, -mp_num, -(mp_num - 1)}) {
                unscaled_values.push_back(std::move(n));
            }
        }
    }
    // scales to generate the big_decimal
    std::vector<int32_t> scales{1, 2, 4, 5, 10, 100, 1000};

    // Every unscaled value is paired with three fixed scales (0, int32 min and int32 max)
    // and with both the positive and negative form of each entry in `scales`.
    const size_t items_per_unscaled_value = 3 + 2 * scales.size();
    std::vector<data_value> test_data;
    test_data.reserve(unscaled_values.size() * items_per_unscaled_value);
    for (const auto& unscaled_value : unscaled_values) {
        test_data.emplace_back(big_decimal(0, unscaled_value));
        test_data.emplace_back(big_decimal(std::numeric_limits<int32_t>::min(), unscaled_value));
        test_data.emplace_back(big_decimal(std::numeric_limits<int32_t>::max(), unscaled_value));
        for (const auto& scale : scales) {
            test_data.emplace_back(big_decimal(scale, unscaled_value));
            test_data.emplace_back(big_decimal(-scale, unscaled_value));
        }
    }

    byte_comparable_test(std::move(test_data));
}

// Runs the common comparable bytes checks on random blobs of various lengths.
BOOST_AUTO_TEST_CASE(test_blob) {
    // Returns `length` random bytes.
    const auto make_random_blob = [] (size_t length) {
        std::vector<int8_t> buffer(length);
        for (auto& slot : buffer) {
            slot = tests::random::get_int<uint8_t>();
        }
        const int8_t* raw = buffer.data();
        return bytes(raw, length);
    };

    std::vector<data_value> test_data;
    test_data.reserve(500);
    for (int round = 0; round < 100; round++) {
        for (int length : {1, 10, 100, 1000}) {
            test_data.emplace_back(make_random_blob(length));
        }
    }

    // test a few cases that are stored across multiple fragments
    constexpr int fragment_unit = 128 * 1024;
    for (int round = 0; round < 10; round++) {
        for (int frag_count = 1; frag_count <= 10; frag_count++) {
            const size_t length = fragment_unit * frag_count;
            test_data.emplace_back(make_random_blob(length));
        }
    }

    byte_comparable_test(std::move(test_data));
}

// Generates random alphanumeric strings of various lengths as test data.
//
// create_data_value_func : converts a generated string into the data_value to be tested.
//
// Returns 100 rounds of strings with lengths 1, 10, 100 and 1000, followed by
// 10 rounds of strings with lengths of 128KiB x [1, 10].
static std::vector<data_value> generate_string_test_data(
    std::function<data_value(std::string&&)> create_data_value_func) {
    const std::string alphabet = "0123456789"
                                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                 "abcdefghijklmnopqrstuvwxyz";

    // Returns a string of `length` characters, each picked at random from the alphabet.
    const auto make_random_text = [&alphabet] (size_t length) {
        std::string text;
        text.reserve(length);
        while (text.size() < length) {
            const auto pick = tests::random::get_int<size_t>(0, alphabet.size() - 1);
            text.push_back(alphabet[pick]);
        }
        return text;
    };

    std::vector<data_value> test_data;
    test_data.reserve(500);
    for (int round = 0; round < 100; round++) {
        for (int length : {1, 10, 100, 1000}) {
            test_data.push_back(create_data_value_func(make_random_text(length)));
        }
    }

    // test a few cases that are stored across multiple fragments
    constexpr int fragment_unit = 128 * 1024;
    for (int round = 0; round < 10; round++) {
        for (int frag_count = 1; frag_count <= 10; frag_count++) {
            const size_t length = fragment_unit * frag_count;
            test_data.push_back(create_data_value_func(make_random_text(length)));
        }
    }

    return test_data;
}

// Runs the common comparable bytes checks on random strings wrapped in ascii_native_type.
BOOST_AUTO_TEST_CASE(test_ascii) {
    const auto make_ascii = [] (std::string&& str) {
        return data_value(ascii_native_type(str));
    };
    auto ascii_values = generate_string_test_data(make_ascii);
    byte_comparable_test(std::move(ascii_values));
}

// Runs the common comparable bytes checks on data_values built directly from random strings.
BOOST_AUTO_TEST_CASE(test_text) {
    const auto make_text = [] (std::string&& str) {
        return data_value(str);
    };
    auto text_values = generate_string_test_data(make_text);
    byte_comparable_test(std::move(text_values));
}

// Runs the common comparable bytes checks on 1000 random cql_duration values, with
// months in [0, 12], days in [0, 28] and nanoseconds in [0, nanoseconds in a day].
BOOST_AUTO_TEST_CASE(test_duration) {
    constexpr int64_t max_ns_in_a_day = 24L * 60 * 60 * 1000 * 1000 * 1000;
    constexpr int duration_count = 1000;

    std::vector<data_value> test_data;
    test_data.reserve(duration_count);
    for (int generated = 0; generated < duration_count; generated++) {
        // Draw the random components in a fixed order : months, days, then nanoseconds.
        const auto random_months = tests::random::get_int<int32_t>(0, 12);
        const auto random_days = tests::random::get_int<int32_t>(0, 28);
        const auto random_ns = tests::random::get_int<int64_t>(0, max_ns_in_a_day);
        test_data.emplace_back(cql_duration(
                months_counter{random_months}, days_counter{random_days}, nanoseconds_counter{random_ns}));
    }

    byte_comparable_test(std::move(test_data));
}

// Runs the common comparable bytes checks on inet addresses : IPv4 addresses generated
// from uint32_t bit patterns, plus a few hand picked IPv4 and IPv6 addresses.
BOOST_AUTO_TEST_CASE(test_inet) {
    const auto make_ipv4 = [] (uint32_t raw_address) {
        return data_value(seastar::net::ipv4_address(raw_address));
    };
    auto test_data = generate_integer_test_data<uint32_t>(make_ipv4);

    // Include few more addresses
    for (const std::string& address_text : {
                 // IPv4
                 "127.0.0.1",
                 "10.0.0.1",
                 "172.16.1.1",
                 "192.168.2.2",
                 "224.3.3.3",
                 // IPv6
                 "0000:0000:0000:0000:0000:0000:0000:0000",
                 "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
                 "fe80:1:23:456:7890:1:23:456",
         }) {
        const seastar::net::inet_address parsed_address(address_text);
        test_data.emplace_back(parsed_address);
    }

    byte_comparable_test(std::move(test_data));
}

// Returns a data_value holding a uuid obtained from utils::make_random_uuid().
static data_value make_random_data_value_uuid() {
    auto random_uuid = utils::make_random_uuid();
    return data_value(random_uuid);
}
// Returns a data_value holding random bytes, with a random size in [1, 128 KB].
static data_value make_random_data_value_bytes() {
    constexpr size_t max_bytes_size = 128 * 1024; // 128 KB
    const auto random_size = tests::random::get_int<size_t>(1, max_bytes_size);
    return data_value(tests::random::get_bytes(random_size));
}

// Helpers under test; they are defined outside this file.
extern void encode_component(const abstract_type& type, managed_bytes_view serialized_bytes_view, bytes_ostream& out);
extern void decode_component(const abstract_type& type, managed_bytes_view& comparable_bytes_view, bytes_ostream& out);

// Verifies that a value encoded with encode_component() starts with the NEXT_COMPONENT
// marker and that decode_component() turns the rest back into the value's size
// followed by its serialized bytes.
BOOST_AUTO_TEST_CASE(test_encode_decode_component) {
    constexpr uint8_t NEXT_COMPONENT = 0x40;
    for (const auto& test_value : {
        make_random_data_value_uuid(), // data type with fixed length
        make_random_data_value_bytes(), // data type with variable length
    }) {
        const auto& value_type = *test_value.type();
        auto serialized = test_value.serialize_nonnull();

        // encode
        bytes_ostream encoded_out;
        encode_component(value_type, managed_bytes_view(serialized), encoded_out);
        auto encoded = std::move(encoded_out).to_managed_bytes();
        auto encoded_view = managed_bytes_view(encoded);
        // encoded component should begin with a NEXT_COMPONENT marker
        BOOST_REQUIRE_EQUAL(read_simple_native<uint8_t>(encoded_view), NEXT_COMPONENT);

        // decode whatever follows the marker
        bytes_ostream decoded_out;
        decode_component(value_type, encoded_view, decoded_out);
        auto decoded = std::move(decoded_out).to_managed_bytes();
        auto decoded_view = managed_bytes_view(decoded);
        // decoded bytes should match the serialized form : the size first, then the value
        BOOST_REQUIRE_EQUAL(read_simple<int32_t>(decoded_view), test_value.serialized_size());
        BOOST_REQUIRE(decoded_view == managed_bytes_view(serialized));
    }
}

// Generates a vector of vectors of data_value, where each inner vector represents a collection of data_values.
//
// collection_size   : when 0, every collection gets a random size in [1, 25] and an empty
//                     collection is appended at the end; otherwise every collection has
//                     exactly `collection_size` elements.
// create_data_value : produces a single collection element.
//
// Besides 500 freshly generated collections, the result includes 10 exact duplicates and
// 10 partial duplicates (prefixes) of randomly picked collections.
template<size_t collection_size = 0>
static auto generate_collection_test_data(const std::function<data_value()>& create_data_value) {
    constexpr size_t test_data_size = 500, max_collection_size = 25;
    std::vector<std::vector<data_value>> test_data;
    // 500 generated collections + 20 duplicates + 1 empty collection. This also guarantees that
    // the vector never reallocates while elements of it are being copied below.
    test_data.reserve(test_data_size + 21);
    for (size_t i = 0; i < test_data_size; i++) {
        // Generate a single collection and add it to test data
        size_t element_count = collection_size;
        if constexpr (collection_size == 0) {
            element_count = tests::random::get_int<size_t>(1, max_collection_size);
        }
        std::vector<data_value> collection;
        collection.reserve(element_count);
        // Count up to the requested size and not up to capacity() : reserve() is allowed to
        // allocate more than what was asked for.
        for (size_t j = 0; j < element_count; j++) {
            collection.push_back(create_data_value());
        }
        test_data.push_back(std::move(collection));
    }

    // Include few duplicates in the test data with variations
    for (int i = 0; i < 10; i++) {
        test_data.emplace_back(test_data.at(tests::random::get_int<size_t>(test_data_size - 1)));
        // include a partial duplicate
        auto test_item = test_data.at(tests::random::get_int<size_t>(test_data_size - 1));
        test_data.emplace_back(test_item.begin(), test_item.begin() + tests::random::get_int<size_t>(1, test_item.size()));
        if constexpr (collection_size != 0) {
            // For fixed-size collections, the partial duplicate must be padded with random data to meet the required size.
            auto& partial_duplicate = test_data.back();
            while (partial_duplicate.size() < collection_size) {
                partial_duplicate.push_back(create_data_value());
            }
        }
    }

    if constexpr (collection_size == 0) {
        // Add an empty collection to the test data
        test_data.push_back({});
    }

    return test_data;
}

// Common test method for lists and sets. Note that a set is expected to be sorted and unique,
// but it doesn't matter during tests, as both lists and sets internally use the same underlying
// implementation based on std::vectors.
//
// get_collection_type   : returns the collection type for the given element type and multi-cell flag.
// make_collection_value : builds a collection data_value of the given type from its elements.
static void test_set_or_list(const std::function<data_type(data_type, bool)>& get_collection_type,
                             const std::function<data_value(data_type, std::vector<data_value>)>& make_collection_value) {
    // Turns every generated element vector into a collection value of the given element type
    // and runs the common checks, once with multi-cell disabled and once with it enabled.
    const auto run_for_element_type = [&] (const data_type& element_type, std::vector<std::vector<data_value>>&& element_sets) {
        for (bool is_multi_cell : {false, true}) {
            const auto collection_type = get_collection_type(element_type, is_multi_cell);

            std::vector<data_value> collection_values;
            collection_values.reserve(element_sets.size());
            for (const auto& elements : element_sets) {
                collection_values.emplace_back(make_collection_value(collection_type, elements));
            }

            byte_comparable_test(std::move(collection_values));
        }
    };

    // Test the collection with a data type that has fixed length : UUID (128 bits)
    run_for_element_type(uuid_type, generate_collection_test_data(make_random_data_value_uuid));
    // Test the collection with a data type that has variable length : bytes
    run_for_element_type(bytes_type, generate_collection_test_data(make_random_data_value_bytes));
}

// Runs the checks of test_set_or_list() on sets, using set_type_impl::get_instance to
// build the types and make_set_value to build the values.
BOOST_AUTO_TEST_CASE(test_set) {
    test_set_or_list(set_type_impl::get_instance, make_set_value);
}

// Runs the checks of test_set_or_list() on lists, using list_type_impl::get_instance to
// build the types and make_list_value to build the values.
BOOST_AUTO_TEST_CASE(test_list) {
    test_set_or_list(list_type_impl::get_instance, make_list_value);
}

BOOST_AUTO_TEST_CASE(test_map) {
    // Build maps with UUID keys and bytes values to run the byte-comparable checks on.
    constexpr size_t random_map_count = 500, max_entries = 25;
    std::vector<map_type_impl::native_type> maps;
    // Random maps + two entries for each of the 10 duplicate rounds + one empty map.
    maps.reserve(random_map_count + 21);
    for (size_t map_idx = 0; map_idx < random_map_count; map_idx++) {
        map_type_impl::native_type entries;
        const size_t entry_count = tests::random::get_int<size_t>(1, max_entries);
        for (size_t entry_idx = 0; entry_idx < entry_count; entry_idx++) {
            // Each entry is a random UUID key paired with a random bytes value
            entries.emplace_back(make_random_data_value_uuid(), make_random_data_value_bytes());
        }

        // Store a copy of the generated entries as one map of the test data
        maps.emplace_back(entries.begin(), entries.end());
    }

    // Append duplicates of randomly picked maps, along with a variant of each
    for (int round = 0; round < 10; round++) {
        auto picked = maps.at(tests::random::get_int<size_t>(random_map_count - 1));
        // Exact duplicate
        maps.emplace_back(picked);
        // Variant that has the same keys but freshly generated values
        map_type_impl::native_type same_keys_new_values;
        for (const auto& [key, value] : picked) {
            same_keys_new_values.emplace_back(key, make_random_data_value_bytes());
        }
        maps.emplace_back(std::move(same_keys_new_values));
    }

    // Append an empty map to the test data
    maps.emplace_back();

    // Run the checks with multi-cell disabled and enabled
    for (bool multi_cell : {false, true}) {
        const auto type = map_type_impl::get_instance(uuid_type, bytes_type, multi_cell);

        std::vector<data_value> values;
        values.reserve(maps.size());
        for (const auto& entries : maps) {
            values.emplace_back(make_map_value(type, entries));
        }

        byte_comparable_test(std::move(values));
    }
}

BOOST_AUTO_TEST_CASE(test_tuple) {
    // The tuples under test have two fields : a UUID followed by bytes
    const auto type = tuple_type_impl::get_instance({uuid_type, bytes_type});

    constexpr int random_tuple_count = 1000;
    std::vector<data_value> tuples;
    // Random tuples + 3 variants for each of the 10 duplicate rounds + 3 tuples with nulls
    tuples.reserve(random_tuple_count + 30 + 3);
    for (int n = 0; n < random_tuple_count; n++) {
        tuples.emplace_back(make_tuple_value(type, {make_random_data_value_uuid(), make_random_data_value_bytes()}));
    }

    // Append tuples that share one or both fields with a randomly picked existing tuple
    for (int round = 0; round < 10; round++) {
        auto picked = value_cast<tuple_type_impl::native_type>(
            tuples.at(tests::random::get_int<size_t>(random_tuple_count - 1)));
        // same first field, new second field
        tuples.emplace_back(make_tuple_value(type, {picked.at(0), make_random_data_value_bytes()}));
        // new first field, same second field
        tuples.emplace_back(make_tuple_value(type, {make_random_data_value_uuid(), picked.at(1)}));
        // exact duplicate
        tuples.emplace_back(make_tuple_value(type, {picked.at(0), picked.at(1)}));
    }

    // Append tuples that have a null in the second field, the first field and in both fields
    tuples.emplace_back(make_tuple_value(type, {make_random_data_value_uuid(), data_value::make_null(bytes_type)}));
    tuples.emplace_back(make_tuple_value(type, {data_value::make_null(uuid_type), make_random_data_value_bytes()}));
    tuples.emplace_back(make_tuple_value(type, {data_value::make_null(uuid_type), data_value::make_null(bytes_type)}));

    byte_comparable_test(std::move(tuples));
}

BOOST_AUTO_TEST_CASE(test_udt) {
    // The UDT under test has three fields of types : uuid, bytes, int64_t
    using udt_fields = user_type_impl::native_type;

    constexpr int random_udt_count = 1000;
    std::vector<udt_fields> udts;
    udts.reserve(random_udt_count + 100);
    auto make_random_data_value_int64 = [] () {
        return data_value(tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max()));
    };
    for (int n = 0; n < random_udt_count; n++) {
        udts.emplace_back(udt_fields{
            make_random_data_value_uuid(), make_random_data_value_bytes(), make_random_data_value_int64()});
    }

    // Append values that share some or all of the fields with a randomly picked existing value
    for (int round = 0; round < 10; round++) {
        auto picked = udts.at(tests::random::get_int<size_t>(random_udt_count - 1));
        // two fields shared, one regenerated
        udts.emplace_back(udt_fields{picked.at(0), picked.at(1), make_random_data_value_int64()});
        udts.emplace_back(udt_fields{picked.at(0), make_random_data_value_bytes(), picked.at(2)});
        udts.emplace_back(udt_fields{make_random_data_value_uuid(), picked.at(1), picked.at(2)});
        // one field shared, two regenerated
        udts.emplace_back(udt_fields{picked.at(0), make_random_data_value_bytes(), make_random_data_value_int64()});
        udts.emplace_back(udt_fields{make_random_data_value_uuid(), picked.at(1), make_random_data_value_int64()});
        udts.emplace_back(udt_fields{make_random_data_value_uuid(), make_random_data_value_bytes(), picked.at(2)});
        // exact duplicate
        udts.emplace_back(picked);
    }

    // Append UDT values that have nulls : every combination of one, two and three null fields
    udts.emplace_back(udt_fields{make_random_data_value_uuid(), make_random_data_value_bytes(), data_value::make_null(long_type)});
    udts.emplace_back(udt_fields{make_random_data_value_uuid(), data_value::make_null(bytes_type), make_random_data_value_int64()});
    udts.emplace_back(udt_fields{data_value::make_null(uuid_type), make_random_data_value_bytes(), make_random_data_value_int64()});
    udts.emplace_back(udt_fields{make_random_data_value_uuid(), data_value::make_null(bytes_type), data_value::make_null(long_type)});
    udts.emplace_back(udt_fields{data_value::make_null(uuid_type), make_random_data_value_bytes(), data_value::make_null(long_type)});
    udts.emplace_back(udt_fields{data_value::make_null(uuid_type), data_value::make_null(bytes_type), make_random_data_value_int64()});
    udts.emplace_back(udt_fields{data_value::make_null(uuid_type), data_value::make_null(bytes_type), data_value::make_null(long_type)});

    // Run the checks for both the frozen and the non frozen variant of the type
    for (bool multi_cell : {false, true}) {
        const auto type = user_type_impl::get_instance("ks_test", "cb_test_udt",
                std::vector<bytes>{"field1", "field2", "field3"},
                std::vector<data_type>{uuid_type, bytes_type, long_type}, multi_cell);

        std::vector<data_value> values;
        values.reserve(udts.size());
        for (const auto& fields : udts) {
            values.emplace_back(make_user_value(type, fields));
        }

        byte_comparable_test(std::move(values));
    }
}

BOOST_AUTO_TEST_CASE(test_vector) {
    // Wraps every generated element list into a vector value and runs the byte-comparable
    // checks on them. The dimension of the vector type is taken from the first list, so
    // all the lists are expected to have the same size.
    auto run_for_element_type = [] (const data_type& element_type, std::vector<std::vector<data_value>>&& element_lists) {
        auto type = vector_type_impl::get_instance(element_type, element_lists.at(0).size());

        std::vector<data_value> values;
        values.reserve(element_lists.size());
        for (const auto& elements : element_lists) {
            values.emplace_back(make_vector_value(type, elements));
        }

        byte_comparable_test(std::move(values));
    };

    // Vectors of 128 elements of a fixed length type : UUID (128 bits)
    run_for_element_type(uuid_type, generate_collection_test_data<128>(make_random_data_value_uuid));
    // Vectors of 16 elements of a variable length type : bytes
    run_for_element_type(bytes_type, generate_collection_test_data<16>(make_random_data_value_bytes));
}

BOOST_AUTO_TEST_CASE(test_reversed) {
    // Every byte_comparable_test() call below passes `true` as the second argument.
    // NOTE(review): presumably this flag makes the checks run against the reversed
    // variant of the type - confirm against the definition of byte_comparable_test().

    // Test reversed with native types
    byte_comparable_test(generate_integer_test_data<int64_t>(), true);
    byte_comparable_test(generate_string_test_data([] (std::string&& str) {
        return data_value(str);
    }), true);

    // Test reversed with a collection
    const auto list_type = list_type_impl::get_instance(bytes_type, false);
    const auto element_lists = generate_collection_test_data(make_random_data_value_bytes);
    std::vector<data_value> collection_test_data;
    // Size the vector from the generated data rather than from a hard-coded number,
    // so that it always fits all the values without having to reallocate.
    collection_test_data.reserve(element_lists.size());
    for (const auto& test_case : element_lists) {
        collection_test_data.emplace_back(make_list_value(list_type, test_case));
    }
    byte_comparable_test(std::move(collection_test_data), true);
}

BOOST_AUTO_TEST_CASE(test_empty) {
    // A value of the empty type must encode into zero comparable bytes
    // and decode back into a value that is equal to the original one.
    auto empty_value = data_value(empty_type_representation{});
    auto encoded = comparable_bytes::from_data_value(empty_value);
    BOOST_REQUIRE(encoded->size() == 0);
    BOOST_REQUIRE(empty_value == encoded->to_data_value(empty_type));
}

// Test Scylla's byte-comparable encoding compatibility with Cassandra's implementation by
// verifying that serialized values produce the same comparable bytes as those generated by Cassandra.
// The test data was generated using the cassandra unit test pushed to the following branch:
// https://github.com/scylladb/scylla-dev/blob/byte-comparable-compatibility-generator
//
// The data file consists of type lines, each followed by the data lines of that type.
// A data line has two columns, separated by its last comma : the actual value and the
// comparable bytes that cassandra has encoded for it.
SEASTAR_TEST_CASE(test_compatibility) {
    return sstables::test_env::do_with_async([] (sstables::test_env&) {
        auto file = open_file_dma("test/resource/byte_comparable_compatibility_data.csv", open_flags::ro).get();
        auto fs = make_file_input_stream(file);

        // Type of the data lines that are being processed; replaced by every type line.
        data_type type;

        // Handles one complete line of the file, passed in without the terminating '\n'.
        auto process_line = [&type] (const std::string& curr_line) {
            // Test data has `type` followed by the test data in subsequent lines.
            if (curr_line.starts_with("org.apache.cassandra.db.marshal")) {
                // This is the type line, parse it and wait for its data lines.
                type = db::marshal::type_parser::parse(std::string_view(curr_line));
                testlog.info("testing compatibility of type: {}",
                    type->is_reversed() ? format("reversed<{}>", type->cql3_type_name()) : type->cql3_type_name());
                return;
            }

            // A data line is meaningful only after a type line; fail with a clear
            // message instead of dereferencing a null type below.
            BOOST_REQUIRE_MESSAGE(static_cast<bool>(type), "test data found before any type line");

            // This line has the test data for the type.
            // Test data has two columns: actual value and comparable bytes encoded by cassandra
            const auto comma_pos = curr_line.rfind(',');
            BOOST_REQUIRE_MESSAGE(comma_pos != std::string::npos, "invalid CSV entry");
            const auto actual_value = curr_line.substr(0, comma_pos);
            const auto origin_encoded_cb = comparable_bytes(managed_bytes(bytes_type->from_string(curr_line.substr(comma_pos + 1))));

            bytes serialized_bytes;
            if (type->is_native()) {
                serialized_bytes = type->from_string(actual_value);
            } else {
                // Workaround for composite types as abstract_type::from_string() doesn't support them.
                serialized_bytes = from_json_object(*type, rjson::parse(actual_value));
            }

            // Verify encoding
            comparable_bytes scylla_encoded_cb(*type, managed_bytes_view(serialized_bytes));
            BOOST_REQUIRE_MESSAGE(scylla_encoded_cb == origin_encoded_cb, seastar::value_of([&] () {
                return fmt::format("encoding failed for value : {}", actual_value);
            }));

            // Verify decoding
            BOOST_REQUIRE_MESSAGE(origin_encoded_cb.to_data_value(type) == type->deserialize(serialized_bytes), seastar::value_of([&] () {
                return fmt::format("decoding failed for value : {}", actual_value);
            }));
        };

        // Read file contents in a loop and handle them line by line. A line can span
        // multiple reads, so the unprocessed tail is kept in input_buffer between reads.
        std::string input_buffer;
        temporary_buffer<char> buf = fs.read().get();
        while (!buf.empty()) {
            input_buffer.append(buf.get(), buf.size());
            size_t pos = 0;
            // Process every line that is fully available in the buffer.
            for (size_t end = input_buffer.find('\n', pos); end != std::string::npos; end = input_buffer.find('\n', pos)) {
                process_line(input_buffer.substr(pos, end - pos));
                pos = end + 1;
            }

            // Remove the lines that were processed from the input buffer.
            input_buffer.erase(0, pos);

            buf = fs.read().get();
        }

        // The last line of the file may not be terminated by '\n'; verify it as well
        // instead of silently dropping it.
        if (!input_buffer.empty()) {
            process_line(input_buffer);
        }

        file.close().get();
    });
}