This patch removes the dependence of the vector search module on the cql3 module by moving the contents of cql3/type_json.hh to types/json_utils.hh and removing the usage of the cql3 primary_key object in vector_store_client. We also make the needed adjustments to files that were previously using the aforementioned type_json.hh file. This fixes the circular dependency cql3 <-> vector_search. Closes scylladb/scylladb#26482
883 lines
41 KiB
C++
883 lines
41 KiB
C++
/*
|
|
* Copyright (C) 2024-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
#include "test/lib/scylla_test_case.hh"
|
|
|
|
#include <seastar/net/inet_address.hh>
|
|
#include <seastar/net/ipv4_address.hh>
|
|
#include <seastar/util/lazy.hh>
|
|
#include <vector>
|
|
|
|
#include "bytes_ostream.hh"
|
|
#include "db/marshal/type_parser.hh"
|
|
#include "test/lib/log.hh"
|
|
#include "test/lib/random_utils.hh"
|
|
#include "test/lib/sstable_test_env.hh"
|
|
#include "types/comparable_bytes.hh"
|
|
#include "types/json_utils.hh"
|
|
#include "types/list.hh"
|
|
#include "types/map.hh"
|
|
#include "types/set.hh"
|
|
#include "types/types.hh"
|
|
#include "types/vector.hh"
|
|
#include "utils/big_decimal.hh"
|
|
#include "utils/fragment_range.hh"
|
|
#include "utils/managed_bytes.hh"
|
|
#include "utils/multiprecision_int.hh"
|
|
#include "utils/UUID.hh"
|
|
#include "utils/UUID_gen.hh"
|
|
#include "utils/rjson.hh"
|
|
|
|
// Null inputs must produce the same result as a default-constructed comparable_bytes_opt.
BOOST_AUTO_TEST_CASE(test_comparable_bytes_opt) {
    // starting from a null data_value
    auto from_null_value = comparable_bytes::from_data_value(data_value::make_null(int32_type));
    BOOST_REQUIRE(from_null_value == comparable_bytes_opt());

    // starting from a default-constructed managed_bytes_opt
    auto from_null_bytes = comparable_bytes::from_serialized_bytes(*int32_type, managed_bytes_opt());
    BOOST_REQUIRE(from_null_bytes == comparable_bytes_opt());
}
|
|
|
|
// Checks the byte-comparable form of both boolean values: size, content, decoding and order.
BOOST_AUTO_TEST_CASE(test_bool) {
    // Verifies that `encoded` is a single byte equal to `value` and decodes back to `value`.
    auto verify_encoding = [] (comparable_bytes_opt& encoded, bool value) {
        BOOST_REQUIRE_EQUAL(encoded->size(), 1);
        const bool first_byte_matches = encoded->as_managed_bytes_view().front() == uint8_t(value);
        BOOST_REQUIRE_MESSAGE(first_byte_matches,
                fmt::format("comparable bytes encode failed for bool value : {}", value));
        BOOST_REQUIRE_MESSAGE(value == encoded->to_data_value(boolean_type),
                fmt::format("comparable bytes decode failed for bool value : {}", value));
    };

    auto cb_false = comparable_bytes::from_data_value(false);
    auto cb_true = comparable_bytes::from_data_value(true);
    verify_encoding(cb_false, false);
    verify_encoding(cb_true, true);

    // false must order before true
    BOOST_REQUIRE(cb_false < cb_true);
}
|
|
|
|
void byte_comparable_test(std::vector<data_value>&& test_data, bool test_reversed_type = false) {
|
|
struct test_item {
|
|
managed_bytes serialized_bytes;
|
|
comparable_bytes comparable_bytes;
|
|
};
|
|
std::vector<test_item> test_items;
|
|
|
|
// test encode/decode
|
|
const auto test_data_type = test_reversed_type ? reversed(test_data.at(0).type()) : test_data.at(0).type();
|
|
testlog.info("testing type '{}' with {} items...",
|
|
test_reversed_type ? format("reversed<{}>", test_data_type.get()->cql3_type_name()) : test_data_type.get()->cql3_type_name(),
|
|
test_data.size());
|
|
testlog.trace("test data : {}", test_data);
|
|
for (const data_value& value : test_data) {
|
|
// verify comparable bytes encode/decode
|
|
auto original_serialized_bytes = managed_bytes(value.serialize_nonnull());
|
|
comparable_bytes comparable_bytes(*test_data_type, original_serialized_bytes);
|
|
auto decoded_serialized_bytes = comparable_bytes.to_serialized_bytes(*test_data_type).value();
|
|
if (test_data_type == decimal_type || test_data_type->is_tuple()) {
|
|
// 1. The `decimal_type` requires special handling because its comparable byte representation
|
|
// normalizes the scale and unscaled value. This means the serialized bytes after
|
|
// decoding from comparable bytes might not be identical to the original serialized bytes,
|
|
// despite them representing the same decimal value.
|
|
// For instance, 2e-1 (scale=1, unscaled_value=2) and 20e-2 (scale=2, unscaled_value=20)
|
|
// are equivalent decimals but have different serialized forms. Comparable byte encoding
|
|
// will normalize them. So, instead of directly comparing serialized bytes, compare the
|
|
// deserialized decoded value against the original decimal value.
|
|
// 2. When encoding `tuple_type`, any trailing nulls are trimmed, so the serialized bytes
|
|
// cannot be compared directly.
|
|
auto decoded_value = test_data_type->deserialize_value(managed_bytes_view(decoded_serialized_bytes));
|
|
BOOST_REQUIRE_MESSAGE(value == decoded_value, seastar::value_of([&] () {
|
|
return fmt::format("comparable bytes encode/decode failed for value : {}", value);
|
|
}));
|
|
} else {
|
|
// Compare the serialized bytes directly
|
|
BOOST_REQUIRE_MESSAGE(original_serialized_bytes == decoded_serialized_bytes, seastar::value_of([&] () {
|
|
return fmt::format("comparable bytes encode/decode failed for value : {}", value);
|
|
}));
|
|
}
|
|
|
|
// collect the data in a vector to verify ordering later
|
|
test_items.emplace_back(original_serialized_bytes, comparable_bytes);
|
|
};
|
|
|
|
// Verify that decoding succeeds even when the comparable bytes contain
|
|
// extra data appended after the value to be converted.
|
|
// This required for decode to work on composite types.
|
|
bytes_ostream bos;
|
|
// Select an item from the middle to test this case as front and back items
|
|
// are often edge cases (e.g. min/max values).
|
|
const auto item_id = test_items.size() / 2;
|
|
auto test_value = test_items.at(item_id);
|
|
auto cb_view = test_value.comparable_bytes.as_managed_bytes_view();
|
|
bos.write(cb_view);
|
|
bos.write(bytes("this-still-should-work"));
|
|
auto cb = comparable_bytes(std::move(bos).to_managed_bytes());
|
|
auto decoded_value = cb.to_data_value(test_data_type);
|
|
BOOST_REQUIRE_MESSAGE(test_data.at(item_id) == decoded_value, seastar::value_of([&] () {
|
|
return fmt::format("comparable bytes decode failed with appended bytes; expected : {}; actual : {}", test_data.at(0), decoded_value);
|
|
}));
|
|
|
|
// Sort the items based on comparable bytes
|
|
std::ranges::sort(test_items, [] (const test_item& a, const test_item& b) {
|
|
return a.comparable_bytes < b.comparable_bytes;
|
|
});
|
|
|
|
// Verify that ordering them based on comparable bytes, sorts the values as expected
|
|
BOOST_REQUIRE_MESSAGE(std::ranges::is_sorted(test_items, [&test_data_type] (const test_item& a, const test_item& b) {
|
|
return test_data_type->compare(a.serialized_bytes, b.serialized_bytes) == std::strong_ordering::less;
|
|
}), "sorting items based on comparable bytes failed");
|
|
}
|
|
|
|
template <std::integral int_type>
|
|
static std::vector<data_value> generate_integer_test_data(
|
|
// Function to create a data_value from the underlying integer type.
|
|
std::function<data_value(int_type)> create_data_value_func = {},
|
|
// Function to filter out values that should not be included in the test data.
|
|
std::function<bool(int_type)> filter_func = {}) {
|
|
if (!create_data_value_func) {
|
|
if constexpr (std::is_signed_v<int_type>) {
|
|
// If a custom create_data_value_fn is not provided, create data_value
|
|
// directly from the underlying integer type.
|
|
create_data_value_func = [](int_type num) {
|
|
return data_value(num);
|
|
};
|
|
} else {
|
|
// For unsigned integer types, the caller must provide a custom create_data_value_fn,
|
|
// as the data_value class doesn't have an unambiguous constructor for unsigned values.
|
|
SCYLLA_ASSERT(false);
|
|
}
|
|
}
|
|
|
|
std::vector<data_value> test_data;
|
|
auto push_to_test_data = [&] (int_type num) {
|
|
for (int_type n : std::initializer_list<int_type>{num, ~num}) {
|
|
if (!filter_func || filter_func(n)) {
|
|
test_data.push_back(create_data_value_func(n));
|
|
}
|
|
}
|
|
};
|
|
|
|
// Generates test values by shifting bit(1) through all possible positions and then deriving
|
|
// multiple test cases from each value. This helps test edge cases and boundary conditions
|
|
// by covering values with different bit patterns across the entire range of the type.
|
|
auto num = int_type(1);
|
|
auto num_bits = sizeof(int_type) * 8;
|
|
test_data.reserve(num_bits * 4);
|
|
while (num_bits-- > 0) {
|
|
// for every num, we push [num, ~num, num - 1, ~(num - 1)] to the test data.
|
|
push_to_test_data(num);
|
|
if (num != std::numeric_limits<int_type>::min()) {
|
|
push_to_test_data(num - 1);
|
|
}
|
|
|
|
num <<= 1;
|
|
}
|
|
|
|
return test_data;
|
|
}
|
|
|
|
// Runs the byte-comparable checks on data_values generated from int8_t.
BOOST_AUTO_TEST_CASE(test_tinyint) {
    auto tinyint_values = generate_integer_test_data<int8_t>();
    byte_comparable_test(std::move(tinyint_values));
}
|
|
|
|
// Runs the byte-comparable checks on data_values generated from int16_t.
BOOST_AUTO_TEST_CASE(test_smallint) {
    auto smallint_values = generate_integer_test_data<int16_t>();
    byte_comparable_test(std::move(smallint_values));
}
|
|
|
|
// Runs the byte-comparable checks on data_values generated from int32_t.
BOOST_AUTO_TEST_CASE(test_int) {
    auto int_values = generate_integer_test_data<int32_t>();
    byte_comparable_test(std::move(int_values));
}
|
|
|
|
// Runs the byte-comparable checks on data_values generated from int64_t.
BOOST_AUTO_TEST_CASE(test_bigint) {
    auto bigint_values = generate_integer_test_data<int64_t>();
    byte_comparable_test(std::move(bigint_values));
}
|
|
|
|
// Runs the byte-comparable checks on simple_date values built from uint32_t day counts.
BOOST_AUTO_TEST_CASE(test_simple_date) {
    // The generator needs an explicit conversion for unsigned integers.
    const auto to_simple_date = [] (uint32_t days) {
        return data_value(simple_date_native_type{days});
    };
    byte_comparable_test(generate_integer_test_data<uint32_t>(to_simple_date));
}
|
|
|
|
// Runs the byte-comparable checks on time values, using only nanosecond counts
// in the range [0, nanoseconds in a day].
BOOST_AUTO_TEST_CASE(test_time) {
    constexpr int64_t max_ns_in_a_day = 24L * 60 * 60 * 1000 * 1000 * 1000;

    const auto to_time_value = [] (int64_t nanoseconds) {
        return data_value(time_native_type{nanoseconds});
    };
    // reject negative values and values beyond a day
    const auto is_valid_time = [] (int64_t ns_candidate) {
        return !(ns_candidate < 0 || ns_candidate > max_ns_in_a_day);
    };

    byte_comparable_test(generate_integer_test_data<int64_t>(to_time_value, is_valid_time));
}
|
|
|
|
// Runs the byte-comparable checks on values built from db_clock time points.
BOOST_AUTO_TEST_CASE(test_timestamp) {
    const auto to_timestamp = [] (db_clock::rep milliseconds) {
        return data_value(db_clock::time_point(db_clock::duration(milliseconds)));
    };
    byte_comparable_test(generate_integer_test_data<db_clock::rep>(to_timestamp));
}
|
|
|
|
// Runs the byte-comparable checks on date_type_native_type values wrapping db_clock time points.
BOOST_AUTO_TEST_CASE(test_date) {
    const auto to_date = [] (db_clock::rep milliseconds) {
        return data_value(date_type_native_type{db_clock::time_point{db_clock::duration(milliseconds)}});
    };
    byte_comparable_test(generate_integer_test_data<db_clock::rep>(to_date));
}
|
|
|
|
template <std::floating_point fp_type>
|
|
static std::vector<data_value> generate_floating_point_test_data() {
|
|
std::vector<data_value> test_data;
|
|
for (fp_type n : {-1e30f, -1e3f, -1.0f, -0.001f, -1e-30f, -0.0f, 0.0f, 1e-30f, 0.001f, 1.0f, 1e3f, 1e30f,
|
|
-std::numeric_limits<float>::min(), std::numeric_limits<float>::min(),
|
|
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max(),
|
|
-std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
|
|
std::numeric_limits<float>::quiet_NaN()}) {
|
|
test_data.emplace_back(n);
|
|
}
|
|
|
|
// double has a few more test items
|
|
int random_exponent_min = -30, random_exponent_max = 30;
|
|
if constexpr (std::is_same_v<fp_type, double>) {
|
|
for (fp_type n : std::vector<double>{-1e200, -1e100, 1e100, 1e200,
|
|
-std::numeric_limits<double>::min(), std::numeric_limits<double>::min(),
|
|
-std::numeric_limits<double>::max(), std::numeric_limits<double>::max()}) {
|
|
test_data.emplace_back(n);
|
|
}
|
|
random_exponent_min = -300;
|
|
random_exponent_max = 300;
|
|
}
|
|
|
|
// generate some random test data
|
|
for (int i = 0; i < 100; i++) {
|
|
const auto significand = tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
|
|
const auto scale = std::pow(10, tests::random::get_int<int>(random_exponent_min, random_exponent_max));
|
|
test_data.push_back(fp_type(significand * scale));
|
|
}
|
|
|
|
return test_data;
|
|
}
|
|
|
|
// Runs the byte-comparable checks on float values.
BOOST_AUTO_TEST_CASE(test_float) {
    auto float_values = generate_floating_point_test_data<float>();
    byte_comparable_test(std::move(float_values));
}
|
|
|
|
// Runs the byte-comparable checks on double values.
BOOST_AUTO_TEST_CASE(test_double) {
    auto double_values = generate_floating_point_test_data<double>();
    byte_comparable_test(std::move(double_values));
}
|
|
|
|
void encode_varint_length(uint64_t length, int64_t sign_mask, bytes_ostream& out);
|
|
uint64_t decode_varint_length(managed_bytes_view& src, int64_t sign_only_byte);
|
|
BOOST_AUTO_TEST_CASE(test_varint_length_encoding) {
|
|
for (int shift = 0; shift < 64; shift++) {
|
|
uint64_t length = (uint64_t(1) << shift) - 1;
|
|
for (int64_t sign_mask : {0, -1}) {
|
|
bytes_ostream out;
|
|
encode_varint_length(length, sign_mask, out);
|
|
auto mb = std::move(out).to_managed_bytes();
|
|
auto mbv = managed_bytes_view(mb);
|
|
BOOST_REQUIRE_EQUAL(length, decode_varint_length(mbv, sign_mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Runs the byte-comparable checks on multiprecision integers: everything the int64_t
// generator produces plus numbers around large powers of two.
BOOST_AUTO_TEST_CASE(test_varint) {
    // values that fit into 64 bits
    const auto to_varint = [] (int64_t n) {
        return data_value(utils::multiprecision_int(n));
    };
    std::vector<data_value> test_data = generate_integer_test_data<int64_t>(to_varint);

    // values around 2^(shift * multiplier), both positive and negative
    constexpr int max_shift = 20;
    test_data.reserve(test_data.size() + (max_shift * 4 * 4));
    const auto multiprecision_one = utils::multiprecision_int(1);
    for (int shift = 1; shift <= max_shift; shift++) {
        for (auto shift_multiplier : {64, 100, 256, 512}) {
            auto large_number = multiprecision_one << shift * shift_multiplier;
            const utils::multiprecision_int variants[] = {
                large_number, large_number - 1, -large_number, -(large_number - 1)};
            for (auto number : variants) {
                test_data.emplace_back(number);
            }
        }
    }

    byte_comparable_test(std::move(test_data));
}
|
|
|
|
// Returns `msb` with bits 12..15 replaced by `version`; all other bits are kept.
// The callers use this to stamp a UUID version into the most significant half of a UUID.
static int64_t msb_with_version(int64_t msb, int version) {
    constexpr int version_shift = 12;
    constexpr int64_t version_mask = int64_t(0xF) << version_shift;
    const int64_t without_version = msb & ~version_mask;
    return without_version | (version << version_shift);
}
|
|
|
|
// Builds data_values for `uuid` and for the UUID returned by utils::UUID_gen::negate(uuid),
// checks that the two UUIDs compare the same way as their byte-comparable encodings,
// and appends both data_values to `test_data`.
//
// `create_data_value` converts a UUID into the data_value flavour under test.
static void test_uuid_and_flipped_uuid(utils::UUID&& uuid, std::vector<data_value>& test_data,
        std::function<data_value(utils::UUID&&)>& create_data_value) {
    // The factory takes an rvalue; hand it copies so that `uuid` and `flipped_uuid`
    // are never read after being moved from.
    auto uuid_dv = create_data_value(utils::UUID(uuid));
    // negate the uuid to create a flipped version
    auto flipped_uuid = utils::UUID_gen::negate(uuid);
    auto flipped_uuid_dv = create_data_value(utils::UUID(flipped_uuid));
    // verify that the original and flipped uuids compare correctly in byte-comparable format
    BOOST_REQUIRE((uuid <=> flipped_uuid) == (comparable_bytes::from_data_value(uuid_dv) <=> comparable_bytes::from_data_value(flipped_uuid_dv)));
    // add both original and flipped uuids to the test data
    test_data.push_back(std::move(uuid_dv));
    test_data.push_back(std::move(flipped_uuid_dv));
}
|
|
|
|
static std::vector<data_value> generate_timeuuid_test_data(bool create_timeuuid_native_type) {
|
|
std::function<data_value(utils::UUID&&)> create_data_value;
|
|
if (create_timeuuid_native_type) {
|
|
// create data_value for timeuuid data type
|
|
create_data_value = [] (utils::UUID&& time_uuid) {
|
|
return data_value(timeuuid_native_type(std::move(time_uuid)));
|
|
};
|
|
} else {
|
|
// create data_value for uuid data type
|
|
create_data_value = [] (utils::UUID&& time_uuid) {
|
|
return data_value(std::move(time_uuid));
|
|
};
|
|
}
|
|
|
|
std::vector<data_value> test_data;
|
|
for (auto [msb, lsb] : std::initializer_list<std::pair<int64_t, int64_t>>{
|
|
{0, 0},
|
|
{std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::min()},
|
|
{std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max()},
|
|
}) {
|
|
test_uuid_and_flipped_uuid(utils::UUID(msb_with_version(msb, 1), lsb), test_data, create_data_value);
|
|
}
|
|
|
|
for (int i = 0; i < 500; i++) {
|
|
// Generate a random msb with version set to 1 (time-based UUID)
|
|
test_uuid_and_flipped_uuid(
|
|
utils::UUID(msb_with_version(tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max()), 1),
|
|
tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max())),
|
|
test_data, create_data_value);
|
|
}
|
|
|
|
return test_data;
|
|
}
|
|
|
|
// Runs the byte-comparable checks on timeuuid_native_type values.
BOOST_AUTO_TEST_CASE(test_timeuuid) {
    auto timeuuid_values = generate_timeuuid_test_data(true);
    byte_comparable_test(std::move(timeuuid_values));
}
|
|
|
|
// Runs the byte-comparable checks on plain uuid values: version 1 UUIDs, a handful of
// edge cases, name based UUIDs and random UUIDs with the version set to 4.
BOOST_AUTO_TEST_CASE(test_uuid) {
    // start with the time uuids, stored as plain uuid values
    auto test_data = generate_timeuuid_test_data(false);

    // a few edge cases
    constexpr auto int64_min = std::numeric_limits<int64_t>::min();
    constexpr auto int64_max = std::numeric_limits<int64_t>::max();
    test_data.emplace_back(utils::null_uuid());
    test_data.emplace_back(utils::UUID(int64_max, int64_max));
    test_data.emplace_back(utils::UUID(int64_min, int64_min));
    test_data.emplace_back(utils::UUID("ffffffff-ffff-ffff-ffff-ffffffffffff"));

    // name based, type 3 uuids
    test_data.emplace_back(utils::UUID_gen::get_name_UUID("scylladb"));
    test_data.emplace_back(utils::UUID_gen::get_name_UUID("lakshminarayanansreethar"));

    // random uuids with the version set to 4, each together with its negated counterpart
    std::function<data_value(utils::UUID&&)> create_data_value = [] (utils::UUID&& random_uuid) {
        return data_value(std::move(random_uuid));
    };
    const auto random_int64 = [] {
        return tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max());
    };
    for (auto i = 0; i < 500; i++) {
        test_uuid_and_flipped_uuid(
                utils::UUID(msb_with_version(random_int64(), 4), random_int64()),
                test_data, create_data_value);
    }

    byte_comparable_test(std::move(test_data));
}
|
|
|
|
extern std::size_t count_digits(const boost::multiprecision::cpp_int& value);
|
|
BOOST_AUTO_TEST_CASE(test_count_digits) {
|
|
auto test_precision = [] (boost::multiprecision::cpp_int&& num) {
|
|
const auto expected_length = num.str().length();
|
|
BOOST_REQUIRE_EQUAL(count_digits(num), expected_length);
|
|
BOOST_REQUIRE_EQUAL(count_digits(-num), expected_length);
|
|
};
|
|
|
|
test_precision(boost::multiprecision::cpp_int("0"));
|
|
test_precision(boost::multiprecision::cpp_int("123"));
|
|
test_precision(boost::multiprecision::cpp_int("123456"));
|
|
test_precision(boost::multiprecision::cpp_int("12345600"));
|
|
test_precision(boost::multiprecision::cpp_int("9999999"));
|
|
test_precision(boost::multiprecision::cpp_int(
|
|
"123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"));
|
|
}
|
|
|
|
// Runs the byte-comparable checks on big_decimal values, combining a set of unscaled values
// around powers of two with a range of positive, negative and extreme scales.
BOOST_AUTO_TEST_CASE(test_decimal) {
    // Unscaled values: for several powers of two p, use p, p - 1, -p and -(p - 1).
    std::vector<boost::multiprecision::cpp_int> unscaled_values;
    const auto multiprecision_one = utils::multiprecision_int(1);
    for (int shift = 1; shift <= 10; shift++) {
        for (auto shift_prod : {1, 2, 4, 8, 10, 32, 64, 100, 256}) {
            auto power_of_two = multiprecision_one << shift * shift_prod;
            for (auto n : std::initializer_list<utils::multiprecision_int>{
                    power_of_two, power_of_two - 1, -power_of_two, -(power_of_two - 1)}) {
                unscaled_values.push_back(std::move(n));
            }
        }
    }

    // Scales paired with every unscaled value; each is used with both signs.
    const std::vector<int32_t> scales{1, 2, 4, 5, 10, 100, 1000};
    constexpr int32_t min_scale = std::numeric_limits<int32_t>::min();
    constexpr int32_t max_scale = std::numeric_limits<int32_t>::max();

    std::vector<data_value> decimals;
    decimals.reserve(unscaled_values.size() * scales.size() * 5);
    for (const auto& unscaled_value : unscaled_values) {
        // zero scale and the two extremes
        decimals.emplace_back(big_decimal(0, unscaled_value));
        decimals.emplace_back(big_decimal(min_scale, unscaled_value));
        decimals.emplace_back(big_decimal(max_scale, unscaled_value));
        for (const auto& scale : scales) {
            decimals.emplace_back(big_decimal(scale, unscaled_value));
            decimals.emplace_back(big_decimal(-scale, unscaled_value));
        }
    }

    byte_comparable_test(std::move(decimals));
}
|
|
|
|
// Runs the byte-comparable checks on random blobs of various lengths, from a single byte
// up to 1280 KiB.
BOOST_AUTO_TEST_CASE(test_blob) {
    // Returns `length` random bytes.
    auto random_bytes = [] (size_t length) {
        std::vector<int8_t> buffer(length);
        for (size_t i = 0; i < length; ++i) {
            buffer[i] = tests::random::get_int<uint8_t>();
        }
        return bytes(buffer.data(), length);
    };

    std::vector<data_value> test_data;
    test_data.reserve(500);

    // short blobs
    for (int i = 0; i < 100; i++) {
        for (int length : {1, 10, 100, 1000}) {
            test_data.emplace_back(random_bytes(length));
        }
    }

    // larger blobs in multiples of 128 KiB, meant to be stored across multiple fragments
    constexpr size_t fragment_unit = 128 * 1024;
    for (int i = 0; i < 10; i++) {
        for (int frag_count = 1; frag_count <= 10; frag_count++) {
            test_data.emplace_back(random_bytes(fragment_unit * frag_count));
        }
    }

    byte_comparable_test(std::move(test_data));
}
|
|
|
|
static std::vector<data_value> generate_string_test_data(
|
|
std::function<data_value(std::string&&)> create_data_value_func) {
|
|
const std::string charset = "0123456789"
|
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
"abcdefghijklmnopqrstuvwxyz";
|
|
|
|
auto random_text = [&charset] (size_t length) {
|
|
std::string generated_text;
|
|
generated_text.reserve(length);
|
|
for (size_t i = 0; i < length; ++i) {
|
|
generated_text += charset[tests::random::get_int<size_t>(0, charset.size() - 1)];
|
|
}
|
|
return generated_text;
|
|
};
|
|
|
|
std::vector<data_value> test_data;
|
|
test_data.reserve(500);
|
|
for (int i = 0; i < 100; i++) {
|
|
for (int length : {1, 10, 100, 1000}) {
|
|
test_data.push_back(create_data_value_func(random_text(length)));
|
|
}
|
|
}
|
|
|
|
// test a few cases that are stored across multiple fragments
|
|
for (int i = 0; i < 10; i++) {
|
|
for (int frag_count = 1; frag_count <= 10; frag_count++) {
|
|
const size_t length = 128 * 1024 * frag_count;
|
|
test_data.push_back(create_data_value_func(random_text(length)));
|
|
}
|
|
}
|
|
|
|
return test_data;
|
|
}
|
|
|
|
// Runs the byte-comparable checks on random strings wrapped in ascii_native_type.
BOOST_AUTO_TEST_CASE(test_ascii) {
    const auto to_ascii_value = [] (std::string&& str) {
        return data_value(ascii_native_type(str));
    };
    byte_comparable_test(generate_string_test_data(to_ascii_value));
}
|
|
|
|
// Runs the byte-comparable checks on data_values built directly from random strings.
BOOST_AUTO_TEST_CASE(test_text) {
    const auto to_text_value = [] (std::string&& str) {
        return data_value(str);
    };
    byte_comparable_test(generate_string_test_data(to_text_value));
}
|
|
|
|
// Runs the byte-comparable checks on 1000 random cql_duration values with
// months in [0, 12], days in [0, 28] and nanoseconds in [0, nanoseconds in a day].
BOOST_AUTO_TEST_CASE(test_duration) {
    constexpr int64_t max_ns_in_a_day = 24L * 60 * 60 * 1000 * 1000 * 1000;
    constexpr int duration_count = 1000;

    std::vector<data_value> durations;
    durations.reserve(duration_count);
    for (int i = 0; i < duration_count; i++) {
        const auto random_months = tests::random::get_int<int32_t>(0, 12);
        const auto random_days = tests::random::get_int<int32_t>(0, 28);
        const auto random_ns = tests::random::get_int<int64_t>(0, max_ns_in_a_day);
        durations.emplace_back(cql_duration(
                months_counter{random_months}, days_counter{random_days}, nanoseconds_counter{random_ns}));
    }

    byte_comparable_test(std::move(durations));
}
|
|
|
|
// Runs the byte-comparable checks on inet values: IPv4 addresses built from the uint32_t
// generator plus a few hand-picked IPv4 and IPv6 addresses.
BOOST_AUTO_TEST_CASE(test_inet) {
    const auto to_ipv4_value = [] (uint32_t value) {
        return data_value(seastar::net::ipv4_address(value));
    };
    auto test_data = generate_integer_test_data<uint32_t>(to_ipv4_value);

    // a few more addresses, given in textual form
    const std::string extra_addresses[] = {
        // IPv4
        "127.0.0.1",
        "10.0.0.1",
        "172.16.1.1",
        "192.168.2.2",
        "224.3.3.3",
        // IPv6
        "0000:0000:0000:0000:0000:0000:0000:0000",
        "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
        "fe80:1:23:456:7890:1:23:456",
    };
    for (const std::string& addr : extra_addresses) {
        test_data.emplace_back(seastar::net::inet_address(addr));
    }

    byte_comparable_test(std::move(test_data));
}
|
|
|
|
// Returns a data_value wrapping the result of utils::make_random_uuid().
static data_value make_random_data_value_uuid() {
    auto random_uuid = utils::make_random_uuid();
    return data_value(random_uuid);
}
|
|
// Returns a data_value holding random bytes; the length is picked at random from [1, 128 KiB].
static data_value make_random_data_value_bytes() {
    constexpr size_t max_bytes_size = 128 * 1024; // 128 KB
    const auto length = tests::random::get_int<size_t>(1, max_bytes_size);
    return data_value(tests::random::get_bytes(length));
}
|
|
|
|
extern void encode_component(const abstract_type& type, managed_bytes_view serialized_bytes_view, bytes_ostream& out);
|
|
extern void decode_component(const abstract_type& type, managed_bytes_view& comparable_bytes_view, bytes_ostream& out);
|
|
BOOST_AUTO_TEST_CASE(test_encode_decode_component) {
|
|
// Verify encode and decode works
|
|
bytes_ostream out;
|
|
constexpr uint8_t NEXT_COMPONENT = 0x40;
|
|
for (const auto& test_value : {
|
|
make_random_data_value_uuid(), // data type with fixed length
|
|
make_random_data_value_bytes(), // data type with variable length
|
|
}) {
|
|
const auto& type = *test_value.type();
|
|
out.clear();
|
|
auto serialized_bytes = test_value.serialize_nonnull();
|
|
encode_component(type, managed_bytes_view(serialized_bytes), out);
|
|
auto comparable_bytes = std::move(out).to_managed_bytes();
|
|
auto comparable_bytes_view = managed_bytes_view(comparable_bytes);
|
|
// encoded component should begin with a NEXT_COMPONENT marker
|
|
BOOST_REQUIRE_EQUAL(read_simple_native<uint8_t>(comparable_bytes_view), NEXT_COMPONENT);
|
|
out.clear();
|
|
decode_component(type, comparable_bytes_view, out);
|
|
auto decoded_bytes = std::move(out).to_managed_bytes();
|
|
auto decoded_bytes_view = managed_bytes_view(decoded_bytes);
|
|
// decoded bytes should match the serialized form
|
|
BOOST_REQUIRE_EQUAL(read_simple<int32_t>(decoded_bytes_view), test_value.serialized_size());
|
|
BOOST_REQUIRE(decoded_bytes_view == managed_bytes_view(serialized_bytes));
|
|
}
|
|
}
|
|
|
|
// Generates a vector of vectors of data_value, where each inner vector represents a collection of data_values.
|
|
template<size_t collection_size = 0>
|
|
static auto generate_collection_test_data(const std::function<data_value()>& create_data_value) {
|
|
constexpr size_t test_data_size = 500, max_collection_size = 25;
|
|
std::vector<std::vector<data_value>> test_data;
|
|
test_data.reserve(test_data_size + 21);
|
|
for (size_t i = 0; i < test_data_size; i++) {
|
|
// Generate a single collection and add it to test data
|
|
std::vector<data_value> collection;
|
|
if constexpr (collection_size == 0) {
|
|
collection.reserve(tests::random::get_int<size_t>(1, max_collection_size));
|
|
} else {
|
|
collection.reserve(collection_size);
|
|
}
|
|
for (size_t j = 0; j < collection.capacity(); j++) {
|
|
collection.push_back(create_data_value());
|
|
}
|
|
test_data.push_back(std::move(collection));
|
|
}
|
|
|
|
// Include few duplicates in the test data with variations
|
|
for (int i = 0; i < 10; i++) {
|
|
test_data.emplace_back(test_data.at(tests::random::get_int<size_t>(test_data_size - 1)));
|
|
// include a partial duplicate
|
|
auto test_item = test_data.at(tests::random::get_int<size_t>(test_data_size - 1));
|
|
test_data.emplace_back(test_item.begin(), test_item.begin() + tests::random::get_int<size_t>(1, test_item.size()));
|
|
if constexpr (collection_size != 0) {
|
|
// For fixed-size collections, the partial duplicate must be padded with random data to meet the required size.
|
|
auto& partial_duplicate = test_data.back();
|
|
while (partial_duplicate.size() < collection_size) {
|
|
partial_duplicate.push_back(create_data_value());
|
|
}
|
|
}
|
|
}
|
|
|
|
if constexpr (collection_size == 0) {
|
|
// Add an empty collection to the test data
|
|
test_data.push_back({});
|
|
}
|
|
|
|
return test_data;
|
|
}
|
|
|
|
// Common test method for lists and sets. Note that a set is expected to be sorted and unique,
|
|
// but it doesn't matter during tests, as both lists and sets internally use the same underlying
|
|
// implementation based on std::vectors.
|
|
static void test_set_or_list(const std::function<data_type(data_type, bool)>& get_collection_type,
|
|
const std::function<data_value(data_type, std::vector<data_value>)>& make_collection_value) {
|
|
// Generate vector of collections for each underlying type, with and without
|
|
// multi-cell enabled and run the tests on them.
|
|
auto do_test = [&] (const data_type& underlying_type, std::vector<std::vector<data_value>>&& test_data) {
|
|
for (bool is_multi_cell : {false, true}) {
|
|
std::vector<data_value> collection_test_data;
|
|
collection_test_data.reserve(test_data.size());
|
|
auto collection_type = get_collection_type(underlying_type, is_multi_cell);
|
|
for (const auto& data : test_data) {
|
|
collection_test_data.emplace_back(make_collection_value(collection_type, data));
|
|
}
|
|
|
|
byte_comparable_test(std::move(collection_test_data));
|
|
}
|
|
};
|
|
|
|
// Test the collection with a data type that has fixed length : UUID (128 bits)
|
|
do_test(uuid_type, generate_collection_test_data(make_random_data_value_uuid));
|
|
// Test the collection with a data type that has variable length : bytes
|
|
do_test(bytes_type, generate_collection_test_data(make_random_data_value_bytes));
|
|
}
|
|
|
|
// Runs the shared list/set checks with the set type and set value factories.
BOOST_AUTO_TEST_CASE(test_set) {
    const std::function<data_type(data_type, bool)> get_set_type = set_type_impl::get_instance;
    const std::function<data_value(data_type, std::vector<data_value>)> make_value = make_set_value;
    test_set_or_list(get_set_type, make_value);
}
|
|
|
|
// Runs the shared list/set checks with the list type and list value factories.
BOOST_AUTO_TEST_CASE(test_list) {
    const std::function<data_type(data_type, bool)> get_list_type = list_type_impl::get_instance;
    const std::function<data_value(data_type, std::vector<data_value>)> make_value = make_list_value;
    test_set_or_list(get_list_type, make_value);
}
|
|
|
|
// Runs the byte-comparable checks on maps with UUID keys and bytes values,
// once without and once with multi-cell enabled.
BOOST_AUTO_TEST_CASE(test_map) {
    constexpr size_t test_data_size = 500, max_entries_per_map = 25;
    std::vector<map_type_impl::native_type> raw_maps;
    // generated maps + 2 duplicates per round for 10 rounds + 1 empty map
    raw_maps.reserve(test_data_size + 21);

    // random maps with 1 to max_entries_per_map entries
    for (size_t i = 0; i < test_data_size; i++) {
        const size_t num_entries = tests::random::get_int<size_t>(1, max_entries_per_map);
        map_type_impl::native_type entries;
        for (size_t j = 0; j < num_entries; j++) {
            // a random UUID key with a random bytes value
            entries.emplace_back(make_random_data_value_uuid(), make_random_data_value_bytes());
        }
        raw_maps.push_back(std::move(entries));
    }

    // Include duplicates with some variants
    for (int i = 0; i < 10; i++) {
        // an exact copy of one of the generated maps
        auto picked = raw_maps.at(tests::random::get_int<size_t>(test_data_size - 1));
        raw_maps.emplace_back(picked);

        // the same keys mapped to fresh random values
        map_type_impl::native_type same_keys_new_values;
        for (const auto& [key, value] : picked) {
            same_keys_new_values.emplace_back(key, make_random_data_value_bytes());
        }
        raw_maps.emplace_back(std::move(same_keys_new_values));
    }

    // an empty map
    raw_maps.emplace_back();

    for (bool is_multi_cell : {false, true}) {
        const auto map_type = map_type_impl::get_instance(uuid_type, bytes_type, is_multi_cell);

        std::vector<data_value> map_values;
        map_values.reserve(raw_maps.size());
        for (const auto& raw_map : raw_maps) {
            map_values.emplace_back(make_map_value(map_type, raw_map));
        }

        byte_comparable_test(std::move(map_values));
    }
}
|
|
|
|
// Runs the byte-comparable checks on tuples of (uuid, bytes), including tuples that share
// components with other tuples and tuples with null components.
BOOST_AUTO_TEST_CASE(test_tuple) {
    constexpr int test_data_size = 1000;
    const auto test_tuple_type = tuple_type_impl::get_instance({uuid_type, bytes_type});

    std::vector<data_value> tuples;
    // random tuples + 3 variations per round for 10 rounds + 3 tuples with nulls
    tuples.reserve(test_data_size + 30 + 3);

    // fully random tuples
    for (int i = 0; i < test_data_size; i++) {
        tuples.emplace_back(make_tuple_value(test_tuple_type, {make_random_data_value_uuid(), make_random_data_value_bytes()}));
    }

    // variations of existing tuples : same first component, same second component, exact copy
    for (int i = 0; i < 10; i++) {
        const auto picked_index = tests::random::get_int<size_t>(test_data_size - 1);
        auto picked = value_cast<tuple_type_impl::native_type>(tuples.at(picked_index));
        tuples.emplace_back(make_tuple_value(test_tuple_type, {picked.at(0), make_random_data_value_bytes()}));
        tuples.emplace_back(make_tuple_value(test_tuple_type, {make_random_data_value_uuid(), picked.at(1)}));
        tuples.emplace_back(make_tuple_value(test_tuple_type, {picked.at(0), picked.at(1)}));
    }

    // tuples with a null second, a null first, and both components null
    tuples.emplace_back(make_tuple_value(test_tuple_type, {make_random_data_value_uuid(), data_value::make_null(bytes_type)}));
    tuples.emplace_back(make_tuple_value(test_tuple_type, {data_value::make_null(uuid_type), make_random_data_value_bytes()}));
    tuples.emplace_back(make_tuple_value(test_tuple_type, {data_value::make_null(uuid_type), data_value::make_null(bytes_type)}));

    byte_comparable_test(std::move(tuples));
}
|
|
|
|
BOOST_AUTO_TEST_CASE(test_udt) {
    // Exercise the byte-comparable encoding of a UDT with three fields: uuid, bytes and int64_t.
    using udt_fields = user_type_impl::native_type;
    constexpr int random_udt_count = 1000;
    constexpr int duplicate_rounds = 10;

    std::vector<udt_fields> field_sets;
    // Headroom for the 7 variations added per duplicate round and the 7 entries with nulls.
    field_sets.reserve(random_udt_count + 100);

    auto random_int64_value = [] () {
        return data_value(tests::random::get_int<int64_t>(std::numeric_limits<int64_t>::min(), std::numeric_limits<int64_t>::max()));
    };

    // Fully random field sets.
    for (int n = 0; n != random_udt_count; ++n) {
        field_sets.emplace_back(udt_fields{
                make_random_data_value_uuid(), make_random_data_value_bytes(), random_int64_value()});
    }

    // Duplicates with variations: keep two fields, keep one field, or keep all three
    // fields of a field set picked from the data generated so far.
    for (int round = 0; round != duplicate_rounds; ++round) {
        const auto picked_index = tests::random::get_int<size_t>(random_udt_count - 1);
        // Copy the picked entry before appending further entries.
        auto picked = field_sets.at(picked_index);
        field_sets.emplace_back(udt_fields{picked.at(0), picked.at(1), random_int64_value()});
        field_sets.emplace_back(udt_fields{picked.at(0), make_random_data_value_bytes(), picked.at(2)});
        field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), picked.at(1), picked.at(2)});
        field_sets.emplace_back(udt_fields{picked.at(0), make_random_data_value_bytes(), random_int64_value()});
        field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), picked.at(1), random_int64_value()});
        field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), make_random_data_value_bytes(), picked.at(2)});
        field_sets.emplace_back(picked);
    }

    // UDT values with nulls: every combination of one, two and three null fields.
    field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), make_random_data_value_bytes(), data_value::make_null(long_type)});
    field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), data_value::make_null(bytes_type), random_int64_value()});
    field_sets.emplace_back(udt_fields{data_value::make_null(uuid_type), make_random_data_value_bytes(), random_int64_value()});
    field_sets.emplace_back(udt_fields{make_random_data_value_uuid(), data_value::make_null(bytes_type), data_value::make_null(long_type)});
    field_sets.emplace_back(udt_fields{data_value::make_null(uuid_type), make_random_data_value_bytes(), data_value::make_null(long_type)});
    field_sets.emplace_back(udt_fields{data_value::make_null(uuid_type), data_value::make_null(bytes_type), random_int64_value()});
    field_sets.emplace_back(udt_fields{data_value::make_null(uuid_type), data_value::make_null(bytes_type), data_value::make_null(long_type)});

    // Run the same data through both variants of the type: is_multi_cell false, then true.
    for (auto is_multi_cell : {false, true}) {
        const auto udt_type = user_type_impl::get_instance("ks_test", "cb_test_udt",
                std::vector<bytes>{"field1", "field2", "field3"},
                std::vector<data_type>{uuid_type, bytes_type, long_type}, is_multi_cell);

        std::vector<data_value> udt_values;
        udt_values.reserve(field_sets.size());
        for (const auto& fields : field_sets) {
            udt_values.emplace_back(make_user_value(udt_type, fields));
        }

        byte_comparable_test(std::move(udt_values));
    }
}
|
|
|
|
BOOST_AUTO_TEST_CASE(test_vector) {
    // Wraps every entry of `rows` into a vector value whose elements are of `element_type`
    // and runs the byte-comparable checks on the result.
    // The vector dimension is taken from the size of the first entry.
    auto run_vector_test = [&] (const data_type& element_type, std::vector<std::vector<data_value>>&& rows) {
        std::vector<data_value> vector_values;
        vector_values.reserve(rows.size());

        const auto dimension = rows.at(0).size();
        auto vector_type = vector_type_impl::get_instance(element_type, dimension);
        for (const auto& row : rows) {
            vector_values.emplace_back(make_vector_value(vector_type, row));
        }

        byte_comparable_test(std::move(vector_values));
    };

    // Element type with a fixed length : UUID (128 bits)
    run_vector_test(uuid_type, generate_collection_test_data<128>(make_random_data_value_uuid));
    // Element type with a variable length : bytes
    run_vector_test(bytes_type, generate_collection_test_data<16>(make_random_data_value_bytes));
}
|
|
|
|
BOOST_AUTO_TEST_CASE(test_reversed) {
    // Native types: 64-bit integers and strings, with `true` passed as the second
    // argument of byte_comparable_test.
    byte_comparable_test(generate_integer_test_data<int64_t>(), true);

    auto string_values = generate_string_test_data([] (std::string&& str) {
        return data_value(str);
    });
    byte_comparable_test(std::move(string_values), true);

    // A collection: lists of bytes, created with `false` as the second argument of get_instance.
    const auto bytes_list_type = list_type_impl::get_instance(bytes_type, false);
    std::vector<data_value> list_values;
    // NOTE(review): 510 presumably matches the number of cases generate_collection_test_data yields - confirm.
    list_values.reserve(510);
    for (const auto& elements : generate_collection_test_data(make_random_data_value_bytes)) {
        list_values.emplace_back(make_list_value(bytes_list_type, elements));
    }
    byte_comparable_test(std::move(list_values), true);
}
|
|
|
|
BOOST_AUTO_TEST_CASE(test_empty) {
    // A value of the empty type must encode to zero comparable bytes and
    // decode back to a value equal to the original.
    auto empty_value = data_value(empty_type_representation{});
    auto encoded = comparable_bytes::from_data_value(empty_value);

    BOOST_REQUIRE(encoded->size() == 0);

    auto decoded = encoded->to_data_value(empty_type);
    BOOST_REQUIRE(empty_value == decoded);
}
|
|
|
|
// Test Scylla's byte-comparable encoding compatibility with Cassandra's implementation by
|
|
// verifying that serialized values produce the same comparable bytes as those generated by Cassandra.
|
|
// The test data was generated using the cassandra unit test pushed to the following branch:
|
|
// https://github.com/scylladb/scylla-dev/blob/byte-comparable-compatibility-generator
|
|
SEASTAR_TEST_CASE(test_compatibility) {
|
|
return sstables::test_env::do_with_async([] (sstables::test_env&) {
|
|
auto file = open_file_dma("test/resource/byte_comparable_compatibility_data.csv", open_flags::ro).get();
|
|
auto fs = make_file_input_stream(file);
|
|
temporary_buffer<char> buf = fs.read().get();
|
|
// Read file contents in a loop and handle them line by line.
|
|
data_type type;
|
|
std::string input_buffer;
|
|
while (!buf.empty()) {
|
|
input_buffer.append(buf.get(), buf.size());
|
|
size_t pos = 0;
|
|
while (pos != input_buffer.size()) {
|
|
// Extract the CSV entry from the next line
|
|
size_t end = input_buffer.find('\n', pos);
|
|
if (end == std::string::npos) {
|
|
// no \n in the input, need to read more data from the file
|
|
break;
|
|
}
|
|
std::string curr_line = input_buffer.substr(pos, end - pos);
|
|
pos = end + 1;
|
|
|
|
// Test data has `type` followed by the test data in subsequent lines.
|
|
// Extract them from curr_line.
|
|
if (curr_line.starts_with("org.apache.cassandra.db.marshal")) {
|
|
// This is the type line, parse it and continue to the next line.
|
|
type = db::marshal::type_parser::parse(std::string_view(curr_line));
|
|
testlog.info("testing compatibility of type: {}",
|
|
type->is_reversed() ? format("reversed<{}>", type->cql3_type_name()) : type->cql3_type_name());
|
|
continue;
|
|
}
|
|
|
|
// This line has the test data for the type.
|
|
// Test data has two columns: actual value and comparable bytes encoded by cassandra
|
|
const auto comma_pos = curr_line.rfind(',');
|
|
BOOST_REQUIRE_MESSAGE(comma_pos != std::string::npos, "invalid CSV entry");
|
|
const auto actual_value = curr_line.substr(0, comma_pos);
|
|
const auto origin_encoded_cb = comparable_bytes(managed_bytes(bytes_type->from_string(curr_line.substr(comma_pos + 1))));
|
|
|
|
bytes serialized_bytes;
|
|
if (type->is_native()) {
|
|
serialized_bytes = type->from_string(actual_value);
|
|
} else {
|
|
// Workaround for composite types as abstract_type::from_string() doesn't support them.
|
|
serialized_bytes = from_json_object(*type, rjson::parse(actual_value));
|
|
}
|
|
|
|
// Verify encoding
|
|
comparable_bytes scylla_encoded_cb(*type, managed_bytes_view(serialized_bytes));
|
|
BOOST_REQUIRE_MESSAGE(scylla_encoded_cb == origin_encoded_cb, seastar::value_of([&] () {
|
|
return fmt::format("encoding failed for value : {}", actual_value);
|
|
}));
|
|
|
|
// Verify decoding
|
|
BOOST_REQUIRE_MESSAGE(origin_encoded_cb.to_data_value(type) == type->deserialize(serialized_bytes), seastar::value_of([&] () {
|
|
return fmt::format("decoding failed for value : {}", actual_value);
|
|
}));
|
|
}
|
|
|
|
// Remove the lines that were processed from the input buffer.
|
|
input_buffer.erase(0, pos);
|
|
|
|
buf = fs.read().get();
|
|
}
|
|
file.close().get();
|
|
});
|
|
}
|