/* * Copyright (c) 2018, Arm Limited and affiliates. All rights reserved. */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 */ #define BOOST_TEST_MODULE core #include #include #include #include #include "utils/utf8.hh" #include "utils/fragmented_temporary_buffer.hh" struct test_str { const void *data; size_t len; size_t bad_pos = 0; }; // Positive strings static const std::vector positive = { {"", 0}, {"\x00", 1}, {"\x66", 1}, {"\x7F", 1}, {"\x00\x7F", 2}, {"\x7F\x00", 2}, {"\xC2\x80", 2}, {"\xDF\xBF", 2}, {"\xE0\xA0\x80", 3}, {"\xE0\xA0\xBF", 3}, {"\xED\x9F\x80", 3}, {"\xEF\x80\xBF", 3}, {"\xF0\x90\xBF\x80", 4}, {"\xF2\x81\xBE\x99", 4}, {"\xF4\x8F\x88\xAA", 4}, }; // Negative strings static const std::vector negative = { {"\x80", 1}, {"\xBF", 1}, {"\xC0\x80", 2}, {"\xC1\x00", 2}, {"\xC2\x7F", 2}, {"\xDF\xC0", 2}, {"\xE0\x9F\x80", 3}, {"\xE0\xC2\x80", 3}, {"\xED\xA0\x80", 3}, {"\xED\x7F\x80", 3}, {"\xEF\x80\x00", 3}, {"\xF0\x8F\x80\x80", 4}, {"\xF0\xEE\x80\x80", 4}, {"\xF2\x90\x91\x7F", 4}, {"\xF4\x90\x88\xAA", 4}, {"\xF4\x00\xBF\xBF", 4}, {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \ "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 32, 15}, {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00", 16, 5}, {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80", 32, 30}, {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1", 32, 31}, {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \ "\x80", 33, 30}, {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \ "\xC2\x80", 34, 30}, {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \ "\x80\x80\x80", 35, 31}, }; // Round concatenate positive test strings to 1024 bytes static void prepare_test_buf(uint8_t *buf, size_t pos_idx) { int buf_idx = 0; while (buf_idx < 1024) { size_t buf_len = 1024 - buf_idx; if (buf_len >= positive[pos_idx].len) { memcpy(buf+buf_idx, positive[pos_idx].data, positive[pos_idx].len); buf_idx += positive[pos_idx].len; } else { // Fill remaining buffer with 0 memset(buf+buf_idx, 0, buf_len); buf_idx += buf_len; } if (++pos_idx == positive.size()) { pos_idx = 0; } } } BOOST_AUTO_TEST_CASE(test_utf8_positive) { // Test single positive string for (auto &test : positive) { BOOST_CHECK(utils::utf8::validate((const uint8_t*)test.data, test.len)); } const int max_size = 1024 + 32; uint64_t buf64[max_size/8 + 2]; // Unalign buffer address: offset 8 bytes boundary by 1 byte uint8_t *buf = (reinterpret_cast(buf64)) + 1; // Test concatenated and shifted positive strings to cover 1k length for (size_t i = 0; i < positive.size(); ++i) { // Round concatenate strings staring from i-th positive string size_t buf_len = 1024; prepare_test_buf(buf, i); // Shift 16 bytes, validate each shift for (int j = 0; j < 16; ++j) { BOOST_CHECK(utils::utf8::validate(buf, buf_len)); for (int k = buf_len; k >= 1; --k) buf[k] = buf[k-1]; buf[0] = '\x55'; ++buf_len; } } } BOOST_AUTO_TEST_CASE(test_utf8_negative) { // Test single negative string for (auto &test : negative) { BOOST_CHECK(!utils::utf8::validate((const uint8_t*)test.data, test.len)); } // Must be larger than 1024 + 16 + max(negative string length) uint8_t buf[1024*2]; for (size_t i = 0; i < negative.size(); ++i) { prepare_test_buf(buf, i % positive.size()); // Append one error string memcpy(buf+1024, negative[i].data, negative[i].len); size_t buf_len = 1024 + negative[i].len; // Shift 16 bytes, validate each shift for (int j = 0; j < 16; ++j) { BOOST_CHECK(!utils::utf8::validate(buf, buf_len)); for (int k = buf_len; k >= 1; --k) buf[k] = buf[k-1]; buf[0] = '\x66'; ++buf_len; } } } BOOST_AUTO_TEST_CASE(test_utf8_position) { auto test_string = [](const char* str, std::optional expected) { BOOST_CHECK(utils::utf8::validate_with_error_position(reinterpret_cast(str), strlen(str)) == expected); }; test_string("valid string", std::nullopt); test_string("ab\xc3\x28 xx", 2); test_string("abc\xe2\x82\x28", 3); test_string("abcd\xf0\x28\x8c\x28", 4); test_string("abcd\xc3\x28", 4); } BOOST_AUTO_TEST_CASE(test_utf8_fragmented) { auto random_engine = std::default_random_engine(std::random_device()()); std::vector tmp; tmp.reserve(20000); // avoid reallocations std::optional bad_pos; for (unsigned i = 0; i < 100000; ++i) { auto nr_positive_begin = std::uniform_int_distribution(0, 30)(random_engine); auto nr_negative = std::uniform_int_distribution(0, 1)(random_engine); auto nr_positive_end = std::uniform_int_distribution(0, 20)(random_engine); auto nr_frags = std::uniform_int_distribution(1, 4)(random_engine); tmp.clear(); bad_pos.reset(); auto random_test_str = [&] (const std::vector& test_set) -> test_str { auto idx = std::uniform_int_distribution(0, test_set.size() - 1)(random_engine); return test_set[idx]; }; auto add_test_str = [&] (test_str t) { auto data = reinterpret_cast(t.data); tmp.insert(tmp.end(), data, data + t.len); }; auto add_negative_test_str = [&] (test_str t) { if (!bad_pos) { bad_pos = tmp.size() + t.bad_pos; } add_test_str(t); }; auto fragmentize = [&] (int nr_frags) -> fragmented_temporary_buffer { std::vector> vec; std::vector breakpoints; vec.reserve(nr_frags); breakpoints.reserve(nr_frags + 1); breakpoints.push_back(0); for (int i = 0; i < nr_frags - 1; ++i) { breakpoints.push_back(std::uniform_int_distribution(0, tmp.size())(random_engine)); } breakpoints.push_back(tmp.size()); std::sort(breakpoints.begin(), breakpoints.end()); auto data = reinterpret_cast(tmp.data()); for (int i = 0; i < nr_frags; ++i) { vec.push_back(temporary_buffer(data + breakpoints[i], breakpoints[i+1] - breakpoints[i])); } return fragmented_temporary_buffer(std::move(vec), tmp.size()); }; for (int j = 0; j != nr_positive_begin; ++j) { add_test_str(random_test_str(positive)); } for (int j = 0; j != nr_negative; ++j) { add_negative_test_str(random_test_str(negative)); } for (int j = 0; j != nr_positive_end; ++j) { add_test_str(random_test_str(positive)); } auto frag_buf = fragmentize(nr_frags); auto result = utils::utf8::validate_with_error_position_fragmented(fragmented_temporary_buffer::view(frag_buf)); BOOST_REQUIRE(result == bad_pos); } }