Files
scylladb/utils/utf8.hh
Piotr Grabowski ffd8c8c505 utf8: Print invalid UTF-8 character position
Add a new validate_with_error_position function
which returns std::nullopt if data is a valid UTF-8
string, or otherwise the byte position of the first
invalid character. The position is added to the exception
messages of all UTF-8 parsing errors in Scylla.

validate_with_error_position is done in two
passes in order to preserve the same performance
in the common case when the string is valid.
2020-09-07 18:11:21 +03:00

57 lines
1.7 KiB
C++

/*
* Leverage SIMD for fast UTF-8 validation with range base algorithm.
* Details at https://github.com/cyb70289/utf8/.
*
* Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <optional>
#include "bytes.hh"
namespace utils {
namespace utf8 {
// Checks whether the `len` bytes starting at `data` form a valid UTF-8 string.
// Only the declaration lives in this header; presumably returns true when the
// input is valid UTF-8 (TODO confirm against the definition).
bool validate(const uint8_t *data, size_t len);
// Convenience overload for bytes_view: forwards the view's underlying
// buffer and size to validate(const uint8_t*, size_t).
inline bool validate(bytes_view string) {
    return validate(reinterpret_cast<const uint8_t*>(string.data()), string.size());
}
// Validates the `len` bytes starting at `data` as UTF-8.
// If data represents a correct UTF-8 string, returns std::nullopt,
// otherwise returns the position of the first erroneous byte.
// Only the declaration lives in this header.
std::optional<size_t> validate_with_error_position(const uint8_t *data, size_t len);
// Convenience overload for bytes_view: reinterprets the view's storage as
// unsigned bytes and delegates to the pointer/length overload, passing its
// result (std::nullopt or an error position) straight through.
inline std::optional<size_t> validate_with_error_position(bytes_view string) {
    const auto* first_byte = reinterpret_cast<const uint8_t*>(string.data());
    return validate_with_error_position(first_byte, string.size());
}
} // namespace utf8
} // namespace utils