mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-22 07:42:16 +00:00
value_to_json() converts CQL values to JSON for vector search filters. For decimal and varint types, it used rjson::parse() on the JSON string, which parses through a double and silently loses precision for values exceeding ~15 significant digits — producing wrong filter results. Additionally, for decimal type we need an exact string representation that preserves the original (unscaled, scale) pair, because partition keys use byte-level identity: different serialized representations of the same numeric value are distinct rows, so the filter must reproduce the exact representation stored in the key. Add big_decimal::to_string_canonical() which follows the Java BigDecimal toString() spec (JDK 8+), producing a bijective string representation that uses exponential notation for extreme scales instead of expanding trailing zeros (which could cause OOM). This could replace to_string(), but doing so has wider consequences (e.g. hash/equality contract for decimal_type) described in SCYLLADB-1574. Use it in value_to_json() for decimal_type, and use rjson::from_string() for varint_type, both bypassing the lossy double parse path. Tests cover the new to_string_canonical() and the filter fix, as well as existing decimal type behavior (key representation, clustering order, toJson) that we rely on and must not break. The CQL decimal type tests (test_type_decimal.py) also pass against Cassandra. Fixes: https://scylladb.atlassian.net/browse/SCYLLADB-1583 Refs: https://scylladb.atlassian.net/browse/SCYLLADB-1574 Closes scylladb/scylladb#29505
386 lines
14 KiB
C++
386 lines
14 KiB
C++
/*
|
|
* Copyright (C) 2015-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#include "utils/assert.hh"
|
|
#include "big_decimal.hh"
|
|
#include <cassert>
|
|
#include "marshal_exception.hh"
|
|
#include <seastar/core/format.hh>
|
|
|
|
#ifndef __clang__

// Non-clang compilers can hand a string_view straight to cpp_int's constructor.
using string_view_workaround = std::string_view;

#else

// With clang, overload resolution through the enable_if maze of cpp_int's
// constructors goes wrong (clang or boost is at fault): the string_view ends
// up being treated as binary, so "0" is read as 48.
//
// Converting to std::string first works around the problem.
using string_view_workaround = std::string;

#endif
|
|
|
|
// Returns the low 64 bits of `varint` as an unsigned integer.
//
// CQL expects out-of-range values to wrap around, whereas cpp_int's own
// conversion functions return the largest or smallest value the target type
// can represent. To get wrapping semantics the value is first masked down to
// its low 64 bits and only then converted to uint64_t; the caller can then let
// C++ narrow the result (with possible overflow) to the final target type.
uint64_t from_varint_to_integer(const utils::multiprecision_int& varint) {
    constexpr uint64_t low_64_bits_mask = ~uint64_t{0};
    const auto masked = low_64_bits_mask & boost::multiprecision::cpp_int(varint);
    return static_cast<uint64_t>(masked);
}
|
|
|
|
// Default construction delegates to the (scale, unscaled value) constructor
// with both components set to zero.
big_decimal::big_decimal()
    : big_decimal(0, 0)
{}

// Stores the given scale and takes ownership of the unscaled value.
// The represented number is unscaled_value * 10^(-scale).
big_decimal::big_decimal(int32_t scale, boost::multiprecision::cpp_int unscaled_value)
    : _scale(scale)
    , _unscaled_value(std::move(unscaled_value))
{}
|
|
|
|
// Parses a decimal number from its textual form.
//
// The text is split at the first 'e' or 'E' into a base and an optional
// exponent. Within the base, the characters before and after the first '.'
// are concatenated to form the unscaled value (an optional leading '+' or '-'
// is honoured), and the resulting scale is
//
//     scale = (number of characters after the '.') - exponent
//
// Throws marshal_exception when:
//   * an exponent marker is present but nothing follows it,
//   * the base contains no digits at all,
//   * the base does not start with a digit (after the optional sign),
//   * the digits or the exponent fail to parse,
//   * the resulting scale does not fit into int32_t.
big_decimal::big_decimal(std::string_view text)
{
    const size_t e_pos = text.find_first_of("eE");
    const std::string_view base = text.substr(0, e_pos);
    std::string_view exponent;
    if (e_pos != std::string_view::npos) {
        exponent = text.substr(e_pos + 1);
        if (exponent.empty()) {
            throw marshal_exception(seastar::format("big_decimal - incorrect empty exponent: {}", text));
        }
    }

    // Glue the integer and fraction parts together; the position of the dot
    // is accounted for in the scale below.
    const size_t dot_pos = base.find('.');
    std::string integer_str(base.substr(0, dot_pos));
    std::string_view fraction;
    if (dot_pos != std::string_view::npos) {
        fraction = base.substr(dot_pos + 1);
        integer_str.append(fraction);
    }

    std::string_view integer(integer_str);
    const bool negative = !integer.empty() && integer.front() == '-';
    if (negative || (!integer.empty() && integer.front() == '+')) {
        integer.remove_prefix(1);
    }

    if (integer.empty()) {
        throw marshal_exception(seastar::format("big_decimal - both integer and fraction are empty"));
    }
    // isdigit() has undefined behaviour for negative arguments other than EOF,
    // so the (possibly signed) char must go through unsigned char first.
    if (!::isdigit(static_cast<unsigned char>(integer.front()))) {
        throw marshal_exception(seastar::format("big_decimal - incorrect integer: {}", text));
    }

    // Drop leading zeros, but always keep at least one character.
    integer.remove_prefix(std::min(integer.find_first_not_of("0"), integer.size() - 1));
    try {
        _unscaled_value = boost::multiprecision::cpp_int(string_view_workaround(integer));
    } catch (...) {
        throw marshal_exception(seastar::format("big_decimal - failed to parse integer value: {}", integer));
    }
    if (negative) {
        _unscaled_value *= -1;
    }

    // The exponent is parsed as int64_t so that it can be combined with the
    // fraction size and then range-checked against int32_t.
    int64_t exponent_value = 0;
    if (!exponent.empty()) {
        try {
            exponent_value = boost::lexical_cast<int64_t>(exponent);
        } catch (...) {
            throw marshal_exception(seastar::format("big_decimal - failed to parse exponent: {}", exponent));
        }
    }
    // scale = fraction.size() - exponent_value, evaluated in unsigned
    // arithmetic: negating INT64_MIN as a signed value would be undefined
    // behaviour, whereas here it simply wraps. A wrapped result could only
    // land inside the int32_t range for a fraction of roughly 2^63 characters,
    // so extreme exponents are still rejected by the range check below.
    const int64_t scale = static_cast<int64_t>(
            static_cast<uint64_t>(fraction.size()) - static_cast<uint64_t>(exponent_value));
    if (scale < std::numeric_limits<int32_t>::min() || scale > std::numeric_limits<int32_t>::max()) {
        throw marshal_exception(seastar::format("big_decimal - scale out of range: {}", scale));
    }
    _scale = static_cast<int32_t>(scale);
}
|
|
|
|
// Converts this decimal to an exact rational: the unscaled value multiplied
// by 10^|scale| when the scale is negative, divided by it otherwise.
boost::multiprecision::cpp_rational big_decimal::as_rational() const {
    boost::multiprecision::cpp_rational result = static_cast<const boost::multiprecision::cpp_int&>(_unscaled_value);

    boost::multiprecision::cpp_int ten(10);
    const int32_t scale_magnitude = std::abs(_scale);
    const auto power_of_ten = boost::multiprecision::pow(ten, scale_magnitude);

    if (_scale >= 0) {
        result /= power_of_ten;
    } else {
        result *= power_of_ten;
    }
    return result;
}
|
|
|
|
// Renders the value in plain (non-exponential) decimal notation.
//
//  * Zero is always rendered as "0", whatever the scale.
//  * A negative scale is expanded into that many trailing zeros.
//  * For a positive scale, trailing zeros of the fractional part are dropped,
//    and so is the decimal point if nothing remains after it.
sstring big_decimal::to_string() const
{
    if (!_unscaled_value) {
        return "0";
    }

    const boost::multiprecision::cpp_int magnitude = boost::multiprecision::abs(_unscaled_value);
    std::string text = magnitude.str();

    if (_scale < 0) {
        // Widen before negating so the count is computed without overflow.
        text.append(static_cast<size_t>(-static_cast<int64_t>(_scale)), '0');
    } else if (_scale > 0) {
        const size_t fraction_digits = static_cast<size_t>(_scale);
        if (text.size() > fraction_digits) {
            text.insert(text.size() - fraction_digits, 1, '.');
        } else {
            // Not enough digits for an integer part: pad with zeros after "0.".
            text.insert(0, fraction_digits - text.size(), '0');
            text.insert(0, "0.");
        }

        // A '.' is always present at this point, so the search cannot fail
        // and the erase stops at the decimal point at the latest.
        text.erase(text.find_last_not_of('0') + 1);
        if (text.back() == '.') {
            text.pop_back();
        }
    }

    if (_unscaled_value < 0) {
        text.insert(0, 1, '-');
    }
    return text;
}
|
|
|
|
// Java BigDecimal.toString() spec (JDK 8+):
|
|
// See: https://docs.oracle.com/javase/8/docs/api/java/math/BigDecimal.html#toString--
|
|
// adjusted_exp = -scale + (precision - 1)
|
|
// if scale >= 0 AND adjusted_exp >= -6: plain decimal (no exponent)
|
|
// otherwise: exponential notation with 'E' suffix
|
|
//
|
|
// This guarantees a bijective mapping from (unscaled, scale) to string.
|
|
// Using int64_t for adjusted_exp to avoid overflow when scale is extreme.
|
|
//
|
|
// This could replace to_string(), but doing so has wider consequences
|
|
// (e.g. hash/equality contract for decimal_type) described in SCYLLADB-1574.
|
|
sstring big_decimal::to_string_canonical() const
|
|
{
|
|
boost::multiprecision::cpp_int num = boost::multiprecision::abs(_unscaled_value);
|
|
auto digits = !_unscaled_value ? std::string("0") : num.str(); // decimal digits, no sign
|
|
int64_t precision = static_cast<int64_t>(digits.size());
|
|
int64_t adjusted_exp = -static_cast<int64_t>(_scale) + (precision - 1);
|
|
|
|
std::string result;
|
|
if (_unscaled_value < 0) {
|
|
result += '-';
|
|
}
|
|
|
|
if (_scale >= 0 && adjusted_exp >= -6) {
|
|
// Plain decimal form (no exponent).
|
|
int64_t int_digits = precision - static_cast<int64_t>(_scale);
|
|
if (int_digits > 0) {
|
|
result.append(digits.data(), int_digits);
|
|
if (_scale > 0) {
|
|
result += '.';
|
|
result.append(digits.data() + int_digits, _scale);
|
|
}
|
|
} else {
|
|
result += "0.";
|
|
result.append(-int_digits, '0');
|
|
result.append(digits);
|
|
}
|
|
} else {
|
|
// Exponential notation.
|
|
result += digits[0];
|
|
if (precision > 1) {
|
|
result += '.';
|
|
result.append(digits.data() + 1, precision - 1);
|
|
}
|
|
result += 'E';
|
|
if (adjusted_exp >= 0) {
|
|
result += '+';
|
|
}
|
|
result += std::to_string(adjusted_exp);
|
|
}
|
|
|
|
return sstring(result);
|
|
}
|
|
|
|
// Exact three-way comparison: both unscaled values are brought to the larger
// of the two scales by multiplying with the matching power of ten, and the
// resulting integers are compared.
std::strong_ordering big_decimal::tri_cmp_slow(const big_decimal& other) const
{
    const auto common_scale = std::max(_scale, other._scale);
    boost::multiprecision::cpp_int ten(10);

    const boost::multiprecision::cpp_int lhs =
            _unscaled_value * boost::multiprecision::pow(ten, common_scale - _scale);
    const boost::multiprecision::cpp_int rhs =
            other._unscaled_value * boost::multiprecision::pow(ten, common_scale - other._scale);

    return lhs.compare(rhs) <=> 0;
}
|
|
|
|
// Three-way comparison by numeric value.
//
// Cheap cases are decided first (same scale, different signs, both zero);
// everything else is handed to tri_cmp_positive_nonzero_different_scale(),
// which requires positive operands.
std::strong_ordering big_decimal::operator<=>(const big_decimal& other) const
{
    // Same scale: the unscaled values are directly comparable.
    if (_scale == other._scale) {
        return _unscaled_value.compare(other._unscaled_value) <=> 0;
    }

    // boost::multiprecision::sign() yields -1, 0 or 1.
    const int lhs_sign = boost::multiprecision::sign(_unscaled_value);
    const int rhs_sign = boost::multiprecision::sign(other._unscaled_value);
    if (lhs_sign != rhs_sign) {
        return lhs_sign <=> rhs_sign;
    }

    // From here on both signs are equal, so a zero sign means both numbers are zero.
    if (lhs_sign == 0) {
        return std::strong_ordering::equal;
    }

    if (lhs_sign > 0) {
        return tri_cmp_positive_nonzero_different_scale(other);
    }

    // Both numbers are negative: negate them and compare in reverse order.
    // The negation makes copies, but those are unavoidable anyway because
    // boost::multiprecision::msb() (used by the callee) does not work with
    // negative numbers.
    const auto negated_self = -*this;
    const auto negated_other = -other;
    return negated_other.tri_cmp_positive_nonzero_different_scale(negated_self);
}
|
|
|
|
// Fast comparison for two numbers that the caller guarantees to
//   * have different scales,
//   * be positive,
//   * be non-zero.
//
// With
//
//   number       = _unscaled_value       * 10^(-_scale)
//   number_other = other._unscaled_value * 10^(-other._scale)
//
// the order of magnitude of number / number_other is estimated without any
// big-integer arithmetic:
//
//   1. log2(_unscaled_value / other._unscaled_value) is approximated by the
//      difference of the most significant bit positions. Replacing a value by
//      a power of two with the same MSB is off by less than a factor of 2.
//   2. That base-2 exponent is converted to base 10 using
//      2^x == 10^(x / log2(10)).
//   3. The converted exponent is combined with the scale difference.
//
// If the combined estimate is at least one decimal order of magnitude away
// from zero it decides the result; otherwise the numbers are too close to
// call and the exact (but slow) tri_cmp_slow() is used.
std::strong_ordering big_decimal::tri_cmp_positive_nonzero_different_scale(const big_decimal& other) const
{
    // Step 1: base-2 estimate of the ratio of the unscaled values.
    const int64_t top_bit = boost::multiprecision::msb(_unscaled_value);
    const int64_t top_bit_other = boost::multiprecision::msb(other._unscaled_value);
    const int64_t ratio_log2 = top_bit - top_bit_other;

    // Step 2: the same estimate expressed as a power of ten.
    const static double log2_10 = std::log2(10.0);
    const double ratio_log10 = double(ratio_log2) / log2_10;

    // Step 3: the scales are now directly comparable. Because a scale is an
    // implicitly negated exponent, this quantity has the inverted sign of
    // log10(number / number_other); hence the subtraction.
    const double inverted_magnitude_gap = (double(_scale) - double(other._scale)) - ratio_log10;

    // Confidence window. It has to absorb
    //   * the inaccuracy of log2(10),
    //   * the error of normalizing the unscaled values to the same bit count,
    //     which is on the order of a factor of 2.
    const bool too_close_to_call = -1.0 < inverted_magnitude_gap && inverted_magnitude_gap < 1.0;
    if (too_close_to_call) {
        return tri_cmp_slow(other);
    }

    // Outside the window the magnitude is at least 1; undo the sign inversion
    // described above to obtain the ordering.
    return inverted_magnitude_gap > 0 ? std::strong_ordering::less : std::strong_ordering::greater;
}
|
|
|
|
// Adds `other` to this value in place. When the scales differ, both operands
// are first rescaled to the larger scale, which also becomes the new scale.
big_decimal& big_decimal::operator+=(const big_decimal& other)
{
    if (_scale == other._scale) {
        _unscaled_value += other._unscaled_value;
        return *this;
    }

    const auto common_scale = std::max(_scale, other._scale);
    boost::multiprecision::cpp_int ten(10);
    const boost::multiprecision::cpp_int lhs =
            _unscaled_value * boost::multiprecision::pow(ten, common_scale - _scale);
    const boost::multiprecision::cpp_int rhs =
            other._unscaled_value * boost::multiprecision::pow(ten, common_scale - other._scale);

    _unscaled_value = lhs + rhs;
    _scale = common_scale;
    return *this;
}
|
|
|
|
// Subtracts `other` from this value in place. When the scales differ, both
// operands are first rescaled to the larger scale, which also becomes the new
// scale.
big_decimal& big_decimal::operator-=(const big_decimal& other) {
    if (_scale == other._scale) {
        _unscaled_value -= other._unscaled_value;
        return *this;
    }

    const auto common_scale = std::max(_scale, other._scale);
    boost::multiprecision::cpp_int ten(10);
    const boost::multiprecision::cpp_int minuend =
            _unscaled_value * boost::multiprecision::pow(ten, common_scale - _scale);
    const boost::multiprecision::cpp_int subtrahend =
            other._unscaled_value * boost::multiprecision::pow(ten, common_scale - other._scale);

    _unscaled_value = minuend - subtrahend;
    _scale = common_scale;
    return *this;
}
|
|
|
|
// Returns the sum of this value and `other`; implemented on top of operator+=.
big_decimal big_decimal::operator+(const big_decimal& other) const {
    big_decimal sum = *this;
    sum += other;
    return sum;
}
|
|
|
|
// Returns this value minus `other`; implemented on top of operator-=.
big_decimal big_decimal::operator-(const big_decimal& other) const {
    big_decimal difference = *this;
    difference -= other;
    return difference;
}
|
|
|
|
// Unary minus: same scale, negated unscaled value.
big_decimal big_decimal::operator-() const {
    return big_decimal(_scale, -_unscaled_value);
}
|
|
|
|
// Divides the unscaled value by `y`, rounding half to even (banker's
// rounding), and returns the result with the scale of this value unchanged.
//
// Only rounding_mode::HALF_EVEN is implemented; any other mode trips
// SCYLLA_ASSERT.
//
// NOTE(review): y == 0 is not checked here; whatever cpp_int does on division
// by zero propagates to the caller.
big_decimal big_decimal::div(const ::uint64_t y, const rounding_mode mode) const
{
    if (mode != rounding_mode::HALF_EVEN) {
        SCYLLA_ASSERT(0);
    }

    // Work on the magnitude and re-apply the sign at the end.
    const boost::multiprecision::cpp_int sign = _unscaled_value >= 0 ? +1 : -1;
    const boost::multiprecision::cpp_int a = sign * _unscaled_value;
    // cpp_int uses lazy evaluation and for older versions of boost and some
    // versions of gcc, expression templates have problem to implicitly
    // convert to cpp_int, so we force the conversion explicitly before cpp_int
    // is converted to uint64_t.
    const uint64_t r = boost::multiprecision::cpp_int{a % y}.convert_to<uint64_t>();

    boost::multiprecision::cpp_int q = a / y;

    /*
     * r/y is the fractional part of |*this|/y and determines the direction of
     * rounding: r/y has to be compared against 1/2, i.e. 2*r against y.
     *
     * 2*r must not be computed in uint64_t, because it wraps around once
     * r >= 2^63 (possible whenever y > 2^63) and the number would then be
     * rounded the wrong way. Since r < y, the equivalent comparison of r
     * against y - r can neither overflow nor underflow.
     */
    const uint64_t distance_to_next = y - r;
    if (r > distance_to_next) {
        // More than half: round away from zero.
        q += 1;
    } else if (r == distance_to_next && q % 2 == 1) {
        // Exactly half: move to the closest even number.
        q += 1;
    }
    // Otherwise (less than half, or exactly half with an even quotient) the
    // quotient already has its final value.

    return big_decimal(_scale, sign * q);
}
|