Files
scylladb/test/cql-pytest/test_aggregate.py
Nadav Har'El 9c3907bb3c test/cql-pytest: reproducers for incorrect AVG of "decimal" type
This patch contains tests reproducing issue #13601 and the corresponding
Cassandra issue CASSANDRA-18470. These issues are about what the AVG
aggregation does for arbitrary-precision "decimal" numbers - the tests
we add here show examples where the current behavior doesn't make sense:

The problem is that "decimal" has arbitrary precision - so, should an
average of 1/3 be returned as 0.3 or 0.33333333333333333? This is not
specified, so Scylla (and Cassandra) decided to pick the result precision
based on the input precision. In particular, the average of 1 and 2
is returned as 2 (zero digits after the decimal point, like in the
inputs) instead of the expected 1.5. Arguably this isn't useful behavior.

The test adds a second test which fails on Cassandra, but does pass
on Scylla: Cassandra returns as the average of 1, 2, 2, 3 the integer 1
whereas the correct average is 2 (and Scylla returns it correctly).
The reason why this bug is even worse on Cassandra is that Scylla's AVG
only loses precision when dividing the sum and count, but Cassandra
tries to maintain only the average, and loses precision at every step.

Refs #13601

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #13603
2023-04-21 08:32:30 +03:00

205 lines
11 KiB
Python

# Copyright 2021-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#############################################################################
# Tests aggregation functions COUNT(), MIN(), MAX(), SUM()
#############################################################################
import pytest
import math
from decimal import Decimal
from util import new_test_table, unique_key_int, project
from cassandra.util import Date
@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
with new_test_table(cql, test_keyspace, "p int, c int, v int, d double, dc decimal, PRIMARY KEY (p, c)") as table:
yield table
# When there is no row matching the selection, the count should be 0.
# First check a "=" expression matching no row:
def test_count_empty_eq(cql, table1):
p = unique_key_int()
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
# The aggregation "0" for no results is true for count(), but not all
# aggregators - min() returns null for no results, while sum() and
# avg() return 0 for no results (see discussion in issue #13027):
assert [(None,)] == list(cql.execute(f"select min(v) from {table1} where p = {p}"))
assert [(0,)] == list(cql.execute(f"select sum(v) from {table1} where p = {p}"))
assert [(0,)] == list(cql.execute(f"select avg(v) from {table1} where p = {p}"))
# Above in test_count_empty_eq() we tested that aggregates return some value -
# sometimes 0 and sometimes null - when aggregating no data. If we select
# not just the aggregate but also something else like p, the result is even
# more bizarre: we get a null for p. One might argue that if there are no
# rows we should have just got back nothing in return - just like Cassandra
# does for GROUP BY (see below tests for #12477), but Cassandra doesn't do
# this, and this test accepts Cassandra's (and Scylla's) behavior here as
# being correct.
# See issue #13027.
def test_count_and_p_empty_eq(cql, table1):
p = unique_key_int()
assert [(None,0)] == list(cql.execute(f"select p, count(*) from {table1} where p = {p}"))
assert [(None,None)] == list(cql.execute(f"select p, min(v) from {table1} where p = {p}"))
assert [(None,0)] == list(cql.execute(f"select p, sum(v) from {table1} where p = {p}"))
# The query "p = null" also matches no row so should have a count 0, but
# it's a special case where no partition belongs to the query range so the
# aggregator doesn't need to send the query to any node. This reproduces
# issue #12475.
# This test is scylla_only because Cassandra doesn't support "p = null".
# it does support "p IN ()" with the same effect, though, so that will
# be the next test.
def test_count_empty_eq_null(cql, table1, scylla_only):
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = null"))
# A more complex list of aggregators, some return zero and some null
# for an empty result set:
assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p = null"))
# Another special case of a query which matches no partition - an IN with
# an empty list. Reproduces #12475.
def test_count_empty_in(cql, table1):
assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p in ()"))
assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p in ()"))
# Simple test of counting the number of rows in a single partition
def test_count_in_partition(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
assert [(3,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
# Using count(v) instead of count(*) allows counting only rows with a set
# value in v
def test_count_specific_column(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
cql.execute(stmt, [p, 4, None])
assert [(4,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
assert [(3,)] == list(cql.execute(f"select count(v) from {table1} where p = {p}"))
# COUNT can be combined with GROUP BY to count separately for each partition
# or row.
def test_count_and_group_by_row(cql, table1):
p = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p, 1, 1])
cql.execute(stmt, [p, 2, 2])
cql.execute(stmt, [p, 3, 3])
cql.execute(stmt, [p, 4, None])
assert [(p, 1, 1), (p, 2, 1), (p, 3, 1), (p, 4, 0)] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))
def test_count_and_group_by_partition(cql, table1):
p1 = unique_key_int()
p2 = unique_key_int()
stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
cql.execute(stmt, [p1, 1, 1])
cql.execute(stmt, [p1, 2, 2])
cql.execute(stmt, [p2, 3, 3])
cql.execute(stmt, [p2, 4, None])
assert [(p1, 2), (p2, 1)] == list(cql.execute(f"select p, count(v) from {table1} where p in ({p1},{p2}) group by p"))
# In the above tests we looked for per-row or per-partition counts and got
# back more than one count. But if our query matches no row, we should get
# back no count.
@pytest.mark.xfail(reason="issue #12477")
def test_count_and_group_by_row_none(cql, table1):
p = unique_key_int()
assert [] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))
@pytest.mark.xfail(reason="issue #12477")
def test_count_and_group_by_partition_none(cql, table1):
p = unique_key_int()
assert [] == list(cql.execute(f"select p, count(v) from {table1} where p = {p} group by p"))
# Regression-test for #7729.
def test_timeuuid(cql, test_keyspace):
schema = "a int, b timeuuid, primary key (a,b)"
with new_test_table(cql, test_keyspace, schema) as table:
cql.execute(f'insert into {table} (a, b) values (0, 13814000-1dd2-11ff-8080-808080808080)')
cql.execute(f'insert into {table} (a, b) values (0, 6b1b3620-33fd-11eb-8080-808080808080)')
assert project('system_todate_system_min_b',
cql.execute(f'select todate(min(b)) from {table} where a = 0')) == [Date('2020-12-01')]
assert project('system_todate_system_max_b',
cql.execute(f'select todate(max(b)) from {table} where a = 0')) == [Date('2038-09-06')]
# Check floating-point aggregations with non-finite results - inf and nan.
# Reproduces issue #13551: sum of +inf and -inf produced an error instead
# of a NaN as in Cassandra (and in older Scylla).
def test_aggregation_inf_nan(cql, table1):
p = unique_key_int()
# Add a single infinity value, see we can read it back as infinity,
# and its sum() is also infinity of course.
cql.execute(f"insert into {table1} (p, c, d) values ({p}, 1, infinity)")
assert [(math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 1"))
assert [(math.inf,)] == list(cql.execute(f"select sum(d) from {table1} where p = {p}"))
# Add a second value, a negative infinity. See we can read it back, and
# now the sum of +Inf and -Inf should be a NaN.
# Note that we have to use isnan() to check that the result is a NaN,
# because equality check doesn't work on NaN:
cql.execute(f"insert into {table1} (p, c, d) values ({p}, 2, -infinity)")
assert [(-math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 2"))
assert math.isnan(list(cql.execute(f"select sum(d) from {table1} where p = {p}"))[0][0])
# When averaging "decimal" (arbitrary-precision floating point) values,
# which precision should the output use? The average of 0, 0, 1 is
# 0.3333333333..., but with how many threes?
# The correct answer isn't clear. As explained in #13601 and CASSANDRA-18470,
# both Scylla and Cassandra took the number of significant digits in the
# average from the number of significant digits in the sum. So for example,
# the average of 1 and 2 is 2 (in Scylla, or 1 in Cassandra), but the average
# of 1.0 and 2.0 is 1.5. Similarly the average of 1.1 and 1.2 is 1.2 (or 1.1),
# not 1.15. This is surprising.
#
# Given that there is no way for the user to configure the desired precision,
# I don't know what would be a "correct" solution. I just feel that the
# scenarios tested in this test - averaging 1 and 2 or 1.1 and 1.2 -
# don't do something that any reasonable user might expect, and will need
# to be fixed.
@pytest.mark.xfail(reason="issue #13601")
def test_avg_decimal(cql, table1, cassandra_bug):
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
# Reproduces #13601: the average of 1 and 2 was rounded up to the
# integer 2 instead of returning 1.5:
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
# Similarly, average of 1.1 and 1.2 was rounded up to 1.2 instead of
# returning 1.15.
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 1.2)")
assert [(Decimal('1.15'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
# Another test for average of "decimal" values: the average of 1, 2, 2, 3.
# In this case the average is an integer, 2, but Cassandra doesn't even
# get this right (see CASSANDRA-18470) because instead of of keeping the
# integer sum and count separately, it tries to update the average and does
# it in a too-low precision. Scylla passes this test correctly.
def test_avg_decimal_2(cql, table1, cassandra_bug):
p = unique_key_int()
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 3, 2)")
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 4, 3)")
# Reproduces CASSANDRA-18470:
assert [(Decimal('2'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))