# Copyright 2021-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0

#############################################################################
# Tests aggregation functions COUNT(), MIN(), MAX(), SUM()
#############################################################################

import pytest
import math
from decimal import Decimal
from .util import new_test_table, unique_key_int, project, new_type
from cassandra.util import Date
from cassandra.protocol import SyntaxException
from .cassandra_tests.porting import assert_invalid_message

@pytest.fixture(scope="module")
def table1(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int, c int, v int, d double, dc decimal, PRIMARY KEY (p, c)") as table:
        yield table

@pytest.fixture(scope="module")
def table2(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, 'p int, c int, somecolumn int, "SomeColumn" int, "OtherColumn" int, PRIMARY KEY (p, c)') as table:
        yield table

# When there is no row matching the selection, the count should be 0.
# First check a "=" expression matching no row:
def test_count_empty_eq(cql, table1):
    p = unique_key_int()
    assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
    # The aggregation "0" for no results is true for count(), but not all
    # aggregators - min() returns null for no results, while sum() and
    # avg() return 0 for no results (see discussion in issue #13027):
    assert [(None,)] == list(cql.execute(f"select min(v) from {table1} where p = {p}"))
    assert [(0,)] == list(cql.execute(f"select sum(v) from {table1} where p = {p}"))
    assert [(0,)] == list(cql.execute(f"select avg(v) from {table1} where p = {p}"))
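
# A hedged addition (not in the original suite): max(), like min(), should
# return null when aggregating no rows, while count(v) - like count(*) -
# should return 0. The expectations are assumptions extrapolated from the
# min()/count() behavior asserted above.
def test_max_empty_eq(cql, table1):
    p = unique_key_int()
    # Assumed symmetric with min(): max() of an empty selection is null.
    assert [(None,)] == list(cql.execute(f"select max(v) from {table1} where p = {p}"))
    # count(v) of an empty selection is 0, just like count(*):
    assert [(0,)] == list(cql.execute(f"select count(v) from {table1} where p = {p}"))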

# Above in test_count_empty_eq() we tested that aggregates return some value -
# sometimes 0 and sometimes null - when aggregating no data. If we select
# not just the aggregate but also something else like p, the result is even
# more bizarre: we get a null for p. One might argue that if there are no
# rows we should have gotten back nothing at all - just like Cassandra does
# for GROUP BY (see the tests for #12477 below). But Cassandra doesn't do
# this here, and this test accepts Cassandra's (and Scylla's) behavior as
# being correct.
# See issue #13027.
def test_count_and_p_empty_eq(cql, table1):
    p = unique_key_int()
    assert [(None,0)] == list(cql.execute(f"select p, count(*) from {table1} where p = {p}"))
    assert [(None,None)] == list(cql.execute(f"select p, min(v) from {table1} where p = {p}"))
    assert [(None,0)] == list(cql.execute(f"select p, sum(v) from {table1} where p = {p}"))

# The query "p = null" also matches no row, so it should have a count of 0,
# but it's a special case where no partition belongs to the query range, so
# the aggregator doesn't need to send the query to any node. This reproduces
# issue #12475.
# This test is scylla_only because Cassandra doesn't support "p = null".
# It does support "p IN ()" with the same effect, though, so that will
# be the next test.
def test_count_empty_eq_null(cql, table1, scylla_only):
    assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p = null"))
    # A more complex list of aggregators, some returning zero and some null
    # for an empty result set:
    assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p = null"))

# Another special case of a query which matches no partition - an IN with
# an empty list. Reproduces #12475.
def test_count_empty_in(cql, table1):
    assert [(0,)] == list(cql.execute(f"select count(*) from {table1} where p in ()"))
    assert [(0,0,None,0)] == list(cql.execute(f"select count(*), count(v), min(v), sum(v) from {table1} where p in ()"))

# Simple test of counting the number of rows in a single partition
def test_count_in_partition(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 1])
    cql.execute(stmt, [p, 2, 2])
    cql.execute(stmt, [p, 3, 3])
    assert [(3,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
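
# A hedged addition (not in the original suite): count(*) should also work
# when the selection is restricted to a clustering-key slice inside the
# partition, counting only the matching rows. The expected value follows
# from standard CQL slice semantics; the test itself is an assumption.
def test_count_in_partition_slice(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 1])
    cql.execute(stmt, [p, 2, 2])
    cql.execute(stmt, [p, 3, 3])
    # Only the rows with c >= 2 (i.e., c=2 and c=3) should be counted:
    assert [(2,)] == list(cql.execute(f"select count(*) from {table1} where p = {p} and c >= 2"))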

# Using count(v) instead of count(*) allows counting only rows with a non-NULL
# value in v
def test_count_specific_column(cql, test_keyspace, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 1])
    cql.execute(stmt, [p, 2, 2])
    cql.execute(stmt, [p, 3, 3])
    cql.execute(stmt, [p, 4, None])
    assert [(4,)] == list(cql.execute(f"select count(*) from {table1} where p = {p}"))
    assert [(3,)] == list(cql.execute(f"select count(v) from {table1} where p = {p}"))
    # Check with non-scalar types too, reproduces #14198
    with new_type(cql, test_keyspace, '(i int, t text)') as udt:
        with new_test_table(cql, test_keyspace, f'p int, c int, fli frozen<list<int>>, tup tuple<int, bigint>, udt {udt}, PRIMARY KEY (p, c)') as table2:
            cql.execute(f'INSERT INTO {table2}(p, c, fli, tup, udt) VALUES({p}, 5, [1, 2], (3, 4), {{i: 3, t: \'text\'}})')
            cql.execute(f'INSERT INTO {table2}(p, c) VALUES({p}, 6)')

            assert [(1,)] == list(cql.execute(f"select count(fli) from {table2} where p = {p}"))
            assert [(1,)] == list(cql.execute(f"select count(tup) from {table2} where p = {p}"))
            assert [(1,)] == list(cql.execute(f"select count(udt) from {table2} where p = {p}"))
            assert [(2,)] == list(cql.execute(f"select count(*) from {table2} where p = {p}"))
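
# A hedged addition (not in the original suite): counting a primary-key
# column. A clustering column can never be null in an existing row, so
# count(c) is expected to equal count(*). This expectation is an assumption
# extrapolated from the count(v) semantics demonstrated above.
def test_count_clustering_column(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 1])
    cql.execute(stmt, [p, 2, None])
    # c is never null, so count(c) should match count(*) even though v
    # is null in one row:
    assert [(2,)] == list(cql.execute(f"select count(c) from {table1} where p = {p}"))
    assert [(1,)] == list(cql.execute(f"select count(v) from {table1} where p = {p}"))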

# COUNT can be combined with GROUP BY to count separately for each partition
# or row.
def test_count_and_group_by_row(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 1])
    cql.execute(stmt, [p, 2, 2])
    cql.execute(stmt, [p, 3, 3])
    cql.execute(stmt, [p, 4, None])
    assert [(p, 1, 1), (p, 2, 1), (p, 3, 1), (p, 4, 0)] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))

def test_count_and_group_by_partition(cql, table1):
    p1 = unique_key_int()
    p2 = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p1, 1, 1])
    cql.execute(stmt, [p1, 2, 2])
    cql.execute(stmt, [p2, 3, 3])
    cql.execute(stmt, [p2, 4, None])
    assert [(p1, 2), (p2, 1)] == list(cql.execute(f"select p, count(v) from {table1} where p in ({p1},{p2}) group by p"))
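
# A hedged addition (not in the original suite): other aggregates should
# combine with GROUP BY the same way count() does - each group aggregated
# independently, with nulls skipped by min() and sum(). The expected tuples
# are assumptions derived from the per-partition data inserted below.
def test_min_sum_and_group_by_partition(cql, table1):
    p1 = unique_key_int()
    p2 = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p1, 1, 1])
    cql.execute(stmt, [p1, 2, 2])
    cql.execute(stmt, [p2, 3, 3])
    cql.execute(stmt, [p2, 4, None])
    # p1's group has v=1,2 (min 1, sum 3); p2's has v=3,null (min 3, sum 3):
    assert [(p1, 1, 3), (p2, 3, 3)] == list(cql.execute(f"select p, min(v), sum(v) from {table1} where p in ({p1},{p2}) group by p"))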

# In the above tests we looked for per-row or per-partition counts and got
# back more than one count. But if our query matches no row, we should get
# back no count.
#
# Reproduces #12477
def test_count_and_group_by_row_none(cql, table1):
    p = unique_key_int()
    assert [] == list(cql.execute(f"select p, c, count(v) from {table1} where p = {p} group by p,c"))

def test_count_and_group_by_partition_none(cql, table1):
    p = unique_key_int()
    assert [] == list(cql.execute(f"select p, count(v) from {table1} where p = {p} group by p"))
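
# A hedged addition (not in the original suite): combining the empty-IN
# special case of #12475 with the GROUP BY behavior of #12477. With GROUP BY,
# a selection matching no partition is assumed to return no rows at all,
# rather than a single zero count:
def test_count_and_group_by_empty_in(cql, table1):
    assert [] == list(cql.execute(f"select p, count(v) from {table1} where p in () group by p"))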

# Regression-test for #7729.
def test_timeuuid(cql, test_keyspace):
    schema = "a int, b timeuuid, primary key (a,b)"
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f'insert into {table} (a, b) values (0, 13814000-1dd2-11ff-8080-808080808080)')
        cql.execute(f'insert into {table} (a, b) values (0, 6b1b3620-33fd-11eb-8080-808080808080)')
        assert project('system_todate_system_min_b',
                       cql.execute(f'select todate(min(b)) from {table} where a = 0')) == [Date('2020-12-01')]
        assert project('system_todate_system_max_b',
                       cql.execute(f'select todate(max(b)) from {table} where a = 0')) == [Date('2038-09-06')]

# Check floating-point aggregations with non-finite results - inf and nan.
# Reproduces issue #13551: sum of +inf and -inf produced an error instead
# of a NaN as in Cassandra (and in older Scylla).
def test_aggregation_inf_nan(cql, table1):
    p = unique_key_int()
    # Add a single infinity value, see we can read it back as infinity,
    # and its sum() is also infinity of course.
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 1, infinity)")
    assert [(math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 1"))
    assert [(math.inf,)] == list(cql.execute(f"select sum(d) from {table1} where p = {p}"))
    # Add a second value, a negative infinity. See we can read it back, and
    # now the sum of +Inf and -Inf should be a NaN.
    # Note that we have to use isnan() to check that the result is a NaN,
    # because an equality check doesn't work on NaN:
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 2, -infinity)")
    assert [(-math.inf,)] == list(cql.execute(f"select d from {table1} where p = {p} and c = 2"))
    assert math.isnan(list(cql.execute(f"select sum(d) from {table1} where p = {p}"))[0][0])
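
# A hedged addition (not in the original suite): a NaN operand should
# propagate through sum(), since IEEE-754 addition with a NaN yields NaN.
# CQL accepts the literal "nan" for doubles (like "infinity" above); the
# expected propagation is an assumption based on floating-point semantics.
def test_aggregation_nan_operand(cql, table1):
    p = unique_key_int()
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 1, 1.0)")
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 2, nan)")
    # Read back the NaN itself, then check that it poisons the sum:
    assert math.isnan(list(cql.execute(f"select d from {table1} where p = {p} and c = 2"))[0][0])
    assert math.isnan(list(cql.execute(f"select sum(d) from {table1} where p = {p}"))[0][0])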
# When averaging "decimal" (arbitrary-precision floating point) values,
|
|
# which precision should the output use? The average of 0, 0, 1 is
|
|
# 0.3333333333..., but with how many threes?
|
|
# The correct answer isn't clear. As explained in #13601 and CASSANDRA-18470,
|
|
# both Scylla and Cassandra took the number of significant digits in the
|
|
# average from the number of significant digits in the sum. So for example,
|
|
# the average of 1 and 2 is 2 (in Scylla, or 1 in Cassandra), but the average
|
|
# of 1.0 and 2.0 is 1.5. Similarly the average of 1.1 and 1.2 is 1.2 (or 1.1),
|
|
# not 1.15. This is surprising.
|
|
#
|
|
# Given that there is no way for the user to configure the desired precision,
|
|
# I don't know what would be a "correct" solution. I just feel that the
|
|
# scenarios tested in this test - averaging 1 and 2 or 1.1 and 1.2 -
|
|
# don't do something that any reasonable user might expect, and will need
|
|
# to be fixed.
|
|
@pytest.mark.xfail(reason="issue #13601")
|
|
def test_avg_decimal(cql, table1, cassandra_bug):
|
|
p = unique_key_int()
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
|
|
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
|
|
p = unique_key_int()
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2.0)")
|
|
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
|
|
p = unique_key_int()
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.0)")
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
|
|
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
|
|
p = unique_key_int()
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
|
|
# Reproduces #13601: the average of 1 and 2 was rounded up to the
|
|
# integer 2 instead of returning 1.5:
|
|
assert [(Decimal('1.5'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
|
|
# Similarly, average of 1.1 and 1.2 was rounded up to 1.2 instead of
|
|
# returning 1.15.
|
|
p = unique_key_int()
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1.1)")
|
|
cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 1.2)")
|
|
assert [(Decimal('1.15'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
|
|
|
|

# Another test for average of "decimal" values: the average of 1, 2, 2, 3.
# In this case the average is an integer, 2, but Cassandra doesn't even
# get this right (see CASSANDRA-18470) because instead of keeping the
# integer sum and count separately, it tries to update the average and does
# it at too low a precision. Scylla passes this test correctly.
def test_avg_decimal_2(cql, table1, cassandra_bug):
    p = unique_key_int()
    cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 1, 1)")
    cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 2, 2)")
    cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 3, 2)")
    cql.execute(f"insert into {table1} (p, c, dc) values ({p}, 4, 3)")
    # Reproduces CASSANDRA-18470:
    assert [(Decimal('2'),)] == list(cql.execute(f"select avg(dc) from {table1} where p = {p}"))
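
# A hedged addition (not in the original suite): a contrast with "double",
# where the precision question doesn't arise - the average is computed in
# plain IEEE-754 arithmetic, so avg() of 1.0 and 2.0 is simply 1.5 (exactly
# representable as a double):
def test_avg_double(cql, table1):
    p = unique_key_int()
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 1, 1.0)")
    cql.execute(f"insert into {table1} (p, c, d) values ({p}, 2, 2.0)")
    assert [(1.5,)] == list(cql.execute(f"select avg(d) from {table1} where p = {p}"))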

def test_reject_aggregates_in_where_clause(cql, table1):
    assert_invalid_message(cql, table1, 'Aggregation',
                           f'SELECT * FROM {table1} WHERE p = sum((int)4)')

# Reproduces #13265
# Aggregates must handle case-sensitive column names correctly.
def test_sum_case_sensitive_column(cql, table2):
    p = unique_key_int()
    cql.execute(f'insert into {table2} (p, c, somecolumn, "SomeColumn", "OtherColumn") VALUES ({p}, 1, 1, 10, 100)')
    cql.execute(f'insert into {table2} (p, c, somecolumn, "SomeColumn", "OtherColumn") VALUES ({p}, 2, 2, 20, 200)')
    cql.execute(f'insert into {table2} (p, c, somecolumn, "SomeColumn", "OtherColumn") VALUES ({p}, 3, 3, 30, 300)')

    # An unquoted name is case-insensitive, so sum(SomeColumn) reads the
    # lower-case column somecolumn (sum 6), while the quoted "SomeColumn"
    # is matched exactly (sum 60):
    assert cql.execute(f'select sum(somecolumn) from {table2} where p = {p}').one()[0] == 6
    assert cql.execute(f'select sum("somecolumn") from {table2} where p = {p}').one()[0] == 6
    assert cql.execute(f'select sum("SomeColumn") from {table2} where p = {p}').one()[0] == 60
    assert cql.execute(f'select sum(SomeColumn) from {table2} where p = {p}').one()[0] == 6
    assert cql.execute(f'select sum("OtherColumn") from {table2} where p = {p}').one()[0] == 600

    # Names that match no column - a quoted name with the wrong case, or an
    # unquoted name for a column that only exists quoted - are errors:
    assert_invalid_message(cql, table2, 'someColumn',
                           f'select sum("someColumn") from {table2} where p = {p}')
    assert_invalid_message(cql, table2, 'othercolumn',
                           f'select sum(OtherColumn) from {table2} where p = {p}')

# Check that a simple case of summing up a specific int column in a partition
# works correctly
def test_sum_in_partition(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 4])
    cql.execute(stmt, [p, 2, 5])
    cql.execute(stmt, [p, 3, 6])
    assert [(15,)] == list(cql.execute(f"select sum(v) from {table1} where p = {p}"))
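
# A hedged addition (not in the original suite): sum() is expected to skip
# null values rather than fail, mirroring how count(v) skips nulls in
# test_count_specific_column above:
def test_sum_skips_nulls(cql, table1):
    p = unique_key_int()
    stmt = cql.prepare(f"insert into {table1} (p, c, v) values (?, ?, ?)")
    cql.execute(stmt, [p, 1, 4])
    cql.execute(stmt, [p, 2, None])
    cql.execute(stmt, [p, 3, 6])
    # The null in row c=2 contributes nothing; the sum is 4+6:
    assert [(10,)] == list(cql.execute(f"select sum(v) from {table1} where p = {p}"))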

# Check that you *can't* do "sum(*)" expecting (perhaps) sums of all the
# columns. It's a syntax error: the grammar allows "count(*)" because it has
# a separate rule for "count", but other functions and aggregators (like
# "sum") don't allow "*" and expect a real column name as a parameter.
def test_sum_star(cql, table1):
    p = unique_key_int()
    with pytest.raises(SyntaxException):
        cql.execute(f"select sum(*) from {table1} where p = {p}")
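
# A hedged addition (not in the original suite): the same restriction is
# assumed to apply to the other aggregates - since only "count" has a
# special grammar rule for "*", min(*) and avg(*) should also be syntax
# errors, just like sum(*):
def test_min_avg_star(cql, table1):
    p = unique_key_int()
    with pytest.raises(SyntaxException):
        cql.execute(f"select min(*) from {table1} where p = {p}")
    with pytest.raises(SyntaxException):
        cql.execute(f"select avg(*) from {table1} where p = {p}")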