cql3: support literals and bind variables in selectors
Add support for literals in the SELECT clause. This allows
SELECT fn(column, 4) or SELECT fn(column, ?).
Note, "SELECT 7 FROM tab" becomes valid in the grammar, but is still
not accepted because of failed type inference - we cannot infer the
type of 7, and don't have a favored type for literals (like C favors
int). We might relax this later.
In the WHERE clause (and, in Cassandra, also in the SELECT clause), type
hints can resolve this type ambiguity: (bigint)7 or (text)?. Supporting
type hints in the SELECT clause is deferred to a later patch.
A few changes to the grammar are needed on top of adding a `value`
alternative to `unaliasedSelector`:
- vectorSimilarityArg gained access to `value` via `unaliasedSelector`,
so its own direct `value` alternative is removed to avoid ambiguity. We
may drop `vectorSimilarityArg` later.
- COUNT(1) became ambiguous via the general function path (since
function arguments can now be literals), so we remove this case
from the COUNT special cases, remaining with count(*).
- SELECT JSON and SELECT DISTINCT became "ambiguous enough" for
ANTLR to complain, though as far as I can tell `value` does not
add real ambiguity. The solution is to commit early (via "=>") to
a parsing path.
Due to the loss of count(1) recognition in the parser, we have to
special-case it in prepare. We may relax it to count any expression
later, like modern Cassandra and SQL.
Testing is awkward because of the type inference problem at the top level.
We test via the set_intersection() function and via lua functions.
Example:
```
cqlsh> CREATE FUNCTION ks.sum(a int, b int) RETURNS NULL ON NULL INPUT RETURNS int LANGUAGE LUA AS 'return a + b';
cqlsh> SELECT ks.sum(1, 2) FROM system.local;
ks.sum(1, 2)
--------------
3
(1 rows)
cqlsh>
```
(There are no suitable system functions!)
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-296
Closes scylladb/scylladb#28256
This commit is contained in:
12
cql3/Cql.g
12
cql3/Cql.g
@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
|
||||
bool is_ann_ordering = false;
|
||||
}
|
||||
: K_SELECT (
|
||||
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
|
||||
( K_DISTINCT { is_distinct = true; } )?
|
||||
( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
|
||||
| (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
|
||||
)?
|
||||
( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
|
||||
sclause=selectClause
|
||||
)
|
||||
K_FROM (
|
||||
@@ -425,6 +427,7 @@ selector returns [shared_ptr<raw_selector> s]
|
||||
|
||||
unaliasedSelector returns [uexpression tmp]
|
||||
: ( c=cident { tmp = unresolved_identifier{std::move(c)}; }
|
||||
| v=value { tmp = std::move(v); }
|
||||
| K_COUNT '(' countArgument ')' { tmp = make_count_rows_function_expression(); }
|
||||
| K_WRITETIME '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
|
||||
unresolved_identifier{std::move(c)}}; }
|
||||
@@ -455,14 +458,11 @@ vectorSimilarityArgs returns [std::vector<expression> a]
|
||||
|
||||
vectorSimilarityArg returns [uexpression a]
|
||||
: s=unaliasedSelector { a = std::move(s); }
|
||||
| v=value { a = std::move(v); }
|
||||
;
|
||||
|
||||
countArgument
|
||||
: '*'
|
||||
| i=INTEGER { if (i->getText() != "1") {
|
||||
add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
|
||||
} }
|
||||
/* COUNT(1) is also allowed, it is recognized via the general function(args) path */
|
||||
;
|
||||
|
||||
whereClause returns [uexpression clause]
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "expr-utils.hh"
|
||||
#include "evaluate.hh"
|
||||
#include "cql3/functions/functions.hh"
|
||||
#include "cql3/functions/aggregate_fcts.hh"
|
||||
#include "cql3/functions/castas_fcts.hh"
|
||||
#include "cql3/functions/scalar_function.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
|
||||
return partially_prepared_args;
|
||||
}
|
||||
|
||||
// Special case for count(1) - recognize it as the countRows() function. Note it is quite
|
||||
// artificial and we might relax it to the more general count(expression) later.
|
||||
static
|
||||
std::optional<expression>
|
||||
try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
|
||||
return std::visit(overloaded_functor{
|
||||
[&] (const functions::function_name& name) -> std::optional<expression> {
|
||||
auto native_name = name;
|
||||
if (!native_name.has_keyspace()) {
|
||||
native_name = name.as_native_function();
|
||||
}
|
||||
// Collapse count(1) into countRows()
|
||||
if (native_name == functions::function_name::native_function("count")) {
|
||||
if (fc.args.size() == 1) {
|
||||
if (auto uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0])) {
|
||||
if (uc_arg->partial_type == expr::untyped_constant::type_class::integer
|
||||
&& uc_arg->raw_text == "1") {
|
||||
return expr::function_call{
|
||||
.func = functions::aggregate_fcts::make_count_rows_function(),
|
||||
.args = {},
|
||||
};
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument", fc.args[0]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
},
|
||||
[] (const shared_ptr<functions::function>&) -> std::optional<expression> {
|
||||
// Already prepared, nothing to do
|
||||
return std::nullopt;
|
||||
},
|
||||
}, fc.func);
|
||||
}
|
||||
|
||||
std::optional<expression>
|
||||
prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
|
||||
if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
|
||||
return prepared;
|
||||
}
|
||||
// Try to extract a column family name from the available information.
|
||||
// Most functions can be prepared without information about the column family, usually just the keyspace is enough.
|
||||
// One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
|
||||
|
||||
@@ -25,6 +25,8 @@ Querying data from data is done using a ``SELECT`` statement:
|
||||
: | CAST '(' `selector` AS `cql_type` ')'
|
||||
: | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
|
||||
: | COUNT '(' '*' ')'
|
||||
: | literal
|
||||
: | bind_marker
|
||||
: )
|
||||
: ( '.' `field_name` | '[' `term` ']' )*
|
||||
where_clause: `relation` ( AND `relation` )*
|
||||
@@ -35,6 +37,8 @@ Querying data from data is done using a ``SELECT`` statement:
|
||||
operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
|
||||
ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
|
||||
timeout: `duration`
|
||||
literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | map_literal
|
||||
bind_marker: '?' | ':' `identifier`
|
||||
|
||||
For instance::
|
||||
|
||||
@@ -81,6 +85,13 @@ A :token:`selector` can be one of the following:
|
||||
- A casting, which allows you to convert a nested selector to a (compatible) type.
|
||||
- A function call, where the arguments are selectors themselves.
|
||||
- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
|
||||
- A literal value (constant).
|
||||
- A bind variable (`?` or `:name`).
|
||||
|
||||
Note that due to a quirk of the type system, literals and bind markers cannot be
|
||||
used as top-level selectors, as their type cannot be inferred. However, they can be used
|
||||
when nested inside functions, as the function formal parameter types provide the
|
||||
necessary context.
|
||||
|
||||
Aliases
|
||||
```````
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
# to reproduce bugs discovered by bigger Cassandra tests.
|
||||
#############################################################################
|
||||
|
||||
from .util import unique_name, unique_key_int
|
||||
from .util import unique_name, unique_key_int, new_test_table
|
||||
|
||||
from cassandra.protocol import FunctionFailure
|
||||
from cassandra.util import Date, Time
|
||||
@@ -603,3 +603,24 @@ def test_select_json_with_alias(cql, table1):
|
||||
}
|
||||
for input, output in input_and_output.items():
|
||||
assert list(cql.execute(f"SELECT JSON {input} from {table1} where p = {p}")) == [(EquivalentJson(output),)]
|
||||
|
||||
# SELECT JSON and SELECT DISTINCT interact in a hairy part of the grammar, so
# check "SELECT JSON DISTINCT" as well as plain "SELECT JSON" on the same data.
def test_select_distinct_json(cql, test_keyspace):
    schema = "p int, c int, v int, PRIMARY KEY (p, c)"
    with new_test_table(cql, test_keyspace, schema) as table:
        p1 = unique_key_int()
        p2 = unique_key_int()

        # Two clustering rows in each of the two partitions.
        for pk, ck, val in [(p1, 1, 10), (p1, 2, 20), (p2, 1, 30), (p2, 2, 40)]:
            cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({pk}, {ck}, {val})")

        def selected_p(query):
            # Every result row holds one JSON document; pull "p" out of each.
            return [json.loads(row[0])["p"] for row in cql.execute(query)]

        # DISTINCT may only select partition key columns (p), and must yield
        # exactly one row per partition.
        distinct_ps = selected_p(f"SELECT JSON DISTINCT p FROM {table} WHERE p IN ({p1}, {p2})")
        assert sorted(distinct_ps) == sorted([p1, p2])

        # Without DISTINCT all four rows come back.
        all_ps = selected_p(f"SELECT JSON p FROM {table} WHERE p IN ({p1}, {p2})")
        assert sorted(all_ps) == sorted([p1, p1, p2, p2])
|
||||
|
||||
91
test/cqlpy/test_selector_literals.py
Normal file
91
test/cqlpy/test_selector_literals.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
|
||||
# Tests literals in the SELECT clause.
|
||||
#
|
||||
# Originally, the CQL grammar allowed literals (constants, bind markers, and
|
||||
# collections/tuples/UDTs of literals) only in the WHERE clause. This test suite
|
||||
# tests literals in the SELECT clause, which were added later [1].
|
||||
#
|
||||
# The simplest example, "SELECT 1" actually doesn't work since its type cannot
|
||||
# be inferred (is it a tinyint, int, or bigint?), so we use UDFs and other functions
|
||||
# that accept known types instead. We do test that "SELECT 1" and similar fail
|
||||
# in the expected way due to type inference failure.
|
||||
#
|
||||
# [1]: https://scylladb.atlassian.net/browse/SCYLLADB-296
|
||||
|
||||
from contextlib import contextmanager
|
||||
import pytest
|
||||
from .util import unique_name, new_function
|
||||
from .conftest import scylla_only
|
||||
from cassandra.protocol import InvalidRequest
|
||||
|
||||
# `want_lua` is an alias of the `scylla_only` fixture, requested by the test
# below that defines its UDFs with LANGUAGE lua.
# NOTE(review): presumably Lua UDFs are not available on Cassandra - confirm.
want_lua = scylla_only
|
||||
|
||||
def test_simple_literal_selectors(cql, test_keyspace, want_lua):
    # Literals and bind markers are tested as arguments of a two-argument Lua
    # UDF, whose declared parameter types let their types be inferred.
    fun = unique_name()
    ksfun = f"{test_keyspace}.{fun}"

    @contextmanager
    def new_sum_function(name: str, type: str, op: str):
        body = f"(i {type}, j {type}) RETURNS NULL ON NULL INPUT RETURNS {type} LANGUAGE lua AS 'return i {op} j;'"
        with new_function(cql, test_keyspace, body, name=name, args=f"{type}, {type}") as f:
            yield f

    def check(type, op, alias, literal_args, literal_result, bound_args, bound_result, mistyped_args):
        with new_sum_function(name=fun, type=type, op=op):
            # Literal arguments.
            row = cql.execute(f"SELECT {ksfun}({literal_args}) AS {alias} FROM system.local").one()
            assert getattr(row, alias) == literal_result
            # Bind markers, through a prepared statement.
            stmt = cql.prepare(f"SELECT {ksfun}(?, ?) AS {alias} FROM system.local")
            row = cql.execute(stmt, bound_args).one()
            assert getattr(row, alias) == bound_result
            # A literal of the wrong type must be rejected.
            with pytest.raises(InvalidRequest, match="Type error"):
                cql.execute(f"SELECT {ksfun}({mistyped_args}) AS {alias} FROM system.local")

    # The same function name is used twice, one after the other, with a
    # different signature (different parameter types) each time:
    check("int", "+", "sum_int", "1, 2", 3, (10, 20), 30, "1, 'asf'")
    check("text", "..", "sum_text", "'hello, ', 'world!'", "hello, world!", ('foo', 'bar'), "foobar", "'asf', 1")
|
||||
|
||||
# scylla-only due to set_intersection function
def test_set_literal_selector(cql, test_keyspace, scylla_only):
    # Use a uniquely named table and drop it when done, rather than a
    # fixed-name table created with IF NOT EXISTS: that way the test never
    # picks up a stale table from an earlier run and leaves nothing behind.
    table = f"{test_keyspace}.{unique_name()}"
    cql.execute(f"CREATE TABLE {table} (id int PRIMARY KEY, vals set<int>, vals2 set<frozen<map<text, int>>>)")
    try:
        # A set<int> literal as a function argument.
        cql.execute(f"INSERT INTO {table} (id, vals) VALUES (1, {{1, 2, 3, 4, 5}})")
        rows = cql.execute(f"SELECT set_intersection(vals, {{3,4,5,6,7}}) AS intersection FROM {table} WHERE id=1")
        assert rows.one().intersection == {3,4,5}

        # A nested literal: a set of (frozen) maps.
        cql.execute(f"INSERT INTO {table} (id, vals2) VALUES (1, {{ {{ 'aa': 1, 'bb': 2 }}, {{ 'cc': 3, 'dd': 4 }} }})")
        rows = cql.execute(f"SELECT set_intersection(vals2, {{ {{ 'cc': 3, 'dd': 4 }}, {{ 'cc': 3, 'dd': 5 }} }}) AS intersection FROM {table} WHERE id=1")
        assert rows.one().intersection == {frozenset([('cc', 3), ('dd', 4)])}
    finally:
        cql.execute(f"DROP TABLE {table}")
|
||||
|
||||
# A literal or bind marker used directly as a top-level selector, with no type
# hint, has no context to take its type from; each such query must be rejected
# with a type inference error.
def test_simple_literal_type_inference_failure(cql, test_keyspace):
    untyped_selectors = [
        "1 AS one",
        "'hello' AS greeting",
        "[1, 2, 3] AS lst",
        "{ 'a': 1, 'b': 2 } AS mp",
        "(1, 'a', 3.0) AS tpl",
        "? AS qm",
        ":bindvar AS bv",
    ]
    for selector in untyped_selectors:
        with pytest.raises(InvalidRequest, match="infer type"):
            cql.execute(f"SELECT {selector} FROM system.local")
|
||||
|
||||
# count() with a literal other than 1 is rejected. The restriction is quite
# artificial and is likely to be relaxed later. scylla_only because Cassandra
# does allow it.
def test_count_literal_only_1(cql, test_keyspace, scylla_only):
    rejected = [
        ("count(2)", "expects a column or the literal 1 as an argument"),
        # The error message for a bind marker is not the best, but tightening
        # error messages here is quite a hassle and we plan to relax the
        # restriction later anyway.
        ("count(?)", "only valid when argument types are known"),
    ]
    for call, expected_error in rejected:
        with pytest.raises(InvalidRequest, match=expected_error):
            cql.execute(f"SELECT {call} AS cnt FROM system.local")
|
||||
Reference in New Issue
Block a user