cql3: support literals and bind variables in selectors

Add support for literals in the SELECT clause. This allows
SELECT fn(column, 4) or SELECT fn(column, ?).

Note, "SELECT 7 FROM tab" becomes valid in the grammar, but is still
not accepted because of failed type inference - we cannot infer the
type of 7, and don't have a favored type for literals (like C favors
int). We might relax this later.

In the WHERE clause (and, in Cassandra, also in the SELECT clause), type
hints can also resolve type ambiguity: (bigint)7 or (text)?. But this is
deferred to a later patch.

A few changes to the grammar are needed on top of adding a `value`
alternative to `unaliasedSelector`:

 - vectorSimilarityArg gained access to `value` via `unaliasedSelector`,
   so it loses that alternate to avoid ambiguity. We may drop
   `vectorSimilarityArg` later.
 - COUNT(1) became ambiguous via the general function path (since
   function arguments can now be literals), so we remove this case
   from the COUNT special cases, remaining with count(*).
 - SELECT JSON and SELECT DISTINCT became "ambiguous enough" for
   ANTLR to complain, though as far as I can tell `value` does not
   add real ambiguity. The solution is to commit early (via "=>") to
   a parsing path.

Due to the loss of count(1) recognition in the parser, we have to
special-case it in prepare. We may relax it to count any expression
later, like modern Cassandra and SQL.

Testing is awkward because of the type inference problem for top-level
selectors. We test via the set_intersection() function and via Lua functions.

Example:

```
cqlsh> CREATE FUNCTION ks.sum(a int, b int) RETURNS NULL ON NULL INPUT RETURNS int  LANGUAGE LUA AS 'return a + b';
cqlsh> SELECT ks.sum(1, 2) FROM system.local;

 ks.sum(1, 2)
--------------
            3

(1 rows)
cqlsh>
```

(There are no suitable system functions!)

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-296

Closes scylladb/scylladb#28256
This commit is contained in:
Avi Kivity
2026-01-19 12:46:21 +02:00
committed by Nadav Har'El
parent 68b105b21c
commit cc03f5c89d
5 changed files with 170 additions and 7 deletions

View File

@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
bool is_ann_ordering = false;
}
: K_SELECT (
( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
( K_DISTINCT { is_distinct = true; } )?
( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
| (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
)?
( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
sclause=selectClause
)
K_FROM (
@@ -425,6 +427,7 @@ selector returns [shared_ptr<raw_selector> s]
unaliasedSelector returns [uexpression tmp]
: ( c=cident { tmp = unresolved_identifier{std::move(c)}; }
| v=value { tmp = std::move(v); }
| K_COUNT '(' countArgument ')' { tmp = make_count_rows_function_expression(); }
| K_WRITETIME '(' c=cident ')' { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
unresolved_identifier{std::move(c)}}; }
@@ -455,14 +458,11 @@ vectorSimilarityArgs returns [std::vector<expression> a]
vectorSimilarityArg returns [uexpression a]
: s=unaliasedSelector { a = std::move(s); }
| v=value { a = std::move(v); }
;
countArgument
: '*'
| i=INTEGER { if (i->getText() != "1") {
add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
} }
/* COUNT(1) is also allowed, it is recognized via the general function(args) path */
;
whereClause returns [uexpression clause]

View File

@@ -10,6 +10,7 @@
#include "expr-utils.hh"
#include "evaluate.hh"
#include "cql3/functions/functions.hh"
#include "cql3/functions/aggregate_fcts.hh"
#include "cql3/functions/castas_fcts.hh"
#include "cql3/functions/scalar_function.hh"
#include "cql3/column_identifier.hh"
@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
return partially_prepared_args;
}
// Special case for count(1): recognize it as the countRows() function. Note it is quite
// artificial and we might relax it to the more general count(expression) later.
//
// Returns the rewritten call (countRows() with no arguments) when `fc` is an
// unprepared call to the native "count" function whose single argument is the
// untyped integer literal 1.
// Returns std::nullopt in every other non-error case (different function, different
// argument count, argument that is not an untyped constant, or a function that is
// already prepared), so that the caller can continue with regular preparation.
// Throws exceptions::invalid_request_exception when the single argument is an
// untyped constant other than the integer literal 1, e.g. count(2).
//
// db, keyspace, schema_opt and receiver are not used by this function.
static
std::optional<expression>
try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
    return std::visit(overloaded_functor{
        [&] (const functions::function_name& name) -> std::optional<expression> {
            // A name without a keyspace is compared in its native-function form
            const auto native_name = name.has_keyspace() ? name : name.as_native_function();
            const bool is_count = native_name == functions::function_name::native_function("count");
            if (!is_count || fc.args.size() != 1) {
                return std::nullopt;
            }
            const auto* uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
            if (!uc_arg) {
                // Not a bare literal (for example a column): leave it to the general path
                return std::nullopt;
            }
            const bool is_literal_one = uc_arg->partial_type == expr::untyped_constant::type_class::integer
                    && uc_arg->raw_text == "1";
            if (!is_literal_one) {
                // The message has no placeholders, so it is passed as-is rather than through format()
                throw exceptions::invalid_request_exception("count() expects a column or the literal 1 as an argument");
            }
            // Collapse count(1) into countRows()
            return expr::function_call{
                .func = functions::aggregate_fcts::make_count_rows_function(),
                .args = {},
            };
        },
        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
            // Already prepared, nothing to do
            return std::nullopt;
        },
    }, fc.func);
}
std::optional<expression>
prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
return prepared;
}
// Try to extract a column family name from the available information.
// Most functions can be prepared without information about the column family, usually just the keyspace is enough.
// One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,

View File

@@ -25,6 +25,8 @@ Querying data from data is done using a ``SELECT`` statement:
: | CAST '(' `selector` AS `cql_type` ')'
: | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
: | COUNT '(' '*' ')'
: | literal
: | bind_marker
: )
: ( '.' `field_name` | '[' `term` ']' )*
where_clause: `relation` ( AND `relation` )*
@@ -35,6 +37,8 @@ Querying data from data is done using a ``SELECT`` statement:
operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
timeout: `duration`
literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | map_literal
bind_marker: '?' | ':' `identifier`
For instance::
@@ -81,6 +85,13 @@ A :token:`selector` can be one of the following:
- A casting, which allows you to convert a nested selector to a (compatible) type.
- A function call, where the arguments are selectors themselves.
- A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
- A literal value (constant).
- A bind variable (``?`` or ``:name``).
Note that due to a quirk of the type system, literals and bind markers cannot be
used as top-level selectors, as the parser cannot infer their type. However, they can be used
when nested inside functions, as the function formal parameter types provide the
necessary context.
Aliases
```````

View File

@@ -11,7 +11,7 @@
# to reproduce bugs discovered by bigger Cassandra tests.
#############################################################################
from .util import unique_name, unique_key_int
from .util import unique_name, unique_key_int, new_test_table
from cassandra.protocol import FunctionFailure
from cassandra.util import Date, Time
@@ -603,3 +603,24 @@ def test_select_json_with_alias(cql, table1):
}
for input, output in input_and_output.items():
assert list(cql.execute(f"SELECT JSON {input} from {table1} where p = {p}")) == [(EquivalentJson(output),)]
# JSON and DISTINCT are both optional prefixes of the select clause and the
# grammar handling them is hairy, so exercise "SELECT JSON DISTINCT" and
# plain "SELECT JSON" against the same data.
def test_select_distinct_json(cql, test_keyspace):
    schema = "p int, c int, v int, PRIMARY KEY (p, c)"
    with new_test_table(cql, test_keyspace, schema) as table:
        p1 = unique_key_int()
        p2 = unique_key_int()
        # Two clustering rows in each of the two partitions
        for pk, ck, val in [(p1, 1, 10), (p1, 2, 20), (p2, 1, 30), (p2, 2, 40)]:
            cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({pk}, {ck}, {val})")

        # DISTINCT can only select partition key columns (p), so we expect
        # exactly one JSON row per partition.
        distinct_rows = list(cql.execute(f"SELECT JSON DISTINCT p FROM {table} WHERE p IN ({p1}, {p2})"))
        distinct_keys = [json.loads(row[0])["p"] for row in distinct_rows]
        assert sorted(distinct_keys) == sorted([p1, p2])

        # Without DISTINCT every one of the 4 rows is returned
        all_rows = list(cql.execute(f"SELECT JSON p FROM {table} WHERE p IN ({p1}, {p2})"))
        all_keys = [json.loads(row[0])["p"] for row in all_rows]
        assert sorted(all_keys) == sorted([p1, p1, p2, p2])

View File

@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# Copyright 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
# Tests literals in the SELECT clause.
#
# Originally, the CQL grammar allowed literals (constants, bind markers, and
# collections/tuples/UDTs of literals) only in the WHERE clause. This test suite
# tests literals in the SELECT clause, which were added later [1].
#
# The simplest example, "SELECT 1" actually doesn't work since its type cannot
# be inferred (is it a tinyint, int, or bigint?), so we use UDFs and other functions
# that accept known types instead. We do test that "SELECT 1" and similar fail
# in the expected way due to type inference failure.
#
# [1]: https://scylladb.atlassian.net/browse/SCYLLADB-296
from contextlib import contextmanager
import pytest
from .util import unique_name, new_function, new_test_table
from .conftest import scylla_only
from cassandra.protocol import InvalidRequest
want_lua = scylla_only
def test_simple_literal_selectors(cql, test_keyspace, want_lua):
    """Literals and bind markers are accepted as arguments of a function in
    the SELECT clause, and are type-checked against the function's
    parameter types."""
    @contextmanager
    def new_sum_function(name: str, type: str, op: str):
        # A two-argument Lua UDF combining its arguments with the operator `op`
        body = f"(i {type}, j {type}) RETURNS NULL ON NULL INPUT RETURNS {type} LANGUAGE lua AS 'return i {op} j;'"
        with new_function(cql, test_keyspace, body, name=name, args=f"{type}, {type}") as f:
            yield f

    # The same function name is used for two functions with different
    # parameter types, created (and dropped) one after the other.
    fun = unique_name()
    ksfun = f"{test_keyspace}.{fun}"
    cases = [
        # (type, operator, literal args, expected, bound args, expected, ill-typed args)
        ("int", "+", "1, 2", 3, (10, 20), 30, "1, 'asf'"),
        ("text", "..", "'hello, ', 'world!'", "hello, world!", ('foo', 'bar'), "foobar", "'asf', 1"),
    ]
    for cql_type, op, literal_args, literal_sum, bound_args, bound_sum, bad_args in cases:
        alias = f"sum_{cql_type}"
        with new_sum_function(name=fun, type=cql_type, op=op):
            # Literal arguments
            row = cql.execute(f"SELECT {ksfun}({literal_args}) AS {alias} FROM system.local").one()
            assert getattr(row, alias) == literal_sum
            # Bind markers as arguments
            stmt = cql.prepare(f"SELECT {ksfun}(?, ?) AS {alias} FROM system.local")
            row = cql.execute(stmt, bound_args).one()
            assert getattr(row, alias) == bound_sum
            # A literal of the wrong type is rejected
            with pytest.raises(InvalidRequest, match="Type error"):
                cql.execute(f"SELECT {ksfun}({bad_args}) AS {alias} FROM system.local")
# Collection literals (including nested ones) can be passed as function
# arguments in the SELECT clause.
# scylla-only due to set_intersection function
def test_set_literal_selector(cql, test_keyspace, scylla_only):
    # Use a uniquely named, automatically dropped table rather than a
    # fixed-name table that would be left behind in the shared keyspace.
    schema = "id int PRIMARY KEY, vals set<int>, vals2 set<frozen<map<text, int>>>"
    with new_test_table(cql, test_keyspace, schema) as table:
        # Set of ints intersected with a set literal
        cql.execute(f"INSERT INTO {table} (id, vals) VALUES (1, {{1, 2, 3, 4, 5}})")
        rows = cql.execute(f"SELECT set_intersection(vals, {{3,4,5,6,7}}) AS intersection FROM {table} WHERE id=1")
        assert rows.one().intersection == {3,4,5}

        # Set of frozen maps intersected with a nested set-of-maps literal
        cql.execute(f"INSERT INTO {table} (id, vals2) VALUES (1, {{ {{ 'aa': 1, 'bb': 2 }}, {{ 'cc': 3, 'dd': 4 }} }})")
        rows = cql.execute(f"SELECT set_intersection(vals2, {{ {{ 'cc': 3, 'dd': 4 }}, {{ 'cc': 3, 'dd': 5 }} }}) AS intersection FROM {table} WHERE id=1")
        assert rows.one().intersection == {frozenset([('cc', 3), ('dd', 4)])}
# Top-level literals and bind markers carry no type hint, so each of these
# statements is expected to be rejected with a type inference error.
def test_simple_literal_type_inference_failure(cql, test_keyspace):
    untyped_selectors = [
        "SELECT 1 AS one FROM system.local",
        "SELECT 'hello' AS greeting FROM system.local",
        "SELECT [1, 2, 3] AS lst FROM system.local",
        "SELECT { 'a': 1, 'b': 2 } AS mp FROM system.local",
        "SELECT (1, 'a', 3.0) AS tpl FROM system.local",
        "SELECT ? AS qm FROM system.local",
        "SELECT :bindvar AS bv FROM system.local",
    ]
    for query in untyped_selectors:
        with pytest.raises(InvalidRequest, match="infer type"):
            cql.execute(query)
# count() with a literal argument other than 1 is rejected. The restriction is
# quite artificial and is likely to be relaxed later. scylla_only because
# Cassandra does allow it.
def test_count_literal_only_1(cql, test_keyspace, scylla_only):
    rejected = [
        ("SELECT count(2) AS cnt FROM system.local",
         "expects a column or the literal 1 as an argument"),
        # Error message here is not the best, but tightening error messages
        # here is quite a hassle and we plan to relax the restriction later anyway.
        ("SELECT count(?) AS cnt FROM system.local",
         "only valid when argument types are known"),
    ]
    for query, message in rejected:
        with pytest.raises(InvalidRequest, match=message):
            cql.execute(query)