cql3: support literals and bind variables in selectors

Add support for literals in the SELECT clause. This allows
SELECT fn(column, 4) or SELECT fn(column, ?).

Note, "SELECT 7 FROM tab" becomes valid in the grammar, but is still
not accepted because of failed type inference - we cannot infer the
type of 7, and don't have a favored type for literals (like C favors
int). We might relax this later.

In the WHERE clause (and, in Cassandra, also in the SELECT clause),
type hints can also resolve type ambiguity: (bigint)7 or (text)?.
But supporting them in the SELECT clause is deferred to a later patch.

A few changes to the grammar are needed on top of adding a `value`
alternative to `unaliasedSelector`:

 - vectorSimilarityArg gained access to `value` via `unaliasedSelector`,
   so it loses that alternative to avoid ambiguity. We may drop
   `vectorSimilarityArg` later.

 - COUNT(1) became ambiguous via the general function path (since
   function arguments can now be literals), so we remove this case
   from the COUNT special cases, remaining with count(*).

 - SELECT JSON and SELECT DISTINCT became "ambiguous enough" for ANTLR
   to complain, though as far as I can tell `value` does not add real
   ambiguity. The solution is to commit early (via "=>") to a parsing
   path.

Due to the loss of count(1) recognition in the parser, we have to
special-case it in prepare. We may relax it to count any expression
later, like modern Cassandra and SQL.

Testing is awkward because of the type inference problem at the top
level. We test via the set_intersection() function and via Lua
functions.

Example:

```
cqlsh> CREATE FUNCTION ks.sum(a int, b int) RETURNS NULL ON NULL INPUT RETURNS int LANGUAGE LUA AS 'return a + b';
cqlsh> SELECT ks.sum(1, 2) FROM system.local;

 ks.sum(1, 2)
--------------
            3

(1 rows)
cqlsh>
```

(There are no suitable system functions!)

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-296

Closes scylladb/scylladb#28256
2026-05-14 03:42:14 +00:00 · 2026-01-19 12:46:21 +02:00
parent 68b105b21c
commit cc03f5c89d
5 changed files with 170 additions and 7 deletions
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -389,8 +389,10 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool is_ann_ordering = false;
    }
    : K_SELECT (
-                ( K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; } )?
-                ( K_DISTINCT { is_distinct = true; } )?
+                ( (K_JSON K_DISTINCT)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                | (K_JSON selectClause K_FROM)=> K_JSON { statement_subtype = raw::select_statement::parameters::statement_subtype::JSON; }
+                )?
+                ( (K_DISTINCT selectClause K_FROM)=> K_DISTINCT { is_distinct = true; } )?
                sclause=selectClause
               )
      K_FROM (
@@ -425,6 +427,7 @@ selector returns [shared_ptr<raw_selector> s]

 unaliasedSelector returns [uexpression tmp]
    :  ( c=cident                                  { tmp = unresolved_identifier{std::move(c)}; }
+       | v=value                                   { tmp = std::move(v); }
       | K_COUNT '(' countArgument ')'             { tmp = make_count_rows_function_expression(); }
       | K_WRITETIME '(' c=cident ')'              { tmp = column_mutation_attribute{column_mutation_attribute::attribute_kind::writetime,
                                                                                              unresolved_identifier{std::move(c)}}; }
@@ -455,14 +458,11 @@ vectorSimilarityArgs returns [std::vector<expression> a]

 vectorSimilarityArg returns [uexpression a]
    : s=unaliasedSelector { a = std::move(s); }
-    | v=value             { a = std::move(v); }
    ;

 countArgument
    : '*'
-    | i=INTEGER { if (i->getText() != "1") {
-                    add_recognition_error("Only COUNT(1) is supported, got COUNT(" + i->getText() + ")");
-                } }
+    /* COUNT(1) is also allowed, it is recognized via the general function(args) path */
    ;

 whereClause returns [uexpression clause]
--- a/cql3/expr/prepare_expr.cc
+++ b/cql3/expr/prepare_expr.cc
@@ -10,6 +10,7 @@
 #include "expr-utils.hh"
 #include "evaluate.hh"
 #include "cql3/functions/functions.hh"
+#include "cql3/functions/aggregate_fcts.hh"
 #include "cql3/functions/castas_fcts.hh"
 #include "cql3/functions/scalar_function.hh"
 #include "cql3/column_identifier.hh"
@@ -1047,8 +1048,47 @@ prepare_function_args_for_type_inference(std::span<const expression> args, data_
    return partially_prepared_args;
 }

+// Special case for count(1) - recognize it as the countRows() function.
+// Returns the prepared countRows() call for count(1); returns std::nullopt when fc is
+// anything other than an unprepared count(<untyped constant>) call, so the caller can
+// continue with general function preparation; throws invalid_request_exception for any
+// other untyped constant argument, such as count(2). Note it is quite artificial and we
+// might relax it to the more general count(expression) later.
+static
+std::optional<expression>
+try_prepare_count_rows(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    return std::visit(overloaded_functor{
+        [&] (const functions::function_name& name) -> std::optional<expression> {
+            // A name without a keyspace is compared in its native-function form.
+            const auto native_name = name.has_keyspace() ? name : name.as_native_function();
+            if (native_name != functions::function_name::native_function("count") || fc.args.size() != 1) {
+                return std::nullopt;
+            }
+            const auto* uc_arg = expr::as_if<expr::untyped_constant>(&fc.args[0]);
+            if (!uc_arg) {
+                return std::nullopt;
+            }
+            if (uc_arg->partial_type != expr::untyped_constant::type_class::integer || uc_arg->raw_text != "1") {
+                throw exceptions::invalid_request_exception(format("count() expects a column or the literal 1 as an argument, got {}", fc.args[0]));
+            }
+            // Collapse count(1) into countRows()
+            return expr::function_call{
+                .func = functions::aggregate_fcts::make_count_rows_function(),
+                .args = {},
+            };
+        },
+        [] (const shared_ptr<functions::function>&) -> std::optional<expression> {
+            // Already prepared, nothing to do
+            return std::nullopt;
+        },
+    }, fc.func);
+}
+
 std::optional<expression>
 prepare_function_call(const expr::function_call& fc, data_dictionary::database db, const sstring& keyspace, const schema* schema_opt, lw_shared_ptr<column_specification> receiver) {
+    if (auto prepared = try_prepare_count_rows(fc, db, keyspace, schema_opt, receiver)) {
+        return prepared;
+    }
    // Try to extract a column family name from the available information.
    // Most functions can be prepared without information about the column family, usually just the keyspace is enough.
    // One exception is the token() function - in order to prepare system.token() we have to know the partition key of the table,
--- a/docs/cql/dml/select.rst
+++ b/docs/cql/dml/select.rst
@@ -25,6 +25,8 @@ Querying data from data is done using a ``SELECT`` statement:
           : | CAST '(' `selector` AS `cql_type` ')'
           : | `function_name` '(' [ `selector` ( ',' `selector` )* ] ')'
           : | COUNT '(' '*' ')'
+           : | literal
+           : | bind_marker
           : )
           : ( '.' `field_name` | '[' `term` ']' )*
   where_clause: `relation` ( AND `relation` )*
@@ -35,6 +37,8 @@ Querying data from data is done using a ``SELECT`` statement:
   operator: '=' | '<' | '>' | '<=' | '>=' | IN | NOT IN | CONTAINS | CONTAINS KEY
   ordering_clause: `column_name` [ ASC | DESC ] ( ',' `column_name` [ ASC | DESC ] )*
   timeout: `duration`
+   literal: number | 'string' | boolean | NULL | tuple_literal | list_literal | set_literal | map_literal
+   bind_marker: '?' | ':' `identifier`

 For instance::

@@ -81,6 +85,13 @@ A :token:`selector` can be one of the following:
 - A casting, which allows you to convert a nested selector to a (compatible) type.
 - A function call, where the arguments are selector themselves.
 - A call to the :ref:`COUNT function <count-function>`, which counts all non-null results.
+- A literal value (constant).
+- A bind variable (``?`` or ``:name``).
+
+Note that due to a quirk of the type system, literals and bind markers cannot be
+used as top-level selectors, as their type cannot be inferred. However, they can be used
+when nested inside functions, as the function's formal parameter types provide the
+necessary context.

 Aliases
 ```````
--- a/test/cqlpy/test_json.py
+++ b/test/cqlpy/test_json.py
@@ -11,7 +11,7 @@
 # to reproduce bugs discovered by bigger Cassandra tests.
 #############################################################################

-from .util import unique_name, unique_key_int
+from .util import unique_name, unique_key_int, new_test_table

 from cassandra.protocol import FunctionFailure
 from cassandra.util import Date, Time
@@ -603,3 +603,24 @@ def test_select_json_with_alias(cql, table1):
    }
    for input, output in input_and_output.items():
        assert list(cql.execute(f"SELECT JSON {input} from {table1} where p = {p}")) == [(EquivalentJson(output),)]
+
+# The grammar around DISTINCT and JSON is hairy. Test the combination.
+def test_select_distinct_json(cql, test_keyspace):
+    with new_test_table(cql, test_keyspace, "p int, c int, v int, PRIMARY KEY (p, c)") as table:
+        p1 = unique_key_int()
+        p2 = unique_key_int()
+        # Insert two rows per partition
+        cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({p1}, 1, 10)")
+        cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({p1}, 2, 20)")
+        cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({p2}, 1, 30)")
+        cql.execute(f"INSERT INTO {table} (p, c, v) VALUES ({p2}, 2, 40)")
+        # DISTINCT can only select partition key columns (p)
+        # Should return exactly 2 rows (one per partition)
+        result = list(cql.execute(f"SELECT JSON DISTINCT p FROM {table} WHERE p IN ({p1}, {p2})"))
+        # Check that the results are valid JSON with the expected structure
+        json_values = sorted([json.loads(row[0])["p"] for row in result])
+        assert json_values == sorted([p1, p2])
+        # Without DISTINCT, should return all 4 rows
+        result = list(cql.execute(f"SELECT JSON p FROM {table} WHERE p IN ({p1}, {p2})"))
+        json_values = [json.loads(row[0])["p"] for row in result]
+        assert sorted(json_values) == sorted([p1, p1, p2, p2])
--- a/test/cqlpy/test_selector_literals.py
+++ b/test/cqlpy/test_selector_literals.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+# Copyright 2026-present ScyllaDB
+#
+# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+
+# Tests literals in the SELECT clause.
+#
+# Originally, the CQL grammar allowed literals (constants, bind markers, and
+# collections/tuples/UDTs of literals) only in the WHERE clause. This test suite
+# tests literals in the SELECT clause, which were added later [1].
+#
+# The simplest example, "SELECT 1" actually doesn't work since its type cannot
+# be inferred (is it a tinyint, int, or bigint?), so we use UDFs and other functions
+# that accept known types instead. We do test that "SELECT 1" and similar fail
+# in the expected way due to type inference failure.
+#
+# [1]: https://scylladb.atlassian.net/browse/SCYLLADB-296
+
+from contextlib import contextmanager
+import pytest
+from .util import unique_name, new_function, new_test_table
+from .conftest import scylla_only
+from cassandra.protocol import InvalidRequest
+
+want_lua = scylla_only
+
+def test_simple_literal_selectors(cql, test_keyspace, want_lua):
+    @contextmanager
+    def new_sum_function(name: str, type: str, op: str):
+        body = f"(i {type}, j {type}) RETURNS NULL ON NULL INPUT RETURNS {type} LANGUAGE lua AS 'return i {op} j;'"
+        with new_function(cql, test_keyspace, body, name=name, args=f"{type}, {type}") as f:
+            yield f
+
+    # Create two different functions with the same name fun, but a
+    # different signature (different parameters):
+    fun = unique_name()
+    ksfun = f"{test_keyspace}.{fun}"
+    with new_sum_function(name=fun, type="int", op="+"):
+        rows = cql.execute(f"SELECT {ksfun}(1, 2) AS sum_int FROM system.local")
+        assert rows.one().sum_int == 3
+        stmt = cql.prepare(f"SELECT {ksfun}(?, ?) AS sum_int FROM system.local")
+        rows = cql.execute(stmt, (10, 20))
+        assert rows.one().sum_int == 30
+        with pytest.raises(InvalidRequest, match="Type error"):
+            cql.execute(f"SELECT {ksfun}(1, 'asf') AS sum_int FROM system.local")
+    with new_sum_function(name=fun, type="text", op=".."):
+        rows = cql.execute(f"SELECT {ksfun}('hello, ', 'world!') AS sum_text FROM system.local")
+        assert rows.one().sum_text == "hello, world!"
+        stmt = cql.prepare(f"SELECT {ksfun}(?, ?) AS sum_text FROM system.local")
+        rows = cql.execute(stmt, ('foo', 'bar'))
+        assert rows.one().sum_text == "foobar"
+        with pytest.raises(InvalidRequest, match="Type error"):
+            cql.execute(f"SELECT {ksfun}('asf', 1) AS sum_text FROM system.local")
+
+# scylla-only due to set_intersection function
+def test_set_literal_selector(cql, test_keyspace, scylla_only):
+    cql.execute(f"CREATE TABLE IF NOT EXISTS {test_keyspace}.sets (id int PRIMARY KEY, vals set<int>, vals2 set<frozen<map<text, int>>>)")
+    cql.execute(f"INSERT INTO {test_keyspace}.sets (id, vals) VALUES (1, {{1, 2, 3, 4, 5}})")
+    rows = cql.execute(f"SELECT set_intersection(vals, {{3,4,5,6,7}}) AS intersection FROM {test_keyspace}.sets WHERE id=1")
+    assert rows.one().intersection == {3,4,5}
+
+    cql.execute(f"INSERT INTO {test_keyspace}.sets (id, vals2) VALUES (1, {{ {{ 'aa': 1, 'bb': 2 }}, {{ 'cc': 3, 'dd': 4 }} }})")
+    rows = cql.execute(f"SELECT set_intersection(vals2, {{ {{ 'cc': 3, 'dd': 4 }}, {{ 'cc': 3, 'dd': 5 }} }}) AS intersection FROM {test_keyspace}.sets WHERE id=1")
+    assert rows.one().intersection == {frozenset([('cc', 3), ('dd', 4)])}
+
+# Test that simple literals without type hints fail as expected due to type inference failure.
+def test_simple_literal_type_inference_failure(cql, test_keyspace):
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT 1 AS one FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT 'hello' AS greeting FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT [1, 2, 3] AS lst FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT { 'a': 1, 'b': 2 } AS mp FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT (1, 'a', 3.0) AS tpl FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT ? AS qm FROM system.local")
+    with pytest.raises(InvalidRequest, match="infer type"):
+        cql.execute("SELECT :bindvar AS bv FROM system.local")
+
+# Test that count(2) fails as expected. We're likely to relax this restriction later
+# as it is quite artificial. scylla_only because Cassandra does allow it.
+def test_count_literal_only_1(cql, test_keyspace, scylla_only):
+    with pytest.raises(InvalidRequest, match="expects a column or the literal 1 as an argument"):
+        cql.execute("SELECT count(2) AS cnt FROM system.local")
+    # Error message here is not the best, but tightening error messages
+    # here is quite a hassle and we plan to relax the restriction later anyway.
+    with pytest.raises(InvalidRequest, match="only valid when argument types are known"):
+        cql.execute("SELECT count(?) AS cnt FROM system.local")