Files
scylladb/alternator/expressions.g
Avi Kivity 0ae22a09d4 LICENSE: Update to version 1.1
Updated terms of non-commercial use (must be a never-customer).
2026-04-12 19:46:33 +03:00

309 lines
12 KiB
Plaintext

/*
* Copyright 2019-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
/*
* The DynamoDB protocol is based on JSON, and most DynamoDB requests
* describe the operation and its parameters via JSON objects such as maps
* and lists. Nevertheless, in some types of requests an "expression" is
* passed as a single string, and we need to parse this string. These
* cases include:
* 1. Attribute paths, such as "a[3].b.c", are used in projection
* expressions as well as inside other expressions described below.
* 2. Condition expressions, such as "(NOT (a=b OR c=d)) AND e=f",
* used in conditional updates, filters, and other places.
* 3. Update expressions, such as "SET #a.b = :x, c = :y DELETE d"
*
* All these expression syntaxes are very simple: Most of them could be
* parsed as regular expressions, and the parenthesized condition expression
* could be done with a simple hand-written lexical analyzer and recursive-
* descent parser. Nevertheless, we decided to specify these parsers in the
* ANTLR3 language already used in the Scylla project, hopefully making these
* parsers easier to reason about, and easier to change if needed - and
* reducing the amount of boiler-plate code.
*/
grammar expressions;
options {
language = Cpp;
}
@parser::namespace{alternator}
@lexer::namespace{alternator}
/* TODO: explain what these traits things are. I haven't seen them explained
* in any document... Compilation fails without these fail because a definition
* of "expressionsLexerTraits" and "expressionParserTraits" is needed.
*/
@lexer::traits {
class expressionsLexer;
class expressionsParser;
typedef antlr3::Traits<expressionsLexer, expressionsParser> expressionsLexerTraits;
}
@parser::traits {
typedef expressionsLexerTraits expressionsParserTraits;
}
@lexer::header {
#include "alternator/expressions.hh"
// ANTLR generates a bunch of unused variables and functions. Yuck...
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
}
@parser::header {
#include "expressionsLexer.hpp"
}
/* By default, ANTLR3 composes elaborate syntax-error messages, saying which
* token was unexpected, where, and so on on, but then dutifully writes these
* error messages to the standard error, and returns from the parser as if
* everything was fine, with a half-constructed output object! If we define
* the "displayRecognitionError" method, it will be called upon to build this
* error message, and we can instead throw an exception to stop the parsing
* immediately. This is good enough for now, for our simple needs, but if
* we ever want to show more information about the syntax error, Cql3.g
* contains an elaborate implementation (it would be nice if we could reuse
* it, not duplicate it).
* Unfortunately, we have to repeat the same definition twice - once for the
* parser, and once for the lexer.
*/
@parser::context {
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
const char* err;
switch (ex->getType()) {
case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
err = "expression nested too deeply";
break;
default:
err = "syntax error";
break;
}
// Alternator expressions are always single line so ex->get_line()
// is always 1, no sense to print it.
// TODO: return the position as part of the exception, so the
// caller in expressions.cc that knows the expression string can
// mark the error position in the final error message.
throw expressions_syntax_error(format("{} at char {}", err,
ex->get_charPositionInLine()));
}
// ANTLR3 tries to recover missing tokens - it tries to finish parsing
// and create valid objects, as if the missing token was there.
// But it has a bug and leaks these tokens.
// We override offending method and handle abandoned pointers.
std::vector<std::unique_ptr<TokenType>> _missing_tokens;
TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
_missing_tokens.emplace_back(token);
return token;
}
}
@lexer::context {
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
throw expressions_syntax_error("syntax error");
}
}
/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
* expressions. To make things even worse, ANTLR has no way to limit the
* depth of this recursion (unlike Yacc which has YYMAXDEPTH). So deeply-
* nested expression like "(((((((((((((..." can easily crash Scylla on a
* stack overflow (see issue #14477).
*
* We are lucky that in the grammar for DynamoDB expressions (below),
* only a few specific rules can recurse, so it was fairly easy to add a
* "depth" counter to a few specific rules, and then use a predicate
* "{depth<MAX_DEPTH}?" to avoid parsing if the depth exceeds this limit,
* and throw a FAILED_PREDICATE_EXCEPTION in that case, which we will
* report to the user as a "expression nested too deeply" error.
*/
@parser::members {
static constexpr int MAX_DEPTH = 400;
}
/*
* Lexical analysis phase, i.e., splitting the input up to tokens.
* Lexical analyzer rules have names starting in capital letters.
* "fragment" rules do not generate tokens, and are just aliases used to
* make other rules more readable.
* Characters *not* listed here, e.g., '=', '(', etc., will be handled
* as individual tokens on their own right.
* Whitespace spans are skipped, so do not generate tokens.
*/
WHITESPACE: (' ' | '\t' | '\n' | '\r')+ { skip(); };
/* shortcuts for case-insensitive keywords */
fragment A:('a'|'A');
fragment B:('b'|'B');
fragment C:('c'|'C');
fragment D:('d'|'D');
fragment E:('e'|'E');
fragment F:('f'|'F');
fragment G:('g'|'G');
fragment H:('h'|'H');
fragment I:('i'|'I');
fragment J:('j'|'J');
fragment K:('k'|'K');
fragment L:('l'|'L');
fragment M:('m'|'M');
fragment N:('n'|'N');
fragment O:('o'|'O');
fragment P:('p'|'P');
fragment Q:('q'|'Q');
fragment R:('r'|'R');
fragment S:('s'|'S');
fragment T:('t'|'T');
fragment U:('u'|'U');
fragment V:('v'|'V');
fragment W:('w'|'W');
fragment X:('x'|'X');
fragment Y:('y'|'Y');
fragment Z:('z'|'Z');
/* These keywords must be appear before the generic NAME token below,
* because NAME matches too, and the first to match wins.
*/
SET: S E T;
REMOVE: R E M O V E;
ADD: A D D;
DELETE: D E L E T E;
AND: A N D;
OR: O R;
NOT: N O T;
BETWEEN: B E T W E E N;
IN: I N;
fragment ALPHA: 'A'..'Z' | 'a'..'z';
fragment DIGIT: '0'..'9';
fragment ALNUM: ALPHA | DIGIT | '_';
INTEGER: DIGIT+;
NAME: ALPHA ALNUM*;
NAMEREF: '#' ALNUM+;
VALREF: ':' ALNUM+;
/*
* Parsing phase - parsing the string of tokens generated by the lexical
* analyzer defined above.
*/
path_component: NAME | NAMEREF;
path returns [parsed::path p]:
root=path_component { $p.set_root($root.text); }
( '.' name=path_component { $p.add_dot($name.text); }
| '[' INTEGER ']' {
try {
$p.add_index(std::stoi($INTEGER.text));
} catch(std::out_of_range&) {
throw expressions_syntax_error("list index out of integer range");
}
}
)*;
/* See comment above why the "depth" counter was needed here */
value[int depth] returns [parsed::value v]:
VALREF { $v.set_valref($VALREF.text); }
| path { $v.set_path($path.p); }
| {depth<MAX_DEPTH}? NAME { $v.set_func_name($NAME.text); }
'(' x=value[depth+1] { $v.add_func_parameter($x.v); }
(',' x=value[depth+1] { $v.add_func_parameter($x.v); })*
')'
;
update_expression_set_rhs returns [parsed::set_rhs rhs]:
v=value[0] { $rhs.set_value(std::move($v.v)); }
( '+' v=value[0] { $rhs.set_plus(std::move($v.v)); }
| '-' v=value[0] { $rhs.set_minus(std::move($v.v)); }
)?
;
update_expression_set_action returns [parsed::update_expression::action a]:
path '=' rhs=update_expression_set_rhs { $a.assign_set($path.p, $rhs.rhs); };
update_expression_remove_action returns [parsed::update_expression::action a]:
path { $a.assign_remove($path.p); };
update_expression_add_action returns [parsed::update_expression::action a]:
path VALREF { $a.assign_add($path.p, $VALREF.text); };
update_expression_delete_action returns [parsed::update_expression::action a]:
path VALREF { $a.assign_del($path.p, $VALREF.text); };
update_expression_clause returns [parsed::update_expression e]:
SET s=update_expression_set_action { $e.add(s); }
(',' s=update_expression_set_action { $e.add(s); })*
| REMOVE r=update_expression_remove_action { $e.add(r); }
(',' r=update_expression_remove_action { $e.add(r); })*
| ADD a=update_expression_add_action { $e.add(a); }
(',' a=update_expression_add_action { $e.add(a); })*
| DELETE d=update_expression_delete_action { $e.add(d); }
(',' d=update_expression_delete_action { $e.add(d); })*
;
// Note the "EOF" token at the end of the update expression. We want to the
// parser to match the entire string given to it - not just its beginning!
update_expression returns [parsed::update_expression e]:
(update_expression_clause { e.append($update_expression_clause.e); })+ EOF;
projection_expression returns [std::vector<parsed::path> v]:
p=path { $v.push_back(std::move($p.p)); }
(',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
primitive_condition returns [parsed::primitive_condition c]:
v=value[0] { $c.add_value(std::move($v.v));
$c.set_operator(parsed::primitive_condition::type::VALUE); }
( ( '=' { $c.set_operator(parsed::primitive_condition::type::EQ); }
| '<' '>' { $c.set_operator(parsed::primitive_condition::type::NE); }
| '<' { $c.set_operator(parsed::primitive_condition::type::LT); }
| '<' '=' { $c.set_operator(parsed::primitive_condition::type::LE); }
| '>' { $c.set_operator(parsed::primitive_condition::type::GT); }
| '>' '=' { $c.set_operator(parsed::primitive_condition::type::GE); }
)
v=value[0] { $c.add_value(std::move($v.v)); }
| BETWEEN { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
v=value[0] { $c.add_value(std::move($v.v)); }
AND
v=value[0] { $c.add_value(std::move($v.v)); }
| IN '(' { $c.set_operator(parsed::primitive_condition::type::IN); }
v=value[0] { $c.add_value(std::move($v.v)); }
(',' v=value[0] { $c.add_value(std::move($v.v)); })*
')'
)?
{
// Post-parse check to reject non-function single values
if ($c._op == parsed::primitive_condition::type::VALUE &&
!$c._values.front().is_func()) {
throw expressions_syntax_error("Single value must be a function");
}
}
;
// The following rules for parsing boolean expressions are verbose and
// somewhat strange because of Antlr 3's limitations on recursive rules,
// common rule prefixes, and (lack of) support for operator precedence.
// These rules could have been written more clearly using a more powerful
// parser generator - such as Yacc.
// See comment above why the "depth" counter was needed here.
boolean_expression[int depth] returns [parsed::condition_expression e]:
b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); }
(OR b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); } )*
;
boolean_expression_1[int depth] returns [parsed::condition_expression e]:
b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); }
(AND b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); } )*
;
boolean_expression_2[int depth] returns [parsed::condition_expression e]:
p=primitive_condition { $e.set_primitive(std::move($p.c)); }
| {depth<MAX_DEPTH}? NOT b=boolean_expression_2[depth+1] { $e = std::move($b.e); $e.apply_not(); }
| {depth<MAX_DEPTH}? '(' b=boolean_expression[depth+1] ')' { $e = std::move($b.e); }
;
condition_expression returns [parsed::condition_expression e]:
boolean_expression[0] { e=std::move($boolean_expression.e); } EOF;