mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-22 07:42:16 +00:00
309 lines
12 KiB
Plaintext
309 lines
12 KiB
Plaintext
/*
|
|
* Copyright 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
/*
|
|
* The DynamoDB protocol is based on JSON, and most DynamoDB requests
|
|
* describe the operation and its parameters via JSON objects such as maps
|
|
* and lists. Nevertheless, in some types of requests an "expression" is
|
|
* passed as a single string, and we need to parse this string. These
|
|
* cases include:
|
|
* 1. Attribute paths, such as "a[3].b.c", are used in projection
|
|
* expressions as well as inside other expressions described below.
|
|
* 2. Condition expressions, such as "(NOT (a=b OR c=d)) AND e=f",
|
|
* used in conditional updates, filters, and other places.
|
|
* 3. Update expressions, such as "SET #a.b = :x, c = :y DELETE d"
|
|
*
|
|
* All these expression syntaxes are very simple: Most of them could be
|
|
* parsed as regular expressions, and the parenthesized condition expression
|
|
* could be done with a simple hand-written lexical analyzer and recursive-
|
|
* descent parser. Nevertheless, we decided to specify these parsers in the
|
|
* ANTLR3 language already used in the Scylla project, hopefully making these
|
|
* parsers easier to reason about, and easier to change if needed - and
|
|
* reducing the amount of boiler-plate code.
|
|
*/
|
|
|
|
grammar expressions;
|
|
|
|
options {
|
|
language = Cpp;
|
|
}
|
|
|
|
@parser::namespace{alternator}
|
|
@lexer::namespace{alternator}
|
|
|
|
/* TODO: explain what these traits things are. I haven't seen them explained
|
|
* in any document... Compilation fails without these fail because a definition
|
|
* of "expressionsLexerTraits" and "expressionParserTraits" is needed.
|
|
*/
|
|
@lexer::traits {
|
|
class expressionsLexer;
|
|
class expressionsParser;
|
|
typedef antlr3::Traits<expressionsLexer, expressionsParser> expressionsLexerTraits;
|
|
}
|
|
@parser::traits {
|
|
typedef expressionsLexerTraits expressionsParserTraits;
|
|
}
|
|
|
|
@lexer::header {
|
|
#include "alternator/expressions.hh"
|
|
// ANTLR generates a bunch of unused variables and functions. Yuck...
|
|
#pragma GCC diagnostic ignored "-Wunused-variable"
|
|
#pragma GCC diagnostic ignored "-Wunused-function"
|
|
}
|
|
@parser::header {
|
|
#include "expressionsLexer.hpp"
|
|
}
|
|
|
|
/* By default, ANTLR3 composes elaborate syntax-error messages, saying which
|
|
* token was unexpected, where, and so on on, but then dutifully writes these
|
|
* error messages to the standard error, and returns from the parser as if
|
|
* everything was fine, with a half-constructed output object! If we define
|
|
* the "displayRecognitionError" method, it will be called upon to build this
|
|
* error message, and we can instead throw an exception to stop the parsing
|
|
* immediately. This is good enough for now, for our simple needs, but if
|
|
* we ever want to show more information about the syntax error, Cql3.g
|
|
* contains an elaborate implementation (it would be nice if we could reuse
|
|
* it, not duplicate it).
|
|
* Unfortunately, we have to repeat the same definition twice - once for the
|
|
* parser, and once for the lexer.
|
|
*/
|
|
@parser::context {
|
|
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
|
|
const char* err;
|
|
switch (ex->getType()) {
|
|
case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
|
|
err = "expression nested too deeply";
|
|
break;
|
|
default:
|
|
err = "syntax error";
|
|
break;
|
|
}
|
|
// Alternator expressions are always single line so ex->get_line()
|
|
// is always 1, no sense to print it.
|
|
// TODO: return the position as part of the exception, so the
|
|
// caller in expressions.cc that knows the expression string can
|
|
// mark the error position in the final error message.
|
|
throw expressions_syntax_error(format("{} at char {}", err,
|
|
ex->get_charPositionInLine()));
|
|
}
|
|
|
|
// ANTLR3 tries to recover missing tokens - it tries to finish parsing
|
|
// and create valid objects, as if the missing token was there.
|
|
// But it has a bug and leaks these tokens.
|
|
// We override offending method and handle abandoned pointers.
|
|
std::vector<std::unique_ptr<TokenType>> _missing_tokens;
|
|
TokenType* getMissingSymbol(IntStreamType* istream, ExceptionBaseType* e,
|
|
ANTLR_UINT32 expectedTokenType, BitsetListType* follow) {
|
|
auto token = BaseType::getMissingSymbol(istream, e, expectedTokenType, follow);
|
|
_missing_tokens.emplace_back(token);
|
|
return token;
|
|
}
|
|
}
|
|
@lexer::context {
|
|
void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
|
|
throw expressions_syntax_error("syntax error");
|
|
}
|
|
}
|
|
|
|
/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
|
|
* expressions. To make things even worse, ANTLR has no way to limit the
|
|
* depth of this recursion (unlike Yacc which has YYMAXDEPTH). So deeply-
|
|
* nested expression like "(((((((((((((..." can easily crash Scylla on a
|
|
* stack overflow (see issue #14477).
|
|
*
|
|
* We are lucky that in the grammar for DynamoDB expressions (below),
|
|
* only a few specific rules can recurse, so it was fairly easy to add a
|
|
* "depth" counter to a few specific rules, and then use a predicate
|
|
* "{depth<MAX_DEPTH}?" to avoid parsing if the depth exceeds this limit,
|
|
* and throw a FAILED_PREDICATE_EXCEPTION in that case, which we will
|
|
* report to the user as a "expression nested too deeply" error.
|
|
*/
|
|
@parser::members {
|
|
static constexpr int MAX_DEPTH = 400;
|
|
}
|
|
|
|
/*
|
|
* Lexical analysis phase, i.e., splitting the input up to tokens.
|
|
* Lexical analyzer rules have names starting in capital letters.
|
|
* "fragment" rules do not generate tokens, and are just aliases used to
|
|
* make other rules more readable.
|
|
* Characters *not* listed here, e.g., '=', '(', etc., will be handled
|
|
* as individual tokens on their own right.
|
|
* Whitespace spans are skipped, so do not generate tokens.
|
|
*/
|
|
WHITESPACE: (' ' | '\t' | '\n' | '\r')+ { skip(); };
|
|
|
|
/* shortcuts for case-insensitive keywords */
|
|
fragment A:('a'|'A');
|
|
fragment B:('b'|'B');
|
|
fragment C:('c'|'C');
|
|
fragment D:('d'|'D');
|
|
fragment E:('e'|'E');
|
|
fragment F:('f'|'F');
|
|
fragment G:('g'|'G');
|
|
fragment H:('h'|'H');
|
|
fragment I:('i'|'I');
|
|
fragment J:('j'|'J');
|
|
fragment K:('k'|'K');
|
|
fragment L:('l'|'L');
|
|
fragment M:('m'|'M');
|
|
fragment N:('n'|'N');
|
|
fragment O:('o'|'O');
|
|
fragment P:('p'|'P');
|
|
fragment Q:('q'|'Q');
|
|
fragment R:('r'|'R');
|
|
fragment S:('s'|'S');
|
|
fragment T:('t'|'T');
|
|
fragment U:('u'|'U');
|
|
fragment V:('v'|'V');
|
|
fragment W:('w'|'W');
|
|
fragment X:('x'|'X');
|
|
fragment Y:('y'|'Y');
|
|
fragment Z:('z'|'Z');
|
|
/* These keywords must be appear before the generic NAME token below,
|
|
* because NAME matches too, and the first to match wins.
|
|
*/
|
|
SET: S E T;
|
|
REMOVE: R E M O V E;
|
|
ADD: A D D;
|
|
DELETE: D E L E T E;
|
|
|
|
AND: A N D;
|
|
OR: O R;
|
|
NOT: N O T;
|
|
BETWEEN: B E T W E E N;
|
|
IN: I N;
|
|
|
|
fragment ALPHA: 'A'..'Z' | 'a'..'z';
|
|
fragment DIGIT: '0'..'9';
|
|
fragment ALNUM: ALPHA | DIGIT | '_';
|
|
INTEGER: DIGIT+;
|
|
NAME: ALPHA ALNUM*;
|
|
NAMEREF: '#' ALNUM+;
|
|
VALREF: ':' ALNUM+;
|
|
|
|
/*
|
|
* Parsing phase - parsing the string of tokens generated by the lexical
|
|
* analyzer defined above.
|
|
*/
|
|
|
|
path_component: NAME | NAMEREF;
|
|
path returns [parsed::path p]:
|
|
root=path_component { $p.set_root($root.text); }
|
|
( '.' name=path_component { $p.add_dot($name.text); }
|
|
| '[' INTEGER ']' {
|
|
try {
|
|
$p.add_index(std::stoi($INTEGER.text));
|
|
} catch(std::out_of_range&) {
|
|
throw expressions_syntax_error("list index out of integer range");
|
|
}
|
|
}
|
|
)*;
|
|
|
|
/* See comment above why the "depth" counter was needed here */
|
|
value[int depth] returns [parsed::value v]:
|
|
VALREF { $v.set_valref($VALREF.text); }
|
|
| path { $v.set_path($path.p); }
|
|
| {depth<MAX_DEPTH}? NAME { $v.set_func_name($NAME.text); }
|
|
'(' x=value[depth+1] { $v.add_func_parameter($x.v); }
|
|
(',' x=value[depth+1] { $v.add_func_parameter($x.v); })*
|
|
')'
|
|
;
|
|
|
|
update_expression_set_rhs returns [parsed::set_rhs rhs]:
|
|
v=value[0] { $rhs.set_value(std::move($v.v)); }
|
|
( '+' v=value[0] { $rhs.set_plus(std::move($v.v)); }
|
|
| '-' v=value[0] { $rhs.set_minus(std::move($v.v)); }
|
|
)?
|
|
;
|
|
|
|
update_expression_set_action returns [parsed::update_expression::action a]:
|
|
path '=' rhs=update_expression_set_rhs { $a.assign_set($path.p, $rhs.rhs); };
|
|
|
|
update_expression_remove_action returns [parsed::update_expression::action a]:
|
|
path { $a.assign_remove($path.p); };
|
|
|
|
update_expression_add_action returns [parsed::update_expression::action a]:
|
|
path VALREF { $a.assign_add($path.p, $VALREF.text); };
|
|
|
|
update_expression_delete_action returns [parsed::update_expression::action a]:
|
|
path VALREF { $a.assign_del($path.p, $VALREF.text); };
|
|
|
|
update_expression_clause returns [parsed::update_expression e]:
|
|
SET s=update_expression_set_action { $e.add(s); }
|
|
(',' s=update_expression_set_action { $e.add(s); })*
|
|
| REMOVE r=update_expression_remove_action { $e.add(r); }
|
|
(',' r=update_expression_remove_action { $e.add(r); })*
|
|
| ADD a=update_expression_add_action { $e.add(a); }
|
|
(',' a=update_expression_add_action { $e.add(a); })*
|
|
| DELETE d=update_expression_delete_action { $e.add(d); }
|
|
(',' d=update_expression_delete_action { $e.add(d); })*
|
|
;
|
|
|
|
// Note the "EOF" token at the end of the update expression. We want to the
|
|
// parser to match the entire string given to it - not just its beginning!
|
|
update_expression returns [parsed::update_expression e]:
|
|
(update_expression_clause { e.append($update_expression_clause.e); })+ EOF;
|
|
|
|
projection_expression returns [std::vector<parsed::path> v]:
|
|
p=path { $v.push_back(std::move($p.p)); }
|
|
(',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
|
|
|
|
|
|
primitive_condition returns [parsed::primitive_condition c]:
|
|
v=value[0] { $c.add_value(std::move($v.v));
|
|
$c.set_operator(parsed::primitive_condition::type::VALUE); }
|
|
( ( '=' { $c.set_operator(parsed::primitive_condition::type::EQ); }
|
|
| '<' '>' { $c.set_operator(parsed::primitive_condition::type::NE); }
|
|
| '<' { $c.set_operator(parsed::primitive_condition::type::LT); }
|
|
| '<' '=' { $c.set_operator(parsed::primitive_condition::type::LE); }
|
|
| '>' { $c.set_operator(parsed::primitive_condition::type::GT); }
|
|
| '>' '=' { $c.set_operator(parsed::primitive_condition::type::GE); }
|
|
)
|
|
v=value[0] { $c.add_value(std::move($v.v)); }
|
|
| BETWEEN { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
|
|
v=value[0] { $c.add_value(std::move($v.v)); }
|
|
AND
|
|
v=value[0] { $c.add_value(std::move($v.v)); }
|
|
| IN '(' { $c.set_operator(parsed::primitive_condition::type::IN); }
|
|
v=value[0] { $c.add_value(std::move($v.v)); }
|
|
(',' v=value[0] { $c.add_value(std::move($v.v)); })*
|
|
')'
|
|
)?
|
|
{
|
|
// Post-parse check to reject non-function single values
|
|
if ($c._op == parsed::primitive_condition::type::VALUE &&
|
|
!$c._values.front().is_func()) {
|
|
throw expressions_syntax_error("Single value must be a function");
|
|
}
|
|
}
|
|
;
|
|
|
|
// The following rules for parsing boolean expressions are verbose and
|
|
// somewhat strange because of Antlr 3's limitations on recursive rules,
|
|
// common rule prefixes, and (lack of) support for operator precedence.
|
|
// These rules could have been written more clearly using a more powerful
|
|
// parser generator - such as Yacc.
|
|
// See comment above why the "depth" counter was needed here.
|
|
boolean_expression[int depth] returns [parsed::condition_expression e]:
|
|
b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); }
|
|
(OR b=boolean_expression_1[depth] { $e.append(std::move($b.e), '|'); } )*
|
|
;
|
|
boolean_expression_1[int depth] returns [parsed::condition_expression e]:
|
|
b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); }
|
|
(AND b=boolean_expression_2[depth] { $e.append(std::move($b.e), '&'); } )*
|
|
;
|
|
boolean_expression_2[int depth] returns [parsed::condition_expression e]:
|
|
p=primitive_condition { $e.set_primitive(std::move($p.c)); }
|
|
| {depth<MAX_DEPTH}? NOT b=boolean_expression_2[depth+1] { $e = std::move($b.e); $e.apply_not(); }
|
|
| {depth<MAX_DEPTH}? '(' b=boolean_expression[depth+1] ')' { $e = std::move($b.e); }
|
|
;
|
|
|
|
condition_expression returns [parsed::condition_expression e]:
|
|
boolean_expression[0] { e=std::move($boolean_expression.e); } EOF;
|