/*
 * Copyright 2019-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

/*
 * The DynamoDB protocol is based on JSON, and most DynamoDB requests
 * describe the operation and its parameters via JSON objects such as maps
 * and lists. Nevertheless, in some types of requests an "expression" is
 * passed as a single string, and we need to parse this string. These
 * cases include:
 *  1. Attribute paths, such as "a[3].b.c", are used in projection
 *     expressions as well as inside other expressions described below.
 *  2. Condition expressions, such as "(NOT (a=b OR c=d)) AND e=f",
 *     used in conditional updates, filters, and other places.
 *  3. Update expressions, such as "SET #a.b = :x, c = :y DELETE d"
 *
 * All these expression syntaxes are very simple: Most of them could be
 * parsed as regular expressions, and the parenthesized condition expression
 * could be done with a simple hand-written lexical analyzer and recursive-
 * descent parser. Nevertheless, we decided to specify these parsers in the
 * ANTLR3 language already used in the Scylla project, hopefully making these
 * parsers easier to reason about, and easier to change if needed - and
 * reducing the amount of boiler-plate code.
 */

grammar expressions;

options {
    language = Cpp;
}

@parser::namespace{alternator}
@lexer::namespace{alternator}

/* TODO: explain what these traits things are. I haven't seen them explained
 * in any document... Compilation fails without them because a definition
 * of "expressionsLexerTraits" and "expressionsParserTraits" is needed.
 *
 * The lexer traits are an instantiation of the antlr3::Traits template over
 * the generated lexer and parser classes (forward-declared here because the
 * typedef appears before their definitions); the parser traits are simply
 * an alias of the lexer traits.
 */
@lexer::traits {
    class expressionsLexer;
    class expressionsParser;
    typedef antlr3::Traits<expressionsLexer, expressionsParser> expressionsLexerTraits;
}
@parser::traits {
    typedef expressionsLexerTraits expressionsParserTraits;
}

@lexer::header {
#include "alternator/expressions.hh"
// ANTLR generates a bunch of unused variables and functions. Yuck...
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-function"
}

@parser::header {
#include "expressionsLexer.hpp"
}

/* By default, ANTLR3 composes elaborate syntax-error messages, saying which
 * token was unexpected, where, and so on, but then dutifully writes these
 * error messages to the standard error, and returns from the parser as if
 * everything was fine, with a half-constructed output object! If we define
 * the "displayRecognitionError" method, it will be called upon to build this
 * error message, and we can instead throw an exception to stop the parsing
 * immediately. This is good enough for now, for our simple needs, but if
 * we ever want to show more information about the syntax error, Cql3.g
 * contains an elaborate implementation (it would be nice if we could reuse
 * it, not duplicate it).
 * Unfortunately, we have to repeat the same definition twice - once for the
 * parser, and once for the lexer.
 */
@parser::context {
    // Parser-side error hook: never returns normally, always throws
    // expressions_syntax_error. token_names is unused.
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
        const char* err;
        // A failed semantic predicate gets its own message; every other
        // recognition error is reported as a plain syntax error.
        switch (ex->getType()) {
        case antlr3::ExceptionType::FAILED_PREDICATE_EXCEPTION:
            err = "expression nested too deeply";
            break;
        default:
            err = "syntax error";
            break;
        }
        // Alternator expressions are always single line so ex->get_line()
        // is always 1, no sense to print it.
        // TODO: return the position as part of the exception, so the
        // caller in expressions.cc that knows the expression string can
        // mark the error position in the final error message.
        throw expressions_syntax_error(format("{} at char {}", err, ex->get_charPositionInLine()));
    }
}
@lexer::context {
    // Lexer-side error hook: always throws, with no position information.
    // Neither parameter is used.
    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
        throw expressions_syntax_error("syntax error");
    }
}

/* Unfortunately, ANTLR uses recursion - not the heap - to parse recursive
 * expressions. To make things even worse, ANTLR has no way to limit the
 * depth of this recursion (unlike Yacc which has YYMAXDEPTH).
So deeply-nested expressions like "(((((((((((((..." can easily crash
 * Scylla on a stack overflow (see issue #14477).
 *
 * We are lucky that in the grammar for DynamoDB expressions (below),
 * only a few specific rules can recurse, so it was fairly easy to add a
 * "depth" counter to a few specific rules, and then use a predicate
 * "{depth
 *
 * NOTE(review): the source text is damaged at this point. The rest of the
 * sentence above, the original end of this comment, and everything between
 * it and the middle of the header of the next surviving rule are missing
 * and must be restored from version control. The surviving tail of that
 * rule is kept verbatim here, inside the comment, because it is not a
 * complete rule on its own (it appends each parsed path to a return value
 * named v):
 *
 *      v]: p=path { $v.push_back(std::move($p.p)); }
 *         (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
 *
 * The comment is terminated below so that the intact rules which follow are
 * read as grammar again instead of being swallowed by it. Be aware that
 * they refer to rules and tokens (path, value, BETWEEN, AND, IN) whose
 * definitions are not present in the surviving text.
 */

// A primitive condition is a single value, optionally followed by exactly
// one of:
//  * a comparison operator (=, <>, <, <=, >, >=) and a second value,
//  * BETWEEN value AND value,
//  * IN '(' value (',' value)* ')'.
// The operator is first set to VALUE and is overridden when one of the
// optional forms follows. Operands are accumulated, in source order, with
// add_value(). Every operand is parsed as value[0]; the argument is
// presumably value's initial nesting depth - confirm against the value rule.
primitive_condition returns [parsed::primitive_condition c]:
    v=value[0] { $c.add_value(std::move($v.v));
                 $c.set_operator(parsed::primitive_condition::type::VALUE); }
    (   (   '='     { $c.set_operator(parsed::primitive_condition::type::EQ); }
        |   '<' '>' { $c.set_operator(parsed::primitive_condition::type::NE); }
        |   '<'     { $c.set_operator(parsed::primitive_condition::type::LT); }
        |   '<' '=' { $c.set_operator(parsed::primitive_condition::type::LE); }
        |   '>'     { $c.set_operator(parsed::primitive_condition::type::GT); }
        |   '>' '=' { $c.set_operator(parsed::primitive_condition::type::GE); }
        )
        v=value[0] { $c.add_value(std::move($v.v)); }
    |   BETWEEN { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
        v=value[0] { $c.add_value(std::move($v.v)); }
        AND
        v=value[0] { $c.add_value(std::move($v.v)); }
    |   IN '(' { $c.set_operator(parsed::primitive_condition::type::IN); }
        v=value[0] { $c.add_value(std::move($v.v)); }
        (',' v=value[0] { $c.add_value(std::move($v.v)); })*
        ')'
    )?
    ;

// The following rules for parsing boolean expressions are verbose and
// somewhat strange because of Antlr 3's limitations on recursive rules,
// common rule prefixes, and (lack of) support for operator precedence.
// These rules could have been written more clearly using a more powerful
// parser generator - such as Yacc.
// See comment above why the "depth" counter was needed here.
// Top level of a condition: one or more boolean_expression_1 operands
// separated by OR. Each operand is appended to the result with the '|'
// tag, and the caller's depth is handed down unchanged.
boolean_expression[int depth] returns [parsed::condition_expression e]:
      lhs=boolean_expression_1[depth]
        { $e.append(std::move($lhs.e), '|'); }
      ( OR rhs=boolean_expression_1[depth]
        { $e.append(std::move($rhs.e), '|'); }
      )*
    ;

// Second level: one or more boolean_expression_2 operands separated by AND.
// Each operand is appended to the result with the '&' tag, again with the
// same depth.
boolean_expression_1[int depth] returns [parsed::condition_expression e]:
      lhs=boolean_expression_2[depth]
        { $e.append(std::move($lhs.e), '&'); }
      ( AND rhs=boolean_expression_2[depth]
        { $e.append(std::move($rhs.e), '&'); }
      )*
    ;

// Innermost level. The first alternative wraps a single primitive condition.
// NOTE(review): this rule is cut off in the visible text right after
// "{depth"; what is visible is reproduced token-for-token and the remainder
// is left untouched.
boolean_expression_2[int depth] returns [parsed::condition_expression e]:
      p=primitive_condition { $e.set_primitive(std::move($p.c)); }
    | {depth