From a810e57684fa98185ef6d39cc59377eb4031df38 Mon Sep 17 00:00:00 2001 From: Piotr Sarna Date: Mon, 15 Feb 2021 13:14:10 +0100 Subject: [PATCH] Merge 'Alternator: support nested attribute paths... in all expressions' from Nadav Har'El. This series fixes #5024 - which is about adding support for nested attribute paths (e.g., a.b.c[2]) to Alternator. The series adds complete support for this feature in ProjectionExpression, ConditionExpression, FilterExpression and UpdateExpression - and also its combination with ReturnValues. Many relevant tests - and also some new tests added in this series - now pass. The first patch in the series fixes #8043 a bug in some error cases in conditions, which was discovered while working in this series, and is conceptually separate from the rest of the series. Closes #8066 * github.com:scylladb/scylla: alternator: correct implemention of UpdateItem with nested attributes and ReturnValues alternator: fix bug in ReturnValues=UPDATED_NEW alternator: implemented nested attribute paths in UpdateExpression alternator: limit the depth of nested paths alternator: prepare for UpdateItem nested attribute paths alternator: overhaul ProjectionExpression hierarchy implementation alternator: make parsed::path object printable alternator-test: a few more ProjectionExpression conflict test cases alternator-test: improve tests for nested attributes in UpdateExpression alternator: support attribute paths in ConditionExpression, FilterExpression alternator-test: improve tests for nested attributes in ConditionExpression alternator: support attribute paths in ProjectionExpression alternator: overhaul attrs_to_get handling alternator-test: additional tests for attribute paths in ProjectionExpression alternator-test: harden attribute-path tests for ProjectionExpression alternator: fix ValidationException in FilterExpression - and more (cherry picked from commit cbbb7f08a04bd16d206aab7f56cab091f747c444) --- alternator/executor.cc | 612 ++++++++++++++---- 
alternator/executor.hh | 76 ++- alternator/expressions.cc | 111 +++- alternator/expressions_types.hh | 15 + alternator/streams.cc | 10 +- docs/alternator/alternator.md | 31 +- docs/alternator/compatibility.md | 6 - test/alternator/conftest.py | 16 + test/alternator/test_condition_expression.py | 23 +- test/alternator/test_filter_expression.py | 2 - test/alternator/test_gsi.py | 27 + test/alternator/test_item.py | 7 + test/alternator/test_projection_expression.py | 91 ++- test/alternator/test_returnvalues.py | 35 +- test/alternator/test_update_expression.py | 194 +++++- 15 files changed, 1038 insertions(+), 218 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index 4d8b6121c8..2bac0c02b7 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -202,7 +202,7 @@ static schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& r if (!schema) { // if we get here then the name was missing, since syntax or missing actual CF // checks throw. Slow path, but just call get_table_name to generate exception. - get_table_name(request); + get_table_name(request); } return schema; } @@ -1882,18 +1882,182 @@ static std::string get_item_type_string(const rjson::value& v) { return it->name.GetString(); } +// attrs_to_get saves for each top-level attribute an attrs_to_get_node, +// a hierarchy of subparts that need to be kept. The following function +// takes a given JSON value and drops its parts which weren't asked to be +// kept. It modifies the given JSON value, or returns false to signify that +// the entire object should be dropped. +// Note that The JSON value is assumed to be encoded using the DynamoDB +// conventions - i.e., it is really a map whose key has a type string, +// and the value is the real object. +template +static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node& h) { + if (!val.IsObject() || val.MemberCount() != 1) { + // This shouldn't happen. We shouldn't have stored malformed objects. 
+ // But today Alternator does not validate the structure of nested + // documents before storing them, so this can happen on read. + throw api_error::internal(format("Malformed value object read: {}", val)); + } + const char* type = val.MemberBegin()->name.GetString(); + rjson::value& v = val.MemberBegin()->value; + if (h.has_members()) { + const auto& members = h.get_members(); + if (type[0] != 'M' || !v.IsObject()) { + // If v is not an object (dictionary, map), none of the members + // can match. + return false; + } + rjson::value newv = rjson::empty_object(); + for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) { + std::string attr = it->name.GetString(); + auto x = members.find(attr); + if (x != members.end()) { + if (x->second) { + // Only a part of this attribute is to be filtered, do it. + if (hierarchy_filter(it->value, *x->second)) { + rjson::set_with_string_name(newv, attr, std::move(it->value)); + } + } else { + // The entire attribute is to be kept + rjson::set_with_string_name(newv, attr, std::move(it->value)); + } + } + } + if (newv.MemberCount() == 0) { + return false; + } + v = newv; + } else if (h.has_indexes()) { + const auto& indexes = h.get_indexes(); + if (type[0] != 'L' || !v.IsArray()) { + return false; + } + rjson::value newv = rjson::empty_array(); + const auto& a = v.GetArray(); + for (unsigned i = 0; i < v.Size(); i++) { + auto x = indexes.find(i); + if (x != indexes.end()) { + if (x->second) { + if (hierarchy_filter(a[i], *x->second)) { + rjson::push_back(newv, std::move(a[i])); + } + } else { + // The entire attribute is to be kept + rjson::push_back(newv, std::move(a[i])); + } + } + } + if (newv.Size() == 0) { + return false; + } + v = newv; + } + return true; +} + +// Add a path to a attribute_path_map. Throws a validation error if the path +// "overlaps" with one already in the filter (one is a sub-path of the other) +// or "conflicts" with it (both a member and index is requested). 
+template +void attribute_path_map_add(const char* source, attribute_path_map& map, const parsed::path& p, T value = {}) { + using node = attribute_path_map_node; + // The first step is to look for the top-level attribute (p.root()): + auto it = map.find(p.root()); + if (it == map.end()) { + if (p.has_operators()) { + it = map.emplace(p.root(), node {std::nullopt}).first; + } else { + (void) map.emplace(p.root(), node {std::move(value)}).first; + // Value inserted for top-level node. We're done. + return; + } + } else if(!p.has_operators()) { + // If p is top-level and we already have it or a part of it + // in map, it's a forbidden overlapping path. + throw api_error::validation(format( + "Invalid {}: two document paths overlap at {}", source, p.root())); + } else if (it->second.has_value()) { + // If we're here, it != map.end() && p.has_operators && it->second.has_value(). + // This means the top-level attribute already has a value, and we're + // trying to add a non-top-level value. It's an overlap. 
+ throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p.root())); + } + node* h = &it->second; + // The second step is to walk h from the top-level node to the inner node + // where we're supposed to insert the value: + for (const auto& op : p.operators()) { + std::visit(overloaded_functor { + [&] (const std::string& member) { + if (h->is_empty()) { + *h = node {typename node::members_t()}; + } else if (h->has_indexes()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::members_t& members = h->get_members(); + auto it = members.find(member); + if (it == members.end()) { + it = members.insert({member, make_shared()}).first; + } + h = it->second.get(); + }, + [&] (unsigned index) { + if (h->is_empty()) { + *h = node {typename node::indexes_t()}; + } else if (h->has_members()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::indexes_t& indexes = h->get_indexes(); + auto it = indexes.find(index); + if (it == indexes.end()) { + it = indexes.insert({index, make_shared()}).first; + } + h = it->second.get(); + } + }, op); + } + // Finally, insert the value in the node h. + if (h->is_empty()) { + *h = node {std::move(value)}; + } else { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } +} + +// A very simplified version of the above function for the special case of +// adding only top-level attribute. It's not only simpler, we also use a +// different error message, referring to a "duplicate attribute"instead of +// "overlapping paths". 
DynamoDB also has this distinction (errors in +// AttributesToGet refer to duplicates, not overlaps, but errors in +// ProjectionExpression refer to overlap - even if it's an exact duplicate). +template +void attribute_path_map_add(const char* source, attribute_path_map& map, const std::string& attr, T value = {}) { + using node = attribute_path_map_node; + auto it = map.find(attr); + if (it == map.end()) { + map.emplace(attr, node {std::move(value)}); + } else { + throw api_error::validation(format( + "Invalid {}: Duplicate attribute: {}", source, attr)); + } +} + // calculate_attrs_to_get() takes either AttributesToGet or // ProjectionExpression parameters (having both is *not* allowed), // and returns the list of cells we need to read, or an empty set when // *all* attributes are to be returned. -// In our current implementation, only top-level attributes are stored -// as cells, and nested documents are stored serialized as JSON. -// So this function currently returns only the the top-level attributes -// but we also need to add, after the query, filtering to keep only -// the parts of the JSON attributes that were chosen in the paths' -// operators. Because we don't have such filtering yet (FIXME), we fail here -// if the requested paths are anything but top-level attributes. -std::unordered_set calculate_attrs_to_get(const rjson::value& req, std::unordered_set& used_attribute_names) { +// However, in our current implementation, only top-level attributes are +// stored as separate cells - a nested document is stored serialized together +// (as JSON) in the same cell. So this function return a map - each key is the +// top-level attribute we will need need to read, and the value for each +// top-level attribute is the partial hierarchy (struct hierarchy_filter) +// that we will need to extract from that serialized JSON. +// For example, if ProjectionExpression lists a.b and a.c[2], we +// return one top-level attribute name, "a", with the value "{b, c[2]}". 
+static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unordered_set& used_attribute_names) { const bool has_attributes_to_get = req.HasMember("AttributesToGet"); const bool has_projection_expression = req.HasMember("ProjectionExpression"); if (has_attributes_to_get && has_projection_expression) { @@ -1902,9 +2066,9 @@ std::unordered_set calculate_attrs_to_get(const rjson::value& req, } if (has_attributes_to_get) { const rjson::value& attributes_to_get = req["AttributesToGet"]; - std::unordered_set ret; + attrs_to_get ret; for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) { - ret.insert(it->GetString()); + attribute_path_map_add("AttributesToGet", ret, it->GetString()); } return ret; } else if (has_projection_expression) { @@ -1917,24 +2081,13 @@ std::unordered_set calculate_attrs_to_get(const rjson::value& req, throw api_error::validation(e.what()); } resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names); - std::unordered_set seen_column_names; - auto ret = boost::copy_range>(paths_to_get | - boost::adaptors::transformed([&] (const parsed::path& p) { - if (p.has_operators()) { - // FIXME: this check will need to change when we support non-toplevel attributes - throw api_error::validation("Non-toplevel attributes in ProjectionExpression not yet implemented"); - } - if (!seen_column_names.insert(p.root()).second) { - // FIXME: this check will need to change when we support non-toplevel attributes - throw api_error::validation( - format("Invalid ProjectionExpression: two document paths overlap with each other: {} and {}.", - p.root(), p.root())); - } - return p.root(); - })); + attrs_to_get ret; + for (const parsed::path& p : paths_to_get) { + attribute_path_map_add("ProjectionExpression", ret, p); + } return ret; } - // An empty set asks to read everything + // An empty map asks to read everything return {}; } @@ -1955,7 +2108,7 @@ std::unordered_set calculate_attrs_to_get(const 
rjson::value& req, */ void executor::describe_single_item(const cql3::selection::selection& selection, const std::vector& result_row, - const std::unordered_set& attrs_to_get, + const attrs_to_get& attrs_to_get, rjson::value& item, bool include_all_embedded_attributes) { @@ -1976,7 +2129,16 @@ void executor::describe_single_item(const cql3::selection::selection& selection, std::string attr_name = value_cast(entry.first); if (include_all_embedded_attributes || attrs_to_get.empty() || attrs_to_get.contains(attr_name)) { bytes value = value_cast(entry.second); - rjson::set_with_string_name(item, attr_name, deserialize_item(value)); + rjson::value v = deserialize_item(value); + auto it = attrs_to_get.find(attr_name); + if (it != attrs_to_get.end()) { + // attrs_to_get may have asked for only part of this attribute: + if (hierarchy_filter(v, it->second)) { + rjson::set_with_string_name(item, attr_name, std::move(v)); + } + } else { + rjson::set_with_string_name(item, attr_name, std::move(v)); + } } } } @@ -1988,7 +2150,7 @@ std::optional executor::describe_single_item(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, const query::result& query_result, - const std::unordered_set& attrs_to_get) { + const attrs_to_get& attrs_to_get) { rjson::value item = rjson::empty_object(); cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest()); @@ -2024,8 +2186,16 @@ static bool check_needs_read_before_write(const parsed::value& v) { }, v._value); } -static bool check_needs_read_before_write(const parsed::update_expression& update_expression) { - return boost::algorithm::any_of(update_expression.actions(), [](const parsed::update_expression::action& action) { +static bool check_needs_read_before_write(const attribute_path_map& update_expression) { + return boost::algorithm::any_of(update_expression, [](const auto& p) { + if (!p.second.has_value()) { + // If the action is not on the 
top-level attribute, we need to + // read the old item: we change only a part of the top-level + // attribute, and write the full top-level attribute back. + return true; + } + // Otherwise, the action p.second.get_value() is just on top-level + // attribute. Check if it needs read-before-write: return std::visit(overloaded_functor { [&] (const parsed::update_expression::action::set& a) -> bool { return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2)); @@ -2039,7 +2209,7 @@ static bool check_needs_read_before_write(const parsed::update_expression& updat [&] (const parsed::update_expression::action::del& a) -> bool { return true; } - }, action._action); + }, p.second.get_value()._action); }); } @@ -2048,7 +2218,11 @@ public: // Some information parsed during the constructor to check for input // errors, and cached to be used again during apply(). rjson::value* _attribute_updates; - parsed::update_expression _update_expression; + // Instead of keeping a parsed::update_expression with an unsorted list + // list of actions, we keep them in an attribute_path_map which groups + // them by top-level attribute, and detects forbidden overlaps/conflicts. 
+ attribute_path_map _update_expression; + parsed::condition_expression _condition_expression; update_item_operation(service::storage_proxy& proxy, rjson::value&& request); @@ -2079,16 +2253,22 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso throw api_error::validation("UpdateExpression must be a string"); } try { - _update_expression = parse_update_expression(update_expression->GetString()); - resolve_update_expression(_update_expression, + parsed::update_expression expr = parse_update_expression(update_expression->GetString()); + resolve_update_expression(expr, expression_attribute_names, expression_attribute_values, used_attribute_names, used_attribute_values); + if (expr.empty()) { + throw api_error::validation("Empty expression in UpdateExpression is not allowed"); + } + for (auto& action : expr.actions()) { + // Unfortunately we need to copy the action's path, because + // we std::move the action object. + auto p = action._path; + attribute_path_map_add("UpdateExpression", _update_expression, p, std::move(action)); + } } catch(expressions_syntax_error& e) { throw api_error::validation(e.what()); } - if (_update_expression.empty()) { - throw api_error::validation("Empty expression in UpdateExpression is not allowed"); - } } _attribute_updates = rjson::find(_request, "AttributeUpdates"); if (_attribute_updates) { @@ -2130,6 +2310,187 @@ update_item_operation::needs_read_before_write() const { (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW); } +// action_result() returns the result of applying an UpdateItem action - +// this result is either a JSON object or an unset optional which indicates +// the action was a deletion. The caller (update_item_operation::apply() +// below) will either write this JSON as the content of a column, or +// use it as a piece in a bigger top-level attribute. 
+static std::optional action_result( + const parsed::update_expression::action& action, + const rjson::value* previous_item) { + return std::visit(overloaded_functor { + [&] (const parsed::update_expression::action::set& a) -> std::optional { + return calculate_value(a._rhs, previous_item); + }, + [&] (const parsed::update_expression::action::remove& a) -> std::optional { + return std::nullopt; + }, + [&] (const parsed::update_expression::action::add& a) -> std::optional { + parsed::value base; + parsed::value addition; + base.set_path(action._path); + addition.set_constant(a._valref); + rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item); + rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item); + rjson::value result; + // An ADD can be used to create a new attribute (when + // v1.IsNull()) or to add to a pre-existing attribute: + if (v1.IsNull()) { + std::string v2_type = get_item_type_string(v2); + if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") { + result = v2; + } else { + throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2)); + } + } else { + std::string v1_type = get_item_type_string(v1); + if (v1_type == "N") { + if (get_item_type_string(v2) != "N") { + throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2))); + } + result = number_add(v1, v2); + } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") { + if (get_item_type_string(v2) != v1_type) { + throw api_error::validation(format("Incorrect operand type for operator or function. 
Expected {}: {}", v1_type, rjson::print(v2))); + } + result = set_sum(v1, v2); + } else { + throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1)); + } + } + return result; + }, + [&] (const parsed::update_expression::action::del& a) -> std::optional { + parsed::value base; + parsed::value subset; + base.set_path(action._path); + subset.set_constant(a._valref); + rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item); + rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item); + if (!v1.IsNull()) { + return set_diff(v1, v2); + } + // When we return nullopt here, we ask to *delete* this attribute, + // which is unnecessary because we know the attribute does not + // exist anyway. This is a waste, but a small one. Note that also + // for the "remove" action above we don't bother to check if the + // previous_item add anything to remove. + return std::nullopt; + } + }, action._action); +} + +// Print an attribute_path_map_node as the list of paths it contains: +static std::ostream& operator<<(std::ostream& out, const attribute_path_map_node& h) { + if (h.has_value()) { + out << " " << h.get_value()._path; + } else if (h.has_members()) { + for (auto& member : h.get_members()) { + out << *member.second; + } + } else if (h.has_indexes()) { + for (auto& index : h.get_indexes()) { + out << *index.second; + } + } + return out; +} + +// Apply the hierarchy of actions in an attribute_path_map_node to a +// JSON object which uses DynamoDB's serialization conventions. The complete, +// unmodified, previous_item is also necessary for the right-hand sides of the +// actions. Modifies obj in-place or returns false if it is to be removed. +static bool hierarchy_actions( + rjson::value& obj, + const attribute_path_map_node& h, + const rjson::value* previous_item) +{ + if (!obj.IsObject() || obj.MemberCount() != 1) { + // This shouldn't happen. 
We shouldn't have stored malformed objects. + // But today Alternator does not validate the structure of nested + // documents before storing them, so this can happen on read. + throw api_error::validation(format("Malformed value object read: {}", obj)); + } + const char* type = obj.MemberBegin()->name.GetString(); + rjson::value& v = obj.MemberBegin()->value; + if (h.has_value()) { + // Action replacing everything in this position in the hierarchy + std::optional newv = action_result(h.get_value(), previous_item); + if (newv) { + obj = std::move(*newv); + } else { + return false; + } + } else if (h.has_members()) { + if (type[0] != 'M' || !v.IsObject()) { + // A .something on a non-map doesn't work. + throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h)); + } + for (const auto& member : h.get_members()) { + std::string attr = member.first; + const attribute_path_map_node& subh = *member.second; + rjson::value *subobj = rjson::find(v, attr); + if (subobj) { + if (!hierarchy_actions(*subobj, subh, previous_item)) { + rjson::remove_member(v, attr); + } + } else { + // When a.b does not exist, setting a.b itself (i.e. + // subh.has_value()) is fine, but setting a.b.c is not. + if (subh.has_value()) { + std::optional newv = action_result(subh.get_value(), previous_item); + if (newv) { + rjson::set_with_string_name(v, attr, std::move(*newv)); + } else { + throw api_error::validation(format("Can't remove document path {} - not present in item", + subh.get_value()._path)); + } + } else { + throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h)); + } + } + } + } else if (h.has_indexes()) { + if (type[0] != 'L' || !v.IsArray()) { + // A [i] on a non-list doesn't work. 
+ throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h)); + } + unsigned nremoved = 0; + for (const auto& index : h.get_indexes()) { + unsigned i = index.first - nremoved; + const attribute_path_map_node& subh = *index.second; + if (i < v.Size()) { + if (!hierarchy_actions(v[i], subh, previous_item)) { + v.Erase(v.Begin() + i); + // If we have the actions "REMOVE a[1] SET a[3] = :val", + // the index 3 refers to the original indexes, before any + // items were removed. So we offset the next indexes + // (which are guaranteed to be higher than i - indexes is + // a sorted map) by an increased "nremoved". + nremoved++; + } + } else { + // If a[7] does not exist, setting a[7] itself (i.e. + // subh.has_value()) is fine - and appends an item, though + // not necessarily with index 7. But setting a[7].b will + // not work. + if (subh.has_value()) { + std::optional newv = action_result(subh.get_value(), previous_item); + if (newv) { + rjson::push_back(v, std::move(*newv)); + } else { + // Removing a[7] when the list has fewer elements is + // silently ignored. It's not considered an error. 
+ } + } else { + throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h)); + } + } + } + } + return true; +} + std::optional update_item_operation::apply(std::unique_ptr previous_item, api::timestamp_type ts) const { if (!verify_expected(_request, previous_item.get()) || @@ -2144,17 +2505,37 @@ update_item_operation::apply(std::unique_ptr previous_item, api::t auto& row = m.partition().clustered_row(*_schema, _ck); attribute_collector attrs_collector; bool any_updates = false; - auto do_update = [&] (bytes&& column_name, const rjson::value& json_value) { + auto do_update = [&] (bytes&& column_name, const rjson::value& json_value, + const attribute_path_map_node* h = nullptr) { any_updates = true; - if (_returnvalues == returnvalues::ALL_NEW || - _returnvalues == returnvalues::UPDATED_NEW) { + if (_returnvalues == returnvalues::ALL_NEW) { rjson::set_with_string_name(_return_attributes, - to_sstring_view(column_name), rjson::copy(json_value)); + to_sstring_view(column_name), rjson::copy(json_value)); + } else if (_returnvalues == returnvalues::UPDATED_NEW) { + rjson::value&& v = rjson::copy(json_value); + if (h) { + // If the operation was only on specific attribute paths, + // leave only them in _return_attributes. 
+ if (hierarchy_filter(v, *h)) { + rjson::set_with_string_name(_return_attributes, + to_sstring_view(column_name), std::move(v)); + } + } else { + rjson::set_with_string_name(_return_attributes, + to_sstring_view(column_name), std::move(v)); + } } else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) { std::string_view cn = to_sstring_view(column_name); const rjson::value* col = rjson::find(*previous_item, cn); if (col) { - rjson::set_with_string_name(_return_attributes, cn, rjson::copy(*col)); + rjson::value&& v = rjson::copy(*col); + if (h) { + if (hierarchy_filter(v, *h)) { + rjson::set_with_string_name(_return_attributes, cn, std::move(v)); + } + } else { + rjson::set_with_string_name(_return_attributes, cn, std::move(v)); + } } } const column_definition* cdef = _schema->get_column_definition(column_name); @@ -2196,7 +2577,7 @@ update_item_operation::apply(std::unique_ptr previous_item, api::t // can just move previous_item later, when we don't need it any more. if (_returnvalues == returnvalues::ALL_NEW) { if (previous_item) { - _return_attributes = std::move(*previous_item); + _return_attributes = rjson::copy(*previous_item); } else { // If there is no previous item, usually a new item is created // and contains they given key. This may be cancelled at the end @@ -2209,88 +2590,44 @@ update_item_operation::apply(std::unique_ptr previous_item, api::t } if (!_update_expression.empty()) { - std::unordered_set seen_column_names; - for (auto& action : _update_expression.actions()) { - if (action._path.has_operators()) { - // FIXME: implement this case - throw api_error::validation("UpdateItem support for nested updates not yet implemented"); - } - std::string column_name = action._path.root(); + for (auto& actions : _update_expression) { + // The actions of _update_expression are grouped by top-level + // attributes. Here, all actions in actions.second share the same + // top-level attribute actions.first. 
+ std::string column_name = actions.first; const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name)); if (cdef && cdef->is_primary_key()) { - throw api_error::validation( - format("UpdateItem cannot update key column {}", column_name)); + throw api_error::validation(format("UpdateItem cannot update key column {}", column_name)); } - // DynamoDB forbids multiple updates in the same expression to - // modify overlapping document paths. Updates of one expression - // have the same timestamp, so it's unclear which would "win". - // FIXME: currently, without full support for document paths, - // we only check if the paths' roots are the same. - if (!seen_column_names.insert(column_name).second) { - throw api_error::validation( - format("Invalid UpdateExpression: two document paths overlap with each other: {} and {}.", - column_name, column_name)); - } - std::visit(overloaded_functor { - [&] (const parsed::update_expression::action::set& a) { - auto value = calculate_value(a._rhs, previous_item.get()); - do_update(to_bytes(column_name), value); - }, - [&] (const parsed::update_expression::action::remove& a) { + if (actions.second.has_value()) { + // An action on a top-level attribute column_name. The single + // action is actions.second.get_value(). 
We can simply invoke + // the action and replace the attribute with its result: + std::optional result = action_result(actions.second.get_value(), previous_item.get()); + if (result) { + do_update(to_bytes(column_name), *result); + } else { do_delete(to_bytes(column_name)); - }, - [&] (const parsed::update_expression::action::add& a) { - parsed::value base; - parsed::value addition; - base.set_path(action._path); - addition.set_constant(a._valref); - rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get()); - rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item.get()); - rjson::value result; - // An ADD can be used to create a new attribute (when - // v1.IsNull()) or to add to a pre-existing attribute: - if (v1.IsNull()) { - std::string v2_type = get_item_type_string(v2); - if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") { - result = v2; - } else { - throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2)); - } - } else { - std::string v1_type = get_item_type_string(v1); - if (v1_type == "N") { - if (get_item_type_string(v2) != "N") { - throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2))); - } - result = number_add(v1, v2); - } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") { - if (get_item_type_string(v2) != v1_type) { - throw api_error::validation(format("Incorrect operand type for operator or function. 
Expected {}: {}", v1_type, rjson::print(v2))); - } - result = set_sum(v1, v2); - } else { - throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1)); - } - } - do_update(to_bytes(column_name), result); - }, - [&] (const parsed::update_expression::action::del& a) { - parsed::value base; - parsed::value subset; - base.set_path(action._path); - subset.set_constant(a._valref); - rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get()); - rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item.get()); - if (!v1.IsNull()) { - std::optional result = set_diff(v1, v2); - if (result) { - do_update(to_bytes(column_name), *result); - } else { - do_delete(to_bytes(column_name)); - } - } } - }, action._action); + } else { + // We have actions on a path or more than one path in the same + // top-level attribute column_name - but not on the top-level + // attribute as a whole. We already read the full top-level + // attribute (see check_needs_read_before_write()), and now we + // need to modify pieces of it and write back the entire + // top-level attribute. 
+ if (!previous_item) { + throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item")); + } + const rjson::value *toplevel = rjson::find(*previous_item, column_name); + if (!toplevel) { + throw api_error::validation(format("UpdateItem cannot update document path: missing attribute {}", + column_name)); + } + rjson::value result = rjson::copy(*toplevel); + hierarchy_actions(result, actions.second, previous_item.get()); + do_update(to_bytes(column_name), std::move(result), &actions.second); + } } } if (_returnvalues == returnvalues::ALL_OLD && previous_item) { @@ -2408,7 +2745,7 @@ static rjson::value describe_item(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, const query::result& query_result, - const std::unordered_set& attrs_to_get) { + const attrs_to_get& attrs_to_get) { std::optional opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get); if (!opt_item) { // If there is no matching item, we're supposed to return an empty @@ -2480,7 +2817,7 @@ future executor::batch_get_item(client_state& cli struct table_requests { schema_ptr schema; db::consistency_level cl; - std::unordered_set attrs_to_get; + attrs_to_get attrs_to_get; struct single_request { partition_key pk; clustering_key ck; @@ -2694,7 +3031,7 @@ void filter::for_filters_on(const noncopyable_function& class describe_items_visitor { typedef std::vector columns_t; const columns_t& _columns; - const std::unordered_set& _attrs_to_get; + const attrs_to_get& _attrs_to_get; std::unordered_set _extra_filter_attrs; const filter& _filter; typename columns_t::const_iterator _column_it; @@ -2703,7 +3040,7 @@ class describe_items_visitor { size_t _scanned_count; public: - describe_items_visitor(const columns_t& columns, const std::unordered_set& attrs_to_get, filter& filter) + describe_items_visitor(const columns_t& columns, const attrs_to_get& attrs_to_get, 
filter& filter) : _columns(columns) , _attrs_to_get(attrs_to_get) , _filter(filter) @@ -2752,6 +3089,12 @@ public: std::string attr_name = value_cast(entry.first); if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name) || _extra_filter_attrs.contains(attr_name)) { bytes value = value_cast(entry.second); + // Even if _attrs_to_get asked to keep only a part of a + // top-level attribute, we keep the entire attribute + // at this stage, because the item filter might still + // need the other parts (it was easier for us to keep + // extra_filter_attrs at top-level granularity). We'll + // filter the unneeded parts after item filtering. rjson::set_with_string_name(_item, attr_name, deserialize_item(value)); } } @@ -2762,11 +3105,24 @@ public: void end_row() { if (_filter.check(_item)) { + // As noted above, we kept entire top-level attributes listed in + // _attrs_to_get. We may need to only keep parts of them. + for (const auto& attr: _attrs_to_get) { + // If !attr.has_value() it means we were asked not to keep + // attr entirely, but just parts of it. 
+ if (!attr.second.has_value()) { + rjson::value* toplevel= rjson::find(_item, attr.first); + if (toplevel && !hierarchy_filter(*toplevel, attr.second)) { + rjson::remove_member(_item, attr.first); + } + } + } // Remove the extra attributes _extra_filter_attrs which we had // to add just for the filter, and not requested to be returned: for (const auto& attr : _extra_filter_attrs) { rjson::remove_member(_item, attr); } + rjson::push_back(_items, std::move(_item)); } _item = rjson::empty_object(); @@ -2782,7 +3138,7 @@ public: } }; -static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr result_set, std::unordered_set&& attrs_to_get, filter&& filter) { +static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr result_set, attrs_to_get&& attrs_to_get, filter&& filter) { describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter); result_set->visit(visitor); auto scanned_count = visitor.get_scanned_count(); @@ -2823,7 +3179,7 @@ static future do_query(service::storage_proxy& pr const rjson::value* exclusive_start_key, dht::partition_range_vector&& partition_ranges, std::vector&& ck_bounds, - std::unordered_set&& attrs_to_get, + attrs_to_get&& attrs_to_get, uint32_t limit, db::consistency_level cl, filter&& filter, diff --git a/alternator/executor.hh b/alternator/executor.hh index 1fdd1aa00b..1a2894ec68 100644 --- a/alternator/executor.hh +++ b/alternator/executor.hh @@ -70,6 +70,76 @@ public: std::string to_json() const override; }; +namespace parsed { +class path; +}; + +// An attribute_path_map object is used to hold data for various attributes +// paths (parsed::path) in a hierarchy of attribute paths. 
Each attribute path
+// has a root attribute, and is then modified by member and index operators -
+// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
+// "[2]" index, and finally ".c" member.
+// Data can be added to an attribute_path_map using the add() function, but
+// requires that attributes with data not be *overlapping* or *conflicting*:
+//
+// 1. Two attribute paths which are identical or an ancestor of one another
+//    are considered *overlapping* and not allowed. If a.b.c has data,
+//    we can't add more data in a.b.c or any of its descendants like a.b.c.d.
+//
+// 2. Two attribute paths which need the same parent to have both a member and
+//    an index are considered *conflicting* and not allowed. E.g., if a.b has
+//    data, you can't add a[1]. The meaning of adding both would be that the
+//    attribute a is both a map and an array, which isn't sensible.
+//
+// These two requirements are common to the two places where Alternator uses
+// this abstraction to describe how a hierarchical item is to be transformed:
+//
+// 1. In ProjectionExpression: for filtering from a full top-level attribute
+//    only the parts for which the user asked in ProjectionExpression.
+//
+// 2. In UpdateExpression: for taking the previous value of a top-level
+//    attribute, and modifying it based on the instructions that the user
+//    wrote in UpdateExpression.
+
+template <typename T>
+class attribute_path_map_node {
+public:
+    using data_t = T;
+    // We need the extra shared_ptr<> here because libstdc++ unordered_map
+    // doesn't work with incomplete types :-( We couldn't use lw_shared_ptr<>
+    // because it doesn't work for incomplete types either. We couldn't use
+    // std::unique_ptr<> because it makes the entire object uncopyable. We
+    // don't often need to copy such a map, but we do have some code that
+    // copies an attrs_to_get object, and is hard to find and remove.
+    // The shared_ptr should never be null.
+ using members_t = std::unordered_map>>; + // The indexes list is sorted because DynamoDB requires handling writes + // beyond the end of a list in index order. + using indexes_t = std::map>>; + // The prohibition on "overlap" and "conflict" explained above means + // That only one of data, members or indexes is non-empty. + std::optional> _content; + + bool is_empty() const { return !_content; } + bool has_value() const { return _content && std::holds_alternative(*_content); } + bool has_members() const { return _content && std::holds_alternative(*_content); } + bool has_indexes() const { return _content && std::holds_alternative(*_content); } + // get_members() assumes that has_members() is true + members_t& get_members() { return std::get(*_content); } + const members_t& get_members() const { return std::get(*_content); } + indexes_t& get_indexes() { return std::get(*_content); } + const indexes_t& get_indexes() const { return std::get(*_content); } + T& get_value() { return std::get(*_content); } + const T& get_value() const { return std::get(*_content); } +}; + +template +using attribute_path_map = std::unordered_map>; + +using attrs_to_get_node = attribute_path_map_node; +using attrs_to_get = attribute_path_map; + + class executor : public peering_sharded_service { service::storage_proxy& _proxy; service::migration_manager& _mm; @@ -140,16 +210,14 @@ public: const query::partition_slice&, const cql3::selection::selection&, const query::result&, - const std::unordered_set&); + const attrs_to_get&); static void describe_single_item(const cql3::selection::selection&, const std::vector&, - const std::unordered_set&, + const attrs_to_get&, rjson::value&, bool = false); - - void add_stream_options(const rjson::value& stream_spec, schema_builder&) const; void supplement_table_info(rjson::value& descr, const schema& schema) const; void supplement_table_stream_info(rjson::value& descr, const schema& schema) const; diff --git a/alternator/expressions.cc 
b/alternator/expressions.cc index cabc637604..8ad4155d18 100644 --- a/alternator/expressions.cc +++ b/alternator/expressions.cc @@ -130,6 +130,27 @@ void condition_expression::append(condition_expression&& a, char op) { }, _expression); } +void path::check_depth_limit() { + if (1 + _operators.size() > depth_limit) { + throw expressions_syntax_error(format("Document path exceeded {} nesting levels", depth_limit)); + } +} + +std::ostream& operator<<(std::ostream& os, const path& p) { + os << p.root(); + for (const auto& op : p.operators()) { + std::visit(overloaded_functor { + [&] (const std::string& member) { + os << '.' << member; + }, + [&] (unsigned index) { + os << '[' << index << ']'; + } + }, op); + } + return os; +} + } // namespace parsed // The following resolve_*() functions resolve references in parsed @@ -151,10 +172,9 @@ void condition_expression::append(condition_expression&& a, char op) { // we need to resolve the expression just once but then use it many times // (once for each item to be filtered). 
-static void resolve_path(parsed::path& p, +static std::optional resolve_path_component(const std::string& column_name, const rjson::value* expression_attribute_names, std::unordered_set& used_attribute_names) { - const std::string& column_name = p.root(); if (column_name.size() > 0 && column_name.front() == '#') { if (!expression_attribute_names) { throw api_error::validation( @@ -166,7 +186,30 @@ static void resolve_path(parsed::path& p, format("ExpressionAttributeNames missing entry '{}' required by expression", column_name)); } used_attribute_names.emplace(column_name); - p.set_root(std::string(rjson::to_string_view(*value))); + return std::string(rjson::to_string_view(*value)); + } + return std::nullopt; +} + +static void resolve_path(parsed::path& p, + const rjson::value* expression_attribute_names, + std::unordered_set& used_attribute_names) { + std::optional r = resolve_path_component(p.root(), expression_attribute_names, used_attribute_names); + if (r) { + p.set_root(std::move(*r)); + } + for (auto& op : p.operators()) { + std::visit(overloaded_functor { + [&] (std::string& s) { + r = resolve_path_component(s, expression_attribute_names, used_attribute_names); + if (r) { + s = std::move(*r); + } + }, + [&] (unsigned index) { + // nothing to resolve + } + }, op); } } @@ -623,6 +666,55 @@ std::unordered_map function_handlers { }, }; +// Given a parsed::path and an item read from the table, extract the value +// of a certain attribute path, such as "a" or "a.b.c[3]". Returns a null +// value if the item or the requested attribute does not exist. +// Note that the item is assumed to be encoded in JSON using DynamoDB +// conventions - each level of a nested document is a map with one key - +// a type (e.g., "M" for map) - and its value is the representation of +// that value. 
+static rjson::value extract_path(const rjson::value* item,
+        const parsed::path& p, calculate_value_caller caller) {
+    if (!item) {
+        return rjson::null_value();
+    }
+    const rjson::value* v = rjson::find(*item, p.root());
+    if (!v) {
+        return rjson::null_value();
+    }
+    for (const auto& op : p.operators()) {
+        if (!v->IsObject() || v->MemberCount() != 1) {
+            // This shouldn't happen. We shouldn't have stored malformed
+            // objects. But today Alternator does not validate the structure
+            // of nested documents before storing them, so this can happen on
+            // read.
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+        }
+        const char* type = v->MemberBegin()->name.GetString();
+        v = &(v->MemberBegin()->value);
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (type[0] == 'M' && v->IsObject()) {
+                    v = rjson::find(*v, member);
+                } else {
+                    v = nullptr;
+                }
+            },
+            [&] (unsigned index) {
+                if (type[0] == 'L' && v->IsArray() && index < v->Size()) {
+                    v = &(v->GetArray()[index]);
+                } else {
+                    v = nullptr;
+                }
+            }
+        }, op);
+        if (!v) {
+            return rjson::null_value();
+        }
+    }
+    return rjson::copy(*v);
+}
+
 // Given a parsed::value, which can refer either to a constant value from
 // ExpressionAttributeValues, to the value of some attribute, or to a function
 // of other values, this function calculates the resulting value.
@@ -640,21 +732,12 @@ rjson::value calculate_value(const parsed::value& v,
             auto function_it = function_handlers.find(std::string_view(f._function_name));
             if (function_it == function_handlers.end()) {
                 throw api_error::validation(
-                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
             }
             return function_it->second(caller, previous_item, f);
         },
         [&] (const parsed::path& p) -> rjson::value {
-            if (!previous_item) {
-                return rjson::null_value();
-            }
-            std::string update_path = p.root();
-            if (p.has_operators()) {
-                // FIXME: support this
-                throw api_error::validation("Reading attribute paths not yet implemented");
-            }
-            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
-            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
+            return extract_path(previous_item, p, caller);
         }
     }, v._value);
 }
diff --git a/alternator/expressions_types.hh b/alternator/expressions_types.hh
index 9bf42aae32..4508190c53 100644
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -49,15 +49,23 @@ class path {
     // dot (e.g., ".xyz").
     std::string _root;
     std::vector<std::variant<std::string, unsigned>> _operators;
+    // It is useful to limit the depth of a user-specified path, because it
+    // allows us to use recursive algorithms without worrying about recursion
+    // depth. DynamoDB officially limits the length of paths to 32 components
+    // (including the root) so let's use the same limit.
+ static constexpr unsigned depth_limit = 32; + void check_depth_limit(); public: void set_root(std::string root) { _root = std::move(root); } void add_index(unsigned i) { _operators.emplace_back(i); + check_depth_limit(); } void add_dot(std::string(name)) { _operators.emplace_back(std::move(name)); + check_depth_limit(); } const std::string& root() const { return _root; @@ -65,6 +73,13 @@ public: bool has_operators() const { return !_operators.empty(); } + const std::vector>& operators() const { + return _operators; + } + std::vector>& operators() { + return _operators; + } + friend std::ostream& operator<<(std::ostream&, const path&); }; // When an expression is first parsed, all constants are references, like diff --git a/alternator/streams.cc b/alternator/streams.cc index cda2c14407..476cd2396b 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -845,16 +845,18 @@ future executor::get_records(client_state& client static const bytes op_column_name = cdc::log_meta_column_name_bytes("operation"); static const bytes eor_column_name = cdc::log_meta_column_name_bytes("end_of_batch"); - auto key_names = boost::copy_range>( + auto key_names = boost::copy_range( boost::range::join(std::move(base->partition_key_columns()), std::move(base->clustering_key_columns())) - | boost::adaptors::transformed([&] (const column_definition& cdef) { return cdef.name_as_text(); }) + | boost::adaptors::transformed([&] (const column_definition& cdef) { + return std::make_pair(cdef.name_as_text(), {}); }) ); // Include all base table columns as values (in case pre or post is enabled). // This will include attributes not stored in the frozen map column - auto attr_names = boost::copy_range>(base->regular_columns() + auto attr_names = boost::copy_range(base->regular_columns() // this will include the :attrs column, which we will also force evaluating. 
// But not having this set empty forces out any cdc columns from actual result - | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.name_as_text(); }) + | boost::adaptors::transformed([] (const column_definition& cdef) { + return std::make_pair(cdef.name_as_text(), {}); }) ); std::vector columns; diff --git a/docs/alternator/alternator.md b/docs/alternator/alternator.md index 755028c66e..fc2199ece3 100644 --- a/docs/alternator/alternator.md +++ b/docs/alternator/alternator.md @@ -87,25 +87,13 @@ progresses and compatibility continues to improve. * UpdateTable: Not supported. * ListTables: Supported. ### Item Operations -* GetItem: Support almost complete except that projection expressions can - only ask for top-level attributes. -* PutItem: Support almost complete except that condition expressions can - only refer to to-level attributes. -* UpdateItem: Nested documents are supported but updates to nested attributes - are not (e.g., `SET a.b[3].c=val`), and neither are nested attributes in - condition expressions. -* DeleteItem: Mostly works, but again does not support nested attributes - in condition expressions. +* GetItem, PutItem, UpdateItem, DeleteItem fully supported. ### Batch Operations -* BatchGetItem: Almost complete except that projection expressions can only - ask for top-level attributes. -* BatchWriteItem: Supported. Doesn't limit the number of items (DynamoDB - limits to 25) or size of items (400 KB) or total request size (16 MB). +* BatchGetItem, BatchWriteItem fully supported. + Doesn't limit the number of items (DynamoDB limits to 25) or size of items + (400 KB) or total request size (16 MB). ### Scans Scan and Query are mostly supported, with the following limitations: -* As above, projection expressions only support top-level attributes. -* The ScanFilter/QueryFilter parameter for filtering results is fully - supported, but the newer FilterExpression syntax is not yet supported. 
* The "Select" options which allows to count items instead of returning them is not yet supported. ### Secondary Indexes @@ -297,11 +285,10 @@ policies" section. DynamoDB allows attributes to be **nested** - a top-level attribute may be a list or a map, and each of its elements may further be lists or maps, etc. Alternator currently stores the entire content of a top-level -attribute as one JSON object. This is good enough for most needs, except -one DynamoDB feature which we cannot support safely: we cannot modify -a non-top-level attribute (e.g., a.b[3].c) directly without RMW. We plan -to fix this in a future version by rethinking the data model we use for -attributes, or rethinking our implementation of RMW (as explained above). +attribute as one JSON object. This means that UpdateItem requests which +want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW: +Alternator implements such requests by reading the entire top-level +attribute a, modifying only a.b[3].c, and then writing back a. ```eval_rst .. toctree:: @@ -309,4 +296,4 @@ attributes, or rethinking our implementation of RMW (as explained above). getting-started compatibility -``` \ No newline at end of file +``` diff --git a/docs/alternator/compatibility.md b/docs/alternator/compatibility.md index 0679da4852..b534aaf539 100644 --- a/docs/alternator/compatibility.md +++ b/docs/alternator/compatibility.md @@ -61,12 +61,6 @@ behave the same in Alternator. However, there are a few features which we have not implemented yet. Unimplemented features return an error when used, so they should be easy to detect. Here is a list of these unimplemented features: -* Missing support for **atribute paths** like `a.b[3].c`. - Nested attributes _are_ supported, but Alternator does not yet allow reading - or writing directly a piece of a nested attributes using an attribute path - - only top-level attributes can be read or written directly. 
- https://github.com/scylladb/scylla/issues/5024 - * Currently in Alternator, a GSI (Global Secondary Index) can only be added to a table at table creation time. Unlike DynamoDB which also allows adding a GSI (but not an LSI) to an existing table using an UpdateTable operation. diff --git a/test/alternator/conftest.py b/test/alternator/conftest.py index 751e6f5178..945e7d5b50 100644 --- a/test/alternator/conftest.py +++ b/test/alternator/conftest.py @@ -230,3 +230,19 @@ def filled_test_table(dynamodb): def scylla_only(dynamodb): if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'): pytest.skip('Scylla-only feature not supported by AWS') + +# The "test_table_s_forbid_rmw" fixture is the same as test_table_s, except +# with the "forbid_rmw" write isolation mode. This is useful for verifying +# that writes that we think should not need a read-before-write in fact do +# not need it. +# Because forbid_rmw is a Scylla-only feature, this test is skipped when not +# running against Scylla. +@pytest.fixture(scope="session") +def test_table_s_forbid_rmw(dynamodb, scylla_only): + table = create_test_table(dynamodb, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ], + AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ]) + arn = table.meta.client.describe_table(TableName=table.name)['Table']['TableArn'] + table.meta.client.tag_resource(ResourceArn=arn, Tags=[{'Key': 'system:write_isolation', 'Value': 'forbid_rmw'}]) + yield table + table.delete() diff --git a/test/alternator/test_condition_expression.py b/test/alternator/test_condition_expression.py index 7631477bf0..c41b8d513f 100644 --- a/test/alternator/test_condition_expression.py +++ b/test/alternator/test_condition_expression.py @@ -1262,7 +1262,6 @@ def test_update_condition_other_funcs(test_table_s): # ConditionExpressions also allows reading nested attributes, and we should # support that too. 
This test just checks a few random operators - we don't # test all the different operators here. -@pytest.mark.xfail(reason="nested attributes not yet implemented in ConditionExpression") def test_update_condition_nested_attributes(test_table_s): p = random_string() test_table_s.update_item(Key={'p': p}, @@ -1277,6 +1276,21 @@ def test_update_condition_nested_attributes(test_table_s): ConditionExpression='b.x < b.y[1]', ExpressionAttributeValues={':val': 2}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 2 + # Also check the case of a failing condition + with pytest.raises(ClientError, match='ConditionalCheckFailedException'): + test_table_s.update_item(Key={'p': p}, + UpdateExpression='SET c = :val', + ConditionExpression='b.x < b.y[0]', + ExpressionAttributeValues={':val': 3}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 2 + # A condition involving an attribute which doesn't exist results in + # failed condition - not an error. + with pytest.raises(ClientError, match='ConditionalCheckFailedException'): + test_table_s.update_item(Key={'p': p}, + UpdateExpression='SET c = :val', + ConditionExpression='b.z < b.y[100]', + ExpressionAttributeValues={':val': 4}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 2 # All the previous tests refered to attributes using their name directly. # But the DynamoDB API also allows to refer to attributes using a #reference. 
@@ -1293,16 +1307,15 @@ def test_update_condition_attribute_reference(test_table_s): ExpressionAttributeValues={':val': 1}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 1 -@pytest.mark.xfail(reason="nested attributes not yet implemented in ConditionExpression") def test_update_condition_nested_attribute_reference(test_table_s): p = random_string() test_table_s.update_item(Key={'p': p}, - AttributeUpdates={'and': {'Value': {'or': 1}, 'Action': 'PUT'}}) + AttributeUpdates={'and': {'Value': {'or': 2}, 'Action': 'PUT'}}) test_table_s.update_item(Key={'p': p}, UpdateExpression='SET c = :val', - ConditionExpression='attribute_exists (#name1.#name2)', + ConditionExpression='#name1.#name2 = :two', ExpressionAttributeNames={'#name1': 'and', '#name2': 'or'}, - ExpressionAttributeValues={':val': 1}) + ExpressionAttributeValues={':val': 1, ':two': 2}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 1 # All the previous tests involved a single condition. The following tests diff --git a/test/alternator/test_filter_expression.py b/test/alternator/test_filter_expression.py index f88a2ba65a..ece8525954 100644 --- a/test/alternator/test_filter_expression.py +++ b/test/alternator/test_filter_expression.py @@ -743,7 +743,6 @@ def test_filter_expression_and_attributes_to_get(test_table): # support that too. This test just checks one operators - we don't # test all the different operators again here, we will assume the same # code is used internally so if one operator worked, all will work. 
-@pytest.mark.xfail(reason="nested attributes not yet implemented in FilterExpression") def test_filter_expression_nested_attribute(test_table): p = random_string() test_table.put_item(Item={'p': p, 'c': 'hi', 'x': {'a': 'dog', 'b': 3}}) @@ -754,7 +753,6 @@ def test_filter_expression_nested_attribute(test_table): ExpressionAttributeValues={':p': p, ':a': 'mouse'}) assert(got_items == [{'p': p, 'c': 'yo', 'x': {'a': 'mouse', 'b': 4}}]) -@pytest.mark.xfail(reason="nested attributes not yet implemented in FilterExpression") # This test is a version of test_filter_expression_and_projection_expression # involving nested attributes. In that test, we had a filter and projection # involving different attributes. Nested attributes open new corner cases: diff --git a/test/alternator/test_gsi.py b/test/alternator/test_gsi.py index 1b424742b4..eb40203b5b 100644 --- a/test/alternator/test_gsi.py +++ b/test/alternator/test_gsi.py @@ -334,6 +334,33 @@ def test_gsi_wrong_type_attribute_update(test_table_gsi_2): test_table_gsi_2.update_item(Key={'p': p}, AttributeUpdates={'x': {'Value': 3, 'Action': 'PUT'}}) assert test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'x': x} +# Since a GSI key x cannot be a map or an array, in particular updates to +# nested attributes like x.y or x[1] are not legal. The error that DynamoDB +# reports is "Key attributes must be scalars; list random access '[]' and map +# lookup '.' are not allowed: IndexKey: x". +def test_gsi_wrong_type_attribute_update_nested(test_table_gsi_2): + p = random_string() + x = random_string() + test_table_gsi_2.put_item(Item={'p': p, 'x': x}) + # We can't write a map into a GSI key column, which in this case can only + # be a string and in any case can never be a map. 
DynamoDB and Alternator + # report different errors here: DynamoDB reports a type mismatch (exactly + # like in test test_gsi_wrong_type_attribute_update), but Alternator + # reports the obscure message "Malformed value object for key column x". + # Alternator's error message should probably be improved here, but let's + # not test it in this test. + with pytest.raises(ClientError, match='ValidationException'): + test_table_gsi_2.update_item(Key={'p': p}, UpdateExpression='SET x = :val1', + ExpressionAttributeValues={':val1': {'a': 3, 'b': 4}}) + # Here we try to set x.y for the GSI key column x. Again DynamoDB and + # Alternator produce different error messages - but both make sense. + # DynamoDB says "Key attributes must be scalars; list random access '[]' + # and map # lookup '.' are not allowed: IndexKey: x", while Alternator + # complains that "document paths not valid for this item: x.y". + with pytest.raises(ClientError, match='ValidationException'): + test_table_gsi_2.update_item(Key={'p': p}, UpdateExpression='SET x.y = :val1', + ExpressionAttributeValues={':val1': 3}) + def test_gsi_wrong_type_attribute_batch(test_table_gsi_2): # In a BatchWriteItem, if any update is forbidden, the entire batch is # rejected, and none of the updates happen at all. 
diff --git a/test/alternator/test_item.py b/test/alternator/test_item.py index 516bef7610..67ccae5e68 100644 --- a/test/alternator/test_item.py +++ b/test/alternator/test_item.py @@ -367,6 +367,13 @@ def test_getitem_attributes_to_get(dynamodb, test_table): expected_item = {k: item[k] for k in wanted if k in item} assert expected_item == got_item +# Verify that it is forbidden to ask for the same attribute multiple times +def test_getitem_attributes_to_get_duplicate(dynamodb, test_table): + p = random_string() + c = random_string() + with pytest.raises(ClientError, match='ValidationException.*Duplicate'): + test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=['a', 'a'], ConsistentRead=True) + # Basic test for DeleteItem, with hash key only def test_delete_item_hash(test_table_s): p = random_string() diff --git a/test/alternator/test_projection_expression.py b/test/alternator/test_projection_expression.py index 0d913645ea..954fff984b 100644 --- a/test/alternator/test_projection_expression.py +++ b/test/alternator/test_projection_expression.py @@ -116,7 +116,6 @@ def test_projection_expression_query(test_table): # but the previous test checked that the alternative syntax works correctly. # The following test checks fetching more elaborate attribute paths from # nested documents. 
-@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_projection_expression_path(test_table_s): p = random_string() test_table_s.put_item(Item={ @@ -143,11 +142,21 @@ def test_projection_expression_path(test_table_s): assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.x')['Item'] == {} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.x.y')['Item'] == {} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[3].x')['Item'] == {} + # Similarly, indexing a dictionary as an array, or array as dictionary, or + # integer as either, yields an empty item. + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b.x')['Item'] == {} + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a[0]')['Item'] == {} + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[0].x')['Item'] == {} + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[0][0]')['Item'] == {} # We can read multiple paths - the result are merged into one object # structured the same was as in the original item: assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[0],a.b[1]')['Item'] == {'a': {'b': [2, 4]}} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[0],a.c')['Item'] == {'a': {'b': [2], 'c': 5}} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.c,b')['Item'] == {'a': {'c': 5}, 'b': 'hello'} + # If some of the paths are not available, they are silently ignored (just + # like they returned an empty item when used alone earlier) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.x, a.b[0], x, a.b[3].x')['Item'] == {'a': {'b': [2]}} + # It is not allowed to read the same 
path multiple times. The error from # DynamoDB looks like: "Invalid ProjectionExpression: Two document paths # overlap with each other; must remove or rewrite one of these paths; @@ -160,6 +169,14 @@ def test_projection_expression_path(test_table_s): with pytest.raises(ClientError, match='ValidationException'): test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a,a.b[0]')['Item'] + # Above we noted that asking for to project a non-existent attribute in an + # existing item yields an empty Item object. However, if the item does not + # exist at all, the Item object will be missing entirely: + p = random_string() + assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='x') + assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.x') + assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a[0]') + # Above in test_projection_expression_toplevel_syntax() we tested how # name references (#name) work in top-level attributes. In the following # two tests we test how they work in more elaborate paths: @@ -167,7 +184,6 @@ def test_projection_expression_path(test_table_s): # 2. Conversely, a single reference, e.g., "#a", is always a single path # component. Even if "#a" is "a.b", this refers to the literal attribute # "a.b" - with a dot in its name - and not to the b element in a. 
-@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_projection_expression_path_references(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 1, 'c': 2}, 'b': 'hi'}) @@ -181,10 +197,9 @@ def test_projection_expression_path_references(test_table_s): with pytest.raises(ClientError, match='ValidationException'): test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.#n2', ExpressionAttributeNames={'#n2': 'b', '#unused': 'x'}) -@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_projection_expression_path_dot(test_table_s): p = random_string() - test_table_s.put_item(Item={'p': p, 'a.b': 'hi', 'a': {'b': 'yo'}}) + test_table_s.put_item(Item={'p': p, 'a.b': 'hi', 'a': {'b': 'yo', 'c': 'jo'}}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b')['Item'] == {'a': {'b': 'yo'}} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='#name', ExpressionAttributeNames={'#name': 'a.b'})['Item'] == {'a.b': 'hi'} @@ -192,7 +207,6 @@ def test_projection_expression_path_dot(test_table_s): # ProjectionExpression. This includes both identical paths, and paths where # one is a sub-path of the other - e.g. "a.b" and "a.b.c". As we already saw # above, paths with just a common *prefix* - e.g., "a.b, a.c" - are fine. -@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_projection_expression_path_overlap(test_table_s): # The overlap is tested symbolically, on the given paths, without any # relation to what the item contains, or whether it even exists. 
So we @@ -207,13 +221,57 @@ def test_projection_expression_path_overlap(test_table_s): 'a.b, a.b[2]', 'a.b, a.b.c', 'a, a.b[2].c', + 'a.b.d, a.b', + 'a.b.d.e, a.b', + 'a.b, a.b.d', + 'a.b, a.b.d.e', ]: with pytest.raises(ClientError, match='ValidationException.* overlap'): + print(expr) test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression=expr) + # The checks above can be easily passed by an over-zealous "overlap" check + # which declares everything an overlap :-) Let's also check some non- + # overlap cases - which shouldn't be declared an overlap. + for expr in ['a, b', + 'a.b, a.c', + 'a.b.d, a.b.e', + 'a[1], a[2]', + 'a.b, a.c[2]', + ]: + print(expr) + test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression=expr) + +# In addition to not allowing "overlapping" paths, DynamoDB also does not +# allow "conflicting" paths: It does not allow giving both a.b and a[1] in a +# single ProjectionExpression. It gives the error: +# "Invalid ProjectionExpression: Two document paths conflict with each other; +# must remove or rewrite one of these paths; path one: [a, b], path two: +# [a, [1]]". +# The reasoning is that asking for both in one request makes no sense because +# no item will ever be able to fulfill both. +def test_projection_expression_path_conflict(test_table_s): + # The conflict is tested symbolically, on the given paths, without any + # relation to what the item contains, or whether it even exists. So we + # don't even need to create an item for this test. 
We still need a + # key for the GetItem call :-) + p = random_string() + for expr in ['a.b, a[1]', + 'a[1], a.b', + 'a.b[1], a.b.c', + 'a.b.c, a.b[1]', + ]: + with pytest.raises(ClientError, match='ValidationException.* conflict'): + test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression=expr) + # The checks above can be easily passed by an over-zealous "conflict" check + # which declares everything a conflict :-) Let's also check some non- + # conflict cases - which shouldn't be declared a conflict. + for expr in ['a.b, a.c', + 'a.b, a.c[1]', + ]: + test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression=expr) # Above we nested paths in ProjectionExpression, but just for the GetItem # request. Let's verify they also work in Query and Scan requests: -@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_query_projection_expression_path(test_table): p = random_string() items = [{'p': p, 'c': str(i), 'a': {'x': str(i*10), 'y': 'hi'}, 'b': 'hello' } for i in range(10)] @@ -224,7 +282,6 @@ def test_query_projection_expression_path(test_table): expected_items = [{'a': {'x': x['a']['x']}} for x in items] assert multiset(expected_items) == multiset(got_items) -@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_scan_projection_expression_path(test_table): # This test is similar to test_query_projection_expression_path above, # but uses a scan instead of a query. The scan will generate unrelated @@ -241,9 +298,8 @@ def test_scan_projection_expression_path(test_table): # BatchGetItem also supports ProjectionExpression, let's test that it # applies to all items, and that it correctly supports document paths as well. 
-@pytest.mark.xfail(reason="ProjectionExpression does not yet support attribute paths") def test_batch_get_item_projection_expression_path(test_table_s): - items = [{'p': random_string(), 'a': {'b': random_string()}, 'c': random_string()} for i in range(3)] + items = [{'p': random_string(), 'a': {'b': random_string(), 'x': 'hi'}, 'c': random_string()} for i in range(3)] with test_table_s.batch_writer() as batch: for item in items: batch.put_item(item) @@ -285,3 +341,20 @@ def test_projection_expression_and_key_condition_expression(test_table_s): ExpressionAttributeNames={'#name1': 'p', '#name2': 'a'}, ExpressionAttributeValues={':val1': p}); assert got_items == [{'a': 'hello'}] + +# Test whether the nesting depth of a path in a projection expression +# is limited. If the implementation is done using recursion, it is good +# practice to limit it and not crash the server. According to the DynamoDB +# documentation, DynamoDB supports nested attributes up to 32 levels deep: +# https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html#limits-attributes-nested-depth +# There is no reason why Alternator should not use exactly the same limit +# as is officially documented by DynamoDB. +def test_projection_expression_path_nesting_levels(test_table_s): + p = random_string() + # 32 nesting levels (including the top-level attribute) work + test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a'+('.b'*31)) + # 33 nesting levels do not. DynamoDB gives an error: "Invalid + # ProjectionExpression: The document path has too many nesting levels; + # nesting levels: 33". 
+ with pytest.raises(ClientError, match='ValidationException.*nesting levels'): + test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a'+('.b'*32)) diff --git a/test/alternator/test_returnvalues.py b/test/alternator/test_returnvalues.py index ef2e61cfdf..c125bbb717 100644 --- a/test/alternator/test_returnvalues.py +++ b/test/alternator/test_returnvalues.py @@ -346,7 +346,6 @@ def test_update_item_returnvalues_updated_new(test_table_s): # Test the ReturnValues from an UpdateItem directly modifying a *nested* # attribute, in the relevant ReturnValue modes: -@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_item_returnvalues_nested(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 'dog', 'c': [1, 2, 3]}, 'd': 'cat'}) @@ -398,3 +397,37 @@ def test_update_item_returnvalues_nested(test_table_s): UpdateExpression='SET a.c[1] = :val, a.b = :val2', ExpressionAttributeValues={':val': 3, ':val2': 'dog'}) assert ret['Attributes'] == {'p': p, 'a': {'b': 'dog', 'c': [1, 3, 3]}, 'd': 'cat' } + # Test with REMOVE, and one of them doing nothing (so shouldn't be in UPDATED_OLD) + ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD', + UpdateExpression='REMOVE a.c[1], a.c[3]') + assert ret['Attributes'] == {'a': {'c': [3]}} # a.c[3] did not exist + # When adding a new sub-attribute, UPDATED_OLD does not return anything, + # but UPDATED_NEW does: + ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD', + UpdateExpression='SET a.x1 = :val', + ExpressionAttributeValues={':val': 8}) + assert not 'Attributes' in ret + ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW', + UpdateExpression='SET a.x2 = :val', + ExpressionAttributeValues={':val': 8}) + assert ret['Attributes'] == {'a': {'x2': 8}} + # Nevertheless, there is one strange exception - although setting an array + # element *beyond* its end (e.g., a.c[100]) does add a new array item, it + # is 
*not* returned by UPDATED_NEW. I am not sure DynamoDB did this + # deliberately (is it a bug or a feature?), but it simplifies our + # implementation as well: after setting a.c[100], a.c[100] is not actually + # set (instead, maybe a.c[4] was set) so UPDATED_NEW returning a.c[100] + # returns nothing. + ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW', + UpdateExpression='SET a.c[100] = :val', + ExpressionAttributeValues={':val': 70}) + assert not 'Attributes' in ret + # When removing an item, it shouldn't appear in UPDATED_NEW. Again there + # a strange exception - which I'm not sure if we should consider it a + # DynamoDB bug or feature - but it simplifies our own implementation as + # well: if we remove a.c[1], the new item *will* have a new a.c[1] (the + # previous a.c[2] is shifted back), so this value is returned. This is + # very odd, but this is what DynamoDB does, so we should too... + ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW', + UpdateExpression='REMOVE a.c[1]') + assert ret['Attributes'] == {'a': {'c': [70]}} diff --git a/test/alternator/test_update_expression.py b/test/alternator/test_update_expression.py index 2085a1750b..f7c780070d 100644 --- a/test/alternator/test_update_expression.py +++ b/test/alternator/test_update_expression.py @@ -265,16 +265,64 @@ def test_update_expression_multi_overlap(test_table_s): # The problem isn't just with identical paths - we can't modify two paths that # "overlap" in the sense that one is the ancestor of the other. 
-@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_multi_overlap_nested(test_table_s): p = random_string() - with pytest.raises(ClientError, match='ValidationException.*overlap'): - test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a = :val1, a.b = :val2', - ExpressionAttributeValues={':val1': {'b': 7}, ':val2': 'there'}) - test_table_s.put_item(Item={'p': p, 'a': {'b': {'c': 2}}}) - with pytest.raises(ClientError, match='ValidationException.*overlap'): - test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a.b = :val1, a.b.c = :val2', - ExpressionAttributeValues={':val1': 'hi', ':val2': 'there'}) + # Note that the overlap checks happen before checking the actual content + # of the item, so it doesn't matter that the item we want to modify + # doesn't even exist or have the structure referenced by the paths below. + for expr in ['SET a = :val1, a.b = :val2', + 'SET a.b = :val1, a = :val2', + 'SET a.b = :val1, a.b = :val2', + 'SET a.b = :val1, a.b.c = :val2', + 'SET a.b.c = :val1, a.b = :val2', + 'SET a.b.c = :val1, a.b.c = :val2', + 'SET a.b = :val1, a.b.c.d.e = :val2', + 'SET a.b.c.d.e = :val1, a.b = :val2', + 'SET a = :val1, a[1] = :val2', + 'SET a[1] = :val1, a = :val2', + 'SET a[1] = :val1, a[1] = :val2', + 'SET a[1][1] = :val1, a[1] = :val2', + 'SET a[1] = :val1, a[1][1] = :val2', + 'SET a[1][1] = :val1, a[1][1] = :val2', + 'SET a[1][1][1][1] = :val1, a[1][1] = :val2', + 'SET a[1][1] = :val1, a[1][1][1][1] = :val2', + 'SET a[1][1][1][1] = :val1, a[1][1][1][1] = :val2', + ]: + print(expr) + with pytest.raises(ClientError, match='ValidationException.*overlap'): + test_table_s.update_item(Key={'p': p}, UpdateExpression=expr, + ExpressionAttributeValues={':val1': 2, ':val2': 'there'}) + # Obviously this test can trivially pass if overlap checks wrongly labels + # everything as an overlap. 
So the test test_update_expression_multi_nested + # below is important - it confirms that we can do multiple modifications + # to the same item when they do not overlap. + +# Besides the concept of "overlapping" paths tested above, DynamoDB also has +# the concept of "conflicting" paths - e.g., attempting to set both a.b and +# a[1] together doesn't make sense. +def test_update_expression_multi_conflict_nested(test_table_s): + p = random_string() + for expr in ['SET a.b = :val1, a[1] = :val2', + 'SET a.b.c = :val1, a.b[2] = :val2', + ]: + print(expr) + with pytest.raises(ClientError, match='ValidationException.*conflict'): + test_table_s.update_item(Key={'p': p}, UpdateExpression=expr, + ExpressionAttributeValues={':val1': 2, ':val2': 'there'}) + +# We can do several non-overlapping modifications to the same top-level +# attribute and to different top-level attributes in the same update +# expression. +def test_update_expression_multi_nested(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': {'x': 3, 'y': 4, 'c': {'y': 3}}, 'b': {'x': 1, 'y': 2}}) + test_table_s.update_item(Key={'p': p}, + UpdateExpression='SET a.b = :val1, a.c.d = :val2, b.x = :val3 REMOVE a.x, b.y', + ExpressionAttributeValues={':val1': 10, ':val2': 'dog', ':val3': 17}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == { + 'p': p, + 'a': {'y': 4, 'b': 10, 'c': {'y': 3, 'd': 'dog'}}, + 'b': {'x': 17}} # In the previous test we saw that *modifying* the same item twice in the same # update is forbidden; But it is allowed to *read* an item in the same update @@ -776,10 +824,19 @@ def test_update_expression_dot_in_name(test_table_s): ExpressionAttributeNames={'#a': 'a.b'}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a.b': 3} + +# Below we have several tests of what happens when a nested attribute is +# on the left-hand side of an assignment, but an every simpler case of +# nested attributes is having one 
on the right hand side of an assignment: +def test_update_expression_nested_attribute_rhs(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': {'b': 3, 'c': {'x': 7, 'y': 8}}, 'd': 5}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET z = a.c.x') + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7 + # A basic test for direct update of a nested attribute: One of the top-level # attributes is itself a document, and we update only one of that document's # nested attributes. -@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_nested_attribute_dot(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 3, 'c': 4}, 'd': 5}) @@ -795,7 +852,6 @@ def test_update_expression_nested_attribute_dot(test_table_s): # Similar test, for a list: one of the top-level attributes is a list, we # can update one of its items. -@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_nested_attribute_index(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': ['one', 'two', 'three']}) @@ -817,7 +873,6 @@ def test_update_expression_nested_attribute_index_reference(test_table_s): # Test that just like happens in top-level attributes, also in nested # attributes, setting them replaces the old value - potentially an entire # nested document, by the whole value (which may have a different type) -@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_nested_different_type(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 3, 'c': {'one': 1, 'two': 2}}}) @@ -827,7 +882,6 @@ def test_update_expression_nested_different_type(test_table_s): # Yet another test of a nested attribute update. This one uses deeper # level of nesting (dots and indexes), adds #name references to the mix. 
-@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_nested_deep(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 3, 'c': ['hi', {'x': {'y': [3, 5, 7]}}]}}) @@ -841,20 +895,55 @@ def test_update_expression_nested_deep(test_table_s): # A REMOVE operation can be used to remove nested attributes, and also # individual list items. -@pytest.mark.xfail(reason="nested updates not yet implemented") def test_update_expression_nested_remove(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': {'b': 3, 'c': ['hi', {'x': {'y': [3, 5, 7]}, 'q': 2}]}}) test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.c[1].x.y[1], a.c[1].q') assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == {'b': 3, 'c': ['hi', {'x': {'y': [3, 7]}}]} +# Removing a list item beyond the end of the list (e.g., REMOVE a[17] when +# the list only has three items) is silently ignored. +def test_update_expression_nested_remove_list_item_after_end(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': [4, 5, 6]}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a[17]') + +# If we remove a[1] and then change a[3], the index "3" refers to the position +# *before* the first removal. 
+def test_update_expression_nested_remove_list_item_original_number(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': [2, 3, 4, 5, 6, 7]}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a[1] SET a[3] = :val', + ExpressionAttributeValues={':val': 17}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == [2, 4, 17, 6, 7] + # The order of the operations doesn't matter + test_table_s.put_item(Item={'p': p, 'a': [2, 3, 4, 5, 6, 7]}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a[3] = :val REMOVE a[1]', + ExpressionAttributeValues={':val': 17}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == [2, 4, 17, 6, 7] + +# DynamoDB allows an empty map. So removing the only member from a map leaves +# behind an empty map. +def test_update_expression_nested_remove_singleton_map(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': {'b': 1}}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.b') + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == {} + +# DynamoDB allows an empty list. So removing the only member from a list leaves +# behind an empty list. +def test_update_expression_nested_remove_singleton_list(test_table_s): + p = random_string() + test_table_s.put_item(Item={'p': p, 'a': [1]}) + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a[0]') + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == [] + # The DynamoDB documentation specifies: "When you use SET to update a list # element, the contents of that element are replaced with the new data that # you specify. If the element does not already exist, SET will append the # new element at the end of the list." # So if we take a three-element list a[7], and set a[7], the new element # will be put at the end of the list, not position 7 specifically. 
-@pytest.mark.xfail(reason="nested updates not yet implemented") def test_nested_attribute_update_array_out_of_bounds(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': ['one', 'two', 'three']}) @@ -864,9 +953,9 @@ def test_nested_attribute_update_array_out_of_bounds(test_table_s): # The DynamoDB documentation also says: "If you add multiple elements # in a single SET operation, the elements are sorted in order by element # number. - test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a[84] = :val1, a[37] = :val2', - ExpressionAttributeValues={':val1': 'a1', ':val2': 'a2'}) - assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': ['one', 'two', 'three', 'hello', 'a2', 'a1']} + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a[84] = :val1, a[37] = :val2, a[17] = :val3, a[50] = :val4', + ExpressionAttributeValues={':val1': 'a1', ':val2': 'a2', ':val3': 'a3', ':val4': 'a4'}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': ['one', 'two', 'three', 'hello', 'a3', 'a2', 'a4', 'a1']} # Test what happens if we try to write to a.b, which would only make sense if # a were a nested document, but a doesn't exist, or exists and is NOT a nested @@ -875,9 +964,6 @@ def test_nested_attribute_update_array_out_of_bounds(test_table_s): # ClientError: An error occurred (ValidationException) when calling the # UpdateItem operation: The document path provided in the update expression # is invalid for update -# Because Scylla doesn't read before write, it cannot detect this as an error, -# so we'll probably want to allow for that possibility as well. 
-@pytest.mark.xfail(reason="nested updates not yet implemented") def test_nested_attribute_update_bad_path_dot(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': 'hello', 'b': ['hi']}) @@ -890,17 +976,56 @@ def test_nested_attribute_update_bad_path_dot(test_table_s): with pytest.raises(ClientError, match='ValidationException.*path'): test_table_s.update_item(Key={'p': p}, UpdateExpression='SET c.c = :val1', ExpressionAttributeValues={':val1': 7}) + # Same errors for "remove" operation. + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.c') + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b.c') + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c.c') + # Same error when the item doesn't exist at all: + p = random_string() + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET c.c = :val1', + ExpressionAttributeValues={':val1': 7}) + # HOWEVER, although it *is* allowed to remove a random path from a non- + # existent item. I don't know why. See the next test - + # test_nested_attribute_remove_from_missing_item +# Though in the above test (test_nested_attribute_update_bad_path_dot) we +# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't +# exist - and generates a ValidationException, it turns out that if the +# entire item doesn't exist, then a REMOVE x.y is silently ignored. +# I don't understand why they did this. 
+@pytest.mark.xfail(reason="for unknown reason, DynamoDB allows REMOVE x.y when item doesn't exist") +def test_nested_attribute_remove_from_missing_item(test_table_s): + p = random_string() + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y') + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]') # Similarly for other types of bad paths - using [0] on something which -# isn't an array, -@pytest.mark.xfail(reason="nested updates not yet implemented") +# doesn't exist or isn't an array. def test_nested_attribute_update_bad_path_array(test_table_s): p = random_string() test_table_s.put_item(Item={'p': p, 'a': 'hello'}) with pytest.raises(ClientError, match='ValidationException.*path'): test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a[0] = :val1', ExpressionAttributeValues={':val1': 7}) + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET b[0] = :val1', + ExpressionAttributeValues={':val1': 7}) + # Same errors for "remove" operation. + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a[0]') + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[0]') + # Same error when the item doesn't exist at all: + p = random_string() + with pytest.raises(ClientError, match='ValidationException.*path'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET b[0] = :val1', + ExpressionAttributeValues={':val1': 7}) + # HOWEVER, although it *is* allowed to remove a random path from a non- + # existent item. I don't know why... See test_nested_attribute_remove_from_missing_item # DynamoDB Does not allow empty sets. 
# Trying to ask UpdateItem to put one of these in an attribute should be @@ -921,4 +1046,27 @@ def test_update_expression_empty_attribute(test_table_s): UpdateExpression='SET d = :v1, e = :v2, f = :v3, g = :v4', ExpressionAttributeValues={':v1': [], ':v2': {}, ':v3': '', ':v4': b''}) assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'd': [], 'e': {}, 'f': '', 'g': b''} -# + +# Verify which kind of update operations require a read-before-write (a.k.a +# read-modify-write, or RMW). We test this by using a table configured with +# "forbid_rmw" isolation mode and checking which writes succeed or fail. +# This is a Scylla-only test (the test_table_s_forbid_rmw implies scylla_only). +def test_update_expression_when_rmw(test_table_s_forbid_rmw): + table = test_table_s_forbid_rmw + p = random_string() + # A write with a RHS (right-hand side) being a constant from the query + # doesn't need RMW: + table.update_item(Key={'p': p}, + UpdateExpression='SET a = :val', + ExpressionAttributeValues={':val': 3}) + # But if the LHS (left-hand side) of the assignment is a document path, + # it *does* need RMW: + with pytest.raises(ClientError, match='ValidationException.*write isolation policy'): + table.update_item(Key={'p': p}, + UpdateExpression='SET a.b = :val', + ExpressionAttributeValues={':val': 3}) + assert table.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 3 + # A write with a path in the RHS of a SET also needs RMW + with pytest.raises(ClientError, match='ValidationException.*write isolation policy'): + table.update_item(Key={'p': p}, + UpdateExpression='SET b = a')