mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 00:20:47 +00:00
Already six years ago, in #5783, we noticed that alternator/executor.cc has grown too large. The previous patches added hundreds of more lines to it to implement vector search, and it reached a whopping 7,000 lines of code. This is too much. This patch splits from executor.cc two major chunks: 1. The implementation of **read** requests - GetItem, BatchGetItem, Query (base table, GSI/LSI, and vector-search), and Scan - was moved to a new source file alternator/executor_read.cc. The new file has 2,000 lines. 2. Moved 250 lines of template functions dealing with attribute paths and maps of them to a new header file, attribute_path.hh. These utilities are used for many different operations - various read operations use them for ProjectionExpression, and UpdateItem uses them for modifications to nested attributes, so we need the new header file from both executor.cc and executor_read.cc The remaining executor.cc is still pretty big, 5,000 lines, and contains write operations (PutItem, UpdateItem, DeleteItem, BatchWriteItem) as well as various table and other operations, and also many utility functions used by many types of operations, so we can later continue this refactoring effort. Refs #5783 Signed-off-by: Nadav Har'El <nyh@scylladb.com>
254 lines
11 KiB
C++
254 lines
11 KiB
C++
/*
|
|
* Copyright 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <map>
|
|
#include <memory>
|
|
#include <optional>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <variant>
|
|
|
|
#include "utils/rjson.hh"
|
|
#include "utils/overloaded_functor.hh"
|
|
#include "alternator/error.hh"
|
|
#include "alternator/expressions_types.hh"
|
|
|
|
namespace alternator {
|
|
|
|
// An attribute_path_map object is used to hold data for various attributes
|
|
// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path
|
|
// has a root attribute, and then modified by member and index operators -
|
|
// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
|
|
// "[2]" index, and finally ".c" member.
|
|
// Data can be added to an attribute_path_map using the add() function, but
|
|
// requires that attributes with data not be *overlapping* or *conflicting*:
|
|
//
|
|
// 1. Two attribute paths which are identical or an ancestor of one another
|
|
// are considered *overlapping* and not allowed. If a.b.c has data,
|
|
// we can't add more data in a.b.c or any of its descendants like a.b.c.d.
|
|
//
|
|
// 2. Two attribute paths which need the same parent to have both a member and
|
|
// an index are considered *conflicting* and not allowed. E.g., if a.b has
|
|
// data, you can't add a[1]. The meaning of adding both would be that the
|
|
// attribute a is both a map and an array, which isn't sensible.
|
|
//
|
|
// These two requirements are common to the two places where Alternator uses
|
|
// this abstraction to describe how a hierarchical item is to be transformed:
|
|
//
|
|
// 1. In ProjectExpression: for filtering from a full top-level attribute
|
|
// only the parts for which user asked in ProjectionExpression.
|
|
//
|
|
// 2. In UpdateExpression: for taking the previous value of a top-level
|
|
// attribute, and modifying it based on the instructions in the user
|
|
// wrote in UpdateExpression.
|
|
|
|
template<typename T>
|
|
class attribute_path_map_node {
|
|
public:
|
|
using data_t = T;
|
|
// We need the extra unique_ptr<> here because libstdc++ unordered_map
|
|
// doesn't work with incomplete types :-(
|
|
using members_t = std::unordered_map<std::string, std::unique_ptr<attribute_path_map_node<T>>>;
|
|
// The indexes list is sorted because DynamoDB requires handling writes
|
|
// beyond the end of a list in index order.
|
|
using indexes_t = std::map<unsigned, std::unique_ptr<attribute_path_map_node<T>>>;
|
|
// The prohibition on "overlap" and "conflict" explained above means
|
|
// That only one of data, members or indexes is non-empty.
|
|
std::optional<std::variant<data_t, members_t, indexes_t>> _content;
|
|
|
|
bool is_empty() const { return !_content; }
|
|
bool has_value() const { return _content && std::holds_alternative<data_t>(*_content); }
|
|
bool has_members() const { return _content && std::holds_alternative<members_t>(*_content); }
|
|
bool has_indexes() const { return _content && std::holds_alternative<indexes_t>(*_content); }
|
|
// get_members() assumes that has_members() is true
|
|
members_t& get_members() { return std::get<members_t>(*_content); }
|
|
const members_t& get_members() const { return std::get<members_t>(*_content); }
|
|
indexes_t& get_indexes() { return std::get<indexes_t>(*_content); }
|
|
const indexes_t& get_indexes() const { return std::get<indexes_t>(*_content); }
|
|
T& get_value() { return std::get<T>(*_content); }
|
|
const T& get_value() const { return std::get<T>(*_content); }
|
|
};
|
|
|
|
template<typename T>
|
|
using attribute_path_map = std::unordered_map<std::string, attribute_path_map_node<T>>;
|
|
|
|
using attrs_to_get_node = attribute_path_map_node<std::monostate>;
|
|
// attrs_to_get lists which top-level attribute are needed, and possibly also
|
|
// which part of the top-level attribute is really needed (when nested
|
|
// attribute paths appeared in the query).
|
|
// Most code actually uses optional<attrs_to_get>. There, a disengaged
|
|
// optional means we should get all attributes, not specific ones.
|
|
using attrs_to_get = attribute_path_map<std::monostate>;
|
|
|
|
// takes a given JSON value and drops its parts which weren't asked to be
|
|
// kept. It modifies the given JSON value, or returns false to signify that
|
|
// the entire object should be dropped.
|
|
// Note that The JSON value is assumed to be encoded using the DynamoDB
|
|
// conventions - i.e., it is really a map whose key has a type string,
|
|
// and the value is the real object.
|
|
template<typename T>
|
|
bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>& h) {
|
|
if (!val.IsObject() || val.MemberCount() != 1) {
|
|
// This shouldn't happen. We shouldn't have stored malformed objects.
|
|
// But today Alternator does not validate the structure of nested
|
|
// documents before storing them, so this can happen on read.
|
|
throw api_error::internal(format("Malformed value object read: {}", val));
|
|
}
|
|
const char* type = val.MemberBegin()->name.GetString();
|
|
rjson::value& v = val.MemberBegin()->value;
|
|
if (h.has_members()) {
|
|
const auto& members = h.get_members();
|
|
if (type[0] != 'M' || !v.IsObject()) {
|
|
// If v is not an object (dictionary, map), none of the members
|
|
// can match.
|
|
return false;
|
|
}
|
|
rjson::value newv = rjson::empty_object();
|
|
for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
|
|
std::string attr = rjson::to_string(it->name);
|
|
auto x = members.find(attr);
|
|
if (x != members.end()) {
|
|
if (x->second) {
|
|
// Only a part of this attribute is to be filtered, do it.
|
|
if (hierarchy_filter(it->value, *x->second)) {
|
|
// because newv started empty and attr are unique
|
|
// (keys of v), we can use add() here
|
|
rjson::add_with_string_name(newv, attr, std::move(it->value));
|
|
}
|
|
} else {
|
|
// The entire attribute is to be kept
|
|
rjson::add_with_string_name(newv, attr, std::move(it->value));
|
|
}
|
|
}
|
|
}
|
|
if (newv.MemberCount() == 0) {
|
|
return false;
|
|
}
|
|
v = newv;
|
|
} else if (h.has_indexes()) {
|
|
const auto& indexes = h.get_indexes();
|
|
if (type[0] != 'L' || !v.IsArray()) {
|
|
return false;
|
|
}
|
|
rjson::value newv = rjson::empty_array();
|
|
const auto& a = v.GetArray();
|
|
for (unsigned i = 0; i < v.Size(); i++) {
|
|
auto x = indexes.find(i);
|
|
if (x != indexes.end()) {
|
|
if (x->second) {
|
|
if (hierarchy_filter(a[i], *x->second)) {
|
|
rjson::push_back(newv, std::move(a[i]));
|
|
}
|
|
} else {
|
|
// The entire attribute is to be kept
|
|
rjson::push_back(newv, std::move(a[i]));
|
|
}
|
|
}
|
|
}
|
|
if (newv.Size() == 0) {
|
|
return false;
|
|
}
|
|
v = newv;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Add a path to an attribute_path_map. Throws a validation error if the path
|
|
// "overlaps" with one already in the filter (one is a sub-path of the other)
|
|
// or "conflicts" with it (both a member and index is requested).
|
|
template<typename T>
|
|
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const parsed::path& p, T value = {}) {
|
|
using node = attribute_path_map_node<T>;
|
|
// The first step is to look for the top-level attribute (p.root()):
|
|
auto it = map.find(p.root());
|
|
if (it == map.end()) {
|
|
if (p.has_operators()) {
|
|
it = map.emplace(p.root(), node {std::nullopt}).first;
|
|
} else {
|
|
(void) map.emplace(p.root(), node {std::move(value)}).first;
|
|
// Value inserted for top-level node. We're done.
|
|
return;
|
|
}
|
|
} else if(!p.has_operators()) {
|
|
// If p is top-level and we already have it or a part of it
|
|
// in map, it's a forbidden overlapping path.
|
|
throw api_error::validation(fmt::format(
|
|
"Invalid {}: two document paths overlap at {}", source, p.root()));
|
|
} else if (it->second.has_value()) {
|
|
// If we're here, it != map.end() && p.has_operators && it->second.has_value().
|
|
// This means the top-level attribute already has a value, and we're
|
|
// trying to add a non-top-level value. It's an overlap.
|
|
throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root()));
|
|
}
|
|
node* h = &it->second;
|
|
// The second step is to walk h from the top-level node to the inner node
|
|
// where we're supposed to insert the value:
|
|
for (const auto& op : p.operators()) {
|
|
std::visit(overloaded_functor {
|
|
[&] (const std::string& member) {
|
|
if (h->is_empty()) {
|
|
*h = node {typename node::members_t()};
|
|
} else if (h->has_indexes()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
|
|
} else if (h->has_value()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
typename node::members_t& members = h->get_members();
|
|
auto it = members.find(member);
|
|
if (it == members.end()) {
|
|
it = members.insert({member, std::make_unique<node>()}).first;
|
|
}
|
|
h = it->second.get();
|
|
},
|
|
[&] (unsigned index) {
|
|
if (h->is_empty()) {
|
|
*h = node {typename node::indexes_t()};
|
|
} else if (h->has_members()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
|
|
} else if (h->has_value()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
typename node::indexes_t& indexes = h->get_indexes();
|
|
auto it = indexes.find(index);
|
|
if (it == indexes.end()) {
|
|
it = indexes.insert({index, std::make_unique<node>()}).first;
|
|
}
|
|
h = it->second.get();
|
|
}
|
|
}, op);
|
|
}
|
|
// Finally, insert the value in the node h.
|
|
if (h->is_empty()) {
|
|
*h = node {std::move(value)};
|
|
} else {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
}
|
|
|
|
// A very simplified version of the above function for the special case of
|
|
// adding only top-level attribute. It's not only simpler, we also use a
|
|
// different error message, referring to a "duplicate attribute" instead of
|
|
// "overlapping paths". DynamoDB also has this distinction (errors in
|
|
// AttributesToGet refer to duplicates, not overlaps, but errors in
|
|
// ProjectionExpression refer to overlap - even if it's an exact duplicate).
|
|
template<typename T>
|
|
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const std::string& attr, T value = {}) {
|
|
using node = attribute_path_map_node<T>;
|
|
auto it = map.find(attr);
|
|
if (it == map.end()) {
|
|
map.emplace(attr, node {std::move(value)});
|
|
} else {
|
|
throw api_error::validation(fmt::format(
|
|
"Invalid {}: Duplicate attribute: {}", source, attr));
|
|
}
|
|
}
|
|
|
|
} // namespace alternator
|