/* * Copyright (C) 2021-present ScyllaDB */ /* * SPDX-License-Identifier: AGPL-3.0-or-later */ #pragma once #include #include #include #include "locator/token_metadata.hh" #include "message/messaging_service_fwd.hh" #include "query-request.hh" #include "replica/database_fwd.hh" namespace tracing { class trace_state_ptr; class trace_info; } namespace service { class storage_proxy; // `mapreduce_service` is a sharded service responsible for distributing and // executing aggregation requests across a cluster. // // To use this service, one needs to express its aggregation query using // `query::mapreduce_request` struct, and call the `dispatch` method with the // previously mentioned struct acting as an argument. What will happen after // calling it, is as follows: // 1. `dispatch` splits aggregation query into sub-queries. The caller of // this method is named a super-coordinator. // 2. Sub-queries are distributed across some group of coordinators. // 3. Each coordinator forwards received sub-query to all its shards. // 3. Each shard executes received sub-query and filters query partition // ranges so that they are contained in the set of partitions owned by // that shard. // 4. Each coordinator merges results produced by its shards and sends merged // result to super-coordinator. // 5. `dispatch` merges results from all coordinators and returns merged // result. // // Splitting query into sub-queries in is implemented as: // a. Partition ranges of the original query are split into a sequence of // vnodes. // b. Each vnode in the sequence is added to a set associated with some // endpoint that holds this vnode (this step can be though of as grouping // vnodes that are held by the same nodes together). // a. For each vnode set created in the previous point, replace partition // ranges of the original query with a partition ranges represented // by the vnode set. This replacement will create a sub-query whose // recipient is the endpoint that holds all vnodes in the set. // // Query splitting example (3 node cluster with num_tokens set to 3): // Original query: mapreduce_request{ // reduction_types=[reduction_type{count}], // cmd=read_command{contents omitted}, // pr={(-inf, +inf)}, // cl=ONE, // timeout(ms)=4864752279 // } // // Token ring: // // start_token | end_token | endpoint // ----------------------+----------------------+----------- // -7656156436523256816 | -6273657286650174294 | 127.0.0.2 // -6273657286650174294 | -885518633547994880 | 127.0.0.2 // -885518633547994880 | -881470678946355457 | 127.0.0.1 // -881470678946355457 | -589668175639820781 | 127.0.0.2 // -589668175639820781 | 1403899953968875783 | 127.0.0.3 // 1403899953968875783 | 6175622851574774197 | 127.0.0.3 // 6175622851574774197 | 7046230184729046062 | 127.0.0.3 // 7046230184729046062 | 7090132112022535426 | 127.0.0.1 // 7090132112022535426 | -7656156436523256816 | 127.0.0.1 // // Created sub-queries: // // mapreduce_request{ // reduction_types=[reduction_type{count}], // cmd=read_command{contents omitted}, // pr={ // (-inf, {-7656156436523256816, end}], // ({-885518633547994880, end}, {-881470678946355457, end}], // ({7046230184729046062, end}, {7090132112022535426, end}], // ({7090132112022535426, end}, +inf) // }, // cl=ONE, // timeout(ms)=4865767688 // } for 127.0.0.1 // // mapreduce_request{ // reduction_types=[reduction_type{count}], // cmd=read_command{contents omitted}, // pr={ // ({-7656156436523256816, end}, {-6273657286650174294, end}], // ({-6273657286650174294, end}, {-885518633547994880, end}], // ({-881470678946355457, end}, {-589668175639820781, end}] // }, // cl=ONE, // timeout(ms)=4865767688 // } for 127.0.0.2 // // mapreduce_request{ // reduction_types=[reduction_type{count}], // cmd=read_command{contents omitted}, // pr={ // ({-589668175639820781, end}, {1403899953968875783, end}], // ({1403899953968875783, end}, {6175622851574774197, end}], // ({6175622851574774197, end}, {7046230184729046062, end}] // }, // cl=ONE, // timeout(ms)=4865767688 // } for 127.0.0.3 // class mapreduce_service : public seastar::peering_sharded_service { netw::messaging_service& _messaging; service::storage_proxy& _proxy; distributed& _db; const locator::shared_token_metadata& _shared_token_metadata; struct stats { uint64_t requests_dispatched_to_other_nodes = 0; uint64_t requests_dispatched_to_own_shards = 0; uint64_t requests_executed = 0; } _stats; seastar::metrics::metric_groups _metrics; optimized_optional _early_abort_subscription; bool _shutdown = false; public: mapreduce_service(netw::messaging_service& ms, service::storage_proxy& p, distributed &db, const locator::shared_token_metadata& stm, abort_source& as) : _messaging(ms) , _proxy(p) , _db(db) , _shared_token_metadata(stm) , _early_abort_subscription(as.subscribe([this] () noexcept { _shutdown = true; })) { register_metrics(); init_messaging_service(); } future<> stop(); // Splits given `mapreduce_request` and distributes execution of resulting // subrequests across a cluster. future dispatch(query::mapreduce_request req, tracing::trace_state_ptr tr_state); private: // Used to distribute given `mapreduce_request` across shards. future dispatch_to_shards(query::mapreduce_request req, std::optional tr_info); // Used to execute a `mapreduce_request` on a shard. future execute_on_this_shard(query::mapreduce_request req, std::optional tr_info); locator::token_metadata_ptr get_token_metadata_ptr() const noexcept; void register_metrics(); void init_messaging_service(); future<> uninit_messaging_service(); friend class retrying_dispatcher; }; } // namespace service