/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (C) 2015 ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "database.hh"
#include "query-request.hh"
#include "query-result.hh"
#include "query-result-set.hh"
#include "core/distributed.hh"
#include "db/consistency_level_type.hh"
#include "db/read_repair_decision.hh"
#include "db/write_type.hh"
#include "db/hints/manager.hh"
#include "utils/histogram.hh"
#include "utils/estimated_histogram.hh"
#include "tracing/trace_state.hh"
#include <seastar/core/execution_stage.hh>
#include "frozen_mutation.hh"
#include "db/config.hh"
#include "storage_proxy_stats.hh"

namespace compat {

class one_or_two_partition_ranges;

}

namespace service {

class abstract_write_response_handler;
class abstract_read_executor;
class mutation_holder;

using replicas_per_token_range = std::unordered_map<dht::token_range, std::vector<utils::UUID>>;

class storage_proxy : public seastar::async_sharded_service<storage_proxy> /*implements StorageProxyMBean*/ {
public:
    using clock_type = lowres_clock;
    struct config {
        stdx::optional<std::vector<sstring>> hinted_handoff_enabled = {};
        size_t available_memory;
    };
private:
    struct rh_entry {
        ::shared_ptr<abstract_write_response_handler> handler;
        timer<clock_type> expire_timer;
        rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
    };

    using response_id_type = uint64_t;
    struct unique_response_handler {
        response_id_type id;
        storage_proxy& p;
        unique_response_handler(storage_proxy& p_, response_id_type id_);
        unique_response_handler(const unique_response_handler&) = delete;
        unique_response_handler& operator=(const unique_response_handler&) = delete;
        unique_response_handler(unique_response_handler&& x);
        ~unique_response_handler();
        response_id_type release();
    };

public:
    static const sstring COORDINATOR_STATS_CATEGORY;
    static const sstring REPLICA_STATS_CATEGORY;

    using write_stats = storage_proxy_stats::write_stats;
    using stats = storage_proxy_stats::stats;

    class coordinator_query_options {
        clock_type::time_point _timeout;

    public:
        tracing::trace_state_ptr trace_state = nullptr;
        replicas_per_token_range preferred_replicas;
        stdx::optional<db::read_repair_decision> read_repair_decision;

        coordinator_query_options(clock_type::time_point timeout,
                tracing::trace_state_ptr trace_state = nullptr,
                replicas_per_token_range preferred_replicas = { },
                stdx::optional<db::read_repair_decision> read_repair_decision = { })
            : _timeout(timeout)
            , trace_state(std::move(trace_state))
            , preferred_replicas(std::move(preferred_replicas))
            , read_repair_decision(read_repair_decision) {
        }

        clock_type::time_point timeout(storage_proxy& sp) const {
            return _timeout;
        }
    };

    struct coordinator_query_result {
        foreign_ptr<lw_shared_ptr<query::result>> query_result;
        replicas_per_token_range last_replicas;
        db::read_repair_decision read_repair_decision;

        coordinator_query_result(foreign_ptr<lw_shared_ptr<query::result>> query_result,
                replicas_per_token_range last_replicas = {},
                db::read_repair_decision read_repair_decision = db::read_repair_decision::NONE)
            : query_result(std::move(query_result))
            , last_replicas(std::move(last_replicas))
            , read_repair_decision(std::move(read_repair_decision)) {
        }
    };
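
    // Illustrative sketch (not part of this interface; `previous_result` is an
    // assumed variable holding a prior coordinator_query_result): paging code
    // can feed one page's last_replicas back in as the next page's
    // preferred_replicas, so consecutive pages tend to hit the same replicas:
    //
    //   storage_proxy::coordinator_query_options opts(
    //           storage_proxy::clock_type::now() + std::chrono::seconds(10),
    //           nullptr,                                    // no tracing
    //           std::move(previous_result.last_replicas));  // hint only
    //
    // The preferred set is advisory; the coordinator may still pick other
    // replicas, as the query() comment below explains.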
private:
    distributed<database>& _db;
    response_id_type _next_response_id;
    std::unordered_map<response_id_type, rh_entry> _response_handlers;
    // This buffer holds ids of throttled writes for the case where resource consumption goes
    // below the threshold and we want to unthrottle some of them. Without it, a throttled
    // request with a dead or slow replica may wait for up to the timeout before replying,
    // even if resource consumption drops to zero. Note that some requests here may
    // already be completed by the time we try to unthrottle them (request completion does
    // not remove the request from the buffer), but this is fine since request ids are unique,
    // so we just skip an entry if the request no longer exists.
    circular_buffer<response_id_type> _throttled_writes;
    db::hints::resource_manager _hints_resource_manager;
    stdx::optional<db::hints::manager> _hints_manager;
    db::hints::manager _hints_for_views_manager;
    stats _stats;
    static constexpr float CONCURRENT_SUBREQUESTS_MARGIN = 0.10;
    // for read repair chance calculation
    std::default_random_engine _urandom;
    std::uniform_real_distribution<> _read_repair_chance = std::uniform_real_distribution<>(0, 1);
    seastar::metrics::metric_groups _metrics;
    uint64_t _background_write_throttle_threshold;
    inheriting_concrete_execution_stage<
            future<>,
            storage_proxy*,
            std::vector<mutation>,
            db::consistency_level,
            clock_type::time_point,
            tracing::trace_state_ptr,
            bool> _mutate_stage;
private:
    void uninit_messaging_service();
    future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
            coordinator_query_options optional_params);
    response_id_type register_response_handler(shared_ptr<abstract_write_response_handler>&& h);
    void remove_response_handler(response_id_type id);
    void got_response(response_id_type id, gms::inet_address from);
    void got_failure_response(response_id_type id, gms::inet_address from, size_t count);
    future<> response_wait(response_id_type id, clock_type::time_point timeout);
    ::shared_ptr<abstract_write_response_handler>& get_write_response_handler(storage_proxy::response_id_type id);
    response_id_type create_write_response_handler(keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr<mutation_holder> m,
            std::unordered_set<gms::inet_address> targets, const std::vector<gms::inet_address>& pending_endpoints, std::vector<gms::inet_address>,
            tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats);
    response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
    response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, stdx::optional<mutation>>&, db::consistency_level cl,
            db::write_type type, tracing::trace_state_ptr tr_state);
    void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout);
    template<typename Range>
    size_t hint_to_dead_endpoints(std::unique_ptr<mutation_holder>& mh, const Range& targets, db::write_type type, tracing::trace_state_ptr tr_state) noexcept;
    void hint_to_dead_endpoints(response_id_type, db::consistency_level);
    template<typename Range>
    bool cannot_hint(const Range& targets, db::write_type type);
    bool hints_enabled(db::write_type type) noexcept;
    db::hints::manager& hints_manager_for(db::write_type type);
    std::vector<gms::inet_address> get_live_endpoints(keyspace& ks, const dht::token& token);
    std::vector<gms::inet_address> get_live_sorted_endpoints(keyspace& ks, const dht::token& token);
    db::read_repair_decision new_read_repair_decision(const schema& s);
    ::shared_ptr<abstract_read_executor> get_read_executor(lw_shared_ptr<query::read_command> cmd,
            schema_ptr schema,
            dht::partition_range pr,
            db::consistency_level cl,
            db::read_repair_decision repair_decision,
            tracing::trace_state_ptr trace_state,
            const std::vector<gms::inet_address>& preferred_endpoints);
    future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature> query_result_local(schema_ptr,
            lw_shared_ptr<query::read_command> cmd,
            const dht::partition_range& pr,
            query::result_options opts,
            tracing::trace_state_ptr trace_state,
            clock_type::time_point timeout,
            uint64_t max_size = query::result_memory_limiter::maximum_result_size);
    future<query::result_digest, api::timestamp_type, cache_temperature> query_result_local_digest(schema_ptr,
            lw_shared_ptr<query::read_command> cmd,
            const dht::partition_range& pr,
            tracing::trace_state_ptr trace_state,
            clock_type::time_point timeout,
            query::digest_algorithm da,
            uint64_t max_size = query::result_memory_limiter::maximum_result_size);
    future<coordinator_query_result> query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector partition_ranges,
            db::consistency_level cl,
            coordinator_query_options optional_params);
    float estimate_result_rows_per_range(lw_shared_ptr<query::read_command> cmd, keyspace& ks);
    static std::vector<gms::inet_address> intersection(const std::vector<gms::inet_address>& l1, const std::vector<gms::inet_address>& l2);
    future<std::vector<foreign_ptr<lw_shared_ptr<query::result>>>, replicas_per_token_range> query_partition_key_range_concurrent(
            clock_type::time_point timeout,
            std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
            lw_shared_ptr<query::read_command> cmd,
            db::consistency_level cl,
            dht::partition_range_vector::iterator&& i,
            dht::partition_range_vector&& ranges,
            int concurrency_factor,
            tracing::trace_state_ptr trace_state,
            uint32_t remaining_row_count,
            uint32_t remaining_partition_count,
            replicas_per_token_range preferred_replicas);

    future<coordinator_query_result> do_query(schema_ptr,
            lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
            coordinator_query_options optional_params);
    template<typename Range, typename CreateWriteHandler>
    future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
    template<typename Range>
    future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
    future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl, stdx::optional<clock_type::time_point> timeout_opt = { });
    future<> mutate_end(future<> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state);
    future<> schedule_repair(std::unordered_map<dht::token, std::unordered_map<gms::inet_address, stdx::optional<mutation>>> diffs,
            db::consistency_level cl, tracing::trace_state_ptr trace_state);
    bool need_throttle_writes() const;
    void unthrottle();
    void handle_read_error(std::exception_ptr eptr, bool range);
    template<typename Range>
    future<> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state,
            stdx::optional<clock_type::time_point> timeout_opt = { });
    future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> query_nonsingular_mutations_locally(
            schema_ptr s,
            lw_shared_ptr<query::read_command> cmd,
            const dht::partition_range_vector&& pr,
            tracing::trace_state_ptr trace_state,
            uint64_t max_size,
            clock_type::time_point timeout);

    future<> mutate_counters_on_leader(std::vector<frozen_mutation_and_schema> mutations, db::consistency_level cl,
            clock_type::time_point timeout, tracing::trace_state_ptr trace_state);
    future<> mutate_counter_on_leader_and_replicate(const schema_ptr& s, frozen_mutation m, db::consistency_level cl,
            clock_type::time_point timeout, tracing::trace_state_ptr trace_state);

    gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl);

    future<> do_mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, bool);

    future<> send_to_endpoint(std::unique_ptr<mutation_holder> m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints,
            db::write_type type, write_stats& stats);

public:
    storage_proxy(distributed<database>& db, config cfg);
    ~storage_proxy();
    const distributed<database>& get_db() const {
        return _db;
    }
    distributed<database>& get_db() {
        return _db;
    }

    response_id_type get_next_response_id() {
        auto next = _next_response_id++;
        if (next == 0) { // 0 is reserved for unique_response_handler
            next = _next_response_id++;
        }
        return next;
    }
    void init_messaging_service();

    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(const mutation& m, clock_type::time_point timeout = clock_type::time_point::max());
    // Applies mutation on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, clock_type::time_point timeout = clock_type::time_point::max());
    // Applies mutations on this node.
    // Resolves with timed_out_error when timeout is reached.
    future<> mutate_locally(std::vector<mutation> mutations, clock_type::time_point timeout = clock_type::time_point::max());
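
    // Illustrative sketch (assumptions: `sp` is a storage_proxy& and `m` a
    // fully-built mutation): applying a mutation to this node only, with a
    // one-second deadline. On expiry the future resolves with timed_out_error.
    //
    //   auto timeout = storage_proxy::clock_type::now() + std::chrono::seconds(1);
    //   return sp.mutate_locally(m, timeout);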

    future<> mutate_streaming_mutation(const schema_ptr&, utils::UUID plan_id, const frozen_mutation& m, bool fragmented);

    dht::partition_range_vector get_restricted_ranges(const schema& s, dht::partition_range range);

    /**
     * Use this method to have these Mutations applied
     * across all replicas. This method will take care
     * of the possibility of a replica being down and hint
     * the data across to some other replica.
     *
     * @param mutations the mutations to be applied across the replicas
     * @param consistency_level the consistency level for the operation
     * @param tr_state trace state handle
     */
    future<> mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout,
            tracing::trace_state_ptr tr_state, bool raw_counters = false);
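
    // Illustrative sketch (hypothetical caller; `sp` and `muts` are assumptions):
    // a coordinated write at QUORUM with tracing disabled. Hinting for down
    // replicas happens inside mutate(), as described above.
    //
    //   future<> write_quorum(storage_proxy& sp, std::vector<mutation> muts) {
    //       auto timeout = storage_proxy::clock_type::now() + std::chrono::seconds(10);
    //       return sp.mutate(std::move(muts), db::consistency_level::QUORUM,
    //                        timeout, nullptr);  // nullptr: no trace state
    //   }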

    future<> replicate_counter_from_leader(mutation m, db::consistency_level cl, tracing::trace_state_ptr tr_state,
            clock_type::time_point timeout);

    template<typename Range>
    future<> mutate_counters(Range&& mutations, db::consistency_level cl, tracing::trace_state_ptr tr_state, clock_type::time_point timeout);

    future<> mutate_with_triggers(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout,
            bool should_mutate_atomically, tracing::trace_state_ptr tr_state, bool raw_counters = false);

    /**
     * See mutate. Adds additional steps before and after writing a batch.
     * Before writing the batch (but after doing availability check against the FD for the row replicas):
     *      write the entire batch to a batchlog elsewhere in the cluster.
     * After: remove the batchlog entry (after writing hints for the batch rows, if necessary).
     *
     * @param mutations the Mutations to be applied across the replicas
     * @param consistency_level the consistency level for the operation
     * @param tr_state trace state handle
     */
    future<> mutate_atomically(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout,
            tracing::trace_state_ptr tr_state);
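
    // Illustrative sketch (hypothetical caller; `sp` and `batch` are assumptions):
    // a logged batch. The extra batchlog round trips make this strictly more
    // expensive than mutate(), so it is worth it only when the mutations must be
    // applied atomically.
    //
    //   auto timeout = storage_proxy::clock_type::now() + std::chrono::seconds(30);
    //   return sp.mutate_atomically(std::move(batch), db::consistency_level::QUORUM,
    //                               timeout, nullptr);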

    // Send a mutation to one specific remote target.
    // Inspired by Cassandra's StorageProxy.sendToHintedEndpoints but without
    // hinted handoff support, and just one target. See also
    // send_to_live_endpoints() - another take on the same original function.
    future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints,
            db::write_type type, write_stats& stats);
    future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints,
            db::write_type type);
    future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints,
            db::write_type type, write_stats& stats);
    future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints,
            db::write_type type);

    /**
     * Performs the truncate operation, which effectively deletes all data from
     * the column family cfname
     * @param keyspace
     * @param cfname
     */
    future<> truncate_blocking(sstring keyspace, sstring cfname);

    /*
     * Executes a data query on the whole cluster.
     *
     * Partitions for each range will be ordered according to decorated_key ordering. Results for
     * each range from "partition_ranges" may appear in any order.
     *
     * Will consider the preferred_replicas provided by the caller when selecting the replicas to
     * send read requests to. However, this is merely a hint and it is not guaranteed that the read
     * requests will be sent to all or any of the listed replicas. After the query is done, the list
     * of replicas that served it is also returned.
     *
     * IMPORTANT: Not all fibers started by this method have to be done by the time it returns, so no
     * parameter can be changed after being passed to this method.
     */
    future<coordinator_query_result> query(schema_ptr,
            lw_shared_ptr<query::read_command> cmd,
            dht::partition_range_vector&& partition_ranges,
            db::consistency_level cl,
            coordinator_query_options optional_params);

    future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> query_mutations_locally(
            schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range&,
            clock_type::time_point timeout,
            tracing::trace_state_ptr trace_state = nullptr,
            uint64_t max_size = query::result_memory_limiter::maximum_result_size);

    future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> query_mutations_locally(
            schema_ptr, lw_shared_ptr<query::read_command> cmd, const ::compat::one_or_two_partition_ranges&,
            clock_type::time_point timeout,
            tracing::trace_state_ptr trace_state = nullptr,
            uint64_t max_size = query::result_memory_limiter::maximum_result_size);

    future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> query_mutations_locally(
            schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector& pr,
            clock_type::time_point timeout,
            tracing::trace_state_ptr trace_state = nullptr,
            uint64_t max_size = query::result_memory_limiter::maximum_result_size);

    future<> stop();
    future<> stop_hints_manager();
    future<> start_hints_manager(shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    void allow_replaying_hints() noexcept;

    const stats& get_stats() const {
        return _stats;
    }

    friend class abstract_read_executor;
    friend class abstract_write_response_handler;
    friend class speculating_read_executor;
};

extern distributed<storage_proxy> _the_storage_proxy;

inline distributed<storage_proxy>& get_storage_proxy() {
    return _the_storage_proxy;
}

inline storage_proxy& get_local_storage_proxy() {
    return _the_storage_proxy.local();
}

inline shared_ptr<storage_proxy> get_local_shared_storage_proxy() {
    return _the_storage_proxy.local_shared();
}

dht::partition_range_vector get_restricted_ranges(locator::token_metadata&, const schema&, dht::partition_range);

}
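
// Illustrative sketch (hypothetical read path; `schema`, `cmd` and `ranges` are
// assumptions): resolving the proxy on the local shard and issuing a QUORUM read.
// qr.query_result holds the rows; qr.last_replicas can seed preferred_replicas
// on a follow-up page (see coordinator_query_options above).
//
//   auto& sp = service::get_local_storage_proxy();
//   auto timeout = service::storage_proxy::clock_type::now() + std::chrono::seconds(5);
//   return sp.query(schema, cmd, std::move(ranges), db::consistency_level::QUORUM,
//                   service::storage_proxy::coordinator_query_options(timeout))
//       .then([] (service::storage_proxy::coordinator_query_result qr) {
//           return make_ready_future<>();  // consume qr.query_result here
//       });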