// (commit note) get_live_endpoints matches the same method on the proxy
// side. Since the forward service carries a proxy reference, it can use
// its method (which needs to be made public for that sake).
// Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
/*
 * Copyright (C) 2015-present ScyllaDB
 *
 * Modified by ScyllaDB
 */

/*
 * SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
 */
|
|
|
|
#pragma once
|
|
|
|
#include <variant>
|
|
#include "replica/database_fwd.hh"
|
|
#include "data_dictionary/data_dictionary.hh"
|
|
#include "message/messaging_service_fwd.hh"
|
|
#include "query-request.hh"
|
|
#include "query-result.hh"
|
|
#include "query-result-set.hh"
|
|
#include "inet_address_vectors.hh"
|
|
#include <seastar/core/distributed.hh>
|
|
#include <seastar/core/execution_stage.hh>
|
|
#include <seastar/core/scheduling_specific.hh>
|
|
#include "db/consistency_level_type.hh"
|
|
#include "db/read_repair_decision.hh"
|
|
#include "db/write_type.hh"
|
|
#include "db/hints/manager.hh"
|
|
#include "db/view/view_update_backlog.hh"
|
|
#include "db/view/node_view_update_backlog.hh"
|
|
#include "utils/histogram.hh"
|
|
#include "utils/estimated_histogram.hh"
|
|
#include "tracing/trace_state.hh"
|
|
#include <seastar/core/metrics.hh>
|
|
#include <seastar/rpc/rpc_types.hh>
|
|
#include "storage_proxy_stats.hh"
|
|
#include "cache_temperature.hh"
|
|
#include "service_permit.hh"
|
|
#include "service/client_state.hh"
|
|
#include "cdc/stats.hh"
|
|
#include "locator/abstract_replication_strategy.hh"
|
|
#include "db/hints/host_filter.hh"
|
|
#include "utils/small_vector.hh"
|
|
#include "service/endpoint_lifecycle_subscriber.hh"
|
|
#include <seastar/core/circular_buffer.hh>
|
|
#include "query_ranges_to_vnodes.hh"
|
|
#include "partition_range_compat.hh"
|
|
#include "exceptions/exceptions.hh"
|
|
#include "exceptions/coordinator_result.hh"
|
|
|
|
class reconcilable_result;
|
|
class frozen_mutation_and_schema;
|
|
class frozen_mutation;
|
|
|
|
namespace seastar::rpc {
|
|
|
|
template <typename... T>
|
|
class tuple;
|
|
|
|
}
|
|
|
|
namespace compat {
|
|
|
|
class one_or_two_partition_ranges;
|
|
|
|
}
|
|
|
|
namespace cdc {
|
|
class cdc_service;
|
|
}
|
|
|
|
namespace gms {
|
|
class gossiper;
|
|
class feature_service;
|
|
}
|
|
|
|
namespace service {
|
|
|
|
namespace paxos {
|
|
class prepare_summary;
|
|
class proposal;
|
|
class promise;
|
|
using prepare_response = std::variant<utils::UUID, promise>;
|
|
}
|
|
|
|
class abstract_write_response_handler;
|
|
class paxos_response_handler;
|
|
class abstract_read_executor;
|
|
class mutation_holder;
|
|
class view_update_write_response_handler;
|
|
struct hint_wrapper;
|
|
|
|
using replicas_per_token_range = std::unordered_map<dht::token_range, std::vector<utils::UUID>>;
|
|
|
|
// Intermediate/aggregate state of a concurrent partition-range read:
// the partial results collected so far plus the replicas that served
// each token range.
struct query_partition_key_range_concurrent_result {
    // Results collected so far, one entry per queried range.
    std::vector<foreign_ptr<lw_shared_ptr<query::result>>> result;
    // Which replicas served each token range; presumably fed back as
    // preferred replicas for follow-up pages — TODO confirm against callers.
    replicas_per_token_range replicas;
};
|
|
|
|
// A node's view-update backlog sample paired with the timestamp at which
// it was observed.
struct view_update_backlog_timestamped {
    db::view::update_backlog backlog;
    // Observation time of `backlog` (api::timestamp_type units).
    api::timestamp_type ts;
};
|
|
|
|
// Strongly-typed boolean flag: whether a write path is allowed to leave
// hints for unreachable replicas (see send_to_endpoint()).
struct allow_hints_tag {};
using allow_hints = bool_class<allow_hints_tag>;
|
|
|
|
struct storage_proxy_coordinator_query_result {
|
|
foreign_ptr<lw_shared_ptr<query::result>> query_result;
|
|
replicas_per_token_range last_replicas;
|
|
db::read_repair_decision read_repair_decision;
|
|
|
|
storage_proxy_coordinator_query_result(foreign_ptr<lw_shared_ptr<query::result>> query_result,
|
|
replicas_per_token_range last_replicas = {},
|
|
db::read_repair_decision read_repair_decision = db::read_repair_decision::NONE)
|
|
: query_result(std::move(query_result))
|
|
, last_replicas(std::move(last_replicas))
|
|
, read_repair_decision(std::move(read_repair_decision)) {
|
|
}
|
|
};
|
|
|
|
class cas_request;
|
|
|
|
class storage_proxy : public seastar::async_sharded_service<storage_proxy>, public peering_sharded_service<storage_proxy>, public service::endpoint_lifecycle_subscriber {
|
|
public:
|
|
// Outcome category carried in replica responses to the coordinator.
enum class error : uint8_t {
    // Operation succeeded.
    NONE,
    // The replica did not answer within the allotted time.
    TIMEOUT,
    // The replica reported a failure.
    FAILURE,
};
|
|
template<typename T = void>
|
|
using result = exceptions::coordinator_result<T>;
|
|
using clock_type = lowres_clock;
|
|
// Construction-time configuration for storage_proxy.
struct config {
    // Filter selecting which endpoints hinted handoff is enabled for.
    db::hints::host_filter hinted_handoff_enabled = {};
    // Prepares the on-disk hints directory before the hints managers use it.
    db::hints::directory_initializer hints_directory_initializer;
    // Amount of memory available to this proxy instance; presumably used
    // for write-throttling thresholds — TODO confirm in the .cc.
    size_t available_memory;
    // smp_service_groups isolating cross-shard traffic of the different
    // operation classes from one another.
    smp_service_group read_smp_service_group = default_smp_service_group();
    smp_service_group write_smp_service_group = default_smp_service_group();
    smp_service_group hints_write_smp_service_group = default_smp_service_group();
    // Write acknowledgments might not be received on the correct shard, and
    // they need a separate smp_service_group to prevent an ABBA deadlock
    // with writes.
    smp_service_group write_ack_smp_service_group = default_smp_service_group();
};
|
|
private:
|
|
|
|
using response_id_type = uint64_t;
|
|
// Move-only RAII wrapper for a registered write-response-handler id.
// NOTE(review): definitions live in the .cc; based on release() returning
// the id, the destructor presumably unregisters the handler from `p`
// unless release() was called — confirm against the implementation.
struct unique_response_handler {
    response_id_type id;
    // The storage_proxy that owns the handler registry.
    storage_proxy& p;
    unique_response_handler(storage_proxy& p_, response_id_type id_);
    unique_response_handler(const unique_response_handler&) = delete;
    unique_response_handler& operator=(const unique_response_handler&) = delete;
    unique_response_handler(unique_response_handler&& x) noexcept;
    unique_response_handler& operator=(unique_response_handler&&) noexcept;
    ~unique_response_handler();
    // Relinquishes ownership and returns the id.
    response_id_type release();
};
|
|
using unique_response_handler_vector = utils::small_vector<unique_response_handler, 1>;
|
|
using response_handlers_map = std::unordered_map<response_id_type, ::shared_ptr<abstract_write_response_handler>>;
|
|
|
|
public:
|
|
static const sstring COORDINATOR_STATS_CATEGORY;
|
|
static const sstring REPLICA_STATS_CATEGORY;
|
|
|
|
using write_stats = storage_proxy_stats::write_stats;
|
|
using stats = storage_proxy_stats::stats;
|
|
using global_stats = storage_proxy_stats::global_stats;
|
|
using cdc_stats = cdc::stats;
|
|
|
|
// Per-request options passed to the coordinator-side read paths.
class coordinator_query_options {
    // Absolute deadline for the request.
    clock_type::time_point _timeout;

public:
    // Admission-control permit held for the duration of the request.
    service_permit permit;
    client_state& cstate;
    tracing::trace_state_ptr trace_state = nullptr;
    // Replicas the caller would like the read sent to (e.g. those that
    // served a previous page of the same query); merely a hint.
    replicas_per_token_range preferred_replicas;
    // If set, overrides the proxy's own read-repair decision.
    std::optional<db::read_repair_decision> read_repair_decision;

    coordinator_query_options(clock_type::time_point timeout,
            service_permit permit_,
            client_state& client_state_,
            tracing::trace_state_ptr trace_state = nullptr,
            replicas_per_token_range preferred_replicas = { },
            std::optional<db::read_repair_decision> read_repair_decision = { })
        : _timeout(timeout)
        , permit(std::move(permit_))
        , cstate(client_state_)
        , trace_state(std::move(trace_state))
        , preferred_replicas(std::move(preferred_replicas))
        , read_repair_decision(read_repair_decision) {
    }

    // Returns the request deadline. The storage_proxy argument is
    // currently unused.
    clock_type::time_point timeout(storage_proxy& sp) const {
        return _timeout;
    }
};
|
|
|
|
using coordinator_query_result = storage_proxy_coordinator_query_result;
|
|
|
|
// Holds a list of endpoints participating in CAS request, for a given
|
|
// consistency level, token, and state of joining/leaving nodes.
|
|
// Endpoints participating in a CAS (Paxos) request for a given token.
struct paxos_participants {
    inet_address_vector_replica_set endpoints;
    // How many participants are required for a quorum (i.e. is it SERIAL or LOCAL_SERIAL).
    size_t required_participants;
    // True if some natural replicas were excluded from `endpoints`,
    // presumably because they are down — TODO confirm in get_paxos_participants().
    bool has_dead_endpoints;
};
|
|
|
|
// Accessors for the cluster feature service.
gms::feature_service& features() noexcept { return _features; }
// noexcept added to match the non-const overload: returning a stored
// reference cannot throw.
const gms::feature_service& features() const noexcept { return _features; }
|
|
|
|
// Accessors for the effective-replication-map factory injected at
// construction time.
locator::effective_replication_map_factory& get_erm_factory() noexcept {
    return _erm_factory;
}

const locator::effective_replication_map_factory& get_erm_factory() const noexcept {
    return _erm_factory;
}
|
|
|
|
locator::token_metadata_ptr get_token_metadata_ptr() const noexcept;
|
|
|
|
query::max_result_size get_max_result_size(const query::partition_slice& slice) const;
|
|
inet_address_vector_replica_set get_live_endpoints(replica::keyspace& ks, const dht::token& token) const;
|
|
|
|
private:
|
|
distributed<replica::database>& _db;
|
|
gms::gossiper& _gossiper;
|
|
const locator::shared_token_metadata& _shared_token_metadata;
|
|
locator::effective_replication_map_factory& _erm_factory;
|
|
// _mm is initialized late in init_messaging_service() due to circular dependency
|
|
shared_ptr<migration_manager> _mm;
|
|
smp_service_group _read_smp_service_group;
|
|
smp_service_group _write_smp_service_group;
|
|
smp_service_group _hints_write_smp_service_group;
|
|
smp_service_group _write_ack_smp_service_group;
|
|
response_id_type _next_response_id;
|
|
response_handlers_map _response_handlers;
|
|
// This buffer hold ids of throttled writes in case resource consumption goes
|
|
// below the threshold and we want to unthrottle some of them. Without this throttled
|
|
// request with dead or slow replica may wait for up to timeout ms before replying
|
|
// even if resource consumption will go to zero. Note that some requests here may
|
|
// be already completed by the point they tried to be unthrottled (request completion does
|
|
// not remove request from the buffer), but this is fine since request ids are unique, so we
|
|
// just skip an entry if request no longer exists.
|
|
circular_buffer<response_id_type> _throttled_writes;
|
|
db::hints::resource_manager _hints_resource_manager;
|
|
db::hints::manager _hints_manager;
|
|
db::hints::directory_initializer _hints_directory_initializer;
|
|
db::hints::manager _hints_for_views_manager;
|
|
scheduling_group_key _stats_key;
|
|
storage_proxy_stats::global_stats _global_stats;
|
|
gms::feature_service& _features;
|
|
netw::messaging_service& _messaging;
|
|
static constexpr float CONCURRENT_SUBREQUESTS_MARGIN = 0.10;
|
|
// for read repair chance calculation
|
|
std::default_random_engine _urandom;
|
|
std::uniform_real_distribution<> _read_repair_chance = std::uniform_real_distribution<>(0,1);
|
|
seastar::metrics::metric_groups _metrics;
|
|
uint64_t _background_write_throttle_threahsold;
|
|
inheriting_concrete_execution_stage<
|
|
future<result<>>,
|
|
storage_proxy*,
|
|
std::vector<mutation>,
|
|
db::consistency_level,
|
|
clock_type::time_point,
|
|
tracing::trace_state_ptr,
|
|
service_permit,
|
|
bool,
|
|
lw_shared_ptr<cdc::operation_result_tracker>> _mutate_stage;
|
|
netw::connection_drop_slot_t _connection_dropped;
|
|
netw::connection_drop_registration_t _condrop_registration;
|
|
db::view::node_update_backlog& _max_view_update_backlog;
|
|
std::unordered_map<gms::inet_address, view_update_backlog_timestamped> _view_update_backlogs;
|
|
|
|
//NOTICE(sarna): This opaque pointer is here just to avoid moving write handler class definitions from .cc to .hh. It's slow path.
|
|
class view_update_handlers_list;
|
|
std::unique_ptr<view_update_handlers_list> _view_update_handlers_list;
|
|
|
|
/* This is a pointer to the shard-local part of the sharded cdc_service:
|
|
* storage_proxy needs access to cdc_service to augument mutations.
|
|
*
|
|
* It is a pointer and not a reference since cdc_service must be initialized after storage_proxy,
|
|
* because it uses storage_proxy to perform pre-image queries, for one thing.
|
|
* Therefore, at the moment of initializing storage_proxy, we don't have access to cdc_service yet.
|
|
*
|
|
* Furthermore, storage_proxy must keep the service object alive while augmenting mutations
|
|
* (storage_proxy is never deintialized, and even if it would be, it would be after deinitializing cdc_service).
|
|
* Thus cdc_service inherits from enable_shared_from_this and storage_proxy code must remember to call
|
|
* shared_from_this().
|
|
*
|
|
* Eventual deinitialization of cdc_service is enabled by cdc_service::stop setting this pointer to nullptr.
|
|
*/
|
|
cdc::cdc_service* _cdc = nullptr;
|
|
|
|
cdc_stats _cdc_stats;
|
|
private:
|
|
future<result<coordinator_query_result>> query_singular(lw_shared_ptr<query::read_command> cmd,
|
|
dht::partition_range_vector&& partition_ranges,
|
|
db::consistency_level cl,
|
|
coordinator_query_options optional_params);
|
|
response_id_type register_response_handler(shared_ptr<abstract_write_response_handler>&& h);
|
|
void remove_response_handler(response_id_type id);
|
|
void remove_response_handler_entry(response_handlers_map::iterator entry);
|
|
void got_response(response_id_type id, gms::inet_address from, std::optional<db::view::update_backlog> backlog);
|
|
void got_failure_response(response_id_type id, gms::inet_address from, size_t count, std::optional<db::view::update_backlog> backlog, error err, std::optional<sstring> msg);
|
|
future<result<>> response_wait(response_id_type id, clock_type::time_point timeout);
|
|
::shared_ptr<abstract_write_response_handler>& get_write_response_handler(storage_proxy::response_id_type id);
|
|
response_id_type create_write_response_handler_helper(schema_ptr s, const dht::token& token,
|
|
std::unique_ptr<mutation_holder> mh, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state,
|
|
service_permit permit);
|
|
response_id_type create_write_response_handler(replica::keyspace& ks, db::consistency_level cl, db::write_type type, std::unique_ptr<mutation_holder> m, inet_address_vector_replica_set targets,
|
|
const inet_address_vector_topology_change& pending_endpoints, inet_address_vector_topology_change, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats, service_permit permit);
|
|
response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
response_id_type create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, std::optional<mutation>>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
response_id_type create_write_response_handler(const std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& proposal,
|
|
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
response_id_type create_write_response_handler(const std::tuple<lw_shared_ptr<paxos::proposal>, schema_ptr, dht::token, inet_address_vector_replica_set>& meta,
|
|
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
void register_cdc_operation_result_tracker(const storage_proxy::unique_response_handler_vector& ids, lw_shared_ptr<cdc::operation_result_tracker> tracker);
|
|
void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout);
|
|
template<typename Range>
|
|
size_t hint_to_dead_endpoints(std::unique_ptr<mutation_holder>& mh, const Range& targets, db::write_type type, tracing::trace_state_ptr tr_state) noexcept;
|
|
void hint_to_dead_endpoints(response_id_type, db::consistency_level);
|
|
template<typename Range>
|
|
bool cannot_hint(const Range& targets, db::write_type type) const;
|
|
bool hints_enabled(db::write_type type) const noexcept;
|
|
db::hints::manager& hints_manager_for(db::write_type type);
|
|
static void sort_endpoints_by_proximity(inet_address_vector_replica_set& eps);
|
|
inet_address_vector_replica_set get_live_sorted_endpoints(replica::keyspace& ks, const dht::token& token) const;
|
|
db::read_repair_decision new_read_repair_decision(const schema& s);
|
|
::shared_ptr<abstract_read_executor> get_read_executor(lw_shared_ptr<query::read_command> cmd,
|
|
schema_ptr schema,
|
|
dht::partition_range pr,
|
|
db::consistency_level cl,
|
|
db::read_repair_decision repair_decision,
|
|
tracing::trace_state_ptr trace_state,
|
|
const inet_address_vector_replica_set& preferred_endpoints,
|
|
bool& is_bounced_read,
|
|
service_permit permit);
|
|
future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr,
|
|
query::result_options opts,
|
|
tracing::trace_state_ptr trace_state,
|
|
clock_type::time_point timeout);
|
|
future<rpc::tuple<query::result_digest, api::timestamp_type, cache_temperature>> query_result_local_digest(schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr,
|
|
tracing::trace_state_ptr trace_state,
|
|
clock_type::time_point timeout,
|
|
query::digest_algorithm da);
|
|
future<result<coordinator_query_result>> query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
|
|
dht::partition_range_vector partition_ranges,
|
|
db::consistency_level cl,
|
|
coordinator_query_options optional_params);
|
|
static inet_address_vector_replica_set intersection(const inet_address_vector_replica_set& l1, const inet_address_vector_replica_set& l2);
|
|
future<result<query_partition_key_range_concurrent_result>> query_partition_key_range_concurrent(clock_type::time_point timeout,
|
|
std::vector<foreign_ptr<lw_shared_ptr<query::result>>>&& results,
|
|
lw_shared_ptr<query::read_command> cmd,
|
|
db::consistency_level cl,
|
|
query_ranges_to_vnodes_generator&& ranges_to_vnodes,
|
|
int concurrency_factor,
|
|
tracing::trace_state_ptr trace_state,
|
|
uint64_t remaining_row_count,
|
|
uint32_t remaining_partition_count,
|
|
replicas_per_token_range preferred_replicas,
|
|
service_permit permit);
|
|
|
|
future<result<coordinator_query_result>> do_query(schema_ptr,
|
|
lw_shared_ptr<query::read_command> cmd,
|
|
dht::partition_range_vector&& partition_ranges,
|
|
db::consistency_level cl,
|
|
coordinator_query_options optional_params);
|
|
future<coordinator_query_result> do_query_with_paxos(schema_ptr,
|
|
lw_shared_ptr<query::read_command> cmd,
|
|
dht::partition_range_vector&& partition_ranges,
|
|
db::consistency_level cl,
|
|
coordinator_query_options optional_params);
|
|
template<typename Range, typename CreateWriteHandler>
|
|
future<unique_response_handler_vector> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, service_permit permit, CreateWriteHandler handler);
|
|
template<typename Range>
|
|
future<unique_response_handler_vector> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
future<result<>> mutate_begin(unique_response_handler_vector ids, db::consistency_level cl, tracing::trace_state_ptr trace_state, std::optional<clock_type::time_point> timeout_opt = { });
|
|
future<result<>> mutate_end(future<result<>> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state);
|
|
future<result<>> schedule_repair(std::unordered_map<dht::token, std::unordered_map<gms::inet_address, std::optional<mutation>>> diffs, db::consistency_level cl, tracing::trace_state_ptr trace_state, service_permit permit);
|
|
bool need_throttle_writes() const;
|
|
void unthrottle();
|
|
void handle_read_error(std::variant<exceptions::coordinator_exception_container, std::exception_ptr> failure, bool range);
|
|
template<typename Range>
|
|
future<result<>> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional<clock_type::time_point> timeout_opt = { }, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker = { });
|
|
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_nonsingular_mutations_locally(
|
|
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state,
|
|
clock_type::time_point timeout);
|
|
future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>> query_nonsingular_data_locally(
|
|
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, query::result_options opts,
|
|
tracing::trace_state_ptr trace_state, clock_type::time_point timeout);
|
|
|
|
future<> mutate_counters_on_leader(std::vector<frozen_mutation_and_schema> mutations, db::consistency_level cl, clock_type::time_point timeout,
|
|
tracing::trace_state_ptr trace_state, service_permit permit);
|
|
future<> mutate_counter_on_leader_and_replicate(const schema_ptr& s, frozen_mutation m, db::consistency_level cl, clock_type::time_point timeout,
|
|
tracing::trace_state_ptr trace_state, service_permit permit);
|
|
|
|
gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl);
|
|
|
|
future<result<>> do_mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker);
|
|
|
|
future<> send_to_endpoint(
|
|
std::unique_ptr<mutation_holder> m,
|
|
gms::inet_address target,
|
|
inet_address_vector_topology_change pending_endpoints,
|
|
db::write_type type,
|
|
tracing::trace_state_ptr tr_state,
|
|
write_stats& stats,
|
|
allow_hints allow_hints = allow_hints::yes);
|
|
|
|
db::view::update_backlog get_view_update_backlog() const;
|
|
|
|
void maybe_update_view_backlog_of(gms::inet_address, std::optional<db::view::update_backlog>);
|
|
|
|
db::view::update_backlog get_backlog_of(gms::inet_address) const;
|
|
|
|
template<typename Range>
|
|
future<> mutate_counters(Range&& mutations, db::consistency_level cl, tracing::trace_state_ptr tr_state, service_permit permit, clock_type::time_point timeout);
|
|
|
|
void retire_view_response_handlers(noncopyable_function<bool(const abstract_write_response_handler&)> filter_fun);
|
|
void connection_dropped(gms::inet_address);
|
|
private:
|
|
future<> handle_counter_mutation(const rpc::client_info& cinfo, rpc::opt_time_point t, std::vector<frozen_mutation> fms, db::consistency_level cl, std::optional<tracing::trace_info> trace_info);
|
|
future<rpc::no_wait_type> handle_write(netw::msg_addr src_addr, rpc::opt_time_point t,
|
|
utils::UUID schema_version, auto in, inet_address_vector_replica_set forward, gms::inet_address reply_to,
|
|
unsigned shard, storage_proxy::response_id_type response_id, std::optional<tracing::trace_info> trace_info,
|
|
auto&& apply_fn, auto&& forward_fn);
|
|
future<rpc::no_wait_type> receive_mutation_handler (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, inet_address_vector_replica_set forward,
|
|
gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info);
|
|
future<rpc::no_wait_type> handle_paxos_learn(const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision,
|
|
inet_address_vector_replica_set forward, gms::inet_address reply_to, unsigned shard,
|
|
storage_proxy::response_id_type response_id, std::optional<tracing::trace_info> trace_info);
|
|
future<rpc::no_wait_type> handle_mutation_done(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<db::view::update_backlog> backlog);
|
|
future<rpc::no_wait_type> handle_mutation_failed(const rpc::client_info& cinfo, unsigned shard, storage_proxy::response_id_type response_id, size_t num_failed, rpc::optional<db::view::update_backlog> backlog);
|
|
future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>> handle_read_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional<query::digest_algorithm> oda);
|
|
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> handle_read_mutation_data(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr);
|
|
future<rpc::tuple<query::result_digest, long, cache_temperature>> handle_read_digest(const rpc::client_info& cinfo, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional<query::digest_algorithm> oda);
|
|
future<> handle_truncate(rpc::opt_time_point timeout, sstring ksname, sstring cfname);
|
|
future<foreign_ptr<std::unique_ptr<service::paxos::prepare_response>>> handle_paxos_prepare(const rpc::client_info& cinfo, rpc::opt_time_point timeout,
|
|
query::read_command cmd, partition_key key, utils::UUID ballot, bool only_digest, query::digest_algorithm da,
|
|
std::optional<tracing::trace_info> trace_info);
|
|
future<bool> handle_paxos_accept(const rpc::client_info& cinfo, rpc::opt_time_point timeout, paxos::proposal proposal,
|
|
std::optional<tracing::trace_info> trace_info);
|
|
future<rpc::no_wait_type> handle_paxos_prune(const rpc::client_info& cinfo, rpc::opt_time_point timeout,
|
|
utils::UUID schema_id, partition_key key, utils::UUID ballot, std::optional<tracing::trace_info> trace_info);
|
|
public:
|
|
storage_proxy(distributed<replica::database>& db, gms::gossiper& gossiper, config cfg, db::view::node_update_backlog& max_view_update_backlog,
|
|
scheduling_group_key stats_key, gms::feature_service& feat, const locator::shared_token_metadata& stm, locator::effective_replication_map_factory& erm_factory, netw::messaging_service& ms);
|
|
~storage_proxy();
|
|
// Accessors for the sharded database service.
// noexcept added for consistency with local_db(): returning a stored
// reference cannot throw.
const distributed<replica::database>& get_db() const noexcept {
    return _db;
}

distributed<replica::database>& get_db() noexcept {
    return _db;
}
|
|
// Shortcut to this shard's database instance (read-only view).
const replica::database& local_db() const noexcept {
    return _db.local();
}
|
|
|
|
const data_dictionary::database data_dictionary() const;
|
|
|
|
// Shortcut to this shard's database instance.
replica::database& local_db() noexcept {
    return _db.local();
}
|
|
|
|
// The gossiper injected at construction time.
gms::gossiper& gossiper() noexcept {
    return _gossiper;
}
|
|
|
|
// Installs (or clears, with nullptr) the shard-local CDC service pointer;
// see the lifetime discussion on the _cdc member.
void set_cdc_service(cdc::cdc_service* cdc) {
    _cdc = cdc;
}
// May return nullptr if the CDC service is not (or no longer) set up.
cdc::cdc_service* get_cdc_service() const {
    return _cdc;
}
|
|
|
|
// The list of in-flight view-update write handlers (opaque type; defined
// in the .cc, see the note on _view_update_handlers_list).
view_update_handlers_list& get_view_update_handlers_list() {
    return *_view_update_handlers_list;
}
|
|
|
|
// Allocates the next response-handler id, never handing out 0.
response_id_type get_next_response_id() {
    response_id_type next;
    do {
        next = _next_response_id++;
    } while (next == 0); // 0 is reserved for unique_response_handler
    return next;
}
|
|
void init_messaging_service(shared_ptr<migration_manager>);
|
|
future<> uninit_messaging_service();
|
|
|
|
private:
|
|
// Applies mutation on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp);
|
|
// Applies mutation on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
|
|
smp_service_group smp_grp);
|
|
// Applies mutations on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp);
|
|
|
|
public:
|
|
// Applies mutation on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
// Applies mutation on this node.
// Resolves with timed_out_error when timeout is reached.
// Convenience overload delegating to the private variant with the
// default write smp_service_group.
future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
    return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group);
}
|
|
// Applies mutation on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
// Applies mutation on this node.
// Resolves with timed_out_error when timeout is reached.
// Convenience overload delegating to the private variant with the
// default write smp_service_group.
future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
    return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group);
}
|
|
// Applies mutations on this node.
|
|
// Resolves with timed_out_error when timeout is reached.
|
|
future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
|
|
|
|
future<> mutate_hint(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
|
|
|
|
/**
|
|
* Use this method to have these Mutations applied
|
|
* across all replicas. This method will take care
|
|
* of the possibility of a replica being down and hint
|
|
* the data across to some other replica.
|
|
*
|
|
* @param mutations the mutations to be applied across the replicas
|
|
* @param consistency_level the consistency level for the operation
|
|
* @param tr_state trace state handle
|
|
*/
|
|
future<> mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false);
|
|
|
|
/**
|
|
* See mutate. Does the same, but returns some exceptions
|
|
* through the result<>, which allows for efficient inspection
|
|
* of the exception on the exception handling path.
|
|
*/
|
|
future<result<>> mutate_result(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false);
|
|
|
|
paxos_participants
|
|
get_paxos_participants(const sstring& ks_name, const dht::token& token, db::consistency_level consistency_for_paxos);
|
|
|
|
future<> replicate_counter_from_leader(mutation m, db::consistency_level cl, tracing::trace_state_ptr tr_state,
|
|
clock_type::time_point timeout, service_permit permit);
|
|
|
|
future<result<>> mutate_with_triggers(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout,
|
|
bool should_mutate_atomically, tracing::trace_state_ptr tr_state, service_permit permit, bool raw_counters = false);
|
|
|
|
/**
|
|
* See mutate. Adds additional steps before and after writing a batch.
|
|
* Before writing the batch (but after doing availability check against the FD for the row replicas):
|
|
* write the entire batch to a batchlog elsewhere in the cluster.
|
|
* After: remove the batchlog entry (after writing hints for the batch rows, if necessary).
|
|
*
|
|
* @param mutations the Mutations to be applied across the replicas
|
|
* @param consistency_level the consistency level for the operation
|
|
* @param tr_state trace state handle
|
|
*/
|
|
future<> mutate_atomically(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
|
|
/**
|
|
* See mutate_atomically. Does the same, but returns some exceptions
|
|
* through the result<>, which allows for efficient inspection
|
|
* of the exception on the exception handling path.
|
|
*/
|
|
future<result<>> mutate_atomically_result(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit);
|
|
|
|
future<> send_hint_to_all_replicas(frozen_mutation_and_schema fm_a_s);
|
|
|
|
// Send a mutation to one specific remote target.
// Inspired by Cassandra's StorageProxy.sendToHintedEndpoints but without
// hinted handoff support, and just one target. See also
// send_to_live_endpoints() - another take on the same original function.
future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, inet_address_vector_topology_change pending_endpoints, db::write_type type,
        tracing::trace_state_ptr tr_state, write_stats& stats, allow_hints allow_hints = allow_hints::yes);
// Overload that does not take an explicit write_stats reference.
future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, inet_address_vector_topology_change pending_endpoints, db::write_type type,
        tracing::trace_state_ptr tr_state, allow_hints allow_hints = allow_hints::yes);

// Send a mutation to a specific remote target as a hint.
// Unlike regular mutations during write operations, hints are sent on the streaming connection
// and use different RPC verb.
future<> send_hint_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target);
|
|
|
|
/**
 * Performs the truncate operation, which effectively deletes all data from
 * the column family cfname
 * @param keyspace
 * @param cfname
 */
future<> truncate_blocking(sstring keyspace, sstring cfname);
|
|
|
|
/*
 * Executes data query on the whole cluster.
 *
 * Partitions for each range will be ordered according to decorated_key ordering. Results for
 * each range from "partition_ranges" may appear in any order.
 *
 * Will consider the preferred_replicas provided by the caller when selecting the replicas to
 * send read requests to. However this is merely a hint and it is not guaranteed that the read
 * requests will be sent to all or any of the listed replicas. After the query is done the list
 * of replicas that served it is also returned.
 *
 * IMPORTANT: Not all fibers started by this method have to be done by the time it returns so no
 * parameter can be changed after being passed to this method.
 */
future<coordinator_query_result> query(schema_ptr,
        lw_shared_ptr<query::read_command> cmd,
        dht::partition_range_vector&& partition_ranges,
        db::consistency_level cl,
        coordinator_query_options optional_params);
|
|
|
|
/*
 * Like query(), but is allowed to return some exceptions as a result.
 * The caller must remember to handle them properly.
 */
future<result<coordinator_query_result>> query_result(schema_ptr,
        lw_shared_ptr<query::read_command> cmd,
        dht::partition_range_vector&& partition_ranges,
        db::consistency_level cl,
        coordinator_query_options optional_params);
|
|
|
|
// Executes a mutation-level read on the local node (per the name), for a
// single partition range, returning the reconcilable result together with
// the table's cache temperature.
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
        schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range&,
        clock_type::time_point timeout,
        tracing::trace_state_ptr trace_state = nullptr);

// Same as above, for the compat type holding one or possibly two partition
// ranges (see ::compat::one_or_two_partition_ranges).
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
        schema_ptr, lw_shared_ptr<query::read_command> cmd, const ::compat::one_or_two_partition_ranges&,
        clock_type::time_point timeout,
        tracing::trace_state_ptr trace_state = nullptr);

// Same as above, for an arbitrary vector of partition ranges.
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
        schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector& pr,
        clock_type::time_point timeout,
        tracing::trace_state_ptr trace_state = nullptr);
|
|
|
|
// Executes a lightweight transaction (Paxos-based compare-and-swap).
// cl_for_paxos, cl_for_learn, write_timeout and cas_timeout have the
// semantics documented on paxos_response_handler's corresponding members.
// The returned bool presumably indicates whether the request was applied
// (NOTE(review): inferred from the return type — confirm in the definition).
future<bool> cas(schema_ptr schema, shared_ptr<cas_request> request, lw_shared_ptr<query::read_command> cmd,
        dht::partition_range_vector partition_ranges, coordinator_query_options query_options,
        db::consistency_level cl_for_paxos, db::consistency_level cl_for_learn,
        clock_type::time_point write_timeout, clock_type::time_point cas_timeout, bool write = true);

// Builds the batchlog entry mutation covering the given mutations,
// identified by "id" and serialized with the given batchlog "version".
mutation get_batchlog_mutation_for(const std::vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now);
|
|
|
|
future<> stop();
// Hints manager lifecycle: start it, then separately allow hint replay
// (split into two steps by design — see db/hints/manager.hh).
future<> start_hints_manager();
void allow_replaying_hints() noexcept;
future<> drain_on_shutdown();

// Replaces/queries the filter that decides which hosts hints may be
// stored for.
future<> change_hints_host_filter(db::hints::host_filter new_filter);
const db::hints::host_filter& get_hints_host_filter() const;

// Creates a sync point covering hints currently queued for the given hosts;
// wait_for_hint_sync_point() resolves when that point is reached or the
// deadline expires (see db::hints::sync_point).
future<db::hints::sync_point> create_hint_sync_point(const std::vector<gms::inet_address> target_hosts) const;
future<> wait_for_hint_sync_point(const db::hints::sync_point spoint, clock_type::time_point deadline);
|
|
|
|
// Statistics specific to the scheduling group the caller currently runs in
// (looked up via the per-scheduling-group key _stats_key).
const stats& get_stats() const {
    return scheduling_group_get_specific<storage_proxy_stats::stats>(_stats_key);
}
stats& get_stats() {
    return scheduling_group_get_specific<storage_proxy_stats::stats>(_stats_key);
}
// Statistics not tied to a particular scheduling group.
const global_stats& get_global_stats() const {
    return _global_stats;
}
global_stats& get_global_stats() {
    return _global_stats;
}
// CDC (change data capture) statistics.
const cdc_stats& get_cdc_stats() const {
    return _cdc_stats;
}
cdc_stats& get_cdc_stats() {
    return _cdc_stats;
}

// Key used to look up the per-scheduling-group stats (see get_stats()).
scheduling_group_key get_stats_key() const {
    return _stats_key;
}
|
|
|
|
// The shard on which CAS operations for the given token must be executed.
static unsigned cas_shard(const schema& s, dht::token token);

// endpoint_lifecycle_subscriber interface: cluster membership and
// liveness notifications.
virtual void on_join_cluster(const gms::inet_address& endpoint) override;
virtual void on_leave_cluster(const gms::inet_address& endpoint) override;
virtual void on_up(const gms::inet_address& endpoint) override;
virtual void on_down(const gms::inet_address& endpoint) override;
|
|
|
|
// Helper classes that cooperate closely with the proxy (read/write
// response handling, Paxos, mutation dispatch) and need access to its
// private state.
friend class abstract_read_executor;
friend class abstract_write_response_handler;
friend class speculating_read_executor;
friend class view_update_backlog_broker;
friend class view_update_write_response_handler;
friend class paxos_response_handler;
friend class mutation_holder;
friend class per_destination_mutation;
friend class shared_mutation;
friend class hint_mutation;
friend class cas_mutation;
};
|
|
|
|
// A Paxos (AKA Compare And Swap, CAS) protocol involves multiple roundtrips between the coordinator
// and endpoint participants. Some endpoints may be unavailable or slow, and this does not stop the
// protocol progress. paxos_response_handler stores the shared state of the storage proxy associated
// with all the futures associated with a Paxos protocol step (prepare, accept, learn), including
// those outstanding by the time the step ends.
//
class paxos_response_handler : public enable_shared_from_this<paxos_response_handler> {
private:
    shared_ptr<storage_proxy> _proxy;
    // The schema for the table the operation works upon.
    schema_ptr _schema;
    // Read command used by this CAS request.
    lw_shared_ptr<query::read_command> _cmd;
    // SERIAL or LOCAL SERIAL - influences what endpoints become Paxos protocol participants,
    // as well as Paxos quorum size. Is either set explicitly in the query or derived from
    // the value set by SERIAL CONSISTENCY [SERIAL|LOCAL SERIAL] control statement.
    db::consistency_level _cl_for_paxos;
    // QUORUM, LOCAL_QUORUM, etc - defines how many replicas to wait for in LEARN step.
    // Is either set explicitly or derived from the consistency level set in keyspace options.
    db::consistency_level _cl_for_learn;
    // Live endpoints, as per get_paxos_participants()
    inet_address_vector_replica_set _live_endpoints;
    // How many endpoints need to respond favourably for the protocol to progress to the next step.
    size_t _required_participants;
    // A deadline when the entire CAS operation timeout expires, derived from write_request_timeout_in_ms
    storage_proxy::clock_type::time_point _timeout;
    // A deadline when the CAS operation gives up due to contention, derived from cas_contention_timeout_in_ms
    storage_proxy::clock_type::time_point _cas_timeout;
    // The key this request is working on.
    dht::decorated_key _key;
    // service permit from admission control
    service_permit _permit;
    // how many replicas replied to learn
    uint64_t _learned = 0;

    // Unique request id generator.
    static thread_local uint64_t next_id;

    // Unique request id for logging purposes.
    const uint64_t _id = next_id++;

    // max pruning operations to run in parallel
    static constexpr uint16_t pruning_limit = 1000;

public:
    tracing::trace_state_ptr tr_state;

public:
    // Resolves the Paxos participants for the key's token up front and
    // accounts the new operation in the proxy's CAS statistics (balanced
    // by the destructor below).
    paxos_response_handler(shared_ptr<storage_proxy> proxy_arg, tracing::trace_state_ptr tr_state_arg,
            service_permit permit_arg,
            dht::decorated_key key_arg, schema_ptr schema_arg, lw_shared_ptr<query::read_command> cmd_arg,
            db::consistency_level cl_for_paxos_arg, db::consistency_level cl_for_learn_arg,
            storage_proxy::clock_type::time_point timeout_arg, storage_proxy::clock_type::time_point cas_timeout_arg)

        : _proxy(proxy_arg)
        , _schema(std::move(schema_arg))
        , _cmd(cmd_arg)
        , _cl_for_paxos(cl_for_paxos_arg)
        , _cl_for_learn(cl_for_learn_arg)
        , _timeout(timeout_arg)
        , _cas_timeout(cas_timeout_arg)
        , _key(std::move(key_arg))
        , _permit(std::move(permit_arg))
        , tr_state(tr_state_arg) {
        storage_proxy::paxos_participants pp = _proxy->get_paxos_participants(_schema->ks_name(), _key.token(), _cl_for_paxos);
        _live_endpoints = std::move(pp.endpoints);
        _required_participants = pp.required_participants;
        tracing::trace(tr_state, "Create paxos_response_handler for token {} with live: {} and required participants: {}",
                _key.token(), _live_endpoints, _required_participants);
        _proxy->get_stats().cas_foreground++;
        _proxy->get_stats().cas_total_running++;
        _proxy->get_stats().cas_total_operations++;
    }

    ~paxos_response_handler() {
        _proxy->get_stats().cas_total_running--;
    }

    // Result of PREPARE step, i.e. begin_and_repair_paxos().
    struct ballot_and_data {
        // Accepted ballot.
        utils::UUID ballot;
        // Current value of the requested key or none.
        foreign_ptr<lw_shared_ptr<query::result>> data;
    };

    // Steps of the Paxos protocol
    // PREPARE step (see ballot_and_data above); "contentions" is updated with
    // the number of ballot conflicts encountered.
    future<ballot_and_data> begin_and_repair_paxos(client_state& cs, unsigned& contentions, bool is_write);
    // Sends the given ballot to the participants (PREPARE round-trip).
    future<paxos::prepare_summary> prepare_ballot(utils::UUID ballot);
    // ACCEPT step; resolves to whether the proposal was accepted.
    future<bool> accept_proposal(lw_shared_ptr<paxos::proposal> proposal, bool timeout_if_partially_accepted = true);
    // LEARN step: commits the decided proposal to the replicas.
    future<> learn_decision(lw_shared_ptr<paxos::proposal> proposal, bool allow_hints = false);
    // Prunes Paxos state for the given ballot (bounded by pruning_limit).
    void prune(utils::UUID ballot);
    // Unique request id, for logging.
    uint64_t id() const {
        return _id;
    }
    // Number of favourable responses required to proceed to the next step.
    size_t block_for() const {
        return _required_participants;
    }
    schema_ptr schema() const {
        return _schema;
    }
    const partition_key& key() const {
        return _key.key();
    }
    void set_cl_for_learn(db::consistency_level cl) {
        _cl_for_learn = cl;
    }
    // this is called with an id of a replica that replied to learn request
    // and returns true when quorum of such requests are accumulated
    bool learned(gms::inet_address ep);
};
|
|
|
|
// The global sharded storage_proxy instance backing the deprecated
// accessors below.
extern distributed<storage_proxy> _the_storage_proxy;

// DEPRECATED, DON'T USE!
// Pass references to services through constructor/function parameters. Don't use globals.
inline distributed<storage_proxy>& get_storage_proxy() {
    return _the_storage_proxy;
}
|
|
|
|
// DEPRECATED, DON'T USE!
// Pass references to services through constructor/function parameters. Don't use globals.
// Returns the storage_proxy instance of the calling shard.
inline storage_proxy& get_local_storage_proxy() {
    return _the_storage_proxy.local();
}
|
|
|
|
// DEPRECATED, DON'T USE!
// Pass references to services through constructor/function parameters. Don't use globals.
// Like get_local_storage_proxy(), but returns an owning pointer that keeps
// the local instance alive.
inline shared_ptr<storage_proxy> get_local_shared_storage_proxy() {
    return _the_storage_proxy.local_shared();
}
|
|
|
|
}
|