/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "db/consistency_level.hh"
#include
#include
#include
#include "exceptions/exceptions.hh"
#include "core/sstring.hh"
#include "schema.hh"
#include "database.hh"
#include "unimplemented.hh"
#include "db/read_repair_decision.hh"
#include "locator/abstract_replication_strategy.hh"
#include "locator/network_topology_strategy.hh"
#include "utils/fb_utilities.hh"
#include "heat_load_balance.hh"
namespace db {
// Logger instance registered under the name "consistency".
logging::logger cl_logger("consistency");
// Size of a majority of the keyspace's replicas: (replication factor / 2) + 1,
// using integer division.
size_t quorum_for(keyspace& ks) {
    const auto replication_factor = ks.get_replication_strategy().get_replication_factor();
    return replication_factor / 2 + 1;
}
// Quorum size within a single datacenter.
//
// With a network-topology replication strategy the result is
// (replication factor of `dc` / 2) + 1. For any other strategy there is no
// per-datacenter replication factor, so the keyspace-wide quorum_for() is
// returned instead.
size_t local_quorum_for(keyspace& ks, const sstring& dc) {
    using namespace locator;

    auto& rs = ks.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
        // The type tag was checked just above, so the downcast is safe.
        network_topology_strategy* nrs =
            static_cast<network_topology_strategy*>(&rs);

        return (nrs->get_replication_factor(dc) / 2) + 1;
    }

    return quorum_for(ks);
}
// Quorum size for the datacenter this node belongs to, as reported by the
// local snitch for this node's broadcast address.
size_t block_for_local_serial(keyspace& ks) {
    using namespace locator;

    //
    // TODO: Caching the result would avoid repeating these lookups on every
    // call, but it is not straightforward: the snitch's answer for the local
    // host, and the snitch itself, may both change at run time.
    //
    auto& snitch = i_endpoint_snitch::get_local_snitch_ptr();
    auto this_node = utils::fb_utilities::get_broadcast_address();
    const auto local_dc = snitch->get_datacenter(this_node);

    return local_quorum_for(ks, local_dc);
}
// Number of replicas required for EACH_QUORUM.
//
// With a network-topology replication strategy this is the sum of
// local_quorum_for() over every datacenter the strategy reports. For any other
// strategy it is the keyspace-wide quorum_for().
size_t block_for_each_quorum(keyspace& ks) {
    using namespace locator;

    auto& rs = ks.get_replication_strategy();

    if (rs.get_type() == replication_strategy_type::network_topology) {
        // The type tag was checked just above, so the downcast is safe.
        network_topology_strategy* nrs =
            static_cast<network_topology_strategy*>(&rs);

        size_t n = 0;
        for (const auto& dc : nrs->get_datacenters()) {
            n += local_quorum_for(ks, dc);
        }
        return n;
    } else {
        return quorum_for(ks);
    }
}
// Number of replica responses required to satisfy consistency level `cl` in
// keyspace `ks`. An unrecognized level aborts the process.
size_t block_for(keyspace& ks, consistency_level cl) {
    switch (cl) {
    // Levels with a fixed replica count.
    case consistency_level::ANY:
    case consistency_level::ONE:
    case consistency_level::LOCAL_ONE:
        return 1;
    case consistency_level::TWO:
        return 2;
    case consistency_level::THREE:
        return 3;

    // Levels derived from the replication factor.
    case consistency_level::ALL:
        return ks.get_replication_strategy().get_replication_factor();
    case consistency_level::QUORUM:
    case consistency_level::SERIAL:
        return quorum_for(ks);
    case consistency_level::LOCAL_QUORUM:
    case consistency_level::LOCAL_SERIAL:
        return block_for_local_serial(ks);
    case consistency_level::EACH_QUORUM:
        return block_for_each_quorum(ks);

    default:
        abort();
    }
}
// True only for LOCAL_ONE and LOCAL_QUORUM; every other level, including
// LOCAL_SERIAL, yields false.
bool is_datacenter_local(consistency_level l) {
    switch (l) {
    case consistency_level::LOCAL_ONE:
    case consistency_level::LOCAL_QUORUM:
        return true;
    default:
        return false;
    }
}
// True when the local snitch places `endpoint` in the same datacenter as this
// node's broadcast address.
bool is_local(gms::inet_address endpoint) {
    using namespace locator;

    auto& snitch = i_endpoint_snitch::get_local_snitch_ptr();
    auto this_node = utils::fb_utilities::get_broadcast_address();

    const auto own_dc = snitch->get_datacenter(this_node);
    const auto endpoint_dc = snitch->get_datacenter(endpoint);
    return own_dc == endpoint_dc;
}
std::vector
filter_for_query(consistency_level cl,
keyspace& ks,
std::vector live_endpoints,
read_repair_decision read_repair, gms::inet_address* extra, column_family* cf) {
size_t local_count;
if (read_repair == read_repair_decision::GLOBAL) { // take RRD.GLOBAL out of the way
return std::move(live_endpoints);
}
if (read_repair == read_repair_decision::DC_LOCAL || is_datacenter_local(cl)) {
auto it = boost::range::stable_partition(live_endpoints, is_local);
local_count = std::distance(live_endpoints.begin(), it);
if (is_datacenter_local(cl)) {
live_endpoints.erase(it, live_endpoints.end());
}
}
size_t bf = block_for(ks, cl);
if (read_repair == read_repair_decision::DC_LOCAL) {
bf = std::max(block_for(ks, cl), local_count);
}
if (bf >= live_endpoints.size()) { // RRD.DC_LOCAL + CL.LOCAL or CL.ALL
return std::move(live_endpoints);
}
if (cf) {
auto get_hit_rate = [cf] (gms::inet_address ep) -> float {
constexpr float max_hit_rate = 0.999;
auto ht = cf->get_hit_rate(ep);
if (float(ht.rate) < 0) {
return float(ht.rate);
} else if (lowres_clock::now() - ht.last_updated > std::chrono::milliseconds(1000)) {
// if a cache entry is not updates for a while try to send traffic there
// to get more up to date data, mark it updated to not send to much traffic there
cf->set_hit_rate(ep, ht.rate);
return max_hit_rate;
} else {
return std::min(float(ht.rate), max_hit_rate); // calculation below cannot work with hit rate 1
}
};
float ht_max = 0;
float ht_min = 1;
bool old_node = false;
auto epi = boost::copy_range>>(live_endpoints | boost::adaptors::transformed([&] (gms::inet_address ep) {
auto ht = get_hit_rate(ep);
old_node = old_node || ht < 0;
ht_max = std::max(ht_max, ht);
ht_min = std::min(ht_min, ht);
return std::make_pair(ep, ht);
}));
if (!old_node && ht_max - ht_min > 0.01) { // if there is old node or hit rates are close skip calculations
// local node is always first if present (see storage_proxy::get_live_sorted_endpoints)
unsigned local_idx = epi[0].first == utils::fb_utilities::get_broadcast_address() ? 0 : epi.size() + 1;
live_endpoints = miss_equalizing_combination(epi, local_idx, bf, bool(extra));
}
}
if (extra) {
*extra = live_endpoints[bf]; // extra replica for speculation
}
live_endpoints.erase(live_endpoints.begin() + bf, live_endpoints.end());
return std::move(live_endpoints);
}
std::vector filter_for_query(consistency_level cl, keyspace& ks, std::vector& live_endpoints, column_family* cf) {
return filter_for_query(cl, ks, live_endpoints, read_repair_decision::NONE, nullptr, cf);
}
// Reports whether `live_endpoints` contains enough replicas to satisfy
// consistency level `cl` in keyspace `ks`.
//
//  - ANY:          always true.
//  - LOCAL_ONE:    at least one live replica counted by count_local_endpoints().
//  - LOCAL_QUORUM: count_local_endpoints() must reach block_for(ks, cl).
//  - EACH_QUORUM:  with a network-topology strategy, every datacenter entry
//                  from count_per_dc_endpoints() must have at least
//                  local_quorum_for() live replicas; with any other strategy
//                  it is handled like the default case.
//  - otherwise:    the number of live replicas must reach block_for(ks, cl).
bool
is_sufficient_live_nodes(consistency_level cl,
                         keyspace& ks,
                         const std::vector<gms::inet_address>& live_endpoints) {
    using namespace locator;

    switch (cl) {
    case consistency_level::ANY:
        // local hint is acceptable, and local node is always live
        return true;
    case consistency_level::LOCAL_ONE:
        return count_local_endpoints(live_endpoints) >= 1;
    case consistency_level::LOCAL_QUORUM:
        return count_local_endpoints(live_endpoints) >= block_for(ks, cl);
    case consistency_level::EACH_QUORUM:
    {
        auto& rs = ks.get_replication_strategy();

        if (rs.get_type() == replication_strategy_type::network_topology) {
            for (auto& entry : count_per_dc_endpoints(ks, live_endpoints)) {
                if (entry.second.live < local_quorum_for(ks, entry.first)) {
                    return false;
                }
            }

            return true;
        }
    }
        // Fallthrough on purpose for SimpleStrategy
    default:
        return live_endpoints.size() >= block_for(ks, cl);
    }
}
// Rejects consistency levels that cannot be used for reads: ANY and
// EACH_QUORUM throw exceptions::invalid_request_exception; every other level
// is accepted. `keyspace_name` is not consulted.
void validate_for_read(const sstring& keyspace_name, consistency_level cl) {
    if (cl == consistency_level::ANY) {
        throw exceptions::invalid_request_exception("ANY ConsistencyLevel is only supported for writes");
    }
    if (cl == consistency_level::EACH_QUORUM) {
        throw exceptions::invalid_request_exception("EACH_QUORUM ConsistencyLevel is only supported for writes");
    }
}
// Rejects consistency levels that cannot be used for plain writes: SERIAL and
// LOCAL_SERIAL throw exceptions::invalid_request_exception; every other level
// is accepted. `keyspace_name` is not consulted.
void validate_for_write(const sstring& keyspace_name, consistency_level cl) {
    const bool serial = cl == consistency_level::SERIAL
                     || cl == consistency_level::LOCAL_SERIAL;
    if (serial) {
        throw exceptions::invalid_request_exception("You must use conditional updates for serializable writes");
    }
}
// Disabled reference code: everything between #if 0 and #endif is Java, not
// C++, and is never compiled. It shows CAS-related validation
// (validateForCasCommit / validateForCas) that has no C++ counterpart in the
// code visible here.
// NOTE(review): presumably kept from the upstream Java implementation as a
// porting guide - confirm before removing.
#if 0
// This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
public void validateForCasCommit(String keyspaceName) throws InvalidRequestException
{
switch (this)
{
case EACH_QUORUM:
requireNetworkTopologyStrategy(keyspaceName);
break;
case SERIAL:
case LOCAL_SERIAL:
throw new InvalidRequestException(this + " is not supported as conditional update commit consistency. Use ANY if you mean \"make sure it is accepted but I don't care how many replicas commit it for non-SERIAL reads\"");
}
}
public void validateForCas() throws InvalidRequestException
{
if (!isSerialConsistency())
throw new InvalidRequestException("Invalid consistency for conditional update. Must be one of SERIAL or LOCAL_SERIAL");
}
#endif
// True for the two serial consistency levels, SERIAL and LOCAL_SERIAL.
bool is_serial_consistency(consistency_level cl) {
    switch (cl) {
    case consistency_level::SERIAL:
    case consistency_level::LOCAL_SERIAL:
        return true;
    default:
        return false;
    }
}
// Rejects consistency levels that cannot be used for counter writes on the
// table described by `s`: ANY, SERIAL and LOCAL_SERIAL throw
// exceptions::invalid_request_exception; every other level is accepted.
void validate_counter_for_write(schema_ptr s, consistency_level cl) {
    switch (cl) {
    case consistency_level::ANY:
        throw exceptions::invalid_request_exception(sprint("Consistency level ANY is not yet supported for counter table %s", s->cf_name()));
    // The serial levels, i.e. those for which is_serial_consistency() is true.
    case consistency_level::SERIAL:
    case consistency_level::LOCAL_SERIAL:
        throw exceptions::invalid_request_exception("Counter operations are inherently non-serializable");
    default:
        break;
    }
}
// Disabled reference code: everything between #if 0 and #endif is Java, not
// C++, and is never compiled. It shows a check that rejects a consistency
// level when the keyspace's replication strategy is not NetworkTopologyStrategy.
// NOTE(review): presumably kept from the upstream Java implementation as a
// porting guide - confirm before removing.
#if 0
private void requireNetworkTopologyStrategy(String keyspaceName) throws InvalidRequestException
{
AbstractReplicationStrategy strategy = Keyspace.open(keyspaceName).getReplicationStrategy();
if (!(strategy instanceof NetworkTopologyStrategy))
throw new InvalidRequestException(String.format("consistency level %s not compatible with replication strategy (%s)", this, strategy.getClass().getName()));
}
#endif
}