Files
scylladb/service/storage_service.hh

2250 lines
86 KiB
C++

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Modified by ScyllaDB
* Copyright (C) 2015 ScyllaDB
*
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "gms/i_endpoint_state_change_subscriber.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "locator/token_metadata.hh"
#include "gms/gossiper.hh"
#include "utils/UUID_gen.hh"
#include "core/distributed.hh"
#include "dht/i_partitioner.hh"
#include "dht/boot_strapper.hh"
#include "dht/token_range_endpoints.hh"
#include "core/sleep.hh"
#include "gms/application_state.hh"
#include "db/system_keyspace.hh"
#include "core/semaphore.hh"
#include "utils/fb_utilities.hh"
#include "database.hh"
#include "streaming/stream_state.hh"
#include "streaming/stream_plan.hh"
#include <seastar/core/distributed.hh>
#include "disk-error-handler.hh"
#include "gms/feature.hh"
namespace transport {
class cql_server;
}
class thrift_server;
namespace service {
class load_broadcaster;
class storage_service;
extern distributed<storage_service> _the_storage_service;
// Accessor for the global distributed<storage_service> instance
// (_the_storage_service).
inline distributed<storage_service>& get_storage_service() {
    distributed<storage_service>& sharded_instance = _the_storage_service;
    return sharded_instance;
}
// Accessor for the storage_service returned by _the_storage_service.local().
inline storage_service& get_local_storage_service() {
    storage_service& local_instance = _the_storage_service.local();
    return local_instance;
}
int get_generation_number();
// Kind of disk error.
// NOTE(review): presumably `regular` vs. commit-related errors, mirroring
// isolate_on_error() / isolate_on_commit_error() — confirm against the users.
enum class disk_error { regular, commit };
/**
* This abstraction contains the token/identifier of this node
* on the identifier space. This token gets gossiped around.
* This class will also maintain histograms of the load information
* of other nodes in the cluster.
*/
class storage_service : public service::migration_listener, public gms::i_endpoint_state_change_subscriber, public seastar::async_sharded_service<storage_service> {
public:
// Details record for one snapshot; this is the element type of the vectors
// in the map returned by get_snapshot_details().
struct snapshot_details {
// NOTE(review): presumably the "live" size of the snapshot — confirm meaning and units.
int64_t live;
// NOTE(review): presumably the total size of the snapshot — confirm meaning and units.
int64_t total;
// NOTE(review): presumably the column family name — confirm.
sstring cf;
// NOTE(review): presumably the keyspace name — confirm.
sstring ks;
};
private:
using token = dht::token;
using token_range_endpoints = dht::token_range_endpoints;
using endpoint_details = dht::endpoint_details;
using boot_strapper = dht::boot_strapper;
using token_metadata = locator::token_metadata;
using application_state = gms::application_state;
using inet_address = gms::inet_address;
using versioned_value = gms::versioned_value;
#if 0
private static final Logger logger = LoggerFactory.getLogger(StorageService.class);
/* JMX notification serial number counter */
private final AtomicLong notificationSerialNumber = new AtomicLong();
#endif
distributed<database>& _db;
int _update_jobs{0};
// Note that this is obviously only valid for the current shard. Users of
// this facility should elect a shard to be the coordinator based on any
// given objective criteria
//
// It shouldn't be impossible to actively serialize two callers if the need
// ever arises.
bool _loading_new_sstables = false;
shared_ptr<load_broadcaster> _lb;
shared_ptr<distributed<transport::cql_server>> _cql_server;
shared_ptr<distributed<thrift_server>> _thrift_server;
sstring _operation_in_progress;
bool _force_remove_completion = false;
bool _ms_stopped = false;
bool _stream_manager_stopped = false;
public:
storage_service(distributed<database>& db);
void isolate_on_error();
void isolate_on_commit_error();
// Needed by distributed<>
future<> stop();
void init_messaging_service();
void uninit_messaging_service();
private:
void do_update_pending_ranges();
public:
future<> keyspace_changed(const sstring& ks_name);
future<> update_pending_ranges();
// Read-only access to this instance's _token_metadata.
const locator::token_metadata& get_token_metadata() const {
    const locator::token_metadata& metadata = _token_metadata;
    return metadata;
}
// Mutable access to this instance's _token_metadata.
locator::token_metadata& get_token_metadata() {
    locator::token_metadata& metadata = _token_metadata;
    return metadata;
}
future<> gossip_snitch_info();
void set_load_broadcaster(shared_ptr<load_broadcaster> lb);
shared_ptr<load_broadcaster>& get_load_broadcaster();
// Returns the distributed<database> reference held in _db.
distributed<database>& db() {
    distributed<database>& database_ref = _db;
    return database_ref;
}
private:
bool is_auto_bootstrap();
// Returns the address reported by utils::fb_utilities::get_broadcast_address().
inet_address get_broadcast_address() const {
    inet_address broadcast_addr = utils::fb_utilities::get_broadcast_address();
    return broadcast_addr;
}
/* This abstraction maintains the token/endpoint metadata information */
token_metadata _token_metadata;
token_metadata _shadow_token_metadata;
public:
std::chrono::milliseconds get_ring_delay();
gms::versioned_value::factory value_factory;
#if 0
public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
private Thread drainOnShutdown = null;
public static final StorageService instance = new StorageService();
public static IPartitioner getPartitioner()
{
return DatabaseDescriptor.getPartitioner();
}
#endif
public:
// Returns the ranges that get_ranges_for_endpoint() yields for the given
// keyspace and this node's broadcast address.
dht::token_range_vector get_local_ranges(const sstring& keyspace_name) {
    const inet_address local_endpoint = get_broadcast_address();
    return get_ranges_for_endpoint(keyspace_name, local_endpoint);
}
#if 0
public Collection<Range<Token>> getPrimaryRanges(String keyspace)
{
return getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddress());
}
public Collection<Range<Token>> getPrimaryRangesWithinDC(String keyspace)
{
return getPrimaryRangeForEndpointWithinDC(keyspace, FBUtilities.getBroadcastAddress());
}
private CassandraDaemon daemon;
#endif
private:
std::unordered_set<inet_address> _replicating_nodes;
std::experimental::optional<inet_address> _removing_node;
/* Are we starting this node in bootstrap mode? */
// Defaulted to false so that is_bootstrap_mode() never reads an indeterminate
// value before the flag is first assigned.
bool _is_bootstrap_mode = false;
/* we bootstrap but do NOT join the ring unless told to do so */
// FIXME: System.getProperty("cassandra.write_survey", "false")
bool _is_survey_mode = false;
// Defaulted to false for the same reason as _is_bootstrap_mode: the flag must
// hold a defined value from construction onwards.
bool _initialized = false;
bool _joined = false;
public:
// Operation modes of this node; _operation_mode starts out as STARTING.
enum class mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED };
private:
mode _operation_mode = mode::STARTING;
friend std::ostream& operator<<(std::ostream& os, const mode& mode);
#if 0
/* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */
private double traceProbability = 0.0;
#endif
/* Used for tracking drain progress */
public:
/**
 * Counters used for tracking drain progress.
 *
 * Both counters default to zero so that a default-initialised instance is a
 * valid starting point for operator+= (e.g. when summing several instances).
 */
struct drain_progress {
    // NOTE(review): presumably the total number of column families to drain — confirm.
    int32_t total_cfs = 0;
    // NOTE(review): presumably the number of column families still to be drained — confirm.
    int32_t remaining_cfs = 0;

    // Member-wise accumulation of `other` into this object; returns *this.
    drain_progress& operator+=(const drain_progress& other) {
        total_cfs += other.total_cfs;
        remaining_cfs += other.remaining_cfs;
        return *this;
    }
};
private:
drain_progress _drain_progress{};
#if 0
private static final AtomicInteger nextRepairCommand = new AtomicInteger();
#endif
std::vector<endpoint_lifecycle_subscriber*> _lifecycle_subscribers;
#if 0
private static final BackgroundActivityMonitor bgMonitor = new BackgroundActivityMonitor();
private final ObjectName jmxObjectName;
#endif
private:
std::unordered_set<token> _bootstrap_tokens;
gms::feature _range_tombstones_feature;
gms::feature _large_partitions_feature;
gms::feature _materialized_views_feature;
gms::feature _counters_feature;
gms::feature _indexes_feature;
public:
void enable_all_features() {
_range_tombstones_feature.enable();
_large_partitions_feature.enable();
_materialized_views_feature.enable();
_counters_feature.enable();
_indexes_feature.enable();
}
// Clears the bootstrap-mode flag reported by is_bootstrap_mode().
void finish_bootstrapping()
{
    _is_bootstrap_mode = false;
}
/** This method updates the local token on disk */
void set_tokens(std::unordered_set<token> tokens);
void set_gossip_tokens(const std::unordered_set<dht::token>& local_tokens);
#if 0
public void registerDaemon(CassandraDaemon daemon)
{
this.daemon = daemon;
}
#endif
void register_subscriber(endpoint_lifecycle_subscriber* subscriber);
void unregister_subscriber(endpoint_lifecycle_subscriber* subscriber);
// should only be called via JMX
future<> stop_gossiping();
// should only be called via JMX
future<> start_gossiping();
// should only be called via JMX
future<bool> is_gossip_running();
// should only be called via JMX
future<> start_rpc_server();
future<> stop_rpc_server();
future<bool> is_rpc_server_running();
future<> start_native_transport();
future<> stop_native_transport();
future<bool> is_native_transport_running();
private:
future<> do_stop_rpc_server();
future<> do_stop_native_transport();
future<> do_stop_ms();
future<> do_stop_stream_manager();
#if 0
public void stopTransports()
{
if (isInitialized())
{
logger.error("Stopping gossiper");
stopGossiping();
}
if (isRPCServerRunning())
{
logger.error("Stopping RPC server");
stopRPCServer();
}
if (isNativeTransportRunning())
{
logger.error("Stopping native transport");
stopNativeTransport();
}
}
#endif
private:
future<> shutdown_client_servers();
#if 0
public void stopClient()
{
Gossiper.instance.unregister(this);
Gossiper.instance.stop();
MessagingService.instance().shutdown();
// give it a second so that task accepted before the MessagingService shutdown gets submitted to the stage (to avoid RejectedExecutionException)
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
StageManager.shutdownNow();
}
#endif
public:
future<bool> is_initialized();
#if 0
public void stopDaemon()
{
if (daemon == null)
throw new IllegalStateException("No configured daemon");
daemon.deactivate();
}
#endif
public:
future<std::unordered_set<token>> prepare_replacement_info();
future<> check_for_endpoint_collision();
#if 0
// for testing only
public void unsafeInitialize() throws ConfigurationException
{
_initialized = true;
Gossiper.instance.register(this);
Gossiper.instance.start((int) (System.currentTimeMillis() / 1000)); // needed for node-ring gathering.
Gossiper.instance.addLocalApplicationState(ApplicationState.NET_VERSION, valueFactory.networkVersion());
if (!MessagingService.instance().isListening())
MessagingService.instance().listen(FBUtilities.getLocalAddress());
}
#endif
public:
// Convenience overload: forwards to init_server(int) using the tick count of
// get_ring_delay() as the delay.
future<> init_server() {
    const auto ring_delay = get_ring_delay();
    return init_server(ring_delay.count());
}
future<> init_server(int delay);
future<> drain_on_shutdown();
future<> stop_transport();
void flush_column_families();
#if 0
/**
* In the event of forceful termination we need to remove the shutdown hook to prevent hanging (OOM for instance)
*/
public void removeShutdownHook()
{
if (drainOnShutdown != null)
Runtime.getRuntime().removeShutdownHook(drainOnShutdown);
}
#endif
private:
bool should_bootstrap();
void prepare_to_join(std::vector<inet_address> loaded_endpoints);
void join_token_ring(int delay);
public:
future<> join_ring();
bool is_joined();
future<> rebuild(sstring source_dc);
#if 0
public void setStreamThroughputMbPerSec(int value)
{
DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(value);
logger.info("setstreamthroughput: throttle set to {}", value);
}
public int getStreamThroughputMbPerSec()
{
return DatabaseDescriptor.getStreamThroughputOutboundMegabitsPerSec();
}
public int getCompactionThroughputMbPerSec()
{
return DatabaseDescriptor.getCompactionThroughputMbPerSec();
}
public void setCompactionThroughputMbPerSec(int value)
{
DatabaseDescriptor.setCompactionThroughputMbPerSec(value);
}
public boolean isIncrementalBackupsEnabled()
{
return DatabaseDescriptor.isIncrementalBackupsEnabled();
}
public void setIncrementalBackupsEnabled(boolean value)
{
DatabaseDescriptor.setIncrementalBackupsEnabled(value);
}
#endif
private:
void set_mode(mode m, bool log);
void set_mode(mode m, sstring msg, bool log);
public:
void bootstrap(std::unordered_set<token> tokens);
// Reports the current value of the bootstrap-mode flag.
bool is_bootstrap_mode() {
    const bool bootstrapping = _is_bootstrap_mode;
    return bootstrapping;
}
#if 0
public TokenMetadata getTokenMetadata()
{
return _token_metadata;
}
/**
* Increment about the known Compaction severity of the events in this node
*/
public void reportSeverity(double incr)
{
bgMonitor.incrCompactionSeverity(incr);
}
public void reportManualSeverity(double incr)
{
bgMonitor.incrManualSeverity(incr);
}
public double getSeverity(InetAddress endpoint)
{
return bgMonitor.getSeverity(endpoint);
}
/**
* for a keyspace, return the ranges and corresponding listen addresses.
* @param keyspace
* @return the endpoint map
*/
public Map<List<String>, List<String>> getRangeToEndpointMap(String keyspace)
{
/* All the ranges for the tokens */
Map<List<String>, List<String>> map = new HashMap<>();
for (Map.Entry<Range<Token>,List<InetAddress>> entry : getRangeToAddressMap(keyspace).entrySet())
{
map.put(entry.getKey().asList(), stringify(entry.getValue()));
}
return map;
}
#endif
/**
* Return the rpc address associated with an endpoint as a string.
* @param endpoint The endpoint to get rpc address for
* @return the rpc address
*/
sstring get_rpc_address(const inet_address& endpoint) const;
#if 0
/**
* for a keyspace, return the ranges and corresponding RPC addresses for a given keyspace.
* @param keyspace
* @return the endpoint map
*/
public Map<List<String>, List<String>> getRangeToRpcaddressMap(String keyspace)
{
/* All the ranges for the tokens */
Map<List<String>, List<String>> map = new HashMap<>();
for (Map.Entry<Range<Token>, List<InetAddress>> entry : getRangeToAddressMap(keyspace).entrySet())
{
List<String> rpcaddrs = new ArrayList<>(entry.getValue().size());
for (InetAddress endpoint: entry.getValue())
{
rpcaddrs.add(getRpcaddress(endpoint));
}
map.put(entry.getKey().asList(), rpcaddrs);
}
return map;
}
public Map<List<String>, List<String>> getPendingRangeToEndpointMap(String keyspace)
{
// some people just want to get a visual representation of things. Allow null and set it to the first
// non-system keyspace.
if (keyspace == null)
keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
Map<List<String>, List<String>> map = new HashMap<>();
for (Map.Entry<Range<Token>, Collection<InetAddress>> entry : _token_metadata.getPendingRanges(keyspace).entrySet())
{
List<InetAddress> l = new ArrayList<>(entry.getValue());
map.put(entry.getKey().asList(), stringify(l));
}
return map;
}
#endif
std::unordered_map<dht::token_range, std::vector<inet_address>> get_range_to_address_map(const sstring& keyspace) const;
std::unordered_map<dht::token_range, std::vector<inet_address>> get_range_to_address_map_in_local_dc(
const sstring& keyspace) const;
std::vector<token> get_tokens_in_local_dc() const;
bool is_local_dc(const inet_address& targetHost) const;
std::unordered_map<dht::token_range, std::vector<inet_address>> get_range_to_address_map(const sstring& keyspace,
const std::vector<token>& sorted_tokens) const;
/**
* The same as {@code describeRing(String)} but converts TokenRange to the String for JMX compatibility
*
* @param keyspace The keyspace to fetch information about
*
* @return a List of TokenRange(s) converted to String for the given keyspace
*/
/*
* describeRingJMX will be implemented in the API
* It is left here just as a marker that there is no need to implement it
* here
*/
//std::vector<sstring> describeRingJMX(const sstring& keyspace) const {
#if 0
/**
* The same as {@code describeRing(String)} but considers only the part of the ring formed by nodes in the local DC.
*/
public List<TokenRange> describeLocalRing(String keyspace) throws InvalidRequestException
{
return describeRing(keyspace, true);
}
#endif
std::vector<token_range_endpoints> describe_ring(const sstring& keyspace, bool include_only_local_dc = false) const;
/**
* Retrieve a map of tokens to endpoints, including the bootstrapping ones.
*
* @return a map of tokens to endpoints in ascending order
*/
std::map<token, inet_address> get_token_to_endpoint_map();
#if 0
public String getLocalHostId()
{
return getTokenMetadata().getHostId(FBUtilities.getBroadcastAddress()).toString();
}
public Map<String, String> getHostIdMap()
{
Map<String, String> mapOut = new HashMap<>();
for (Map.Entry<InetAddress, UUID> entry : getTokenMetadata().getEndpointToHostIdMapForReading().entrySet())
mapOut.put(entry.getKey().getHostAddress(), entry.getValue().toString());
return mapOut;
}
#endif
/**
* Construct the range to endpoint mapping based on the true view
* of the world.
* @param ranges
* @return mapping of ranges to the replicas responsible for them.
*/
std::unordered_map<dht::token_range, std::vector<inet_address>> construct_range_to_endpoint_map(
const sstring& keyspace,
const dht::token_range_vector& ranges) const;
public:
virtual void on_join(gms::inet_address endpoint, gms::endpoint_state ep_state) override;
virtual void before_change(gms::inet_address endpoint, gms::endpoint_state current_state, gms::application_state new_state_key, const gms::versioned_value& new_value) override;
/*
* Handle the reception of a new particular ApplicationState for a particular endpoint. Note that the value of the
* ApplicationState has not necessarily "changed" since the last known value, if we already received the same update
* from somewhere else.
*
* onChange only ever sees one ApplicationState piece change at a time (even if many ApplicationState updates were
* received at the same time), so we perform a kind of state machine here. We are concerned with two events: knowing
* the token associated with an endpoint, and knowing its operation mode. Nodes can start in either bootstrap or
* normal mode, and from bootstrap mode can change mode to normal. A node in bootstrap mode needs to have
* pendingranges set in TokenMetadata; a node in normal mode should instead be part of the token ring.
*
* Normal progression of ApplicationState.STATUS values for a node should be like this:
* STATUS_BOOTSTRAPPING,token
* if bootstrapping. stays this way until all files are received.
* STATUS_NORMAL,token
* ready to serve reads and writes.
* STATUS_LEAVING,token
* get ready to leave the cluster as part of a decommission
* STATUS_LEFT,token
* set after decommission is completed.
*
* Other STATUS values that may be seen (possibly anywhere in the normal progression):
* STATUS_MOVING,newtoken
* set if node is currently moving to a new token in the ring
* REMOVING_TOKEN,deadtoken
* set if the node is dead and is being removed by its REMOVAL_COORDINATOR
* REMOVED_TOKEN,deadtoken
* set if the node is dead and has been removed by its REMOVAL_COORDINATOR
*
* Note: Any time a node state changes from STATUS_NORMAL, it will not be visible to new nodes. So it follows that
* you should never bootstrap a new node during a removenode, decommission or move.
*/
virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override;
virtual void on_alive(gms::inet_address endpoint, gms::endpoint_state state) override;
virtual void on_dead(gms::inet_address endpoint, gms::endpoint_state state) override;
virtual void on_remove(gms::inet_address endpoint) override;
virtual void on_restart(gms::inet_address endpoint, gms::endpoint_state state) override;
public:
// For migration_listener
//
// Keyspace create/update/drop notifications call keyspace_changed() and wait
// for its future with .get(); every other notification has an empty body here.
// NOTE(review): .get() waits for the future to resolve — presumably these
// callbacks are only invoked from a context where blocking is permitted;
// confirm against the callers.
virtual void on_create_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
virtual void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_create_view(const sstring& ks_name, const sstring& view_name) override {}
virtual void on_update_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
virtual void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
virtual void on_drop_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
virtual void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
private:
void update_peer_info(inet_address endpoint);
void do_update_system_peers_table(gms::inet_address endpoint, const application_state& state, const versioned_value& value);
sstring get_application_state_value(inet_address endpoint, application_state appstate);
std::unordered_set<token> get_tokens_for(inet_address endpoint);
future<> replicate_to_all_cores();
semaphore _replicate_task{1};
private:
/**
* Replicates token_metadata contents on shard0 instance to other shards.
*
* Should be called with a _replicate_task semaphore taken.
* Should run on shard 0 only.
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_only();
/**
* Replicates token_metadata and gossiper::endpoint_state_map contents on
* shard0 instances to other shards.
*
* Should be called with a _replicate_task and a gossiper::timer_callback
* semaphores taken.
* Should run on shard 0 only.
*
* @param g0 a "shared_from_this()" pointer to a gossiper instance on shard0
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_and_ep_map(shared_ptr<gms::gossiper> g0);
/**
* Handle node bootstrap
*
* @param endpoint bootstrapping node
*/
void handle_state_bootstrap(inet_address endpoint);
/**
* Handle node move to normal state. That is, node is entering token ring and participating
* in reads.
*
* @param endpoint node
*/
void handle_state_normal(inet_address endpoint);
/**
* Handle node preparing to leave the ring
*
* @param endpoint node
*/
void handle_state_leaving(inet_address endpoint);
/**
* Handle node leaving the ring. This will happen when a node is decommissioned
*
* @param endpoint If reason for leaving is decommission, endpoint is the leaving node.
* @param pieces STATE_LEFT,token
*/
void handle_state_left(inet_address endpoint, std::vector<sstring> pieces);
/**
* Handle node moving inside the ring.
*
* @param endpoint moving endpoint address
* @param pieces STATE_MOVING, token
*/
void handle_state_moving(inet_address endpoint, std::vector<sstring> pieces);
/**
* Handle notification that a node is being actively removed from the ring via 'removenode'
*
* @param endpoint node
* @param pieces either REMOVED_TOKEN (node is gone) or REMOVING_TOKEN (replicas need to be restored)
*/
void handle_state_removing(inet_address endpoint, std::vector<sstring> pieces);
private:
void excise(std::unordered_set<token> tokens, inet_address endpoint);
void excise(std::unordered_set<token> tokens, inet_address endpoint, long expire_time);
/** unlike excise we just need this endpoint gone without going through any notifications **/
void remove_endpoint(inet_address endpoint);
void add_expire_time_if_found(inet_address endpoint, int64_t expire_time);
int64_t extract_expire_time(const std::vector<sstring>& pieces) {
return std::stoll(pieces[2]);
}
/**
* Finds living endpoints responsible for the given ranges
*
* @param keyspaceName the keyspace ranges belong to
* @param ranges the ranges to find sources for
* @return multimap of addresses to ranges the address is responsible for
*/
std::unordered_multimap<inet_address, dht::token_range> get_new_source_ranges(const sstring& keyspaceName, const dht::token_range_vector& ranges);
public:
future<> confirm_replication(inet_address node);
private:
/**
* Sends a notification to a node indicating we have finished replicating data.
*
* @param remote node to send notification to
*/
future<> send_replication_notification(inet_address remote);
/**
* Called when an endpoint is removed from the ring. This function checks
* whether this node becomes responsible for new ranges as a
* consequence and streams data if needed.
*
* This is rather inefficient, but it does not matter much
* since this is called very seldom
*
* @param endpoint the node that left
*/
future<> restore_replica_count(inet_address endpoint, inet_address notify_endpoint);
// needs to be modified to accept either a keyspace or ARS.
std::unordered_multimap<dht::token_range, inet_address> get_changed_ranges_for_leaving(sstring keyspace_name, inet_address endpoint);
public:
/** raw load value */
double get_load();
sstring get_load_string();
future<std::map<sstring, double>> get_load_map();
#if 0
public final void deliverHints(String host) throws UnknownHostException
{
HintedHandOffManager.instance.scheduleHintDelivery(host);
}
#endif
public:
future<std::unordered_set<dht::token>> get_local_tokens();
#if 0
/* These methods belong to the MBean interface */
public List<String> getTokens()
{
return getTokens(FBUtilities.getBroadcastAddress());
}
public List<String> getTokens(String endpoint) throws UnknownHostException
{
return getTokens(InetAddress.getByName(endpoint));
}
private List<String> getTokens(InetAddress endpoint)
{
List<String> strTokens = new ArrayList<>();
for (Token tok : getTokenMetadata().getTokens(endpoint))
strTokens.add(tok.toString());
return strTokens;
}
#endif
sstring get_release_version();
sstring get_schema_version();
future<std::unordered_map<sstring, std::vector<sstring>>> describe_schema_versions();
#if 0
public List<String> getLeavingNodes()
{
return stringify(_token_metadata.getLeavingEndpoints());
}
public List<String> getMovingNodes()
{
List<String> endpoints = new ArrayList<>();
for (Pair<Token, InetAddress> node : _token_metadata.getMovingEndpoints())
{
endpoints.add(node.right.getHostAddress());
}
return endpoints;
}
public List<String> getJoiningNodes()
{
return stringify(_token_metadata.getBootstrapTokens().valueSet());
}
public List<String> getLiveNodes()
{
return stringify(Gossiper.instance.getLiveMembers());
}
public List<String> getUnreachableNodes()
{
return stringify(Gossiper.instance.getUnreachableMembers());
}
private List<String> stringify(Iterable<InetAddress> endpoints)
{
List<String> stringEndpoints = new ArrayList<>();
for (InetAddress ep : endpoints)
{
stringEndpoints.add(ep.getHostAddress());
}
return stringEndpoints;
}
public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
if (keyspaceName.equals(SystemKeyspace.NAME))
throw new RuntimeException("Cleanup of the system keyspace is neither necessary nor wise");
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.forceCleanup();
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.scrub(disableSnapshot, skipCorrupted);
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, true, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.sstablesRewrite(excludeCurrentVersion);
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
{
cfStore.forceMajorCompaction();
}
}
#endif
/**
 * Takes the snapshot for all keyspaces. A snapshot name must be specified.
 *
 * Forwards to take_snapshot(tag, keyspace_names) with an empty keyspace list.
 *
 * @param tag the tag given to the snapshot; may not be null or empty
 */
future<> take_snapshot(sstring tag) {
    return take_snapshot(tag, std::vector<sstring>());
}
/**
* Takes the snapshot for the given keyspaces. A snapshot name must be specified.
*
* @param tag the tag given to the snapshot; may not be null or empty
* @param keyspaceNames the names of the keyspaces to snapshot; empty means "all."
*/
future<> take_snapshot(sstring tag, std::vector<sstring> keyspace_names);
/**
* Takes the snapshot of a specific column family. A snapshot name must be specified.
*
* @param keyspaceName the keyspace which holds the specified column family
* @param columnFamilyName the column family to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag);
#if 0
private Keyspace getValidKeyspace(String keyspaceName) throws IOException
{
if (!Schema.instance.getKeyspaces().contains(keyspaceName))
{
throw new IOException("Keyspace " + keyspaceName + " does not exist");
}
return Keyspace.open(keyspaceName);
}
#endif
/**
* Remove the snapshot with the given name from the given keyspaces.
* If no tag is specified we will remove all snapshots.
*/
future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
future<std::unordered_map<sstring, std::vector<snapshot_details>>> get_snapshot_details();
future<int64_t> true_snapshots_size();
#if 0
/**
* @param allowIndexes Allow index CF names to be passed in
* @param autoAddIndexes Automatically add secondary indexes if a CF has them
* @param keyspaceName keyspace
* @param cfNames CFs
* @throws java.lang.IllegalArgumentException when given CF name does not exist
*/
public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes, boolean autoAddIndexes, String keyspaceName, String... cfNames) throws IOException
{
Keyspace keyspace = getValidKeyspace(keyspaceName);
Set<ColumnFamilyStore> valid = new HashSet<>();
if (cfNames.length == 0)
{
// all stores are interesting
for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
{
valid.add(cfStore);
if (autoAddIndexes)
{
for (SecondaryIndex si : cfStore.indexManager.getIndexes())
{
if (si.getIndexCfs() != null) {
logger.info("adding secondary index {} to operation", si.getIndexName());
valid.add(si.getIndexCfs());
}
}
}
}
return valid;
}
// filter out interesting stores
for (String cfName : cfNames)
{
//if the CF name is an index, just flush the CF that owns the index
String baseCfName = cfName;
String idxName = null;
if (cfName.contains(".")) // secondary index
{
if(!allowIndexes)
{
logger.warn("Operation not allowed on secondary Index table ({})", cfName);
continue;
}
String[] parts = cfName.split("\\.", 2);
baseCfName = parts[0];
idxName = parts[1];
}
ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(baseCfName);
if (idxName != null)
{
Collection< SecondaryIndex > indexes = cfStore.indexManager.getIndexesByNames(new HashSet<>(Arrays.asList(cfName)));
if (indexes.isEmpty())
logger.warn(String.format("Invalid index specified: %s/%s. Proceeding with others.", baseCfName, idxName));
else
valid.add(Iterables.get(indexes, 0).getIndexCfs());
}
else
{
valid.add(cfStore);
if(autoAddIndexes)
{
for(SecondaryIndex si : cfStore.indexManager.getIndexes())
{
if (si.getIndexCfs() != null) {
logger.info("adding secondary index {} to operation", si.getIndexName());
valid.add(si.getIndexCfs());
}
}
}
}
}
return valid;
}
/**
* Flush all memtables for a keyspace and column families.
* @param keyspaceName
* @param columnFamilies
* @throws IOException
*/
public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException
{
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
{
logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, cfStore.name);
cfStore.forceBlockingFlush();
}
}
/**
* Sends JMX notification to subscribers.
*
* @param type Message type
* @param message Message itself
* @param userObject Arbitrary object to attach to notification
*/
public void sendNotification(String type, String message, Object userObject)
{
Notification jmxNotification = new Notification(type, jmxObjectName, notificationSerialNumber.incrementAndGet(), message);
jmxNotification.setUserData(userObject);
sendNotification(jmxNotification);
}
public int repairAsync(String keyspace, Map<String, String> repairSpec)
{
RepairOption option = RepairOption.parse(repairSpec, getPartitioner());
// if ranges are not specified
if (option.getRanges().isEmpty())
{
if (option.isPrimaryRange())
{
// when repairing only primary range, neither dataCenters nor hosts can be set
if (option.getDataCenters().isEmpty() && option.getHosts().isEmpty())
option.getRanges().addAll(getPrimaryRanges(keyspace));
// except dataCenters only contain local DC (i.e. -local)
else if (option.getDataCenters().size() == 1 && option.getDataCenters().contains(DatabaseDescriptor.getLocalDataCenter()))
option.getRanges().addAll(getPrimaryRangesWithinDC(keyspace));
else
throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
}
else
{
option.getRanges().addAll(getLocalRanges(keyspace));
}
}
return forceRepairAsync(keyspace, option);
}
@Deprecated
public int forceRepairAsync(String keyspace,
boolean isSequential,
Collection<String> dataCenters,
Collection<String> hosts,
boolean primaryRange,
boolean fullRepair,
String... columnFamilies)
{
return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL : RepairParallelism.PARALLEL, dataCenters, hosts, primaryRange, fullRepair, columnFamilies);
}
@Deprecated
public int forceRepairAsync(String keyspace,
RepairParallelism parallelismDegree,
Collection<String> dataCenters,
Collection<String> hosts,
boolean primaryRange,
boolean fullRepair,
String... columnFamilies)
{
if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
{
logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
parallelismDegree = RepairParallelism.PARALLEL;
}
RepairOption options = new RepairOption(parallelismDegree, primaryRange, !fullRepair, false, 1, Collections.<Range<Token>>emptyList());
if (dataCenters != null)
{
options.getDataCenters().addAll(dataCenters);
}
if (hosts != null)
{
options.getHosts().addAll(hosts);
}
if (columnFamilies != null)
{
for (String columnFamily : columnFamilies)
{
options.getColumnFamilies().add(columnFamily);
}
}
return forceRepairAsync(keyspace, options);
}
public int forceRepairAsync(String keyspace,
boolean isSequential,
boolean isLocal,
boolean primaryRange,
boolean fullRepair,
String... columnFamilies)
{
Set<String> dataCenters = null;
if (isLocal)
{
dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
}
return forceRepairAsync(keyspace, isSequential, dataCenters, null, primaryRange, fullRepair, columnFamilies);
}
public int forceRepairRangeAsync(String beginToken,
String endToken,
String keyspaceName,
boolean isSequential,
Collection<String> dataCenters,
Collection<String> hosts,
boolean fullRepair,
String... columnFamilies)
{
return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential ? RepairParallelism.SEQUENTIAL : RepairParallelism.PARALLEL, dataCenters, hosts, fullRepair, columnFamilies);
}
/**
 * Starts a repair restricted to the ranges that createRepairRangeFrom() derives
 * from the two given tokens.
 *
 * @param beginToken        string form of the first token of the range to repair
 * @param endToken          string form of the last token of the range to repair
 * @param keyspaceName      keyspace to repair
 * @param parallelismDegree requested parallelism; forced to PARALLEL on Windows
 * @param dataCenters       data centers to restrict the repair to; may be null
 *                          (the isLocal overload passes null when isLocal is false)
 * @param hosts             hosts to restrict the repair to; may be null
 * @param fullRepair        passed on, negated, as the third RepairOption constructor argument
 * @param columnFamilies    column families to repair; may be null
 * @return the value returned by forceRepairAsync(keyspaceName, options)
 */
public int forceRepairRangeAsync(String beginToken,
                                 String endToken,
                                 String keyspaceName,
                                 RepairParallelism parallelismDegree,
                                 Collection<String> dataCenters,
                                 Collection<String> hosts,
                                 boolean fullRepair,
                                 String... columnFamilies)
{
    if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
    {
        logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
        parallelismDegree = RepairParallelism.PARALLEL;
    }
    Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
    RepairOption options = new RepairOption(parallelismDegree, false, !fullRepair, false, 1, repairingRange);
    // Guard against null exactly like forceRepairAsync() does: addAll(null) would
    // throw a NullPointerException for callers that do not restrict the data centers.
    if (dataCenters != null)
    {
        options.getDataCenters().addAll(dataCenters);
    }
    if (hosts != null)
    {
        options.getHosts().addAll(hosts);
    }
    if (columnFamilies != null)
    {
        for (String columnFamily : columnFamilies)
        {
            options.getColumnFamilies().add(columnFamily);
        }
    }
    logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
                repairingRange, keyspaceName, columnFamilies);
    return forceRepairAsync(keyspaceName, options);
}
public int forceRepairRangeAsync(String beginToken,
String endToken,
String keyspaceName,
boolean isSequential,
boolean isLocal,
boolean fullRepair,
String... columnFamilies)
{
Set<String> dataCenters = null;
if (isLocal)
{
dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
}
return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential, dataCenters, null, fullRepair, columnFamilies);
}
/**
* Create collection of ranges that match ring layout from given tokens.
*
* @param beginToken beginning token of the range
* @param endToken end token of the range
* @return collection of ranges that match ring layout in TokenMetadata
*/
@SuppressWarnings("unchecked")
@VisibleForTesting
Collection<Range<Token>> createRepairRangeFrom(String beginToken, String endToken)
{
Token parsedBeginToken = getPartitioner().getTokenFactory().fromString(beginToken);
Token parsedEndToken = getPartitioner().getTokenFactory().fromString(endToken);
// Break up given range to match ring layout in TokenMetadata
ArrayList<Range<Token>> repairingRange = new ArrayList<>();
ArrayList<Token> tokens = new ArrayList<>(_token_metadata.sortedTokens());
if (!tokens.contains(parsedBeginToken))
{
tokens.add(parsedBeginToken);
}
if (!tokens.contains(parsedEndToken))
{
tokens.add(parsedEndToken);
}
// tokens now contain all tokens including our endpoints
Collections.sort(tokens);
int start = tokens.indexOf(parsedBeginToken), end = tokens.indexOf(parsedEndToken);
for (int i = start; i != end; i = (i+1) % tokens.size())
{
Range<Token> range = new Range<>(tokens.get(i), tokens.get((i+1) % tokens.size()));
repairingRange.add(range);
}
return repairingRange;
}
public int forceRepairAsync(String keyspace, RepairOption options)
{
if (options.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
return 0;
int cmd = nextRepairCommand.incrementAndGet();
new Thread(createRepairTask(cmd, keyspace, options)).start();
return cmd;
}
private Thread createQueryThread(final int cmd, final UUID sessionId)
{
return new Thread(new WrappedRunnable()
{
// Query events within a time interval that overlaps the last by one second. Ignore duplicates. Ignore local traces.
// Wake up upon local trace activity. Query when notified of trace activity with a timeout that doubles every two timeouts.
public void runMayThrow() throws Exception
{
TraceState state = Tracing.instance.get(sessionId);
if (state == null)
throw new Exception("no tracestate");
String format = "select event_id, source, activity from %s.%s where session_id = ? and event_id > ? and event_id < ?;";
String query = String.format(format, TraceKeyspace.NAME, TraceKeyspace.EVENTS);
SelectStatement statement = (SelectStatement) QueryProcessor.parseStatement(query).prepare().statement;
ByteBuffer sessionIdBytes = ByteBufferUtil.bytes(sessionId);
InetAddress source = FBUtilities.getBroadcastAddress();
HashSet<UUID>[] seen = new HashSet[] { new HashSet<UUID>(), new HashSet<UUID>() };
int si = 0;
UUID uuid;
long tlast = System.currentTimeMillis(), tcur;
TraceState.Status status;
long minWaitMillis = 125;
long maxWaitMillis = 1000 * 1024L;
long timeout = minWaitMillis;
boolean shouldDouble = false;
while ((status = state.waitActivity(timeout)) != TraceState.Status.STOPPED)
{
if (status == TraceState.Status.IDLE)
{
timeout = shouldDouble ? Math.min(timeout * 2, maxWaitMillis) : timeout;
shouldDouble = !shouldDouble;
}
else
{
timeout = minWaitMillis;
shouldDouble = false;
}
ByteBuffer tminBytes = ByteBufferUtil.bytes(UUIDGen.minTimeUUID(tlast - 1000));
ByteBuffer tmaxBytes = ByteBufferUtil.bytes(UUIDGen.maxTimeUUID(tcur = System.currentTimeMillis()));
QueryOptions options = QueryOptions.forInternalCalls(ConsistencyLevel.ONE, Lists.newArrayList(sessionIdBytes, tminBytes, tmaxBytes));
ResultMessage.Rows rows = statement.execute(QueryState.forInternalCalls(), options);
UntypedResultSet result = UntypedResultSet.create(rows.result);
for (UntypedResultSet.Row r : result)
{
if (source.equals(r.getInetAddress("source")))
continue;
if ((uuid = r.getUUID("event_id")).timestamp() > (tcur - 1000) * 10000)
seen[si].add(uuid);
if (seen[si == 0 ? 1 : 0].contains(uuid))
continue;
String message = String.format("%s: %s", r.getInetAddress("source"), r.getString("activity"));
sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.RUNNING.ordinal()});
}
tlast = tcur;
si = si == 0 ? 1 : 0;
seen[si].clear();
}
}
});
}
/**
 * Builds the task that executes repair command {@code cmd} for {@code keyspace}.
 *
 * The task announces the start of the repair, optionally starts a trace session
 * with a polling thread, collects the neighbors of every requested range, prepares
 * a parent repair session, submits one repair session per range and, once all
 * sessions are done, finishes the parent session and sends the FINISHED notification.
 *
 * @param cmd      repair command number, included in every notification
 * @param keyspace keyspace to repair
 * @param options  repair options (ranges, data centers, hosts, column families, ...)
 * @return the task wrapping the repair; the caller is responsible for running it
 * @throws IllegalArgumentException if data centers are specified and the local
 *         data center is not one of them
 */
private FutureTask<Object> createRepairTask(final int cmd, final String keyspace, final RepairOption options)
{
    // A repair restricted to specific data centers must include the local one.
    // (The check used to be inverted and rejected exactly the valid requests.)
    if (!options.getDataCenters().isEmpty() && !options.getDataCenters().contains(DatabaseDescriptor.getLocalDataCenter()))
    {
        throw new IllegalArgumentException("the local data center must be part of the repair");
    }
    return new FutureTask<>(new WrappedRunnable()
    {
        protected void runMayThrow() throws Exception
        {
            final TraceState traceState;
            String[] columnFamilies = options.getColumnFamilies().toArray(new String[options.getColumnFamilies().size()]);
            Iterable<ColumnFamilyStore> validColumnFamilies = getValidColumnFamilies(false, false, keyspace, columnFamilies);
            final long startTime = System.currentTimeMillis();
            String message = String.format("Starting repair command #%d, repairing keyspace %s with %s", cmd, keyspace, options);
            logger.info(message);
            sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.STARTED.ordinal()});
            if (options.isTraced())
            {
                StringBuilder cfsb = new StringBuilder();
                for (ColumnFamilyStore cfs : validColumnFamilies)
                    cfsb.append(", ").append(cfs.keyspace.getName()).append(".").append(cfs.name);
                UUID sessionId = Tracing.instance.newSession(Tracing.TraceType.REPAIR);
                traceState = Tracing.instance.begin("repair", ImmutableMap.of("keyspace", keyspace, "columnFamilies", cfsb.substring(2)));
                Tracing.traceRepair(message);
                traceState.enableActivityNotification();
                traceState.setNotificationHandle(new int[]{ cmd, ActiveRepairService.Status.RUNNING.ordinal() });
                Thread queryThread = createQueryThread(cmd, sessionId);
                queryThread.setName("RepairTracePolling");
                queryThread.start();
            }
            else
            {
                traceState = null;
            }
            // Resolve the neighbors of every range up front; any failure aborts the whole command.
            final Set<InetAddress> allNeighbors = new HashSet<>();
            Map<Range, Set<InetAddress>> rangeToNeighbors = new HashMap<>();
            for (Range<Token> range : options.getRanges())
            {
                try
                {
                    Set<InetAddress> neighbors = ActiveRepairService.getNeighbors(keyspace, range, options.getDataCenters(), options.getHosts());
                    rangeToNeighbors.put(range, neighbors);
                    allNeighbors.addAll(neighbors);
                }
                catch (IllegalArgumentException e)
                {
                    logger.error("Repair failed:", e);
                    sendNotification("repair", e.getMessage(), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
                    return;
                }
            }
            // Validate columnfamilies
            List<ColumnFamilyStore> columnFamilyStores = new ArrayList<>();
            try
            {
                Iterables.addAll(columnFamilyStores, validColumnFamilies);
            }
            catch (IllegalArgumentException e)
            {
                sendNotification("repair", e.getMessage(), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
                return;
            }
            final UUID parentSession;
            long repairedAt;
            try
            {
                parentSession = ActiveRepairService.instance.prepareForRepair(allNeighbors, options, columnFamilyStores);
                repairedAt = ActiveRepairService.instance.getParentRepairSession(parentSession).repairedAt;
            }
            catch (Throwable t)
            {
                sendNotification("repair", String.format("Repair failed with error %s", t.getMessage()), new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
                return;
            }
            // Set up RepairJob executor for this repair command.
            final ListeningExecutorService executor = MoreExecutors.listeningDecorator(new JMXConfigurableThreadPoolExecutor(options.getJobThreads(),
                                                                                                                           Integer.MAX_VALUE,
                                                                                                                           TimeUnit.SECONDS,
                                                                                                                           new LinkedBlockingQueue<Runnable>(),
                                                                                                                           new NamedThreadFactory("Repair#" + cmd),
                                                                                                                           "internal"));
            List<ListenableFuture<RepairSessionResult>> futures = new ArrayList<>(options.getRanges().size());
            String[] cfnames = new String[columnFamilyStores.size()];
            for (int i = 0; i < columnFamilyStores.size(); i++)
            {
                cfnames[i] = columnFamilyStores.get(i).name;
            }
            for (Range<Token> range : options.getRanges())
            {
                final RepairSession session = ActiveRepairService.instance.submitRepairSession(parentSession,
                                                                                               range,
                                                                                               keyspace,
                                                                                               options.getParallelism(),
                                                                                               rangeToNeighbors.get(range),
                                                                                               repairedAt,
                                                                                               executor,
                                                                                               cfnames);
                if (session == null)
                    continue;
                // After repair session completes, notify client its result
                Futures.addCallback(session, new FutureCallback<RepairSessionResult>()
                {
                    public void onSuccess(RepairSessionResult result)
                    {
                        String message = String.format("Repair session %s for range %s finished", session.getId(), session.getRange().toString());
                        logger.info(message);
                        sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.SESSION_SUCCESS.ordinal()});
                    }
                    public void onFailure(Throwable t)
                    {
                        String message = String.format("Repair session %s for range %s failed with error %s", session.getId(), session.getRange().toString(), t.getMessage());
                        logger.error(message, t);
                        sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.SESSION_FAILED.ordinal()});
                    }
                });
                futures.add(session);
            }
            // After all repair sessions completes(successful or not),
            // run anticompaction if necessary and send finish notice back to client
            final ListenableFuture<List<RepairSessionResult>> allSessions = Futures.successfulAsList(futures);
            Futures.addCallback(allSessions, new FutureCallback<List<RepairSessionResult>>()
            {
                public void onSuccess(List<RepairSessionResult> result)
                {
                    // filter out null(=failed) results and get successful ranges
                    Collection<Range<Token>> successfulRanges = new ArrayList<>();
                    for (RepairSessionResult sessionResult : result)
                    {
                        if (sessionResult != null)
                        {
                            successfulRanges.add(sessionResult.range);
                        }
                    }
                    try
                    {
                        ActiveRepairService.instance.finishParentSession(parentSession, allNeighbors, successfulRanges);
                    }
                    catch (Exception e)
                    {
                        logger.error("Error in incremental repair", e);
                    }
                    repairComplete();
                }
                public void onFailure(Throwable t)
                {
                    repairComplete();
                }
                private void repairComplete()
                {
                    String duration = DurationFormatUtils.formatDurationWords(System.currentTimeMillis() - startTime, true, true);
                    String message = String.format("Repair command #%d finished in %s", cmd, duration);
                    sendNotification("repair", message,
                                     new int[]{cmd, ActiveRepairService.Status.FINISHED.ordinal()});
                    logger.info(message);
                    if (options.isTraced())
                    {
                        traceState.setNotificationHandle(null);
                        // Because DebuggableThreadPoolExecutor#afterExecute and this callback
                        // run in a nondeterministic order (within the same thread), the
                        // TraceState may have been nulled out at this point. The TraceState
                        // should be traceState, so just set it without bothering to check if it
                        // actually was nulled out.
                        Tracing.instance.set(traceState);
                        Tracing.traceRepair(message);
                        Tracing.instance.stopSession();
                    }
                    executor.shutdownNow();
                }
            });
        }
    }, null);
}
public void forceTerminateAllRepairSessions() {
ActiveRepairService.instance.terminateSessions();
}
/* End of MBean interface methods */
/**
* Get the "primary ranges" for the specified keyspace and endpoint.
* "Primary ranges" are the ranges that the node is responsible for storing replica primarily.
* The node that stores replica primarily is defined as the first node returned
* by {@link AbstractReplicationStrategy#calculateNaturalEndpoints}.
*
* @param keyspace Keyspace name to check primary ranges
* @param ep endpoint we are interested in.
* @return primary ranges for the specified endpoint.
*/
public Collection<Range<Token>> getPrimaryRangesForEndpoint(String keyspace, InetAddress ep)
{
AbstractReplicationStrategy strategy = Keyspace.open(keyspace).getReplicationStrategy();
Collection<Range<Token>> primaryRanges = new HashSet<>();
TokenMetadata metadata = _token_metadata.cloneOnlyTokenMap();
for (Token token : metadata.sortedTokens())
{
List<InetAddress> endpoints = strategy.calculateNaturalEndpoints(token, metadata);
if (endpoints.size() > 0 && endpoints.get(0).equals(ep))
primaryRanges.add(new Range<>(metadata.getPredecessor(token), token));
}
return primaryRanges;
}
/**
* Get the "primary ranges" within local DC for the specified keyspace and endpoint.
*
* @see #getPrimaryRangesForEndpoint(String, java.net.InetAddress)
* @param keyspace Keyspace name to check primary ranges
* @param referenceEndpoint endpoint we are interested in.
* @return primary ranges within local DC for the specified endpoint.
*/
public Collection<Range<Token>> getPrimaryRangeForEndpointWithinDC(String keyspace, InetAddress referenceEndpoint)
{
TokenMetadata metadata = _token_metadata.cloneOnlyTokenMap();
String localDC = DatabaseDescriptor.getEndpointSnitch().getDatacenter(referenceEndpoint);
Collection<InetAddress> localDcNodes = metadata.getTopology().getDatacenterEndpoints().get(localDC);
AbstractReplicationStrategy strategy = Keyspace.open(keyspace).getReplicationStrategy();
Collection<Range<Token>> localDCPrimaryRanges = new HashSet<>();
for (Token token : metadata.sortedTokens())
{
List<InetAddress> endpoints = strategy.calculateNaturalEndpoints(token, metadata);
for (InetAddress endpoint : endpoints)
{
if (localDcNodes.contains(endpoint))
{
if (endpoint.equals(referenceEndpoint))
{
localDCPrimaryRanges.add(new Range<>(metadata.getPredecessor(token), token));
}
break;
}
}
}
return localDCPrimaryRanges;
}
#endif
/**
* Get all ranges an endpoint is responsible for (by keyspace).
* The replication strategy's get_ranges() guarantees that no wrap-around range is returned.
*
* @param name NOTE(review): presumably the keyspace name, since the ranges are
*             described as "by keyspace" - confirm against the implementation
* @param ep endpoint we are interested in.
* @return ranges for the specified endpoint.
*/
dht::token_range_vector get_ranges_for_endpoint(const sstring& name, const gms::inet_address& ep) const;
/**
* Get all ranges that span the ring given a set of tokens.
* The returned ranges are in sorted order.
*
* @param sorted_tokens the tokens to build the ranges from; as the name says,
*                      they are expected to be sorted already
* @return ranges in sorted order
*/
dht::token_range_vector get_all_ranges(const std::vector<token>& sorted_tokens) const;
/**
* Returns the N endpoints that are responsible for storing the
* specified key, i.e. its replicas.
*
* @param keyspace keyspace name
* @param cf column family name
* @param key key for which we need to find the endpoints
* @return the endpoints responsible for this key
*/
std::vector<gms::inet_address> get_natural_endpoints(const sstring& keyspace,
const sstring& cf, const sstring& key) const;
#if 0
public List<InetAddress> getNaturalEndpoints(String keyspaceName, ByteBuffer key)
{
return getNaturalEndpoints(keyspaceName, getPartitioner().getToken(key));
}
#endif
/**
* Returns the N endpoints that are responsible for storing the
* specified token position, i.e. its replicas.
*
* @param keyspace keyspace name
* @param pos token position for which we need to find the endpoints
* @return the endpoints responsible for this token
*/
std::vector<gms::inet_address> get_natural_endpoints(const sstring& keyspace, const token& pos) const;
#if 0
/**
* This method attempts to return N endpoints that are responsible for storing the
* specified key i.e for replication.
*
* @param keyspace keyspace name also known as keyspace
* @param key key for which we need to find the endpoint
* @return the endpoint responsible for this key
*/
public List<InetAddress> getLiveNaturalEndpoints(Keyspace keyspace, ByteBuffer key)
{
return getLiveNaturalEndpoints(keyspace, getPartitioner().decorateKey(key));
}
public List<InetAddress> getLiveNaturalEndpoints(Keyspace keyspace, RingPosition pos)
{
List<InetAddress> endpoints = keyspace.getReplicationStrategy().getNaturalEndpoints(pos);
List<InetAddress> liveEps = new ArrayList<>(endpoints.size());
for (InetAddress endpoint : endpoints)
{
if (FailureDetector.instance.isAlive(endpoint))
liveEps.add(endpoint);
}
return liveEps;
}
public void setLoggingLevel(String classQualifier, String rawLevel) throws Exception
{
ch.qos.logback.classic.Logger logBackLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(classQualifier);
// if both classQualifer and rawLevel are empty, reload from configuration
if (StringUtils.isBlank(classQualifier) && StringUtils.isBlank(rawLevel) )
{
JMXConfiguratorMBean jmxConfiguratorMBean = JMX.newMBeanProxy(ManagementFactory.getPlatformMBeanServer(),
new ObjectName("ch.qos.logback.classic:Name=default,Type=ch.qos.logback.classic.jmx.JMXConfigurator"),
JMXConfiguratorMBean.class);
jmxConfiguratorMBean.reloadDefaultConfiguration();
return;
}
// classQualifer is set, but blank level given
else if (StringUtils.isNotBlank(classQualifier) && StringUtils.isBlank(rawLevel) )
{
if (logBackLogger.getLevel() != null || hasAppenders(logBackLogger))
logBackLogger.setLevel(null);
return;
}
ch.qos.logback.classic.Level level = ch.qos.logback.classic.Level.toLevel(rawLevel);
logBackLogger.setLevel(level);
logger.info("set log level to {} for classes under '{}' (if the level doesn't look like '{}' then the logger couldn't parse '{}')", level, classQualifier, rawLevel, rawLevel);
}
/**
* @return the runtime logging levels for all the configured loggers
*/
@Override
public Map<String,String>getLoggingLevels() {
Map<String, String> logLevelMaps = Maps.newLinkedHashMap();
LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory();
for (ch.qos.logback.classic.Logger logger : lc.getLoggerList())
{
if(logger.getLevel() != null || hasAppenders(logger))
logLevelMaps.put(logger.getName(), logger.getLevel().toString());
}
return logLevelMaps;
}
private boolean hasAppenders(ch.qos.logback.classic.Logger logger) {
Iterator<Appender<ILoggingEvent>> it = logger.iteratorForAppenders();
return it.hasNext();
}
#endif
/**
* @param ks_name keyspace name
* @param cf_name column family name
* @param range a token range; NOTE(review): presumably the splits are confined
*              to this range - confirm against the implementation
* @param keys_per_split approximate number of keys wanted in each returned piece
* @return Vector of Token ranges (_not_ keys!) together with estimated key count,
* breaking up the data this node is responsible for into pieces of roughly keys_per_split
*/
std::vector<std::pair<dht::token_range, uint64_t>> get_splits(const sstring& ks_name,
const sstring& cf_name,
range<dht::token> range,
uint32_t keys_per_split);
public:
/**
* Public entry point for decommissioning; returns a future<> carrying no value.
* NOTE(review): the individual steps live in the implementation file; presumably
* they include the private leaving helpers of this class - confirm there.
*/
future<> decommission();
private:
/**
* Broadcast leaving status and update local _token_metadata accordingly
*/
future<> start_leaving();
// NOTE(review): the three helpers below carry no documentation in this header.
// Judging by their names only, they are further steps of taking this node out
// of the ring - confirm their exact contract in the implementation file.
void leave_ring();
void unbootstrap();
future<> stream_hints();
public:
/**
* Convenience overload: converts the textual token with the global partitioner
* and forwards the result to move(token).
*
* @param new_token textual form of the token to move to
*/
future<> move(sstring new_token) {
    // FIXME: getPartitioner().getTokenFactory().validate(newToken);
    auto&& partitioner = dht::global_partitioner();
    return move(partitioner.from_sstring(new_token));
}
private:
/**
* Move the node to a new token.
*
* NOTE(review): the documentation inherited from the Java original also promised
* "find a new token to boot to according to load" when the token is null and
* declared an IOException. Here the token is passed by value, so it cannot be
* null, and errors would have to travel through the returned future - confirm
* what the implementation actually does.
*
* @param new_token new token to move this node to
* @return a future<> carrying no value
*/
future<> move(token new_token);
public:
/**
* Owns a streaming::stream_plan named "Relocation".
*
* The constructor hands the given tokens and keyspace names to
* calculate_to_from_streams(), which is defined out of line.
* NOTE(review): presumably that call fills the plan with the transfers needed
* for the new tokens - confirm in the implementation file.
*/
class range_relocator {
    streaming::stream_plan _stream_plan;

    void calculate_to_from_streams(std::unordered_set<token> new_tokens, std::vector<sstring> keyspace_names);

public:
    range_relocator(std::unordered_set<token> tokens, std::vector<sstring> keyspace_names)
            : _stream_plan("Relocation")
    {
        calculate_to_from_streams(std::move(tokens), std::move(keyspace_names));
    }

    // Executes the plan; the result of the execution is discarded.
    future<> stream() {
        return _stream_plan.execute().discard_result();
    }

    // True when the plan is not empty.
    bool streams_needed() {
        if (_stream_plan.is_empty()) {
            return false;
        }
        return true;
    }
};
/**
* Get the status of a token removal.
*
* @return a future carrying the status as an sstring
*/
future<sstring> get_removal_status();
/**
* Force a remove operation to complete. This may be necessary if a remove operation
* blocks forever due to node/stream failure. The removal must have been started
* first (the Java original names that call removeToken(); NOTE(review): in this
* class it is presumably removenode() - confirm). This is a last resort measure:
* no further attempt will be made to restore replicas.
*
* @return a future<> carrying no value
*/
future<> force_remove_completion();
public:
/**
* Remove a node that has died, attempting to restore the replica count.
* If the node is alive, decommission should be attempted. If decommission
* fails, then this removal should be used. If we fail while trying to
* restore the replica count, finally force_remove_completion() should be
* called to forcibly remove the node without regard to replica count.
*
* @param host_id_string identifies the node to remove.
*        NOTE(review): the inherited documentation said "token for the node",
*        while the parameter name says host ID - confirm the expected format.
* @return a future<> carrying no value
*/
future<> removenode(sstring host_id_string);
// Asynchronously produces the operation mode as an sstring.
// NOTE(review): the set of possible values is not visible in this header.
future<sstring> get_operation_mode();
// Asynchronously produces a boolean.
// NOTE(review): presumably true while the node is still starting up - confirm.
future<bool> is_starting();
// Returns a copy of the _drain_progress member.
drain_progress get_drain_progress() const {
return _drain_progress;
}
/**
* Shuts the node off to writes, empties memtables and the commit log.
* There are two differences between drain and the normal shutdown hook:
* - Drain waits for in-progress streaming to complete
* - Drain flushes *all* column families (the shutdown hook only flushes non-durable CFs)
*
* @return a future<> carrying no value
*/
future<> drain();
#if 0
// Never ever do this at home. Used by tests.
IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner)
{
IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner();
DatabaseDescriptor.setPartitioner(newPartitioner);
valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
return oldPartitioner;
}
TokenMetadata setTokenMetadataUnsafe(TokenMetadata tmd)
{
TokenMetadata old = _token_metadata;
_token_metadata = tmd;
return old;
}
public void truncate(String keyspace, String columnFamily) throws TimeoutException, IOException
{
try
{
StorageProxy.truncateBlocking(keyspace, columnFamily);
}
catch (UnavailableException e)
{
throw new IOException(e.getMessage());
}
}
#endif
public:
// Asynchronously produces a float per endpoint address.
// NOTE(review): presumably the share of the ring each endpoint owns - confirm
// the scale (fraction vs. percentage) against the implementation.
future<std::map<gms::inet_address, float>> get_ownership();
// Same result shape as get_ownership(), computed for one keyspace.
// NOTE(review): presumably follows the effectiveOwnership() contract of the
// Java original (ownership including replication) - confirm.
future<std::map<gms::inet_address, float>> effective_ownership(sstring keyspace_name);
#if 0
/**
* Calculates ownership. If there are multiple DC's and the replication strategy is DC aware then ownership will be
* calculated per dc, i.e. each DC will have total ring ownership divided amongst its nodes. Without replication
* total ownership will be a multiple of the number of DC's and this value will then go up within each DC depending
* on the number of replicas within itself. For DC unaware replication strategies, ownership without replication
* will be 100%.
*
* @throws IllegalStateException when node is not configured properly.
*/
public LinkedHashMap<InetAddress, Float> effectiveOwnership(String keyspace) throws IllegalStateException
{
if (keyspace != null)
{
Keyspace keyspaceInstance = Schema.instance.getKeyspaceInstance(keyspace);
if(keyspaceInstance == null)
throw new IllegalArgumentException("The keyspace " + keyspace + ", does not exist");
if(keyspaceInstance.getReplicationStrategy() instanceof LocalStrategy)
throw new IllegalStateException("Ownership values for keyspaces with LocalStrategy are meaningless");
}
else
{
List<String> nonSystemKeyspaces = Schema.instance.getNonSystemKeyspaces();
//system_traces is a non-system keyspace however it needs to be counted as one for this process
int specialTableCount = 0;
if (nonSystemKeyspaces.contains("system_traces"))
{
specialTableCount += 1;
}
if (nonSystemKeyspaces.size() > specialTableCount)
throw new IllegalStateException("Non-system keyspaces don't have the same replication settings, effective ownership information is meaningless");
keyspace = "system_traces";
}
TokenMetadata metadata = _token_metadata.cloneOnlyTokenMap();
Collection<Collection<InetAddress>> endpointsGroupedByDc = new ArrayList<>();
// mapping of dc's to nodes, use sorted map so that we get dcs sorted
SortedMap<String, Collection<InetAddress>> sortedDcsToEndpoints = new TreeMap<>();
sortedDcsToEndpoints.putAll(metadata.getTopology().getDatacenterEndpoints().asMap());
for (Collection<InetAddress> endpoints : sortedDcsToEndpoints.values())
endpointsGroupedByDc.add(endpoints);
Map<Token, Float> tokenOwnership = getPartitioner().describeOwnership(_token_metadata.sortedTokens());
LinkedHashMap<InetAddress, Float> finalOwnership = Maps.newLinkedHashMap();
// calculate ownership per dc
for (Collection<InetAddress> endpoints : endpointsGroupedByDc)
{
// calculate the ownership with replication and add the endpoint to the final ownership map
for (InetAddress endpoint : endpoints)
{
float ownership = 0.0f;
for (Range<Token> range : getRangesForEndpoint(keyspace, endpoint))
{
if (tokenOwnership.containsKey(range.right))
ownership += tokenOwnership.get(range.right);
}
finalOwnership.put(endpoint, ownership);
}
}
return finalOwnership;
}
private boolean hasSameReplication(List<String> list)
{
if (list.isEmpty())
return false;
for (int i = 0; i < list.size() -1; i++)
{
KSMetaData ksm1 = Schema.instance.getKSMetaData(list.get(i));
KSMetaData ksm2 = Schema.instance.getKSMetaData(list.get(i + 1));
if (!ksm1.strategyClass.equals(ksm2.strategyClass) ||
!Iterators.elementsEqual(ksm1.strategyOptions.entrySet().iterator(),
ksm2.strategyOptions.entrySet().iterator()))
return false;
}
return true;
}
public List<String> getKeyspaces()
{
List<String> keyspaceNamesList = new ArrayList<>(Schema.instance.getKeyspaces());
return Collections.unmodifiableList(keyspaceNamesList);
}
public List<String> getNonSystemKeyspaces()
{
List<String> keyspaceNamesList = new ArrayList<>(Schema.instance.getNonSystemKeyspaces());
return Collections.unmodifiableList(keyspaceNamesList);
}
public void updateSnitch(String epSnitchClassName, Boolean dynamic, Integer dynamicUpdateInterval, Integer dynamicResetInterval, Double dynamicBadnessThreshold) throws ClassNotFoundException
{
IEndpointSnitch oldSnitch = DatabaseDescriptor.getEndpointSnitch();
// new snitch registers mbean during construction
IEndpointSnitch newSnitch;
try
{
newSnitch = FBUtilities.construct(epSnitchClassName, "snitch");
}
catch (ConfigurationException e)
{
throw new ClassNotFoundException(e.getMessage());
}
if (dynamic)
{
DatabaseDescriptor.setDynamicUpdateInterval(dynamicUpdateInterval);
DatabaseDescriptor.setDynamicResetInterval(dynamicResetInterval);
DatabaseDescriptor.setDynamicBadnessThreshold(dynamicBadnessThreshold);
newSnitch = new DynamicEndpointSnitch(newSnitch);
}
// point snitch references to the new instance
DatabaseDescriptor.setEndpointSnitch(newSnitch);
for (String ks : Schema.instance.getKeyspaces())
{
Keyspace.open(ks).getReplicationStrategy().snitch = newSnitch;
}
if (oldSnitch instanceof DynamicEndpointSnitch)
((DynamicEndpointSnitch)oldSnitch).unregisterMBean();
}
#endif
private:
/**
* Seed data to the endpoints that will be responsible for it in the future
*
* @param ranges_to_stream_by_keyspace map from keyspace name to a multimap of token range to
*        endpoint address, i.e. the data ranges to stream with the endpoints included for each
* @return a future<> reporting whether the streaming succeeded
*/
future<> stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, inet_address>> ranges_to_stream_by_keyspace);
public:
/**
* Calculate pair of ranges to stream/fetch for given two range collections
* (current ranges for keyspace and ranges after move to new token)
*
* @param current collection of the ranges by current token
* @param updated collection of the ranges after token is changed
* @return pair of ranges to stream/fetch for given current and updated range collections
*         (NOTE(review): presumably first = ranges to stream, second = ranges to fetch,
*         following the order in the function name - confirm against the definition)
*/
std::pair<std::unordered_set<dht::token_range>, std::unordered_set<dht::token_range>>
calculate_stream_and_fetch_ranges(const dht::token_range_vector& current, const dht::token_range_vector& updated);
#if 0
// Runs bulkLoadInternal(directory) and blocks on its result via get().
// Any Exception is rethrown wrapped in a RuntimeException.
public void bulkLoad(String directory)
{
try
{
bulkLoadInternal(directory).get();
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
// Starts bulkLoadInternal(directory) without waiting on the returned future and
// returns its planId as a string.
public String bulkLoadAsync(String directory)
{
return bulkLoadInternal(directory).planId.toString();
}
// Builds an SSTableLoader over `directory` and returns the future from loader.stream().
// Throws IllegalArgumentException when `directory` does not exist or is not a directory.
private StreamResultFuture bulkLoadInternal(String directory)
{
File dir = new File(directory);
if (!dir.exists() || !dir.isDirectory())
throw new IllegalArgumentException("Invalid directory " + directory);
// Anonymous client that supplies the partitioner, the range-to-endpoint mapping
// and table metadata to the loader.
SSTableLoader.Client client = new SSTableLoader.Client()
{
// Registers, for the given keyspace, every (range, endpoint) pair from
// StorageService.instance.getRangeToAddressMap(keyspace).
public void init(String keyspace)
{
try
{
setPartitioner(DatabaseDescriptor.getPartitioner());
for (Map.Entry<Range<Token>, List<InetAddress>> entry : StorageService.instance.getRangeToAddressMap(keyspace).entrySet())
{
Range<Token> range = entry.getKey();
for (InetAddress endpoint : entry.getValue())
addRangeForEndpoint(range, endpoint);
}
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
// Looks the table metadata up in the local Schema instance.
public CFMetaData getCFMetaData(String keyspace, String cfName)
{
return Schema.instance.getCFMetaData(keyspace, cfName);
}
};
SSTableLoader loader = new SSTableLoader(dir, client, new OutputHandler.LogOutput());
return loader.stream();
}
#endif
public:
// Returns an exception count as a signed 32-bit integer. Declared here only; which
// exceptions are counted is determined by the definition, which is outside this header.
int32_t get_exception_count();
#if 0
// Delegates to SSTableDeletingTask.rescheduleFailedTasks().
public void rescheduleFailedDeletions()
{
SSTableDeletingTask.rescheduleFailedTasks();
}
#endif
/**
* Load new SSTables not currently tracked by the system
*
* This can be called, for instance, after copying a batch of SSTables to a CF directory.
*
* This should not be called in parallel for the same keyspace / column family, and doing
* so will throw an std::runtime_error.
* (NOTE(review): this text previously named std::runtime_exception, which is not a standard
* type; confirm the exact exception type against the definition.)
*
* @param ks_name the keyspace in which to search for new SSTables.
* @param cf_name the column family in which to search for new SSTables.
* @return a future<> when the operation finishes.
*/
future<> load_new_sstables(sstring ks_name, sstring cf_name);
#if 0
/**
* #{@inheritDoc}
*/
// Collects key samples for the primary ranges that FBUtilities.getBroadcastAddress()
// owns in every non-system keyspace, and returns the string form of each sampled
// key's token.
public List<String> sampleKeyRange() // do not rename to getter - see CASSANDRA-4452 for details
{
List<DecoratedKey> keys = new ArrayList<>();
for (Keyspace keyspace : Keyspace.nonSystem())
{
for (Range<Token> range : getPrimaryRangesForEndpoint(keyspace.getName(), FBUtilities.getBroadcastAddress()))
keys.addAll(keySamples(keyspace.getColumnFamilyStores(), range));
}
// Pre-size the result: exactly one string is produced per sampled key.
List<String> sampledKeys = new ArrayList<>(keys.size());
for (DecoratedKey key : keys)
sampledKeys.add(key.getToken().toString());
return sampledKeys;
}
// Delegates to ColumnFamilyStore.rebuildSecondaryIndex with the same arguments.
public void rebuildSecondaryIndex(String ksName, String cfName, String... idxNames)
{
ColumnFamilyStore.rebuildSecondaryIndex(ksName, cfName, idxNames);
}
// Delegates to MigrationManager.resetLocalSchema(); declared to throw IOException.
public void resetLocalSchema() throws IOException
{
MigrationManager.resetLocalSchema();
}
// Stores `probability` in the traceProbability field; no range check is performed here.
public void setTraceProbability(double probability)
{
this.traceProbability = probability;
}
// Returns the current value of the traceProbability field.
public double getTraceProbability()
{
return traceProbability;
}
// Calls disableAutoCompaction() on every store returned by
// getValidColumnFamilies(true, true, ks, columnFamilies).
public void disableAutoCompaction(String ks, String... columnFamilies) throws IOException
{
for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, columnFamilies))
{
cfs.disableAutoCompaction();
}
}
// Calls enableAutoCompaction() on every store returned by
// getValidColumnFamilies(true, true, ks, columnFamilies).
public void enableAutoCompaction(String ks, String... columnFamilies) throws IOException
{
for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, columnFamilies))
{
cfs.enableAutoCompaction();
}
}
/** Returns the name of the cluster, as reported by DatabaseDescriptor.getClusterName() */
public String getClusterName()
{
return DatabaseDescriptor.getClusterName();
}
/** Returns the cluster partitioner name, as reported by DatabaseDescriptor.getPartitionerName() */
public String getPartitionerName()
{
return DatabaseDescriptor.getPartitionerName();
}
// Returns DatabaseDescriptor.getTombstoneWarnThreshold().
public int getTombstoneWarnThreshold()
{
return DatabaseDescriptor.getTombstoneWarnThreshold();
}
// Forwards `threshold` to DatabaseDescriptor.setTombstoneWarnThreshold().
public void setTombstoneWarnThreshold(int threshold)
{
DatabaseDescriptor.setTombstoneWarnThreshold(threshold);
}
// Returns DatabaseDescriptor.getTombstoneFailureThreshold().
public int getTombstoneFailureThreshold()
{
return DatabaseDescriptor.getTombstoneFailureThreshold();
}
// Forwards `threshold` to DatabaseDescriptor.setTombstoneFailureThreshold().
public void setTombstoneFailureThreshold(int threshold)
{
DatabaseDescriptor.setTombstoneFailureThreshold(threshold);
}
// Returns DatabaseDescriptor.getBatchSizeFailThresholdInKB(); per that getter's name the
// value is expressed in KB.
public int getBatchSizeFailureThreshold()
{
return DatabaseDescriptor.getBatchSizeFailThresholdInKB();
}
// Forwards `threshold` to DatabaseDescriptor.setBatchSizeFailThresholdInKB(); per that
// setter's name the value is expressed in KB.
public void setBatchSizeFailureThreshold(int threshold)
{
DatabaseDescriptor.setBatchSizeFailThresholdInKB(threshold);
}
// Forwards `throttleInKB` to DatabaseDescriptor.setHintedHandoffThrottleInKB() and logs
// the new value at info level.
public void setHintedHandoffThrottleInKB(int throttleInKB)
{
DatabaseDescriptor.setHintedHandoffThrottleInKB(throttleInKB);
logger.info(String.format("Updated hinted_handoff_throttle_in_kb to %d", throttleInKB));
}
#endif
/**
 * Run func against storage service instance 0 while holding the API operation lock.
 *
 * The lock is the `_operation_in_progress` string of instance 0: it is empty when no
 * operation is running and holds the name of the running operation otherwise.
 *
 * @param operation name recorded while func runs; it is reported in the error raised
 *        for any competing call made in the meantime
 * @param func callable invoked as func(storage_service&); it must return a future,
 *        because .finally() is chained on its result
 * @return whatever invoke_on() produces for the future returned by func
 * @throws std::runtime_error (raised inside the invoked lambda) when another operation
 *         is already in progress
 */
template <typename Func>
auto run_with_api_lock(sstring operation, Func&& func) {
    return get_storage_service().invoke_on(0, [operation = std::move(operation),
            func = std::forward<Func>(func)] (storage_service& ss) mutable {
        if (!ss._operation_in_progress.empty()) {
            throw std::runtime_error(sprint("Operation %s is in progress, try again", ss._operation_in_progress));
        }
        ss._operation_in_progress = std::move(operation);
        try {
            // Normal path: release the lock once the future returned by func resolves,
            // whether it succeeded or failed.
            return func(ss).finally([&ss] {
                ss._operation_in_progress = sstring();
            });
        } catch (...) {
            // func threw synchronously instead of returning a failed future, so the
            // .finally() continuation above was never attached. Release the lock here;
            // otherwise every later API call would be rejected forever.
            ss._operation_in_progress = sstring();
            throw;
        }
    });
}
/**
 * Run func against storage service instance 0 without touching the API operation lock.
 *
 * @param func callable invoked as func(storage_service&)
 * @return whatever invoke_on() produces for the result of func
 */
template <typename Func>
auto run_with_no_api_lock(Func&& func) {
    // Own the callable inside the closure so it stays alive for the remote invocation.
    auto call_on_instance = [func = std::forward<Func>(func)] (storage_service& ss) mutable {
        return func(ss);
    };
    return get_storage_service().invoke_on(0, std::move(call_on_instance));
}
private:
// Declared here, defined elsewhere. Takes the kind of disk error as `type`.
// NOTE(review): presumably isolates this node after a disk error - confirm against
// the definition.
void do_isolate_on_error(disk_error type);
// Host ID value handed out by get_local_id().
utils::UUID _local_host_id;
public:
// Returns a copy of `_local_host_id`. Const-qualified because it only reads the member,
// so it can also be called through a const reference to the service.
utils::UUID get_local_id() const { return _local_host_id; }
// Static; returns the configured supported features as a single sstring.
// NOTE(review): the format of the string is set by the definition - confirm there.
static sstring get_config_supported_features();
bool cluster_supports_range_tombstones() {
return bool(_range_tombstones_feature);
}
// Returns `_large_partitions_feature` converted to bool.
bool cluster_supports_large_partitions() const {
    return static_cast<bool>(_large_partitions_feature);
}
// Returns `_materialized_views_feature` converted to bool.
bool cluster_supports_materialized_views() const {
    return static_cast<bool>(_materialized_views_feature);
}
// Returns `_counters_feature` converted to bool.
bool cluster_supports_counters() const {
    return static_cast<bool>(_counters_feature);
}
// Returns `_indexes_feature` converted to bool.
bool cluster_supports_indexes() const {
    return static_cast<bool>(_indexes_feature);
}
};
// Starts the storage service obtained from service::get_storage_service(), passing a
// reference wrapper around `db` to start(). Returns the future produced by start().
inline future<> init_storage_service(distributed<database>& db) {
    auto&& storage = service::get_storage_service();
    return storage.start(std::ref(db));
}
// Stops the storage service obtained from service::get_storage_service().
// Returns the future produced by stop().
inline future<> deinit_storage_service() {
    auto&& storage = service::get_storage_service();
    return storage.stop();
}
}