/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Modified by ScyllaDB
* Copyright (C) 2015 ScyllaDB
*
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "gms/i_endpoint_state_change_subscriber.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "locator/token_metadata.hh"
#include "gms/gossiper.hh"
#include "utils/UUID_gen.hh"
#include "core/distributed.hh"
#include "dht/i_partitioner.hh"
#include "dht/boot_strapper.hh"
#include "dht/token_range_endpoints.hh"
#include "core/sleep.hh"
#include "gms/application_state.hh"
#include "db/system_keyspace.hh"
#include "core/semaphore.hh"
#include "utils/fb_utilities.hh"
#include "database.hh"
#include "streaming/stream_state.hh"
#include "streaming/stream_plan.hh"
#include
#include "disk-error-handler.hh"
#include "gms/feature.hh"
// Forward declarations only: this header names these server types (see the
// _cql_server and _thrift_server members of storage_service) without needing
// their full definitions.
namespace transport {
class cql_server;
}
class thrift_server;
namespace service {
class load_broadcaster;
class storage_service;
extern distributed _the_storage_service;
/// Returns a reference to the global _the_storage_service object declared above.
inline distributed& get_storage_service() {
    auto& sharded_service = _the_storage_service;
    return sharded_service;
}
/// Returns whatever local() yields on the global _the_storage_service object.
/// NOTE(review): presumably the storage_service instance of the calling shard -- confirm.
inline storage_service& get_local_storage_service() {
    auto& sharded_service = _the_storage_service;
    return sharded_service.local();
}
int get_generation_number();
enum class disk_error { regular, commit };
/**
* This abstraction contains the token/identifier of this node
* on the identifier space. This token gets gossiped around.
* This class will also maintain histograms of the load information
* of other nodes in the cluster.
*/
class storage_service : public service::migration_listener, public gms::i_endpoint_state_change_subscriber, public seastar::async_sharded_service {
public:
/**
 * Size information about one snapshot, identified by keyspace (ks) and
 * column family (cf) name.
 *
 * NOTE(review): live/total are presumably byte counts -- confirm against the
 * code that fills this structure.
 */
struct snapshot_details {
    // Numeric members are zero-initialized so that a default-constructed
    // value never carries indeterminate sizes.
    int64_t live = 0;
    int64_t total = 0;
    sstring cf;
    sstring ks;
};
private:
using token = dht::token;
using token_range_endpoints = dht::token_range_endpoints;
using endpoint_details = dht::endpoint_details;
using boot_strapper = dht::boot_strapper;
using token_metadata = locator::token_metadata;
using application_state = gms::application_state;
using inet_address = gms::inet_address;
using versioned_value = gms::versioned_value;
#if 0
private static final Logger logger = LoggerFactory.getLogger(StorageService.class);
/* JMX notification serial number counter */
private final AtomicLong notificationSerialNumber = new AtomicLong();
#endif
distributed& _db;
int _update_jobs{0};
// Note that this is obviously only valid for the current shard. Users of
// this facility should elect a shard to be the coordinator based on any
// given objective criteria
//
// It should be possible to actively serialize two callers if the need
// ever arises.
bool _loading_new_sstables = false;
shared_ptr _lb;
shared_ptr> _cql_server;
shared_ptr> _thrift_server;
sstring _operation_in_progress;
bool _force_remove_completion = false;
bool _ms_stopped = false;
bool _stream_manager_stopped = false;
public:
storage_service(distributed& db);
void isolate_on_error();
void isolate_on_commit_error();
// Needed by distributed<>
future<> stop();
void init_messaging_service();
void uninit_messaging_service();
private:
void do_update_pending_ranges();
public:
// Called by the on_create_keyspace / on_update_keyspace / on_drop_keyspace
// migration_listener overrides of this class, which wait on the returned
// future with .get().
future<> keyspace_changed(const sstring& ks_name);
// NOTE(review): presumably the asynchronous counterpart of the private
// do_update_pending_ranges() declared above -- confirm in the implementation.
future<> update_pending_ranges();
/// Read-only access to this instance's _token_metadata member.
const locator::token_metadata& get_token_metadata() const {
    const locator::token_metadata& metadata = _token_metadata;
    return metadata;
}
/// Mutable access to this instance's _token_metadata member.
locator::token_metadata& get_token_metadata() {
    locator::token_metadata& metadata = _token_metadata;
    return metadata;
}
future<> gossip_snitch_info();
void set_load_broadcaster(shared_ptr lb);
shared_ptr& get_load_broadcaster();
/// Returns the object referenced by the _db member (set up outside this view).
distributed& db() {
    auto& database_ref = _db;
    return database_ref;
}
private:
bool is_auto_bootstrap();
/// Convenience wrapper: forwards to utils::fb_utilities::get_broadcast_address().
inet_address get_broadcast_address() const {
    inet_address broadcast = utils::fb_utilities::get_broadcast_address();
    return broadcast;
}
/* This abstraction maintains the token/endpoint metadata information */
token_metadata _token_metadata;
token_metadata _shadow_token_metadata;
public:
std::chrono::milliseconds get_ring_delay();
gms::versioned_value::factory value_factory;
#if 0
public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
private Thread drainOnShutdown = null;
public static final StorageService instance = new StorageService();
public static IPartitioner getPartitioner()
{
return DatabaseDescriptor.getPartitioner();
}
#endif
public:
/**
 * Returns get_ranges_for_endpoint() evaluated for the given keyspace and this
 * node's broadcast address.
 *
 * @param keyspace_name keyspace to look up
 * @return the ranges reported for the local broadcast address
 */
dht::token_range_vector get_local_ranges(const sstring& keyspace_name) {
    const inet_address local_endpoint = get_broadcast_address();
    return get_ranges_for_endpoint(keyspace_name, local_endpoint);
}
#if 0
public Collection> getPrimaryRanges(String keyspace)
{
return getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddress());
}
public Collection> getPrimaryRangesWithinDC(String keyspace)
{
return getPrimaryRangeForEndpointWithinDC(keyspace, FBUtilities.getBroadcastAddress());
}
private CassandraDaemon daemon;
#endif
private:
std::unordered_set _replicating_nodes;
std::experimental::optional _removing_node;
/*
 * Node state flags. Every flag carries a default member initializer so that
 * none of them is ever read with an indeterminate value; a constructor
 * mem-initializer, if one exists, still takes precedence over these defaults.
 */
/* Are we starting this node in bootstrap mode? */
bool _is_bootstrap_mode = false;
/* we bootstrap but do NOT join the ring unless told to do so */
// FIXME: System.getProperty("cassandra.write_survey", "false")
bool _is_survey_mode = false;
bool _initialized = false;
bool _joined = false;
public:
// Operation modes of this node; the current one is kept in _operation_mode,
// which starts out as mode::STARTING.
enum class mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED };
private:
mode _operation_mode = mode::STARTING;
// Stream-output operator for mode; declared as a friend here, defined elsewhere.
friend std::ostream& operator<<(std::ostream& os, const mode& mode);
#if 0
/* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */
private double traceProbability = 0.0;
#endif
/* Used for tracking drain progress */
public:
/**
 * Pair of counters used for tracking drain progress.
 *
 * Both counters start at zero, so a default-constructed value is a valid
 * neutral element for operator+=.
 *
 * NOTE(review): "cfs" presumably stands for column families -- confirm.
 */
struct drain_progress {
    int32_t total_cfs = 0;
    int32_t remaining_cfs = 0;

    /**
     * Adds the counters of other to this object, member by member.
     *
     * @param other progress to accumulate into this one
     * @return reference to this object, to allow chaining
     */
    drain_progress& operator+=(const drain_progress& other) {
        total_cfs += other.total_cfs;
        remaining_cfs += other.remaining_cfs;
        return *this;
    }
};
private:
drain_progress _drain_progress{};
#if 0
private static final AtomicInteger nextRepairCommand = new AtomicInteger();
#endif
std::vector _lifecycle_subscribers;
#if 0
private static final BackgroundActivityMonitor bgMonitor = new BackgroundActivityMonitor();
private final ObjectName jmxObjectName;
#endif
private:
std::unordered_set _bootstrap_tokens;
gms::feature _range_tombstones_feature;
gms::feature _large_partitions_feature;
gms::feature _materialized_views_feature;
gms::feature _counters_feature;
gms::feature _indexes_feature;
public:
void enable_all_features() {
_range_tombstones_feature.enable();
_large_partitions_feature.enable();
_materialized_views_feature.enable();
_counters_feature.enable();
_indexes_feature.enable();
}
// Clears the bootstrap-mode flag; is_bootstrap_mode() returns false afterwards.
void finish_bootstrapping() {
_is_bootstrap_mode = false;
}
/** This method updates the local token on disk */
void set_tokens(std::unordered_set tokens);
void set_gossip_tokens(const std::unordered_set& local_tokens);
#if 0
public void registerDaemon(CassandraDaemon daemon)
{
this.daemon = daemon;
}
#endif
// Subscribers are passed as raw pointers.
// NOTE(review): presumably they are not owned by storage_service and must stay
// alive until unregister_subscriber() is called -- confirm in the implementation.
void register_subscriber(endpoint_lifecycle_subscriber* subscriber);
void unregister_subscriber(endpoint_lifecycle_subscriber* subscriber);
// NOTE(review): the "JMX" remarks below look inherited from the Java origin of
// this class; presumably they now mean "via the management API" -- confirm.
// should only be called via JMX
future<> stop_gossiping();
// should only be called via JMX
future<> start_gossiping();
// should only be called via JMX
future is_gossip_running();
// should only be called via JMX
future<> start_rpc_server();
future<> stop_rpc_server();
future is_rpc_server_running();
// Start/stop/status controls for the native transport; same shape as the
// RPC-server trio above.
future<> start_native_transport();
future<> stop_native_transport();
future is_native_transport_running();
private:
future<> do_stop_rpc_server();
future<> do_stop_native_transport();
future<> do_stop_ms();
future<> do_stop_stream_manager();
#if 0
public void stopTransports()
{
if (isInitialized())
{
logger.error("Stopping gossiper");
stopGossiping();
}
if (isRPCServerRunning())
{
logger.error("Stopping RPC server");
stopRPCServer();
}
if (isNativeTransportRunning())
{
logger.error("Stopping native transport");
stopNativeTransport();
}
}
#endif
private:
future<> shutdown_client_servers();
#if 0
public void stopClient()
{
Gossiper.instance.unregister(this);
Gossiper.instance.stop();
MessagingService.instance().shutdown();
// give it a second so that task accepted before the MessagingService shutdown gets submitted to the stage (to avoid RejectedExecutionException)
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
StageManager.shutdownNow();
}
#endif
public:
future is_initialized();
#if 0
public void stopDaemon()
{
if (daemon == null)
throw new IllegalStateException("No configured daemon");
daemon.deactivate();
}
#endif
public:
future> prepare_replacement_info();
future<> check_for_endpoint_collision();
#if 0
// for testing only
public void unsafeInitialize() throws ConfigurationException
{
_initialized = true;
Gossiper.instance.register(this);
Gossiper.instance.start((int) (System.currentTimeMillis() / 1000)); // needed for node-ring gathering.
Gossiper.instance.addLocalApplicationState(ApplicationState.NET_VERSION, valueFactory.networkVersion());
if (!MessagingService.instance().isListening())
MessagingService.instance().listen(FBUtilities.getLocalAddress());
}
#endif
public:
/// Starts the server using the configured ring delay: the tick count of
/// get_ring_delay() (std::chrono::milliseconds) is passed to init_server(int).
future<> init_server() {
    const int delay_ms = get_ring_delay().count();
    return init_server(delay_ms);
}
future<> init_server(int delay);
future<> drain_on_shutdown();
future<> stop_transport();
void flush_column_families();
#if 0
/**
* In the event of forceful termination we need to remove the shutdown hook to prevent hanging (OOM for instance)
*/
public void removeShutdownHook()
{
if (drainOnShutdown != null)
Runtime.getRuntime().removeShutdownHook(drainOnShutdown);
}
#endif
private:
bool should_bootstrap();
void prepare_to_join(std::vector loaded_endpoints);
void join_token_ring(int delay);
public:
future<> join_ring();
bool is_joined();
future<> rebuild(sstring source_dc);
#if 0
public void setStreamThroughputMbPerSec(int value)
{
DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(value);
logger.info("setstreamthroughput: throttle set to {}", value);
}
public int getStreamThroughputMbPerSec()
{
return DatabaseDescriptor.getStreamThroughputOutboundMegabitsPerSec();
}
public int getCompactionThroughputMbPerSec()
{
return DatabaseDescriptor.getCompactionThroughputMbPerSec();
}
public void setCompactionThroughputMbPerSec(int value)
{
DatabaseDescriptor.setCompactionThroughputMbPerSec(value);
}
public boolean isIncrementalBackupsEnabled()
{
return DatabaseDescriptor.isIncrementalBackupsEnabled();
}
public void setIncrementalBackupsEnabled(boolean value)
{
DatabaseDescriptor.setIncrementalBackupsEnabled(value);
}
#endif
private:
void set_mode(mode m, bool log);
void set_mode(mode m, sstring msg, bool log);
public:
void bootstrap(std::unordered_set tokens);
/// Reports the current value of the bootstrap-mode flag.
bool is_bootstrap_mode() {
    const bool bootstrapping = _is_bootstrap_mode;
    return bootstrapping;
}
#if 0
public TokenMetadata getTokenMetadata()
{
return _token_metadata;
}
/**
* Increment about the known Compaction severity of the events in this node
*/
public void reportSeverity(double incr)
{
bgMonitor.incrCompactionSeverity(incr);
}
public void reportManualSeverity(double incr)
{
bgMonitor.incrManualSeverity(incr);
}
public double getSeverity(InetAddress endpoint)
{
return bgMonitor.getSeverity(endpoint);
}
/**
* for a keyspace, return the ranges and corresponding listen addresses.
* @param keyspace
* @return the endpoint map
*/
public Map, List> getRangeToEndpointMap(String keyspace)
{
/* All the ranges for the tokens */
Map, List> map = new HashMap<>();
for (Map.Entry,List> entry : getRangeToAddressMap(keyspace).entrySet())
{
map.put(entry.getKey().asList(), stringify(entry.getValue()));
}
return map;
}
#endif
/**
* Return the rpc address associated with an endpoint as a string.
* @param endpoint The endpoint to get rpc address for
* @return the rpc address
*/
sstring get_rpc_address(const inet_address& endpoint) const;
#if 0
/**
* for a keyspace, return the ranges and corresponding RPC addresses for a given keyspace.
* @param keyspace
* @return the endpoint map
*/
public Map, List> getRangeToRpcaddressMap(String keyspace)
{
/* All the ranges for the tokens */
Map, List> map = new HashMap<>();
for (Map.Entry, List> entry : getRangeToAddressMap(keyspace).entrySet())
{
List rpcaddrs = new ArrayList<>(entry.getValue().size());
for (InetAddress endpoint: entry.getValue())
{
rpcaddrs.add(getRpcaddress(endpoint));
}
map.put(entry.getKey().asList(), rpcaddrs);
}
return map;
}
public Map, List> getPendingRangeToEndpointMap(String keyspace)
{
// some people just want to get a visual representation of things. Allow null and set it to the first
// non-system keyspace.
if (keyspace == null)
keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
Map, List> map = new HashMap<>();
for (Map.Entry, Collection> entry : _token_metadata.getPendingRanges(keyspace).entrySet())
{
List l = new ArrayList<>(entry.getValue());
map.put(entry.getKey().asList(), stringify(l));
}
return map;
}
#endif
std::unordered_map> get_range_to_address_map(const sstring& keyspace) const;
std::unordered_map> get_range_to_address_map_in_local_dc(
const sstring& keyspace) const;
std::vector get_tokens_in_local_dc() const;
bool is_local_dc(const inet_address& targetHost) const;
std::unordered_map> get_range_to_address_map(const sstring& keyspace,
const std::vector& sorted_tokens) const;
/**
* The same as {@code describeRing(String)} but converts TokenRange to the String for JMX compatibility
*
* @param keyspace The keyspace to fetch information about
*
* @return a List of TokenRange(s) converted to String for the given keyspace
*/
/*
* describeRingJMX will be implemented in the API
* It is left here just as a marker that there is no need to implement it
* here
*/
//std::vector describeRingJMX(const sstring& keyspace) const {
#if 0
/**
* The same as {@code describeRing(String)} but considers only the part of the ring formed by nodes in the local DC.
*/
public List describeLocalRing(String keyspace) throws InvalidRequestException
{
return describeRing(keyspace, true);
}
#endif
std::vector describe_ring(const sstring& keyspace, bool include_only_local_dc = false) const;
/**
* Retrieve a map of tokens to endpoints, including the bootstrapping ones.
*
* @return a map of tokens to endpoints in ascending order
*/
std::map get_token_to_endpoint_map();
#if 0
public String getLocalHostId()
{
return getTokenMetadata().getHostId(FBUtilities.getBroadcastAddress()).toString();
}
public Map getHostIdMap()
{
Map mapOut = new HashMap<>();
for (Map.Entry entry : getTokenMetadata().getEndpointToHostIdMapForReading().entrySet())
mapOut.put(entry.getKey().getHostAddress(), entry.getValue().toString());
return mapOut;
}
#endif
/**
* Construct the range to endpoint mapping based on the true view
* of the world.
* @param ranges
* @return mapping of ranges to the replicas responsible for them.
*/
std::unordered_map> construct_range_to_endpoint_map(
const sstring& keyspace,
const dht::token_range_vector& ranges) const;
public:
virtual void on_join(gms::inet_address endpoint, gms::endpoint_state ep_state) override;
virtual void before_change(gms::inet_address endpoint, gms::endpoint_state current_state, gms::application_state new_state_key, const gms::versioned_value& new_value) override;
/*
* Handle the reception of a new particular ApplicationState for a particular endpoint. Note that the value of the
* ApplicationState has not necessarily "changed" since the last known value, if we already received the same update
* from somewhere else.
*
* onChange only ever sees one ApplicationState piece change at a time (even if many ApplicationState updates were
* received at the same time), so we perform a kind of state machine here. We are concerned with two events: knowing
* the token associated with an endpoint, and knowing its operation mode. Nodes can start in either bootstrap or
* normal mode, and from bootstrap mode can change mode to normal. A node in bootstrap mode needs to have
* pendingranges set in TokenMetadata; a node in normal mode should instead be part of the token ring.
*
* Normal progression of ApplicationState.STATUS values for a node should be like this:
* STATUS_BOOTSTRAPPING,token
* if bootstrapping. stays this way until all files are received.
* STATUS_NORMAL,token
* ready to serve reads and writes.
* STATUS_LEAVING,token
* get ready to leave the cluster as part of a decommission
* STATUS_LEFT,token
* set after decommission is completed.
*
* Other STATUS values that may be seen (possibly anywhere in the normal progression):
* STATUS_MOVING,newtoken
* set if node is currently moving to a new token in the ring
* REMOVING_TOKEN,deadtoken
* set if the node is dead and is being removed by its REMOVAL_COORDINATOR
* REMOVED_TOKEN,deadtoken
* set if the node is dead and has been removed by its REMOVAL_COORDINATOR
*
* Note: Any time a node state changes from STATUS_NORMAL, it will not be visible to new nodes. So it follows that
* you should never bootstrap a new node during a removenode, decommission or move.
*/
virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override;
virtual void on_alive(gms::inet_address endpoint, gms::endpoint_state state) override;
virtual void on_dead(gms::inet_address endpoint, gms::endpoint_state state) override;
virtual void on_remove(gms::inet_address endpoint) override;
virtual void on_restart(gms::inet_address endpoint, gms::endpoint_state state) override;
public:
// For migration_listener
//
// Only the keyspace-level callbacks (create, update, drop) do any work here:
// each calls keyspace_changed() and blocks on the returned future with .get().
// Every other callback is an empty override.
// NOTE(review): blocking with .get() presumably requires these callbacks to be
// invoked from a context that is allowed to wait on a future -- confirm
// against the code that fires the notifications.
virtual void on_create_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
virtual void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_create_view(const sstring& ks_name, const sstring& view_name) override {}
virtual void on_update_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
virtual void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
virtual void on_drop_keyspace(const sstring& ks_name) override { keyspace_changed(ks_name).get(); }
virtual void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
virtual void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
virtual void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
virtual void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
private:
void update_peer_info(inet_address endpoint);
void do_update_system_peers_table(gms::inet_address endpoint, const application_state& state, const versioned_value& value);
sstring get_application_state_value(inet_address endpoint, application_state appstate);
std::unordered_set get_tokens_for(inet_address endpoint);
future<> replicate_to_all_cores();
semaphore _replicate_task{1};
private:
/**
* Replicates token_metadata contents on shard0 instance to other shards.
*
* Should be called with a _replicate_task semaphore taken.
* Should run on shard 0 only.
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_only();
/**
* Replicates token_metadata and gossiper::endpoint_state_map contents on
* shard0 instances to other shards.
*
* Should be called with both the _replicate_task and the
* gossiper::timer_callback semaphores taken.
* Should run on shard 0 only.
*
* @param g0 a "shared_from_this()" pointer to a gossiper instance on shard0
*
* @return a ready future when replication is complete.
*/
future<> replicate_tm_and_ep_map(shared_ptr g0);
/**
* Handle node bootstrap
*
* @param endpoint bootstrapping node
*/
void handle_state_bootstrap(inet_address endpoint);
/**
* Handle node move to normal state. That is, node is entering token ring and participating
* in reads.
*
* @param endpoint node
*/
void handle_state_normal(inet_address endpoint);
/**
* Handle node preparing to leave the ring
*
* @param endpoint node
*/
void handle_state_leaving(inet_address endpoint);
/**
* Handle node leaving the ring. This will happen when a node is decommissioned
*
* @param endpoint If reason for leaving is decommission, endpoint is the leaving node.
* @param pieces STATE_LEFT,token
*/
void handle_state_left(inet_address endpoint, std::vector pieces);
/**
* Handle node moving inside the ring.
*
* @param endpoint moving endpoint address
* @param pieces STATE_MOVING, token
*/
void handle_state_moving(inet_address endpoint, std::vector pieces);
/**
* Handle notification that a node is being actively removed from the ring via 'removenode'
*
* @param endpoint node
* @param pieces either REMOVED_TOKEN (node is gone) or REMOVING_TOKEN (replicas need to be restored)
*/
void handle_state_removing(inet_address endpoint, std::vector pieces);
private:
void excise(std::unordered_set tokens, inet_address endpoint);
void excise(std::unordered_set tokens, inet_address endpoint, long expire_time);
/** unlike excise we just need this endpoint gone without going through any notifications **/
void remove_endpoint(inet_address endpoint);
void add_expire_time_if_found(inet_address endpoint, int64_t expire_time);
/**
 * Parses the expire time stored in the third element (index 2) of pieces.
 *
 * NOTE(review): pieces is presumably the split status value handed to the
 * handle_state_* methods -- confirm against the callers.
 *
 * @param pieces split value; must contain at least three elements
 * @return the third element converted with std::stoll
 * @throws std::out_of_range if pieces has fewer than three elements, or if
 *         the number does not fit the result type
 * @throws std::invalid_argument if the element does not start with a number
 */
int64_t extract_expire_time(const std::vector& pieces) {
    // at() rather than operator[]: a too-short vector raises std::out_of_range
    // instead of reading past the end of the container.
    return std::stoll(pieces.at(2));
}
/**
* Finds living endpoints responsible for the given ranges
*
* @param keyspaceName the keyspace ranges belong to
* @param ranges the ranges to find sources for
* @return multimap of addresses to ranges the address is responsible for
*/
std::unordered_multimap get_new_source_ranges(const sstring& keyspaceName, const dht::token_range_vector& ranges);
public:
future<> confirm_replication(inet_address node);
private:
/**
* Sends a notification to a node indicating we have finished replicating data.
*
* @param remote node to send notification to
*/
future<> send_replication_notification(inet_address remote);
/**
* Called when an endpoint is removed from the ring. This function checks
* whether this node becomes responsible for new ranges as a
* consequence and streams data if needed.
*
* This is rather inefficient, but it does not matter much
* since this is called very seldom.
*
* @param endpoint the node that left
*/
future<> restore_replica_count(inet_address endpoint, inet_address notify_endpoint);
// needs to be modified to accept either a keyspace or ARS.
std::unordered_multimap get_changed_ranges_for_leaving(sstring keyspace_name, inet_address endpoint);
public:
/** raw load value */
double get_load();
sstring get_load_string();
future> get_load_map();
#if 0
public final void deliverHints(String host) throws UnknownHostException
{
HintedHandOffManager.instance.scheduleHintDelivery(host);
}
#endif
public:
future> get_local_tokens();
#if 0
/* These methods belong to the MBean interface */
public List getTokens()
{
return getTokens(FBUtilities.getBroadcastAddress());
}
public List getTokens(String endpoint) throws UnknownHostException
{
return getTokens(InetAddress.getByName(endpoint));
}
private List getTokens(InetAddress endpoint)
{
List strTokens = new ArrayList<>();
for (Token tok : getTokenMetadata().getTokens(endpoint))
strTokens.add(tok.toString());
return strTokens;
}
#endif
sstring get_release_version();
sstring get_schema_version();
future>> describe_schema_versions();
#if 0
public List getLeavingNodes()
{
return stringify(_token_metadata.getLeavingEndpoints());
}
public List getMovingNodes()
{
List endpoints = new ArrayList<>();
for (Pair node : _token_metadata.getMovingEndpoints())
{
endpoints.add(node.right.getHostAddress());
}
return endpoints;
}
public List getJoiningNodes()
{
return stringify(_token_metadata.getBootstrapTokens().valueSet());
}
public List getLiveNodes()
{
return stringify(Gossiper.instance.getLiveMembers());
}
public List getUnreachableNodes()
{
return stringify(Gossiper.instance.getUnreachableMembers());
}
private List stringify(Iterable endpoints)
{
List stringEndpoints = new ArrayList<>();
for (InetAddress ep : endpoints)
{
stringEndpoints.add(ep.getHostAddress());
}
return stringEndpoints;
}
public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
if (keyspaceName.equals(SystemKeyspace.NAME))
throw new RuntimeException("Cleanup of the system keyspace is neither necessary nor wise");
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.forceCleanup();
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.scrub(disableSnapshot, skipCorrupted);
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, true, keyspaceName, columnFamilies))
{
CompactionManager.AllSSTableOpStatus oneStatus = cfStore.sstablesRewrite(excludeCurrentVersion);
if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
status = oneStatus;
}
return status.statusCode;
}
public void forceKeyspaceCompaction(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
{
cfStore.forceMajorCompaction();
}
}
#endif
/**
* Takes the snapshot for all keyspaces. A snapshot name must be specified.
*
* @param tag the tag given to the snapshot; may not be null or empty
*/
// Single-argument convenience overload: delegates to the two-argument
// take_snapshot() with an empty keyspace list.
future<> take_snapshot(sstring tag) {
    auto snapshot_done = take_snapshot(tag, {});
    return snapshot_done;
}
/**
* Takes the snapshot for the given keyspaces. A snapshot name must be specified.
*
* @param tag the tag given to the snapshot; may not be null or empty
* @param keyspace_names the names of the keyspaces to snapshot; empty means "all".
*/
future<> take_snapshot(sstring tag, std::vector keyspace_names);
/**
* Takes the snapshot of a specific column family. A snapshot name must be specified.
*
* @param ks_name the keyspace which holds the specified column family
* @param cf_name the column family to snapshot
* @param tag the tag given to the snapshot; may not be null or empty
*/
future<> take_column_family_snapshot(sstring ks_name, sstring cf_name, sstring tag);
#if 0
private Keyspace getValidKeyspace(String keyspaceName) throws IOException
{
if (!Schema.instance.getKeyspaces().contains(keyspaceName))
{
throw new IOException("Keyspace " + keyspaceName + " does not exist");
}
return Keyspace.open(keyspaceName);
}
#endif
/**
* Remove the snapshot with the given name from the given keyspaces.
* If no tag is specified we will remove all snapshots.
*/
future<> clear_snapshot(sstring tag, std::vector keyspace_names);
future>> get_snapshot_details();
future true_snapshots_size();
#if 0
/**
* @param allowIndexes Allow index CF names to be passed in
* @param autoAddIndexes Automatically add secondary indexes if a CF has them
* @param keyspaceName keyspace
* @param cfNames CFs
* @throws java.lang.IllegalArgumentException when given CF name does not exist
*/
public Iterable getValidColumnFamilies(boolean allowIndexes, boolean autoAddIndexes, String keyspaceName, String... cfNames) throws IOException
{
Keyspace keyspace = getValidKeyspace(keyspaceName);
Set valid = new HashSet<>();
if (cfNames.length == 0)
{
// all stores are interesting
for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
{
valid.add(cfStore);
if (autoAddIndexes)
{
for (SecondaryIndex si : cfStore.indexManager.getIndexes())
{
if (si.getIndexCfs() != null) {
logger.info("adding secondary index {} to operation", si.getIndexName());
valid.add(si.getIndexCfs());
}
}
}
}
return valid;
}
// filter out interesting stores
for (String cfName : cfNames)
{
//if the CF name is an index, just flush the CF that owns the index
String baseCfName = cfName;
String idxName = null;
if (cfName.contains(".")) // secondary index
{
if(!allowIndexes)
{
logger.warn("Operation not allowed on secondary Index table ({})", cfName);
continue;
}
String[] parts = cfName.split("\\.", 2);
baseCfName = parts[0];
idxName = parts[1];
}
ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(baseCfName);
if (idxName != null)
{
Collection< SecondaryIndex > indexes = cfStore.indexManager.getIndexesByNames(new HashSet<>(Arrays.asList(cfName)));
if (indexes.isEmpty())
logger.warn(String.format("Invalid index specified: %s/%s. Proceeding with others.", baseCfName, idxName));
else
valid.add(Iterables.get(indexes, 0).getIndexCfs());
}
else
{
valid.add(cfStore);
if(autoAddIndexes)
{
for(SecondaryIndex si : cfStore.indexManager.getIndexes())
{
if (si.getIndexCfs() != null) {
logger.info("adding secondary index {} to operation", si.getIndexName());
valid.add(si.getIndexCfs());
}
}
}
}
}
return valid;
}
/**
* Flush all memtables for a keyspace and column families.
* @param keyspaceName
* @param columnFamilies
* @throws IOException
*/
public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException
{
for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
{
logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, cfStore.name);
cfStore.forceBlockingFlush();
}
}
/**
 * Builds a notification carrying the given payload and dispatches it to
 * subscribers.
 *
 * @param type       Message type
 * @param message    Message itself
 * @param userObject Arbitrary object to attach to notification
 */
public void sendNotification(String type, String message, Object userObject)
{
    // Every notification gets the next value of the serial-number counter.
    final long sequence = notificationSerialNumber.incrementAndGet();

    final Notification outgoing = new Notification(type, jmxObjectName, sequence, message);
    outgoing.setUserData(userObject);
    sendNotification(outgoing);
}
/**
 * Parses a repair specification and starts the repair asynchronously.
 *
 * When the parsed option carries no explicit ranges, the ranges are filled in
 * here: the node's local ranges for a normal repair, or its primary ranges
 * (cluster-wide or within the local data center) for a primary-range repair.
 *
 * @param keyspace   keyspace to repair
 * @param repairSpec raw repair options, parsed by RepairOption.parse
 * @return the value returned by forceRepairAsync(keyspace, option)
 * @throws IllegalArgumentException if a primary-range repair is restricted to
 *         hosts, or to data centers other than exactly the local one
 */
public int repairAsync(String keyspace, Map repairSpec)
{
    RepairOption option = RepairOption.parse(repairSpec, getPartitioner());

    // Only derive ranges when the caller did not specify any.
    if (option.getRanges().isEmpty())
    {
        if (!option.isPrimaryRange())
        {
            option.getRanges().addAll(getLocalRanges(keyspace));
        }
        else
        {
            // when repairing only primary range, neither dataCenters nor hosts can be set
            final boolean unrestricted = option.getDataCenters().isEmpty() && option.getHosts().isEmpty();
            if (unrestricted)
            {
                option.getRanges().addAll(getPrimaryRanges(keyspace));
            }
            else
            {
                // except dataCenters only contain local DC (i.e. -local)
                final boolean localDcOnly = option.getDataCenters().size() == 1
                                            && option.getDataCenters().contains(DatabaseDescriptor.getLocalDataCenter());
                if (!localDcOnly)
                {
                    throw new IllegalArgumentException("You need to run primary range repair on all nodes in the cluster.");
                }
                option.getRanges().addAll(getPrimaryRangesWithinDC(keyspace));
            }
        }
    }

    return forceRepairAsync(keyspace, option);
}
/**
 * Boolean-flag convenience overload: maps {@code isSequential} onto a
 * RepairParallelism value and delegates to the parallelism-based overload.
 *
 * @param isSequential true selects SEQUENTIAL, false selects PARALLEL
 * @return the value returned by the delegate overload
 */
@Deprecated
public int forceRepairAsync(String keyspace,
                            boolean isSequential,
                            Collection dataCenters,
                            Collection hosts,
                            boolean primaryRange,
                            boolean fullRepair,
                            String... columnFamilies)
{
    final RepairParallelism parallelism;
    if (isSequential)
    {
        parallelism = RepairParallelism.SEQUENTIAL;
    }
    else
    {
        parallelism = RepairParallelism.PARALLEL;
    }
    return forceRepairAsync(keyspace, parallelism, dataCenters, hosts, primaryRange, fullRepair, columnFamilies);
}
@Deprecated
public int forceRepairAsync(String keyspace,
RepairParallelism parallelismDegree,
Collection dataCenters,
Collection hosts,
boolean primaryRange,
boolean fullRepair,
String... columnFamilies)
{
if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
{
logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
parallelismDegree = RepairParallelism.PARALLEL;
}
RepairOption options = new RepairOption(parallelismDegree, primaryRange, !fullRepair, false, 1, Collections.>emptyList());
if (dataCenters != null)
{
options.getDataCenters().addAll(dataCenters);
}
if (hosts != null)
{
options.getHosts().addAll(hosts);
}
if (columnFamilies != null)
{
for (String columnFamily : columnFamilies)
{
options.getColumnFamilies().add(columnFamily);
}
}
return forceRepairAsync(keyspace, options);
}
/**
 * Convenience overload that expresses "local data center only" as a flag.
 *
 * @param isLocal when true the repair is restricted to the local data center;
 *                when false no data-center set is passed on (null)
 * @return the value returned by the collection-based overload
 */
public int forceRepairAsync(String keyspace,
                            boolean isSequential,
                            boolean isLocal,
                            boolean primaryRange,
                            boolean fullRepair,
                            String... columnFamilies)
{
    // null means "no data-center restriction" for the delegate.
    final Set dataCenters = isLocal
                          ? Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter())
                          : null;
    return forceRepairAsync(keyspace, isSequential, dataCenters, null, primaryRange, fullRepair, columnFamilies);
}
/**
 * Boolean-flag convenience overload for range repair: maps
 * {@code isSequential} onto a RepairParallelism value and delegates to the
 * parallelism-based overload.
 *
 * @param isSequential true selects SEQUENTIAL, false selects PARALLEL
 * @return the value returned by the delegate overload
 */
public int forceRepairRangeAsync(String beginToken,
                                 String endToken,
                                 String keyspaceName,
                                 boolean isSequential,
                                 Collection dataCenters,
                                 Collection hosts,
                                 boolean fullRepair,
                                 String... columnFamilies)
{
    final RepairParallelism parallelism;
    if (isSequential)
    {
        parallelism = RepairParallelism.SEQUENTIAL;
    }
    else
    {
        parallelism = RepairParallelism.PARALLEL;
    }
    return forceRepairRangeAsync(beginToken, endToken, keyspaceName, parallelism, dataCenters, hosts, fullRepair, columnFamilies);
}
/**
 * Starts an asynchronous repair of the token range (beginToken, endToken),
 * split along the ring layout by createRepairRangeFrom.
 *
 * {@code dataCenters}, {@code hosts} and {@code columnFamilies} are each
 * copied into the repair option only when non-null, so callers may pass null
 * to mean "no restriction".
 *
 * @param beginToken        beginning token of the range, in string form
 * @param endToken          end token of the range, in string form
 * @param keyspaceName      keyspace to repair
 * @param parallelismDegree requested parallelism; forced to PARALLEL on Windows
 * @param dataCenters       data centers to restrict the repair to, may be null
 * @param hosts             hosts to restrict the repair to, may be null
 * @param fullRepair        its negation is forwarded to the RepairOption
 *                          constructor (presumably the "incremental" flag - TODO confirm)
 * @param columnFamilies    column families to repair, may be null
 * @return the value returned by forceRepairAsync(keyspaceName, options)
 */
public int forceRepairRangeAsync(String beginToken,
                                 String endToken,
                                 String keyspaceName,
                                 RepairParallelism parallelismDegree,
                                 Collection dataCenters,
                                 Collection hosts,
                                 boolean fullRepair,
                                 String... columnFamilies)
{
    if (FBUtilities.isWindows() && parallelismDegree != RepairParallelism.PARALLEL)
    {
        logger.warn("Snapshot-based repair is not yet supported on Windows. Reverting to parallel repair.");
        parallelismDegree = RepairParallelism.PARALLEL;
    }

    Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
    RepairOption options = new RepairOption(parallelismDegree, false, !fullRepair, false, 1, repairingRange);

    // dataCenters may legitimately be null (e.g. a non-local repair requested
    // through the isLocal overload); addAll(null) would throw.
    if (dataCenters != null)
    {
        options.getDataCenters().addAll(dataCenters);
    }
    if (hosts != null)
    {
        options.getHosts().addAll(hosts);
    }
    if (columnFamilies != null)
    {
        for (String columnFamily : columnFamilies)
        {
            options.getColumnFamilies().add(columnFamily);
        }
    }

    logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
                repairingRange, keyspaceName, columnFamilies);
    return forceRepairAsync(keyspaceName, options);
}
/**
 * Convenience overload of range repair that expresses "local data center
 * only" as a flag.
 *
 * @param isLocal when true the repair is restricted to the local data center;
 *                when false no data-center set is passed on (null)
 * @return the value returned by the collection-based overload
 */
public int forceRepairRangeAsync(String beginToken,
                                 String endToken,
                                 String keyspaceName,
                                 boolean isSequential,
                                 boolean isLocal,
                                 boolean fullRepair,
                                 String... columnFamilies)
{
    // null is handed to the delegate when no data-center restriction applies.
    final Set dataCenters = isLocal
                          ? Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter())
                          : null;
    return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential, dataCenters, null, fullRepair, columnFamilies);
}
/**
* Create collection of ranges that match ring layout from given tokens.
*
* @param beginToken beginning token of the range
* @param endToken end token of the range
* @return collection of ranges that match ring layout in TokenMetadata
*/
@SuppressWarnings("unchecked")
@VisibleForTesting
Collection> createRepairRangeFrom(String beginToken, String endToken)
{
Token parsedBeginToken = getPartitioner().getTokenFactory().fromString(beginToken);
Token parsedEndToken = getPartitioner().getTokenFactory().fromString(endToken);
// Break up given range to match ring layout in TokenMetadata
ArrayList> repairingRange = new ArrayList<>();
ArrayList tokens = new ArrayList<>(_token_metadata.sortedTokens());
if (!tokens.contains(parsedBeginToken))
{
tokens.add(parsedBeginToken);
}
if (!tokens.contains(parsedEndToken))
{
tokens.add(parsedEndToken);
}
// tokens now contain all tokens including our endpoints
Collections.sort(tokens);
int start = tokens.indexOf(parsedBeginToken), end = tokens.indexOf(parsedEndToken);
for (int i = start; i != end; i = (i+1) % tokens.size())
{
Range range = new Range<>(tokens.get(i), tokens.get((i+1) % tokens.size()));
repairingRange.add(range);
}
return repairingRange;
}
/**
 * Launches a repair on a new thread and returns its command number.
 *
 * @param keyspace keyspace to repair
 * @param options  fully populated repair options
 * @return 0 without starting anything when there are no ranges to repair or
 *         the keyspace's replication factor is below 2; otherwise the newly
 *         allocated repair command number
 */
public int forceRepairAsync(String keyspace, RepairOption options)
{
    // Nothing to do without ranges.
    if (options.getRanges().isEmpty())
    {
        return 0;
    }
    // Nothing is started for a keyspace with a replication factor below 2.
    if (Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
    {
        return 0;
    }

    final int cmd = nextRepairCommand.incrementAndGet();
    final Thread repairThread = new Thread(createRepairTask(cmd, keyspace, options));
    repairThread.start();
    return cmd;
}
/**
 * Creates (but does not start) a thread that polls the trace events table for
 * the given tracing session and republishes events recorded by other nodes as
 * "repair" notifications tagged with the repair command number.
 *
 * The thread runs until the session's trace state reports STOPPED. It fails
 * with an Exception("no tracestate") if no trace state exists for the session.
 *
 * @param cmd       repair command number attached to every notification
 * @param sessionId tracing session whose events are polled
 * @return the new, unstarted thread
 */
private Thread createQueryThread(final int cmd, final UUID sessionId)
{
return new Thread(new WrappedRunnable()
{
// Query events within a time interval that overlaps the last by one second. Ignore duplicates. Ignore local traces.
// Wake up upon local trace activity. Query when notified of trace activity with a timeout that doubles every two timeouts.
public void runMayThrow() throws Exception
{
TraceState state = Tracing.instance.get(sessionId);
if (state == null)
throw new Exception("no tracestate");
String format = "select event_id, source, activity from %s.%s where session_id = ? and event_id > ? and event_id < ?;";
String query = String.format(format, TraceKeyspace.NAME, TraceKeyspace.EVENTS);
SelectStatement statement = (SelectStatement) QueryProcessor.parseStatement(query).prepare().statement;
ByteBuffer sessionIdBytes = ByteBufferUtil.bytes(sessionId);
// Rows whose source equals this address are skipped below (local traces).
InetAddress source = FBUtilities.getBroadcastAddress();
// Two alternating sets of event ids: seen[si] collects ids from the current poll,
// the other one holds ids remembered from the previous poll.
HashSet[] seen = new HashSet[] { new HashSet(), new HashSet() };
int si = 0;
UUID uuid;
// tlast: upper time bound of the previous poll; tcur: upper bound of the current one.
long tlast = System.currentTimeMillis(), tcur;
TraceState.Status status;
long minWaitMillis = 125;
long maxWaitMillis = 1000 * 1024L;
long timeout = minWaitMillis;
boolean shouldDouble = false;
while ((status = state.waitActivity(timeout)) != TraceState.Status.STOPPED)
{
if (status == TraceState.Status.IDLE)
{
// Back off: the timeout doubles on every second consecutive IDLE result, capped at maxWaitMillis.
timeout = shouldDouble ? Math.min(timeout * 2, maxWaitMillis) : timeout;
shouldDouble = !shouldDouble;
}
else
{
// Any other status resets the back-off to the minimum wait.
timeout = minWaitMillis;
shouldDouble = false;
}
// Query window: from one second before the previous upper bound up to now (tcur is assigned here).
ByteBuffer tminBytes = ByteBufferUtil.bytes(UUIDGen.minTimeUUID(tlast - 1000));
ByteBuffer tmaxBytes = ByteBufferUtil.bytes(UUIDGen.maxTimeUUID(tcur = System.currentTimeMillis()));
QueryOptions options = QueryOptions.forInternalCalls(ConsistencyLevel.ONE, Lists.newArrayList(sessionIdBytes, tminBytes, tmaxBytes));
ResultMessage.Rows rows = statement.execute(QueryState.forInternalCalls(), options);
UntypedResultSet result = UntypedResultSet.create(rows.result);
for (UntypedResultSet.Row r : result)
{
if (source.equals(r.getInetAddress("source")))
continue;
// Remember ids newer than (tcur - 1000) ms: the next window overlaps this one by a second
// and would return them again. NOTE(review): the factor 10000 presumably converts
// milliseconds to the units of UUID.timestamp() - confirm.
if ((uuid = r.getUUID("event_id")).timestamp() > (tcur - 1000) * 10000)
seen[si].add(uuid);
// Already reported during the previous poll: skip the duplicate.
if (seen[si == 0 ? 1 : 0].contains(uuid))
continue;
String message = String.format("%s: %s", r.getInetAddress("source"), r.getString("activity"));
sendNotification("repair", message, new int[]{cmd, ActiveRepairService.Status.RUNNING.ordinal()});
}
// Advance the window, swap the roles of the two sets, and empty the one
// that will collect ids during the next poll.
tlast = tcur;
si = si == 0 ? 1 : 0;
seen[si].clear();
}
}
});
}
private FutureTask