mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-31 03:56:42 +00:00
Allows other topology operations to execute while tablets are being
drained on decommission. In particular, bootstrap on scale-out. This
is important for elasticity.
Allows multiple decommission/removenode to happen in parallel, which
is important for efficiency.
Flow of decommission/removenode request:
1) pending and paused, has tablet replicas on target node.
Tablet scheduler will start draining tablets.
2) No tablets on target node, request is pending but not paused
3) Request is scheduled, node is in transition
4) Request is done
Nodes are considered draining as soon as there is a leave or remove
request on them. If there are tablet replicas present on the target
node, the request is in a paused state and will not be picked by
topology coordinator. The paused state is computed from topology state
automatically on reload.
When a request is not paused, its execution starts in the
write_both_read_old state. The old tablet_draining state is no longer
entered (it is now deprecated).
Tablet load balancing will yield the state machine as soon as some
request is no longer paused and ready to be scheduled, based on
standard preemption mechanics.
The test case test_explicit_tablet_movement_during_decommission is
removed. It verified that the tablet move API works during the tablet
draining transition. After this PR, we no longer enter this transition,
so the test no longer works. It has also lost its purpose, because
movement during normal tablet balancing is not special and is tested
elsewhere.
325 lines
11 KiB
C++
325 lines
11 KiB
C++
/*
|
|
* Copyright (C) 2023-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "replica/database_fwd.hh"
|
|
#include "locator/tablets.hh"
|
|
#include "tablet_allocator_fwd.hh"
|
|
#include "locator/token_metadata_fwd.hh"
|
|
#include <seastar/core/metrics.hh>
|
|
|
|
// Forward declaration of db::system_keyspace; this header only refers to it by pointer.
namespace db { class system_keyspace; }
|
|
|
|
namespace service {
|
|
|
|
// Forward declaration; this header only passes service::topology by pointer.
class topology;
|
|
|
|
/// Counters describing the auto-repair state of tablets.
struct auto_repair_stats {
    /// Number of tablets with auto repair enabled.
    size_t enabled_nr{0};
    /// Number of tablets with auto repair enabled that currently need repair.
    size_t needs_repair_nr{0};
};
|
|
|
|
/// Set of unsigned 64-bit counters kept by the load balancer.
/// All counters start at zero.
struct load_balancer_dc_stats {
    uint64_t calls{0};
    uint64_t migrations_produced{0};
    uint64_t migrations_from_skiplist{0};
    uint64_t candidates_evaluated{0};
    uint64_t bad_first_candidates{0};
    uint64_t bad_migrations{0};
    uint64_t intranode_migrations_produced{0};
    uint64_t migrations_skipped{0};
    uint64_t tablets_skipped_node{0};
    uint64_t tablets_skipped_rack{0};
    uint64_t stop_balance{0};
    uint64_t stop_load_inversion{0};
    uint64_t stop_no_candidates{0};
    uint64_t stop_skip_limit{0};
    uint64_t stop_batch_size{0};
    uint64_t cross_rack_collocations{0};

    /// Field-wise difference `*this - other`.
    ///
    /// Each counter is subtracted independently using unsigned arithmetic,
    /// so a counter that is smaller in `*this` than in `other` wraps around.
    load_balancer_dc_stats operator-(const load_balancer_dc_stats& other) const {
        load_balancer_dc_stats delta = *this;
        delta.calls -= other.calls;
        delta.migrations_produced -= other.migrations_produced;
        delta.migrations_from_skiplist -= other.migrations_from_skiplist;
        delta.candidates_evaluated -= other.candidates_evaluated;
        delta.bad_first_candidates -= other.bad_first_candidates;
        delta.bad_migrations -= other.bad_migrations;
        delta.intranode_migrations_produced -= other.intranode_migrations_produced;
        delta.migrations_skipped -= other.migrations_skipped;
        delta.tablets_skipped_node -= other.tablets_skipped_node;
        delta.tablets_skipped_rack -= other.tablets_skipped_rack;
        delta.stop_balance -= other.stop_balance;
        delta.stop_load_inversion -= other.stop_load_inversion;
        delta.stop_no_candidates -= other.stop_no_candidates;
        delta.stop_skip_limit -= other.stop_skip_limit;
        delta.stop_batch_size -= other.stop_batch_size;
        delta.cross_rack_collocations -= other.cross_rack_collocations;
        return delta;
    }
};
|
|
|
|
class drain_failure {
|
|
locator::host_id _node;
|
|
sstring _reason;
|
|
public:
|
|
drain_failure(locator::host_id node, sstring reason)
|
|
: _node(node)
|
|
, _reason(std::move(reason))
|
|
{ }
|
|
|
|
const locator::host_id& node() const { return _node; }
|
|
const sstring& reason() const { return _reason; }
|
|
};
|
|
|
|
/// Statistics the load balancer keeps per node.
struct load_balancer_node_stats {
    /// Load value for the node; starts at zero.
    double load{0};
};
|
|
|
|
/// Cluster-wide load balancer counters; all start at zero.
struct load_balancer_cluster_stats {
    // Resize request counters.
    uint64_t resizes_emitted{0};
    uint64_t resizes_revoked{0};
    uint64_t resizes_finalized{0};

    // Auto-repair counters.
    uint64_t auto_repair_needs_repair_nr{0};
    uint64_t auto_repair_enabled_nr{0};
};
|
|
|
|
// Datacenter name; used below as the key type of the per-DC statistics map.
using dc_name = sstring;
|
|
|
|
class load_balancer_stats_manager {
|
|
using host_id = locator::host_id;
|
|
|
|
sstring group_name;
|
|
std::unordered_map<dc_name, std::unique_ptr<load_balancer_dc_stats>> _dc_stats;
|
|
std::unordered_map<host_id, std::unique_ptr<load_balancer_node_stats>> _node_stats;
|
|
load_balancer_cluster_stats _cluster_stats;
|
|
seastar::metrics::label dc_label{"target_dc"};
|
|
seastar::metrics::label node_label{"target_node"};
|
|
seastar::metrics::metric_groups _metrics;
|
|
|
|
void setup_metrics(const dc_name& dc, load_balancer_dc_stats& stats);
|
|
void setup_metrics(const dc_name& dc, host_id node, load_balancer_node_stats& stats);
|
|
void setup_metrics(load_balancer_cluster_stats& stats);
|
|
public:
|
|
load_balancer_stats_manager(sstring group_name);
|
|
|
|
load_balancer_dc_stats& for_dc(const dc_name& dc);
|
|
load_balancer_node_stats& for_node(const dc_name& dc, host_id node);
|
|
load_balancer_cluster_stats& for_cluster();
|
|
|
|
void unregister();
|
|
};
|
|
|
|
// Makes locator::tablet_migration_info available under the service namespace.
using tablet_migration_info = locator::tablet_migration_info;
|
|
|
|
/// Represents intention to emit resize (split or merge) request for a
|
|
/// table, and finalize or revoke the request previously initiated.
|
|
struct table_resize_plan {
|
|
std::unordered_map<table_id, locator::resize_decision> resize;
|
|
std::unordered_set<table_id> finalize_resize;
|
|
|
|
size_t size() const { return resize.size() + finalize_resize.size(); }
|
|
|
|
void merge(table_resize_plan&& other) {
|
|
for (auto&& [id, other_resize] : other.resize) {
|
|
if (!resize.contains(id) || other_resize.sequence_number > resize[id].sequence_number) {
|
|
resize[id] = std::move(other_resize);
|
|
}
|
|
}
|
|
|
|
finalize_resize.merge(std::move(other.finalize_resize));
|
|
}
|
|
};
|
|
|
|
struct tablet_repair_plan {
|
|
std::unordered_set<locator::global_tablet_id> _repairs;
|
|
|
|
const std::unordered_set<locator::global_tablet_id>& repairs() const {
|
|
return _repairs;
|
|
}
|
|
|
|
size_t size() const { return _repairs.size(); };
|
|
|
|
void merge(tablet_repair_plan&& other) {
|
|
for (auto& r : other._repairs) {
|
|
_repairs.insert(r);
|
|
}
|
|
}
|
|
|
|
void add(const locator::global_tablet_id& gid) {
|
|
_repairs.insert(gid);
|
|
}
|
|
};
|
|
|
|
struct tablet_rack_list_colocation_plan {
|
|
utils::UUID _request_to_resume;
|
|
|
|
const utils::UUID& request_to_resume() const noexcept {
|
|
return _request_to_resume;
|
|
}
|
|
|
|
size_t size() const { return _request_to_resume ? 1 : 0; };
|
|
|
|
void merge(tablet_rack_list_colocation_plan&& other) {
|
|
_request_to_resume = _request_to_resume ? _request_to_resume : other._request_to_resume;
|
|
}
|
|
|
|
void maybe_add_request_to_resume(const utils::UUID& id) {
|
|
if (!_request_to_resume) {
|
|
_request_to_resume = id;
|
|
}
|
|
}
|
|
};
|
|
|
|
class migration_plan {
|
|
public:
|
|
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
|
|
private:
|
|
migrations_vector _migrations;
|
|
table_resize_plan _resize_plan;
|
|
tablet_repair_plan _repair_plan;
|
|
tablet_rack_list_colocation_plan _rack_list_colocation_plan;
|
|
bool _has_nodes_to_drain = false;
|
|
std::vector<drain_failure> _drain_failures;
|
|
public:
|
|
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
|
|
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
|
|
|
|
const migrations_vector& migrations() const { return _migrations; }
|
|
bool empty() const { return _migrations.empty() && !_resize_plan.size() && !_repair_plan.size() && !_rack_list_colocation_plan.size() && _drain_failures.empty(); }
|
|
size_t size() const { return _migrations.size() + _resize_plan.size() + _repair_plan.size() + _rack_list_colocation_plan.size() + _drain_failures.size(); }
|
|
size_t tablet_migration_count() const { return _migrations.size(); }
|
|
size_t resize_decision_count() const { return _resize_plan.size(); }
|
|
size_t tablet_repair_count() const { return _repair_plan.size(); }
|
|
size_t tablet_rack_list_colocation_count() const { return _rack_list_colocation_plan.size(); }
|
|
const std::vector<drain_failure>& drain_failures() const { return _drain_failures; }
|
|
|
|
void add(tablet_migration_info info) {
|
|
_migrations.emplace_back(std::move(info));
|
|
}
|
|
|
|
void add(migrations_vector migrations) {
|
|
for (auto&& mig : migrations) {
|
|
add(std::move(mig));
|
|
}
|
|
}
|
|
|
|
void add(drain_failure failure) {
|
|
_drain_failures.emplace_back(std::move(failure));
|
|
}
|
|
|
|
void merge(migration_plan&& other) {
|
|
std::move(other._migrations.begin(), other._migrations.end(), std::back_inserter(_migrations));
|
|
std::move(other._drain_failures.begin(), other._drain_failures.end(), std::back_inserter(_drain_failures));
|
|
_has_nodes_to_drain |= other._has_nodes_to_drain;
|
|
_resize_plan.merge(std::move(other._resize_plan));
|
|
_repair_plan.merge(std::move(other._repair_plan));
|
|
_rack_list_colocation_plan.merge(std::move(other._rack_list_colocation_plan));
|
|
}
|
|
|
|
void set_has_nodes_to_drain(bool b) {
|
|
_has_nodes_to_drain = b;
|
|
}
|
|
|
|
const table_resize_plan& resize_plan() const { return _resize_plan; }
|
|
|
|
void merge_resize_plan(table_resize_plan resize_plan) {
|
|
_resize_plan.merge(std::move(resize_plan));
|
|
}
|
|
|
|
const tablet_repair_plan& repair_plan() const { return _repair_plan; }
|
|
|
|
void set_repair_plan(tablet_repair_plan repair) {
|
|
_repair_plan = std::move(repair);
|
|
}
|
|
|
|
const tablet_rack_list_colocation_plan& rack_list_colocation_plan() const { return _rack_list_colocation_plan; }
|
|
|
|
void set_rack_list_colocation_plan(tablet_rack_list_colocation_plan rack_list_colocation_plan) {
|
|
_rack_list_colocation_plan = std::move(rack_list_colocation_plan);
|
|
}
|
|
|
|
future<std::unordered_set<locator::global_tablet_id>> get_migration_tablet_ids() const;
|
|
};
|
|
|
|
// Forward declaration; tablet_allocator's constructor takes it by reference.
class migration_notifier;
|
|
|
|
class tablet_allocator {
|
|
public:
|
|
struct config {
|
|
};
|
|
class impl {
|
|
public:
|
|
virtual ~impl() = default;
|
|
};
|
|
private:
|
|
std::unique_ptr<impl> _impl;
|
|
tablet_allocator_impl& impl();
|
|
public:
|
|
tablet_allocator(config cfg, service::migration_notifier& mn, replica::database& db);
|
|
public:
|
|
future<> stop();
|
|
|
|
/// Returns a tablet migration plan that aims to achieve better load balance in the whole cluster.
|
|
/// The plan is computed based on information in the given token_metadata snapshot
|
|
/// and thus should be executed and reflected, at least as pending tablet transitions, in token_metadata
|
|
/// before this is called again.
|
|
///
|
|
/// For any given global_tablet_id there is at most one tablet_migration_info in the returned plan.
|
|
///
|
|
/// To achieve full balance, do:
|
|
///
|
|
/// while (true) {
|
|
/// auto plan = co_await balance_tablets(get_token_metadata());
|
|
/// if (plan.empty()) {
|
|
/// break;
|
|
/// }
|
|
/// co_await execute(plan);
|
|
/// }
|
|
///
|
|
/// It is ok to invoke the algorithm with already active tablet migrations. The algorithm will take them into account
|
|
/// when balancing the load as if they already succeeded. This means that applying a series of migration plans
|
|
/// produced by this function will give the same result regardless of whether applying means they are fully executed or
|
|
/// only initiated by creating corresponding transitions in tablet metadata.
|
|
///
|
|
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
|
|
///
|
|
future<migration_plan> balance_tablets(locator::token_metadata_ptr, service::topology*, db::system_keyspace*, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
|
|
|
|
void set_load_stats(locator::load_stats_ptr);
|
|
|
|
locator::load_stats_ptr get_load_stats();
|
|
|
|
load_balancer_stats_manager& stats();
|
|
|
|
void set_use_table_aware_balancing(bool);
|
|
|
|
future<locator::tablet_map> resize_tablets(locator::token_metadata_ptr, table_id);
|
|
|
|
/// Should be called when the node is no longer a leader.
|
|
void on_leadership_lost();
|
|
};
|
|
|
|
/// Asynchronously resolves to a bool for the request identified by `request_id`.
/// NOTE(review): by its name the result says whether rack-list colocation is
/// required for that request — confirm against the definition, which lives elsewhere.
future<bool> requires_rack_list_colocation(replica::database& db, locator::token_metadata_ptr tmptr, db::system_keyspace* sys_ks, utils::UUID request_id);
|
|
|
|
}
|
|
|
|
template <>
|
|
struct fmt::formatter<service::tablet_migration_info> : fmt::formatter<string_view> {
|
|
auto format(const service::tablet_migration_info&, fmt::format_context& ctx) const -> decltype(ctx.out());
|
|
};
|