Files
scylladb/service/tablet_allocator.hh
Pavel Emelyanov b4dd732dab tablet_allocator: Add skiplist to load_balancer
Currently load balancer skips nodes only based on its "administrative"
state, i.e. whether it's drained/decommissioned/removed/etc. There's no
way to exclude any node from balancing decision based on anything else.
This patch add this ability by adding skiplist argument to
balance_tablets() method. When a node is in it, it will not be
considered, as if it was removenode-d.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2024-03-14 10:47:31 +03:00

133 lines
4.6 KiB
C++

/*
* Copyright (C) 2023-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include "replica/database_fwd.hh"
#include "locator/tablets.hh"
#include "tablet_allocator_fwd.hh"
#include "locator/token_metadata_fwd.hh"
namespace service {
using tablet_migration_info = locator::tablet_migration_info;
/// Represents intention to emit resize (split or merge) request for a
/// table, and finalize or revoke the request previously initiated.
struct table_resize_plan {
std::unordered_map<table_id, locator::resize_decision> resize;
std::unordered_set<table_id> finalize_resize;
size_t size() const { return resize.size() + finalize_resize.size(); }
void merge(table_resize_plan&& other) {
for (auto&& [id, other_resize] : other.resize) {
if (!resize.contains(id) || other_resize.sequence_number > resize[id].sequence_number) {
resize[id] = std::move(other_resize);
}
}
finalize_resize.merge(std::move(other.finalize_resize));
}
};
class migration_plan {
public:
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
private:
migrations_vector _migrations;
table_resize_plan _resize_plan;
bool _has_nodes_to_drain = false;
public:
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
const migrations_vector& migrations() const { return _migrations; }
bool empty() const { return _migrations.empty() && !_resize_plan.size(); }
size_t size() const { return _migrations.size() + _resize_plan.size(); }
size_t tablet_migration_count() const { return _migrations.size(); }
size_t resize_decision_count() const { return _resize_plan.size(); }
void add(tablet_migration_info info) {
_migrations.emplace_back(std::move(info));
}
void merge(migration_plan&& other) {
std::move(other._migrations.begin(), other._migrations.end(), std::back_inserter(_migrations));
_has_nodes_to_drain |= other._has_nodes_to_drain;
_resize_plan.merge(std::move(other._resize_plan));
}
void set_has_nodes_to_drain(bool b) {
_has_nodes_to_drain = b;
}
const table_resize_plan& resize_plan() const { return _resize_plan; }
void set_resize_plan(table_resize_plan resize_plan) {
_resize_plan = std::move(resize_plan);
}
};
class migration_notifier;
class tablet_allocator {
public:
struct config {
unsigned initial_tablets_scale = 1;
};
class impl {
public:
virtual ~impl() = default;
};
private:
std::unique_ptr<impl> _impl;
tablet_allocator_impl& impl();
public:
tablet_allocator(config cfg, service::migration_notifier& mn, replica::database& db);
public:
future<> stop();
/// Returns a tablet migration plan that aims to achieve better load balance in the whole cluster.
/// The plan is computed based on information in the given token_metadata snapshot
/// and thus should be executed and reflected, at least as pending tablet transitions, in token_metadata
/// before this is called again.
///
/// For any given global_tablet_id there is at most one tablet_migration_info in the returned plan.
///
/// To achieve full balance, do:
///
/// while (true) {
/// auto plan = co_await balance_tablets(get_token_metadata());
/// if (plan.empty()) {
/// break;
/// }
/// co_await execute(plan);
/// }
///
/// It is ok to invoke the algorithm with already active tablet migrations. The algorithm will take them into account
/// when balancing the load as if they already succeeded. This means that applying a series of migration plans
/// produced by this function will give the same result regardless of whether applying means they are fully executed or
/// only initiated by creating corresponding transitions in tablet metadata.
///
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
///
future<migration_plan> balance_tablets(locator::token_metadata_ptr, locator::load_stats_ptr = {}, std::unordered_set<locator::host_id> = {});
future<locator::tablet_map> split_tablets(locator::token_metadata_ptr, table_id);
/// Should be called when the node is no longer a leader.
void on_leadership_lost();
};
}
template <>
struct fmt::formatter<service::tablet_migration_info> : fmt::formatter<std::string_view> {
auto format(const service::tablet_migration_info&, fmt::format_context& ctx) const -> decltype(ctx.out());
};