Files
scylladb/service/tablet_allocator.hh
Tomasz Grabiec 551cc0233d tablets, raft topology: Add support for decommission with tablets
Load balancer will recognize decommissioning nodes and will
move tablet replicas away from such nodes with highest priority.

Topology changes have now an extra step called "tablet draining" which
calls the load balancer. The step will execute tablet migration track
as long as there are nodes which require draining. It will not do regular
load balancing.

If load balancer is unable to find new tablet replicas, because RF
cannot be met or availability is at risk due to insufficient node
distribution in racks, it will throw an exception. Currently, topology
change will retry in a loop. We should make this error cause topology
change to be paused so that admin becomes aware of the problem and
issues an abort on the topology change. There is no infrastructure for
aborts yet, so this is not implemented.
2023-09-14 13:05:49 +02:00

105 lines
3.4 KiB
C++

/*
* Copyright (C) 2023-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include "replica/database.hh"
#include "service/migration_manager.hh"
#include "locator/tablets.hh"
#include <any>
namespace service {
/// Represents intention to move a single tablet replica from src to dst.
struct tablet_migration_info {
locator::global_tablet_id tablet;
locator::tablet_replica src;
locator::tablet_replica dst;
};
class migration_plan {
public:
using migrations_vector = utils::chunked_vector<tablet_migration_info>;
private:
migrations_vector _migrations;
bool _has_nodes_to_drain = false;
public:
/// Returns true iff there are decommissioning nodes which own some tablet replicas.
bool has_nodes_to_drain() const { return _has_nodes_to_drain; }
const migrations_vector& migrations() const { return _migrations; }
bool empty() const { return _migrations.empty(); }
size_t size() const { return _migrations.size(); }
void add(tablet_migration_info info) {
_migrations.emplace_back(std::move(info));
}
void merge(migration_plan&& other) {
std::move(other._migrations.begin(), other._migrations.end(), std::back_inserter(_migrations));
_has_nodes_to_drain |= other._has_nodes_to_drain;
}
void set_has_nodes_to_drain(bool b) {
_has_nodes_to_drain = b;
}
};
class tablet_allocator_impl;
class tablet_allocator {
public:
class impl {
public:
virtual ~impl() = default;
};
private:
std::unique_ptr<impl> _impl;
tablet_allocator_impl& impl();
public:
tablet_allocator(service::migration_notifier& mn, replica::database& db);
public:
future<> stop();
/// Returns a tablet migration plan that aims to achieve better load balance in the whole cluster.
/// The plan is computed based on information in the given token_metadata snapshot
/// and thus should be executed and reflected, at least as pending tablet transitions, in token_metadata
/// before this is called again.
///
/// For any given global_tablet_id there is at most one tablet_migration_info in the returned plan.
///
/// To achieve full balance, do:
///
/// while (true) {
/// auto plan = co_await balance_tablets(get_token_metadata());
/// if (plan.empty()) {
/// break;
/// }
/// co_await execute(plan);
/// }
///
/// It is ok to invoke the algorithm with already active tablet migrations. The algorithm will take them into account
/// when balancing the load as if they already succeeded. This means that applying a series of migration plans
/// produced by this function will give the same result regardless of whether applying means they are fully executed or
/// only initiated by creating corresponding transitions in tablet metadata.
///
/// The algorithm takes care of limiting the streaming load on the system, also by taking active migrations into account.
///
future<migration_plan> balance_tablets(locator::token_metadata_ptr);
/// Should be called when the node is no longer a leader.
void on_leadership_lost();
};
}
template <>
struct fmt::formatter<service::tablet_migration_info> : fmt::formatter<std::string_view> {
auto format(const service::tablet_migration_info&, fmt::format_context& ctx) const -> decltype(ctx.out());
};