storage_proxy: node_local_only: always use my_host_id

The previous implementation did not handle topology changes well:
* In node_local_only mode with CL=1, if the current node is pending,
  the CL is raised to 2, causing unavailable_exception.
* If the tablet being accessed is in the write_both_read_old transition
  stage and we read with node_local_only on the new (pending) replica
  node, the replica list is empty.

This patch changes node_local_only mode to always use my_host_id as
the replica list. An explicit check ensures the current node is a
replica for the operation; otherwise on_internal_error is called.
This commit is contained in:
Petr Gusev
2025-08-14 11:00:06 +02:00
parent f689d41747
commit ed6bec2cac
2 changed files with 27 additions and 23 deletions

View File

@@ -132,14 +132,6 @@ void validate_read_replicas(const locator::effective_replication_map& erm, const
}
}
// Filters `host_ids` in place, keeping only the entry (if any) that matches
// this node's own host id as reported by the topology in `erm`.
void remove_non_local_host_ids(HostIdVector auto& host_ids, const locator::effective_replication_map& erm) {
const auto local_id = erm.get_topology().my_host_id();
auto removed = std::ranges::remove_if(host_ids, [local_id](const locator::host_id& id) {
return id != local_id;
});
host_ids.erase(removed.begin(), removed.end());
}
} // namespace
namespace storage_proxy_stats {
@@ -3427,8 +3419,20 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit, db::allow_per_partition_rate_limit allow_limit, is_cancellable cancellable, coordinator_mutate_options options) {
replica::table& table = _db.local().find_column_family(s->id());
auto erm = table.get_effective_replication_map();
host_id_vector_replica_set natural_endpoints = erm->get_natural_replicas(token);
host_id_vector_topology_change pending_endpoints = erm->get_pending_replicas(token);
host_id_vector_replica_set natural_endpoints;
host_id_vector_topology_change pending_endpoints;
if (options.node_local_only) [[unlikely]] {
if (erm->get_sharder(*s).shard_for_writes(token).empty()) [[unlikely]] {
on_internal_error(slogger,
format("Local-only write failed on node {}: not a replica for token {} of {}.{}",
my_host_id(*erm), token, s->ks_name(), s->cf_name()));
}
natural_endpoints = host_id_vector_replica_set{my_host_id(*erm)};
} else {
natural_endpoints = erm->get_natural_replicas(token);
pending_endpoints = erm->get_pending_replicas(token);
}
slogger.trace("creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints, pending_endpoints);
tracing::trace(tr_state, "Creating write handler for token: {} natural: {} pending: {}", token, natural_endpoints ,pending_endpoints);
@@ -3451,11 +3455,6 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok
}).begin();
pending_endpoints.erase(itend, pending_endpoints.end());
if (options.node_local_only) {
remove_non_local_host_ids(natural_endpoints, *erm);
remove_non_local_host_ids(pending_endpoints, *erm);
}
auto all = natural_endpoints;
std::ranges::copy(pending_endpoints, std::back_inserter(all));
@@ -5810,7 +5809,7 @@ result<::shared_ptr<abstract_read_executor>> storage_proxy::get_read_executor(lw
speculative_retry::type retry_type = schema->speculative_retry().get_type();
std::optional<locator::host_id> extra_replica;
host_id_vector_replica_set all_replicas = get_endpoints_for_reading(schema->ks_name(), *erm, token, node_local_only);
host_id_vector_replica_set all_replicas = get_endpoints_for_reading(*schema, *erm, token, node_local_only);
// Check for a non-local read before heat-weighted load balancing
// reordering of endpoints happens. The local endpoint, if
// present, is always first in the list, as get_endpoints_for_reading()
@@ -6127,7 +6126,7 @@ storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::t
while (i != ranges.end()) {
dht::partition_range& range = *i;
host_id_vector_replica_set live_endpoints = get_endpoints_for_reading(schema->ks_name(), *erm, end_token(range), node_local_only);
host_id_vector_replica_set live_endpoints = get_endpoints_for_reading(*schema, *erm, end_token(range), node_local_only);
host_id_vector_replica_set merged_preferred_replicas = preferred_replicas_for_range(*i);
host_id_vector_replica_set filtered_endpoints = filter_replicas_for_read(cl, *erm, live_endpoints, merged_preferred_replicas, pcf);
std::vector<dht::token_range> merged_ranges{to_token_range(range)};
@@ -6143,7 +6142,7 @@ storage_proxy::query_partition_key_range_concurrent(storage_proxy::clock_type::t
{
const auto current_range_preferred_replicas = preferred_replicas_for_range(*i);
dht::partition_range& next_range = *i;
host_id_vector_replica_set next_endpoints = get_endpoints_for_reading(schema->ks_name(), *erm, end_token(next_range), node_local_only);
host_id_vector_replica_set next_endpoints = get_endpoints_for_reading(*schema, *erm, end_token(next_range), node_local_only);
host_id_vector_replica_set next_filtered_endpoints = filter_replicas_for_read(cl, *erm, next_endpoints, current_range_preferred_replicas, pcf);
// Origin has this to say here:
@@ -6761,14 +6760,19 @@ void storage_proxy::sort_endpoints_by_proximity(const locator::effective_replica
}
}
host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const sstring& ks_name, const locator::effective_replication_map& erm, const dht::token& token, node_local_only node_local_only) const {
// Returns the replica endpoints to contact for a read of `token`, sorted by
// proximity.
//
// In node_local_only mode the result is always exactly this node's host id:
// the sharder is consulted only to verify that this node is a read replica
// for the token, and on_internal_error is raised otherwise. This avoids the
// topology-change pitfalls of filtering a (possibly empty or pending)
// replica list down to the local node.
host_id_vector_replica_set storage_proxy::get_endpoints_for_reading(const schema& s, const locator::effective_replication_map& erm, const dht::token& token, node_local_only node_local_only) const {
if (node_local_only) [[unlikely]] {
// No shard for reads here means this node does not own the token.
if (!erm.get_sharder(s).try_get_shard_for_reads(token)) [[unlikely]] {
on_internal_error(slogger,
format("Local-only read failed on node {}: not a replica for token {} of {}.{}",
my_host_id(erm), token, s.ks_name(), s.cf_name()));
}
return host_id_vector_replica_set{my_host_id(erm)};
}
// Normal path: take the natural read replicas, validate them, and drop
// any that are not currently alive.
auto endpoints = erm.get_replicas_for_reading(token);
validate_read_replicas(erm, endpoints);
auto it = std::ranges::remove_if(endpoints, std::not_fn(std::bind_front(&storage_proxy::is_alive, this, std::cref(erm)))).begin();
endpoints.erase(it, endpoints.end());
// NOTE(review): diff residue — this guard is unreachable after the early
// return above and is removed by this commit; confirm against the final
// merged file.
if (node_local_only) {
remove_non_local_host_ids(endpoints, erm);
}
// Local endpoint, if present, sorts first (callers rely on this ordering).
sort_endpoints_by_proximity(erm, endpoints);
return endpoints;
}

View File

@@ -389,7 +389,7 @@ private:
bool hints_enabled(db::write_type type) const noexcept;
db::hints::manager& hints_manager_for(db::write_type type);
void sort_endpoints_by_proximity(const locator::effective_replication_map& erm, host_id_vector_replica_set& eps) const;
host_id_vector_replica_set get_endpoints_for_reading(const sstring& ks_name, const locator::effective_replication_map& erm, const dht::token& token, node_local_only node_local_only) const;
host_id_vector_replica_set get_endpoints_for_reading(const schema& s, const locator::effective_replication_map& erm, const dht::token& token, node_local_only node_local_only) const;
host_id_vector_replica_set filter_replicas_for_read(db::consistency_level, const locator::effective_replication_map&, host_id_vector_replica_set live_endpoints, const host_id_vector_replica_set& preferred_endpoints, db::read_repair_decision, std::optional<locator::host_id>* extra, replica::column_family*) const;
// As above with read_repair_decision=NONE, extra=nullptr.
host_id_vector_replica_set filter_replicas_for_read(db::consistency_level, const locator::effective_replication_map&, const host_id_vector_replica_set& live_endpoints, const host_id_vector_replica_set& preferred_endpoints, replica::column_family*) const;