/* * Modified by ScyllaDB * Copyright 2015-present ScyllaDB */ /* * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0) */ #include #include "gms/inet_address.hh" #include "load_meter.hh" #include "load_broadcaster.hh" #include "cache_hitrate_calculator.hh" #include "db/system_keyspace.hh" #include "gms/application_state.hh" #include "service/storage_proxy.hh" #include "service/view_update_backlog_broker.hh" #include "replica/database.hh" #include "locator/abstract_replication_strategy.hh" #include namespace service { constexpr std::chrono::milliseconds load_broadcaster::BROADCAST_INTERVAL; logging::logger llogger("load_broadcaster"); future<> load_meter::init(sharded& db, gms::gossiper& gms) { _lb = make_shared(db, gms); _lb->start_broadcasting(); return make_ready_future<>(); } future<> load_meter::exit() { return _lb->stop_broadcasting(); } future> load_meter::get_load_map() { return smp::submit_to(0, [this] () { std::map load_map; if (_lb) { for (auto& x : _lb->get_load_info()) { load_map.emplace(format("{}", _lb->gossiper().get_address_map().find(x.first).value_or(gms::inet_address{})), x.second); llogger.debug("get_load_map endpoint={}, load={}", x.first, x.second); } load_map.emplace(format("{}", _lb->gossiper().get_broadcast_address()), get_load()); } else { llogger.debug("load_broadcaster is not set yet!"); } return load_map; }); } double load_meter::get_load() const { double bytes = 0; #if 0 for (String keyspaceName : Schema.instance.getKeyspaces()) { Keyspace keyspace = Schema.instance.getKeyspaceInstance(keyspaceName); if (keyspace == null) continue; for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores()) bytes += cfs.getLiveDiskSpaceUsed(); } #endif return bytes; } void load_broadcaster::start_broadcasting() { _done = make_ready_future<>(); // send the first broadcast "right away" (i.e., in 2 gossip heartbeats, when we should have someone to talk to); // after that send every BROADCAST_INTERVAL. 
_timer.set_callback([this] { llogger.debug("Disseminating load info ..."); _done = _db.map_reduce0([](replica::database& db) { int64_t res = 0; db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr table) { res += table->get_stats().live_disk_space_used.on_disk; }); return res; }, int64_t(0), std::plus()).then([this] (int64_t size) { return _gossiper.add_local_application_state(gms::application_state::LOAD, gms::versioned_value::load(size)).then([this] { _timer.arm(BROADCAST_INTERVAL); return make_ready_future<>(); }); }); }); _timer.arm(2 * gms::gossiper::INTERVAL); } future<> load_broadcaster::stop_broadcasting() { _timer.cancel(); return _gossiper.unregister_(shared_from_this()).then([this] { return std::move(_done); }).then([this] { _stopped = true; }); } // cache_hitrate_calculator implementation cache_hitrate_calculator::cache_hitrate_calculator(seastar::sharded& db, gms::gossiper& g) : _db(db), _gossiper(g), _timer(std::bind(std::mem_fn(&cache_hitrate_calculator::recalculate_timer), this)) {} void cache_hitrate_calculator::recalculate_timer() { _done = recalculate_hitrates().then_wrapped([p = shared_from_this()] (future f) { lowres_clock::duration d; if (f.failed()) { d = std::chrono::milliseconds(2000); } else { d = f.get(); } p->run_on((this_shard_id() + 1) % smp::count, d); }); } void cache_hitrate_calculator::run_on(size_t master, lowres_clock::duration d) { if (!_stopped) { // Do it in the background. 
(void)container().invoke_on(master, [d] (cache_hitrate_calculator& local) { local._timer.arm(d); }).handle_exception_type([] (seastar::no_sharded_instance_exception&) { /* ignore */ }); } } future cache_hitrate_calculator::recalculate_hitrates() { auto non_system_filter = [&] (const std::pair>& cf) { return !_db.local().find_keyspace(cf.second->schema()->ks_name()).get_replication_strategy().is_local(); }; auto cf_to_cache_hit_stats = [non_system_filter] (replica::database& db) { return std::ranges::to>(db.get_tables_metadata().filter(non_system_filter) | std::views::transform([] (const std::pair>& cf) { auto& stats = cf.second->get_row_cache().stats(); return std::make_pair(cf.first, stat{float(stats.reads_with_no_misses.rate().rates[0]), float(stats.reads_with_misses.rate().rates[0])}); })); }; auto sum_stats_per_cf = [] (std::unordered_map a, std::unordered_map b) { for (auto& r : b) { a[r.first] += r.second; } return a; }; return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map(), sum_stats_per_cf).then([this] (std::unordered_map rates) mutable { _diff = 0; _gstate.reserve(_slen); // assume length did not change from previous iteration _slen = 0; _rates = std::move(rates); // set calculated rates on all shards return _db.invoke_on_all([this, cpuid = this_shard_id()] (replica::database& db) { return do_for_each(_rates, [this, cpuid, &db] (auto&& r) mutable { auto cf_opt = db.get_tables_metadata().get_table_if_exists(r.first); if (!cf_opt) { // a table may be added before map/reduce completes and this code runs return; } auto& cf = cf_opt; stat& s = r.second; float rate = 0; if (s.h) { rate = s.h / (s.h + s.m); } if (this_shard_id() == cpuid) { // calculate max difference between old rate and new one for all cfs _diff = std::max(_diff, std::abs(float(cf->get_global_cache_hit_rate()) - rate)); _gstate += format("{}.{}:{:0.6f};", cf->schema()->ks_name(), cf->schema()->cf_name(), rate); } cf->set_global_cache_hit_rate(cache_temperature(rate)); }); }); 
}).then([this] { _slen = _gstate.size(); using namespace std::chrono_literals; auto now = lowres_clock::now(); // Publish CACHE_HITRATES in case: // // - We haven't published it at all // - The diff is bigger than 1% and we haven't published in the last 5 seconds // - The diff is really big 10% // // Note: A peer node can know the cache hitrate through read_data // read_mutation_data and read_digest RPC verbs which have // cache_temperature in the response. So there is no need to update // CACHE_HITRATES through gossip in high frequency. bool do_publish = (_published_nr == 0) || (_diff > 0.1) || ( _diff > 0.01 && (now - _published_time) > 5000ms); // We do the recalculation faster if the diff is bigger than 0.01. It // is useful to do the calculation even if we do not publish the // CACHE_HITRATES though gossip, since the recalculation will call the // table->set_global_cache_hit_rate to set the hitrate. auto recalculate_duration = _diff > 0.01 ? lowres_clock::duration(500ms) : lowres_clock::duration(2000ms); if (do_publish) { llogger.debug("Send CACHE_HITRATES update max_diff={}, published_nr={}", _diff, _published_nr); ++_published_nr; _published_time = now; return container().invoke_on(0, [&gstate = _gstate] (cache_hitrate_calculator& self) { return self._gossiper.add_local_application_state(gms::application_state::CACHE_HITRATES, gms::versioned_value::cache_hitrates(gstate)); }).then([recalculate_duration] { return recalculate_duration; }); } else { llogger.debug("Skip CACHE_HITRATES update max_diff={}, published_nr={}", _diff, _published_nr); return make_ready_future(recalculate_duration); } }).finally([this] { _gstate = std::string(); // free memory, do not trust clear() to do that for string _rates.clear(); }); } future<> cache_hitrate_calculator::stop() { _timer.cancel(); _stopped = true; return std::move(_done); } view_update_backlog_broker::view_update_backlog_broker( seastar::sharded& sp, gms::gossiper& gossiper) : _sp(sp) , _gossiper(gossiper) { } 
/// Registers this broker with the gossiper and, on shard 0 only, starts the
/// background loop that publishes the local view update backlog.
///
/// The loop runs until abort is requested (see stop()). Once per gossip
/// interval it asks the storage proxy whether the backlog changed and, if so,
/// publishes it as the VIEW_BACKLOG application state in the form
/// "<current bytes>:<max bytes>:<timestamp>".
///
/// \return a ready future; the loop itself is tracked in `_started`
future<> view_update_backlog_broker::start() {
    _gossiper.register_(shared_from_this());
    if (this_shard_id() == 0) {
        // Gossiper runs only on shard 0, and there's no API to add multiple, per-shard application states.
        // Also, right now we aggregate all backlogs, since the coordinator doesn't keep per-replica shard backlogs.
        _started = seastar::async([this] {
            while (!_as.abort_requested()) {
                auto backlog = _sp.local().get_view_update_backlog_if_changed().get();
                if (backlog) {
                    // NOTE(review): the duration type of this cast is assumed to be
                    // milliseconds; confirm against the consumer of the timestamp.
                    auto now = api::timestamp_type(std::chrono::duration_cast<std::chrono::milliseconds>(
                            std::chrono::system_clock::now().time_since_epoch()).count());
                    //FIXME: discarded future.
                    (void)_gossiper.add_local_application_state(
                            gms::application_state::VIEW_BACKLOG,
                            gms::versioned_value(seastar::format("{}:{}:{}",
                                    backlog->get_current_bytes(),
                                    backlog->get_max_bytes(),
                                    now)));
                }
                // Sleep one gossip interval whether or not something was published.
                sleep_abortable(gms::gossiper::INTERVAL, _as).get();
            }
        }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
    }
    return make_ready_future<>();
}

/// Unregisters from the gossiper, requests abort of the publishing loop and
/// returns the future tracking that loop.
future<> view_update_backlog_broker::stop() {
    return _gossiper.unregister_(shared_from_this()).then([this] {
        _as.request_abort();
        return std::move(_started);
    });
}

/// Gossip callback: handles a peer's VIEW_BACKLOG application state.
///
/// The value is expected as three decimal numbers separated by a single
/// character, "<current>:<max>:<timestamp>" (the format produced by start()).
/// Values that do not parse, have max == 0 or a zero timestamp are ignored.
/// A valid backlog is stored for `id` on all shards, replacing an existing
/// entry only when the new timestamp is greater.
future<> view_update_backlog_broker::on_change(gms::inet_address endpoint, locator::host_id id, const gms::application_state_map& states, gms::permit_id pid) {
    return on_application_state_change(endpoint, id, states, gms::application_state::VIEW_BACKLOG, pid,
            [this] (gms::inet_address endpoint, locator::host_id id, const gms::versioned_value& value, gms::permit_id) {
        if (utils::get_local_injector().enter("skip_updating_local_backlog_via_view_update_backlog_broker")) {
            return make_ready_future<>();
        }

        size_t current;
        size_t max;
        api::timestamp_type ticks;
        const char* start_bound = value.value().data();
        const char* const value_end = value.value().data() + value.value().size();
        char* end_bound;
        for (auto* ptr : {&current, &max}) {
            errno = 0;
            *ptr = std::strtoull(start_bound, &end_bound, 10);
            // Reject out-of-range numbers, fields without digits, and values
            // that end before the next field: stepping over a separator that
            // is not there would move start_bound past the end of the value.
            if (errno == ERANGE || end_bound == start_bound || end_bound >= value_end) {
                return make_ready_future<>();
            }
            start_bound = end_bound + 1; // skip the separator
        }
        if (max == 0) {
            return make_ready_future<>();
        }
        errno = 0;
        ticks = std::strtoll(start_bound, &end_bound, 10);
        // The timestamp must be non-zero and must consume the rest of the value.
        if (ticks == 0 || errno == ERANGE || end_bound != value_end) {
            return make_ready_future<>();
        }
        auto backlog = view_update_backlog_timestamped{db::view::update_backlog{current, max}, ticks};
        return _sp.invoke_on_all([id, backlog] (service::storage_proxy& sp) {
            auto[it, inserted] = sp._view_update_backlogs.try_emplace(id, backlog);
            if (!inserted && it->second.ts < backlog.ts) {
                it->second = backlog;
            }
            return make_ready_future<>();
        });
    });
}

/// Gossip callback: forgets the backlog recorded for a removed node.
///
/// on_change() records the backlog on every shard, so the entry is erased on
/// every shard as well.
future<> view_update_backlog_broker::on_remove(gms::inet_address endpoint, locator::host_id id, gms::permit_id) {
    return _sp.invoke_on_all([id] (service::storage_proxy& sp) {
        sp._view_update_backlogs.erase(id);
        return make_ready_future<>();
    });
}

}