From 39eda6809409aae19aeca2d37a049dfeb6aa9d39 Mon Sep 17 00:00:00 2001 From: Duarte Nunes Date: Mon, 8 Oct 2018 13:10:23 +0100 Subject: [PATCH] database: Don't generate view updates when node is overloaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We arrive at an overloaded state when we fail to acquire semaphore units in the base replica. This can mean clients are working in interactive mode, we fail to throttle them and consequently should start shedding load. We want to avoid impacting base table availability by running out of memory, so we could offload the memory queue to disk by writing the view updates as hints without attempting to send them. However, the disk is also a limited resource and in extreme cases we won’t be able to write hints. A tension exists between forgetting the view updates, thereby opening up a window for inconsistencies between base and view, or failing the base replica write. The latter can fail the whole user write, or if the coordinator was able to achieve CL, can instead cause inconsistencies between base tables (we wouldn't want to store a hint, because if the base replica is still overloaded, we would redo the whole dance). Between the devil and the deep blue sea, we chose to forget view updates. As a further simplification, we don't even write hints, assuming that if clients can’t be throttled (as we'll attempt to do in future patches), it will only be a matter of time before view updates can’t be offloaded. We also start acquiring the semaphore units using consume(), which is non-blocking, but allows for underflow of the available semaphore units. This is okay, and we expect not to underflow by much, as we stop generating new view updates. Refs #2538 Signed-off-by: Duarte Nunes --- database.cc | 9 +++++---- database.hh | 3 +++ table.cc | 9 +++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/database.cc b/database.cc index 5f265cec49..bb2f025fdf 100644 --- a/database.cc +++ b/database.cc @@ -2355,6 +2355,9 @@ database::setup_metrics() { sm::description("Counts sstables that survived the clustering key filtering. " "High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")), + sm::make_derive("dropped_view_updates", _cf_stats.dropped_view_updates, + sm::description("Counts the number of view updates that have been dropped due to cluster overload. ")), + sm::make_derive("total_writes", _stats->total_writes, sm::description("Counts the total number of successful write operations performed by this shard.")), @@ -4512,10 +4515,8 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base, std::move(views), flat_mutation_reader_from_mutations({std::move(m)}), std::move(existings)).then([this, timeout, base_token = std::move(base_token)] (std::vector&& updates) mutable { - return seastar::get_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates), timeout).then( - [this, base_token = std::move(base_token), updates = std::move(updates)] (db::timeout_semaphore_units units) mutable { - db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { }); - }); + auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates)); + db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { }); }); } diff --git a/database.hh b/database.hh index 7e25ffb921..9a05ccea47 100644 --- a/database.hh +++ b/database.hh @@ -280,6 +280,9 @@ struct cf_stats { int64_t clustering_filter_fast_path_count = 0; // how many sstables survived the clustering key checks int64_t surviving_sstables_after_clustering_filter = 0; + + // How many view updates were dropped due to overload. + int64_t dropped_view_updates = 0; }; class table; diff --git a/table.cc b/table.cc index c79ac6046c..b027c21466 100644 --- a/table.cc +++ b/table.cc @@ -71,6 +71,15 @@ future table::push_view_replica_updates(const schema_pt } future table::do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const { + if (!_config.view_update_concurrency_semaphore->current()) { + // We don't have resources to generate view updates for this write. If we reached this point, we failed to + // throttle the client. The memory queue is already full, waiting on the semaphore would cause this node to + // run out of memory, and generating hints would ultimately result in the disk queue being full too. We don't + // drop the base write, which could create inconsistencies between base replicas. So we dolefully continue, + // and note the fact we dropped a view update. + ++_config.cf_stats->dropped_view_updates; + return make_ready_future(); + } auto& base = schema(); m.upgrade(base); auto views = affected_views(base, m);