Files
scylladb/table.cc
Avi Kivity dbe347811c Merge "materialized views: Apply backpressure from view replicas" from Duarte
"
As the amount of pending view updates increases we know that there’s a
mismatch between the rate at which the base receives writes and the
rate at which the view retires them. We react by applying backpressure
to decrease the rate of incoming base writes, allowing the slow view
replicas to catch up. We want to delay the client’s next writes to a
base replica and we use the base’s backlog of view updates to derive
this delay.

To validate this approach we tested a 3 node Scylla cluster on GCE,
using n1-standard-4 instances with NVMEs. A loader running on a
n1-standard-8 instance run cassandra-stress with 100 threads. With the
delay function d(x) set to 1s, we see no base write timeouts. With the
delay function as defined in the series, we see that backlogs stabilize
at some (arbitrary) point, as predicted, but this stabilization
co-exists with base write timeouts. However, the system overall behaves
better than the current version, with the 100 view update limit, and
also better than the version without such limit or any backpressure.

More work is necessary to further stabilize the system. Namely, we want
to keep delaying until we see the backlog is decreasing. This will
require us to add more delay beyond the stabilization point, which in
turn should minimize the base write timeouts, and will also minimize the
amount of memory the backlog takes at each base replica.

Design document:
    https://docs.google.com/document/d/1J6GeLBvN8_c3SbLVp8YsOXHcLc9nOLlRY7pC6MH3JWo

Fixes #2538
"

Reviewed-by: Nadav Har'El <nyh@scylladb.com>

* 'materialized-views/backpressure/v2' of https://github.com/duarten/scylla: (32 commits)
  service/storage_proxy: Release mutation as early as possible
  service/storage_proxy: Delay replica writes based on view update backlog
  service/storage_proxy: Get the backlog of a particular base replica
  service/storage_proxy: Add counters for delayed base writes
  main: Start and stop the view_update_backlog_broker
  service: Distribute a node's view update backlog
  service: Advertise view update backlog over gossip
  service/storage_proxy: Send view update backlog from replicas
  service/storage_proxy: Prepare to receive replica view update backlog
  service/storage_proxy: Expose local view update backlog
  tests/view_schema_test: Add simple test for db::view::node_update_backlog
  db/view: Introduce node_update_backlog class
  db/hints: Initialize current backlog
  database: Add counter for current view backlog
  database: Expose current memory view update backlog
  idl: Add db::view::update_backlog
  db/view: Add view_update_backlog
  database: Wait on view update semaphore for view building
  service/storage_proxy: Use near-infinite timeouts for view updates
  database: generate_and_propagate_view_updates no longer needs a timeout
  ...

(cherry picked from commit b66f59aa3d)
2018-12-20 19:11:56 +02:00

163 lines
8.1 KiB
C++

/*
* Copyright (C) 2018 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "database.hh"
#include "sstables/sstables.hh"
#include "service/priority_manager.hh"
#include "db/view/view_updating_consumer.hh"
static logging::logger tlogger("table");
flat_mutation_reader
table::make_reader_excluding_sstable(schema_ptr s,
sstables::shared_sstable sst,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) const {
std::vector<flat_mutation_reader> readers;
readers.reserve(_memtables->size() + 1);
for (auto&& mt : *_memtables) {
readers.emplace_back(mt->make_flat_reader(s, range, slice, pc, trace_state, fwd, fwd_mr));
}
auto effective_sstables = ::make_lw_shared<sstables::sstable_set>(*_sstables);
effective_sstables->erase(sst);
readers.emplace_back(make_sstable_reader(s, std::move(effective_sstables), range, slice, pc, std::move(trace_state), fwd, fwd_mr));
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
}
void table::move_sstable_from_staging_in_thread(sstables::shared_sstable sst) {
try {
sst->move_to_new_dir_in_thread(dir(), sst->generation());
} catch (...) {
tlogger.warn("Failed to move sstable {} from staging: {}", sst->get_filename(), std::current_exception());
return;
}
_sstables_staging.erase(sst->generation());
_compaction_strategy.get_backlog_tracker().add_sstable(sst);
}
/**
* Given an update for the base table, calculates the set of potentially affected views,
* generates the relevant updates, and sends them to the paired view replicas.
*/
future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
//FIXME: Avoid unfreezing here.
auto m = fm.unfreeze(s);
return push_view_replica_updates(s, std::move(m), timeout);
}
future<row_locker::lock_holder> table::do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const {
if (!_config.view_update_concurrency_semaphore->current()) {
// We don't have resources to generate view updates for this write. If we reached this point, we failed to
// throttle the client. The memory queue is already full, waiting on the semaphore would cause this node to
// run out of memory, and generating hints would ultimately result in the disk queue being full too. We don't
// drop the base write, which could create inconsistencies between base replicas. So we dolefully continue,
// and note the fact we dropped a view update.
++_config.cf_stats->dropped_view_updates;
return make_ready_future<row_locker::lock_holder>();
}
auto& base = schema();
m.upgrade(base);
auto views = affected_views(base, m);
if (views.empty()) {
return make_ready_future<row_locker::lock_holder>();
}
auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
if (cr_ranges.empty()) {
return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }).then([] {
// In this case we are not doing a read-before-write, just a
// write, so no lock is needed.
return make_ready_future<row_locker::lock_holder>();
});
}
// We read the whole set of regular columns in case the update now causes a base row to pass
// a view's filters, and a view happens to include columns that have no value in this update.
// Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
auto columns = boost::copy_range<std::vector<column_id>>(
base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
query::partition_slice::option_set opts;
opts.set(query::partition_slice::option::send_partition_key);
opts.set(query::partition_slice::option::send_clustering_key);
opts.set(query::partition_slice::option::send_timestamp);
opts.set(query::partition_slice::option::send_ttl);
auto slice = query::partition_slice(
std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
// Take the shard-local lock on the base-table row or partition as needed.
// We'll return this lock to the caller, which will release it after
// writing the base-table update.
future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout, source = std::move(source)] (row_locker::lock_holder lock) {
return do_with(
dht::partition_range::make_singular(m.decorated_key()),
std::move(slice),
std::move(m),
[base, views = std::move(views), lock = std::move(lock), this, timeout, source = std::move(source)] (auto& pk, auto& slice, auto& m) mutable {
auto reader = source.make_reader(base, pk, slice, service::get_local_sstable_query_read_priority());
return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader)).then([lock = std::move(lock)] () mutable {
// return the local partition/row lock we have taken so it
// remains locked until the caller is done modifying this
// partition/row and destroys the lock object.
return std::move(lock);
});
});
});
}
future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const {
return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source());
}
future<row_locker::lock_holder> table::stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const {
return do_push_view_replica_updates(s, std::move(m), timeout, as_mutation_source_excluding(std::move(excluded_sstable)));
}
mutation_source
table::as_mutation_source_excluding(sstables::shared_sstable sst) const {
return mutation_source([this, sst = std::move(sst)] (schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return this->make_reader_excluding_sstable(std::move(s), std::move(sst), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
});
}
stop_iteration db::view::view_updating_consumer::consume_end_of_partition() {
if (_as.abort_requested()) {
return stop_iteration::yes;
}
try {
auto lock_holder = _table->stream_view_replica_updates(_schema, std::move(*_m), db::no_timeout, _excluded_sstable).get();
} catch (...) {
tlogger.warn("Failed to push replica updates for table {}.{}: {}", _schema->ks_name(), _schema->cf_name(), std::current_exception());
}
_m.reset();
return stop_iteration::no;
}