/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "schema_registry.hh"
#include "service/priority_manager.hh"
#include "multishard_mutation_query.hh"

#include <boost/range/adaptor/reversed.hpp>

logging::logger mmq_log("multishard_mutation_query");

// A unique_ptr whose pointee lives on (and must be destroyed on) another shard.
template <typename T>
using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;

// Keeps track of the per-shard readers used by a single page of a multishard
// mutation query.
//
// For every shard the context stores a `reader_state` (see the state diagram
// below). Readers are either looked up from the shard's querier cache
// (lookup_readers()) or created on demand (factory()), handed back when the
// combining reader is done with them (dismantler()) and finally either saved
// into the querier cache or cleaned up (save_readers()).
//
// The context is neither copyable nor movable: continuations capture `this`
// and references into `_readers`.
class read_context {
    // The range and slice a shard reader was created with. They are kept in
    // heap storage so that their addresses stay stable while the reader
    // references them.
    // NOTE(review): pointee const-ness was reconstructed from usage (the
    // querier's reader_range()/reader_slice() results are stored here and
    // later moved back into a shard_mutation_querier) -- confirm.
    struct reader_params {
        std::unique_ptr<const dht::partition_range> range;
        std::unique_ptr<const query::partition_slice> slice;

        reader_params(dht::partition_range range, query::partition_slice slice)
            : range(std::make_unique<const dht::partition_range>(std::move(range)))
            , slice(std::make_unique<const query::partition_slice>(std::move(slice))) {
        }
        reader_params(std::unique_ptr<const dht::partition_range> range, std::unique_ptr<const query::partition_slice> slice)
            : range(std::move(range))
            , slice(std::move(slice)) {
        }
    };

    // Everything do_make_remote_reader() creates on the remote shard.
    struct bundled_remote_reader {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        foreign_unique_ptr<flat_mutation_reader> reader;
    };

    using inexistent_state = std::monostate;
    struct successful_lookup_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        foreign_unique_ptr<flat_mutation_reader> reader;
    };
    struct used_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
    };
    struct dismantling_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        future<stopped_foreign_reader> reader_fut;
        circular_buffer<mutation_fragment> buffer;
    };
    struct ready_to_save_state {
        foreign_unique_ptr<reader_params> params;
        foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
        foreign_unique_ptr<flat_mutation_reader> reader;
        circular_buffer<mutation_fragment> buffer;
    };
    struct future_used_state {
        future<used_state> fut;
    };
    struct future_dismantling_state {
        future<dismantling_state> fut;
    };

    //                ( )
    //                 |
    //      +------ inexistent_state -----+
    //      |                             |
    //  (1) |                         (6) |
    //      |                             |
    //  successful_lookup_state    future_used_state
    //      |            |             |           |
    //  (2) |        (3) |         (7) |       (8) |
    //      |            |             |           |
    //      |        used_state <---------+   future_dismantling_state
    //      |            |                         |
    //      |        (4) |                     (9) |
    //      |            |                         |
    //      |      dismantling_state <-----------------+
    //      |            |
    //      |        (5) |
    //      |            |
    //      +----> ready_to_save_state
    //                   |
    //                  (O)
    //
    //  1) lookup_readers()
    //  2) save_readers()
    //  3) make_remote_reader()
    //  4) dismantle_reader()
    //  5) prepare_reader_for_saving()
    //  6) do_make_remote_reader()
    //  7) reader is created
    //  8) dismantle_reader()
    //  9) reader is created
    using reader_state = std::variant<
        inexistent_state,
        successful_lookup_state,
        used_state,
        dismantling_state,
        ready_to_save_state,
        future_used_state,
        future_dismantling_state>;

    distributed<database>& _db;
    schema_ptr _schema;
    const query::read_command& _cmd;
    const dht::partition_range_vector& _ranges;
    tracing::trace_state_ptr _trace_state;

    // One for each shard. Index is shard id.
    std::vector<reader_state> _readers;

    // Creates a brand new reader (together with its params and read
    // operation) on `shard`.
    static future<bundled_remote_reader> do_make_remote_reader(
            distributed<database>& db,
            shard_id shard,
            schema_ptr schema,
            const dht::partition_range& pr,
            const query::partition_slice& ps,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state);

    // Hands out the reader for `shard`: the looked-up one if there is one,
    // a freshly created one otherwise.
    future<foreign_unique_ptr<flat_mutation_reader>> make_remote_reader(
            shard_id shard,
            schema_ptr schema,
            const dht::partition_range& pr,
            const query::partition_slice& ps,
            const io_priority_class& pc,
            tracing::trace_state_ptr trace_state,
            streamed_mutation::forwarding fwd_sm,
            mutation_reader::forwarding fwd_mr);

    // Records that the reader of `shard` is being stopped.
    void dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut);

    // Releases, on their owner shards, the readers that will not be saved.
    future<> cleanup_readers();

    // Moves a dismantled reader into ready_to_save_state. Returns nullptr if
    // stopping the reader failed.
    ready_to_save_state* prepare_reader_for_saving(dismantling_state& current_state, future<stopped_foreign_reader>&& stopped_reader_fut,
            const dht::decorated_key& last_pkey, const std::optional<clustering_key_prefix>& last_ckey);
    // Distributes the unconsumed fragments among the buffers of the shards
    // that own them.
    void dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer, const dht::decorated_key& pkey);
    // Pushes the fragments held by the compaction state to the front of the
    // buffer of the shard that owns the partition.
    void dismantle_compaction_state(detached_compaction_state compaction_state);
    // Inserts the reader into the querier cache of its owner shard.
    future<> save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
            const std::optional<clustering_key_prefix>& last_ckey);

public:
    read_context(distributed<database>& db, schema_ptr s, const query::read_command& cmd, const dht::partition_range_vector& ranges,
            tracing::trace_state_ptr trace_state)
            : _db(db)
            , _schema(std::move(s))
            , _cmd(cmd)
            , _ranges(ranges)
            , _trace_state(std::move(trace_state)) {
        _readers.resize(smp::count);
    }

    read_context(read_context&&) = delete;
    read_context(const read_context&) = delete;

    read_context& operator=(read_context&&) = delete;
    read_context& operator=(const read_context&) = delete;

    // Factory to be passed to the multishard combining reader. The returned
    // callable captures `this`.
    remote_reader_factory factory() {
        return [this] (
                shard_id shard,
                schema_ptr schema,
                const dht::partition_range& pr,
                const query::partition_slice& ps,
                const io_priority_class& pc,
                tracing::trace_state_ptr trace_state,
                streamed_mutation::forwarding fwd_sm,
                mutation_reader::forwarding fwd_mr) {
            return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_sm, fwd_mr);
        };
    }

    // Dismantler to be passed to the multishard combining reader. The
    // returned callable captures `this`.
    foreign_reader_dismantler dismantler() {
        return [this] (shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
            dismantle_reader(shard, std::move(stopped_reader_fut));
        };
    }

    // Looks up, on every shard, a querier saved by a previous page.
    future<> lookup_readers();

    // Saves the readers for the next page, or cleans them up if the query
    // has no UUID.
    future<> save_readers(circular_buffer<mutation_fragment> unconsumed_buffer, detached_compaction_state compaction_state,
            std::optional<clustering_key_prefix> last_ckey);
};

// Runs on `shard`: copies `pr` and `ps` into a reader_params object, marks a
// read as in progress on the table and creates the reader over the copied
// range and slice. All three objects are returned wrapped in foreign
// pointers.
//
// `pr` and `ps` are captured by reference and read on the remote shard, so
// they have to stay alive until the returned future resolves.
future<read_context::bundled_remote_reader> read_context::do_make_remote_reader(
        distributed<database>& db,
        shard_id shard,
        schema_ptr schema,
        const dht::partition_range& pr,
        const query::partition_slice& ps,
        const io_priority_class&,
        tracing::trace_state_ptr trace_state) {
    return db.invoke_on(shard, [gs = global_schema_ptr(schema), &pr, &ps, gts = tracing::global_trace_state_ptr(std::move(trace_state))] (
                database& db) {
        auto s = gs.get();
        auto& table = db.find_column_family(s);
        //TODO need a way to transport io_priority_calls across shards
        auto& pc = service::get_local_sstable_query_read_priority();
        auto params = reader_params(pr, ps);
        auto read_operation = table.read_in_progress();
        auto reader = table.as_mutation_source().make_reader(std::move(s), *params.range, *params.slice, pc, gts.get());

        return make_ready_future<bundled_remote_reader>(bundled_remote_reader{
                make_foreign(std::make_unique<reader_params>(std::move(params))),
                make_foreign(std::make_unique<utils::phased_barrier::operation>(std::move(read_operation))),
                make_foreign(std::make_unique<flat_mutation_reader>(std::move(reader)))});
    });
}

// Returns the reader for `shard`.
//
// * successful_lookup_state: the looked-up reader is handed out and the
//   state becomes used_state.
// * inexistent_state: a new reader is created; the state is
//   future_used_state until the creation finishes.
// * any other state: throws std::logic_error.
future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reader(
        shard_id shard,
        schema_ptr schema,
        const dht::partition_range& pr,
        const query::partition_slice& ps,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding,
        mutation_reader::forwarding) {
    auto& rs = _readers[shard];

    if (!std::holds_alternative<successful_lookup_state>(rs) && !std::holds_alternative<inexistent_state>(rs)) {
        mmq_log.warn("Unexpected request to create reader for shard {}. A reader for this shard was already created.", shard);
        // sprint() takes printf-style directives, not "{}" placeholders.
        throw std::logic_error(sprint("Unexpected request to create reader for shard %d."
                    " A reader for this shard was already created in the context of this read.", shard));
    }

    // The reader is either in inexistent or successful lookup state.
    if (auto current_state = std::get_if<successful_lookup_state>(&rs)) {
        auto reader = std::move(current_state->reader);
        rs = used_state{std::move(current_state->params), std::move(current_state->read_operation)};
        return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(reader));
    }

    auto created = promise<used_state>();
    rs = future_used_state{created.get_future()};
    return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then([&rs, created = std::move(created)] (
                bundled_remote_reader&& bundled_reader) mutable {
        auto new_state = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
        if (std::holds_alternative<future_used_state>(rs)) {
            rs = std::move(new_state);
        } else {
            // dismantle_reader() was called in the meantime and chained a
            // continuation to the promise's future; let it proceed.
            created.set_value(std::move(new_state));
        }
        return std::move(bundled_reader.reader);
    });
}

// Moves the reader of `shard` to dismantling_state (or to
// future_dismantling_state, if the reader is still being created).
// A request for a shard in any other state is logged and ignored.
void read_context::dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
    auto& rs = _readers[shard];

    if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
        auto read_operation = std::move(maybe_used_state->read_operation);
        auto params = std::move(maybe_used_state->params);
        rs = dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut), circular_buffer<mutation_fragment>{}};
    } else if (auto* maybe_future_used_state = std::get_if<future_used_state>(&rs)) {
        auto f = maybe_future_used_state->fut.then([stopped_reader_fut = std::move(stopped_reader_fut)] (used_state&& current_state) mutable {
            auto read_operation = std::move(current_state.read_operation);
            auto params = std::move(current_state.params);
            return dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut),
                circular_buffer<mutation_fragment>{}};
        });
        rs = future_dismantling_state{std::move(f)};
    } else {
        mmq_log.warn("Unexpected request to dismantle reader for shard {}. Reader was not created nor is in the process of being created.", shard);
    }
}

// Releases the readers in dismantling_state and future_dismantling_state.
//
// The returned future only waits for the future_dismantling_state readers to
// reach dismantling_state; stopping the readers and releasing them on their
// owner shards is left running in the background.
future<> read_context::cleanup_readers() {
    auto cleanup = [db = &_db.local()] (shard_id shard, dismantling_state state) {
        return state.reader_fut.then_wrapped([db, shard, params = std::move(state.params),
                read_operation = std::move(state.read_operation)] (future<stopped_foreign_reader>&& fut) mutable {
            if (fut.failed()) {
                mmq_log.debug("Failed to stop reader on shard {}: {}", shard, fut.get_exception());
                ++db->get_stats().multishard_query_failed_reader_stops;
            } else {
                // Release the remote objects on the shard they were created on.
                smp::submit_to(shard, [reader = fut.get0().remote_reader, params = std::move(params),
                        read_operation = std::move(read_operation)] () mutable {
                    reader.release();
                    params.release();
                    read_operation.release();
                });
            }
        });
    };

    std::vector<future<>> futures;

    // Wait for pending read-aheads in the background.
    for (shard_id shard = 0; shard != smp::count; ++shard) {
        auto& rs = _readers[shard];

        if (auto maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
            cleanup(shard, std::move(*maybe_dismantling_state));
        } else if (auto maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
            futures.emplace_back(maybe_future_dismantling_state->fut.then([cleanup, shard] (dismantling_state&& current_state) {
                cleanup(shard, std::move(current_state));
            }));
        }
    }

    return when_all(futures.begin(), futures.end()).discard_result();
}

// Splits `combined_buffer` into partitions and pushes the fragments of each
// partition to the front of the buffer of the shard owning it, preserving the
// fragment order. Fragments at the beginning of the buffer that are not
// preceded by a partition-start belong to the partition `pkey`.
//
// Throws std::bad_variant_access if a target shard's reader is not in
// dismantling_state.
void read_context::dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer, const dht::decorated_key& pkey) {
    auto& partitioner = dht::global_partitioner();

    // Fragments of the partition currently being walked, in reverse order.
    std::vector<mutation_fragment> tmp_buffer;

    auto rit = std::reverse_iterator(combined_buffer.end());
    const auto rend = std::reverse_iterator(combined_buffer.begin());
    for (;rit != rend; ++rit) {
        if (rit->is_partition_start()) {
            const auto shard = partitioner.shard_of(rit->as_partition_start().key().token());
            auto& shard_buffer = std::get<dismantling_state>(_readers[shard]).buffer;
            for (auto& smf : tmp_buffer) {
                shard_buffer.emplace_front(std::move(smf));
            }
            shard_buffer.emplace_front(std::move(*rit));
            tmp_buffer.clear();
        } else {
            tmp_buffer.emplace_back(std::move(*rit));
        }
    }

    // Leftovers: the tail of the partition whose start was already consumed.
    const auto shard = partitioner.shard_of(pkey.token());
    auto& shard_buffer = std::get<dismantling_state>(_readers[shard]).buffer;
    for (auto& smf : tmp_buffer) {
        shard_buffer.emplace_front(std::move(smf));
    }
}

// Pushes the partition start, the static row (if any) and the range
// tombstones held by `compaction_state` -- in this order -- to the front of
// the buffer of the shard owning the partition.
//
// Throws std::bad_variant_access if that shard's reader is not in
// dismantling_state.
void read_context::dismantle_compaction_state(detached_compaction_state compaction_state) {
    auto& partitioner = dht::global_partitioner();
    const auto shard = partitioner.shard_of(compaction_state.partition_start.key().token());
    auto& shard_buffer = std::get<dismantling_state>(_readers[shard]).buffer;

    // Everything is pushed to the front, hence the reverse order.
    for (auto& rt : compaction_state.range_tombstones | boost::adaptors::reversed) {
        shard_buffer.emplace_front(std::move(rt));
    }

    if (compaction_state.static_row) {
        shard_buffer.emplace_front(std::move(*compaction_state.static_row));
    }

    shard_buffer.emplace_front(std::move(compaction_state.partition_start));
}

// Combines the state kept for a dismantled reader with the result of
// stopping it and transitions the shard to ready_to_save_state.
//
// Returns a pointer to the new state, or nullptr if stopping the reader
// failed (the failure is logged and accounted). `current_state` must not be
// used after a successful call, it is overwritten by the new state.
// `last_pkey` and `last_ckey` are currently unused.
read_context::ready_to_save_state* read_context::prepare_reader_for_saving(
        dismantling_state& current_state,
        future<stopped_foreign_reader>&& stopped_reader_fut,
        const dht::decorated_key& last_pkey,
        const std::optional<clustering_key_prefix>& last_ckey) {
    const auto shard = current_state.params.get_owner_shard();
    auto& rs = _readers[shard];

    if (stopped_reader_fut.failed()) {
        mmq_log.debug("Failed to stop reader on shard {}: {}", shard, stopped_reader_fut.get_exception());
        ++_db.local().get_stats().multishard_query_failed_reader_stops;
        return nullptr;
    }

    auto stopped_reader = stopped_reader_fut.get0();

    // If the buffer is empty just overwrite it.
    // If it has some data in it append the fragments to the back.
    // The unconsumed fragments appended here come from the
    // foreign_reader which is at the lowest layer, hence its
    // fragments need to be at the back of the buffer.
    if (current_state.buffer.empty()) {
        current_state.buffer = std::move(stopped_reader.unconsumed_fragments);
    } else {
        std::move(stopped_reader.unconsumed_fragments.begin(), stopped_reader.unconsumed_fragments.end(), std::back_inserter(current_state.buffer));
    }
    rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation), std::move(stopped_reader.remote_reader),
        std::move(current_state.buffer)};
    return &std::get<ready_to_save_state>(rs);
}

// Runs on the reader's owner shard: puts the buffered fragments back into the
// reader, wraps it into a shard_mutation_querier and inserts that into the
// shard's querier cache under the query's UUID.
//
// Never fails: errors are logged and accounted in
// multishard_query_failed_reader_saves. `current_state`, `last_pkey` and
// `last_ckey` are accessed from the remote shard and have to stay alive until
// the returned future resolves.
future<> read_context::save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
        const std::optional<clustering_key_prefix>& last_ckey) {
    const auto shard = current_state.reader.get_owner_shard();

    return _db.invoke_on(shard, [query_uuid = _cmd.query_uuid, query_ranges = _ranges, &current_state, &last_pkey, &last_ckey,
            gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
        try {
            auto params = current_state.params.release();
            auto read_operation = current_state.read_operation.release();
            auto reader = current_state.reader.release();
            auto& buffer = current_state.buffer;
            const auto fragments = buffer.size();
            const auto size_before = reader->buffer_size();

            auto rit = std::reverse_iterator(buffer.cend());
            auto rend = std::reverse_iterator(buffer.cbegin());
            auto& schema = *reader->schema();
            for (;rit != rend; ++rit) {
                // Copy the fragment, the buffer is on another shard.
                reader->unpop_mutation_fragment(mutation_fragment(schema, *rit));
            }

            const auto size_after = reader->buffer_size();

            auto querier = query::shard_mutation_querier(
                    std::move(query_ranges),
                    std::move(params->range),
                    std::move(params->slice),
                    std::move(*reader),
                    last_pkey,
                    last_ckey);

            db.get_querier_cache().insert(query_uuid, std::move(querier), gts.get());

            db.get_stats().multishard_query_unpopped_fragments += fragments;
            db.get_stats().multishard_query_unpopped_bytes += (size_after - size_before);
        } catch (...) {
            // We don't want to fail a read just because of a failure to
            // save any of the readers.
            mmq_log.debug("Failed to save reader: {}", std::current_exception());
            ++db.get_stats().multishard_query_failed_reader_saves;
        }
    }).handle_exception([this, shard] (std::exception_ptr e) {
        // We don't want to fail a read just because of a failure to
        // save any of the readers.
        mmq_log.debug("Failed to save reader on shard {}: {}", shard, e);
        // This will account the failure on the local shard but we don't
        // know where exactly the failure happened anyway.
        ++_db.local().get_stats().multishard_query_failed_reader_saves;
    });
}

// Looks up, on every shard in parallel, the querier saved for this query in
// the shard's querier cache and stores the outcome in `_readers`
// (successful_lookup_state on a hit, inexistent_state otherwise).
//
// Does nothing if the query has no UUID or this is its first page.
future<> read_context::lookup_readers() {
    if (_cmd.query_uuid == utils::UUID{} || _cmd.is_first_page) {
        return make_ready_future<>();
    }

    return parallel_for_each(boost::irange(0u, smp::count), [this] (shard_id shard) {
        return _db.invoke_on(shard, [shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
                gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable -> reader_state {
            auto schema = gs.get();
            auto querier_opt = db.get_querier_cache().lookup_shard_mutation_querier(cmd->query_uuid, *schema, *ranges, cmd->slice, gts.get());
            if (!querier_opt) {
                return inexistent_state{};
            }

            auto& q = *querier_opt;
            auto& table = db.find_column_family(schema);
            auto params = make_foreign(std::make_unique<reader_params>(std::move(q).reader_range(), std::move(q).reader_slice()));
            auto read_operation = make_foreign(std::make_unique<utils::phased_barrier::operation>(table.read_in_progress()));
            auto reader = make_foreign(std::make_unique<flat_mutation_reader>(std::move(q).reader()));
            return successful_lookup_state{std::move(params), std::move(read_operation), std::move(reader)};
        }).then([this, shard] (reader_state&& state) {
            _readers[shard] = std::move(state);
        });
    });
}

// Saves the readers of all shards so that the next page can pick them up.
//
// If the query has no UUID the readers are cleaned up instead. Otherwise the
// unconsumed fragments and the compaction state are distributed among the
// shard buffers, then each reader that was looked up or dismantled is saved
// on its owner shard. Readers in any other state are left alone.
future<> read_context::save_readers(circular_buffer<mutation_fragment> unconsumed_buffer, detached_compaction_state compaction_state,
            std::optional<clustering_key_prefix> last_ckey) {
    if (_cmd.query_uuid == utils::UUID{}) {
        return cleanup_readers();
    }

    auto last_pkey = compaction_state.partition_start.key();

    // NOTE(review): both calls below expect the affected shards to be in
    // dismantling_state and throw std::bad_variant_access otherwise.
    dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
    dismantle_compaction_state(std::move(compaction_state));

    return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
                const std::optional<clustering_key_prefix>& last_ckey) {
        return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
            // Looked up but never used: save it as-is, with an empty buffer.
            if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
                auto& current_state = *maybe_successful_lookup_state;
                rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
                        std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
                return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
            }

            auto finish_saving = [this, &last_pkey, &last_ckey] (dismantling_state& current_state) {
                return current_state.reader_fut.then_wrapped([this, &current_state, &last_pkey, &last_ckey] (
                            future<stopped_foreign_reader>&& stopped_reader_fut) mutable {
                    if (auto* ready_state = prepare_reader_for_saving(current_state, std::move(stopped_reader_fut), last_pkey, last_ckey)) {
                        return save_reader(*ready_state, last_pkey, last_ckey);
                    }
                    return make_ready_future<>();
                });
            };

            if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
                return finish_saving(*maybe_dismantling_state);
            }

            if (auto* maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
                return maybe_future_dismantling_state->fut.then([this, &rs,
                        finish_saving = std::move(finish_saving)] (dismantling_state&& next_state) mutable {
                    // Store the state in the variant first so that the
                    // reference handed to finish_saving() stays valid.
                    rs = std::move(next_state);
                    return finish_saving(std::get<dismantling_state>(rs));
                });
            }

            return make_ready_future<>();
        });
    });
}

// Reads one page: looks up saved readers, reads `ranges` through a multishard
// combining reader, builds the reconcilable result and -- if the page ended
// because a limit was reached or the result is a short read -- saves the
// readers for the next page.
//
// A failure to save the readers does not fail the read.
//
// NOTE(review): `compact_for_mutation_query_state` was reconstructed from
// usage; `query::consume_page` is called without explicit template arguments
// on the assumption that they are deducible -- confirm against querier.hh.
static future<reconcilable_result> do_query_mutations(
        distributed<database>& db,
        schema_ptr s,
        const query::read_command& cmd,
        const dht::partition_range_vector& ranges,
        tracing::trace_state_ptr trace_state,
        db::timeout_clock::time_point timeout,
        query::result_memory_accounter&& accounter) {
    return do_with(std::make_unique<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
            accounter = std::move(accounter)] (std::unique_ptr<read_context>& ctx) mutable {
        return ctx->lookup_readers().then([&ctx, s = std::move(s), &cmd, &ranges, trace_state, timeout,
                accounter = std::move(accounter)] () mutable {
            auto ms = mutation_source([&] (schema_ptr s,
                    const dht::partition_range& pr,
                    const query::partition_slice& ps,
                    const io_priority_class& pc,
                    tracing::trace_state_ptr trace_state,
                    streamed_mutation::forwarding fwd_sm,
                    mutation_reader::forwarding fwd_mr) {
                return make_multishard_combining_reader(std::move(s), pr, ps, pc, dht::global_partitioner(), ctx->factory(), std::move(trace_state),
                        fwd_sm, fwd_mr, ctx->dismantler());
            });
            auto reader = make_flat_multi_range_reader(s, std::move(ms), ranges, cmd.slice, service::get_local_sstable_query_read_priority(),
                    trace_state, mutation_reader::forwarding::no);

            auto compaction_state = make_lw_shared<compact_for_mutation_query_state>(*s, cmd.timestamp, cmd.slice, cmd.row_limit,
                    cmd.partition_limit);

            return do_with(std::move(reader), std::move(compaction_state), [&, accounter = std::move(accounter), timeout] (
                        flat_mutation_reader& reader, lw_shared_ptr<compact_for_mutation_query_state>& compaction_state) mutable {
                auto rrb = reconcilable_result_builder(*reader.schema(), cmd.slice, std::move(accounter));
                return query::consume_page(reader,
                        compaction_state,
                        cmd.slice,
                        std::move(rrb),
                        cmd.row_limit,
                        cmd.partition_limit,
                        cmd.timestamp,
                        timeout).then([&] (std::optional<clustering_key_prefix>&& last_ckey, reconcilable_result&& result) mutable {
                    // Detach the buffer and the compaction state before the
                    // reader is destroyed, save_readers() needs them.
                    return make_ready_future<std::optional<clustering_key_prefix>,
                            reconcilable_result,
                            circular_buffer<mutation_fragment>,
                            lw_shared_ptr<compact_for_mutation_query_state>>(std::move(last_ckey), std::move(result), reader.detach_buffer(),
                                    std::move(compaction_state));
                });
            }).then_wrapped([&ctx] (future<std::optional<clustering_key_prefix>, reconcilable_result, circular_buffer<mutation_fragment>,
                    lw_shared_ptr<compact_for_mutation_query_state>>&& result_fut) {
                if (result_fut.failed()) {
                    return make_exception_future<reconcilable_result>(result_fut.get_exception());
                }

                auto [last_ckey, result, unconsumed_buffer, compaction_state] = result_fut.get();
                // The page covered everything, there is no next page to save
                // the readers for.
                if (!compaction_state->are_limits_reached() && !result.is_short_read()) {
                    return make_ready_future<reconcilable_result>(std::move(result));
                }

                return ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(),
                        std::move(last_ckey)).then_wrapped([result = std::move(result)] (future<>&& f) mutable {
                    // We don't want to fail a read just because of a failure
                    // to save the readers, but the exception has to be
                    // consumed, otherwise it is reported as an ignored
                    // exceptional future.
                    if (f.failed()) {
                        mmq_log.debug("Failed to save readers: {}", f.get_exception());
                    }
                    return make_ready_future<reconcilable_result>(std::move(result));
                });
            });
        });
    });
}

// Executes a mutation query over all shards and returns the reconcilable
// result together with the table's global cache hit rate.
//
// Returns an empty result right away if any of the limits is zero. Otherwise
// a mutation read of at most `max_size` is registered with the result memory
// limiter before reading. Updates the local shard's `total_reads`,
// `total_reads_failed` and `short_mutation_queries` statistics.
//
// `db`, `cmd` and `ranges` are captured by reference and have to stay alive
// until the returned future resolves.
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> query_mutations_on_all_shards(
        distributed<database>& db,
        schema_ptr s,
        const query::read_command& cmd,
        const dht::partition_range_vector& ranges,
        tracing::trace_state_ptr trace_state,
        uint64_t max_size,
        db::timeout_clock::time_point timeout) {
    if (cmd.row_limit == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) {
        return make_ready_future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>(
                make_foreign(make_lw_shared<reconcilable_result>()),
                db.local().find_column_family(s).get_global_cache_hit_rate());
    }

    return db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s = std::move(s), trace_state = std::move(trace_state),
            timeout] (query::result_memory_accounter accounter) mutable {
        return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped(
                    [&db, s = std::move(s)] (future<reconcilable_result>&& f) {
            auto& local_db = db.local();
            auto& stats = local_db.get_stats();
            if (f.failed()) {
                ++stats.total_reads_failed;
                return make_exception_future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>(f.get_exception());
            } else {
                ++stats.total_reads;
                auto result = f.get0();
                stats.short_mutation_queries += bool(result.is_short_read());
                auto hit_rate = local_db.find_column_family(s).get_global_cache_hit_rate();
                return make_ready_future<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>(
                        make_foreign(make_lw_shared<reconcilable_result>(std::move(result))), hit_rate);
            }
        });
    });
}