/* * Copyright (C) 2015 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see <http://www.gnu.org/licenses/>. */ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

// NOTE(review): the targets of the first nine #include directives below, and
// the contents of angle brackets elsewhere in this file (template arguments),
// appear to be missing from this copy -- confirm against the upstream source.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "core/future-util.hh"
#include "core/pipe.hh"
#include "sstables.hh"
#include "compaction.hh"
#include "database.hh"
#include "mutation_reader.hh"
#include "schema.hh"
#include "db/system_keyspace.hh"
#include "db/query_context.hh"
#include "service/storage_service.hh"
#include "service/priority_manager.hh"
#include "db_clock.hh"
#include "mutation_compactor.hh"
#include "leveled_manifest.hh"

namespace sstables {

// Logger shared by all compaction code in this file.
logging::logger logger("compaction");

// mutation_reader implementation that reads the rows of a single sstable
// using the local compaction I/O priority.
//
// The reader keeps its own shared_sstable reference (_sst) next to the
// underlying row reader created from it.
class sstable_reader final : public ::mutation_reader::impl {
    shared_sstable _sst;
    mutation_reader _reader;
public:
    // Takes ownership of `sst` and opens a row reader over it for `schema`.
    sstable_reader(shared_sstable sst, schema_ptr schema)
            : _sst(std::move(sst))
            , _reader(_sst->read_rows(schema, service::get_local_compaction_priority()))
            {}
    // Reads from the underlying reader. If the read fails, the error is
    // logged together with the sstable's file name and then propagated to
    // the caller as an exceptional future.
    virtual future operator()() override {
        // The lambda captures a copy of _sst so that the file name can be
        // reported from the exception handler.
        return _reader.read().handle_exception([sst = _sst] (auto ep) {
            logger.error("Compaction found an exception when reading sstable {} : {}",
                    sst->get_filename(), ep);
            return make_exception_future(ep);
        });
    }
};

// Computes a timestamp bound for the partition `dk`.
//
// Starts from api::max_timestamp and lowers it to the smallest min_timestamp
// found among the sstables that
//   - are returned by `selector` for dk's token, or are in
//     cf.compacted_undeleted_sstables(),
//   - are NOT part of `compacting_set`, and
//   - report (via filter_has_key) that they may hold dk's key.
// Returns api::max_timestamp if no such sstable exists.
// NOTE(review): presumably data older than the returned value can be purged
// safely for this key -- confirm against the caller (mutation compactor).
static api::timestamp_type get_max_purgeable_timestamp(const column_family& cf, sstable_set::incremental_selector& selector,
        const std::unordered_set& compacting_set, const dht::decorated_key& dk) {
    auto timestamp = api::max_timestamp;
    // Hashed form of the key; computed lazily, at most once, and only if at
    // least one sstable outside of the compacting set has to be checked.
    stdx::optional hk;
    for (auto&& sst : boost::range::join(selector.select(dk.token()), cf.compacted_undeleted_sstables())) {
        // Sstables being compacted are not taken into account.
        if (compacting_set.count(sst)) {
            continue;
        }
        if (!hk) {
            hk = sstables::sstable::make_hashed_key(*cf.schema(), dk.key());
        }
        if (sst->filter_has_key(*hk)) {
            timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
        }
    }
    return timestamp;
}

// Returns true if token `t` is contained in one of the ranges of
// `sorted_owned_ranges`, false otherwise.
//
// The candidate range is located with a binary search (std::lower_bound), so
// the vector is expected to be sorted, as its name says.
// NOTE(review): this also assumes the ranges do not overlap -- confirm.
static bool belongs_to_current_node(const dht::token& t, const dht::token_range_vector& sorted_owned_ranges) {
    auto low = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), t,
            [] (const range& a, const dht::token& b) {
        // check that range a is before token b.
        return a.after(b, dht::token_comparator());
    });

    // `low` is the first range for which the predicate above is false; it is
    // the only range that still has to be checked for containment.
    if (low != sorted_owned_ranges.end()) {
        const dht::token_range& r = *low;
        return r.contains(t, dht::token_comparator());
    }

    return false;
}

// Marks every sstable in `new_sstables` for deletion and logs each one at
// debug level. `ks` and `cf` (keyspace and column family names) are only used
// in the log message.
static void delete_sstables_for_interrupted_compaction(std::vector& new_sstables, sstring& ks, sstring& cf) {
    // Delete either partially or fully written sstables of a compaction that
    // was either stopped abruptly (e.g. out of disk space) or deliberately
    // (e.g. nodetool stop COMPACTION).
    for (auto& sst : new_sstables) {
        logger.debug("Deleting sstable {} of interrupted compaction for {}.{}", sst->get_filename(), ks, cf);
        sst->mark_for_deletion();
    }
}

// Returns the sstables of `cf` (including compacted but not yet deleted ones)
// that are not present in `sstables`. Sstables are matched by generation.
//
// Side effect: `sstables` is sorted by generation in place, because
// boost::set_difference requires both inputs to be sorted with the same
// ordering.
static std::vector get_uncompacting_sstables(column_family& cf, std::vector& sstables) {
    auto all_sstables = boost::copy_range>(*cf.get_sstables_including_compacted_undeleted());
    boost::sort(all_sstables, [] (const shared_sstable& x, const shared_sstable& y) {
        return x->generation() < y->generation();
    });
    std::sort(sstables.begin(), sstables.end(), [] (const shared_sstable& x, const shared_sstable& y) {
        return x->generation() < y->generation();
    });
    std::vector not_compacted_sstables;
    boost::set_difference(all_sstables, sstables,
        std::back_inserter(not_compacted_sstables), [] (const shared_sstable& x, const shared_sstable& y) {
            return x->generation() < y->generation();
        });
    return not_compacted_sstables;
}

// Consumer that writes the partitions it is fed into newly created sstables.
//
// A new output sstable (obtained from `_creator`) and its writer are created
// on demand when a partition starts and no writer is active. The current
// sstable is finished either when the underlying writer asks to stop at the
// end of a partition, or at the end of the stream. Progress is recorded in
// the compaction_info passed at construction (new_sstables,
// total_keys_written, end_size).
class compacting_sstable_writer {
    const schema& _schema;
    // Produces a fresh sstable object for each output sstable.
    std::function _creator;
    uint64_t _partitions_per_sstable;
    sstable_writer_config _cfg;
    // Level, replay position and ancestors recorded in every output sstable.
    uint32_t _sstable_level;
    db::replay_position _rp;
    std::vector _ancestors;
    compaction_info& _info;
    // Sstable currently being written, and its writer (disengaged while no
    // sstable is being written).
    shared_sstable _sst;
    stdx::optional _writer;
private:
    // Completes the sstable currently being written: ends the writer's
    // stream, drops the writer, opens the sstable's data (waiting for the
    // result with get0()) and adds its data size to _info.end_size.
    void finish_sstable_write() {
        _writer->consume_end_of_stream();
        _writer = stdx::nullopt;

        _sst->open_data().get0();
        _info.end_size += _sst->data_size();
    }
public:
    // Stores the arguments; no sstable is created until the first partition
    // is consumed. `s` and `info` are kept by reference.
    compacting_sstable_writer(const schema& s, std::function creator, uint64_t partitions_per_sstable,
                              sstable_writer_config cfg, uint32_t sstable_level, db::replay_position rp,
                              std::vector ancestors, compaction_info& info)
        : _schema(s)
        , _creator(creator)
        , _partitions_per_sstable(partitions_per_sstable)
        , _cfg(std::move(cfg))
        , _sstable_level(sstable_level)
        , _rp(rp)
        , _ancestors(std::move(ancestors))
        , _info(info)
    { }

    // Starts a new partition with key `dk`.
    // Throws compaction_stop_exception if a stop was requested for this
    // compaction. Creates the output sstable and its writer first if none is
    // currently active.
    void consume_new_partition(const dht::decorated_key& dk) {
        if (_info.is_stop_requested()) {
            // Compaction manager will catch this exception and re-schedule the compaction.
            throw compaction_stop_exception(_info.ks, _info.cf, _info.stop_requested);
        }
        if (!_writer) {
            _sst = _creator();
            // Register the new sstable right away, before anything is written
            // to it.
            _info.new_sstables.push_back(_sst);
            _sst->get_metadata_collector().set_replay_position(_rp);
            _sst->get_metadata_collector().sstable_level(_sstable_level);
            for (auto ancestor : _ancestors) {
                _sst->add_ancestor(ancestor);
            }

            auto&& priority = service::get_local_compaction_priority();
            _writer.emplace(_sst->get_writer(_schema, _partitions_per_sstable, _cfg, priority));
        }
        _info.total_keys_written++;
        _writer->consume_new_partition(dk);
    }

    // The fragment consumers below forward to the active writer. The extra
    // tombstone/bool arguments are ignored.
    void consume(tombstone t) { _writer->consume(t); }
    stop_iteration consume(static_row&& sr, tombstone, bool) { return _writer->consume(std::move(sr)); }
    stop_iteration consume(clustering_row&& cr, tombstone, bool) { return _writer->consume(std::move(cr)); }
    stop_iteration consume(range_tombstone&& rt) { return _writer->consume(std::move(rt)); }

    // Ends the current partition. If the writer returns stop_iteration::yes
    // the current sstable is finished, so that the next partition starts a
    // new one. Returns the writer's answer unchanged.
    stop_iteration consume_end_of_partition() {
        auto ret = _writer->consume_end_of_partition();
        if (ret == stop_iteration::yes) {
            finish_sstable_write();
        }
        return ret;
    }

    // Finishes the sstable being written, if any.
    void consume_end_of_stream() {
        if (_writer) {
            finish_sstable_write();
        }
    }
};

// compact_sstables compacts the given list of sstables creating one
// (currently) or more (in the future) new sstables. The new sstables
// are created using the "sstable_creator" object passed by the caller.
//
// The work runs inside a seastar::async() thread; the returned future
// resolves to the vector of newly written sstables.
//
// Parameters:
//   sstables         - input sstables; must not be empty (asserted).
//   cf               - column family the sstables belong to.
//   creator          - called to create each output sstable.
//   max_sstable_size - used to estimate the number of output sstables and
//                      passed to the writer configuration.
//   sstable_level    - level handed to the writer of the output sstables.
//   cleanup          - when true, the job is reported as a cleanup,
//                      partitions whose token is outside of the locally
//                      owned ranges are dropped, and the compaction history
//                      is not updated.
//
// In all modes, partitions whose token belongs to another shard are dropped.
// If consuming the input throws, the compaction is deregistered, the output
// sstables written so far are marked for deletion and the exception is
// rethrown.
future> compact_sstables(std::vector sstables, column_family& cf, std::function creator,
                           uint64_t max_sstable_size, uint32_t sstable_level, bool cleanup) {
    return seastar::async([sstables = std::move(sstables), &cf, creator = std::move(creator), max_sstable_size, sstable_level, cleanup] () mutable {
        // keep an immutable copy of sstable set because selector needs it alive
        // and also sstables created after compaction shouldn't be considered.
        const sstable_set s = cf.get_sstable_set();
        auto selector = s.make_incremental_selector();
        std::vector<::mutation_reader> readers;
        uint64_t estimated_partitions = 0;
        std::vector ancestors;
        auto info = make_lw_shared();
        auto& cm = cf.get_compaction_manager();
        sstring sstable_logger_msg = "[";

        info->type = (cleanup) ? compaction_type::Cleanup : compaction_type::Compaction;
        // register compaction_stats of starting compaction into compaction manager
        cm.register_compaction(info);

        assert(sstables.size() > 0);

        db::replay_position rp;

        auto schema = cf.schema();
        // One pass over the input: create a reader per sstable and accumulate
        // the statistics (partition estimate, ancestors, input size, highest
        // replay position) needed to set up the writer and the log messages.
        for (auto sst : sstables) {
            // We also capture the sstable, so we keep it alive while the read isn't done
            readers.emplace_back(make_mutation_reader(sst, schema));
            // FIXME: If the sstables have cardinality estimation bitmaps, use that
            // for a better estimate for the number of partitions in the merged
            // sstable than just adding up the lengths of individual sstables.
            estimated_partitions += sst->get_estimated_key_count();
            info->total_partitions += sst->get_estimated_key_count();
            // Compacted sstable keeps track of its ancestors.
            ancestors.push_back(sst->generation());
            sstable_logger_msg += sprint("%s:level=%d, ", sst->get_filename(), sst->get_sstable_level());
            info->start_size += sst->data_size();
            // TODO:
            // Note that this is not fully correct. Since we might be merging sstables that originated on
            // another shard (#cpu changed), we might be comparing RP:s with differing shard ids,
            // which might vary in "comparable" size quite a bit. However, since the worst that happens
            // is that we might miss a high water mark for the commit log replayer,
            // this is kind of ok, esp. since we will hopefully not be trying to recover based on
            // compacted sstables anyway (CL should be clean by then).
            rp = std::max(rp, sst->get_stats_metadata().position);
        }

        // Estimate how many output sstables are needed for the total input
        // size (at least one) and spread the estimated partitions evenly
        // across them.
        uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(info->start_size) / max_sstable_size)));
        uint64_t partitions_per_sstable = ceil(double(estimated_partitions) / estimated_sstables);

        sstable_logger_msg += "]";
        info->sstables = sstables.size();
        info->ks = schema->ks_name();
        info->cf = schema->cf_name();
        logger.info("{} {}", (!cleanup) ? "Compacting" : "Cleaning", sstable_logger_msg);

        // The locally owned ranges are only needed (and only fetched) for cleanup.
        dht::token_range_vector owned_ranges;
        if (cleanup) {
            owned_ranges = service::get_local_storage_service().get_local_ranges(schema->ks_name());
        }

        auto reader = make_combined_reader(std::move(readers));

        auto start_time = db_clock::now();

        std::unordered_set compacting_set(sstables.begin(), sstables.end());
        // Captures locals by reference: cf, selector and compacting_set all
        // outlive the consumer created below, which is used only within this
        // function body.
        auto get_max_purgeable = [&cf, &selector, &compacting_set] (const dht::decorated_key& dk) {
            return get_max_purgeable_timestamp(cf, selector, compacting_set, dk);
        };
        sstable_writer_config cfg;
        cfg.max_sstable_size = max_sstable_size;
        auto cr = compacting_sstable_writer(*schema, creator, partitions_per_sstable, std::move(cfg), sstable_level, rp, std::move(ancestors), *info);
        auto cfc = make_stable_flattened_mutations_consumer>(
                *schema, gc_clock::now(), std::move(cr), get_max_purgeable);

        // Returns true for partitions that must be kept: the token has to map
        // to this shard and, for cleanup, has to fall into a locally owned range.
        auto filter = [cleanup, sorted_owned_ranges = std::move(owned_ranges)] (const streamed_mutation& sm) {
            if (dht::shard_of(sm.decorated_key().token()) != engine().cpu_id()) {
                return false;
            }

            if (cleanup && !belongs_to_current_node(sm.decorated_key().token(), sorted_owned_ranges)) {
                return false;
            }
            return true;
        };

        try {
            consume_flattened_in_thread(reader, cfc, filter);
        } catch (...) {
            // Undo the registration and discard whatever output was produced
            // before letting the caller see the failure.
            cm.deregister_compaction(info);
            delete_sstables_for_interrupted_compaction(info->new_sstables, info->ks, info->cf);
            throw;
        }

        // deregister compaction_stats of finished compaction from compaction manager.
        cm.deregister_compaction(info);

        double ratio = double(info->end_size) / double(info->start_size);
        auto end_time = db_clock::now();
        // time taken by compaction in seconds.
        auto duration = std::chrono::duration(end_time - start_time);
        auto throughput = (double(info->end_size) / (1024*1024)) / duration.count();
        sstring new_sstables_msg;

        for (auto& newtab : info->new_sstables) {
            new_sstables_msg += sprint("%s:level=%d, ", newtab->get_filename(), newtab->get_sstable_level());
        }

        // FIXME: there is some missing information in the log message below.
        // look at CompactionTask::runMayThrow() in origin for reference.
        // - add support to merge summary (message: Partition merge counts were {%s}.).
        // - there is no easy way, currently, to know the exact number of total partitions.
        // For the time being, using estimated key count.
        logger.info("{} {} sstables to [{}]. {} bytes to {} (~{}% of original) in {}ms = {}MB/s. " \
            "~{} total partitions merged to {}.", (!cleanup) ? "Compacted" : "Cleaned",
            info->sstables, new_sstables_msg, info->start_size, info->end_size, (int) (ratio * 100),
            std::chrono::duration_cast(duration).count(), throughput,
            info->total_partitions, info->total_keys_written);

        // If compaction is running for testing purposes, detect that there is
        // no query context and skip code that updates compaction history.
        if (!db::qctx) {
            return std::move(info->new_sstables);
        }

        // Skip code that updates compaction history if running on behalf of a cleanup job.
        if (cleanup) {
            return std::move(info->new_sstables);
        }

        auto compacted_at = std::chrono::duration_cast(end_time.time_since_epoch()).count();

        // FIXME: add support to merged_rows. merged_rows is a histogram that
        // shows how many sstables each row is merged from. This information
        // cannot be accessed until we make combined_reader more generic,
        // for example, by adding a reducer method.
        db::system_keyspace::update_compaction_history(info->ks, info->cf, compacted_at,
                info->start_size, info->end_size, std::unordered_map{}).get0();

        // Return vector with newly created sstable(s).
        return std::move(info->new_sstables);
    });
}

// Returns the sstables from `compacting` that are considered fully expired.
//
// A compacting sstable is a candidate if its max_local_deletion_time is
// lower than `gc_before`. A candidate is returned only if its max_timestamp
// is lower than the smallest min_timestamp found among
//   - the uncompacting sstables that overlap the compacting ones and whose
//     max_local_deletion_time is >= gc_before, and
//   - the compacting sstables that are not candidates.
// Returns an empty vector if `compacting` is empty.
//
// Side effect: `compacting` is handed to get_uncompacting_sstables(), which
// sorts it by generation in place.
std::vector
get_fully_expired_sstables(column_family& cf, std::vector& compacting, int32_t gc_before) {
    logger.debug("Checking droppable sstables in {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name());

    if (compacting.empty()) {
        return {};
    }

    std::list candidates;
    auto uncompacting_sstables = get_uncompacting_sstables(cf, compacting);
    // Get list of uncompacting sstables that overlap the ones being compacted.
    std::vector overlapping = leveled_manifest::overlapping(*cf.schema(), compacting, uncompacting_sstables);
    int64_t min_timestamp = std::numeric_limits::max();

    // Overlapping sstables that are not droppable themselves bound the
    // timestamps that may be dropped.
    for (auto& sstable : overlapping) {
        if (sstable->get_stats_metadata().max_local_deletion_time >= gc_before) {
            min_timestamp = std::min(min_timestamp, sstable->get_stats_metadata().min_timestamp);
        }
    }

    // SStables that do not contain live data are added to list of possibly expired sstables.
    for (auto& candidate : compacting) {
        logger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
                    candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
        if (candidate->get_stats_metadata().max_local_deletion_time < gc_before) {
            logger.debug("Adding candidate of generation {} to list of possibly expired sstables", candidate->generation());
            candidates.push_back(candidate);
        } else {
            // A compacting sstable that is kept lowers the bound as well.
            min_timestamp = std::min(min_timestamp, candidate->get_stats_metadata().min_timestamp);
        }
    }

    // min_timestamp is final at this point; filter the candidates against it.
    auto it = candidates.begin();
    while (it != candidates.end()) {
        auto& candidate = *it;
        // Remove from list any candidate that may contain a tombstone that covers older data.
        if (candidate->get_stats_metadata().max_timestamp >= min_timestamp) {
            it = candidates.erase(it);
        } else {
            logger.debug("Dropping expired SSTable {} (maxLocalDeletionTime={}, gcBefore={})",
                    candidate->get_filename(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
            it++;
        }
    }
    return std::vector(candidates.begin(), candidates.end());
}

}