/*
 * Copyright (C) 2015 ScyllaDB
 *
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "core/file.hh"
#include "core/fstream.hh"
#include "core/future.hh"
#include "core/sstring.hh"
#include "core/enum.hh"
#include "core/shared_ptr.hh"
#include "core/distributed.hh"

// NOTE(review): the two includes below lost their <...> targets during a
// text-processing pass. <unordered_set> and <unordered_map> match the
// std::unordered_set / std::unordered_map uses later in this header —
// confirm against upstream history.
#include <unordered_set>
#include <unordered_map>

#include "types.hh"
#include "clustering_key_filter.hh"
#include "core/enum.hh"
#include "compress.hh"
#include "row.hh"
#include "dht/i_partitioner.hh"
#include "schema.hh"
#include "mutation.hh"
#include "utils/i_filter.hh"
#include "core/stream.hh"
#include "writer.hh"
#include "metadata_collector.hh"
#include "filter.hh"
#include "exceptions.hh"
#include "mutation_reader.hh"
#include "query-request.hh"
#include "compound_compat.hh"
#include "disk-error-handler.hh"
#include "atomic_deletion.hh"
#include "sstables/shared_index_lists.hh"

namespace sstables {

extern logging::logger sstlog;

// data_consume_context is an object returned by sstable::data_consume_rows()
// which allows knowing when the consumer stops reading, and starting it again
// (e.g., when the consumer wants to stop after every sstable row).
//
// The read() method initiates reading into the consumer, and continues to
// read and feed data into the consumer until one of the consumer's callbacks
// requests to stop, or until we reach the end of the data range originally
// requested.
read() returns a future which completes when reading stopped. // If we're at the end-of-file, the read may complete without reading anything // so it's the consumer class's task to check if anything was consumed. // Note: // The caller MUST ensure that between calling read() on this object, // and the time the returned future is completed, the object lives on. // Moreover, the sstable object used for the sstable::data_consume_rows() // call which created this data_consume_context, must also be kept alive. class data_consume_context { class impl; std::unique_ptr _pimpl; // This object can only be constructed by sstable::data_consume_rows() data_consume_context(std::unique_ptr); friend class sstable; public: future<> read(); future<> fast_forward_to(uint64_t begin, uint64_t end); future<> skip_to(indexable_element, uint64_t begin); uint64_t position() const; // Define (as defaults) the destructor and move operations in the source // file, so here we don't need to know the incomplete impl type. ~data_consume_context(); data_consume_context(data_consume_context&&) noexcept; data_consume_context& operator=(data_consume_context&&) noexcept; }; // mutation_reader is an object returned by sstable::read_rows() et al. which // allows getting each sstable row in sequence, in mutation format. // // The read() method reads the next mutation, returning a disengaged optional // on EOF. As usual for future-returning functions, a caller which starts a // read() MUST ensure that the mutation_reader object continues to live until // the returned future is fulfilled. Moreover, the sstable whose read_rows() // method was used to open this mutation_reader must also live between the // time read() is called and its future ends. // As soon as the future returned by read() completes, the object may safely // be deleted. In other words, when the read() future is fulfilled, we can // be sure there are no background tasks still scheduled. 
class mutation_reader { class impl; std::unique_ptr _pimpl; // This object can only be constructed by sstable::read_rows() et al. mutation_reader(std::unique_ptr); friend class sstable; public: future read(); future<> fast_forward_to(const dht::partition_range&); // Define (as defaults) the destructor and move operations in the source // file, so here we don't need to know the incomplete impl type. ~mutation_reader(); mutation_reader(mutation_reader&&); mutation_reader& operator=(mutation_reader&&); }; class key; class sstable_writer; struct foreign_sstable_open_info; struct sstable_open_info; class index_reader; struct sstable_writer_config { std::experimental::optional promoted_index_block_size; uint64_t max_sstable_size = std::numeric_limits::max(); bool backup = false; bool leave_unsealed = false; }; class sstable : public enable_lw_shared_from_this { public: enum class component_type { Index, CompressionInfo, Data, TOC, Summary, Digest, CRC, Filter, Statistics, TemporaryTOC, TemporaryStatistics, Scylla, Unknown, }; enum class version_types { ka, la }; enum class format_types { big }; static const size_t default_buffer_size = 128*1024; public: sstable(schema_ptr schema, sstring dir, int64_t generation, version_types v, format_types f, gc_clock::time_point now = gc_clock::now(), io_error_handler_gen error_handler_gen = default_io_error_handler_gen(), size_t buffer_size = default_buffer_size) : sstable_buffer_size(buffer_size) , _schema(std::move(schema)) , _dir(std::move(dir)) , _generation(generation) , _version(v) , _format(f) , _now(now) , _read_error_handler(error_handler_gen(sstable_read_error)) , _write_error_handler(error_handler_gen(sstable_write_error)) { } sstable& operator=(const sstable&) = delete; sstable(const sstable&) = delete; sstable(sstable&&) = default; ~sstable(); // Read one or few rows at the given byte range from the data file, // feeding them into the consumer. 
This function reads the entire given // byte range at once into memory, so it should not be used for iterating // over all the rows in the data file (see the next function for that. // The function returns a future which completes after all the data has // been fed into the consumer. The caller needs to ensure the "consumer" // object lives until then (e.g., using the do_with() idiom). future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end); // disk_read_range describes a byte ranges covering part of an sstable // row that we need to read from disk. Usually this is the whole byte // range covering a single sstable row, but in very large rows we might // want to only read a subset of the atoms which we know contains the // columns we are looking for. When the range to be read does NOT include // the entire row, the caller needs to supply the optional "row_info" // containing information about the entire row (key and deletion time) // which is normally read from the beginning of the row. struct disk_read_range { // TODO: this should become a vector of ranges uint64_t start; uint64_t end; // When the range above does not cover the beginning of the sstable // row, we need to supply information which is only available at the // beginning of the row - the row's key and its tombstone if any. struct row_info { key k; deletion_time deltime; }; std::experimental::optional ri; disk_read_range() : start(0), end(0) {} disk_read_range(uint64_t start, uint64_t end) : start(start), end(end) { } disk_read_range(uint64_t start, uint64_t end, const key& key, const deletion_time& deltime) : start(start), end(end), ri(row_info{key, deltime}) { } explicit operator bool() const { return start != end; } // found_row() is true if the row was found. This is not the same as // operator bool(): It is possible that found_row() but the promoted // index ruled out anything to read (in this case "ri" was set). 
bool found_row() const { return start != end || ri; } }; // data_consume_rows() iterates over rows in the data file from // a particular range, feeding them into the consumer. The iteration is // done as efficiently as possible - reading only the data file (not the // summary or index files) and reading data in batches. // // The consumer object may request the iteration to stop before reaching // the end of the requested data range (e.g. stop after each sstable row). // A context object is returned which allows to resume this consumption: // This context's read() method requests that consumption begins, and // returns a future which will be resolved when it ends (because the // consumer asked to stop, or the data range ended). Only after the // returned future is resolved, may read() be called again to consume // more. // The caller must ensure (e.g., using do_with()) that the context object, // as well as the sstable, remains alive as long as a read() is in // progress (i.e., returned a future which hasn't completed yet). data_consume_context data_consume_rows(row_consumer& consumer, disk_read_range toread); data_consume_context data_consume_single_partition(row_consumer& consumer, disk_read_range toread); // Like data_consume_rows() with bounds, but iterates over whole range data_consume_context data_consume_rows(row_consumer& consumer); static component_type component_from_sstring(sstring& s); static version_types version_from_sstring(sstring& s); static format_types format_from_sstring(sstring& s); static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation, format_types format, component_type component); static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation, format_types format, sstring component); // WARNING: it should only be called to remove components of a sstable with // a temporary TOC file. 
static future<> remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f); // load sstable using components shared by a shard future<> load(foreign_sstable_open_info info); // load all components from disk // this variant will be useful for testing purposes and also when loading // a new sstable from scratch for sharing its components. future<> load(); future<> open_data(); future<> update_info_for_opened_data(); future<> set_generation(int64_t generation); int64_t generation() const { return _generation; } // read_row() reads the entire sstable row (partition) at a given // partition key k, or a subset of this row. The subset is defined by // a filter on the clustering keys which we want to read, which // additionally determines also if all the static columns will also be // returned in the result. future read_row( schema_ptr schema, const key& k, const query::partition_slice& slice = query::full_slice, const io_priority_class& pc = default_priority_class(), streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); // Returns a mutation_reader for given range of partitions mutation_reader read_range_rows( schema_ptr schema, const dht::partition_range& range, const query::partition_slice& slice = query::full_slice, const io_priority_class& pc = default_priority_class(), streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); // read_rows() returns each of the rows in the sstable, in sequence, // converted to a "mutation" data structure. // This function is implemented efficiently - doing buffered, sequential // read of the data file (no need to access the index file). // A "mutation_reader" object is returned with which the caller can // fetch mutations in sequence, and allows stop iteration any time // after getting each row. 
// // The caller must ensure (e.g., using do_with()) that the context object, // as well as the sstable, remains alive as long as a read() is in // progress (i.e., returned a future which hasn't completed yet). mutation_reader read_rows(schema_ptr schema, const io_priority_class& pc = default_priority_class(), streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no); // Write sstable components from a memtable. future<> write_components(memtable& mt, bool backup = false, const io_priority_class& pc = default_priority_class(), bool leave_unsealed = false); future<> write_components(::mutation_reader mr, uint64_t estimated_partitions, schema_ptr schema, const sstable_writer_config&, const io_priority_class& pc = default_priority_class()); sstable_writer get_writer(const schema& s, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc = default_priority_class()); future<> seal_sstable(bool backup); uint64_t get_estimated_key_count() const { return ((uint64_t)_components->summary.header.size_at_full_sampling + 1) * _components->summary.header.min_index_interval; } uint64_t estimated_keys_for_range(const dht::token_range& range); std::vector get_key_samples(const schema& s, const dht::token_range& range); // mark_for_deletion() specifies that a sstable isn't relevant to the // current shard, and thus can be deleted by the deletion manager, if // all shards sharing it agree. In case the sstable is unshared, it's // guaranteed that all of its on-disk files will be deleted as soon as // the in-memory object is destroyed. void mark_for_deletion() { _marked_for_deletion = true; } future<> mark_for_deletion_on_disk(); bool marked_for_deletion() const { return _marked_for_deletion; } void add_ancestor(int64_t generation) { _collector.add_ancestor(generation); } // Returns true iff this sstable contains data which belongs to many shards. 
bool is_shared() const { return _shared; } void set_unshared() { _shared = false; } uint64_t data_size() const; uint64_t index_size() const { return _index_file_size; } uint64_t filter_size() const { return _filter_file_size; } db_clock::time_point data_file_write_time() const { return _data_file_write_time; } uint64_t filter_memory_size() const { return _components->filter->memory_size(); } // Returns the total bytes of all components. uint64_t bytes_on_disk(); const partition_key& get_first_partition_key() const; const partition_key& get_last_partition_key() const; const dht::decorated_key& get_first_decorated_key() const; const dht::decorated_key& get_last_decorated_key() const; // SSTable comparator using the first key (decorated key). // Return values are those of a trichotomic comparison. int compare_by_first_key(const sstable& other) const; // SSTable comparator using the max timestamp. // Return values are those of a trichotomic comparison. int compare_by_max_timestamp(const sstable& other) const; const sstring get_filename() const { return filename(component_type::Data); } const sstring& get_dir() const { return _dir; } sstring toc_filename() const; metadata_collector& get_metadata_collector() { return _collector; } std::vector> all_components() const; future<> create_links(sstring dir, int64_t generation) const; future<> create_links(sstring dir) const { return create_links(dir, _generation); } /** * Note. This is using the Origin definition of * max_data_age, which is load time. This could maybe * be improved upon. */ gc_clock::time_point max_data_age() const { return _now; } std::vector component_filenames() const; template auto sstable_write_io_check(Func&& func, Args&&... args) const { return do_io_check(_write_error_handler, func, std::forward(args)...); } // Immutable components that can be shared among shards. 
struct shareable_components { sstables::compression compression; utils::filter_ptr filter; sstables::summary summary; sstables::statistics statistics; stdx::optional scylla_metadata; }; private: size_t sstable_buffer_size = default_buffer_size; static std::unordered_map> _version_string; static std::unordered_map> _format_string; static std::unordered_map> _component_map; std::unordered_set> _recognized_components; std::vector _unrecognized_components; foreign_ptr> _components = make_foreign(make_lw_shared()); shared_index_lists _index_lists; bool _shared = true; // across shards; safe default // NOTE: _collector and _c_stats are used to generation of statistics file // when writing a new sstable. metadata_collector _collector; column_stats _c_stats; file _index_file; file _data_file; uint64_t _data_file_size; uint64_t _index_file_size; uint64_t _filter_file_size = 0; uint64_t _bytes_on_disk = 0; db_clock::time_point _data_file_write_time; std::vector> _clustering_components_ranges; stdx::optional _first; stdx::optional _last; lw_shared_ptr _single_partition_history = make_lw_shared(); lw_shared_ptr _partition_range_history = make_lw_shared(); // _pi_write is used temporarily for building the promoted // index (column sample) of one partition when writing a new sstable. struct { // Unfortunately we cannot output the promoted index directly to the // index file because it needs to be prepended by its size. 
bytes_ostream data; uint32_t numblocks; deletion_time deltime; uint64_t block_start_offset; uint64_t block_next_start_offset; bytes block_first_colname; bytes block_last_colname; std::experimental::optional tombstone_accumulator; const schema* schemap; size_t desired_block_size; } _pi_write; void maybe_flush_pi_block(file_writer& out, const composite& clustering_key, const std::vector& column_names); schema_ptr _schema; sstring _dir; unsigned long _generation = 0; version_types _version; format_types _format; filter_tracker _filter_tracker; bool _marked_for_deletion = false; gc_clock::time_point _now; io_error_handler _read_error_handler; io_error_handler _write_error_handler; const bool has_component(component_type f) const; const sstring filename(component_type f) const; template future<> read_simple(T& comp, const io_priority_class& pc); template void write_simple(const T& comp, const io_priority_class& pc); void generate_toc(compressor c, double filter_fp_chance); void write_toc(const io_priority_class& pc); future<> seal_sstable(); future<> read_compression(const io_priority_class& pc); void write_compression(const io_priority_class& pc); future<> read_scylla_metadata(const io_priority_class& pc); void write_scylla_metadata(const io_priority_class& pc); future<> read_filter(const io_priority_class& pc); void write_filter(const io_priority_class& pc); future<> read_summary(const io_priority_class& pc); void write_summary(const io_priority_class& pc) { write_simple(_components->summary, pc); } // To be called when we try to load an SSTable that lacks a Summary. Could // happen if old tools are being used. future<> generate_summary(const io_priority_class& pc); future<> read_statistics(const io_priority_class& pc); void write_statistics(const io_priority_class& pc); // Rewrite statistics component by creating a temporary Statistics and // renaming it into place of existing one. 
void rewrite_statistics(const io_priority_class& pc); // Validate metadata that's used to optimize reads when user specifies // a clustering key range. If this specific metadata is incorrect, then // it should be cleared. Otherwise, it could lead to bad decisions. // Metadata is probably incorrect if generated by previous Scylla versions. void validate_min_max_metadata(); void set_first_and_last_keys(); // Create one range for each clustering component of this sstable. // Each range stores min and max value for that specific component. // It does nothing if schema defines no clustering key, and it's supposed // to be called when loading an existing sstable or after writing a new one. void set_clustering_components_ranges(); future<> create_data(); future read_indexes(uint64_t summary_idx, const io_priority_class& pc); index_reader get_index_reader(const io_priority_class& pc); // Return an input_stream which reads exactly the specified byte range // from the data file (after uncompression, if the file is compressed). // Unlike data_read() below, this method does not read the entire byte // range into memory all at once. Rather, this method allows reading the // data incrementally as a stream. Knowing in advance the exact amount // of bytes to be read using this stream, we can make better choices // about the buffer size to read, and where exactly to stop reading // (even when a large buffer size is used). input_stream data_stream(uint64_t pos, size_t len, const io_priority_class& pc, lw_shared_ptr history); // Read exactly the specific byte range from the data file (after // uncompression, if the file is compressed). This can be used to read // a specific row from the data file (its position and length can be // determined using the index file). // This function is intended (and optimized for) random access, not // for iteration through all the rows. 
future> data_read(uint64_t pos, size_t len, const io_priority_class& pc); future data_end_position(uint64_t summary_idx, uint64_t index_idx, const index_list& il, const io_priority_class& pc); // Returns data file position for an entry right after all entries mapped by given summary page. future data_end_position(uint64_t summary_idx, const io_priority_class& pc); template int binary_search(const T& entries, const key& sk, const dht::token& token); template int binary_search(const T& entries, const key& sk) { return binary_search(entries, sk, dht::global_partitioner().get_token(key_view(sk))); } // find_disk_ranges finds the ranges of bytes we need to read from the // sstable to read the desired columns out of the given key. This range // may be the entire byte range of the given partition - as found using // the summary and index files - but if the index contains a "promoted // index" (a sample of column positions for each key) it may be a smaller // range. The returned range may contain columns beyond those requested // in slice, so it is the reader's duty to use slice again // when parsing the data read from the returned range. future find_disk_ranges(schema_ptr schema, const sstables::key& key, const query::partition_slice& slice, const io_priority_class& pc); future read_summary_entry(size_t i); // FIXME: pending on Bloom filter implementation bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); } // NOTE: functions used to generate sstable components. 
void write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key); void write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row); void write_static_row(file_writer& out, const schema& schema, const row& static_row); void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef); void write_column_name(file_writer& out, const composite& clustering_key, const std::vector& column_names, composite::eoc marker = composite::eoc::none); void write_column_name(file_writer& out, bytes_view column_names); void write_range_tombstone(file_writer& out, const composite& start, bound_kind start_kind, const composite& end, bound_kind stop_kind, std::vector suffix, const tombstone t); void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector suffix, const tombstone t) { write_range_tombstone(out, start, bound_kind::incl_start, end, bound_kind::incl_end, std::move(suffix), std::move(t)); } void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection); stdx::optional> get_sample_indexes_for_range(const dht::token_range& range); public: future<> read_toc(); bool filter_has_key(const key& key) { return _components->filter->is_present(bytes_view(key)); } bool filter_has_key(utils::hashed_key key) { return _components->filter->is_present(key); } bool filter_has_key(const schema& s, partition_key_view key) { return filter_has_key(key::from_partition_key(s, key)); } static utils::hashed_key make_hashed_key(const schema& s, const partition_key& key); uint64_t filter_get_false_positive() { return _filter_tracker.false_positive; } uint64_t filter_get_true_positive() { return _filter_tracker.true_positive; } uint64_t filter_get_recent_false_positive() { auto t = _filter_tracker.false_positive - _filter_tracker.last_false_positive; _filter_tracker.last_false_positive = 
_filter_tracker.false_positive; return t; } uint64_t filter_get_recent_true_positive() { auto t = _filter_tracker.true_positive - _filter_tracker.last_true_positive; _filter_tracker.last_true_positive = _filter_tracker.true_positive; return t; } const stats_metadata& get_stats_metadata() const { auto entry = _components->statistics.contents.find(metadata_type::Stats); if (entry == _components->statistics.contents.end()) { throw std::runtime_error("Stats metadata not available"); } auto& p = entry->second; if (!p) { throw std::runtime_error("Statistics is malformed"); } const stats_metadata& s = *static_cast(p.get()); return s; } const compaction_metadata& get_compaction_metadata() const { auto entry = _components->statistics.contents.find(metadata_type::Compaction); if (entry == _components->statistics.contents.end()) { throw std::runtime_error("Compaction metadata not available"); } auto& p = entry->second; if (!p) { throw std::runtime_error("Statistics is malformed"); } const compaction_metadata& s = *static_cast(p.get()); return s; } std::vector get_shards_for_this_sstable() const; uint32_t get_sstable_level() const { return get_stats_metadata().sstable_level; } // This will change sstable level only in memory. void set_sstable_level(uint32_t); double get_compression_ratio() const; future<> mutate_sstable_level(uint32_t); const summary& get_summary() const { return _components->summary; } // Return sstable key range as range reading only the summary component. future> get_sstable_key_range(const schema& s); future> get_owning_shards_from_unloaded(); const std::vector>& clustering_components_ranges() const; // returns all info needed for a sstable to be shared with other shards. static future load_shared_components(const schema_ptr& s, sstring dir, int generation, version_types v, format_types f); // Allow the test cases from sstable_test.cc to test private methods. We use // a placeholder to avoid cluttering this class too much. 
The sstable_test class // will then re-export as public every method it needs. friend class test; friend class components_writer; friend class sstable_writer; friend class index_reader; friend class mutation_reader::impl; }; using shared_sstable = lw_shared_ptr; using sstable_list = std::unordered_set; struct entry_descriptor { sstring ks; sstring cf; sstable::version_types version; int64_t generation; sstable::format_types format; sstable::component_type component; static entry_descriptor make_descriptor(sstring fname); entry_descriptor(sstring ks, sstring cf, sstable::version_types version, int64_t generation, sstable::format_types format, sstable::component_type component) : ks(ks), cf(cf), version(version), generation(generation), format(format), component(component) {} }; // Waits for all prior tasks started on current shard related to sstable management to finish. // // There may be asynchronous cleanup started from sstable destructor. Since we can't have blocking // destructors in seastar, that cleanup is not waited for. It can be waited for using this function. // It is also waited for when seastar exits. future<> await_background_jobs(); // Invokes await_background_jobs() on all shards future<> await_background_jobs_on_all_shards(); // When we compact sstables, we have to atomically instantiate the new // sstable and delete the old ones. Otherwise, if we compact A+B into C, // and if A contained some data that was tombstoned by B, and if B was // deleted but A survived, then data from A will be resurrected. // // There are two violators of the requirement to atomically delete // sstables: first sstable instantiation and deletion on disk is atomic // only wrt. itself, not other sstables, and second when an sstable is // shared among shard, so actual on-disk deletion of an sstable is deferred // until all shards agree it can be deleted. // // When shutting down, we will not be able to complete some deletions. 
// In that case, an atomic_deletion_cancelled exception is returned instead. // // This function only solves the second problem for now. future<> delete_atomically(std::vector ssts); future<> delete_atomically(std::vector ssts); // Cancel any deletions scheduled by delete_atomically() and make their // futures complete (with an atomic_deletion_cancelled exception). void cancel_atomic_deletions(); class components_writer { sstable& _sst; const schema& _schema; file_writer& _out; file_writer _index; uint64_t _max_sstable_size; bool _tombstone_written; // Remember first and last keys, which we need for the summary file. stdx::optional _first_key, _last_key; stdx::optional _partition_key; private: size_t get_offset(); file_writer index_file_writer(sstable& sst, const io_priority_class& pc); void ensure_tombstone_is_written() { if (!_tombstone_written) { consume(tombstone()); } } public: components_writer(sstable& sst, const schema& s, file_writer& out, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc); void consume_new_partition(const dht::decorated_key& dk); void consume(tombstone t); stop_iteration consume(static_row&& sr); stop_iteration consume(clustering_row&& cr); stop_iteration consume(range_tombstone&& rt); stop_iteration consume_end_of_partition(); void consume_end_of_stream(); }; class sstable_writer { sstable& _sst; const schema& _schema; const io_priority_class& _pc; bool _backup; bool _leave_unsealed; bool _compression_enabled; std::unique_ptr _writer; stdx::optional _components_writer; private: void prepare_file_writer(); void finish_file_writer(); public: sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions, const sstable_writer_config&, const io_priority_class& pc); ~sstable_writer(); sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup), _leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), 
_writer(std::move(o._writer)), _components_writer(std::move(o._components_writer)) {} void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); } void consume(tombstone t) { _components_writer->consume(t); } stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); } stop_iteration consume(clustering_row&& cr) { return _components_writer->consume(std::move(cr)); } stop_iteration consume(range_tombstone&& rt) { return _components_writer->consume(std::move(rt)); } stop_iteration consume_end_of_partition() { return _components_writer->consume_end_of_partition(); } void consume_end_of_stream(); }; // contains data for loading a sstable using components shared by a single shard; // can be moved across shards struct foreign_sstable_open_info { foreign_ptr> components; std::vector owners; seastar::file_handle data; seastar::file_handle index; }; // can only be used locally struct sstable_open_info { lw_shared_ptr components; std::vector owners; file data; file index; }; void init_metrics(); }