/*
 * Copyright (C) 2016 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "mutation_partition.hh"
#include "streamed_mutation.hh"
#include "utils/anchorless_list.hh"
#include "utils/logalloc.hh"

// This is the MVCC implementation for mutation_partitions.
//
// It is assumed that mutation_partitions are stored in some sort of LSA-managed
// container (memtable or row cache).
//
// partition_entry - the main handle to the mutation_partition, allows writes
//                   and reads.
// partition_version - mutation_partition inside a list of partition versions.
//                     mutation_partition represents just a difference against
//                     the next one in the list. To get a single
//                     mutation_partition fully representing this version one
//                     needs to merge this one and all its successors in the
//                     list.
// partition_snapshot - a handle to some particular partition_version. It allows
//                      only reads and is itself immutable: the partition
//                      version it represents won't be modified as long as the
//                      snapshot is alive.
//
// pe - partition_entry
// pv - partition_version
// ps - partition_snapshot
// ps(u) - partition_snapshot marked as unique owner

// Scene I. Write-only loads
//   pv
//   ^
//   |
//   pe
// In case of write-only loads all incoming mutations are directly applied
// to the partition_version that partition_entry is pointing to. The list
// of partition_versions contains only a single element.
//
// Scene II. Read-only loads
//   pv
//   ^
//   |
//   pe <- ps
// In case of read-only scenarios there is only a single partition_snapshot
// object that points to the partition_entry. There is only a single
// partition_version.
//
// Scene III. Writes and reads
//   pv -- pv -- pv
//   ^     ^     ^
//   |     |     |
//   pe    ps    ps
// If the partition_entry that needs to be modified is currently being read
// from (i.e. there exists a partition_snapshot pointing to it), then instead of
// applying the new mutation directly a new partition version is created and
// added at the front of the list. partition_entry points to the new version
// (so that it has the most recent view of stored data) while the
// partition_snapshot points to the same partition_version it pointed to before
// (so that the data it sees doesn't change).
// As a result the list may contain multiple partition versions used by
// different partition snapshots.
// When the partition_snapshot is destroyed partition_versions are squashed
// together to minimize the number of elements on the list.
//
// Scene IV. Schema upgrade
//   pv    pv --- pv
//   ^     ^      ^
//   |     |      |
//   pe    ps(u)  ps
// When there is a schema upgrade the list of partition versions pointed to
// by partition_entry is replaced by a new single partition_version that is a
// result of squashing and upgrading the old versions.
// Old versions not used by any partition snapshot are removed. The first
// partition snapshot on the list is marked as unique which means that upon
// its destruction it won't attempt to squash versions but instead remove
// the unused ones and pass the "unique owner" mark to the next snapshot on the
// list (if there is any).
//
// Scene V. partition_entry eviction
//   pv
//   ^
//   |
//   ps(u)
// When partition_entry is removed (e.g. because it was evicted from cache)
// the partition versions are removed in a manner similar to the schema
// upgrade scenario.
The unused ones are destroyed right away and the first // snapshot on the list is marked as unique owner so that on its destruction // it continues removal of the partition versions. class partition_version_ref; class partition_version : public anchorless_list_base_hook { partition_version_ref* _backref = nullptr; mutation_partition _partition; friend class partition_version_ref; public: explicit partition_version(mutation_partition mp) noexcept : _partition(std::move(mp)) { } partition_version(partition_version&& pv) noexcept; partition_version& operator=(partition_version&& pv) noexcept; ~partition_version(); mutation_partition& partition() { return _partition; } const mutation_partition& partition() const { return _partition; } bool is_referenced() { return _backref; } partition_version_ref& back_reference() { return *_backref; } }; class partition_version_ref { partition_version* _version = nullptr; bool _unique_owner = false; friend class partition_version; public: partition_version_ref() = default; explicit partition_version_ref(partition_version& pv) noexcept : _version(&pv) { assert(!_version->_backref); _version->_backref = this; } ~partition_version_ref() { if (_version) { _version->_backref = nullptr; } } partition_version_ref(partition_version_ref&& other) noexcept : _version(other._version) { if (_version) { _version->_backref = this; } other._version = nullptr; } partition_version_ref& operator=(partition_version_ref&& other) noexcept { if (this != &other) { this->~partition_version_ref(); new (this) partition_version_ref(std::move(other)); } return *this; } explicit operator bool() { return _version; } partition_version& operator*() { assert(_version); return *_version; } partition_version* operator->() { assert(_version); return _version; } bool is_unique_owner() const { return _unique_owner; } void mark_as_unique_owner() { _unique_owner = true; } }; class partition_entry; class partition_snapshot : public enable_lw_shared_from_this { schema_ptr 
_schema; // Either _version or _entry is non-null. partition_version_ref _version; partition_entry* _entry; friend class partition_entry; public: explicit partition_snapshot(schema_ptr s, partition_entry* entry) : _schema(std::move(s)), _entry(entry) { } partition_snapshot(const partition_snapshot&) = delete; partition_snapshot(partition_snapshot&&) = delete; partition_snapshot& operator=(const partition_snapshot&) = delete; partition_snapshot& operator=(partition_snapshot&&) = delete; ~partition_snapshot(); partition_version_ref& version(); auto versions() { return version()->elements_from_this(); } unsigned version_count(); }; class partition_entry { partition_snapshot* _snapshot = nullptr; partition_version_ref _version; friend class partition_snapshot; private: void set_version(partition_version*); void apply(const schema& s, partition_version* pv, const schema& pv_schema); public: partition_entry() = default; explicit partition_entry(mutation_partition mp); ~partition_entry(); partition_entry(partition_entry&& pe) noexcept : _snapshot(pe._snapshot), _version(std::move(pe._version)) { if (_snapshot) { _snapshot->_entry = this; } pe._snapshot = nullptr; } partition_entry& operator=(partition_entry&& other) noexcept { if (this != &other) { this->~partition_entry(); new (this) partition_entry(std::move(other)); } return *this; } // Strong exception guarantees. void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema); // Same exception guarantees as: // mutation_partition::apply(const schema&, mutation_partition&&, const schema&) void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema); // Strong exception guarantees. void apply(const schema& s, mutation_partition_view mpv, const schema& mp_schema); // Weak exception guarantees. 
// If an exception is thrown this and pe will be left in some valid states // such that if the operation is retried (possibly many times) and eventually // succeeds the result will be as if the first attempt didn't fail. void apply(const schema& s, partition_entry&& pe, const schema& pe_schema); mutation_partition squashed(schema_ptr from, schema_ptr to); // needs to be called with reclaiming disabled void upgrade(schema_ptr from, schema_ptr to); lw_shared_ptr read(schema_ptr entry_schema); }; inline partition_version_ref& partition_snapshot::version() { if (_version) { return _version; } else { return _entry->_version; } } class partition_snapshot_reader : public streamed_mutation::impl { struct rows_position { mutation_partition::rows_type::const_iterator _position; mutation_partition::rows_type::const_iterator _end; }; class heap_compare { position_in_partition::less_compare& _cmp; public: explicit heap_compare(position_in_partition::less_compare cmp) : _cmp(cmp) { } bool operator()(const rows_position& a, const rows_position& b) { return _cmp(*b._position, *a._position); } }; private: // Keeps shared pointer to the container we read mutation from to make sure // that its lifetime is appropriately extended. 
boost::any _container_guard; // _filtering_context keeps alive the range of clustering rows query::clustering_key_filtering_context _filtering_context; query::clustering_row_ranges::const_iterator _current_ck_range; query::clustering_row_ranges::const_iterator _ck_range_end; bool _in_ck_range = false; position_in_partition::less_compare _cmp; position_in_partition::equal_compare _eq; lw_shared_ptr _snapshot; stdx::optional _last_entry; std::vector _clustering_rows; range_tombstone_stream _range_tombstones; logalloc::region& _lsa_region; logalloc::allocating_section& _read_section; uint64_t _reclaim_counter; unsigned _version_count = 0; private: void refresh_iterators(); void pop_clustering_row(); mutation_fragment_opt read_static_row(); mutation_fragment_opt read_next(); void do_fill_buffer(); static tombstone tomb(partition_snapshot& snp); public: partition_snapshot_reader(schema_ptr s, dht::decorated_key dk, lw_shared_ptr snp, query::clustering_key_filtering_context fc, const query::clustering_row_ranges& crr, logalloc::region& region, logalloc::allocating_section& read_section, boost::any pointer_to_container); ~partition_snapshot_reader(); virtual future<> fill_buffer() override; }; streamed_mutation make_partition_snapshot_reader(schema_ptr s, dht::decorated_key dk, query::clustering_key_filtering_context fc, const query::clustering_row_ranges& crr, lw_shared_ptr snp, logalloc::region& region, logalloc::allocating_section& read_section, boost::any pointer_to_container);