// NOTE(review): this header has been corrupted by whatever tool extracted it:
// every template-argument list (the text between '<' and '>'), every system
// #include target, and the original newline structure have been stripped, and
// several '///' doc comments have had words spilled outside their comment
// markers (see the stray "In" and "The size of all chunks," tokens below).
// The code is therefore preserved byte-for-byte here; this file must be
// restored from the upstream ScyllaDB sources before it can compile.
//
// What survives legibly: the in-memory-representation (IMR) definition of a
// cell -- flag tags, variable/fixed value layouts, the atomic-cell structure,
// writer factories (make_collection / make_dead / make_live* / ...), and the
// external-chunk layout used for values too large to store inline.
/* * Copyright (C) 2018 ScyllaDB */ /* * This file is part of Scylla. * * Scylla is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Scylla is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Scylla. If not, see . */ #pragma once #include #include #include #include "imr/compound.hh" #include "imr/fundamental.hh" #include "imr/alloc.hh" #include "imr/utils.hh" #include "imr/concepts.hh" #include "data/schema_info.hh" #include "data/value_view.hh" #include "gc_clock.hh" #include "timestamp.hh" namespace data { template class value_writer; struct cell { static constexpr size_t maximum_internal_storage_length = value_view::maximum_internal_storage_length; static constexpr size_t maximum_external_chunk_length = value_view::maximum_external_chunk_length; struct tags { class cell; class atomic_cell; class collection; class flags; class live; class expiring; class counter_update; class external_data; class ttl; class expiry; class empty; class timestamp; class value; class dead; class counter_update; class fixed_value; class variable_value; class value_size; class value_data; class pointer; class data; class external_data; class chunk_back_pointer; class chunk_next; class chunk_data; class last_chunk_size; }; using flags = imr::flags< tags::collection, tags::live, tags::expiring, tags::counter_update, tags::empty, tags::external_data >; /// Variable-length cell value /// /// This is a definition of the IMR structure of a variable-length value. 
// NOTE(review): variable_value -- IMR layout for variable-sized values; per
// the surviving '///' documentation it is stored inline when the size is
// <= maximum_internal_storage_length and externally otherwise.  The
// imr::variant/imr::structure member lists below lost their template
// arguments in transit.
/// It is used both by collections, counters and regular cells which type /// is variable-sized. The data can be stored internally, if its size is /// smaller or equal maximum_internal_storage_length or externally if it /// larger. struct variable_value { using data_variant = imr::variant>>, imr::member> >; using structure = imr::structure< imr::member>, imr::member >; /// Create writer of a variable-size value /// /// Returns a function object that can be used as a writer of a variable /// value. The first argument is expected to be either IMR sizer or /// serializer and the second is an appropriate IMR allocator helper /// object. /// \arg force_internal if set to true stores the value internally /// regardless of its size (used by collection members). template static value_writer> write(FragmentRange&& value, bool force_internal = false) noexcept; static auto write(bytes_view value, bool force_internal = false) noexcept; /// Create writer of an uninitialised variable-size value static value_writer write(size_t size, bool force_internal = false) noexcept; class context { bool _external_storage; uint32_t _value_size; public: explicit context(bool external_storage, uint32_t value_size) noexcept : _external_storage(external_storage), _value_size(value_size) { } template auto active_alternative_of() const noexcept { if (_external_storage) { return data_variant::index_for(); } else { return data_variant::index_for(); } } template size_t size_of() const noexcept { return _value_size; } template auto context_for(Args&&...) 
const noexcept { return *this; } }; template static basic_value_view do_make_view(structure::basic_view view, bool external_storage); static data::value_view make_view(structure::view view, bool external_storage) { return do_make_view(view, external_storage); } static data::value_mutable_view make_view(structure::mutable_view view, bool external_storage) { return do_make_view(view, external_storage); } }; using fixed_value = imr::buffer; /// Cell value /// /// The cell value can be either a deletion time (if the cell is dead), /// a delta (counter update cell), fixed-size value or variable-sized value. using value_variant = imr::variant>, imr::member>, imr::member, imr::member >; /// Atomic cell /// /// Atomic cells can be either regular cells or counters. Moreover, the /// cell may be live or dead and the regular cells may have expiration time. /// Counter cells may be either sets of shards or a delta. The former is not /// fully converted to the IMR yet and still use a custom serilalisation /// format. The IMR treats such cells the same way it handles regular blobs. using atomic_cell = imr::structure< imr::member>, imr::optional_member>, imr::member> >>, imr::member >; using atomic_cell_or_collection = imr::variant, imr::member >; /// Top IMR definition of a cell /// /// A cell in Scylla's data model can be either atomic (a regular cell, /// a counter or a frozen collection) or an unfrozen collection. As for now /// only regular cells are fully utilising the IMR. Collections are still /// using custom serialisation format and from the IMR point of view are /// just opaque values. using structure = imr::structure< imr::member, imr::member >; /// An fragment of externally stored value /// /// If a cell value size is above maximum_internal_storage_length it is /// stored externally. Moreover, in order to avoid stressing the memory /// allocators with large allocations values are fragmented in chunks /// no larger than maximum_external_chunk_length. 
// NOTE(review): the stray "The size of all chunks," token run at the start of
// the next line is '///' comment text that lost its marker during extraction;
// it is currently parsed as (invalid) code and is left untouched.
The size of all chunks, /// but the last one is always maximum_external_chunk_length. using external_chunk = imr::structure< imr::member>>, imr::member>, imr::member> >; using external_last_chunk_size = imr::pod; /// The last fragment of an externally stored value /// /// The size of the last fragment of a value stored externally may vary. /// Due to the requirements the LSA imposes on migrators we need to store /// the size inside it so that it can be retrieved when the LSA migrates /// object. using external_last_chunk = imr::structure< imr::member>>, imr::member, imr::member> >; class context; class minimal_context; /// Value fragment deserialisation context /// /// This is a deserialization context for all, but last, value fragments. /// Their size is fixed. struct chunk_context { explicit constexpr chunk_context(const uint8_t*) noexcept { } template static constexpr size_t size_of() noexcept { return cell::maximum_external_chunk_length; } template auto context_for(Args&&...) const noexcept { return *this; } }; /// Last value fragment deserialisation context class last_chunk_context { uint16_t _size; public: explicit last_chunk_context(const uint8_t* ptr) noexcept : _size(external_last_chunk::get_member(ptr).load()) { } template size_t size_of() const noexcept { return _size; } template auto context_for(Args&&...) const noexcept { return *this; } }; template class basic_atomic_cell_view; using atomic_cell_view = basic_atomic_cell_view; using mutable_atomic_cell_view = basic_atomic_cell_view; private: static thread_local imr::alloc::lsa_migrate_fn> lsa_last_chunk_migrate_fn; static thread_local imr::alloc::lsa_migrate_fn> lsa_chunk_migrate_fn; public: /// Make a writer that copies a cell /// /// This function creates a writer that copies a cell. It can be either /// atomic or a collection. /// /// \arg ptr needs to remain valid as long as the writer is in use. /// \returns imr::WriterAllocator for cell::structure. 
static auto copy_fn(const type_info& ti, const uint8_t* ptr); /// Make a writer for a collection /// /// \arg data needs to remain valid as long as the writer is in use. /// \returns imr::WriterAllocator for cell::structure. template>>> static auto make_collection(FragmentRange&& data) noexcept { return [data] (auto&& serializer, auto&& allocations) noexcept { return serializer .serialize(imr::set_flag(), imr::set_flag(data.size_bytes() > maximum_internal_storage_length)) .template serialize_as(variable_value::write(data), allocations) .done(); }; } static auto make_collection(bytes_view data) noexcept { return make_collection(single_fragment_range(data)); } /// Make a writer for a dead cell /// /// This function returns a generic lambda that is a writer for a dead /// cell with the specified timestamp and deletion time. /// /// \returns imr::WriterAllocator for cell::structure. static auto make_dead(api::timestamp_type ts, gc_clock::time_point deletion_time) noexcept { return [ts, deletion_time] (auto&& serializer, auto&&...) noexcept { return serializer .serialize() .template serialize_as_nested() .serialize(ts) .skip() .template serialize_as(deletion_time.time_since_epoch().count()) .done() .done(); }; } static auto make_live_counter_update(api::timestamp_type ts, int64_t delta) noexcept { return [ts, delta] (auto&& serializer, auto&&...) noexcept { return serializer .serialize(imr::set_flag(), imr::set_flag()) .template serialize_as_nested() .serialize(ts) .skip() .template serialize_as(delta) .done() .done(); }; } /// Make a writer for a live non-expiring cell /// /// \arg value needs to remain valid as long as the writer is in use. /// \arg force_internal always store the value internally regardless of its /// size. This is a temporary (hopefully, sorry if you are reading this in /// 2020) hack to make integration with collections easier. /// /// \returns imr::WriterAllocator for cell::structure. 
// NOTE(review): make_live() overload set -- writers for live cells with and
// without TTL/expiry; the external-storage flag is set when the value exceeds
// maximum_internal_storage_length, is not force_internal, and the type is not
// fixed-size (visible in the set_flag(...) condition despite the missing tag
// arguments).
template>>> static auto make_live(const type_info& ti, api::timestamp_type ts, FragmentRange&& value, bool force_internal = false) noexcept { return [&ti, ts, value, force_internal] (auto&& serializer, auto&& allocations) noexcept { auto after_expiring = serializer .serialize(imr::set_flag(), imr::set_flag(value.empty()), imr::set_flag(!force_internal && !ti.is_fixed_size() && value.size_bytes() > maximum_internal_storage_length)) .template serialize_as_nested() .serialize(ts) .skip(); return [&] { if (ti.is_fixed_size()) { return after_expiring.template serialize_as(value); } else { return after_expiring .template serialize_as(variable_value::write(value, force_internal), allocations); } }().done().done(); }; } static auto make_live(const type_info& ti, api::timestamp_type ts, bytes_view value, bool force_internal = false) noexcept { return make_live(ti, ts, single_fragment_range(value), force_internal); } template>>> static auto make_live(const type_info& ti, api::timestamp_type ts, FragmentRange&& value, gc_clock::time_point expiry, gc_clock::duration ttl, bool force_internal = false) noexcept { return [&ti, ts, value, expiry, ttl, force_internal] (auto&& serializer, auto&& allocations) noexcept { auto after_expiring = serializer .serialize(imr::set_flag(), imr::set_flag(), imr::set_flag(value.empty()), imr::set_flag(!force_internal && !ti.is_fixed_size() && value.size_bytes() > maximum_internal_storage_length)) .template serialize_as_nested() .serialize(ts) .serialize_nested() .serialize(ttl.count()) .serialize(expiry.time_since_epoch().count()) .done(); return [&] { if (ti.is_fixed_size()) { return after_expiring.template serialize_as(value); } else { return after_expiring .template serialize_as(variable_value::write(value, force_internal), allocations); } }().done().done(); }; } static auto make_live(const type_info& ti, api::timestamp_type ts, bytes_view value, gc_clock::time_point expiry, gc_clock::duration ttl, bool force_internal = false) noexcept { 
return make_live(ti, ts, single_fragment_range(value), expiry, ttl, force_internal); } /// Make a writer of a live cell with uninitialised value /// /// This function returns a function object which is a writer of a live /// cell. The space for value is allocated but not initialised. This can be /// used if the value is a result of some IMR-independent serialisation /// (e.g. counters). /// /// \returns imr::WriterAllocator for cell::structure. static auto make_live_uninitialized(const type_info& ti, api::timestamp_type ts, size_t size) noexcept { return [&ti, ts, size] (auto&& serializer, auto&& allocations) noexcept { auto after_expiring = serializer .serialize(imr::set_flag(), imr::set_flag(!size), imr::set_flag(!ti.is_fixed_size() && size > maximum_internal_storage_length)) .template serialize_as_nested() .serialize(ts) .skip(); return [&] { if (ti.is_fixed_size()) { return after_expiring.template serialize_as(size, [] (uint8_t*) noexcept { }); } else { return after_expiring .template serialize_as(variable_value::write(size, false), allocations); } }().done().done(); }; } template static size_t size_of(Builder&& builder, imr::alloc::object_allocator& allocator) noexcept { return structure::size_when_serialized(std::forward(builder), allocator.get_sizer()); } template static size_t serialize(uint8_t* ptr, Builder&& builder, imr::alloc::object_allocator& allocator) noexcept { return structure::serialize(ptr, std::forward(builder), allocator.get_serializer()); } static atomic_cell_view make_atomic_cell_view(const type_info& ti, const uint8_t* ptr) noexcept; static mutable_atomic_cell_view make_atomic_cell_view(const type_info& ti, uint8_t* ptr) noexcept; static void destroy(uint8_t* ptr) noexcept; }; /// Minimal cell deserialisation context /// /// This is a minimal deserialisation context that doesn't require the cell /// type to be known, but allows only some operations to be performed. 
// NOTE(review): cell::minimal_context (type-independent; enough to destroy a
// cell) and cell::context (adds schema type_info) -- deserialisation contexts
// for the IMR routines.  The 'template<>' specialisations below lost their
// subject tag arguments during extraction, and the leading stray "In" token
// is '///' comment text that lost its marker.  Code preserved byte-for-byte;
// restore from the upstream ScyllaDB sources before compiling.
In /// particular it is able to provide sufficient information to destroy a cell. class cell::minimal_context { protected: cell::flags::view _flags; public: explicit minimal_context(cell::flags::view flags) noexcept : _flags(flags) { } template bool is_present() const noexcept; template auto active_alternative_of() const noexcept; template size_t size_of() const noexcept; template auto context_for(const uint8_t*) const noexcept { return *this; } }; template<> inline bool cell::minimal_context::is_present() const noexcept { return _flags.get(); } template<> inline auto cell::minimal_context::active_alternative_of() const noexcept { if (_flags.get()) { return cell::atomic_cell_or_collection::index_for(); } else { return cell::atomic_cell_or_collection::index_for(); } } /// Cell deserialisation context /// /// This class combines schema-dependnent and instance-specific information /// and provides an appropriate interface for the IMR deserialisation routines /// to read a cell. class cell::context : public cell::minimal_context { type_info _type; public: explicit context(const uint8_t* ptr, const type_info& tinfo) noexcept : context(structure::get_member(ptr), tinfo) { } explicit context(cell::flags::view flags, const type_info& tinfo) noexcept : minimal_context(flags), _type(tinfo) { } template bool is_present() const noexcept { return minimal_context::is_present(); } template auto active_alternative_of() const noexcept { return minimal_context::active_alternative_of(); } template size_t size_of() const noexcept; template auto context_for(const uint8_t*) const noexcept { return *this; } }; template<> inline auto cell::context::context_for(const uint8_t* ptr) const noexcept { auto length = variable_value::structure::get_member(ptr); return variable_value::context(_flags.get(), length.load()); } template<> inline auto cell::context::context_for(const uint8_t* ptr) const noexcept { auto length = variable_value::structure::get_member(ptr); return 
variable_value::context(_flags.get(), length.load()); } template<> inline auto cell::context::active_alternative_of() const noexcept { if (_flags.get()) { if (__builtin_expect(_flags.get(), false)) { return cell::value_variant::index_for(); } if (_type.is_fixed_size()) { return cell::value_variant::index_for(); } else { return cell::value_variant::index_for(); } } else { return cell::value_variant::index_for(); } } template<> inline size_t cell::context::size_of() const noexcept { return _flags.get() ? 0 : _type.value_size(); } /// Atomic cell view /// /// This is a, possibly mutable, view of an atomic cell. It is a wrapper on top /// of IMR-generated view that provides more convenient interface which doesn't /// depend on the actual cell structure. /// /// \note Instances of this class are being copied and passed by value a lot. /// It is desireable that it remains small and trivial, so that the compiler /// can try to keep it in registers at all times. We also should not worry too /// much about computing the same thing more than once (unless the profiler /// tells otherwise, of course). Most of the IMR code and its direct users rely /// heavily on inlining which would allow the compiler remove duplicated /// computations. 
// NOTE(review): the remainder of the file -- basic_atomic_cell_view (typed
// accessor wrapper over the IMR view), cell::copy_fn (cell deep-copy writer),
// fragment_chain_destructor_context, the imr::methods destructor/mover
// specialisations that free/re-link externally stored value chunks,
// appending_hash over data::value_view, and type_imr_descriptor -- suffers
// the same template-argument stripping as the rest of the file.  Beware also
// that the collapsed line structure leaves '//' and '///' comments mid-line
// ("// Slow path", "/// Context for external value destruction",
// "/// External chunk destructor"), which comment out the remainder of those
// physical lines -- evidence the original newlines were lost in extraction.
// Code preserved byte-for-byte; restore from the upstream ScyllaDB sources.
template class cell::basic_atomic_cell_view { public: using view_type = structure::basic_view; private: type_info _type; view_type _view; private: flags::view flags_view() const noexcept { return _view.template get(); } atomic_cell::basic_view cell_view() const noexcept { return _view.template get().template as(); } context make_context() const noexcept { return context(flags_view(), _type); } public: basic_atomic_cell_view(const type_info& ti, view_type v) noexcept : _type(ti), _view(std::move(v)) { } operator basic_atomic_cell_view() const noexcept { return basic_atomic_cell_view(_type, _view); } const uint8_t* raw_pointer() const { return _view.raw_pointer(); } bytes_view serialize() const noexcept { assert(!flags_view().template get()); auto ptr = raw_pointer(); auto len = structure::serialized_object_size(ptr, make_context()); return bytes_view(reinterpret_cast(ptr), len); } bool is_live() const noexcept { return flags_view().template get(); } bool is_expiring() const noexcept { return flags_view().template get(); } bool is_counter_update() const noexcept { return flags_view().template get(); } api::timestamp_type timestamp() const noexcept { return cell_view().template get().load(); } void set_timestamp(api::timestamp_type ts) noexcept { cell_view().template get().store(ts); } gc_clock::time_point expiry() const noexcept { auto v = cell_view().template get().get().template get().load(); return gc_clock::time_point(gc_clock::duration(v)); } gc_clock::duration ttl() const noexcept { auto v = cell_view().template get().get().template get().load(); return gc_clock::duration(v); } gc_clock::time_point deletion_time() const noexcept { auto v = cell_view().template get(make_context()).template as().load(); return gc_clock::time_point(gc_clock::duration(v)); } int64_t counter_update_value() const noexcept { return cell_view().template get(make_context()).template as().load(); } basic_value_view value() const noexcept { auto ctx = make_context(); return 
cell_view().template get(ctx).visit(make_visitor( [] (fixed_value::basic_view view) { return basic_value_view(view, 0, nullptr); }, [&] (variable_value::structure::basic_view view) { return variable_value::make_view(view, flags_view().template get()); }, [] (...) -> basic_value_view { abort(); } ), ctx); } size_t value_size() const noexcept { auto ctx = make_context(); return cell_view().template get(ctx).visit(make_visitor( [] (fixed_value::view view) -> size_t { return view.size(); }, [] (variable_value::structure::view view) -> size_t { return view.template get().load(); }, [] (...) -> size_t { abort(); } ), ctx); } bool is_value_fragmented() const noexcept { return flags_view().template get() && value_size() > maximum_external_chunk_length; } }; inline auto cell::copy_fn(const type_info& ti, const uint8_t* ptr) { // Slow path return [&ti, ptr] (auto&& serializer, auto&& allocations) noexcept { auto f = structure::get_member(ptr); context ctx(ptr, ti); if (f.get()) { auto view = structure::get_member(ptr).as(ctx); auto dv = variable_value::make_view(view, f.get()); return make_collection(dv)(serializer, allocations); } else { auto acv = atomic_cell_view(ti, structure::make_view(ptr, ti)); if (acv.is_live()) { if (acv.is_counter_update()) { return make_live_counter_update(acv.timestamp(), acv.counter_update_value())(serializer, allocations); } else if (acv.is_expiring()) { return make_live(ti, acv.timestamp(), acv.value(), acv.expiry(), acv.ttl())(serializer, allocations); } return make_live(ti, acv.timestamp(), acv.value())(serializer, allocations); } else { return make_dead(acv.timestamp(), acv.deletion_time())(serializer, allocations); } } }; } inline cell::atomic_cell_view cell::make_atomic_cell_view(const type_info& ti, const uint8_t* ptr) noexcept { return atomic_cell_view(ti, structure::make_view(ptr)); } inline cell::mutable_atomic_cell_view cell::make_atomic_cell_view(const type_info& ti, uint8_t* ptr) noexcept { return mutable_atomic_cell_view(ti, 
structure::make_view(ptr)); } /// Context for external value destruction /// /// When a cell value is stored externally as a list of fragments we need to /// know when we reach the last fragment. The way to do that is to read the /// total value size from the parent cell object and use the fact that the size /// of all fragments except the last one is cell::maximum_external_chunk_length. class fragment_chain_destructor_context : public imr::no_context_t { size_t _total_length; public: explicit fragment_chain_destructor_context(size_t total_length) noexcept : _total_length(total_length) { } void next_chunk() noexcept { _total_length -= data::cell::maximum_external_chunk_length; } bool is_last_chunk() const noexcept { return _total_length <= data::cell::maximum_external_chunk_length; } }; } namespace imr { namespace methods { /// Cell destructor /// /// If the cell value exceeds certain thresholds its value is stored externally /// (possibly fragmented). This requires a destructor so that the owned memory /// can be freed when the cell is destroyed. /// Note that we don't need to know the actual type of the cell to destroy it, /// since all the necessary information is stored in each instance. This means /// that IMR cells can be owned by C++ objects without the problem of passing /// arguments to C++ destructors. template<> struct destructor { static void run(uint8_t* ptr, ...) { auto flags = data::cell::structure::get_member(ptr); if (flags.get()) { auto cell_offset = data::cell::structure::offset_of(ptr); auto variable_value_ptr = [&] { if (flags.get()) { return ptr + cell_offset; } else { auto ctx = data::cell::minimal_context(flags); auto offset = data::cell::atomic_cell::offset_of(ptr + cell_offset, ctx); return ptr + cell_offset + offset; } }(); imr::methods::destroy(variable_value_ptr); } } }; /// Cell mover template<> struct mover { static void run(uint8_t* ptr, ...) 
{ auto flags = data::cell::structure::get_member(ptr); if (flags.get()) { auto cell_offset = data::cell::structure::offset_of(ptr); auto variable_value_ptr = [&] { if (flags.get()) { return ptr + cell_offset; } else { auto ctx = data::cell::minimal_context(flags); auto offset = data::cell::atomic_cell::offset_of(ptr + cell_offset, ctx); return ptr + cell_offset + offset; } }(); variable_value_ptr += data::cell::variable_value::structure::offset_of(variable_value_ptr); imr::methods::move>>(variable_value_ptr); } } }; template<> struct destructor { static void run(uint8_t* ptr, ...) { auto varval = data::cell::variable_value::structure::make_view(ptr); auto total_length = varval.template get().load(); if (total_length <= data::cell::maximum_internal_storage_length) { return; } auto ctx = data::fragment_chain_destructor_context(total_length); auto ptr_view = varval.get().as(); if (ctx.is_last_chunk()) { imr::methods::destroy(ptr_view.load()); } else { imr::methods::destroy(ptr_view.load(), ctx); } current_allocator().free(ptr_view.load()); } }; template<> struct mover>> { static void run(uint8_t* ptr, ...) { auto ptr_view = imr::pod::make_view(ptr); auto chk_ptr = ptr_view.load(); auto chk = data::cell::external_last_chunk::make_view(chk_ptr, data::cell::last_chunk_context(chk_ptr)); chk.get().store(ptr); } }; template<> struct mover>> { static void run(uint8_t* bptr, ...) 
{ auto bptr_view = imr::pod::make_view(bptr); auto ptr_ptr = bptr_view.load(); auto ptr = imr::pod::make_view(ptr_ptr); ptr.store(bptr); } }; /// External chunk destructor template<> struct destructor { static void run(uint8_t* ptr, data::fragment_chain_destructor_context ctx) { bool first = true; while (true) { ctx.next_chunk(); auto echk_view = data::cell::external_chunk::make_view(ptr); auto ptr_view = echk_view.get(); if (ctx.is_last_chunk()) { imr::methods::destroy(ptr_view.load()); current_allocator().free(ptr_view.load()); if (!first) { current_allocator().free(ptr); } break; } else { auto last = ptr; ptr = ptr_view.load(); if (!first) { current_allocator().free(last); } else { first = false; } } } } }; template<> struct mover { static void run(uint8_t* ptr, ...) { auto echk_view = data::cell::external_chunk::make_view(ptr, data::cell::chunk_context(ptr)); auto next_ptr = echk_view.get().load(); auto bptr = imr::pod::make_view(next_ptr); bptr.store(ptr + echk_view.offset_of()); auto back_ptr = echk_view.get().load(); auto nptr = imr::pod::make_view(back_ptr); nptr.store(ptr); } }; } } template<> struct appending_hash { template void operator()(Hasher& h, data::value_view v) const { feed_hash(h, v.size_bytes()); using boost::range::for_each; for_each(v, [&h] (auto&& chk) { h.update(reinterpret_cast(chk.data()), chk.size()); }); } }; int compare_unsigned(data::value_view lhs, data::value_view rhs) noexcept; namespace data { struct type_imr_descriptor { using context_factory = imr::alloc::context_factory, data::type_info>; using lsa_migrate_fn = imr::alloc::lsa_migrate_fn::structure, context_factory>; private: data::type_info _type_info; lsa_migrate_fn _lsa_migrator; public: explicit type_imr_descriptor(data::type_info ti) : _type_info(ti) , _lsa_migrator(context_factory(ti)) { } const data::type_info& type_info() const { return _type_info; } const lsa_migrate_fn& lsa_migrator() const { return _lsa_migrator; } }; } #include "value_view_impl.hh" #include 
"cell_impl.hh"