scylladb/db/hints/manager.hh

/*
 * Modified by ScyllaDB
 * Copyright (C) 2017 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <unordered_map>
#include <vector>
#include <list>
#include <chrono>
#include <seastar/core/gate.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/timer.hh>
#include <seastar/core/lowres_clock.hh>
#include <seastar/core/shared_mutex.hh>
#include "lister.hh"
#include "gms/gossiper.hh"
#include "locator/snitch_base.hh"
#include "service/endpoint_lifecycle_subscriber.hh"
#include "db/commitlog/commitlog.hh"
#include "utils/loading_shared_values.hh"
#include "utils/fragmented_temporary_buffer.hh"
#include "db/hints/resource_manager.hh"

// Forward declaration: this header only refers to service::storage_service through shared_ptr<>,
// so the full class definition is not needed here.
namespace service {
class storage_service;
}

namespace db {
namespace hints {

// Container of hints stores: values are db::commitlog objects keyed by the destination node address.
using node_to_hint_store_factory_type = utils::loading_shared_values<gms::inet_address, db::commitlog>;
// Handle to a single entry (hints store) of the container above.
using hints_store_ptr = node_to_hint_store_factory_type::entry_ptr;
// Hints files are read with the commitlog entry reader.
using hint_entry_reader = commitlog_entry_reader;
// Low resolution clock alias.
using timer_clock_type = seastar::lowres_clock;

class manager : public service::endpoint_lifecycle_subscriber {
private:
    /// Hint counters kept by the manager (see manager::_stats). Every counter starts at zero.
    struct stats {
        /// Value reported by manager::size_of_hints_in_progress(): size of mutations of hints in-flight (to the disk).
        uint64_t size_of_hints_in_progress{0};
        // NOTE(review): the counters below are only updated outside of this header; their exact meaning
        // should be confirmed against the implementation.
        uint64_t written{0};
        uint64_t errors{0};
        uint64_t dropped{0};
        uint64_t sent{0};
        uint64_t discarded{0};
    };

    // map: shard -> segments (paths of the hints segment files that belong to that shard)
    using hints_ep_segments_map = std::unordered_map<unsigned, std::list<fs::path>>;
    // map: IP -> map: shard -> segments
    using hints_segments_map = std::unordered_map<sstring, hints_ep_segments_map>;

    // Strongly typed boolean (drain::yes / drain::no) used by the stop() methods to request
    // draining of the pending hints.
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

    // space_watchdog is granted access to the private members of the manager.
    friend class space_watchdog;

public:
    class end_point_hints_manager {
    public:
        // A destination end point is identified by its address.
        using key_type = gms::inet_address;

        class sender {
            // Clock used for the flush and send-retry time points of the sender.
            // Important: clock::now() must be noexcept.
            // TODO: add the corresponding static_assert() when seastar::lowres_clock::now() is marked as "noexcept".
            using clock = seastar::lowres_clock;

            // Flags describing the sender's life cycle. They are kept in a state_set (an enum_set), so more than
            // one flag may be set at the same time.
            enum class state {
                stopping,               // stop() was called
                ep_state_is_not_normal, // destination Node state is not NORMAL - usually means that it has been decommissioned
                draining,               // try to send everything out and ignore errors
            };

            // Set of the "state" flags above.
            using state_set = enum_set<super_enum<state,
                state::stopping,
                state::ep_state_is_not_normal,
                state::draining>>;

            // Flags describing how sending of the current segment is going. They are kept per file
            // in send_one_file_ctx::state.
            enum class send_state {
                segment_replay_failed,  // current segment sending failed
                restart_segment,        // segment sending failed and it has to be restarted from the beginning since we failed to store one or more RPs
            };

            // Set of the "send_state" flags above.
            using send_state_set = enum_set<super_enum<send_state,
                send_state::segment_replay_failed,
                send_state::restart_segment>>;

            // Context of sending a single hints file. It is handed to the per-hint helpers of the sender
            // as lw_shared_ptr<send_one_file_ctx>.
            struct send_one_file_ctx {
                // The given map is referenced, not copied - it has to outlive this context.
                send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
                    : schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
                {}
                // schema version -> column_mapping (refers to the map passed to the constructor)
                std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
                // NOTE(review): presumably entered by every hint of this file that is being sent - confirm in the implementation
                seastar::gate file_send_gate;
                std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
                // send_state flags of this file
                send_state_set state;
            };

        private:
            // Names of the segments that are still waiting to be sent (see add_segment() and have_segments()).
            std::list<sstring> _segments_to_replay;
            // NOTE(review): presumably the replay position to resume from when a segment was only partially sent - confirm.
            replay_position _last_not_complete_rp;
            // schema version -> column_mapping; send_one_file_ctx takes a reference to a map of this type.
            std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
            // Life cycle flags, see "enum class state".
            state_set _state;
            // NOTE(review): presumably resolves when the background sending has completed - confirm against stop().
            future<> _stopped;
            // Time points of the next flush and of the next sending retry; next_sleep_duration() is documented
            // to return the time till the soonest of these events.
            clock::time_point _next_flush_tp;
            clock::time_point _next_send_retry_tp;
            // Destination end point, returned by end_point_key().
            key_type _ep_key;
            // The end_point_hints_manager this sender belongs to (the "parent" constructor argument).
            end_point_hints_manager& _ep_manager;
            // The shard-level manager; shard_stats() reads its _stats.
            manager& _shard_manager;
            resource_manager& _resource_manager;
            service::storage_proxy& _proxy;
            database& _db;
            seastar::scheduling_group _hints_cpu_sched_group;
            gms::gossiper& _gossiper;
            // NOTE(review): presumably refers to the parent's _file_update_mutex - confirm in the constructors.
            seastar::shared_mutex& _file_update_mutex;

        public:
            /// \brief Create a sender for the given end point hints manager.
            /// \param parent the parent object for this "sender" instance
            /// \param local_storage_proxy storage_proxy instance to use
            /// \param local_db database instance to use
            /// \param local_gossiper gossiper instance to use
            sender(end_point_hints_manager& parent, service::storage_proxy& local_storage_proxy, database& local_db, gms::gossiper& local_gossiper) noexcept;

            /// \brief A constructor that should be called from the copy/move-constructor of end_point_hints_manager.
            ///
            /// Make sure to properly reassign the references - especially to the \param parent and its internals.
            ///
            /// \param other the "sender" instance to copy from
            /// \param parent the parent object for this "sender" instance
            sender(const sender& other, end_point_hints_manager& parent) noexcept;

            /// \brief Start sending hints.
            ///
            /// Flush hints aggregated so far to the storage every hints_flush_period.
            /// If the _segments_to_replay is not empty, send all hints we have.
            ///
            /// Sending is stopped when stop() is called.
            void start();

            /// \brief Stop the sender - make sure all background sending is complete.
            /// \param should_drain if it is drain::yes - drain all pending hints
            /// \return future that resolves when the sender is stopped
            future<> stop(drain should_drain) noexcept;

            /// \brief Add a new segment ready for sending.
            /// \param seg_name name of the segment to add
            void add_segment(sstring seg_name);

            /// \brief Tell whether any segment is still waiting to be sent.
            /// \return TRUE if the _segments_to_replay list is not empty.
            bool have_segments() const noexcept {
                return !_segments_to_replay.empty();
            }

        private:
            /// \brief Send hints collected so far.
            ///
            /// Send hints aggregated so far. This function is going to try to deplete
            /// the _segments_to_replay list. Once it's empty it's going to be repopulated during the next send_hints_maybe() call
            /// with the new hints files if any.
            ///
            /// send_hints_maybe() is going to stop sending if it sends for too long (longer than the timer period). In this case it's
            /// going to return and the next send_hints_maybe() call is going to continue from the point where the previous call left off.
            void send_hints_maybe() noexcept;

            /// Raise the state::draining flag.
            void set_draining() noexcept { _state.set(state::draining); }

            /// \return TRUE if the state::draining flag is set.
            bool draining() const noexcept { return _state.contains(state::draining); }

            /// Raise the state::stopping flag.
            void set_stopping() noexcept { _state.set(state::stopping); }

            /// \return TRUE if the state::stopping flag is set.
            bool stopping() const noexcept { return _state.contains(state::stopping); }

            /// \return whatever the parent end point hints manager reports from its replay_allowed().
            bool replay_allowed() const noexcept { return _ep_manager.replay_allowed(); }

            /// \brief Try to send one hint read from the file.
            ///  - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
            ///  - Discard the hints that are older than the grace seconds value of the corresponding table.
            ///
            /// If sending fails the failure is going to be recorded in the send state of \ref ctx_ptr (see send_state::segment_replay_failed)
            /// and \ref rp is going to be stored in its rps_set.
            /// If sending is successful then \ref rp is going to be removed from the rps_set.
            /// NOTE(review): this comment used to refer to "state::segment_replay_ok" and "_rps_set" - neither exists in this
            /// header; the wording above follows the current declarations and should be confirmed against the implementation.
            ///
            /// \param ctx_ptr shared pointer to the file sending context
            /// \param buf buffer representing the hint
            /// \param rp replay position of this hint in the file (see commitlog for more details on "replay position")
            /// \param secs_since_file_mod last modification time stamp (in seconds since Epoch) of the current hints file
            /// \param fname name of the hints file this hint was read from
            /// \return future that resolves when next hint may be sent
            future<> send_one_hint(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fragmented_temporary_buffer buf, db::replay_position rp, gc_clock::duration secs_since_file_mod, const sstring& fname);

            /// \brief Send all hints from a single file and delete it after it has been successfully sent.
            /// Send all hints from the given file. If we failed to send the current segment we will pick up in the next
            /// iteration from where we left off in this one.
            ///
            /// \param fname file to send
            /// \return TRUE if file has been successfully sent
            bool send_one_file(const sstring& fname);

            /// \brief Checks if we can still send hints.
            /// \return TRUE if the destination Node is either ALIVE or has left the NORMAL state (e.g. has been decommissioned).
            bool can_send() noexcept;

            /// \brief Restore a mutation object from the hints file entry.
            /// \param ctx_ptr pointer to the send context
            /// \param buf hints file entry
            /// \return The mutation object representing the original mutation stored in the hints file.
            frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, fragmented_temporary_buffer& buf);

            /// \brief Get a reference to the column_mapping object for a given frozen mutation.
            /// \param ctx_ptr pointer to the send context
            /// \param fm Frozen mutation object
            /// \param hr hint entry reader object
            /// \return Reference to the column_mapping object for \param fm.
            const column_mapping& get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr);

            /// \brief Perform a single mutation send attempt.
            ///
            /// If the original destination end point is still a replica for the given mutation - send the mutation directly
            /// to it, otherwise execute the mutation "from scratch" with CL=ALL.
            ///
            /// \param m mutation to send
            /// \param natural_endpoints current replicas for the given mutation
            /// \return future that resolves when the operation is complete
            future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

            /// \brief Send one mutation out.
            ///
            /// \param m mutation to send
            /// \return future that resolves when the mutation sending processing is complete.
            future<> send_one_mutation(frozen_mutation_and_schema m);

            /// \brief Get the last modification time stamp for a given file.
            /// \param fname File name
            /// \return The last modification time stamp for \param fname.
            static future<timespec> get_last_file_modification(const sstring& fname);

            /// \return The statistics object of the shard manager this sender works for.
            struct stats& shard_stats() { return _shard_manager._stats; }

            /// \brief Flush the pending hints to storage, but only if hints_flush_period has elapsed since the last flush.
            /// \return A future that is never exceptional and becomes ready when the operation is complete.
            future<> flush_maybe() noexcept;

            /// \return The key of the destination end point of this sender.
            const key_type& end_point_key() const noexcept { return _ep_key; }

            /// \brief Tell how long to sleep once the current iteration is over.
            /// \return The time left till the soonest event: flushing or re-sending.
            clock::duration next_sleep_duration() const;
        };

    private:
        // Destination end point, returned by end_point_key().
        key_type _key;
        // The shard-level manager this object belongs to.
        manager& _shard_manager;
        // NOTE(review): presumably keeps the hints store returned by get_or_load() alive - confirm in the implementation.
        hints_store_ptr _hints_store_anchor;
        seastar::gate _store_gate;
        // get_or_load() must be called under this mutex; exposed via file_update_mutex().
        seastar::shared_mutex _file_update_mutex;

        // State flags of the end point hints manager; kept in a state_set so several may be set at once.
        enum class state {
            can_hint,               // hinting is currently allowed (used by the space_watchdog)
            stopping,               // stopping is in progress (stop() method has been called)
            stopped                 // stop() has completed
        };

        using state_set = enum_set<super_enum<state,
            state::can_hint,
            state::stopping,
            state::stopped>>;

        state_set _state;
        // Hints directory of this end point, returned by hints_dir().
        const fs::path _hints_dir;
        // Number of in-flight (towards the file) hints, returned by hints_in_progress().
        uint64_t _hints_in_progress = 0;
        // The sender takes a reference to this object in its constructors.
        sender _sender;

    public:
        /// \param key destination end point
        /// \param shard_manager the shard-level manager that owns this object
        end_point_hints_manager(const key_type& key, manager& shard_manager);
        /// Move constructor. The sender class has a dedicated constructor that is meant to be called from here in
        /// order to re-bind the sender to the new parent object.
        /// NOTE(review): this constructor is not declared noexcept.
        end_point_hints_manager(end_point_hints_manager&&);
        ~end_point_hints_manager();

        /// \return The key of the destination end point served by this object.
        const key_type& end_point_key() const noexcept { return _key; }

        /// \brief Get the corresponding hints_store object. Create it if needed.
        /// \note Must be called under the \ref _file_update_mutex.
        /// \return The corresponding hints_store object.
        future<hints_store_ptr> get_or_load();

        /// \brief Store a single mutation hint.
        /// \param s column family descriptor
        /// \param fm frozen mutation object
        /// \param tr_state trace_state handle
        /// \return FALSE if hint is definitely not going to be stored
        bool store_hint(schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept;

        /// \brief Populates the _segments_to_replay list.
        ///  Populates the _segments_to_replay list with the names of the files in the <manager hints files directory> directory
        ///  in the order they should be sent out.
        ///
        /// \return Ready future when end point hints manager is initialized.
        future<> populate_segments_to_replay();

        /// \brief Waits till all writers complete and shuts down the hints store. Drains hints if needed.
        ///
        /// If "draining" is requested - sends all pending hints out.
        ///
        /// When hints are being drained we will not stop sending after a single hint sending has failed and will continue sending hints
        /// till the end of the current segment. After that we will remove the current segment and move to the next one till
        /// there isn't any segment left.
        ///
        /// \param should_drain if it is drain::yes - drain all pending hints (defaults to drain::no)
        /// \return Ready future when all operations are complete
        future<> stop(drain should_drain = drain::no) noexcept;

        /// \brief Start the timer.
        void start();

        /// \return How many hints are currently in-flight (towards the file).
        uint64_t hints_in_progress() const noexcept { return _hints_in_progress; }

        /// \return whatever the shard manager reports from its replay_allowed().
        bool replay_allowed() const noexcept { return _shard_manager.replay_allowed(); }

        /// \return TRUE if the state::can_hint flag is set.
        bool can_hint() const noexcept { return _state.contains(state::can_hint); }

        /// Raise the state::can_hint flag.
        void allow_hints() noexcept { _state.set(state::can_hint); }

        /// Drop the state::can_hint flag.
        void forbid_hints() noexcept { _state.remove(state::can_hint); }

        /// Raise the state::stopping flag.
        void set_stopping() noexcept { _state.set(state::stopping); }

        /// \return TRUE if the state::stopping flag is set.
        bool stopping() const noexcept { return _state.contains(state::stopping); }

        /// Raise the state::stopped flag.
        void set_stopped() noexcept { _state.set(state::stopped); }

        /// Drop the state::stopped flag.
        void clear_stopped() noexcept { _state.remove(state::stopped); }

        /// \return TRUE if the state::stopped flag is set.
        bool stopped() const noexcept { return _state.contains(state::stopped); }

        /// \return The mutex that get_or_load() has to be called under.
        seastar::shared_mutex& file_update_mutex() { return _file_update_mutex; }

        /// \return The hints directory of this end point.
        const fs::path& hints_dir() const noexcept { return _hints_dir; }

    private:
        /// \brief Creates a new hints store object.
        ///
        /// - Creates a hints store directory if it doesn't exist: <shard_hints_dir>/<ep_key>
        /// - Creates a store object.
        /// - Populates _segments_to_replay if it's empty.
        ///
        /// \return A new hints store object.
        future<commitlog> add_store() noexcept;

        /// \brief Flushes all hints written so far to the disk.
        ///  - Repopulates the _segments_to_replay list if needed.
        ///
        /// \return Ready future when the procedure above completes.
        future<> flush_current_hints() noexcept;

        /// \return The statistics object of the shard manager this object belongs to.
        struct stats& shard_stats() { return _shard_manager._stats; }

        /// \return The resource manager used by the shard manager this object belongs to.
        resource_manager& shard_resource_manager() { return _shard_manager._resource_manager; }
    };

    // State flags of the shard-level manager. They are kept in a state_set (an enum_set), so more than one
    // flag may be set at the same time.
    enum class state {
        started,                // hinting is currently allowed (start() call is complete)
        replay_allowed,         // replaying (hints sending) is allowed
        stopping                // hinting is not allowed - stopping is in progress (stop() method has been called)
    };

    // Set of the "state" flags above.
    using state_set = enum_set<super_enum<state,
        state::started,
        state::replay_allowed,
        state::stopping>>;

private:
    // Key identifying a destination end point (same type as end_point_hints_manager::key_type).
    using ep_key_type = typename end_point_hints_manager::key_type;
    // map: destination end point -> its end_point_hints_manager
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;

public:
    // Constants defined outside of this header.
    static const std::string FILENAME_PREFIX;
    // Period of flushing the aggregated hints to the storage (see sender::start()).
    static const std::chrono::seconds hints_flush_period;
    static const std::chrono::seconds hint_file_write_timeout;

private:
    // Manager state flags, see "enum class state".
    state_set _state;
    // Root hints directory of this manager, returned by hints_dir().
    const fs::path _hints_dir;
    // Returned by hints_dir_device_id(). NOTE(review): presumably filled by compute_hints_dir_device_id() - confirm.
    dev_t _hints_dir_device_id = 0;

    // Hints stores container, exposed via store_factory().
    node_to_hint_store_factory_type _store_factory;
    std::unordered_set<sstring> _hinted_dcs;
    // The anchors below are dereferenced by local_storage_proxy() and local_gossiper().
    // NOTE(review): presumably they are set from the arguments of start() - confirm in the implementation.
    shared_ptr<service::storage_proxy> _proxy_anchor;
    shared_ptr<gms::gossiper> _gossiper_anchor;
    // NOTE(review): the member name is misspelled ("strorage"); it is left as is because it may be referenced
    // from the implementation file.
    shared_ptr<service::storage_service> _strorage_service_anchor;
    locator::snitch_ptr& _local_snitch_ptr;
    // NOTE(review): the constructor receives the window in milliseconds (max_hint_window_ms) while this member
    // is named "_us" - presumably the value is converted in the constructor; confirm.
    int64_t _max_hint_window_us = 0;
    // Returned by local_db().
    database& _local_db;

    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;

    // Per-destination end point managers, see find_ep_manager().
    ep_managers_map_type _ep_managers;
    stats _stats;
    seastar::metrics::metric_groups _metrics;
    // End points registered via add_ep_with_pending_hints().
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

    // Returned by max_backlog_size() and backlog_size() respectively.
    // NOTE(review): presumably updated by update_backlog() - confirm.
    size_t _max_backlog_size = 1;
    size_t _backlog_size = 0;

public:
    /// \param hints_directory root hints directory of this manager
    /// \param hinted_dcs data centers for which hints are going to be generated
    /// \param max_hint_window_ms maximum hint window, in milliseconds
    /// \param res_manager resource manager to use
    /// \param db the (distributed) database
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
    virtual ~manager();
    // The manager may not be moved.
    manager(manager&&) = delete;
    manager& operator=(manager&&) = delete;
    /// \param group_name name of the metrics group to register the metrics under
    void register_metrics(const sstring& group_name);
    /// \brief Start the manager using the given service instances.
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    /// \brief Stop the manager.
    future<> stop();
    /// \brief Store a hint for the given end point; see end_point_hints_manager::store_hint() for the remaining parameters.
    bool store_hint(gms::inet_address ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept;

    /// \brief Check if a hint may be generated to the given end point
    /// \param ep end point to check
    /// \return true if we should generate the hint to the given end point if it becomes unavailable
    bool can_hint_for(ep_key_type ep) const noexcept;

    /// \brief Check if there aren't too many in-flight hints
    ///
    /// This function checks if there are too many "in-flight" hints on the current shard - hints that are being stored
    /// and which storing is not complete yet. This is meant to stabilize the memory consumption of the hints storing path
    /// which is initiated from the storage_proxy WRITE flow. storage_proxy is going to check this condition and if it
    /// returns TRUE it won't attempt any new WRITEs thus eliminating the possibility of new hints generation. If new hints
    /// are not generated the amount of in-flight hints and thus the memory they are consuming is going to drop eventually
    /// because the hints are going to be either stored or dropped. After that the things are going to get back to normal again.
    ///
    /// Note that we can't consider the disk usage consumption here because the disk usage is not promised to drop down shortly
    /// because it requires the remote node to be UP.
    ///
    /// \param ep end point to check
    /// \return TRUE if we are allowed to generate hint to the given end point but there are too many in-flight hints
    bool too_many_in_flight_hints_for(ep_key_type ep) const noexcept;

    /// \brief Check if DC \param ep belongs to is "hintable"
    /// \param ep End point identifier
    /// \return TRUE if hints are allowed to be generated to \param ep.
    bool check_dc_for(ep_key_type ep) const noexcept;

    /// \return Current size of mutations of the hints that are in-flight (to the disk).
    uint64_t size_of_hints_in_progress() const noexcept { return _stats.size_of_hints_in_progress; }

    /// \brief Tell how many hints to a given end point are in-flight (to the disk).
    /// \param ep End point to look up
    /// \return Number of hints in-flight to \param ep; zero if there is no end point manager for it.
    uint64_t hints_in_progress_for(ep_key_type ep) const noexcept {
        const auto ep_man_it = find_ep_manager(ep);
        return ep_man_it != ep_managers_end() ? ep_man_it->second.hints_in_progress() : 0;
    }

    /// Remember \param key as an end point that has pending hints.
    void add_ep_with_pending_hints(ep_key_type key) { _eps_with_pending_hints.insert(key); }

    /// Forget all end points with pending hints and pre-size the set for the current number of end point managers.
    void clear_eps_with_pending_hints() {
        auto& pending_eps = _eps_with_pending_hints;
        pending_eps.clear();
        pending_eps.reserve(_ep_managers.size());
    }

    /// \return TRUE if \param key has been registered with add_ep_with_pending_hints() since the last clearing.
    bool has_ep_with_pending_hints(ep_key_type key) const {
        return _eps_with_pending_hints.find(key) != _eps_with_pending_hints.end();
    }

    /// \return Number of end point managers currently held by this manager.
    size_t ep_managers_size() const { return _ep_managers.size(); }

    /// \return Root hints directory of this manager.
    const fs::path& hints_dir() const { return _hints_dir; }

    /// \return Stored device ID of the hints directory.
    dev_t hints_dir_device_id() const { return _hints_dir_device_id; }

    // Hinting permission controls; the per end point flag is end_point_hints_manager::state::can_hint.
    // NOTE(review): presumably these toggle that flag for all end points, or for those registered via
    // add_ep_with_pending_hints() respectively - confirm in the implementation.
    void allow_hints();
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

    /// \return The stored maximum backlog size.
    size_t max_backlog_size() const { return _max_backlog_size; }

    /// \return The stored backlog size.
    size_t backlog_size() const { return _backlog_size; }

    /// Raise the state::replay_allowed flag.
    void allow_replaying() noexcept { _state.set(state::replay_allowed); }

    /// \brief Rebalance hints segments among all present shards.
    ///
    /// The difference between the number of segments on any two shards will not be greater than 1 after the
    /// rebalancing.
    ///
    /// Removes the sub-directories of \ref hints_directory that correspond to shards that are not relevant any more
    /// (re-sharding to a lower shards number case).
    ///
    /// Complexity: O(N+K), where N is a total number of present hints' segments and
    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
    ///
    /// \param hints_directory A hints directory to rebalance
    /// \return A future that resolves when the operation is complete.
    static future<> rebalance(sstring hints_directory);

    // service::endpoint_lifecycle_subscriber notifications: only a node leaving the cluster is acted upon.

    /// Nothing to do when a node joins the cluster.
    virtual void on_join_cluster(const gms::inet_address& endpoint) override {
    }

    /// A node has left the cluster - hand it over to drain_for().
    virtual void on_leave_cluster(const gms::inet_address& endpoint) override {
        drain_for(endpoint);
    }

    /// Nothing to do when a node comes up.
    virtual void on_up(const gms::inet_address& endpoint) override {
    }

    /// Nothing to do when a node goes down.
    virtual void on_down(const gms::inet_address& endpoint) override {
    }

private:
    /// NOTE(review): presumably computes the value returned by hints_dir_device_id() - confirm in the implementation.
    future<> compute_hints_dir_device_id();

    /// \brief Scan the given hints directory and build the map of all present hints segments.
    ///
    /// Complexity: O(N+K), where N is a total number of present hints' segments and
    ///                           K = <number of shards during the previous boot> * <number of end points for which hints were ever created>
    ///
    /// \note Should be called from a seastar::thread context.
    ///
    /// \param hints_directory directory to scan
    /// \return a map: ep -> map: shard -> segments (full paths)
    static hints_segments_map get_current_hints_segments(const sstring& hints_directory);

    /// \brief Rebalance hints segments for a given (destination) end point
    ///
    /// This method is going to consume files from the \ref segments_to_move and distribute them between the present
    /// shards (taking into account the \ref ep_segments state - there may be zero or more segments that belong to a
    /// particular shard in it) until we either achieve the requested \ref segments_per_shard level on each shard
    /// or until we are out of files to move.
    ///
    /// As a result (in addition to the actual state on the disk) both \ref ep_segments and \ref segments_to_move are going
    /// to be modified.
    ///
    /// Complexity: O(N), where N is a total number of present hints' segments for the \ref ep end point (as a destination).
    ///
    /// \note Should be called from a seastar::thread context.
    ///
    /// \param ep destination end point ID (a string with its IP address)
    /// \param segments_per_shard number of hints segments per-shard we want to achieve
    /// \param hints_directory a root hints directory
    /// \param ep_segments a map that was originally built by get_current_hints_segments() for this end point
    /// \param segments_to_move a list of segments we are allowed to move
    static void rebalance_segments_for(
            const sstring& ep,
            size_t segments_per_shard,
            const sstring& hints_directory,
            hints_ep_segments_map& ep_segments,
            std::list<fs::path>& segments_to_move);

    /// \brief Rebalance all present hints segments.
    ///
    /// The difference between the number of segments on any two shards will not be greater than 1 after the
    /// rebalancing.
    ///
    /// Complexity: O(N), where N is a total number of present hints' segments.
    ///
    /// \note Should be called from a seastar::thread context.
    ///
    /// \param hints_directory a root hints directory
    /// \param segments_map a map that was built by get_current_hints_segments()
    static void rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map);

    /// \brief Remove sub-directories of shards that are not relevant any more (re-sharding to a lower number of shards case).
    ///
    /// Complexity: O(S*E), where S is a number of shards during the previous boot and
    ///                           E is a number of end points for which hints were ever created.
    ///
    /// \param hints_directory a root hints directory
    static void remove_irrelevant_shards_directories(const sstring& hints_directory);

    /// \return The container of the hints stores.
    node_to_hint_store_factory_type& store_factory() noexcept { return _store_factory; }

    /// \return The storage_proxy instance behind _proxy_anchor (the anchor is dereferenced unconditionally).
    service::storage_proxy& local_storage_proxy() const noexcept { return *_proxy_anchor; }

    /// \return The gossiper instance behind _gossiper_anchor (the anchor is dereferenced unconditionally).
    gms::gossiper& local_gossiper() const noexcept { return *_gossiper_anchor; }

    /// \return The database instance this manager works with.
    database& local_db() noexcept { return _local_db; }

    /// \return The end point hints manager for \param ep.
    /// NOTE(review): presumably creates the manager if it doesn't exist yet - confirm in the implementation.
    end_point_hints_manager& get_ep_manager(ep_key_type ep);
    /// \return TRUE if there is an end point hints manager for \param ep.
    bool have_ep_manager(ep_key_type ep) const noexcept;

    /// \brief Initiate the draining when we detect that the node has left the cluster.
    ///
    /// If the node that has left is the current node - drains all pending hints to all nodes.
    /// Otherwise drains hints to the node that has left.
    ///
    /// In both cases - removes the corresponding hints' directories after all hints have been drained and erases the
    /// corresponding end_point_hints_manager objects.
    ///
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

    /// \param backlog new backlog size
    /// \param max_backlog new maximum backlog size
    void update_backlog(size_t backlog, size_t max_backlog);

    /// \return TRUE if the state::stopping flag is set.
    bool stopping() const noexcept { return _state.contains(state::stopping); }

    /// Raise the state::stopping flag.
    void set_stopping() noexcept { _state.set(state::stopping); }

    /// \return TRUE if the state::started flag is set.
    bool started() const noexcept { return _state.contains(state::started); }

    /// Raise the state::started flag.
    void set_started() noexcept { _state.set(state::started); }

    /// \return TRUE if the state::replay_allowed flag is set.
    bool replay_allowed() const noexcept { return _state.contains(state::replay_allowed); }

public:
    /// \return Iterator to the end point manager of \param ep_key, or ep_managers_end() if there is none.
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept { return _ep_managers.find(ep_key); }

    /// Const flavor of find_ep_manager().
    ep_managers_map_type::const_iterator find_ep_manager(ep_key_type ep_key) const noexcept { return _ep_managers.find(ep_key); }

    /// \return The past-the-end iterator of the end point managers map.
    ep_managers_map_type::iterator ep_managers_end() noexcept { return _ep_managers.end(); }

    /// Const flavor of ep_managers_end().
    ep_managers_map_type::const_iterator ep_managers_end() const noexcept { return _ep_managers.end(); }
};

}
}