From f71932f37f5b84547b07854ff7f14aaa019d4312 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Tue, 28 Jul 2015 16:39:06 -0300 Subject: [PATCH 1/2] utils: add exponential_backoff_retry This is a retry mechanism that exponentially increases sleep time between retries. Signed-off-by: Raphael S. Carvalho --- utils/exponential_backoff_retry.hh | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 utils/exponential_backoff_retry.hh diff --git a/utils/exponential_backoff_retry.hh b/utils/exponential_backoff_retry.hh new file mode 100644 index 0000000000..2faca4e978 --- /dev/null +++ b/utils/exponential_backoff_retry.hh @@ -0,0 +1,39 @@ + +/* + * Copyright 2015 Cloudius Systems. + */ + +#pragma once + +#include "core/sleep.hh" +#include + +// Implements retry policy that exponentially increases sleep time between retries. +class exponential_backoff_retry { + std::chrono::milliseconds _base_sleep_time; + std::chrono::milliseconds _sleep_time; + std::chrono::milliseconds _max_sleep_time; +public: + exponential_backoff_retry(std::chrono::milliseconds base_sleep_time, std::chrono::milliseconds max_sleep_time) + : _base_sleep_time(std::min(base_sleep_time, max_sleep_time)) + , _sleep_time(_base_sleep_time) + , _max_sleep_time(max_sleep_time) {} + + future<> retry() { + auto old_sleep_time = _sleep_time; + // calculating sleep time seconds for the next retry. + _sleep_time = std::min(_sleep_time * 2, _max_sleep_time); + + return sleep(old_sleep_time); + } + + // Return sleep time in seconds to be used for next retry. + std::chrono::milliseconds sleep_time() const { + return _sleep_time; + } + + // Reset sleep time to base sleep time. + void reset() { + _sleep_time = _base_sleep_time; + } +}; From 5a70c8c8f4558e4a26d5fc50f2e8ef43b3a54e20 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Tue, 28 Jul 2015 16:51:02 -0300 Subject: [PATCH 2/2] db: implement retry policy for compaction Currently, compaction will no longer happen for a column family which a compaction failed for some unexpected reason. We want to implement a retry policy that will sleep for a while until the next compaction attempt. This patch implements retry policy for compaction using exponential_backoff_retry. With exponential_backoff_retry, the sleep time grows exponentially with the number of retries until the maximum sleep time is reached. For compaction specifically, the base sleep time will be 5 seconds and the maximum sleeping time will be 300 seconds, i.e. 5 minutes. If compaction succeeded after a retry, the sleep time will be reset to the base sleep time. Signed-off-by: Raphael S. Carvalho --- database.cc | 66 +++++++++++++++++++++++++++++++++++++++-------------- database.hh | 2 ++ 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/database.cc b/database.cc index 45f188a638..1d0c8fa136 100644 --- a/database.cc +++ b/database.cc @@ -440,22 +440,8 @@ column_family::stop() { return _in_flight_seals.close().then([this] { _compaction_sem.broken(); - return _compaction_done.then_wrapped([this] (future<> f) { - // NOTE: broken_semaphore and seastar::gate_closed_exception exceptions - // are used for regular termination of compaction fiber. - try { - f.get(); - } catch (broken_semaphore& e) { - dblog.info("compaction for column_family {}/{} not restarted due to shutdown", _schema->ks_name(), _schema->cf_name()); - } catch (seastar::gate_closed_exception& e) { - dblog.info("compaction for column_family {}/{} not restarted due to shutdown", _schema->ks_name(), _schema->cf_name()); - } catch (std::exception& e) { - dblog.error("compaction failed: {}", e.what()); - throw; - } catch (...) { - dblog.error("compaction failed: unknown error"); - throw; - } + return _compaction_done.then([] { + return make_ready_future<>(); }); }); } @@ -533,10 +519,56 @@ void column_family::start_compaction() { sstables::compaction_strategy strategy = _compaction_strategy; return do_with(std::move(strategy), [this] (sstables::compaction_strategy& cs) { dblog.info("started compaction for column_family {}/{}", _schema->ks_name(), _schema->cf_name()); - return cs.compact(*this); + return cs.compact(*this).then([this] { + // If compaction completed successfully, let's reset sleep time of _compaction_retry. + _compaction_retry.reset(); + }); }); }); + }).then_wrapped([this] (future<> f) { + bool retry = false; + + // NOTE: broken_semaphore and seastar::gate_closed_exception exceptions + // are used for regular termination of compaction fiber. + try { + f.get(); + } catch (broken_semaphore& e) { + dblog.info("compaction for column_family {}/{} not restarted due to shutdown", _schema->ks_name(), _schema->cf_name()); + throw; + } catch (seastar::gate_closed_exception& e) { + dblog.info("compaction for column_family {}/{} not restarted due to shutdown", _schema->ks_name(), _schema->cf_name()); + throw; + } catch (std::exception& e) { + dblog.error("compaction for column_family {}/{} failed: {}", _schema->ks_name(), _schema->cf_name(), e.what()); + retry = true; + } catch (...) { + dblog.error("compaction for column_family {}/{} failed: unknown error", _schema->ks_name(), _schema->cf_name()); + retry = true; + } + + if (retry) { + dblog.info("compaction task for column_family {}/{} sleeping for {} seconds", + _schema->ks_name(), _schema->cf_name(), std::chrono::duration_cast(_compaction_retry.sleep_time()).count()); + return _compaction_retry.retry().then([this] { + // after sleeping, signal semaphore for the next compaction attempt. + _compaction_sem.signal(); + }); + } + return make_ready_future<>(); }); + }).then_wrapped([this] (future<> f) { + // here, we ignore both broken_semaphore and seastar::gate_closed_exception that + // were used for regular termination of the compaction fiber. + try { + f.get(); + } catch (broken_semaphore& e) { + // exception logged in keep_doing. + } catch (seastar::gate_closed_exception& e) { + // exception logged in keep_doing. + } catch (...) { + // this shouldn't happen, let's log it anyway. + dblog.error("compaction for column_family {}/{} failed: unexpected error", _schema->ks_name(), _schema->cf_name()); + } }); } diff --git a/database.hh b/database.hh index 08dd7533ac..905dda230d 100644 --- a/database.hh +++ b/database.hh @@ -46,6 +46,7 @@ #include "mutation_reader.hh" #include "row_cache.hh" #include "compaction_strategy.hh" +#include "utils/exponential_backoff_retry.hh" class frozen_mutation; class reconcilable_result; @@ -101,6 +102,7 @@ private: sstables::compaction_strategy _compaction_strategy; future<> _compaction_done = make_ready_future<>(); semaphore _compaction_sem; + exponential_backoff_retry _compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300)); private: void add_sstable(sstables::sstable&& sstable); void add_memtable();