Compare commits

...

146 Commits

Author SHA1 Message Date
Jenkins
3e285248be release: prepare for 2.3.3 by hagitsegev 2019-02-19 14:02:37 +02:00
Raphael S. Carvalho
6f10ccb441 database: Fix race condition in sstable snapshot
A race condition takes place when one of the sstables selected by snapshot
is deleted by compaction. The snapshot fails because it tries to link an
sstable that was previously unlinked by compaction's sstable deletion.

Refs #4051.

(master commit 1b7cad3531)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190110194048.26051-1-raphaelsc@scylladb.com>
2019-02-19 10:13:56 +02:00
Botond Dénes
df420499bc service/storage_service: fix pre-bootstrap wait for schema agreement
When bootstrapping, a node should wait for schema agreement with its
peers before it can join the ring. This ensures it can immediately
accept writes. Failing to reach schema agreement before joining is not
fatal, as the node can pull unknown schemas on-demand on writes.
However, if such a schema contains references to UDFs, the node will
reject writes using it, due to #3760.

To ensure that schema agreement is reached before joining the ring,
`storage_service::join_token_ring()` has two checks. First, it checks that
at least one peer was connected to previously. For this it compares
`database::get_version()` with `database::empty_version`. The (implied)
assumption is that the version will become something other than
`database::empty_version` only after having connected to (and pulled
schemas from) at least one peer. This assumption no longer holds,
as we now set the version earlier in the boot process.
The second check verifies that we have the same schema version as all
known, live peers. This check assumes (since 3e415e2) that we have
already "met" all (or at least some) of our peers and if there is just
one known node (us) it concludes that this is a single-node cluster,
which automatically has schema agreement.
It's easy to see how these two checks can fail. The first fails to
ensure that we have met our peers, and the second wrongly concludes
that we are a one-node cluster, and hence have schema agreement.

To fix this, modify the first check. Instead of relying on the presence
of a non-empty database version, which supposedly implies that we already
talked to our peers, explicitly make sure that we have really talked to
*at least* one other node before proceeding to the second check, which
will now do the correct thing, actually checking the schema versions.

Fixes: #4196

Branches: 3.0, 2.3

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <40b95b18e09c787e31ba6c5519fb64d68b4ca32e.1550228389.git.bdenes@scylladb.com>
(cherry picked from commit 2125e99531)
2019-02-16 19:04:38 +02:00
Avi Kivity
d29527b4e1 auth: password_authenticator: protect against NULL salted_hash
In case salted_hash was NULL, we'd access uninitialized memory when dereferencing
the optional in get_as<>().

Protect against that by using get_opt() and failing authentication if we see a NULL.

Fixes #4168.

Tests: unit (release)
Branches: 3.0, 2.3
Message-Id: <20190211173820.8053-1-avi@scylladb.com>
(cherry picked from commit da9628c6dc)
2019-02-11 23:55:06 +02:00
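The salted_hash fix above boils down to checking an optional before dereferencing it. A minimal sketch of that pattern, with invented names (this is not the actual Scylla authenticator API):

```cpp
#include <optional>
#include <string>

// Hypothetical sketch: treat a NULL salted_hash as an authentication
// failure instead of dereferencing a disengaged std::optional, which is
// undefined behavior.
bool check_password(const std::optional<std::string>& salted_hash,
                    const std::string& candidate_hash) {
    if (!salted_hash) {
        return false;  // NULL salted_hash: fail authentication, don't crash
    }
    return *salted_hash == candidate_hash;
}
```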
Duarte Nunes
8a90e242e4 Merge 'Fix misdetection of remote counter shards' from Paweł
"
The code reading counter cells from sstables verifies that there are no
unsupported local or remote shards. The latter are detected by checking
whether all shards are present in the counter cell header (only remote shards
do not have entries there). However, the logic responsible for doing
that incorrectly computed the total number of counter shards in a
cell if the header was larger than a single counter shard. This resulted
in incorrect complaints that remote shards are present.

Fixes #4206

Tests: unit(release)
"

* tag 'counter-header-fix/v1' of https://github.com/pdziepak/scylla:
  tests/sstables: test counter cell header with large number of shards
  sstables/counters: fix remote counter shard detection

(cherry picked from commit d2d885fb93)
2019-02-11 14:18:54 +02:00
Calle Wilund
8a78c0aba9 commitlog_replayer: Bugfix: finding truncation positions uses local var ref
"uuid" was captured by reference in a continuation. This works 99.9% of the
time because the continuation is not actually delayed (and assuming we
begin the checks with non-truncated (system) column families, it works).
But if the continuation is delayed, the resulting cf map will be
corrupted.

Fixes #4187.

Message-Id: <20190204141831.3387-1-calle@scylladb.com>
(cherry picked from commit 9cadbaa96f)
2019-02-04 18:02:43 +02:00
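The commitlog_replayer bug above is an instance of a general C++ pitfall: a deferred continuation capturing a local by reference. A hedged sketch with invented names, not the actual Scylla code:

```cpp
#include <functional>
#include <string>

// Sketch of the bug class: a continuation that captures a local variable
// by reference dangles once the enclosing frame is gone. Capturing by
// value gives the continuation its own copy.
std::function<std::string()> make_continuation(const std::string& uuid) {
    // Buggy shape: [&uuid] { ... } -- dangles if the lambda outlives the frame.
    return [uuid] {  // fixed shape: capture by value
        return uuid;
    };
}
```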
Botond Dénes
8a2bbcf138 auth/service: unregister migration listener on stop()
Otherwise any event that triggers notification to this listener would
trigger a heap-use-after-free.

Refs: #4107

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <b6bbd609371a2312aed7571b05119d59c7d103d7.1548067626.git.bdenes@scylladb.com>
(cherry picked from commit f229dff210)
2019-01-22 17:55:18 +02:00
Pekka Enberg
22c891e6df Update scylla-ami submodule
* dist/ami/files/scylla-ami a425887...fe156a5 (1):
  > scylla_install_ami: update NIC drivers

See scylladb/scylla-ami#44
2019-01-17 08:45:22 +02:00
Duarte Nunes
1841d0c2d9 tests/gossip_test: Use RAII for orderly destruction
Change the test so that services are correctly torn down, in the
correct order (e.g., storage_service accesses the messaging_service when
stopping).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180814112111.8521-2-duarte@scylladb.com>
(cherry picked from commit 495a92c5b6)
2019-01-08 19:44:58 +02:00
Duarte Nunes
e10107fe5a tests/gossip_test: Don't bind address to avoid conflicts
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180814112111.8521-1-duarte@scylladb.com>
(cherry picked from commit 3956a77235)
2019-01-08 19:44:52 +02:00
Jenkins
0b3a4679db release: prepare for 2.3.2 2019-01-08 14:40:33 +02:00
Avi Kivity
ba60d666a9 Update seastar submodule
* seastar db30251...10ac122 (1):
  > iotune: Initialize io_rates member variables

Fixes #4064.
2019-01-08 11:41:00 +02:00
Avi Kivity
6ea4d0b75c Update seastar submodule
* seastar b846dfe...db30251 (1):
  > reactor: disable nowait aio due to a kernel bug

Fixes #3996.
2018-12-17 15:56:47 +02:00
Vladimir Krivopalov
8c5911f312 database: Capture io_priority_class by reference to avoid dangling ref.
The original reference points to a thread-local storage object that is
guaranteed to outlive the continuation, but copying it makes the
subsequent calls point to a local object and introduces a use-after-free
bug.

Fixes #3948

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
(cherry picked from commit 68458148e7)
2018-12-02 13:32:45 +02:00
Tomasz Grabiec
de00d7f5a1 utils: phased_barrier: Make advance_and_await() have strong exception guarantees
Currently, when advance_and_await() fails to allocate the new gate
object, it will throw bad_alloc and leave the phased_barrier object in
an invalid state. Calling advance_and_await() again on it will result
in undefined behavior (typically SIGSEGV) because _gate will be
disengaged.

One place affected by this is table::seal_active_memtable(), which
calls _flush_barrier.advance_and_await(). If this throws, subsequent
flush attempts will SIGSEGV.

This patch rearranges the code so that advance_and_await() has strong
exception guarantees.
Message-Id: <1542645562-20932-1-git-send-email-tgrabiec@scylladb.com>

Fixes #3931.

(cherry picked from commit 57e25fa0f8)
2018-11-21 12:18:08 +02:00
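The rearrangement described above follows the standard commit-or-rollback shape for a strong exception guarantee: do all throwing work first, then mutate state with no-throw operations. A minimal sketch with invented types (not the real phased_barrier):

```cpp
#include <memory>

// Sketch: allocate the replacement gate (the only operation that can
// throw) before touching the object's state, so a bad_alloc leaves the
// barrier unchanged and usable.
struct gate {};

class barrier {
    std::unique_ptr<gate> _gate = std::make_unique<gate>();
public:
    void advance() {
        auto next = std::make_unique<gate>();  // may throw std::bad_alloc
        _gate = std::move(next);               // no-throw commit: _gate is never disengaged
    }
    bool engaged() const { return _gate != nullptr; }
};
```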
Glauber Costa
e5f9dae4bb remove monitor if sstable write failed
In (almost) all SSTable write paths, we need to inform the monitor that
the write has failed as well. The monitor will remove the SSTable from
controller's tracking at that point.

Except there is one place where we are not doing that: streaming of big
mutations. Streaming of big mutations is an interesting use case, in
which the work is done in two parts: if writing the SSTable fails right
away, then we do the correct thing.

But the SSTables are not committed at that point, and the monitors are
still kept around with the SSTables until a later time, when they are
finally committed. Between those two points in time, it is possible that
the streaming code will detect a failure and manually call
fail_streaming_mutations(), which marks the SSTable for deletion. At
that point we should propagate that information to the monitor as well,
but we don't.

Fixes #3732 (hopefully)
Tests: unit (release)

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20181114213618.16789-1-glauber@scylladb.com>
(cherry picked from commit 9f403334c8)
2018-11-20 20:40:44 +02:00
Glauber Costa
e13e796290 sstables: correctly parse estimated histograms
In commit a33f0d6, we changed the way we handle arrays during the write
and parse code to avoid reactor stalls. Some potentially big loops were
transformed into futurized loops, and also some calls to vector resizes
were replaced by a reserve + push_back idiom.

The latter broke parsing of the estimated histogram. The reason being
that the vectors that are used here are already initialized internally
by the estimated_histogram object. Therefore, when we push_back, we
don't fill the array all the way from index 0, but end up with a zeroed
beginning and only push back some of the elements we need.

We could revert this to a resize() call. After all, the reason we
are using reserve + push_back is to avoid calling the constructor
for each element, but we don't really expect the integer specialization
to do any of that.

However, to avoid confusing future developers who may feel tempted
to convert this as well for the sake of consistency, it is safer to
just make sure these arrays are zeroed.

Fixes #3918

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20181116130853.10473-1-glauber@scylladb.com>
(cherry picked from commit c6811bd877)
2018-11-17 17:20:38 +02:00
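The bug above can be reproduced in a few lines: when the vector is already default-initialized to its final size, reserve() + push_back() appends after the existing zeros rather than filling from index 0. A sketch with invented names (not the actual estimated_histogram parser):

```cpp
#include <cstddef>
#include <vector>

// Buggy shape: the buckets vector is pre-sized (as in estimated_histogram),
// so push_back appends the parsed values after the zeros.
std::vector<long> parse_buggy(const std::vector<long>& input) {
    std::vector<long> buckets(input.size(), 0);  // already holds N zeros
    buckets.reserve(buckets.size() + input.size());
    for (long x : input) {
        buckets.push_back(x);  // appends: zeros remain at the front
    }
    return buckets;
}

// Fixed shape: overwrite in place, starting from index 0.
std::vector<long> parse_fixed(const std::vector<long>& input) {
    std::vector<long> buckets(input.size(), 0);
    for (std::size_t i = 0; i < input.size(); ++i) {
        buckets[i] = input[i];
    }
    return buckets;
}
```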
Avi Kivity
336c771663 release: prepare for 2.3.1 2018-10-19 20:53:17 +03:00
Avi Kivity
82968afc25 locator: fix abstract_replication_strategy::get_ranges() and friends violating sort order
get_ranges() is supposed to return ranges in sorted order. However, a35136533d
broke this and returned the range that was supposed to be last in the second
position (e.g. [0, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9]). This broke cleanup, which
relied on the sort order to perform a binary search. Other users of the
get_ranges() family did not rely on the sort order.

Fixes #3872.
Message-Id: <20181019113613.1895-1-avi@scylladb.com>

(cherry picked from commit 1ce52d5432)
2018-10-19 20:52:31 +03:00
Duarte Nunes
383dcffb53 Merge 'Fix issues with endpoint state replication to other shards' from Tomasz
Fixes #3798
Fixes #3694

Tests:

  unit(release), dtest([new] cql_tests.py:TruncateTester.truncate_after_restart_test)

* tag 'fix-gossip-shard-replication-v1' of github.com:tgrabiec/scylla:
  gms/gossiper: Replicate enpoint states in add_saved_endpoint()
  gms/gossiper: Make reset_endpoint_state_map() have effect on all shards
  gms/gossiper: Replicate STATUS change from mark_as_shutdown() to other shards
  gms/gossiper: Always override states from older generations

(cherry picked from commit 48ebe6552c)
2018-10-17 10:09:07 +02:00
Glauber Costa
0c2abc007c api: use longs instead of ints for snapshot sizes
Int types in json will be serialized to int types in C++. They will then
only be able to handle 4GB, and we tend to store more data than that.

Without this patch, listsnapshots is broken in all versions.

Fixes: #3845

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20181012155902.7573-1-glauber@scylladb.com>
(cherry picked from commit 98332de268)
2018-10-12 22:02:25 +03:00
Avi Kivity
1498c4f150 Update seastar submodule
* seastar ebf4812...b846dfe (1):
  > prometheus: Fix histogram text representation

Fixes #3827.
2018-10-09 16:38:04 +03:00
Eliran Sinvani
f388992a94 cql3 : add workaround to antlr3 null dereference bug
The Antlr3 exception class has a null dereference bug that crashes
the system when trying to extract the exception message using
ANTLR_Exception<...>::displayRecognitionError(...) function. When
a parsing error occurs the CqlParser throws an exception which in
turn processesed for some special cases in scylla to generate a custom
message. The default case however, creates the message using
displayRecognitionError, causing the system to crash.
The fix is a simple workaround, making sure the pointer is not null
before the call to the function. A "proper" fix can't be implemented
because the exception class itself is implemented outside scylla
in antlr headers that resides on the host machine os.

Manually tested 2 test cases: a typo that caused scylla to crash, and
a cql comment without a newline at the end, which also caused scylla to crash.
Ran unit tests (release).

Fixes #3740
Fixes #3764

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <cfc7e0d758d7a855d113bb7c8191b0fd7d2e8921.1538566542.git.eliransin@scylladb.com>
(cherry picked from commit 20f49566a2)
2018-10-04 14:09:41 +03:00
Avi Kivity
310540c11f utils: crc32: mark power crc32 assembly as not requiring an executable stack
The linker uses an opt-in system for non-executable stack: if all object files
opt into a non-executable stack, the binary will have a non-executable stack,
which is very desirable for security. The compiler cooperates by opting into
a non-executable stack whenever possible (always for our code).

However, we also have an assembly file (for fast power crc32 computations).
Since it doesn't opt into a non-executable stack, we get a binary with
executable stack, which Gentoo's build system rightly complains about.

Fix by adding the correct incantation to the file.

Fixes #3799.

Reported-by: Alexys Jacob <ultrabug@gmail.com>
Message-Id: <20181002151251.26383-1-avi@scylladb.com>
(cherry picked from commit aaab8a3f46)
2018-10-02 23:23:23 +03:00
Calle Wilund
7d833023cc storage_proxy: Add missing re-throw in truncate_blocking
If truncation times out, we want to log it, but the exception should
not be swallowed; it should be re-thrown.

Fixes #3796.

Message-Id: <20181001112325.17809-1-calle@scylladb.com>
(cherry picked from commit 2996b8154f)
2018-10-01 21:48:57 +02:00
Avi Kivity
d94ac196e0 Update scylla-ami submodule
* dist/ami/files/scylla-ami e7aa504...a425887 (1):
  > scylla_install_ami: enable ssh_deletekeys

See scylladb/scylla-ami#31
2018-09-30 16:32:40 +03:00
Duarte Nunes
1d7430995e tests/aggregate_fcts_test: Add test case for wrapped types
Provide a test case which checks that a type being wrapped in a
reverse_type plays no role in assignment.

Refs #3789

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180927223201.28152-2-duarte@scylladb.com>
(cherry picked from commit 17578c3579)
2018-09-28 14:34:19 +03:00
Duarte Nunes
b662a7f8a4 cql3/selection/selector: Unwrap types when validating assignment
When validating assignment between two types, it's possible one of
them is wrapped in a reverse_type, if it comes, for example, from the
type associated with a clustering column. When checking for weak
assignment the types are correctly unwrapped, but not when checking
for an exact match, which this patch fixes.

Technically, the receiver is never a reversed_type for the current
callers, but this is the morally correct implementation, as the type
being reversed or not plays no role in assignment.

Tests: unit(release)

Fixes #3789

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180927223201.28152-1-duarte@scylladb.com>
(cherry picked from commit 5e7bb20c8a)
2018-09-28 14:34:08 +03:00
Paweł Dziepak
447ad72882 transport: fix use-after-free in read_name_and_value_list()
(cherry picked from commit 1eeef4383c)
2018-09-27 14:05:45 +01:00
Duarte Nunes
b8485d3bce cql3/query_processor: Validate presence of statement values timeously
We need to validate, before calling query_options::prepare(), whether
the set of prepared-statement values sent in the query matches the
number of names we need to bind; otherwise we risk an out-of-bounds
access if the client also specified names together with the values.

Refs #3688

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180814225607.14215-1-duarte@scylladb.com>
(cherry picked from commit 805ce6e019)
2018-09-27 14:05:37 +01:00
Takuya ASADA
034b0f50db dist/redhat: specify correct repo file path on scylla-housekeeping services
Currently, both the scylla-housekeeping-daily/-restart services mistakenly
specify the repo file path as "@@REPOFILES@@", which is copied from the .in
template and needs to be replaced with the actual path.

Fixes #3776

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180921031605.9330-1-syuu@scylladb.com>
(cherry picked from commit 21a12aa458)
2018-09-25 12:30:03 +03:00
Avi Kivity
12ec0becf3 messaging: fix unbounded allocation in TLS RPC server
The non-TLS RPC server has an rpc::resource_limits configuration that limits
its memory consumption, but the TLS server does not. That means a many-node
TLS configuration can OOM if all nodes gang up on a single replica.

Fix by passing the limits to the TLS server too.

Fixes #3757.
Message-Id: <20180907192607.19802-1-avi@scylladb.com>

(cherry picked from commit 4553238653)
2018-09-17 20:25:49 +03:00
Piotr Sarna
666b19552d cql3, 2.3: refuse serving multi-restriction indexed queries
Secondary index queries do not work correctly when multiple
restrictions are present - the rest of the restrictions are simply
ignored, which results in too many rows being returned to the client.
This 2.3 fix makes these unsafe queries return an error instead.

Refs #3754

Message-Id: <7e470052d8ffc5bd8dc12e0d7f2705f0754afdbb.1536243391.git.sarna@scylladb.com>
2018-09-17 20:16:01 +03:00
Takuya ASADA
178f870a03 dist/ami/files/.scylla_ami_login: fix python error message on unsupported instance type
We changed the usage of colorprint() in f8cec2f891; we
need to pass format parameters to the function.

Fixes #3680

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Tested-by: Amos Kong <amos@scylladb.com>
Message-Id: <20180913182450.13308-1-syuu@scylladb.com>
2018-09-17 14:53:17 +03:00
Pekka Enberg
1b18f16dc1 release: prepare for 2.3.0 2018-09-14 13:52:02 +03:00
Pekka Enberg
28934575e4 docker: Update RPM repository to 2.3 2018-09-12 15:54:17 +02:00
Gleb Natapov
182cbeefb0 mutation_query_test: add test for result size calculation
Check that digest only and digest+data query calculate result size to be
the same.

Message-Id: <20180906153800.GK2326@scylladb.com>
(cherry picked from commit 9e438933a2)
2018-09-12 15:54:17 +02:00
Gleb Natapov
b70fc41a90 mutation_partition: accurately account for result size in digest only queries
When measuring_output_stream is used to calculate a result element's size,
it incorrectly takes into account not only the serialized element size, but
also a placeholder that the ser::qr_partition__rows/qr_partition__static_row__cells
constructors put at the beginning. Fix this by taking the starting point in the
stream before element serialization and subtracting it afterwards.

Fixes #3755

Message-Id: <20180906153609.GJ2326@scylladb.com>
(cherry picked from commit d7674288a9)
2018-09-12 15:54:12 +02:00
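The before/after subtraction described in the fix above can be sketched in a few lines, with invented stand-in types (the real code uses Scylla's measuring_output_stream and serializers):

```cpp
#include <cstddef>
#include <string>

// Toy measuring stream: only tracks the number of bytes written.
struct measuring_stream {
    std::size_t pos = 0;
    void write(std::size_t n) { pos += n; }  // stand-in for serialization
};

// Record the stream position before serializing the element and subtract
// afterwards, so a placeholder written earlier by a frame constructor is
// not charged to the element itself.
std::size_t element_size(measuring_stream& s, const std::string& elem) {
    std::size_t before = s.pos;  // starting point, after any placeholder
    s.write(elem.size());
    return s.pos - before;       // only the element's own bytes
}
```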
Tomasz Grabiec
debfc795b2 tests: flat_mutation_reader: Use fluent assertions for better error messages
Message-Id: <1531908313-29810-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit dc453d4f5d)
2018-09-12 15:54:09 +02:00
Tomasz Grabiec
0d094575ec tests: flat_mutation_reader_assertions: Introduce produces(mutation_fragment)
Message-Id: <1531908313-29810-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 604c8baed8)
2018-09-12 15:54:06 +02:00
Tomasz Grabiec
20baef69a9 mutation_fragment: Fix clustering_row::equal() using incorrect column kind
Incorrect column_kind was passed, which may cause wrong type to be
used for comparison if schema contains static columns. Affects only
tests.

Spotted during code review.
Message-Id: <1531144991-2658-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 1336744a05)
2018-09-07 13:34:26 +02:00
Pekka Enberg
1bac88601d release: prepare for 2.3.rc3 2018-09-07 07:41:46 +03:00
Vlad Zolotarov
e581fd1463 loading_cache: make size() return the size of lru_list instead of loading_shared_values
The reloading flow may hold items in the underlying loading_shared_values
after they have been removed (e.g. via the remove(key) API), so loading_shared_values.size()
doesn't represent the correct value for the loading_cache. lru_list.size(), on the other hand, does.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 1e56c7dd58)
2018-09-06 16:57:22 +03:00
Vlad Zolotarov
b366bff998 loading_cache: make iterator work on top of lru_list iterators instead of loading_shared_values'
Reloading may hold a value in the underlying loading_shared_values while
the corresponding cache value has already been deleted.

This may create weird situations like this:

<populate cache with 10 entries>
cache.remove(key1);
for (auto& e : cache) {
    std::cout << e << std::endl;
}

<all 10 entries are printed, including the one for "key1">

In order to avoid such situations we are going to make loading_cache::iterator
a transform_iterator of lru_list::iterator instead of loading_shared_values::iterator,
because lru_list contains entries only for cached items.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 945d26e4ee)
2018-09-06 16:57:12 +03:00
Gleb Natapov
38e6984ba5 mutation_partition: correctly measure static row size when doing digest calculation
The code uses an incorrect output stream when only a digest is requested,
and thus gets an incorrect data size. Failing to correctly account
for the static row size while calculating the digest may cause a mismatch
between the digest and data queries.

Fixes #3753.

Message-Id: <20180905131219.GD2326@scylladb.com>
(cherry picked from commit 98092353df)
2018-09-06 16:50:58 +03:00
Vlad Zolotarov
332f76579e tests: loading_cache_test: configure a validity timeout in test_loading_cache_loading_different_keys to a greater value
Change the validity timeout from 1s to 1h in order to avoid false alarms
on busy systems: with a short value there is a chance that the
(loading_cache.size() == num_loaders) check will run after some elements
of the cache have already been evicted.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <20180904193026.7304-1-vladz@scylladb.com>
(cherry picked from commit dae70e1166)
2018-09-06 16:05:42 +03:00
Jesse Haber-Kucharsky
315a03cf6c auth: Use finite time-out for all QUORUM reads
Commit e664f9b0c6 transitioned internal
CQL queries in the auth subsystem to be executed with finite time-outs
instead of infinite ones.

It should have also modified the functions in `auth/roles-metadata.cc`
to have finite time-outs.

This change fixes some previously failing dtests, particularly around
repair. Without this change, the QUORUM query fails to terminate when
the necessary consistency level cannot be achieved.

Fixes #3736.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <e244dc3e731b4019f3be72c52a91f23ee4bb68d1.1536163859.git.jhaberku@scylladb.com>
(cherry picked from commit 682805b22c)
2018-09-05 22:54:32 +03:00
Asias He
1847dc7a6a storage_service: Wait for range setup before announcing join status
When a joining node announces its join status through gossip, other
existing nodes will send writes to the joining node. At this time, it
is possible that the joining node hasn't learnt the tokens of other nodes,
which causes errors like the one below:

   token_metadata - sorted_tokens is empty in first_token_index!
   storage_proxy - Failed to apply mutation from 127.0.4.1#0:
   std::runtime_error (sorted_tokens is empty in first_token_index!)

To fix, wait for the token range setup before announcing the join
status.

Fixes: #3382
Tests: 60 run of materialized_views_test.py:TestMaterializedViews.add_dc_during_mv_update_test

Message-Id: <01abb21ae3315ae275297e507c5956e5774557ef.1536128531.git.asias@scylladb.com>
(cherry picked from commit 89b769a073)
2018-09-05 15:32:29 +03:00
Eliran Sinvani
dd11b5987e cql3: backport test of multicolumn IN with repetitions.
The test failed after backport of the containing commit (d734d31); the
reason is that the query was missing ALLOW FILTERING, which is required.
In newer versions the allow-filtering enforcement "misses" some
cases that need the filtering annotation, due to a caveat in testing
multi-column restrictions for the ALLOW FILTERING requirement. This issue was
introduced as part of refactoring the multicolumn restriction classes
and already has an open issue: #3574

Tests: unit tests (release)

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <ad88b7218fa55466be7bc4303dc50326a3d59733.1534322238.git.eliransin@scylladb.com>
(cherry picked from commit d734d316a6)
Message-Id: <928f1fbecffa43c4700541ee6603bb4607871510.1536146137.git.eliransin@scylladb.com>
2018-09-05 14:30:07 +03:00
Paweł Dziepak
a134e8699a test.py: do not disable human-readable format with --jenkins flag
When test.py is run with the --jenkins flag, Boost UTF is asked to generate
an XML file with the test results. This automatically disables the
human-readable output printed to stdout. There is no real reason to do
so, and it is actually less confusing when the Boost UTF messages are in
the test output together with Scylla logger messages.

Message-Id: <20180704172913.23462-1-pdziepak@scylladb.com>
(cherry picked from commit 07a429e837)
2018-09-04 14:26:00 +02:00
Takuya ASADA
bd7dcbb8d2 dist/common/scripts/scylla_raid_setup: create scylla-server.service.d when it doesn't exist
When /etc/systemd/system/scylla-server.service.d/capabilities.conf is
not installed, we don't have /etc/systemd/system/scylla-server.service.d/,
so we need to create it.

Fixes #3738

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180904015841.18433-1-syuu@scylladb.com>
(cherry picked from commit bd8a5664b8)
2018-09-04 14:42:55 +03:00
Tomasz Grabiec
74e61528a6 managed_vector: Make external_memory_usage() ignore reserved space
This ensures that row::external_memory_usage() is invariant to
insertion order of cells.

This must hold so that accounting of a clustering_row, merged from
multiple MVCC versions by the partition_snapshot_flat_reader on behalf
of a memtable flush, doesn't give a greater result than what is used by
the memtable region. Overaccounting leads to an assertion failure in
~flush_memory_accounter.

Fixes #3625 (hopefully).

Message-Id: <1535982513-19922-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 4fb3f7e8eb)
2018-09-03 19:58:10 +03:00
Duarte Nunes
5eb4fde2d5 Merge 'utils::loading_cache: improve reload() robustness' from Vlad
"This series introduces a few improvements related to a reload flow.

From now on the callback may assume that the "key" parameter value
is kept alive till the end of its execution in the reloading flow.

It may also safely evict as many items from the cache as needed."

Fixes #3606

* 'loading_cache_improve_reload-v1' of https://github.com/vladzcloudius/scylla:
  utils::loading_cache: hold a shared_value_ptr to the value when we reload
  utils::loading_cache::on_timer(): remove not needed capture of "this"
  utils::loading_cache::on_timer(): use chunked_vector for storing elements we want to reload

(cherry picked from commit f6aadd8077)
2018-08-29 10:12:32 +01:00
Duarte Nunes
cc0703f8ca utils/loading_cache: Avoid using invalidated iterators
When periodically reloading the values in the loading_cache, we would
iterate over the list of entries and call the load() function for
those which need to be reloaded.

For some concrete caches, load() can remove the entry from the LRU set,
and can be executed inline from the parallel_for_each(). This means we
could potentially keep iterating using an invalidated iterator.

Fix this by using a temporary container to hold those entries to be
reloaded.

Spotted when reading the code.

Also use if constexpr and fix the comment in the function containing
the changes.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180712124143.13638-1-duarte@scylladb.com>
(cherry picked from commit 63b63b0461)
2018-08-29 10:12:11 +01:00
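The temporary-container fix above is the standard remedy for iterating a container whose elements a callback may erase. A sketch with invented types (not the actual loading_cache code):

```cpp
#include <functional>
#include <list>
#include <vector>

// load() may erase entries from the container being walked, invalidating
// a live iterator. Snapshot the keys to reload into a temporary vector
// first, then iterate the snapshot safely.
void reload_all(std::list<int>& entries,
                const std::function<void(std::list<int>&, int)>& load) {
    std::vector<int> to_reload(entries.begin(), entries.end());
    for (int key : to_reload) {  // safe even if load() mutates 'entries'
        load(entries, key);
    }
}
```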
Botond Dénes
678283a5bb loading_cache::reload(): obtain key before calling _load()
The continuation attached to _load() needs the key of the loaded entry
to check whether it was disposed during the load. However if _load()
invalidates the entry the continuation's capture line will access
invalid memory while trying to obtain the key.
To avoid this save a copy of the key before calling _load() and pass it
to both _load() and the continuation.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <b571b73076ca863690f907fbd3fb4ff54e597b28.1531393608.git.bdenes@scylladb.com>
(cherry picked from commit 2e7bf9c6f9)
2018-08-29 10:12:03 +01:00
Avi Kivity
552c0d7641 Update scylla-ami submodule
* dist/ami/files/scylla-ami b7db861...e7aa504 (1):
  > scylla_create_devices: fix mouting RAID volume after reboot

Fixes #3640.
2018-08-28 15:45:36 +03:00
Piotr Sarna
860c06660b tests: add multi-column pk test to INSERT JSON case
Refs #3687
Message-Id: <6ba1328549ed701691ca7cbdacc7d6fa72f2c3de.1534171422.git.sarna@scylladb.com>

(cherry picked from commit aa2bfc0a71)
2018-08-28 14:39:43 +03:00
Piotr Sarna
db733ba075 cql3: fix handling multi-column partition key in INSERT JSON
Multi-column partition keys were previously handled incorrectly;
now the implementation is based on from_exploded instead of
from_singular.

Fixes #3687
Message-Id: <09e0bdb0f1c18d49b9e67c21777d93ba1545a13c.1534171422.git.sarna@scylladb.com>

(cherry picked from commit fa72422baa)
2018-08-28 14:39:41 +03:00
Avi Kivity
88677d39c8 Update seastar submodule
* seastar ed62fbd...ebf4812 (4):
  > correctly configure I/O Scheduler for usage with the YAML file
  > iotune: adjust num-io-queues recommendation
  > reactor: switch indentation
  > properly configure I/O Scheduler when --max-io-requests is passed

Fixes #3722.
Fixes #3721.
Fixes #3718.
2018-08-28 14:37:31 +03:00
Avi Kivity
d767dee5ec migration_manager: downgrade frightening "Can't send migration request" ERROR
This error is transient, since as soon as the node is up we will be able
to send the migration request.  Downgrade it to a warning to reduce anxiety
among people who actually read the logs (like QA).

The message is also badly worded as no one can guess what a migration
request is, but that is left to another patch.

Fixes #3706.
Message-Id: <20180821070200.18691-1-avi@scylladb.com>

(cherry picked from commit 5792a59c96)
2018-08-28 09:11:11 +03:00
Tomasz Grabiec
702f6ee1b7 database: Run system table flushes in the main scheduling group
memtable flushes for system and regular region groups run under the
memtable_scheduling_group, but the controller adjusts shares based on
the occupancy of the regular region group.

It can happen that regular is not under pressure, but system is. In
this case the controller will incorrectly assign low shares to the
memtable flush of system. This may result in high latency and low
throughput for writes in the system group.

I observed writes to the system keyspace timing out (on scylla-2.3-rc2)
in the dtest limits_test.py:TestLimits.max_cells_test, which went
away after this.

Fixes #3717.

Message-Id: <1535016026-28006-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 10f6b125c8)
2018-08-27 16:41:35 +02:00
Piotr Sarna
473b9aec65 cql3: throw proper request exception for INSERT JSON
JSON code is amended in order to return proper
"Missing mandatory PRIMARY KEY part" message instead of generic
"Attempt to access value of a disengaged optional object".

Fixes #3665
Message-Id: <69157d659d51ce5a2d408614ce3ba7bf8e3a5d88.1534161127.git.sarna@scylladb.com>

(cherry picked from commit 310d0a74b9)
2018-08-27 12:36:33 +03:00
Tomasz Grabiec
b548061257 database: Avoid OOM when soft pressure but nothing to flush
There could be soft pressure, but soft-pressure flusher may not be
able to make progress (Refs #3716). It will keep trying to flush empty
memtables, which block on earlier flushes to complete, and thus
allocate continuations in memory. Those continuations accumulate in
memory and can cause OOM.

Under pressure, each flush will take longer to complete. Due to scheduling
group isolation, the soft-pressure flusher will keep getting the CPU.

This causes bad_alloc and crashes of dtest:
limits_test.py:TestLimits.max_cells_test

Fixes #3717

Message-Id: <1535102520-23039-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 2afce13967)
2018-08-26 18:05:35 +03:00
Tomasz Grabiec
01165a9ae7 database: Make soft-pressure memtable flusher not consider already flushed memtables
The flusher picks the memtable list which contains the largest region
according to region_impl::evictable_occupancy().total_space(), which
follows region::occupancy().total_space(). But only the latest
memtable in the list can start flushing. It can happen that the
memtable corresponding to the largest region was already flushed to an
sstable (flush permit released), but not yet fsynced or moved to
cache, so it's still in the memtable list.

The latest memtable in the winning list may be small, or empty, in
which case the soft pressure flusher will not be able to make much
progress. There could be other memtable lists with non-empty
(flushable) latest memtables. This can lead to writes unnecessarily
blocking on dirty.

I observed this for the system memtable group, where it's easy for the
memtables to overshoot small soft pressure limits. The flusher kept
trying to flush empty memtables, while the previous non-empty memtable
was still in the group.

The CPU scheduler makes this worse, because it runs memtable_to_cache
in a separate scheduling group, so it further defers in time the
removal of the flushed memtable from the memtable list.

This patch fixes the problem by making regions corresponding to
memtables which started flushing report evictable_occupancy() as 0, so
that they're picked by the flusher last.

Fixes #3716.
Message-Id: <1535040132-11153-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 1e50f85288)
2018-08-26 18:05:35 +03:00
Tomasz Grabiec
5cdb963768 logalloc: Make evictable_occupancy() indicate no free space
Doesn't fix any bug, but it's closer to the truth that all segments
are used rather than none is used.

Message-Id: <1535040132-11153-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 364418b5c5)
2018-08-26 18:05:35 +03:00
Eliran Sinvani
7c9b9a4e24 cql3: ensure repeated values in IN clauses don't return repeated rows
When the list of values in the IN list of a single column contains
duplicates, multiple executors are activated since the assumption
is that each value in the IN list corresponds to a different partition.
This results in the same row appearing in the result as many times as
the partition value is duplicated.

Added queries to the IN restriction unit test and fixed a bad result check.

Fixes #2837
Tests: queries as in the use case from the GitHub issue, in both
prepared and plain forms (using the Python driver), plus the unit test.

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <ad88b7218fa55466be7bc4303dc50326a3d59733.1534322238.git.eliransin@scylladb.com>
(cherry picked from commit d734d316a6)
2018-08-26 18:05:33 +03:00
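The fix above hinges on not fanning out one read per IN value when values repeat. A minimal sketch of the idea (helper name and use of plain strings for partition-key values are hypothetical, not Scylla's actual code):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Hypothetical helper: collapse duplicate partition-key values from an
// IN list before activating one executor per value, so a repeated value
// cannot make the same row appear twice in the merged result.
std::vector<std::string> dedup_in_values(std::vector<std::string> values) {
    std::sort(values.begin(), values.end());
    values.erase(std::unique(values.begin(), values.end()), values.end());
    return values;
}
```

With duplicates removed, each remaining value maps to a distinct partition, which restores the assumption the executor fan-out relies on.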
Avi Kivity
f475c65ae6 Update scylla-ami submodule
* dist/ami/files/scylla-ami c7e5a70...b7db861 (2):
  > scylla-ami-setup.service: run only on first startup
  > Use fstab to mount RAID volume on every reboot

(cherry picked from commit 54ac334f4b)
2018-08-26 12:40:58 +03:00
Takuya ASADA
687372bc48 dist/common/scripts/scylla_raid_setup: refuse to start scylla-server.service when RAID volume is not mounted
Since Linux aborts booting when it fails to mount fstab entries, the
user may not be able to see an error message when we use fstab to mount
/var/lib/scylla on the AMI.

Instead of aborting the boot, we can simply refuse to start
scylla-server.service when the RAID volume is not mounted, using the
RequiresMountsFor directive of the systemd unit file.

See #3640

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180824185511.17557-1-syuu@scylladb.com>
(cherry picked from commit ff55e3c247)
2018-08-26 12:40:50 +03:00
Piotr Sarna
65c140121c tests: add parsing varint from JSON string test
Refs #3666
Message-Id: <f4205e9484f5385796fade7986e3e38dcbc65bac.1534845398.git.sarna@scylladb.com>

(cherry picked from commit 4a274ee7e2)
2018-08-26 11:11:18 +03:00
Piotr Sarna
ed68ad220f types: enable deserializing varint from JSON string
Previously deserialization failed because the JSON string
representing a number was unnecessarily quoted.

Fixes #3666
Message-Id: <a0a100dbac7c151d627522174303657d1da05c27.1534845398.git.sarna@scylladb.com>

(cherry picked from commit 37a5c38471)
2018-08-26 11:11:18 +03:00
Piotr Sarna
35f4b8fbbe cql3: add proper setting of empty collections in INSERT JSON
Previously empty collections were incorrectly added as dead cells,
which resulted in serialization errors later.

Fixes #3664
Message-Id: <a9c90d66c6737641cafe40edb779df490ada0309.1534848313.git.sarna@scylladb.com>

(cherry picked from commit 465045368f)
2018-08-26 11:11:18 +03:00
Duarte Nunes
48012fe418 Merge seastar upstream
* seastar 22437af...ed62fbd (1):
  > core: fix __libc_free return type signature

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2018-08-22 13:49:39 +01:00
Tomasz Grabiec
c862ccda91 Merge 'Fix multi-cell static list updates in the presence of ckeys' from Duarte
Fixes a regression introduced in
9e88b60ef5, which broke the lookup for
prefetched values of lists when a clustering key is specified.

This is the code that was removed from some list operations:

 std::experimental::optional<clustering_key> row_key;
 if (!column.is_static()) {
   row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
 }
 ...
 auto&& existing_list = params.get_prefetched_list(m.key().view(), row_key, column);

Put it back, in the form of common code in the update_parameters class.

Fixes #3703

* https://github.com/duarten/scylla cql-list-fixes/v1:
  tests/cql_query_test: Test multi-cell static list updates with ckeys
  cql3/lists: Fix multi-cell static list updates in the presence of ckeys
  keys: Add factory for an empty clustering_key_prefix_view

(cherry picked from commit 6937cc2d1c)
2018-08-21 17:35:14 +01:00
Duarte Nunes
83b1057c4b cql3/query_options: Use _value_views in prepare()
_value_views is the authoritative data structure for the
client-specified values. Indeed, the ctor called
transport::request::read_options() leaves _values completely empty.

In query_options::prepare() we were, however, using _values to
associate values with the client-specified column names, and not
_value_views. Fix this by using _value_views instead.

As for the reasons we didn't see this bug earlier, I assume it's
because very few drivers set the 0x04 query options flag, which means
column names are omitted. This is the right thing to do since most
drivers have enough information to correctly position the values.

Fixes #3688

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180814234605.14775-1-duarte@scylladb.com>
(cherry picked from commit a4355fe7e7)
2018-08-21 18:23:10 +03:00
Gleb Natapov
c1cb779dd2 storage_proxy: do not fail read without speculation on connection error
After ac27d1c93b, if a read executor has just enough targets to
achieve the request's CL and a connection to one of them is dropped
during execution, a ReadFailed error is returned immediately and the
client does not get a chance to issue a speculative read (retry). This
patch changes the code to not return the ReadFailed error immediately,
but to wait for the timeout instead, giving the client a chance to
issue a speculative read when the read executor has no additional
targets to send speculative reads to by itself.

Fixes #3699.
Message-Id: <20180819131646.GK2326@scylladb.com>

(cherry picked from commit 7277ee2939)
2018-08-20 13:06:51 +03:00
Hagit Segev
b47d18f9fd support 2.3 RC2 2018-08-19 20:17:24 +03:00
Tomasz Grabiec
f8713b019e mutation_partition: Fix exception safety of row::apply_monotonically()
When emplace_back() fails, value is already moved-from into a
temporary, which breaks monotonicity expected from
apply_monotonically(). As a result, writes to that cell will be lost.

The fix is to avoid the temporary by in-place construction of
cell_and_hash. To do that, appropriate cell_and_hash constructor was
added.

Found by mutation_test.cc::test_apply_monotonically_is_monotonic with
some modifications to the random mutation generator.

Introduced in 99a3e3a.

Fixes #3678.

Message-Id: <1533816965-27328-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 024b3c9fd9)
2018-08-13 10:44:27 +02:00
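The pitfall this commit fixes is generic: building a temporary aggregate moves the value *before* the container call runs, so a throwing `push_back`/`emplace_back` loses it. A minimal sketch (the `cell_and_hash` stand-in and `append_cell` helper are simplified illustrations, not the actual Scylla types):

```cpp
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for cell_and_hash. The dedicated constructor is
// what lets vector::emplace_back build the element in place, after any
// reallocation has already succeeded.
struct cell_and_hash {
    std::string cell;
    long hash;
    cell_and_hash(std::string c, long h) : cell(std::move(c)), hash(h) {}
};

// Risky pattern: the temporary is constructed (and `cell` moved-from)
// *before* push_back runs, so if push_back throws on reallocation the
// caller's value is already gone:
//   v.push_back(cell_and_hash{std::move(cell), hash});
//
// Safer pattern: forward the constructor arguments; the move into the
// element happens only during in-place construction, after storage for
// the new element was obtained.
void append_cell(std::vector<cell_and_hash>& v, std::string cell, long hash) {
    v.emplace_back(std::move(cell), hash);
}
```

If `emplace_back` fails to allocate, the arguments are untouched and the caller still holds the original cell, which is exactly the monotonicity property the commit restores.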
Takuya ASADA
cd5e4eace5 dist/common/scripts/scylla_setup: don't proceed with RAID setup until user types 'done'
Need to wait for user confirmation before running RAID setup.

See #3659
Fixes #3681

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180810194507.1115-1-syuu@scylladb.com>
(cherry picked from commit 2ef1b094d7)
2018-08-12 15:08:47 +03:00
Takuya ASADA
4fb5403670 dist/common/scripts/scylla_setup: don't mention the interactive mode prompt when running in non-interactive mode
Skip showing the message when in non-interactive mode.

Fixes #3674

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180810191945.32693-1-syuu@scylladb.com>
(cherry picked from commit b7cf3d7472)
2018-08-12 15:08:37 +03:00
Takuya ASADA
e9df6c42ce dist/common/scripts/scylla_setup: check existence of housekeeping.cfg before asking to run version check
Skip asking to run the version check when housekeeping.cfg already
exists.
Fixes #3657

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180807232313.15525-1-syuu@scylladb.com>
(cherry picked from commit ef9475dd3c)
2018-08-12 15:08:20 +03:00
Takuya ASADA
5fdf492ccc dist/debian: fix install scylla-server.service
In a previous commit we moved debian/scylla-server.service to
debian/scylla-server.scylla-server.service to explicitly specify the
subpackage name, but that doesn't work for dh_installinit without the
'--name' option.

As a result, the current scylla-server .deb package is missing
scylla-server.service, so we need to rename the service file back to
its original name.

Fixes #3675

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180810221944.24837-1-syuu@scylladb.com>
(cherry picked from commit f30b701872)
2018-08-12 15:07:28 +03:00
Duarte Nunes
fd2b02a12c Merge 'JSON support fixes' from Piotr
"
This series addresses SELECT/INSERT JSON support issues, namely
handling null values properly and parsing decimals from strings.
It also comes with updated cql tests.

Tests: unit (release)
"

Fixes #3666
Fixes #3664
Fixes #3667

* 'json_fixes_3' of https://github.com/psarna/scylla:
  cql3: remove superfluous null conversions in to_json_string
  tests: update JSON cql tests
  cql3: enable parsing decimal JSON values from string
  cql3: add missing return for dead cells
  cql3: simplify parsing optional JSON values
  cql3: add handling null value in to_json
  cql3: provide to_json_string for optional bytes argument

(cherry picked from commit 95677877c2)
2018-08-12 15:05:43 +03:00
Takuya ASADA
f8cec2f891 dist/common/scripts: pass format variables to colorprint()
When we use str.format() to pass variables into the message, it always
causes an exception like "KeyError: 'red'", since the message contains
color variables that are not passed to str.format().
To avoid the error we need to pass all format variables to colorprint()
and run str.format() inside the function.

Fixes #3649

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180803015216.14328-1-syuu@scylladb.com>
(cherry picked from commit ad7bc313f7)
2018-08-09 10:46:19 +03:00
Avi Kivity
e4d6577ef2 Update seastar submodule
* seastar 814a055...22437af (1):
  > tls.cc: Make "close" timeout delay exception proof

Fixes #3461.
2018-08-08 13:35:10 +03:00
Takuya ASADA
346027248d dist/common/scripts/scylla_setup: print message when EC2 instance is optimized for Scylla
Currently scylla_ec2_check exits silently when the EC2 instance is
optimized for Scylla; the result of the check is not clear, so we need
to output a message.

Note that this change affects the AMI login prompt too.

Fixes #3655

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180808024256.9601-1-syuu@scylladb.com>
(cherry picked from commit 15825d8bf1)
2018-08-08 13:26:52 +03:00
Takuya ASADA
2cf6191353 dist/common/scripts/scylla_setup: fix typo on interactive setup
Scylls -> Scylla

Fixes #3656

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180808002443.1374-1-syuu@scylladb.com>
(cherry picked from commit 652eb5ae0e)
2018-08-08 13:26:52 +03:00
Avi Kivity
b52d647de2 docker: adjust for script conversion to Python
Since our scripts were converted to Python, we can no longer
source them from a shell. Execute them directly instead. Also,
we now need to import configuration variables ourselves, since
scylla_prepare, being an independent process, won't do it for
us.

Fixes #3647
Message-Id: <20180802153017.11112-1-avi@scylladb.com>

(cherry picked from commit c9caaa8e6e)
2018-08-07 18:58:44 +03:00
Takuya ASADA
f7c96a37f1 dist/common/scripts/scylla_setup: use specified NIC ifname correctly
The interactive NIC selection prompt always mistakenly returns 'eth0'
as the selected NIC name; this needs to be fixed.

Fixes #3651

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180803020724.15155-1-syuu@scylladb.com>
(cherry picked from commit a300926495)
2018-08-07 09:51:25 +03:00
Jesse Haber-Kucharsky
ae71ffdcfd auth: Don't use unsupported hashing algorithms
In previous versions of Fedora, the `crypt_r` function returned
`nullptr` when a requested hashing algorithm was not supported.

This is consistent with the documentation of the function in its man
page.

As of Fedora 28, the function's behavior changes so that the encrypted
text is not `nullptr` on error, but instead the string "*0".

The info pages for `crypt_r` clarify somewhat (and contradict the man
pages):

    Some implementations return `NULL` on failure, and others return an
    _invalid_ hashed passphrase, which will begin with a `*` and will
    not be the same as SALT.

Because of this change of behavior, users running Scylla on a Fedora 28
machine which was upgraded from a previous release would not be able to
authenticate: an unsupported hashing algorithm would be selected,
producing encrypted text that did not match the entry in the table.

With this change, unsupported algorithms are correctly detected and
users should be able to continue to authenticate themselves.

Fixes #3637.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <bcd708f3ec195870fa2b0d147c8910fb63db7e0e.1533322594.git.jhaberku@scylladb.com>
(cherry picked from commit fce10f2c6e)
2018-08-05 10:30:16 +03:00
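The detection logic described above reduces to accepting a `crypt_r` result only when it is neither `NULL` (old failure mode) nor a `*`-prefixed failure token like `"*0"` (new glibc behavior). A minimal sketch, with a hypothetical helper name and without linking against libcrypt:

```cpp
// Hypothetical validity check for the result of crypt()/crypt_r():
// treat both NULL and the "*"-prefixed failure tokens ("*0", "*1") as
// "unsupported algorithm / error", covering both the pre-Fedora-28
// behavior (NULL on failure) and the newer one (an invalid hash
// beginning with '*').
bool crypt_result_ok(const char* hashed) {
    return hashed != nullptr && hashed[0] != '*';
}
```

Checking the prefix rather than comparing against the exact string `"*0"` also covers `"*1"`, which some implementations return instead.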
Avi Kivity
a235900388 Merge "Fix exception safety in imr::utils::object" from Paweł
"

There is an exception safety problem in imr::utils::object. If multiple
memory allocations are needed and one of them fails, the main object is
going to be freed (as expected). However, at this stage it is not
constructed yet, so when the LSA asks its migrator for the size it may
get a meaningless value. The solution is to remember the size until the
object is fully created and use sized deallocation in case of failures.

Fixes #3618.

Tests: unit(release, debug/imr_test)
"

(cherry picked from commit 3b42fcfeb2)
2018-08-03 11:54:53 +03:00
Takuya ASADA
be9f150341 dist/debian: install *.service on correct subpackage
We were mistakenly installing scylla-housekeeping-*.service into the
scylla-conf package; all *.service files should explicitly specify the
subpackage name.

Fixes #3642

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180801233042.307-1-syuu@scylladb.com>
(cherry picked from commit 1bb463f7e5)
2018-08-03 11:53:22 +03:00
Gleb Natapov
2478fa1f6e storage_proxy: fix rpc connection failure handling by read operation
Currently rpc::closed_error is not counted towards replica failure
during read and thus read operation waits for timeout even if one
of the nodes dies. Fix this by counting rpc::closed_error towards
failed attempts.

Fixes #3590.

Message-Id: <20180708123522.GC28899@scylladb.com>
(cherry picked from commit ac27d1c93b)
2018-08-02 11:41:58 +03:00
Gleb Natapov
d95ac1826e cache_hitrate_calculator: fix race when new table is added during calculations
The calculation consists of several parts with preemption point between
them, so a table can be added while calculation is ongoing. Do not
assume that table exists in intermediate data structure.

Fixes #3636

Message-Id: <20180801093147.GD23569@scylladb.com>
(cherry picked from commit 44a6afad8c)
2018-08-01 14:30:45 +03:00
Avi Kivity
6fc17345e9 Merge "No infinite time-outs for internal distributed queries" from Jesse
"
This series replaces infinite time-outs in internal distributed
(non-local) CQL queries with finite ones.

The implementation of tracing, which also performs internal queries,
already has finite time-outs, so it is unchanged.

Fixes #3603.
"

* 'jhk/finite_time_outs/v2' of https://github.com/hakuch/scylla:
  Use finite time-outs for internal auth. queries
  Use finite query time-outs for `system_distributed`

(cherry picked from commit 620e950fc8)
2018-08-01 14:23:49 +03:00
Takuya ASADA
4bfa0ae247 dist/common/scripts/scylla_ntp_setup: fix typo
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1533070539-2147-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 2cd99d800b)
2018-08-01 10:58:36 +03:00
Avi Kivity
174b7870e6 Update ami submodule
* dist/ami/files/scylla-ami d53834f...c7e5a70 (1):
  > ds2_configure.py: uncomment 'cluster_name' when it's commented out
2018-07-31 09:33:46 +03:00
Takuya ASADA
e95b4ee825 dist/common/scripts/scylla_ntp_setup: fix typo
Comment on Python is "#" not "//".

Fixes #3629

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180730091022.4512-1-syuu@scylladb.com>
(cherry picked from commit 032b26deeb)
2018-07-30 13:53:14 +03:00
Takuya ASADA
464305de1c dist/common/scripts/scylla_ntp_setup: ignore ntpdate error
Even if ntpdate fails to adjust the clock, ntpd may be able to recover
it later; ignore ntpdate errors and keep running the script.

Fixes #3629

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180726080206.28891-1-syuu@scylladb.com>
(cherry picked from commit 8e4d1350c9)
2018-07-30 13:53:08 +03:00
Avi Kivity
3a1a9e1a11 dist: redhat: fix up bad file ownership of rpms/srpms
mock outputs files owned by root. This causes attempts
by scripts that want to junk the working directory (typically
continuous integration) to fail on permission errors.

Fixup those permissions after the fact.
Message-Id: <20180719163553.5186-1-avi@scylladb.com>

(cherry picked from commit b167647bf6)
2018-07-26 14:22:52 +03:00
Avi Kivity
90dac5d944 Merge "Fix JSON string quoting" from Piotr
"

This mini-series covers a regression caused by newest versions
of jsoncpp library, which changed the way of quoting UTF-8 strings.

Tests: unit (release)
"

* 'add_json_quoting_3' of https://github.com/psarna/scylla:
  tests: add JSON unit test
  types: use value_to_quoted_string in JSON quoting
  json: add value_to_quoted_string helper function

Ref #3622.
Reviewed-by: Nadav Har'El <nyh@scylladb.com>

(cherry picked from commit d6ef74fe36)
2018-07-26 12:03:35 +03:00
Piotr Sarna
e5a83d105c cql3: fix INSERT JSON grammar
Previously the CQL grammar wrongfully required INSERT JSON queries
to provide a list of columns, even though they are already
present in the JSON itself.
Unfortunately, tests were written with this false assumption as well,
so they are updated too.
Message-Id: <33b496cba523f0f27b6cbf5539a90b6feb20269e.1532514111.git.sarna@scylladb.com>

Fixes #3631.

(cherry picked from commit f66aace685)
2018-07-25 14:53:35 +01:00
Takuya ASADA
9b4a0a2879 dist/debian: fix ImportError on pystache
Seems like pystache is not pulled in as a dependency, so we need to
install it in build_deb.sh.

Fixes #3627

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180724164852.16094-1-syuu@scylladb.com>
(cherry picked from commit 58f094e06d)
2018-07-25 11:39:22 +03:00
Pekka Enberg
adad12ddc3 release: prepare for 2.3.rc1 2018-07-24 09:21:38 +03:00
Avi Kivity
a77bb1fe34 Merge "row_cache: Fix violation of continuity on concurrent eviction and population" from Tomasz
"
The problem happens under the following circumstances:

  - we have a partially populated partition in cache, with a gap in the middle

  - a read with no clustering restrictions trying to populate that gap

  - eviction of the entry for the lower bound of the gap concurrent with population

The population may incorrectly mark the range before the gap as continuous.
This may result in temporary loss of writes in that clustering range. The
problem heals by clearing cache.

Caught by row_cache_test::test_concurrent_reads_and_eviction, which has been
failing sporadically.

The problem is in ensure_population_lower_bound(), which returns true if the
current clustering range covers all rows, which means that the populator has
the right to set the continuity flag to true on the row it inserts. This is
correct only if the current population range actually starts before all
clustering rows. Otherwise, we're populating from _last_row and should
consult it.

Fixes #3608.
"

* 'tgrabiec/fix-violation-of-continuity-on-concurrent-read-and-eviction' of github.com:tgrabiec/scylla:
  row_cache: Fix violation of continuity on concurrent eviction and population
  position_in_partition: Introduce is_before_all_clustered_rows()

(cherry picked from commit 31151cadd4)
2018-07-18 12:05:51 +02:00
Tomasz Grabiec
3c7e6dfdb9 mutation_partition: Fix exception-safety of row copy constructor
In case population of the vector throws, the vector object would not
be destroyed. It's a managed object, so in addition to causing a leak,
it would corrupt memory if later moved by the LSA, because it would
try to fixup forward references to itself.

Caused sporadic failures and crashes of row_cache_test, especially
with allocation failure injector enabled.

Introduced in 27014a23d7.
Message-Id: <1531757764-7638-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 3f509ee3a2)
2018-07-17 18:25:12 +02:00
Amos Kong
fab136ae1d scylla_setup: nic setup dialog is only for interactive mode
The current code raises the dialog even in non-interactive mode when we pass
options while executing scylla_setup. This blocked the automated artifact test.

Fixes #3549

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <58f90e1e2837f31d9333d7e9fb68ce05208323da.1531824972.git.amos@scylladb.com>
(cherry picked from commit 0fcdab8538)
2018-07-17 18:24:23 +03:00
Botond Dénes
a4218f536b storage_proxy: use the original row limits for the final results merging
`query_partition_key_range()` does the final result merging and trimming
(if necessary) to make sure we don't send more rows to the client than
requested. This merging and trimming is done by a continuation attached
to the `query_partition_key_range_concurrent()` which does the actual
querying. The continuations captures via value the `row_limit` and
`partition_limit` fields of the `query::read_command` object of the
query. This has an unexpected consequence. The lambda object is
constructed after the call to `query_partition_key_range_concurrent()`
returns. If this call doesn't defer, any modifications done to the read
command object by `query_partition_key_range_concurrent()` will be
visible to the lambda. This is undesirable because
`query_partition_key_range_concurrent()` updates the read command object
directly as the vnodes are traversed which in turn will result in the
lambda doing the final trimming according to a decremented `row_limits`,
which will cause the paging logic to declare the query as exhausted
prematurely because the page will not be full.
To avoid all this make a copy of the relevant limit fields before
`query_partition_key_range_concurrent()` is called and pass these copies
to the continuation, thus ensuring that the final trimming will be done
according to the original page limits.

Spotted while investigating a dtest failure on my 1865/range-scans/v2
branch. On that branch the way range scans are executed on replicas is
completely refactored. These changes apparently reduce the number of
continuations in the read path to the point where an entire page can be
filled without deferring and thus causing the problem to surface.

Fixes #3605.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <f11e80a6bf8089d49ba3c112b25a69edf1a92231.1531743940.git.bdenes@scylladb.com>
(cherry picked from commit cc4acb6e26)
2018-07-16 16:55:12 +03:00
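The bug above is a general capture-timing pitfall: a lambda created *after* a call that mutates shared state captures the mutated value, not the original. A minimal reproduction under assumed, simplified names (`read_command`, `run_query`, and the trimmer factories are illustrative, not Scylla's real API):

```cpp
#include <functional>

// Stand-in for query::read_command: run_query() decrements row_limit
// as rows are "consumed" while scanning, mimicking how
// query_partition_key_range_concurrent() updates the command in place.
struct read_command { int row_limit; };

int run_query(read_command& cmd) {
    cmd.row_limit -= 3;   // pretend 3 rows were consumed during the scan
    return 3;
}

// Buggy: the continuation is built after run_query() returns, so it
// captures the already-decremented limit and trims too aggressively.
std::function<int()> make_trimmer_buggy(read_command& cmd) {
    run_query(cmd);
    return [limit = cmd.row_limit] { return limit; };
}

// Fixed: snapshot the original page limit before the mutating call and
// hand the copy to the continuation.
std::function<int()> make_trimmer_fixed(read_command& cmd) {
    const int limit = cmd.row_limit;   // copy of the original limit
    run_query(cmd);
    return [limit] { return limit; };
}
```

Whether the bug bites depends on whether `run_query` defers before mutating, which is exactly why the original problem only surfaced once the read path stopped deferring.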
Takuya ASADA
9f4431ef04 dist/common/scripts/scylla_prepare: fix error when /etc/scylla/ami_disabled exists
In this part a shell command wasn't converted to Python 3; it needs fixing.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180715075015.13071-1-syuu@scylladb.com>
(cherry picked from commit 9479ff6b1e)
2018-07-16 09:56:57 +03:00
Takuya ASADA
66250bf8cc dist/redhat: drop scylla_lib.sh from .rpm
Since we dropped scylla_lib.sh in 58e6ad22b2,
we need to remove it from the RPM spec file too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180712155129.17056-1-syuu@scylladb.com>
(cherry picked from commit 1511d92473)
2018-07-16 09:44:48 +03:00
Takuya ASADA
88fe3c2694 dist/common/scripts/scylla_ec2_check: support custom NIC ifname on EC2
Since some AMIs use consistent network device naming, the primary NIC
ifname is not 'eth0'.
But we hardcoded the NIC name as 'eth0' in scylla_ec2_check, so we need
to add a --nic option to specify a custom NIC ifname.

Fixes #3584

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180712142446.15909-1-syuu@scylladb.com>
(cherry picked from commit ee61660b76)
2018-07-16 09:44:26 +03:00
Takuya ASADA
db4c3d3e52 dist/common/scripts/scylla_util.py: fix typo
Fix a typo, and rename get_mode_cpu_set() to get_mode_cpuset(), since
the term 'cpuset' does not include '_' in other places.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180711141923.12675-1-syuu@scylladb.com>
(cherry picked from commit 8f80d23b07)
2018-07-16 09:43:47 +03:00
Takuya ASADA
ca22a1cd1a dist/common/scripts: drop scylla_lib.sh
Drop scylla_lib.sh since all bash scripts depending on the library have
already been converted to Python 3, and all scylla_lib.sh features are
implemented in scylla_util.py.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180711114756.21823-1-syuu@scylladb.com>
(cherry picked from commit 58e6ad22b2)
2018-07-16 09:43:25 +03:00
Avi Kivity
f9b702764e Update scylla-ami submodule
* dist/ami/files/scylla-ami 5200f3f...d53834f (1):
  > Merge "AMI scripts python3 conversion" from Takuya

(cherry picked from commit 83d72f3755)
2018-07-16 09:43:15 +03:00
Avi Kivity
54701bd95c Merge "more conversion from bash to python3" from Takuya
"Converted more scripts to python3."

* 'script_python_conversion2_v2' of https://github.com/syuu1228/scylla:
  dist/common/scripts/scylla_util.py: make run()/out() functions shorter
  dist/ami: install python34 to run scylla_install_ami
  dist/common/scripts/scylla_ec2_check: move ec2 related code to class aws_instance
  dist/common/scripts: drop class concolor, use colorprint()
  dist/ami/files/.bash_profile: convert almost all lines to python3
  dist/common/scripts: convert node_exporter_install to python3
  dist/common/scripts: convert scylla_stop to python3
  dist/common/scripts: convert scylla_prepare to python3

(cherry picked from commit 693cf77022)
2018-07-16 09:41:50 +03:00
Asias He
30eca5f534 storage_service: Limit the number of times the REPLICATION_FINISHED verb can retry
In the removenode operation, if the message servicing is stopped, e.g., due
to disk io error isolation, the node can keep retrying the
REPLICATION_FINISHED verb infinitely.

A Scylla log full of such messages was observed:

[shard 0] storage_service - Fail to send REPLICATION_FINISHED to $IP:0:
seastar::rpc::closed_error (connection is closed)

To fix, limit the number of retries.

Tests: update_cluster_layout_tests.py

Fixes #3542

Message-Id: <638d392d6b39cc2dd2b175d7f000e7fb1d474f87.1529927816.git.asias@scylladb.com>
(cherry picked from commit bb4d361cf6)
2018-07-16 09:33:56 +03:00
Piotr Sarna
cd057d3882 database: make drop_column_family wait on reads in progress
drop_column_family now waits for both writes and reads in progress.
It solves possible liveness issues with row cache, when column_family
could be dropped prematurely, before the read request was finished.

Phaser operation is passed inside database::query() call.
There are other places where reading logic is applied (e.g. view
replicas), but these are guarded with different synchronization
mechanisms, while _pending_reads_phaser applies to regular reads only.

Fixes #3357

Reported-by: Duarte Nunes <duarte@scylladb.com>
Signed-off-by: Piotr Sarna <sarna@scylladb.com>
Message-Id: <d58a5ee10596d0d62c765ee2114ac171b6f087d2.1529928323.git.sarna@scylladb.com>
(cherry picked from commit 03753cc431)
2018-07-16 09:32:15 +03:00
Piotr Sarna
c5a5a2265e database: add phaser for reads
Currently drop_column_family waits on write_in_progress phaser,
but there's no such mechanism for reads. This commit adds
a corresponding reads phaser.

Refs #3357

Reported-by: Duarte Nunes <duarte@scylladb.com>
Signed-off-by: Piotr Sarna <sarna@scylladb.com>
Message-Id: <70b5fdd44efbc24df61585baef024b809cabe527.1529928323.git.sarna@scylladb.com>
(cherry picked from commit e1a867cbe3)
2018-07-16 09:32:06 +03:00
Takuya ASADA
3e482c6c9d dist/common/scripts/scylla_util.py: use os.open(O_EXCL) to verify disk is unused
To simplify is_unused_disk(), just try to open the disk instead of
checking multiple block subsystems.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180709102729.30066-1-syuu@scylladb.com>
(cherry picked from commit 1a5a40e5f6)
2018-07-11 12:51:17 +03:00
Avi Kivity
5b6cadb890 Update scylla-ami submodule
* dist/ami/files/scylla-ami 67293ba...5200f3f (1):
  > Add custom script options to AMI user-data

(cherry picked from commit 7d0df2a06d)
2018-07-11 12:51:08 +03:00
Takuya ASADA
9cf8cd6c02 dist/common/scripts/scylla_util.py: strip double quote from sysconfig parameter
Currently sysconfig_parser.get() returns the parameter including double
quotes, which causes problems when appending text using
sysconfig_parser.set().

Fixes #3587

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180706172219.16859-1-syuu@scylladb.com>
(cherry picked from commit 929ba016ed)
2018-07-11 12:51:01 +03:00
Vlad Zolotarov
b34567b69b dist: scylla_lib.sh: get_mode_cpu_set: split the declaration and assignment to the local variable
In bash, a local variable declaration is a separate operation with its own exit
status (always 0); therefore constructs like

local var=`cmd`

will always result in the 0 exit status ($? value) regardless of the actual
result of "cmd" invocation.

To overcome this we should split the declaration and the assignment to be like this:

local var
var=`cmd`

Fixes #3508

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1529702903-24909-3-git-send-email-vladz@scylladb.com>
(cherry picked from commit 7495c8e56d)
2018-07-11 12:50:51 +03:00
Vlad Zolotarov
02b763ed97 dist: scylla_lib.sh: get_mode_cpu_set: don't let the error messages out
References #3508

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1529702903-24909-2-git-send-email-vladz@scylladb.com>
(cherry picked from commit f3ca17b1a1)
2018-07-11 12:50:43 +03:00
Takuya ASADA
05500a52d7 dist/common/scripts/scylla_sysconfig_setup: fix typo
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180705133313.16934-1-syuu@scylladb.com>
(cherry picked from commit 4df982fe07)
2018-07-11 12:50:32 +03:00
Avi Kivity
4afa558e97 Update scylla-ami submodule
* dist/ami/files/scylla-ami 0fd9d23...67293ba (1):
  > scylla_install_ami: fix broken argument parser

Fixes #3578.

(cherry picked from commit dd083122f9)
2018-07-11 12:50:24 +03:00
Takuya ASADA
f3956421f7 dist/ami: hardcode target for scylla_current_repo since we don't have --target option anymore
We broke build_ami.sh when we dropped Ubuntu support: the scylla_current_repo
command does not finish because of a missing argument ('--target' with no
distribution name, since $TARGET is always blank now).
It needs to be hardcoded as centos.

Fixes #3577

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180705035251.29160-1-syuu@scylladb.com>
(cherry picked from commit 3bcc123000)
2018-07-11 12:49:52 +03:00
Takuya ASADA
a17a6ce8f5 dist/debian/build_deb.sh: simplify build_deb.sh
Use is_debian()/is_ubuntu() to detect the target distribution, and install
pystache by path since the package name differs between Fedora and
CentOS.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180703193224.4773-1-syuu@scylladb.com>
(cherry picked from commit 3cb7ddaf68)
2018-07-11 12:49:40 +03:00
Takuya ASADA
58a362c1f2 dist/ami/files/.bash_profile: drop Ubuntu support
Drop Ubuntu support from the login prompt, too.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180703192813.4589-1-syuu@scylladb.com>
(cherry picked from commit ed1d0b6839)
2018-07-11 12:49:30 +03:00
Alexys Jacob
361b2dd7a5 Support Gentoo Linux on node_health_check script.
Gentoo Linux was not supported by the node_health_check script,
which resulted in the following error message being displayed:

"This s a Non-Supported OS, Please Review the Support Matrix"

This patch adds support for Gentoo Linux and adds a TODO note
about supporting authenticated clusters, which the script does
not handle yet.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20180703124458.3788-1-ultrabug@gentoo.org>
(cherry picked from commit 8c03c1e2ce)
2018-07-11 12:49:22 +03:00
Duarte Nunes
f6a2bafae2 Merge 'Expose sharding information to connections' from Avi
"
In the same way that drivers can route requests to a coordinator that
is also a replica of the data used by the request, we can allow
drivers to route requests directly to the shard. This patchset
adds and documents a way for drivers to know which shard a connection
is connected to, and how to perform this routing.
"

* tag 'shard-info-alt/v1' of https://github.com/avikivity/scylla:
  doc: documented protocol extension for exposing sharding
  transport: expose more information about sharding via the OPTIONS/SUPPORTED messages
  dht: add i_partitioner::sharding_ignore_msb()

(cherry picked from commit 33d7de0805)
2018-07-09 17:06:30 +03:00
Avi Kivity
2ec25a55cd Update seastar submodule
* seastar d7f35d7...814a055 (1):
  > reactor: pollable_fd: limit fragment count to IOV_MAX
2018-07-09 17:05:26 +03:00
Avi Kivity
d3fb7c5515 .gitmodules: branch seastar
This allows us to backport individual patches to seastar for
branch-2.3.
2018-07-09 17:03:50 +03:00
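Branching a submodule can be sketched as follows (the branch name follows the commit message; the exact invocation used for this change is an assumption):

```shell
# Record, in .gitmodules, that the 'seastar' submodule tracks branch-2.3;
# afterwards 'git submodule update --remote seastar' pulls from that branch
# instead of the superproject's recorded commit.
cd "$(mktemp -d)"
git config -f .gitmodules submodule.seastar.branch branch-2.3
git config -f .gitmodules --get submodule.seastar.branch   # prints: branch-2.3
```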
Botond Dénes
b1ac6a36f2 tests/cql_query_test: add unit test for querying empty ranges
A bug was found recently (#3564) in the paging logic, where the code
assumed the queried ranges list is non-empty. This assumption is
incorrect as there can be valid (if rare) queries that can result in the
ranges list to be empty. Add a unit test that executes such a query with
paging enabled to detect any future bugs related to assumptions about
the ranges list being non-empty.

Refs: #3564
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <f5ba308c4014c24bb392060a7e72e7521ff021fa.1530618836.git.bdenes@scylladb.com>
(cherry picked from commit c236a96d7d)
Message-Id: <af315aef64d381a7f486ba190c9a1b5bdd6f800b.1530698046.git.bdenes@scylladb.com>
2018-07-04 12:13:33 +02:00
Botond Dénes
8cba125bce query_pager: use query::is_single_partition() to check for singular range
Use query::is_single_partition() to check whether the queried ranges are
singular or not. The current method of using
`dht::partition_range::is_singular()` is incorrect, as it is possible to
build a singular range that doesn't represent a single partition.
`query::is_single_partition()` correctly checks for this so use it
instead.

Found during code-review.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <f671f107e8069910a2f84b14c8d22638333d571c.1530675889.git.bdenes@scylladb.com>
(cherry picked from commit 8084ce3a8e)
2018-07-04 12:03:18 +02:00
Tomasz Grabiec
f46f9f7533 Merge "Fix atomic_cell_or_collection::external_memory_usage()" from Paweł
After the transition to the new in-memory representation in
aab6b0ee27 'Merge "Introduce new in-memory
representation for cells" from Paweł'
atomic_cell_or_collection::external_memory_usage() stopped accounting
for the externally stored data. Since it wasn't covered by the unit
tests, the bug remained unnoticed until now.

This series fixes the memory usage calculation and adds proper unit
tests.

* https://github.com/pdziepak/scylla.git fix-external-memory-usage/v1:
  tests/mutation: properly mark atomic_cells that are collection members
  imr::utils::object: expose size overhead
  data::cell: expose size overhead of external chunks
  atomic_cell: add external chunks and overheads to
    external_memory_usage()
  tests/mutation: test external_memory_usage()

(cherry picked from commit 2ffb621271)
2018-07-04 11:45:06 +02:00
Botond Dénes
090d991f8e query_pager: be prepared for _ranges being empty
do_fetch_page() checks in the beginning whether there is a saved query
state already, meaning this is not the first page. If there is not, it
checks whether the query is for a single partition or a range scan
to decide whether to enable stateful queries. This check
assumed that there is at least one range in _ranges, which does not hold
under some circumstances. Add a check for _ranges being empty.

Fixes: #3564
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <cbe64473f8013967a93ef7b2104c7ca0507afac9.1530610709.git.bdenes@scylladb.com>
(cherry picked from commit 59a30f0684)
2018-07-03 18:33:25 +03:00
Avi Kivity
ae15a80d01 Merge "more scylla_setup fixes" from Takuya
"
Added NIC / disk existence checks and a --force-raid mode to
scylla_raid_setup.
"

* 'scylla_setup_fix4' of https://github.com/syuu1228/scylla:
  dist/common/scripts/scylla_raid_setup: verify specified disks are unused
  dist/common/scripts/scylla_raid_setup: add --force-raid to construct raid even only one disk is specified
  dist/common/scripts/scylla_setup: don't accept disk path if it's not block device
  dist/common/scripts/scylla_raid_setup: verify specified disk paths are block device
  dist/common/scripts/scylla_sysconfig_setup: verify NIC existance

(cherry picked from commit a36b1f1967)
2018-07-03 18:33:04 +03:00
Takuya ASADA
6cf902343a scripts: merge scylla_install_pkg to scylla-ami
scylla_install_pkg was initially written for the one-liner installer, but now
it is only used for creating the AMI, and it is just a few lines of code, so it
should be merged into the scylla_install_ami script.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180612150106.26573-2-syuu@scylladb.com>
(cherry picked from commit 084c824d12)
2018-07-03 18:32:58 +03:00
Takuya ASADA
d5e59f671c dist/ami: drop Ubuntu AMI support
Drop the Ubuntu AMI since it has not been maintained for a long time, and we
have no plan to officially provide it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20180612150106.26573-1-syuu@scylladb.com>
(cherry picked from commit fafcacc31c)
2018-07-03 18:32:53 +03:00
Avi Kivity
38944655c5 Update scylla-ami submodule
* dist/ami/files/scylla-ami 36e8511...0fd9d23 (2):
  > scylla_install_ami: merge scylla_install_pkg
  > scylla_install_ami: drop Ubuntu AMI

(cherry picked from commit 677991f353)
2018-07-03 18:32:45 +03:00
Avi Kivity
06e274ff34 Merge "scylla_setup fixes" from Takuya
"
I found problems in the previously submitted patchsets 'scylla_setup fixes'
and 'more fixes for scylla_setup', so I fixed them and merged them into one
patchset.

Also added a few more patches.
"

* 'scylla_setup_fix3' of https://github.com/syuu1228/scylla:
  dist/common/scripts/scylla_setup: allow input multiple disk paths on RAID disk prompt
  dist/common/scripts/scylla_raid_setup: skip constructing RAID0 when only one disk specified
  dist/common/scripts/scylla_raid_setup: fix module import
  dist/common/scripts/scylla_setup: check disk is used in MDRAID
  dist/common/scripts/scylla_setup: move unmasking scylla-fstrim.timer on scylla_fstrim_setup
  dist/common/scripts/scylla_setup: use print() instead of logging.error()
  dist/common/scripts/scylla_setup: implement do_verify_package() for Gentoo Linux
  dist/common/scripts/scylla_coredump_setup: run os.remove() when deleting directory is symlink
  dist/common/scripts/scylla_setup: don't include the disk on unused list when it contains partitions
  dist/common/scripts/scylla_setup: skip running rest of the check when the disk detected as used
  dist/common/scripts/scylla_setup: add a disk to selected list correctly
  dist/common/scripts/scylla_setup: fix wrong indent
  dist/common/scripts: sync instance type list for detect NIC type to latest one
  dist/common/scripts: verify systemd unit existance using 'systemctl cat'

(cherry picked from commit 0b148d0070)
2018-07-03 18:32:35 +03:00
Avi Kivity
c24d4a8acb Merge "Fix handling of stale write replies in storage_proxy" from Gleb
"
If a coordinator sends write requests with ID=X and restarts, it may get a reply to
a request after it restarts and sends another request with the same ID (but to
different replicas). This condition triggers an assert in the coordinator. Drop
the assertion in favor of a warning, and initialize the handler id in a way that makes
this situation less likely.

Fixes: #3153
"

* 'gleb/write-handler-id' of github.com:scylladb/seastar-dev:
  storage_proxy: initialize write response id counter from wall clock value
  storage_proxy: drop virtual from signal(gms::inet_address)
  storage_proxy: do not assert on getting an unexpected write reply

(cherry picked from commit a45c3aa8c7)
2018-07-02 11:56:52 +03:00
Nadav Har'El
5f95b76c65 repair: fix combination of "-pr" and "-local" repair options
When nodetool repair is used with the combination of the "-pr" (primary
range) and "-local" (only repair with nodes in the same DC) options,
Scylla needs to define the "primary ranges" differently: Rather than
assign one node in the entire cluster to be the primary owner of every
token, we need one node in each data-center - so that a "-local"
repair will cover all the tokens.

Fixes #3557.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20180701132445.21685-1-nyh@scylladb.com>
(cherry picked from commit 3194ce16b3)
2018-07-02 11:56:41 +03:00
Tomasz Grabiec
0bdb7e1e7c row_cache: Fix memtable reads concurrent with cache update missing writes
Introduced in 5b59df3761.

It is incorrect to erase entries from the memtable being moved to
cache if partition update can be preempted because a later memtable
read may create a snapshot in the memtable before memtable writes for
that partition are made visible through cache. As a result the read
may miss some of the writes which were in the memtable. The code was
checking for presence of snapshots when entering the partition, but
this condition may change if update is preempted. The fix is to not
allow erasing if update is preemptible.

This also caused SIGSEGVs because we were assuming that no such
snapshots will be created and hence were not invalidating iterators on
removal of the entries, which results in undefined behavior when such
snapshots are actually created.

Fixes SIGSEGV in dtest: limits_test.py:TestLimits.max_cells_test

Fixes #3532

Message-Id: <1530129009-13716-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit b464b66e90)
2018-07-01 15:36:21 +03:00
Avi Kivity
56ea4f3154 Merge "Disable sstable filtering based on min/max clustering key components" from Tomasz
"
With DateTiered and TimeWindow, there is a read optimization enabled
which excludes sstables based on overlap with recorded min/max values
of clustering key components. The problem is that it doesn't take into
account partition tombstones and static rows, which should still be
returned by the reader even if there is no overlap in the query's
clustering range. A read which returns no clustering rows can
mispopulate the cache, which will appear as a partition deletion or writes
to the static row being lost, until node restart or eviction of the
partition entry.

There is also a bad interaction between cache population on read and
that optimization. When the clustering range of the query doesn't
overlap with any sstable, the reader will return no partition markers
for the read, which leads cache populator to assume there is no
partition in sstables and it will cache an empty partition. This will
cause later reads of that partition to miss prior writes to that
partition until it is evicted from cache or node is restarted.

Disable until a more elaborate fix is implemented.

Fixes #3552
Fixes #3553
"

* tag 'tgrabiec/disable-min-max-sstable-filtering-v1' of github.com:tgrabiec/scylla:
  tests: Add test for slicing a mutation source with date tiered compaction strategy
  tests: Check that database conforms to mutation source
  database: Disable sstable filtering based on min/max clustering key components

(cherry picked from commit e1efda8b0c)
2018-06-27 17:01:28 +03:00
Calle Wilund
d9c178063c sstables::compress: Ensure unqualified compressor name if possible
Fixes #3546

Both older Origin and Scylla write "known" compressor names (i.e. those
in the Origin namespace) unqualified (e.g. LZ4Compressor).

This behaviour was not preserved in the virtualization change, but it
probably should be.

Message-Id: <20180627110930.1619-1-calle@scylladb.com>
(cherry picked from commit 054514a47a)
2018-06-27 17:01:22 +03:00
Avi Kivity
b21b7f73b9 version: prepare for scylla 2.3-rc0 2018-06-27 14:14:19 +03:00
111 changed files with 2127 additions and 909 deletions

.gitmodules
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
-url = ../seastar
+url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
-VERSION=666.development
+VERSION=2.3.3
if test -f version
then

View File

@@ -2228,11 +2228,11 @@
"description":"The column family"
},
"total":{
"type":"int",
"type":"long",
"description":"The total snapshot size"
},
"live":{
"type":"int",
"type":"long",
"description":"The live snapshot size"
}
}

View File

@@ -187,7 +187,24 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
return 0;
}
auto ctx = data::cell::context(_data.get(), t.imr_state().type_info());
return data::cell::structure::serialized_object_size(_data.get(), ctx);
auto view = data::cell::structure::make_view(_data.get(), ctx);
auto flags = view.get<data::cell::tags::flags>();
size_t external_value_size = 0;
if (flags.get<data::cell::tags::external_data>()) {
if (flags.get<data::cell::tags::collection>()) {
external_value_size = get_collection_mutation_view(_data.get()).data.size_bytes();
} else {
auto cell_view = data::cell::atomic_cell_view(t.imr_state().type_info(), view);
external_value_size = cell_view.value_size();
}
// Add overhead of chunk headers. The last one is a special case.
external_value_size += (external_value_size - 1) / data::cell::maximum_external_chunk_length * data::cell::external_chunk_overhead;
external_value_size += data::cell::external_last_chunk_overhead;
}
return data::cell::structure::serialized_object_size(_data.get(), ctx)
+ imr_object_type::size_overhead + external_value_size;
}
std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection& c) {

View File

@@ -28,6 +28,7 @@
#include "database.hh"
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "timeout_config.hh"
namespace auth {
@@ -94,4 +95,10 @@ future<> wait_for_schema_agreement(::service::migration_manager& mm, const datab
});
}
const timeout_config& internal_distributed_timeout_config() noexcept {
static const auto t = 5s;
static const timeout_config tc{t, t, t, t, t, t, t};
return tc;
}
}

View File

@@ -38,6 +38,7 @@
using namespace std::chrono_literals;
class database;
class timeout_config;
namespace service {
class migration_manager;
@@ -82,4 +83,9 @@ future<> create_metadata_table_if_missing(
future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
///
/// Time-outs for internal, non-local CQL queries.
///
const timeout_config& internal_distributed_timeout_config() noexcept;
}

View File

@@ -228,7 +228,7 @@ default_authorizer::modify(
return _qp.process(
query,
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config(),
{permissions::to_strings(set), sstring(role_name), resource.name()}).discard_result();
});
}
@@ -254,7 +254,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
return _qp.process(
query,
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config(),
{},
true).then([](::shared_ptr<cql3::untyped_result_set> results) {
std::vector<permission_details> all_details;
@@ -282,7 +282,7 @@ future<> default_authorizer::revoke_all(stdx::string_view role_name) const {
return _qp.process(
query,
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name)}).discard_result().handle_exception([role_name](auto ep) {
try {
std::rethrow_exception(ep);

View File

@@ -149,7 +149,9 @@ static sstring gensalt() {
// blowfish 2011 fix, blowfish, sha512, sha256, md5
for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
salt = pfx + input;
if (crypt_r("fisk", salt.c_str(), &tlcrypt)) {
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
if (e && (e[0] != '*')) {
prefix = pfx;
return salt;
}
@@ -184,7 +186,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
auto username = row.get_as<sstring>("username");
auto salted_hash = row.get_as<sstring>(SALTED_HASH);
@@ -192,7 +194,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
return _qp.process(
update_row_query,
consistency_for_user(username),
infinite_timeout_config,
internal_distributed_timeout_config(),
{std::move(salted_hash), username}).discard_result();
}).finally([results] {});
}).then([] {
@@ -209,7 +211,7 @@ future<> password_authenticator::create_default_if_missing() const {
return _qp.process(
update_row_query,
db::consistency_level::QUORUM,
infinite_timeout_config,
internal_distributed_timeout_config(),
{hashpw(DEFAULT_USER_PASSWORD), DEFAULT_USER_NAME}).then([](auto&&) {
plogger.info("Created default superuser authentication record.");
});
@@ -309,13 +311,17 @@ future<authenticated_user> password_authenticator::authenticate(
return _qp.process(
query,
consistency_for_user(username),
infinite_timeout_config,
internal_distributed_timeout_config(),
{username},
true);
}).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
try {
auto res = f.get0();
if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
auto salted_hash = std::experimental::optional<sstring>();
if (!res->empty()) {
salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
}
if (!salted_hash || !checkpw(password, *salted_hash)) {
throw exceptions::authentication_exception("Username and/or password are incorrect");
}
return make_ready_future<authenticated_user>(username);
@@ -337,7 +343,7 @@ future<> password_authenticator::create(stdx::string_view role_name, const authe
return _qp.process(
update_row_query,
consistency_for_user(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{hashpw(*options.password), sstring(role_name)}).discard_result();
}
@@ -355,7 +361,7 @@ future<> password_authenticator::alter(stdx::string_view role_name, const authen
return _qp.process(
query,
consistency_for_user(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{hashpw(*options.password), sstring(role_name)}).discard_result();
}
@@ -366,7 +372,10 @@ future<> password_authenticator::drop(stdx::string_view name) const {
meta::roles_table::qualified_name(),
meta::roles_table::role_col_name);
return _qp.process(query, consistency_for_user(name), infinite_timeout_config, {sstring(name)}).discard_result();
return _qp.process(
query, consistency_for_user(name),
internal_distributed_timeout_config(),
{sstring(name)}).discard_result();
}
future<custom_options> password_authenticator::query_custom_options(stdx::string_view role_name) const {

View File

@@ -79,7 +79,7 @@ future<bool> default_role_row_satisfies(
return qp.process(
query,
db::consistency_level::QUORUM,
infinite_timeout_config,
internal_distributed_timeout_config(),
{meta::DEFAULT_SUPERUSER_NAME},
true).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
@@ -104,7 +104,7 @@ future<bool> any_nondefault_role_row_satisfies(
return qp.process(
query,
db::consistency_level::QUORUM,
infinite_timeout_config).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
return false;
}

View File

@@ -196,6 +196,10 @@ future<> service::start() {
}
future<> service::stop() {
// Only one of the shards has the listener registered, but let's try to
// unregister on each one just to make sure.
_migration_manager.unregister_listener(_migration_listener.get());
return _permissions_cache->stop().then([this] {
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
});

View File

@@ -89,7 +89,7 @@ static future<stdx::optional<record>> find_record(cql3::query_processor& qp, std
return qp.process(
query,
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name)},
true).then([](::shared_ptr<cql3::untyped_result_set> results) {
if (results->empty()) {
@@ -174,7 +174,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
infinite_timeout_config,
internal_distributed_timeout_config(),
{meta::DEFAULT_SUPERUSER_NAME}).then([](auto&&) {
log.info("Created default superuser role '{}'.", meta::DEFAULT_SUPERUSER_NAME);
return make_ready_future<>();
@@ -201,7 +201,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
return _qp.process(
query,
db::consistency_level::QUORUM,
infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
role_config config;
config.is_superuser = row.get_as<bool>("super");
@@ -263,7 +263,7 @@ future<> standard_role_manager::create_or_replace(stdx::string_view role_name, c
return _qp.process(
query,
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name), c.is_superuser, c.can_login},
true).discard_result();
}
@@ -307,7 +307,7 @@ standard_role_manager::alter(stdx::string_view role_name, const role_config_upda
build_column_assignments(u),
meta::roles_table::role_col_name),
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name)}).discard_result();
});
}
@@ -327,7 +327,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
return _qp.process(
query,
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name)}).then([this, role_name](::shared_ptr<cql3::untyped_result_set> members) {
return parallel_for_each(
members->begin(),
@@ -367,7 +367,7 @@ future<> standard_role_manager::drop(stdx::string_view role_name) const {
return _qp.process(
query,
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name)}).discard_result();
};
@@ -394,7 +394,7 @@ standard_role_manager::modify_membership(
return _qp.process(
query,
consistency_for_role(grantee_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{role_set{sstring(role_name)}, sstring(grantee_name)}).discard_result();
};
@@ -406,7 +406,7 @@ standard_role_manager::modify_membership(
"INSERT INTO %s (role, member) VALUES (?, ?)",
meta::role_members_table::qualified_name()),
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name), sstring(grantee_name)}).discard_result();
case membership_change::remove:
@@ -415,7 +415,7 @@ standard_role_manager::modify_membership(
"DELETE FROM %s WHERE role = ? AND member = ?",
meta::role_members_table::qualified_name()),
consistency_for_role(role_name),
infinite_timeout_config,
internal_distributed_timeout_config(),
{sstring(role_name), sstring(grantee_name)}).discard_result();
}
@@ -516,7 +516,10 @@ future<role_set> standard_role_manager::query_all() const {
// To avoid many copies of a view.
static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
return _qp.process(query, db::consistency_level::QUORUM, infinite_timeout_config).then([](::shared_ptr<cql3::untyped_result_set> results) {
return _qp.process(
query,
db::consistency_level::QUORUM,
internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
role_set roles;
std::transform(

View File

@@ -60,6 +60,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
// - _next_row_in_range = _next.position() < _upper_bound
// - _last_row points at a direct predecessor of the next row which is going to be read.
// Used for populating continuity.
// - _population_range_starts_before_all_rows is set accordingly
reading_from_underlying,
end_of_stream
@@ -86,6 +87,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
partition_snapshot_row_cursor _next_row;
bool _next_row_in_range = false;
// True iff current population interval, since the previous clustering row, starts before all clustered rows.
// We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
// because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
// us from marking the interval as continuous.
// Valid when _state == reading_from_underlying.
bool _population_range_starts_before_all_rows;
// Whether _lower_bound was changed within current fill_buffer().
// If it did not then we cannot break out of it (e.g. on preemption) because
// forward progress is not guaranteed in case iterators are getting constantly invalidated.
@@ -231,6 +239,7 @@ inline
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -360,7 +369,7 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
inline
bool cache_flat_mutation_reader::ensure_population_lower_bound() {
if (!_ck_ranges_curr->start()) {
if (_population_range_starts_before_all_rows) {
return true;
}
if (!_last_row.refresh(*_snp)) {
@@ -415,6 +424,7 @@ inline
void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_last_row = nullptr;
_population_range_starts_before_all_rows = false;
_read_context->cache().on_mispopulate();
return;
}
@@ -448,6 +458,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
with_allocator(standard_allocator(), [&] {
_last_row = partition_snapshot_row_weakref(*_snp, it, true);
});
_population_range_starts_before_all_rows = false;
});
}

View File

@@ -303,6 +303,7 @@ scylla_tests = [
'tests/imr_test',
'tests/partition_data_test',
'tests/reusable_buffer_test',
'tests/json_test'
]
perf_tests = [
@@ -406,6 +407,7 @@ scylla_core = (['database.cc',
'mutation_reader.cc',
'flat_mutation_reader.cc',
'mutation_query.cc',
'json.cc',
'keys.cc',
'counters.cc',
'compress.cc',
@@ -740,6 +742,7 @@ pure_boost_tests = set([
'tests/imr_test',
'tests/partition_data_test',
'tests/reusable_buffer_test',
'tests/json_test',
])
tests_not_using_seastar_test_framework = set([
@@ -791,7 +794,7 @@ deps['tests/log_heap_test'] = ['tests/log_heap_test.cc']
deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']
deps['tests/perf/perf_fast_forward'] += ['release.cc']
deps['tests/meta_test'] = ['tests/meta_test.cc']
deps['tests/imr_test'] = ['tests/imr_test.cc']
deps['tests/imr_test'] = ['tests/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/reusable_buffer_test'] = ['tests/reusable_buffer_test.cc']
warnings = [

View File

@@ -473,9 +473,9 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
::shared_ptr<cql3::term::raw> json_value;
}
: K_INSERT K_INTO cf=columnFamilyName
'(' c1=cident { column_names.push_back(c1); } ( ',' cn=cident { column_names.push_back(cn); } )* ')'
( K_VALUES
'(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
('(' c1=cident { column_names.push_back(c1); } ( ',' cn=cident { column_names.push_back(cn); } )* ')'
K_VALUES
'(' v1=term { values.push_back(v1); } ( ',' vn=term { values.push_back(vn); } )* ')'
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
( usingClause[attrs] )?
{

View File

@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
*/
const sstring_view _query;
/**
* An empty bitset to be used as a workaround for AntLR null dereference
* bug.
*/
static typename ExceptionBaseType::BitsetListType _empty_bit_list;
public:
/**
@@ -144,6 +150,14 @@ private:
break;
}
default:
// The AntLR Exception class has a bug where displayRecognitionError
// dereferences a null pointer. The following if statement makes sure
// the expecting set is not null before the call to
// displayRecognitionError.
// bug reference: https://github.com/antlr/antlr3/issues/191
if (!ex->get_expectingSet()) {
ex->set_expectingSet(&_empty_bit_list);
}
ex->displayRecognitionError(token_names, msg);
}
return msg.str();
@@ -345,4 +359,8 @@ private:
#endif
};
template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
typename ExceptionBaseType::BitsetListType
error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
}

View File

@@ -177,7 +177,7 @@ shared_ptr<function>
make_to_json_function(data_type t) {
return make_native_scalar_function<true>("tojson", utf8_type, {t},
[t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
return utf8_type->decompose(t->to_json_string(parameters[0].value()));
return utf8_type->decompose(t->to_json_string(parameters[0]));
});
}

View File

@@ -217,19 +217,18 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
}
auto& names = *_names;
std::vector<cql3::raw_value> ordered_values;
std::vector<cql3::raw_value_view> ordered_values;
ordered_values.reserve(specs.size());
for (auto&& spec : specs) {
auto& spec_name = spec->name->text();
for (size_t j = 0; j < names.size(); j++) {
if (names[j] == spec_name) {
ordered_values.emplace_back(_values[j]);
ordered_values.emplace_back(_value_views[j]);
break;
}
}
}
_values = std::move(ordered_values);
fill_value_views();
_value_views = std::move(ordered_values);
}
void query_options::fill_value_views()


@@ -239,11 +239,11 @@ query_processor::process(const sstring_view& query_string, service::query_state&
log.trace("process: \"{}\"", query_string);
tracing::trace(query_state.get_trace_state(), "Parsing a statement");
auto p = get_statement(query_string, query_state.get_client_state());
options.prepare(p->bound_names);
auto cql_statement = p->statement;
if (cql_statement->get_bound_terms() != options.get_values_count()) {
throw exceptions::invalid_request_exception("Invalid amount of bind variables");
}
options.prepare(p->bound_names);
warn(unimplemented::cause::METRICS);
#if 0


@@ -202,6 +202,14 @@ public:
const query_options& options,
gc_clock::time_point now) const override;
virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;
virtual std::vector<bytes_opt> values(const query_options& options) const override {
std::vector<bytes_opt> ret = values_raw(options);
std::sort(ret.begin(),ret.end());
ret.erase(std::unique(ret.begin(),ret.end()),ret.end());
return ret;
}
#if 0
@Override
protected final boolean isSupportedBy(SecondaryIndex index)
@@ -224,7 +232,7 @@ public:
return abstract_restriction::term_uses_function(_values, ks_name, function_name);
}
virtual std::vector<bytes_opt> values(const query_options& options) const override {
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
std::vector<bytes_opt> ret;
for (auto&& v : _values) {
ret.emplace_back(to_bytes_opt(v->bind_and_get(options)));
@@ -249,7 +257,7 @@ public:
return false;
}
virtual std::vector<bytes_opt> values(const query_options& options) const override {
virtual std::vector<bytes_opt> values_raw(const query_options& options) const override {
auto&& lval = dynamic_pointer_cast<multi_item_terminal>(_marker->bind(options));
if (!lval) {
throw exceptions::invalid_request_exception("Invalid null value for IN restriction");


@@ -105,9 +105,11 @@ public:
virtual void reset() = 0;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override {
if (receiver->type == get_type()) {
auto t1 = receiver->type->underlying_type();
auto t2 = get_type()->underlying_type();
if (t1 == t2) {
return assignment_testable::test_result::EXACT_MATCH;
} else if (receiver->type->is_value_compatible_with(*get_type())) {
} else if (t1->is_value_compatible_with(*t2)) {
return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
} else {
return assignment_testable::test_result::NOT_ASSIGNABLE;


@@ -96,12 +96,8 @@ public:
encoded_row.write("\\\"", 2);
}
encoded_row.write("\": ", 3);
if (parameters[i]) {
sstring row_sstring = _selector_types[i]->to_json_string(parameters[i].value());
encoded_row.write(row_sstring.c_str(), row_sstring.size());
} else {
encoded_row.write("null", 4);
}
sstring row_sstring = _selector_types[i]->to_json_string(parameters[i]);
encoded_row.write(row_sstring.c_str(), row_sstring.size());
}
encoded_row.write("}", 1);
return encoded_row.linearize().to_string();
@@ -974,6 +970,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
}
check_needs_filtering(restrictions);
size_t restrictions_size = restrictions->get_partition_key_restrictions()->size() + restrictions->get_clustering_columns_restrictions()->size() + restrictions->get_non_pk_restriction().size();
if (restrictions->uses_secondary_indexing() && restrictions_size > 1) {
throw exceptions::invalid_request_exception("Indexed query may not contain multiple restrictions in 2.3");
}
::shared_ptr<cql3::statements::select_statement> stmt;
if (restrictions->uses_secondary_indexing()) {


@@ -179,7 +179,21 @@ modification_statement::json_cache_opt insert_prepared_json_statement::maybe_pre
void
insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params, const column_definition& column, const bytes_opt& value) {
if (!value) {
if (column.type->is_collection()) {
auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
if (&k == &collection_type_impl::kind::list) {
lists::setter::execute(m, prefix, params, column, make_shared<lists::value>(lists::value(std::vector<bytes_opt>())));
} else if (&k == &collection_type_impl::kind::set) {
sets::setter::execute(m, prefix, params, column, make_shared<sets::value>(sets::value(std::set<bytes, serialized_compare>(serialized_compare(empty_type)))));
} else if (&k == &collection_type_impl::kind::map) {
maps::setter::execute(m, prefix, params, column, make_shared<maps::value>(maps::value(std::map<bytes, bytes, serialized_compare>(serialized_compare(empty_type)))));
} else {
throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
}
return;
}
m.set_cell(prefix, column, std::move(operation::make_dead_cell(params)));
return;
} else if (!column.type->is_collection()) {
constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(bytes_view(*value)));
return;
@@ -204,15 +218,17 @@ insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_
dht::partition_range_vector
insert_prepared_json_statement::build_partition_keys(const query_options& options, const json_cache_opt& json_cache) {
dht::partition_range_vector ranges;
std::vector<bytes_opt> exploded;
for (const auto& def : s->partition_key_columns()) {
auto json_value = json_cache->at(def.name_as_text());
auto k = query::range<partition_key>::make_singular(partition_key::from_single_value(*s, json_value.value()));
ranges.emplace_back(std::move(k).transform(
[this] (partition_key&& k) -> query::ring_position {
auto token = dht::global_partitioner().get_token(*s, k);
return { std::move(token), std::move(k) };
}));
if (!json_value) {
throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
}
exploded.emplace_back(*json_value);
}
auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
auto k = query::range<query::ring_position>::make_singular(dht::global_partitioner().decorate_key(*s, std::move(pkey)));
ranges.emplace_back(std::move(k));
return ranges;
}
@@ -221,7 +237,10 @@ query::clustering_row_ranges insert_prepared_json_statement::create_clustering_r
std::vector<bytes_opt> exploded;
for (const auto& def : s->clustering_key_columns()) {
auto json_value = json_cache->at(def.name_as_text());
exploded.emplace_back(json_value.value());
if (!json_value) {
throw exceptions::invalid_request_exception(sprint("Missing mandatory PRIMARY KEY part %s", def.name_as_text()));
}
exploded.emplace_back(*json_value);
}
auto k = query::range<clustering_key_prefix>::make_singular(clustering_key_prefix::from_optional_exploded(*s, std::move(exploded)));
ranges.emplace_back(query::clustering_range(std::move(k)));


@@ -53,6 +53,9 @@ update_parameters::get_prefetched_list(
return {};
}
if (column.is_static()) {
ckey = clustering_key_view::make_empty();
}
auto i = _prefetched->rows.find(std::make_pair(std::move(pkey), std::move(ckey)));
if (i == _prefetched->rows.end()) {
return {};


@@ -211,6 +211,7 @@ struct cell {
imr::member<tags::chunk_next, imr::pod<uint8_t*>>,
imr::member<tags::chunk_data, imr::buffer<tags::chunk_data>>
>;
static constexpr size_t external_chunk_overhead = sizeof(uint8_t*) * 2;
using external_last_chunk_size = imr::pod<uint16_t>;
/// The last fragment of an externally stored value
@@ -224,6 +225,7 @@ struct cell {
imr::member<tags::last_chunk_size, external_last_chunk_size>,
imr::member<tags::chunk_data, imr::buffer<tags::chunk_data>>
>;
static constexpr size_t external_last_chunk_overhead = sizeof(uint8_t*) + sizeof(uint16_t);
class context;
class minimal_context;


@@ -383,9 +383,13 @@ filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, colu
};
sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());
// FIXME: Workaround for https://github.com/scylladb/scylla/issues/3552
// and https://github.com/scylladb/scylla/issues/3553
const bool filtering_broken = true;
// no clustering filtering is applied if schema defines no clustering key or
// compaction strategy thinks it will not benefit from such an optimization.
if (!schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
if (filtering_broken || !schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
return sstables;
}
::cf_stats* stats = cf.cf_stats();
@@ -957,6 +961,11 @@ table::seal_active_memtable(flush_permit&& permit) {
}
_memtables->add_memtable();
_stats.memtable_switch_count++;
// This will set evictable occupancy of the old memtable region to zero, so that
// this region is considered last for flushing by dirty_memory_manager::flush_when_needed().
// If we don't do that, the flusher may keep picking up this memtable list for flushing after
// the permit is released even though there is not much to flush in the active memtable of this list.
old->region().ground_evictable_occupancy();
auto previous_flush = _flush_barrier.advance_and_await();
auto op = _flush_barrier.start();
@@ -1325,6 +1334,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
// This is done in the background, so we can consider this compaction completed.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
std::exception_ptr eptr;
try {
@@ -1348,6 +1358,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
return make_exception_future<>(eptr);
}
return make_ready_future<>();
});
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
@@ -1651,9 +1662,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
// to distribute evenly the resource usage among all shards.
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
[&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
auto& cf = local.find_column_family(comps.ks, comps.cf);
auto f = sstables::sstable::load_shared_components(cf.schema(), cf._config.datadir, comps.generation, comps.version, comps.format, pc);
@@ -2699,7 +2710,7 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
remove(*cf);
cf->clear_views();
auto& ks = find_keyspace(ks_name);
return cf->await_pending_writes().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return when_all_succeed(cf->await_pending_writes(), cf->await_pending_reads()).then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
return cf->stop();
});
@@ -3139,7 +3150,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
seastar::ref(get_result_memory_limiter()),
max_result_size,
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate()] (auto f) {
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future<lw_shared_ptr<query::result>, cache_temperature>(f.get_exception());
@@ -3167,7 +3178,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
std::move(accounter),
std::move(trace_state),
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate()] (auto f) {
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future<reconcilable_result, cache_temperature>(f.get_exception());
@@ -3433,6 +3444,13 @@ future<> dirty_memory_manager::flush_when_needed() {
// release the biggest amount of memory and is less likely to be generating tiny
// SSTables.
memtable& candidate_memtable = memtable::from_region(*(this->_virtual_region_group.get_largest_region()));
if (candidate_memtable.empty()) {
// Soft pressure, but nothing to flush. It could be due to fsync or memtable_to_cache lagging.
// Back off to avoid OOMing with flush continuations.
return sleep(1ms);
}
// Do not wait. The semaphore will protect us against a concurrent flush. But we
// want to start a new one as soon as the permits are destroyed and the semaphore is
// made ready again, not when we are done with the current one.
@@ -3980,6 +3998,7 @@ seal_snapshot(sstring jsondir) {
future<> table::snapshot(sstring name) {
return flush().then([this, name = std::move(name)]() {
return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
auto jsondir = _config.datadir + "/snapshots/" + name;
@@ -4044,6 +4063,7 @@ future<> table::snapshot(sstring name) {
});
});
});
});
});
}
@@ -4175,6 +4195,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
_streaming_memtables_big.erase(it);
return entry->flush_in_progress.close().then([this, entry] {
for (auto&& sst : entry->sstables) {
sst.monitor->write_failed();
sst.sstable->mark_for_deletion();
}
});


@@ -294,6 +294,8 @@ public:
class table;
using column_family = table;
class database_sstable_write_monitor;
class table : public enable_lw_shared_from_this<table> {
public:
struct config {
@@ -389,7 +391,7 @@ private:
// plan memtables and the resulting sstables are not made visible until
// the streaming is complete.
struct monitored_sstable {
std::unique_ptr<sstables::write_monitor> monitor;
std::unique_ptr<database_sstable_write_monitor> monitor;
sstables::shared_sstable sstable;
};
@@ -428,6 +430,10 @@ private:
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// This semaphore ensures that an operation like snapshot won't have its selected
// sstables deleted by compaction in parallel, a race condition which could
// easily result in failure.
seastar::semaphore _sstable_deletion_sem = {1};
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
@@ -475,6 +481,8 @@ private:
// after some modification, needs to ensure that news writes will see it before
// it can proceed, such as the view building code.
utils::phased_barrier _pending_writes_phaser;
// Corresponding phaser for in-progress reads.
utils::phased_barrier _pending_reads_phaser;
private:
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
// Adds new sstable to the set of sstables
@@ -817,6 +825,14 @@ public:
return _pending_writes_phaser.advance_and_await();
}
utils::phased_barrier::operation read_in_progress() {
return _pending_reads_phaser.start();
}
future<> await_pending_reads() {
return _pending_reads_phaser.advance_and_await();
}
void add_or_update_view(view_ptr v);
void remove_view(view_ptr v);
void clear_views();


@@ -163,7 +163,7 @@ future<> db::commitlog_replayer::impl::init() {
// Get all truncation records for the CF and initialize max rps if
// present. Cannot do this on demand, as there may be no sstables to
// mark the CF as "needed".
return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
for (auto& p : tpps) {
rlogger.trace("CF {} truncated at {}", uuid, p);
auto& pp = map[p.shard_id()][uuid];


@@ -26,6 +26,7 @@
#include "db/consistency_level_type.hh"
#include "db/system_keyspace.hh"
#include "schema_builder.hh"
#include "timeout_config.hh"
#include "types.hh"
#include <seastar/core/reactor.hh>
@@ -97,11 +98,17 @@ future<> system_distributed_keyspace::stop() {
return make_ready_future<>();
}
static const timeout_config internal_distributed_timeout_config = [] {
using namespace std::chrono_literals;
const auto t = 10s;
return timeout_config{ t, t, t, t, t, t, t };
}();
future<std::unordered_map<utils::UUID, sstring>> system_distributed_keyspace::view_status(sstring ks_name, sstring view_name) const {
return _qp.process(
sprint("SELECT host_id, status FROM %s.%s WHERE keyspace_name = ? AND view_name = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config,
{ std::move(ks_name), std::move(view_name) },
false).then([this] (::shared_ptr<cql3::untyped_result_set> cql_result) {
return boost::copy_range<std::unordered_map<utils::UUID, sstring>>(*cql_result
@@ -118,7 +125,7 @@ future<> system_distributed_keyspace::start_view_build(sstring ks_name, sstring
return _qp.process(
sprint("INSERT INTO %s.%s (keyspace_name, view_name, host_id, status) VALUES (?, ?, ?, ?)", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config,
{ std::move(ks_name), std::move(view_name), std::move(host_id), "STARTED" },
false).discard_result();
});
@@ -129,7 +136,7 @@ future<> system_distributed_keyspace::finish_view_build(sstring ks_name, sstring
return _qp.process(
sprint("UPDATE %s.%s SET status = ? WHERE keyspace_name = ? AND view_name = ? AND host_id = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config,
{ "SUCCESS", std::move(ks_name), std::move(view_name), std::move(host_id) },
false).discard_result();
});
@@ -139,7 +146,7 @@ future<> system_distributed_keyspace::remove_view(sstring ks_name, sstring view_
return _qp.process(
sprint("DELETE FROM %s.%s WHERE keyspace_name = ? AND view_name = ?", NAME, VIEW_BUILD_STATUS),
db::consistency_level::ONE,
infinite_timeout_config,
internal_distributed_timeout_config,
{ std::move(ks_name), std::move(view_name) },
false).discard_result();
}


@@ -1635,6 +1635,9 @@ void make(database& db, bool durable, bool volatile_testing_only) {
auto cfg = ks.make_column_family_config(*table, db.get_config(), db.get_large_partition_handler());
if (maybe_write_in_user_memory(table, db)) {
cfg.dirty_memory_manager = &db._dirty_memory_manager;
} else {
cfg.memtable_scheduling_group = default_scheduling_group();
cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
}
db.add_column_family(ks, table, std::move(cfg));
maybe_add_virtual_reader(table, db);


@@ -384,6 +384,10 @@ public:
return "biased-token-round-robin";
}
virtual unsigned sharding_ignore_msb() const {
return 0;
}
friend bool operator==(token_view t1, token_view t2);
friend bool operator<(token_view t1, token_view t2);
friend int tri_compare(token_view t1, token_view t2);


@@ -290,6 +290,11 @@ murmur3_partitioner::token_for_next_shard(const token& t, shard_id shard, unsign
return bias(n);
}
unsigned
murmur3_partitioner::sharding_ignore_msb() const {
return _sharding_ignore_msb_bits;
}
using registry = class_registrator<i_partitioner, murmur3_partitioner, const unsigned&, const unsigned&>;
static registry registrator("org.apache.cassandra.dht.Murmur3Partitioner");


@@ -52,6 +52,7 @@ public:
virtual unsigned shard_of(const token& t) const override;
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
virtual unsigned sharding_ignore_msb() const override;
private:
using uint128_t = unsigned __int128;
static int64_t normalize(int64_t in);

dist/ami/build_ami.sh vendored

@@ -11,11 +11,9 @@ print_usage() {
echo " --repo repository for both install and update, specify .repo/.list file URL"
echo " --repo-for-install repository for install, specify .repo/.list file URL"
echo " --repo-for-update repository for update, specify .repo/.list file URL"
echo " --target specify target distribution"
exit 1
}
LOCALRPM=0
TARGET=centos
while [ $# -gt 0 ]; do
case "$1" in
"--localrpm")
@@ -34,10 +32,6 @@ while [ $# -gt 0 ]; do
INSTALL_ARGS="$INSTALL_ARGS --repo-for-update $2"
shift 2
;;
"--target")
TARGET="$2"
shift 2
;;
*)
print_usage
;;
@@ -62,91 +56,42 @@ pkg_install() {
fi
}
case "$TARGET" in
"centos")
AMI=ami-ae7bfdb8
REGION=us-east-1
SSH_USERNAME=centos
;;
"trusty")
AMI=ami-ff427095
REGION=us-east-1
SSH_USERNAME=ubuntu
;;
"xenial")
AMI=ami-da05a4a0
REGION=us-east-1
SSH_USERNAME=ubuntu
;;
*)
echo "build_ami.sh does not supported this distribution."
exit 1
;;
esac
AMI=ami-ae7bfdb8
REGION=us-east-1
SSH_USERNAME=centos
if [ $LOCALRPM -eq 1 ]; then
sudo rm -rf build/*
REPO=`./scripts/scylla_current_repo --target $TARGET`
REPO=`./scripts/scylla_current_repo --target centos`
INSTALL_ARGS="$INSTALL_ARGS --localrpm --repo $REPO"
if [ ! -f /usr/bin/git ]; then
pkg_install git
fi
if [ "$TARGET" = "centos" ]; then
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
dist/redhat/build_rpm.sh --dist --target epel-7-x86_64
cp build/rpms/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
cp build/rpms/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
cp build/rpms/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
cp build/rpms/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
cp build/rpms/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
dist/redhat/build_rpm.sh --target epel-7-x86_64
cd ../..
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
dist/redhat/build_rpm.sh --target epel-7-x86_64
cd ../..
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
fi
else
if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
./scripts/git-archive-all --force-submodules --prefix scylla build/scylla.tar
tar -C build/ -xvpf build/scylla.tar
cd build/scylla
dist/debian/build_deb.sh --dist --target $TARGET
cd ../..
cp build/scylla/build/debs/scylla_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla_amd64.deb
cp build/scylla/build/debs/scylla-kernel-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-kernel-conf_amd64.deb
cp build/scylla/build/debs/scylla-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-conf_amd64.deb
cp build/scylla/build/debs/scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-server_amd64.deb
cp build/scylla/build/debs/scylla-server-dbg_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-server-dbg_amd64.deb
fi
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
dist/debian/build_deb.sh --target $TARGET
cd ../..
cp build/scylla-jmx/build/debs/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_all.deb dist/ami/files/scylla-jmx_all.deb
fi
if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
dist/debian/build_deb.sh --target $TARGET
cd ../..
cp build/scylla-tools-java/build/debs/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_all.deb dist/ami/files/scylla-tools_all.deb
fi
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
dist/redhat/build_rpm.sh --dist --target epel-7-x86_64
cp build/rpms/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
cp build/rpms/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
cp build/rpms/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
cp build/rpms/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
cp build/rpms/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
dist/redhat/build_rpm.sh --target epel-7-x86_64
cd ../..
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
dist/redhat/build_rpm.sh --target epel-7-x86_64
cd ../..
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
fi
fi


@@ -7,121 +7,8 @@ fi
# User specific environment and startup programs
. /usr/lib/scylla/scylla_lib.sh
PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATH
echo
echo ' _____ _ _ _____ ____ '
echo ' / ____| | | | | __ \| _ \ '
echo ' | (___ ___ _ _| | | __ _| | | | |_) |'
echo ' \___ \ / __| | | | | |/ _` | | | | _ < '
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |'
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ '
echo ' __/ | '
echo ' |___/ '
echo ''
echo ''
echo 'Nodetool:'
echo ' nodetool help'
echo 'CQL Shell:'
echo ' cqlsh'
echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo 'By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/'
echo
if [ `ec2_is_supported_instance_type` -eq 0 ]; then
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type`
tput setaf 1
tput bold
echo " $TYPE is not supported instance type!"
tput sgr0
echo -n "To continue startup ScyllaDB on this instance, run 'sudo scylla_io_setup' "
if ! is_systemd; then
echo "then 'initctl start scylla-server'."
else
echo "then 'systemctl start scylla-server'."
fi
echo "For a list of optimized instance types and more EC2 instructions see http://www.scylladb.com/doc/getting-started-amazon/"
echo
else
SETUP=
if is_systemd; then
SETUP=`systemctl is-active scylla-ami-setup`
fi
if [ "$SETUP" == "activating" ]; then
tput setaf 4
tput bold
echo " Constructing RAID volume..."
tput sgr0
echo
echo "Please wait for setup. To see status, run "
echo " 'systemctl status scylla-ami-setup'"
echo
echo "After setup finished, scylla-server service will launch."
echo "To see status of scylla-server, run "
echo " 'systemctl status scylla-server'"
echo
elif [ "$SETUP" == "failed" ]; then
tput setaf 1
tput bold
echo " AMI initial configuration failed!"
tput sgr0
echo
echo "To see status, run "
echo " 'systemctl status scylla-ami-setup'"
echo
else
if is_systemd; then
SCYLLA=`systemctl is-active scylla-server`
else
if [ "`initctl status scylla-server|grep "running, process"`" != "" ]; then
SCYLLA="active"
else
SCYLLA="failed"
fi
fi
if [ "$SCYLLA" == "activating" ]; then
tput setaf 4
tput bold
echo " ScyllaDB is starting..."
tput sgr0
echo
echo "Please wait for start. To see status, run "
echo " 'systemctl status scylla-server'"
echo
elif [ "$SCYLLA" == "active" ]; then
tput setaf 4
tput bold
echo " ScyllaDB is active."
tput sgr0
echo
echo "$ nodetool status"
echo
nodetool status
else
tput setaf 1
tput bold
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
if ! is_systemd; then
echo " 'initctl status scylla-server'"
echo "and"
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
echo
else
echo " 'systemctl status scylla-server'"
echo
fi
fi
fi
echo -n " "
/usr/lib/scylla/scylla_ec2_check
if [ $? -eq 0 ]; then
echo
fi
fi
~/.scylla_ami_login

dist/ami/files/.scylla_ami_login vendored Executable file

@@ -0,0 +1,118 @@
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import argparse
sys.path.append('/usr/lib/scylla')
from scylla_util import *
MSG_HEADER = '''
_____ _ _ _____ ____
/ ____| | | | | __ \| _ \
| (___ ___ _ _| | | __ _| | | | |_) |
\___ \ / __| | | | | |/ _` | | | | _ <
____) | (__| |_| | | | (_| | |__| | |_) |
|_____/ \___|\__, |_|_|\__,_|_____/|____/
__/ |
|___/
Nodetool:
nodetool help
CQL Shell:
cqlsh
More documentation available at:
http://www.scylladb.com/doc/
By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/
'''[1:-1]
MSG_UNSUPPORTED_INSTANCE_TYPE = '''
{red}{type} is not a supported instance type!{nocolor}
To continue starting ScyllaDB on this instance, run 'sudo scylla_io_setup' then 'systemctl start scylla-server'.
For a list of optimized instance types and more EC2 instructions, see http://www.scylladb.com/doc/getting-started-amazon/
'''[1:-1]
MSG_SETUP_ACTIVATING = '''
{green}Constructing RAID volume...{nocolor}
Please wait for setup. To see status, run
'systemctl status scylla-ami-setup'
After setup is finished, the scylla-server service will launch.
To see status of scylla-server, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SETUP_FAILED = '''
{red}AMI initial configuration failed!{nocolor}
To see status, run
'systemctl status scylla-ami-setup'
'''[1:-1]
MSG_SCYLLA_ACTIVATING = '''
{green}ScyllaDB is starting...{nocolor}
Please wait for start. To see status, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SCYLLA_FAILED = '''
{red}ScyllaDB is not started!{nocolor}
Please wait for startup. To see status of ScyllaDB, run
'systemctl status scylla-server'
'''[1:-1]
MSG_SCYLLA_ACTIVE = '''
{green}ScyllaDB is active.{nocolor}
$ nodetool status
'''[1:-1]
if __name__ == '__main__':
colorprint(MSG_HEADER)
aws = aws_instance()
if not aws.is_supported_instance_class():
colorprint(MSG_UNSUPPORTED_INSTANCE_TYPE, type=aws.instance_class())
else:
setup = systemd_unit('scylla-ami-setup.service')
res = setup.is_active()
if res == 'activating':
colorprint(MSG_SETUP_ACTIVATING)
elif res == 'failed':
colorprint(MSG_SETUP_FAILED)
else:
server = systemd_unit('scylla-server.service')
res = server.is_active()
if res == 'activating':
colorprint(MSG_SCYLLA_ACTIVATING)
elif res == 'failed':
colorprint(MSG_SCYLLA_FAILED)
else:
colorprint(MSG_SCYLLA_ACTIVE)
run('nodetool status', exception=False)
print(' ', end='')
res = run('/usr/lib/scylla/scylla_ec2_check --nic eth0', exception=False)
if res == 0:
print('')
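The `MSG_*` constants in this new script rely on Python's `'''…'''[1:-1]` idiom: slicing off the first and last character removes exactly one leading and one trailing newline, leaving the inner lines intact. A minimal standalone illustration (not Scylla code):

```python
# Standalone illustration of the '''...'''[1:-1] idiom used by the MSG_*
# constants: the slice drops the newline that follows the opening quotes
# and the one that precedes the closing quotes.
banner = '''
ScyllaDB is active.
$ nodetool status
'''[1:-1]

print(banner)
```

This lets each message be written flush-left in the source without emitting stray blank lines when printed.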


@@ -64,14 +64,11 @@
"source": "files/",
"destination": "/home/{{user `ssh_username`}}/"
},
{
"type": "file",
"source": "../../scripts/scylla_install_pkg",
"destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
},
{
"type": "shell",
"inline": [
"sudo yum install -y epel-release",
"sudo yum install -y python34",
"sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
]
}


@@ -1,6 +1,8 @@
#!/bin/sh
#!/usr/bin/python3
#
# Copyright 2016 ScyllaDB
# Copyright 2018 ScyllaDB
#
#
# This file is part of Scylla.
#
@@ -17,42 +19,46 @@
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
if [ "`id -u`" -ne 0 ]; then
echo "Requires root permission."
exit 1
fi
import os
import sys
import tempfile
import tarfile
from scylla_util import *
if [ -f /usr/bin/node_exporter ] || [ -f /usr/bin/prometheus-node_exporter ]; then
echo "node_exporter already installed"
exit 1
fi
VERSION='0.14.0'
INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'
. /usr/lib/scylla/scylla_lib.sh
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_gentoo_variant; then
emerge -uq app-metrics/node_exporter
if is_systemd; then
echo "app-metrics/node_exporter does not install systemd service files, please file a bug if you need them."
else
rc-update add node_exporter default
rc-service node_exporter start
fi
else
version=0.14.0
dir=/usr/lib/scylla/Prometheus/node_exporter
mkdir -p $dir
cd $dir
curl -L https://github.com/prometheus/node_exporter/releases/download/v$version/node_exporter-$version.linux-amd64.tar.gz -o $dir/node_exporter-$version.linux-amd64.tar.gz
tar -xvzf $dir/node_exporter-$version.linux-amd64.tar.gz
rm $dir/node_exporter-$version.linux-amd64.tar.gz
ln -s $dir/node_exporter-$version.linux-amd64/node_exporter /usr/bin
. /etc/os-release
if os.path.exists('/usr/bin/node_exporter') or os.path.exists('/usr/bin/prometheus-node_exporter'):
print('node_exporter already installed')
sys.exit(1)
if is_systemd; then
systemctl enable node-exporter
systemctl start node-exporter
else
cat <<EOT >> /etc/init/node_exporter.conf
if is_gentoo_variant():
run('emerge -uq app-metrics/node_exporter')
if is_systemd():
print('app-metrics/node_exporter does not install systemd service files, please file a bug if you need them.')
sys.exit(1)
else:
run('rc-update add node_exporter default')
run('rc-service node_exporter start')
else:
data = curl('https://github.com/prometheus/node_exporter/releases/download/v{version}/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION), byte=True)
with open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION), 'wb') as f:
f.write(data)
with tarfile.open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION)) as tf:
tf.extractall(INSTALL_DIR)
os.remove('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION))
os.symlink('{install_dir}/node_exporter-{version}.linux-amd64/node_exporter'.format(install_dir=INSTALL_DIR, version=VERSION), '/usr/bin/node_exporter')
if is_systemd():
node_exporter = systemd_unit('node-exporter.service')
node_exporter.enable()
node_exporter.start()
else:
conf = '''
# Run node_exporter
start on startup
@@ -60,9 +66,9 @@ start on startup
script
/usr/bin/node_exporter
end script
EOT
service node_exporter start
fi
fi
'''[1:-1]
with open('/etc/init/node_exporter.conf', 'w') as f:
f.write(conf)
run('service node_exporter start')
printf "node_exporter successfully installed\n"
print('node_exporter successfully installed')
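The Python rewrite above replaces the shell `curl`-and-`tar` pipeline with `tarfile` plus `os.symlink`. A self-contained sketch of the same extract-then-symlink pattern, confined to a temporary directory (the real script uses `/var/tmp`, `INSTALL_DIR`, and `/usr/bin`) so it can run anywhere:

```python
import os
import tarfile
import tempfile

VERSION = '0.14.0'
workdir = tempfile.mkdtemp()

# Build a fake release tarball with the expected layout.
pkg = 'node_exporter-{}.linux-amd64'.format(VERSION)
os.makedirs(os.path.join(workdir, pkg))
with open(os.path.join(workdir, pkg, 'node_exporter'), 'w') as f:
    f.write('#!/bin/sh\necho fake exporter\n')
tarball = os.path.join(workdir, pkg + '.tar.gz')
with tarfile.open(tarball, 'w:gz') as tf:
    tf.add(os.path.join(workdir, pkg), arcname=pkg)

# Extract, delete the tarball, and symlink the binary -- the same
# sequence of calls the installer makes.
install_dir = os.path.join(workdir, 'install')
os.makedirs(install_dir)
with tarfile.open(tarball) as tf:
    tf.extractall(install_dir)
os.remove(tarball)
link = os.path.join(workdir, 'node_exporter')
os.symlink(os.path.join(install_dir, pkg, 'node_exporter'), link)
print(os.path.islink(link))
```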


@@ -28,6 +28,7 @@ OUTPUT_PATH4="$OUTPUT_PATH/data_model"
OUTPUT_PATH5="$OUTPUT_PATH/network_checks"
IS_FEDORA="0"
IS_DEBIAN="0"
IS_GENTOO="0"
JMX_PORT="7199"
CQL_PORT="9042"
PRINT_DM=NO
@@ -75,7 +76,7 @@ while getopts ":hdncap:q:" opt; do
done
##Check server release (Fedora/Oracle/Debian)##
##Check server release (Fedora/Oracle/Debian/Gentoo)##
cat /etc/os-release | grep -i fedora &> /dev/null
if [ $? -ne 0 ]; then
cat /etc/os-release | grep -i oracle &> /dev/null
@@ -89,7 +90,12 @@ if [ $? -ne 0 ]; then
IS_DEBIAN="1"
fi
if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ]; then
cat /etc/os-release | grep -i gentoo &> /dev/null
if [ $? -ne 0 ]; then
IS_GENTOO="1"
fi
if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ] && [ "$IS_GENTOO" == "1" ]; then
echo "This is a non-supported OS, please review the Support Matrix"
exit 222
fi
@@ -108,7 +114,7 @@ if [ $? -ne 0 ]; then
else
echo "Scylla-server Service: OK"
echo "--------------------------------------------------"
fi
fi
##Scylla-JMX service status##
@@ -125,7 +131,7 @@ if [ $? -ne 0 ]; then
else
echo "Scylla-JMX Service (nodetool): OK"
echo "--------------------------------------------------"
fi
fi
#Install 'net-tools' pkg, to be used for netstat command#
@@ -141,6 +147,9 @@ if [ "$IS_DEBIAN" == "0" ]; then
sudo apt-get install net-tools -y | grep already
fi
if [ "$IS_GENTOO" == "0" ]; then
sudo emerge -1uq sys-apps/ethtool sys-apps/net-tools
fi
#Create dir structure to save output_files#
echo "--------------------------------------------------"
@@ -182,6 +191,12 @@ if [ "$IS_DEBIAN" == "0" ]; then
cp -p /etc/default/scylla-server $OUTPUT_PATH2
fi
if [ "$IS_GENTOO" == "0" ]; then
sudo emerge -1uq app-portage/portage-utils
sudo qlist -ICv scylla > $OUTPUT_PATH2/scylla-pkgs.txt
cp -p /etc/default/scylla-server $OUTPUT_PATH2
fi
#Scylla Logs#
echo "--------------------------------------------------"
@@ -192,7 +207,11 @@ journalctl --help &> /dev/null
if [ $? -eq 0 ]; then
journalctl -t scylla > $OUTPUT_PATH/scylla-logs.txt
else
cat /var/log/syslog | grep -i scylla > $OUTPUT_PATH/scylla-logs.txt
if [ "$IS_GENTOO" == "0" ]; then
cat /var/log/scylla/scylla.log > $OUTPUT_PATH/scylla-logs.txt
else
cat /var/log/syslog | grep -i scylla > $OUTPUT_PATH/scylla-logs.txt
fi
fi
gzip -f $OUTPUT_PATH/scylla-logs.txt
@@ -224,6 +243,7 @@ if [ "$SCYLLA_SERVICE" == "1" ]; then
echo "Skipping Data Model Info Collection"
echo "--------------------------------------------------"
else
# TODO: handle connecting with authentication
cqlsh `hostname -i` $CQL_PORT -e "HELP" &> /dev/null
if [ $? -eq 0 ]; then
echo "Collecting Data Model Info (using port $CQL_PORT)"
@@ -357,7 +377,7 @@ if [ "$IS_FEDORA" == "0" ]; then
echo "## /etc/sysconfig/scylla-server ##" >> $REPORT
fi
if [ "$IS_DEBIAN" == "0" ]; then
if [ "$IS_DEBIAN" == "0" ] || [ "$IS_GENTOO" == "0" ]; then
echo "## /etc/default/scylla-server ##" >> $REPORT
fi


@@ -23,7 +23,6 @@ import os
import sys
import argparse
import subprocess
import shutil
from scylla_util import *
if __name__ == '__main__':
@@ -62,7 +61,7 @@ ExternalSizeMax=1024G
with open('/etc/systemd/coredump.conf', 'w') as f:
conf = f.write(conf_data)
if args.dump_to_raiddir:
shutil.rmtree('/var/lib/systemd/coredump')
rmtree('/var/lib/systemd/coredump')
makedirs('/var/lib/scylla/coredump')
os.symlink('/var/lib/scylla/coredump', '/var/lib/systemd/coredump')
run('systemctl daemon-reload')


@@ -24,46 +24,38 @@ import sys
import argparse
from scylla_util import *
def get_en_interface_type():
type, subtype = curl('http://169.254.169.254/latest/meta-data/instance-type').split('.')
if type in ['c3', 'c4', 'd4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if type in ['i3', 'p2', 'r4', 'x1']:
return 'ena'
if type == 'm4':
if subtype == '16xlarge':
return 'ena'
else:
return 'ixgbevf'
def is_vpc_enabled():
with open('/sys/class/net/eth0/address') as f:
mac = f.read().strip()
mac_stat = curl('http://169.254.169.254/latest/meta-data/network/interfaces/macs/{}/'.format(mac))
return True if re.search(r'^vpc-id$', mac_stat, flags=re.MULTILINE) else False
if __name__ == '__main__':
if not is_ec2():
sys.exit(0)
parser = argparse.ArgumentParser(description='Verify EC2 configuration is optimized.')
parser.add_argument('--nic', default='eth0',
help='specify NIC')
args = parser.parse_args()
type = curl('http://169.254.169.254/latest/meta-data/instance-type')
en = get_en_interface_type()
match = re.search(r'^driver: (\S+)$', out('ethtool -i eth0'), flags=re.MULTILINE)
if not is_valid_nic(args.nic):
print('NIC {} doesn\'t exist.'.format(args.nic))
sys.exit(1)
aws = aws_instance()
instance_class = aws.instance_class()
en = aws.get_en_interface_type()
match = re.search(r'^driver: (\S+)$', out('ethtool -i {}'.format(args.nic)), flags=re.MULTILINE)
driver = match.group(1)
if not en:
print('{bold_red}{type} doesn\'t support enhanced networking!{no_color}'.format(bold_red=concolor.BOLD_RED, type=type, no_color=concolor.NO_COLOR))
colorprint('{red}{instance_class} doesn\'t support enhanced networking!{nocolor}', instance_class=instance_class)
print('''To enable enhanced networking, please use the instance type which supports it.
More documentation available at:
http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html#enabling_enhanced_networking''')
sys.exit(1)
elif not is_vpc_enabled():
print('{bold_red}VPC is not enabled!{no_color}'.format(bold_red=concolor.BOLD_RED, no_color=concolor.NO_COLOR))
elif not aws.is_vpc_enabled(args.nic):
colorprint('{red}VPC is not enabled!{nocolor}')
print('To enable enhanced networking, please enable VPC.')
sys.exit(1)
elif driver != en:
print('{bold_red}Enhanced networking is disabled!{no_color}'.format(bold_red=concolor.BOLD_RED, no_color=concolor.NO_COLOR))
colorprint('{red}Enhanced networking is disabled!{nocolor}')
print('''More documentation available at:
http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html''')
sys.exit(1)
colorprint('{green}This EC2 instance is optimized for Scylla.{nocolor}')
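The check above extracts the NIC driver name by running `re.search` with `re.MULTILINE` over `ethtool -i` output, so `^` and `$` anchor at line boundaries. A standalone illustration on canned output (the real script shells out via `out()`):

```python
import re

# Canned `ethtool -i eth0` output, as an ENA-backed instance would report.
ethtool_output = '''driver: ena
version: 2.0.2
firmware-version:
bus-info: 0000:00:05.0'''

# MULTILINE makes ^/$ match per line, so only the 'driver:' line matches.
match = re.search(r'^driver: (\S+)$', ethtool_output, flags=re.MULTILINE)
driver = match.group(1)
print(driver)
```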


@@ -28,6 +28,8 @@ if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_systemd():
systemd_unit('scylla-fstrim.timer').unmask()
if is_redhat_variant():
systemd_unit('fstrim.timer').disable()
if dist_name() == 'Ubuntu' and os.path.exists('/etc/cron.weekly/fstrim'):


@@ -1,122 +0,0 @@
#
# Copyright (C) 2016 ScyllaDB
is_debian_variant() {
[ -f /etc/debian_version ]
}
is_redhat_variant() {
[ -f /etc/redhat-release ]
}
is_gentoo_variant() {
[ -f /etc/gentoo-release ]
}
is_systemd() {
grep -q '^systemd$' /proc/1/comm
}
is_ec2() {
[ -f /sys/hypervisor/uuid ] && [ "$(head -c 3 /sys/hypervisor/uuid)" = "ec2" ]
}
is_selinux_enabled() {
STATUS=`getenforce`
if [ "$STATUS" = "Disabled" ]; then
return 0
else
return 1
fi
}
ec2_is_supported_instance_type() {
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
case $TYPE in
"i2"|"i3") echo 1;;
*) echo 0;;
esac
}
verify_args() {
if [ -z "$2" ] || [[ "$2" =~ ^--+ ]]; then
echo "Requires more parameter for $1."
print_usage
exit 1
fi
}
#
# get_mode_cpu_set <mode name, e.g. 'mq', 'sq', 'sq_split'>
#
get_mode_cpu_set() {
local mode=$1
local mode_cpu_mask=`/usr/lib/scylla/perftune.py --tune net --nic "$nic" --mode "$mode" --get-cpu-mask` 2>&-
# If the given mode is not supported - return invalid CPU set
if [[ "$?" -ne "0" ]]; then
echo "-1"
else
echo "$mode_cpu_mask" | /usr/lib/scylla/hex2list.py
fi
}
#
# check_cpuset_conf <NIC name>
#
get_tune_mode() {
local nic=$1
# if cpuset.conf doesn't exist use the default mode
[[ ! -e '/etc/scylla.d/cpuset.conf' ]] && return
local cur_cpuset=`cat /etc/scylla.d/cpuset.conf | cut -d "\"" -f2- | cut -d" " -f2`
local mq_cpuset=`get_mode_cpu_set 'mq'`
local sq_cpuset=`get_mode_cpu_set 'sq'`
local sq_split_cpuset=`get_mode_cpu_set 'sq_split'`
local tune_mode=""
case "$cur_cpuset" in
"$mq_cpuset")
tune_mode="--mode mq"
;;
"$sq_cpuset")
tune_mode="--mode sq"
;;
"$sq_split_cpuset")
tune_mode="--mode sq_split"
;;
esac
# if cpuset is something different from what we expect - use the default mode
echo "$tune_mode"
}
#
# create_perftune_conf [<NIC name>]
#
create_perftune_conf() {
local nic=$1
[[ -z "$nic" ]] && nic='eth0'
# if exists - do nothing
[[ -e '/etc/scylla.d/perftune.yaml' ]] && return
local mode=`get_tune_mode "$nic"`
/usr/lib/scylla/perftune.py --tune net --nic "$nic" $mode --dump-options-file > /etc/scylla.d/perftune.yaml
}
. /etc/os-release
if is_debian_variant || is_gentoo_variant; then
SYSCONFIG=/etc/default
else
SYSCONFIG=/etc/sysconfig
fi
. $SYSCONFIG/scylla-server
for i in /etc/scylla.d/*.conf; do
if [ "$i" = "/etc/scylla.d/*.conf" ]; then
break
fi
. "$i"
done
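The removed `scylla_lib.sh` helpers (such as `is_systemd`, which greps `/proc/1/comm`) presumably have Python counterparts in `scylla_util`, which this diff does not show. A hypothetical reimplementation of the `is_systemd` check, with the path parameterized so it can be exercised off a systemd host:

```python
def is_systemd(comm_path='/proc/1/comm'):
    # Python counterpart of the removed shell helper, which ran
    # `grep -q '^systemd$' /proc/1/comm`: PID 1 is named 'systemd' on
    # systemd hosts. Hypothetical sketch; the real scylla_util version
    # is not shown in this diff.
    try:
        with open(comm_path) as f:
            return f.read().strip() == 'systemd'
    except OSError:
        return False
```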


@@ -49,7 +49,8 @@ if __name__ == '__main__':
if is_systemd():
ntp = systemd_unit('ntp.service')
ntp.stop()
run('ntpdate ntp.ubuntu.com')
# ignore error, ntpd may be able to adjust the clock later
run('ntpdate ntp.ubuntu.com', exception=False)
ntp.start()
else:
run('service ntp stop')
@@ -70,7 +71,8 @@ if __name__ == '__main__':
sntpd.start()
else:
run('rc-service ntpd stop', exception=False)
run('ntpdate {}'.format(server))
# ignore error, ntpd may be able to adjust the clock later
run('ntpdate {}'.format(server), exception=False)
run('rc-update add ntpd default')
run('rc-service ntpd start')
@@ -87,6 +89,7 @@ if __name__ == '__main__':
server = match.group(1)
ntpd = systemd_unit('ntpd.service')
ntpd.stop()
run('ntpdate {}'.format(server))
# ignore error, ntpd may be able to adjust the clock later
run('ntpdate {}'.format(server), exception=False)
ntpd.enable()
ntpd.start()


@@ -1,33 +1,71 @@
#!/bin/bash -e
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
. /usr/lib/scylla/scylla_lib.sh
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
if [ "$AMI" = "yes" ] && [ -f /etc/scylla/ami_disabled ]; then
rm /etc/scylla/ami_disabled
exit 1
fi
import os
import sys
import glob
from scylla_util import *
if [ "$NETWORK_MODE" = "virtio" ]; then
ip tuntap del mode tap dev $TAP
ip tuntap add mode tap dev $TAP user $USER one_queue vnet_hdr
ip link set dev $TAP up
ip link set dev $TAP master $BRIDGE
chown $USER.$GROUP /dev/vhost-net
elif [ "$NETWORK_MODE" = "dpdk" ]; then
modprobe uio
modprobe uio_pci_generic
/usr/lib/scylla/dpdk-devbind.py --force --bind=uio_pci_generic $ETHPCIID
for n in /sys/devices/system/node/node?; do
echo $NR_HUGEPAGES > $n/hugepages/hugepages-2048kB/nr_hugepages
done
if [ "$ID" = "ubuntu" ]; then
hugeadm --create-mounts
fi
else # NETWORK_MODE = posix
if [ "$SET_NIC" = "yes" ]; then
create_perftune_conf "$IFNAME"
/usr/lib/scylla/posix_net_conf.sh $IFNAME --options-file /etc/scylla.d/perftune.yaml
fi
fi
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_redhat_variant():
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
else:
cfg = sysconfig_parser('/etc/default/scylla-server')
ami = cfg.get('AMI')
mode = cfg.get('NETWORK_MODE')
/usr/lib/scylla/scylla-blocktune
if ami == 'yes' and os.path.exists('/etc/scylla/ami_disabled'):
os.remove('/etc/scylla/ami_disabled')
sys.exit(1)
if mode == 'virtio':
tap = cfg.get('TAP')
user = cfg.get('USER')
group = cfg.get('GROUP')
bridge = cfg.get('BRIDGE')
run('ip tuntap del mode tap dev {TAP}'.format(TAP=tap))
run('ip tuntap add mode tap dev {TAP} user {USER} one_queue vnet_hdr'.format(TAP=tap, USER=user))
run('ip link set dev {TAP} up'.format(TAP=tap))
run('ip link set dev {TAP} master {BRIDGE}'.format(TAP=tap, BRIDGE=bridge))
run('chown {USER}.{GROUP} /dev/vhost-net'.format(USER=user, GROUP=group))
elif mode == 'dpdk':
ethpciid = cfg.get('ETHPCIID')
nr_hugepages = cfg.get('NR_HUGEPAGES')
run('modprobe uio')
run('modprobe uio_pci_generic')
run('/usr/lib/scylla/dpdk-devbind.py --force --bind=uio_pci_generic {ETHPCIID}'.format(ETHPCIID=ethpciid))
for n in glob.glob('/sys/devices/system/node/node?'):
with open('{n}/hugepages/hugepages-2048kB/nr_hugepages'.format(n=n), 'w') as f:
f.write(nr_hugepages)
if dist_name() == 'Ubuntu':
run('hugeadm --create-mounts')
fi
else:
set_nic = cfg.get('SET_NIC')
ifname = cfg.get('IFNAME')
if set_nic == 'yes':
create_perftune_conf(ifname)
run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
run('/usr/lib/scylla/scylla-blocktune')
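`sysconfig_parser` from `scylla_util` is not shown in this diff; given that `scylla_util.py` imports `configparser`, one plausible sketch wraps a `KEY="value"` sysconfig file in a dummy section header so `configparser` can read it. Hypothetical, for illustration only (here it takes the text directly rather than a filename):

```python
import configparser

class sysconfig_parser:
    """Hypothetical minimal reader for sysconfig-style KEY="value" files:
    prepend a dummy [global] section so configparser accepts the format.
    The real scylla_util class is not shown in this diff and may differ."""
    def __init__(self, text):
        cfg = configparser.ConfigParser()
        cfg.optionxform = str  # sysconfig keys are case-sensitive
        cfg.read_string('[global]\n' + text)
        self._cfg = cfg

    def get(self, key):
        # Values are conventionally quoted in sysconfig files.
        return self._cfg.get('global', key).strip('"')

cfg = sysconfig_parser('NETWORK_MODE="posix"\nTAP="tap0"\n')
print(cfg.get('NETWORK_MODE'))
```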


@@ -23,6 +23,8 @@ import os
import argparse
import pwd
import grp
import sys
import stat
from scylla_util import *
if __name__ == '__main__':
@@ -40,6 +42,8 @@ if __name__ == '__main__':
help='specify the root of the tree')
parser.add_argument('--volume-role', default='all',
help='specify how will this device be used (data, commitlog, or all)')
parser.add_argument('--force-raid', action='store_true', default=False,
help='force constructing RAID when only one disk is specified')
args = parser.parse_args()
@@ -60,6 +64,12 @@ if __name__ == '__main__':
if not os.path.exists(disk):
print('{} is not found'.format(disk))
sys.exit(1)
if not stat.S_ISBLK(os.stat(disk).st_mode):
print('{} is not a block device'.format(disk))
sys.exit(1)
if not is_unused_disk(disk):
print('{} is busy'.format(disk))
sys.exit(1)
if os.path.exists(args.raiddev):
print('{} is already in use'.format(args.raiddev))
@@ -74,12 +84,20 @@ if __name__ == '__main__':
elif is_gentoo_variant():
run('emerge -uq sys-fs/mdadm sys-fs/xfsprogs')
print('Creating RAID0 for scylla using {nr_disk} disk(s): {disks}'.format(nr_disk=len(disks), disks=args.disks))
if len(disks) == 1 and not args.force_raid:
raid = False
fsdev = disks[0]
else:
raid = True
fsdev = args.raiddev
print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='RAID0' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
if dist_name() == 'Ubuntu' and dist_ver() == '14.04':
run('udevadm settle')
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=args.raiddev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
run('udevadm settle')
run('mkfs.xfs {} -f'.format(args.raiddev))
if raid:
run('udevadm settle')
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
run('udevadm settle')
run('mkfs.xfs {} -f'.format(fsdev))
else:
procs=[]
for disk in disks:
@@ -93,22 +111,24 @@ if __name__ == '__main__':
procs.append(proc)
for proc in procs:
proc.wait()
run('udevadm settle')
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=args.raiddev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
run('udevadm settle')
run('mkfs.xfs {} -f -K'.format(args.raiddev))
if raid:
run('udevadm settle')
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
run('udevadm settle')
run('mkfs.xfs {} -f -K'.format(fsdev))
if is_debian_variant():
confpath = '/etc/mdadm/mdadm.conf'
else:
confpath = '/etc/mdadm.conf'
res = out('mdadm --detail --scan')
with open(confpath, 'w') as f:
f.write(res)
if raid:
res = out('mdadm --detail --scan')
with open(confpath, 'w') as f:
f.write(res)
makedirs(mount_at)
run('mount -t xfs -o noatime {raid} "{mount_at}"'.format(raid=args.raiddev, mount_at=mount_at))
run('mount -t xfs -o noatime {raid} "{mount_at}"'.format(raid=fsdev, mount_at=mount_at))
makedirs('{}/data'.format(root))
makedirs('{}/commitlog'.format(root))
@@ -122,11 +142,19 @@ if __name__ == '__main__':
os.chown('{}/coredump'.format(root), uid, gid)
if args.update_fstab:
res = out('blkid {}'.format(args.raiddev))
res = out('blkid {}'.format(fsdev))
match = re.search(r'^/dev/\S+: (UUID="\S+")', res.strip())
uuid = match.group(1)
with open('/etc/fstab', 'a') as f:
f.write('{uuid} {mount_at} xfs noatime 0 0\n'.format(uuid=uuid, mount_at=mount_at))
f.write('{uuid} {mount_at} xfs noatime,nofail 0 0\n'.format(uuid=uuid, mount_at=mount_at))
mounts_conf = '/etc/systemd/system/scylla-server.service.d/mounts.conf'
if not os.path.exists(mounts_conf):
makedirs('/etc/systemd/system/scylla-server.service.d/')
with open(mounts_conf, 'w') as f:
f.write('[Unit]\nRequiresMountsFor={mount_at}\n'.format(mount_at=mount_at))
else:
with open(mounts_conf, 'a') as f:
f.write('RequiresMountsFor={mount_at}\n'.format(mount_at=mount_at))
if is_debian_variant():
run('update-initramfs -u')
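The new systemd drop-in logic above writes a `[Unit]` header only when `mounts.conf` is first created, and appends bare `RequiresMountsFor=` lines on later calls. A runnable sketch of that create-or-append pattern, exercised in a temporary directory instead of `/etc/systemd/system`:

```python
import os
import tempfile

def add_requires_mounts_for(mounts_conf, mount_at):
    # First call creates the drop-in with a [Unit] header; subsequent
    # calls append additional RequiresMountsFor= lines (mirrors the diff).
    if not os.path.exists(mounts_conf):
        os.makedirs(os.path.dirname(mounts_conf), exist_ok=True)
        with open(mounts_conf, 'w') as f:
            f.write('[Unit]\nRequiresMountsFor={}\n'.format(mount_at))
    else:
        with open(mounts_conf, 'a') as f:
            f.write('RequiresMountsFor={}\n'.format(mount_at))

conf = os.path.join(tempfile.mkdtemp(),
                    'scylla-server.service.d', 'mounts.conf')
add_requires_mounts_for(conf, '/var/lib/scylla')
add_requires_mounts_for(conf, '/var/lib/scylla/coredump')
print(open(conf).read(), end='')
```

Together with the `nofail` mount option added to `/etc/fstab`, this keeps scylla-server from starting before its data mount is available, without blocking boot if the disk is absent.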


@@ -22,7 +22,6 @@
import os
import sys
import argparse
import logging
import glob
import shutil
import io
@@ -49,11 +48,28 @@ def interactive_ask_service(msg1, msg2, default = None):
elif ans == 'no' or ans =='n':
return False
def interactive_choose_nic():
nics = [os.path.basename(n) for n in glob.glob('/sys/class/net/*') if n != '/sys/class/net/lo']
if len(nics) == 0:
print('A NIC was not found.')
sys.exit(1)
elif len(nics) == 1:
return nics[0]
else:
print('Please select a NIC from the following list:')
while True:
print(nics)
n = input('> ')
if is_valid_nic(n):
return n
def do_verify_package(pkg):
if is_debian_variant():
res = run('dpkg -s {}'.format(pkg), silent=True, exception=False)
elif is_redhat_variant():
res = run('rpm -q {}'.format(pkg), silent=True, exception=False)
elif is_gentoo_variant():
res = 1 if len(glob.glob('/var/db/pkg/*/{}-*'.format(pkg))) else 0
if res != 0:
print('{} package is not installed.'.format(pkg))
sys.exit(1)
@@ -67,22 +83,18 @@ def list_block_devices():
devices = []
for p in ['/dev/sd*', '/dev/hd*', '/dev/xvd*', '/dev/nvme*', '/dev/mapper/*']:
devices.extend([d for d in glob.glob(p) if d != '/dev/mapper/control'])
return devices
return devices
def get_unused_disks():
unused = []
for dev in list_block_devices():
with open('/proc/mounts') as f:
s = f.read().strip()
count_raw = len(re.findall('^{} '.format(dev), s, flags=re.MULTILINE))
count_pvs = 0
if shutil.which('pvs'):
s = out('pvs -o pv_name --nohead')
count_pvs = len(re.findall(dev, s, flags=re.MULTILINE))
s = out('swapon --show=NAME --noheadings')
count_swap = len(re.findall(dev, s, flags=re.MULTILINE))
if count_raw + count_pvs + count_swap == 0:
unused.append(dev)
# dev contains partitions
if len(glob.glob('/sys/class/block/{dev}/{dev}*'.format(dev=dev.replace('/dev/','')))) > 0:
continue
# dev is used
if not is_unused_disk(dev):
continue
unused.append(dev)
return unused
def run_setup_script(name, script):
@@ -90,7 +102,7 @@ def run_setup_script(name, script):
res = run(script, exception=False)
if res != 0:
if interactive:
print('{red}{name} setup failed. Press any key to continue...{no_color}'.format(red=concolor.BOLD_RED, name=name, no_color=concolor.NO_COLOR))
colorprint('{red}{name} setup failed. Press any key to continue...{nocolor}', name=name)
input()
else:
print('{} setup failed.'.format(name))
@@ -99,12 +111,12 @@ def run_setup_script(name, script):
if __name__ == '__main__':
if os.getuid() > 0:
logging.error('Requires root permission.')
print('Requires root permission.')
sys.exit(1)
parser = argparse.ArgumentParser(description='Configure environment for Scylla.')
parser.add_argument('--disks',
help='specify disks for RAID')
parser.add_argument('--nic',
parser.add_argument('--nic', default='eth0',
help='specify NIC')
parser.add_argument('--ntp-domain',
help='specify NTP domain')
@@ -115,7 +127,7 @@ if __name__ == '__main__':
parser.add_argument('--developer-mode', action='store_true', default=False,
help='enable developer mode')
parser.add_argument('--no-ec2-check', action='store_true', default=False,
help='skip EC2 configuration check(only on EC2)')
help='skip EC2 configuration check')
parser.add_argument('--no-kernel-check', action='store_true', default=False,
help='skip kernel version check')
parser.add_argument('--no-verify-package', action='store_true', default=False,
@@ -150,12 +162,14 @@ if __name__ == '__main__':
if len(sys.argv) == 1:
interactive = True
if not interactive and not args.no_raid_setup and not args.disks:
parser.print_help()
sys.exit(1)
if not interactive and not args.no_sysconfig_setup and not args.nic:
parser.print_help()
sys.exit(1)
if not interactive:
if not args.no_raid_setup and not args.disks:
parser.print_help()
sys.exit(1)
if not args.no_sysconfig_setup or (is_ec2() and not args.no_ec2_check):
if not is_valid_nic(args.nic):
print('NIC {} doesn\'t exist.'.format(args.nic))
sys.exit(1)
disks = args.disks
nic = args.nic
@@ -178,13 +192,16 @@ if __name__ == '__main__':
fstrim_setup = not args.no_fstrim_setup
selinux_reboot_required = False
print('{green}Skip any of the following steps by answering \'no\'{no_color}'.format(green=concolor.GREEN, no_color=concolor.NO_COLOR))
if interactive:
colorprint('{green}Skip any of the following steps by answering \'no\'{nocolor}')
if is_ec2():
if interactive:
ec2_check = interactive_ask_service('Do you want to run Amazon EC2 configuration check?', 'Yes - runs a script to verify that this instance is optimized for running Scylls. No - skips the configuration check.', 'yes')
ec2_check = interactive_ask_service('Do you want to run Amazon EC2 configuration check?', 'Yes - runs a script to verify that this instance is optimized for running Scylla. No - skips the configuration check.', 'yes')
if ec2_check:
nic = interactive_choose_nic()
if ec2_check:
run('/usr/lib/scylla/scylla_ec2_check')
run('/usr/lib/scylla/scylla_ec2_check --nic {}'.format(nic))
if interactive:
kernel_check = interactive_ask_service('Do you want to run check your kernel version?', 'Yes - runs a script to verify that the kernel for this instance qualifies to run Scylla. No - skips the kernel check.', 'yes')
@@ -202,11 +219,10 @@ if __name__ == '__main__':
if enable_service:
if is_systemd():
systemd_unit('scylla-server.service').enable()
systemd_unit('scylla-fstrim.timer').unmask()
elif is_gentoo_variant():
run('rc-update add scylla-server default')
if interactive:
if interactive and not os.path.exists('/etc/scylla.d/housekeeping.cfg'):
version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', 'yes')
if version_check:
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
@@ -277,10 +293,14 @@ if __name__ == '__main__':
else:
print('Please select unmounted disks from the following list: {}'.format(devices))
selected = []
while len(devices):
dsklist = []
while True:
print('type \'cancel\' to cancel RAID/XFS setup.')
print('type \'done\' to finish selection. Selected: {}'.format(selected))
dsk = input('> ')
if len(dsklist) > 0:
dsk = dsklist.pop(0)
else:
dsk = input('> ')
if dsk == 'cancel':
raid_setup = 0
break
@@ -290,12 +310,16 @@ if __name__ == '__main__':
break
if dsk == '':
continue
if dsk.find(',') > 0:
dsklist = dsk.split(',')
continue
if not os.path.exists(dsk):
print('{} not found'.format(dsk))
continue
if not stat.S_ISBLK(os.stat(dsk).st_mode):
print('{} is not a block device'.format(dsk))
selected += dsk
continue
selected.append(dsk)
devices.remove(dsk)
disks = ','.join(selected)
if raid_setup:
@@ -312,21 +336,9 @@ if __name__ == '__main__':
if interactive:
sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
if sysconfig_setup:
nics = [os.path.basename(n) for n in glob.glob('/sys/class/net/*') if n != '/sys/class/net/lo']
if len(nics) == 0:
print('A NIC was not found.')
sys.exit(1)
elif len(nics) == 1:
nic=nics[0]
else:
print('Please select a NIC from the following list:')
while True:
print(nics)
n = input('> ')
if os.path.exists('/sys/class/net/{}'.format(n)):
nic = n
break
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
nic = interactive_choose_nic()
if interactive:
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
if sysconfig_setup:
setup_args = '--setup-nic' if set_nic else ''
run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))


@@ -1,10 +1,40 @@
#!/bin/bash -e
#!/usr/bin/python3
#
# Copyright 2018 ScyllaDB
#
. /usr/lib/scylla/scylla_lib.sh
#
# This file is part of Scylla.
#
# Scylla is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Scylla is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
if [ "$NETWORK_MODE" = "virtio" ]; then
ip tuntap del mode tap dev $TAP
elif [ "$NETWORK_MODE" = "dpdk" ]; then
/usr/lib/scylla/dpdk-devbind.py -u $ETHPCIID
/usr/lib/scylla/dpdk-devbind.py -b $ETHDRV $ETHPCIID
fi
import os
import sys
from scylla_util import *
if __name__ == '__main__':
if os.getuid() > 0:
print('Requires root permission.')
sys.exit(1)
if is_redhat_variant():
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
else:
cfg = sysconfig_parser('/etc/default/scylla-server')
if cfg.get('NETWORK_MODE') == 'virtio':
run('ip tuntap del mode tap dev {TAP}'.format(TAP=cfg.get('TAP')))
elif cfg.get('NETWORK_MODE') == 'dpdk':
run('/usr/lib/scylla/dpdk-devbind.py -u {ETHPCIID}'.format(ETHPCIID=cfg.get('ETHPCIID')))
run('/usr/lib/scylla/dpdk-devbind.py -b {ETHDRV} {ETHPCIID}'.format(ETHDRV=cfg.get('ETHDRV'), ETHPCIID=cfg.get('ETHPCIID')))


@@ -64,6 +64,10 @@ if __name__ == '__main__':
help='AMI instance mode')
args = parser.parse_args()
if args.nic and not is_valid_nic(args.nic):
print('NIC {} not found.'.format(args.nic))
sys.exit(1)
ifname = args.nic if args.nic else cfg.get('IFNAME')
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')


@@ -27,14 +27,19 @@ import platform
import configparser
import io
import shlex
import shutil
def curl(url):
def curl(url, byte=False):
max_retries = 5
retries = 0
while True:
try:
req = urllib.request.Request(url)
return urllib.request.urlopen(req).read().decode('utf-8')
with urllib.request.urlopen(req) as res:
if byte:
return res.read()
else:
return res.read().decode('utf-8')
except urllib.error.HTTPError:
logging.warn("Failed to grab %s..." % url)
time.sleep(5)
@@ -79,6 +84,10 @@ class aws_instance:
continue
self._disks[t] += [ self.__xenify(dev) ]
def __mac_address(self, nic='eth0'):
with open('/sys/class/net/{}/address'.format(nic)) as f:
return f.read().strip()
def __init__(self):
self._type = self.__instance_metadata("instance-type")
self.__populate_disks()
@@ -95,6 +104,25 @@ class aws_instance:
"""Returns the class of the instance we are running in. i.e.: i3"""
return self._type.split(".")[0]
def is_supported_instance_class(self):
if self.instance_class() in ['i2', 'i3']:
return True
return False
def get_en_interface_type(self):
instance_class = self.instance_class()
instance_size = self.instance_size()
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
return 'ena'
if instance_class == 'm4':
if instance_size == '16xlarge':
return 'ena'
else:
return 'ixgbevf'
return None
def disks(self):
"""Returns all disks in the system, as visible from the AWS registry"""
disks = set()
@@ -133,6 +161,11 @@ class aws_instance:
"""Returns the private IPv4 address of this instance"""
return self.__instance_metadata("local-ipv4")
def is_vpc_enabled(self, nic='eth0'):
mac = self.__mac_address(nic)
mac_stat = self.__instance_metadata('network/interfaces/macs/{}'.format(mac))
return True if re.search(r'^vpc-id$', mac_stat, flags=re.MULTILINE) else False
## Regular expression helpers
# non-advancing comment matcher
@@ -222,37 +255,24 @@ class scylla_cpuinfo:
return len(self._cpu_data["system"])
def run(cmd, shell=False, silent=False, exception=True):
stdout=None
stderr=None
if silent:
stdout=subprocess.DEVNULL
stderr=subprocess.DEVNULL
if shell:
if exception:
return subprocess.check_call(cmd, shell=True, stdout=stdout, stderr=stderr)
else:
p = subprocess.Popen(cmd, shell=True, stdout=stdout, stderr=stderr)
return p.wait()
stdout=subprocess.DEVNULL if silent else None
stderr=subprocess.DEVNULL if silent else None
if not shell:
cmd = shlex.split(cmd)
if exception:
return subprocess.check_call(cmd, shell=shell, stdout=stdout, stderr=stderr)
else:
if exception:
return subprocess.check_call(shlex.split(cmd), stdout=stdout, stderr=stderr)
else:
p = subprocess.Popen(shlex.split(cmd), stdout=stdout, stderr=stderr)
return p.wait()
p = subprocess.Popen(cmd, shell=shell, stdout=stdout, stderr=stderr)
return p.wait()
def out(cmd, shell=False, exception=True):
if shell:
if exception:
return subprocess.check_output(cmd, shell=True).strip().decode('utf-8')
else:
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
if not shell:
cmd = shlex.split(cmd)
if exception:
return subprocess.check_output(cmd, shell=shell).strip().decode('utf-8')
else:
if exception:
return subprocess.check_output(shlex.split(cmd)).strip().decode('utf-8')
else:
p = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
p = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE)
return p.communicate()[0].strip().decode('utf-8')
def is_debian_variant():
return os.path.exists('/etc/debian_version')
@@ -306,17 +326,82 @@ def makedirs(name):
if not os.path.isdir(name):
os.makedirs(name)
def rmtree(path):
if not os.path.islink(path):
shutil.rmtree(path)
else:
os.remove(path)
def dist_name():
return platform.dist()[0]
def dist_ver():
return platform.dist()[1]
def is_unused_disk(dev):
# dev is not in /sys/class/block/, like /dev/nvme[0-9]+
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/',''))):
return False
try:
fd = os.open(dev, os.O_EXCL)
os.close(fd)
return True
except OSError:
return False
CONCOLORS = {'green':'\033[1;32m', 'red':'\033[1;31m', 'nocolor':'\033[0m'}
def colorprint(msg, **kwargs):
fmt = dict(CONCOLORS)
fmt.update(kwargs)
print(msg.format(**fmt))
def get_mode_cpuset(nic, mode):
try:
mode_cpu_mask=out('/usr/lib/scylla/perftune.py --tune net --nic "{nic}" --mode "{mode}" --get-cpu-mask'.format(nic=nic, mode=mode))
return hex2list(mode_cpu_mask)
except subprocess.CalledProcessError:
return '-1'
def get_cur_cpuset():
cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
cpuset=cfg.get('CPUSET')
return re.sub(r'^--cpuset (.+)$', r'\1', cpuset).strip()
def get_tune_mode(nic):
if not os.path.exists('/etc/scylla.d/cpuset.conf'):
return
cur_cpuset=get_cur_cpuset()
mq_cpuset=get_mode_cpuset(nic, 'mq')
sq_cpuset=get_mode_cpuset(nic, 'sq')
sq_split_cpuset=get_mode_cpuset(nic, 'sq_split')
if cur_cpuset == mq_cpuset:
return 'mq'
elif cur_cpuset == sq_cpuset:
return 'sq'
elif cur_cpuset == sq_split_cpuset:
return 'sq_split'
def create_perftune_conf(nic='eth0'):
if os.path.exists('/etc/scylla.d/perftune.yaml'):
return
mode=get_tune_mode(nic)
yaml=out('/usr/lib/scylla/perftune.py --tune net --nic "{nic}" --mode {mode} --dump-options-file'.format(nic=nic, mode=mode))
with open('/etc/scylla.d/perftune.yaml', 'w') as f:
f.write(yaml)
def is_valid_nic(nic):
return os.path.exists('/sys/class/net/{}'.format(nic))
class SystemdException(Exception):
pass
class systemd_unit:
def __init__(self, unit):
try:
run('systemctl cat {}'.format(unit), silent=True)
except subprocess.CalledProcessError:
raise SystemdException('unit {} not found'.format(unit))
self._unit = unit
def start(self):
@@ -336,8 +421,7 @@ class systemd_unit:
return run('systemctl disable {}'.format(self._unit))
def is_active(self):
res = out('systemctl is-active {}'.format(self._unit), exception=False)
return True if re.match(r'^active', res, flags=re.MULTILINE) else False
return out('systemctl is-active {}'.format(self._unit), exception=False)
def mask(self):
return run('systemctl mask {}'.format(self._unit))
@@ -368,7 +452,7 @@ class sysconfig_parser:
self.__load()
def get(self, key):
return self._cfg.get('global', key)
return self._cfg.get('global', key).strip('"')
def set(self, key, val):
if not self._cfg.has_option('global', key):
@@ -379,9 +463,3 @@ class sysconfig_parser:
def commit(self):
with open(self._filename, 'w') as f:
f.write(self._data)
class concolor:
GREEN = '\033[0;32m'
RED = '\033[0;31m'
BOLD_RED = '\033[1;31m'
NO_COLOR = '\033[0m'


@@ -10,7 +10,7 @@ Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode d
{{/debian}}
{{#redhat}}
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode d
{{/redhat}}
[Install]


@@ -6,7 +6,7 @@ After=network.target
Type=simple
User=scylla
Group=scylla
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
[Install]
WantedBy=multi-user.target


@@ -51,6 +51,18 @@ is_redhat_variant() {
is_debian_variant() {
[ -f /etc/debian_version ]
}
is_debian() {
case "$1" in
jessie|stretch) return 0;;
*) return 1;;
esac
}
is_ubuntu() {
case "$1" in
trusty|xenial|bionic) return 0;;
*) return 1;;
esac
}
pkg_install() {
@@ -99,11 +111,14 @@ if [ ! -f /usr/bin/dh_testdir ]; then
fi
if [ ! -f /usr/bin/pystache ]; then
if is_redhat_variant; then
sudo yum install -y python2-pystache || sudo yum install -y pystache
sudo yum install -y /usr/bin/pystache
elif is_debian_variant; then
sudo apt-get install -y python-pystache
fi
fi
if is_debian_variant && [ ! -f /usr/share/doc/python-pkg-resources/copyright ]; then
sudo apt-get install -y python-pkg-resources
fi
if [ -z "$TARGET" ]; then
if is_debian_variant; then
@@ -125,12 +140,12 @@ echo $VERSION > version
cp -a dist/debian/debian debian
cp dist/common/sysconfig/scylla-server debian/scylla-server.default
if [ "$TARGET" = "jessie" ] || [ "$TARGET" = "stretch" ]; then
REVISION="1~$TARGET"
elif [ "$TARGET" = "trusty" ]; then
if [ "$TARGET" = "trusty" ]; then
cp dist/debian/scylla-server.cron.d debian/
REVISION="0ubuntu1~$TARGET"
elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "bionic" ]; then
fi
if is_debian $TARGET; then
REVISION="1~$TARGET"
elif is_ubuntu $TARGET; then
REVISION="0ubuntu1~$TARGET"
else
echo "Unknown distribution: $TARGET"
@@ -145,8 +160,8 @@ chmod a+rx debian/rules
if [ "$TARGET" != "trusty" ]; then
pystache dist/common/systemd/scylla-server.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.service
pystache dist/common/systemd/scylla-housekeeping-daily.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-housekeeping-daily.service
pystache dist/common/systemd/scylla-housekeeping-restart.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-housekeeping-restart.service
pystache dist/common/systemd/scylla-housekeeping-daily.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.scylla-housekeeping-daily.service
pystache dist/common/systemd/scylla-housekeeping-restart.service.mustache "{ $MUSTACHE_DIST }" > debian/scylla-server.scylla-housekeeping-restart.service
cp dist/common/systemd/scylla-fstrim.service debian/scylla-server.scylla-fstrim.service
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
fi


@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
ADD docker-entrypoint.py /docker-entrypoint.py
# Install Scylla:
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-2.3.repo -o /etc/yum.repos.d/scylla.repo && \
yum -y install epel-release && \
yum -y clean expire-cache && \
yum -y update && \


@@ -1,7 +1,13 @@
#!/bin/bash
. /usr/lib/scylla/scylla_prepare
/usr/lib/scylla/scylla_prepare
. /etc/sysconfig/scylla-server
export SCYLLA_HOME SCYLLA_CONF
for f in /etc/scylla.d/*.conf; do
. "$f"
done
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS


@@ -98,12 +98,19 @@ rm -f version
pystache dist/redhat/scylla.spec.mustache "{ \"version\": \"$SCYLLA_VERSION\", \"release\": \"$SCYLLA_RELEASE\", \"housekeeping\": $DIST }" > build/scylla.spec
# mock generates files owned by root, fix this up
fix_ownership() {
sudo chown "$(id -u):$(id -g)" -R "$@"
}
if [ $JOBS -gt 0 ]; then
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
fi
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
fix_ownership build/srpms
if [[ "$TARGET" =~ ^epel-7- ]]; then
TARGET=scylla-$TARGET
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
fi
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
fix_ownership build/rpms


@@ -201,7 +201,6 @@ rm -rf $RPM_BUILD_ROOT
%{_prefix}/lib/scylla/api/api-doc/*
%{_prefix}/lib/scylla/scyllatop/*
%{_prefix}/lib/scylla/scylla_config_get.py
%{_prefix}/lib/scylla/scylla_lib.sh
%{_prefix}/lib/scylla/scylla_util.py
%if 0%{?fedora} >= 27
%{_prefix}/lib/scylla/scylla-gdb.py


@@ -0,0 +1,82 @@
Protocol extensions to the Cassandra Native Protocol
====================================================
This document specifies extensions to the protocol defined
by Cassandra's native_protocol_v4.spec and native_protocol_v5.spec.
The extensions are designed so that a driver supporting them can
continue to interoperate with Cassandra and other compatible servers
with no configuration needed; the driver can discover the extensions
and enable them conditionally.
An extension can be discovered by using the OPTIONS request; the
returned SUPPORTED response will have zero or more options beginning
with SCYLLA indicating extensions defined in this document, in
addition to options documented by Cassandra. How to use the extension
is further explained in this document.
# Intranode sharding
This extension allows the driver to discover how Scylla internally
partitions data among logical cores. It can then create at least
one connection per logical core, and send queries directly to the
logical core that will serve them, greatly improving load balancing
and efficiency.
To use the extension, send the OPTIONS message. The data is returned
in the SUPPORTED message, as a set of key/value options. Numeric values
are returned as their base-10 ASCII representation.
The keys and values are:
- `SCYLLA_SHARD` is an integer, the zero-based shard number this connection
is connected to (for example, `3`).
- `SCYLLA_NR_SHARDS` is an integer containing the number of shards on this
node (for example, `12`). All shard numbers are smaller than this number.
- `SCYLLA_PARTITIONER` is the fully-qualified name of the partitioner in use (e.g.
`org.apache.cassandra.partitioners.Murmur3Partitioner`).
- `SCYLLA_SHARDING_ALGORITHM` is the name of an algorithm used to select how
partitions are mapped into shards (described below)
- `SCYLLA_SHARDING_IGNORE_MSB` is an integer parameter to the algorithm (also
described below)
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
`biased-token-round-robin`. To apply the algorithm,
perform the following steps (assuming infinite-precision arithmetic):
- subtract the minimum token value from the partition's token
in order to bias it: `biased_token = token - (-2**63)`
- shift `biased_token` left by `ignore_msb` bits, discarding any
bits beyond the 63rd:
`biased_token = (biased_token << SCYLLA_SHARDING_IGNORE_MSB) % (2**64)`
- multiply by `SCYLLA_NR_SHARDS` and perform a truncating division by 2**64:
`shard = (biased_token * SCYLLA_NR_SHARDS) / 2**64`
(this apparently convoluted algorithm replaces a slow division instruction with
a fast multiply instruction).
In C or C++ with 128-bit arithmetic support, these operations can be performed
efficiently in three steps:
```c++
uint64_t biased_token = token + ((uint64_t)1 << 63);
biased_token <<= ignore_msb;
int shard = ((unsigned __int128)biased_token * nr_shards) >> 64;
```
In languages without 128-bit arithmetic support, use the following (this example
is for Java):
```Java
private int scyllaShardOf(long token) {
token += Long.MIN_VALUE;
token <<= ignoreMsb;
long tokLo = token & 0xffffffffL;
long tokHi = (token >>> 32) & 0xffffffffL;
long mul1 = tokLo * nrShards;
long mul2 = tokHi * nrShards;
long sum = (mul1 >>> 32) + mul2;
return (int)(sum >>> 32);
}
```
It is recommended that drivers open connections until they have at
least one connection per shard, then close excess connections.
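As a cross-check, the algorithm above can be sketched in Python, where arbitrary-precision integers make the infinite-precision description directly executable (an illustrative sketch, not driver code; the function name is hypothetical):

```python
def scylla_shard_of(token: int, nr_shards: int, ignore_msb: int) -> int:
    """Map a signed 64-bit token to a shard number using the
    biased-token-round-robin algorithm described above."""
    # Bias the token into the unsigned range [0, 2**64).
    biased = (token + 2**63) % 2**64
    # Shift left by SCYLLA_SHARDING_IGNORE_MSB bits, discarding
    # bits beyond the 63rd.
    biased = (biased << ignore_msb) % 2**64
    # Multiply by SCYLLA_NR_SHARDS and truncating-divide by 2**64.
    return (biased * nr_shards) >> 64
```

Every token maps to a shard in `[0, nr_shards)`, and the result agrees with the C++ and Java variants above for the same inputs.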


@@ -129,26 +129,8 @@ public:
update_is_normal();
}
void apply_application_state(application_state key, versioned_value&& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = std::move(value);
}
update_is_normal();
}
void apply_application_state(application_state key, const versioned_value& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = value;
}
update_is_normal();
}
void apply_application_state(const endpoint_state& es) {
for (auto&& e : es._application_state) {
apply_application_state(e.first, e.second);
}
void add_application_state(const endpoint_state& es) {
_application_state = es._application_state;
update_is_normal();
}


@@ -923,7 +923,7 @@ void gossiper::make_random_gossip_digest(std::vector<gossip_digest>& g_digests)
future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].apply_application_state(es);
g.endpoint_state_map[ep].add_application_state(es);
}
});
}
@@ -932,7 +932,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
for (auto&& key : changed) {
g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
g.endpoint_state_map[ep].add_application_state(key, src.at(key));
}
}
});
@@ -941,7 +941,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].apply_application_state(key, value);
g.endpoint_state_map[ep].add_application_state(key, value);
}
});
}
@@ -1168,11 +1168,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
}
}
void gossiper::reset_endpoint_state_map() {
endpoint_state_map.clear();
future<> gossiper::reset_endpoint_state_map() {
_unreachable_endpoints.clear();
_live_endpoints.clear();
_live_endpoints_just_added.clear();
return container().invoke_on_all([] (gossiper& g) {
g.endpoint_state_map.clear();
});
}
std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1655,6 +1657,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
}
}
// Runs inside seastar::async context
void gossiper::add_saved_endpoint(inet_address ep) {
if (ep == get_broadcast_address()) {
logger.debug("Attempt to add self as saved endpoint");
@@ -1680,6 +1683,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
}
ep_state.mark_dead();
endpoint_state_map[ep] = ep_state;
replicate(ep, ep_state).get();
_unreachable_endpoints[ep] = now();
logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
}
@@ -1915,6 +1919,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
auto& ep_state = *es;
ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
replicate(endpoint, ep_state).get();
mark_dead(endpoint, ep_state);
get_local_failure_detector().force_conviction(endpoint);
}


@@ -418,7 +418,7 @@ public:
stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
// removes ALL endpoint states; should only be called after shadow gossip
void reset_endpoint_state_map();
future<> reset_endpoint_state_map();
std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();


@@ -84,6 +84,8 @@ template<typename Structure, typename CtxFactory>
GCC6_CONCEPT(requires ContextFactory<CtxFactory>)
class lsa_migrate_fn final : public migrate_fn_type, CtxFactory {
public:
using structure = Structure;
explicit lsa_migrate_fn(CtxFactory context_factory)
: migrate_fn_type(1)
, CtxFactory(std::move(context_factory))
@@ -201,8 +203,21 @@ public:
/// arguments are passed to `T::size_when_serialized`.
///
/// \return null pointer of type `uint8_t*`.
template<typename T, typename MigrateFn, typename... Args>
uint8_t* allocate(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate<T>(migrate_fn, std::forward<Args>(args)...);
}
template<typename T, typename MigrateFn, typename... Args>
auto allocate_nested(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate_nested<T>(migrate_fn, std::forward<Args>(args)...);
}
private:
template<typename T, typename... Args>
uint8_t* allocate(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
uint8_t* do_allocate(migrate_fn_type* migrate_fn, Args&&... args) noexcept {
auto size = T::size_when_serialized(std::forward<Args>(args)...);
_parent.request(size, migrate_fn);
@@ -216,7 +231,7 @@ public:
}
template<typename T, typename... Args>
auto allocate_nested(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto do_allocate_nested(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
auto n = _parent.request(0, migrate_fn);
return T::get_sizer(continuation(_parent, n),
std::forward<Args>(args)...);
@@ -244,15 +259,28 @@ public:
/// to the buffer requested in the sizing phase. Arguments are passed
/// to `T::serialize`.
/// \return pointer to the IMR object
template<typename T, typename MigrateFn, typename... Args>
uint8_t* allocate(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate<T>(migrate_fn, std::forward<Args>(args)...);
}
template<typename T, typename MigrateFn, typename... Args>
auto allocate_nested(MigrateFn* migrate_fn, Args&&... args) noexcept {
static_assert(std::is_same_v<typename MigrateFn::structure, T>);
return do_allocate_nested<T>(migrate_fn, std::forward<Args>(args)...);
}
private:
template<typename T, typename... Args>
uint8_t* allocate(migrate_fn_type* migrate_fn, Args&& ... args) noexcept {
uint8_t* do_allocate(migrate_fn_type* migrate_fn, Args&&... args) noexcept {
auto ptr = _parent.next_object();
T::serialize(ptr, std::forward<Args>(args)...);
return ptr;
}
template<typename T, typename... Args>
auto allocate_nested(migrate_fn_type*, Args&& ... args) noexcept {
auto do_allocate_nested(migrate_fn_type*, Args&& ... args) noexcept {
auto ptr = _parent.next_object();
return T::get_serializer(ptr,
continuation(ptr),


@@ -61,8 +61,12 @@ private:
public:
object_context(const uint8_t*, State... state) : _state { state... } { }
template<typename Tag, typename... Args>
Context context_for(const uint8_t* ptr, Args&&... args) const noexcept {
return create(ptr, std::index_sequence_for<State...>());
auto context_for(const uint8_t* ptr, Args&&... args) const noexcept {
if constexpr (std::is_same_v<Tag, basic_object::tags::back_pointer>) {
return no_context_t();
} else {
return create(ptr, std::index_sequence_for<State...>());
}
}
};
@@ -92,7 +96,7 @@ public:
imr::member<tags::back_pointer, imr::tagged_type<tags::back_pointer, imr::pod<basic_object*>>>,
imr::member<tags::object, Structure>
>;
static constexpr size_t size_overhead = sizeof(basic_object*);
private:
explicit object(uint8_t* ptr) noexcept
: basic_object(ptr)
@@ -158,13 +162,22 @@ public:
}
/// Create an IMR objects
template<typename Writer>
template<typename Writer, typename MigrateFn>
GCC6_CONCEPT(requires WriterAllocator<Writer, Structure>)
static object make(Writer&& object_writer,
allocation_strategy::migrate_fn migrate = &imr::alloc::default_lsa_migrate_fn<structure>::migrate_fn) {
MigrateFn* migrate = &imr::alloc::default_lsa_migrate_fn<structure>::migrate_fn) {
static_assert(std::is_same_v<typename MigrateFn::structure, structure>);
return do_make(std::forward<Writer>(object_writer), migrate);
}
private:
template<typename Writer>
GCC6_CONCEPT(requires WriterAllocator<Writer, Structure>)
static object do_make(Writer&& object_writer, allocation_strategy::migrate_fn migrate) {
struct alloc_deleter {
size_t _size;
void operator()(uint8_t* ptr) {
current_allocator().free(ptr);
current_allocator().free(ptr, _size);
}
};
using alloc_unique_ptr = std::unique_ptr<uint8_t[], alloc_deleter>;
@@ -176,7 +189,7 @@ public:
auto& alloc = current_allocator();
alloc::object_allocator allocator(alloc);
auto obj_size = structure::size_when_serialized(writer, allocator.get_sizer());
auto ptr = alloc_unique_ptr(static_cast<uint8_t*>(alloc.alloc(migrate, obj_size, 1)));
auto ptr = alloc_unique_ptr(static_cast<uint8_t*>(alloc.alloc(migrate, obj_size, 1)), alloc_deleter { obj_size });
allocator.allocate_all();
structure::serialize(ptr.get(), writer, allocator.get_serializer());
return object(ptr.release());

json.cc (new file, 80 lines)

@@ -0,0 +1,80 @@
/*
* Copyright (C) 2018 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "json.hh"
namespace seastar {
namespace json {
static inline bool is_control_char(char c) {
return c >= 0 && c <= 0x1F;
}
static inline bool needs_escaping(const sstring& s) {
return std::any_of(s.begin(), s.end(), [](char c) {return is_control_char(c) || c == '"' || c == '\\';});
}
sstring value_to_quoted_string(const sstring& value) {
if (!needs_escaping(value)) {
return sprint("\"%s\"", value);
}
std::ostringstream oss;
oss << std::hex << std::uppercase << std::setfill('0');
oss.put('"');
for (char c : value) {
switch (c) {
case '"':
oss.put('\\').put('"');
break;
case '\\':
oss.put('\\').put('\\');
break;
case '\b':
oss.put('\\').put('b');
break;
case '\f':
oss.put('\\').put('f');
break;
case '\n':
oss.put('\\').put('n');
break;
case '\r':
oss.put('\\').put('r');
break;
case '\t':
oss.put('\\').put('t');
break;
default:
if (is_control_char(c)) {
oss.put('\\').put('u') << std::setw(4) << static_cast<int>(c);
} else {
oss.put(c);
}
break;
}
}
oss.put('"');
return oss.str();
}
}
}


@@ -95,6 +95,8 @@ inline std::map<sstring, sstring> to_map(const sstring& raw) {
return to_map(raw, std::map<sstring, sstring>());
}
sstring value_to_quoted_string(const sstring& value);
}
}


@@ -748,6 +748,10 @@ public:
static const compound& get_compound_type(const schema& s) {
return s.clustering_key_prefix_type();
}
static clustering_key_prefix_view make_empty() {
return { bytes_view() };
}
};
class clustering_key_prefix : public prefix_compound_wrapper<clustering_key_prefix, clustering_key_prefix_view, clustering_key> {


@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
const dht::token& tok,
dht::token_range_vector& ret) {
if (prev_tok < tok) {
ret.emplace_back(
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true));
auto pos = ret.end();
if (!ret.empty() && !std::prev(pos)->end()) {
// We inserted a wrapped range (a, b] previously as
// (-inf, b], (a, +inf). So now we insert in the next-to-last
// position to keep the last range (a, +inf) at the end.
pos = std::prev(pos);
}
ret.insert(pos,
dht::token_range{
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true)});
} else {
ret.emplace_back(
dht::token_range::bound(prev_tok, false),
@@ -164,6 +172,30 @@ abstract_replication_strategy::get_primary_ranges(inet_address ep) {
return ret;
}
dht::token_range_vector
abstract_replication_strategy::get_primary_ranges_within_dc(inet_address ep) {
dht::token_range_vector ret;
sstring local_dc = _snitch->get_datacenter(ep);
std::unordered_set<inet_address> local_dc_nodes = _token_metadata.get_topology().get_datacenter_endpoints().at(local_dc);
auto prev_tok = _token_metadata.sorted_tokens().back();
for (auto tok : _token_metadata.sorted_tokens()) {
auto&& eps = calculate_natural_endpoints(tok, _token_metadata);
// Unlike get_primary_ranges() which checks if ep is the first
// owner of this range, here we check if ep is the first just
// among nodes which belong to the local dc of ep.
for (auto& e : eps) {
if (local_dc_nodes.count(e)) {
if (e == ep) {
insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
}
break;
}
}
prev_tok = tok;
}
return ret;
}
std::unordered_multimap<inet_address, dht::token_range>
abstract_replication_strategy::get_address_ranges(token_metadata& tm) const {
std::unordered_multimap<inet_address, dht::token_range> ret;


@@ -113,6 +113,10 @@ public:
// This function is the analogue of Origin's
// StorageService.getPrimaryRangesForEndpoint().
dht::token_range_vector get_primary_ranges(inet_address ep);
// get_primary_ranges_within_dc() is similar to get_primary_ranges()
// except it assigns a primary node for each range within each dc,
// instead of one node globally.
dht::token_range_vector get_primary_ranges_within_dc(inet_address ep);
std::unordered_multimap<inet_address, dht::token_range> get_address_ranges(token_metadata& tm) const;


@@ -262,11 +262,12 @@ void messaging_service::start_listen() {
// FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
// local or remote datacenter, and whether or not the connection will be used for gossip. We can fix
// the first by wrapping its server_socket, but not the second.
auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
if (!_server[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto addr = ipv4_addr{a.raw_addr(), _port};
return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
so, addr, rpc_resource_limits(_mcfg.rpc_memory_limit)));
so, addr, limits));
};
_server[0] = listen(_listen_address);
if (listen_to_bc) {
@@ -277,7 +278,7 @@ void messaging_service::start_listen() {
if (!_server_tls[0]) {
auto listen = [&] (const gms::inet_address& a) {
return std::unique_ptr<rpc_protocol_server_wrapper>(
[this, &so, &a] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
[this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
if (_encrypt_what == encrypt_what::none) {
return nullptr;
}
@@ -285,7 +286,7 @@ void messaging_service::start_listen() {
lo.reuse_address = true;
auto addr = make_ipv4_address(ipv4_addr{a.raw_addr(), _ssl_port});
return std::make_unique<rpc_protocol_server_wrapper>(*_rpc,
so, seastar::tls::listen(_credentials, addr, lo));
so, seastar::tls::listen(_credentials, addr, lo), limits);
}());
};
_server_tls[0] = listen(_listen_address);


@@ -125,7 +125,7 @@ public:
return _ck.equal(s, other._ck)
&& _t == other._t
&& _marker == other._marker
&& _cells.equal(column_kind::static_column, s, other._cells, s);
&& _cells.equal(column_kind::regular_column, s, other._cells, s);
}
friend std::ostream& operator<<(std::ostream& os, const clustering_row& row);


@@ -1095,7 +1095,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
if (_type == storage_type::vector && id < max_vector_size) {
if (id >= _storage.vector.v.size()) {
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(cell_and_hash{std::move(value), std::move(hash)});
_storage.vector.v.emplace_back(std::move(value), std::move(hash));
_storage.vector.present.set(id);
_size++;
} else if (auto& cell_and_hash = _storage.vector.v[id]; !bool(cell_and_hash.cell)) {
@@ -1162,7 +1162,7 @@ row::find_cell(column_id id) const {
size_t row::external_memory_usage(const schema& s, column_kind kind) const {
size_t mem = 0;
if (_type == storage_type::vector) {
mem += _storage.vector.v.external_memory_usage();
mem += _storage.vector.v.used_space_external_memory_usage();
column_id id = 0;
for (auto&& c_a_h : _storage.vector.v) {
auto& cdef = s.column_at(kind, id++);
@@ -1396,12 +1396,17 @@ row::row(const schema& s, column_kind kind, const row& o)
if (_type == storage_type::vector) {
auto& other_vec = o._storage.vector;
auto& vec = *new (&_storage.vector) vector_storage;
vec.present = other_vec.present;
vec.v.reserve(other_vec.v.size());
column_id id = 0;
for (auto& cell : other_vec.v) {
auto& cdef = s.column_at(kind, id++);
vec.v.emplace_back(cell_and_hash { cell.cell.copy(*cdef.type), cell.hash });
try {
vec.present = other_vec.present;
vec.v.reserve(other_vec.v.size());
column_id id = 0;
for (auto& cell : other_vec.v) {
auto& cdef = s.column_at(kind, id++);
vec.v.emplace_back(cell_and_hash{cell.cell.copy(*cdef.type), cell.hash});
}
} catch (...) {
_storage.vector.~vector_storage();
throw;
}
} else {
auto cloner = [&] (const auto& x) {
@@ -1812,9 +1817,10 @@ void mutation_querier::query_static_row(const row& r, tombstone current_tombston
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__static_row__cells<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
get_compacted_row_slice(_schema, slice, column_kind::static_column,
r, slice.static_columns, _static_cells_wr);
_memory_accounter.update(stream.size());
r, slice.static_columns, out);
_memory_accounter.update(stream.size() - start);
}
if (_pw.requested_digest()) {
max_timestamp max_ts{_pw.last_modified()};
@@ -1875,8 +1881,9 @@ stop_iteration mutation_querier::consume(clustering_row&& cr, row_tombstone curr
} else if (_short_reads_allowed) {
seastar::measuring_output_stream stream;
ser::qr_partition__rows<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
write_row(out);
stop = _memory_accounter.update_and_check(stream.size());
stop = _memory_accounter.update_and_check(stream.size() - start);
}
_live_clustering_rows++;


@@ -74,6 +74,15 @@ using cell_hash_opt = seastar::optimized_optional<cell_hash>;
struct cell_and_hash {
atomic_cell_or_collection cell;
mutable cell_hash_opt hash;
cell_and_hash() = default;
cell_and_hash(cell_and_hash&&) noexcept = default;
cell_and_hash& operator=(cell_and_hash&&) noexcept = default;
cell_and_hash(atomic_cell_or_collection&& cell, cell_hash_opt hash)
: cell(std::move(cell))
, hash(hash)
{ }
};
//


@@ -457,7 +457,10 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
pe.upgrade(pe_schema.shared_from_this(), s.shared_from_this(), pe_cleaner, no_cache_tracker);
}
bool can_move = !pe._snapshot;
// When preemptible, later memtable reads could start using the snapshot before
// snapshot's writes are made visible in cache, which would cause them to miss those writes.
// So we cannot allow erasing when preemptible.
bool can_move = !preemptible && !pe._snapshot;
auto src_snp = pe.read(reg, pe_cleaner, s.shared_from_this(), no_cache_tracker);
lw_shared_ptr<partition_snapshot> prev_snp;


@@ -273,6 +273,11 @@ public:
return is_partition_end() || (_ck && _ck->is_empty(s) && _bound_weight > 0);
}
bool is_before_all_clustered_rows(const schema& s) const {
return _type < partition_region::clustered
|| (_type == partition_region::clustered && _ck->is_empty(s) && _bound_weight < 0);
}
template<typename Hasher>
void feed_hash(Hasher& hasher, const schema& s) const {
::feed_hash(hasher, _bound_weight);


@@ -1004,6 +1004,22 @@ static dht::token_range_vector get_primary_ranges(
utils::fb_utilities::get_broadcast_address());
}
// get_primary_ranges_within_dc() is similar to get_primary_ranges(),
// but instead of each range being assigned just one primary owner
// across the entire cluster, here each range is assigned a primary
// owner within each of the datacenters.
static dht::token_range_vector get_primary_ranges_within_dc(
database& db, sstring keyspace) {
auto& rs = db.find_keyspace(keyspace).get_replication_strategy();
return rs.get_primary_ranges_within_dc(
utils::fb_utilities::get_broadcast_address());
}
static sstring get_local_dc() {
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(
utils::fb_utilities::get_broadcast_address());
}
struct repair_options {
// If primary_range is true, we should perform repair only on this node's
@@ -1256,21 +1272,14 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
rlogger.info("primary-range repair");
// when "primary_range" option is on, neither data_centers nor hosts
// may be set, except data_centers may contain only local DC (-local)
#if 0
if (options.data_centers.size() == 1 &&
options.data_centers[0] == DatabaseDescriptor.getLocalDataCenter()) {
options.data_centers[0] == get_local_dc()) {
ranges = get_primary_ranges_within_dc(db.local(), keyspace);
} else
#endif
#if 0
if (options.data_centers.size() > 0 || options.hosts.size() > 0) {
} else if (options.data_centers.size() > 0 || options.hosts.size() > 0) {
throw std::runtime_error("You need to run primary range repair on all nodes in the cluster.");
} else {
#endif
ranges = get_primary_ranges(db.local(), keyspace);
#if 0
}
#endif
} else {
ranges = get_local_ranges(db.local(), keyspace);
}


@@ -1,109 +0,0 @@
#!/bin/bash -e
#
# Copyright (C) 2015 ScyllaDB
if [ "`id -u`" -ne 0 ]; then
echo "Requires root permission."
exit 1
fi
print_usage() {
echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
echo " --local-pkg install locally built .rpm/.deb on specified directory"
echo " --repo repository for both install and update, specify .repo/.list file URL"
echo " --repo-for-install repository for install, specify .repo/.list file URL"
echo " --repo-for-update repository for update, specify .repo/.list file URL"
exit 1
}
LOCAL_PKG=
UNSTABLE=0
REPO_FOR_INSTALL=
REPO_FOR_UPDATE=
while [ $# -gt 0 ]; do
case "$1" in
"--local-pkg")
LOCAL_PKG=$2
shift 2
;;
"--repo")
REPO_FOR_INSTALL=$2
REPO_FOR_UPDATE=$2
shift 2
;;
"--repo-for-install")
REPO_FOR_INSTALL=$2
shift 2
;;
"--repo-for-update")
REPO_FOR_UPDATE=$2
shift 2
;;
*)
print_usage
shift 1
;;
esac
done
. /etc/os-release
if [ -f /etc/debian_version ]; then
echo "#!/bin/sh" >> /usr/sbin/policy-rc.d
echo "exit 101" >> /usr/sbin/policy-rc.d
chmod +x /usr/sbin/policy-rc.d
cp /etc/hosts /etc/hosts.orig
echo 127.0.0.1 `hostname` >> /etc/hosts
if [ "$REPO_FOR_INSTALL" != "" ]; then
curl -L -o /etc/apt/sources.list.d/scylla_install.list $REPO_FOR_INSTALL
fi
apt-get -o Acquire::AllowInsecureRepositories=true \
-o Acquire::AllowDowngradeToInsecureRepositories=true update
if [ "$LOCAL_PKG" = "" ]; then
apt-get install -o APT::Get::AllowUnauthenticated=true \
-y --force-yes scylla
else
if [ ! -f /usr/bin/gdebi ]; then
apt-get install -y --force-yes gdebi-core
fi
echo Y | gdebi $LOCAL_PKG/scylla-kernel-conf*.deb
echo Y | gdebi $LOCAL_PKG/scylla-conf*.deb
echo Y | gdebi $LOCAL_PKG/scylla-server_*.deb
echo Y | gdebi $LOCAL_PKG/scylla-server-dbg*.deb
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
echo Y | gdebi $LOCAL_PKG/scylla_*.deb
fi
mv /etc/hosts.orig /etc/hosts
rm /usr/sbin/policy-rc.d
rm /etc/apt/sources.list.d/scylla_install.list
if [ "$REPO_FOR_UPDATE" != "" ]; then
curl -L -o /etc/apt/sources.list.d/scylla.list $REPO_FOR_UPDATE
fi
apt-get -o Acquire::AllowInsecureRepositories=true \
-o Acquire::AllowDowngradeToInsecureRepositories=true update
else
if [ "$REPO_FOR_INSTALL" != "" ]; then
curl -L -o /etc/yum.repos.d/scylla_install.repo $REPO_FOR_INSTALL
fi
if [ "$ID" = "centos" ]; then
yum install -y epel-release
elif [ "$ID" = "rhel" ]; then
rpm -ivh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-7.noarch.rpm
else
echo "Unsupported distribution"
exit 1
fi
if [ "$LOCAL_PKG" = "" ]; then
yum install -y scylla
else
yum install -y $LOCAL_PKG/scylla*.*.rpm
fi
rm /etc/yum.repos.d/scylla_install.repo
if [ "$REPO_FOR_UPDATE" != "" ]; then
curl -L -o /etc/yum.repos.d/scylla.repo $REPO_FOR_UPDATE
fi
fi

Submodule seastar updated: d7f35d7663...10ac122dd6


@@ -54,7 +54,7 @@ static logging::logger mlogger("migration_task");
future<> migration_task::run_may_throw(distributed<service::storage_proxy>& proxy, const gms::inet_address& endpoint)
{
if (!gms::get_failure_detector().local().is_alive(endpoint)) {
mlogger.error("Can't send migration request: node {} is down.", endpoint);
mlogger.warn("Can't send migration request: node {} is down.", endpoint);
return make_ready_future<>();
}
netw::messaging_service::msg_addr id{endpoint, 0};


@@ -144,7 +144,11 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
return _db.invoke_on_all([this, rates = std::move(rates), cpuid = engine().cpu_id()] (database& db) {
sstring gstate;
for (auto& cf : db.get_column_families() | boost::adaptors::filtered(non_system_filter)) {
stat s = rates.at(cf.first);
auto it = rates.find(cf.first);
if (it == rates.end()) { // a table may be added before map/reduce completes and this code runs
continue;
}
stat s = it->second;
float rate = 0;
if (s.h) {
rate = s.h / (s.h + s.m);


@@ -85,7 +85,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
_query_read_repair_decision = state->get_query_read_repair_decision();
} else {
// Reusing readers is currently only supported for singular queries.
if (_ranges.front().is_singular()) {
if (!_ranges.empty() && query::is_single_partition(_ranges.front())) {
_cmd->query_uuid = utils::make_random_uuid();
}
_cmd->is_first_page = true;


@@ -211,7 +211,7 @@ protected:
protected:
virtual bool waited_for(gms::inet_address from) = 0;
virtual void signal(gms::inet_address from) {
void signal(gms::inet_address from) {
if (waited_for(from)) {
signal();
}
@@ -221,7 +221,7 @@ public:
abstract_write_response_handler(shared_ptr<storage_proxy> p, keyspace& ks, db::consistency_level cl, db::write_type type,
std::unique_ptr<mutation_holder> mh, std::unordered_set<gms::inet_address> targets, tracing::trace_state_ptr trace_state,
storage_proxy::write_stats& stats, size_t pending_endpoints = 0, std::vector<gms::inet_address> dead_endpoints = {})
: _id(p->_next_response_id++), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)),
: _id(p->get_next_response_id()), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)),
_dead_endpoints(std::move(dead_endpoints)), _stats(stats) {
// original comment from cassandra:
// during bootstrap, include pending endpoints in the count
@@ -285,10 +285,13 @@ public:
}
// return true on last ack
bool response(gms::inet_address from) {
signal(from);
auto it = _targets.find(from);
assert(it != _targets.end());
_targets.erase(it);
if (it != _targets.end()) {
signal(from);
_targets.erase(it);
} else {
slogger.warn("Received outdated write ack from {}", from);
}
return _targets.size() == 0;
}
future<> wait() {
@@ -632,9 +635,12 @@ void storage_proxy_stats::split_stats::register_metrics_for(gms::inet_address ep
}
}
using namespace std::literals::chrono_literals;
storage_proxy::~storage_proxy() {}
storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cfg)
: _db(db)
, _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
, _hints_resource_manager(cfg.available_memory / 10)
, _hints_for_views_manager(_db.local().get_config().data_file_directories()[0] + "/view_pending_updates", {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
, _background_write_throttle_threahsold(cfg.available_memory / 10) {
@@ -1974,18 +1980,21 @@ public:
_timeout.arm(timeout);
}
virtual ~abstract_read_resolver() {};
virtual void on_error(gms::inet_address ep) = 0;
virtual void on_error(gms::inet_address ep, bool disconnect) = 0;
future<> done() {
return _done_promise.get_future();
}
void error(gms::inet_address ep, std::exception_ptr eptr) {
sstring why;
bool disconnect = false;
try {
std::rethrow_exception(eptr);
} catch (rpc::closed_error&) {
return; // do not report connection closed exception, gossiper does that
// do not report connection closed exception, gossiper does that
disconnect = true;
} catch (rpc::timeout_error&) {
return; // do not report timeouts, the whole operation will timeout and be reported
// do not report timeouts, the whole operation will timeout and be reported
return; // also do not report timeout as replica failure for the same reason
} catch(std::exception& e) {
why = e.what();
} catch(...) {
@@ -1993,10 +2002,12 @@ public:
}
if (!_request_failed) { // request may fail only once.
on_error(ep);
on_error(ep, disconnect);
}
slogger.error("Exception when communicating with {}: {}", ep, why);
if (why.length()) {
slogger.error("Exception when communicating with {}: {}", ep, why);
}
}
};
@@ -2071,10 +2082,16 @@ public:
_done_promise.set_value();
}
}
void on_error(gms::inet_address ep) override {
void on_error(gms::inet_address ep, bool disconnect) override {
if (waiting_for(ep)) {
_failed++;
}
if (disconnect && _block_for == _target_count_for_cl) {
// if the error is caused by a disconnect and there are no targets left to speculate on,
// wait for the timeout in the hope that the client will issue a speculative read
// FIXME: resolver should have access to all replicas and try another one in this case
return;
}
if (_block_for + _failed > _target_count_for_cl) {
fail_request(std::make_exception_ptr(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, _cl_responses, _failed, _block_for, _data_result)));
}
@@ -2400,7 +2417,7 @@ public:
}
}
}
void on_error(gms::inet_address ep) override {
void on_error(gms::inet_address ep, bool disconnect) override {
fail_request(std::make_exception_ptr(read_failure_exception(_schema->ks_name(), _schema->cf_name(), _cl, response_count(), 1, _targets_count, response_count() != 0)));
}
uint32_t max_live_count() const {
@@ -3323,9 +3340,22 @@ storage_proxy::query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
slogger.debug("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
result_rows_per_range, cmd->row_limit, ranges.size(), concurrency_factor);
// The call to `query_partition_key_range_concurrent()` below
// updates `cmd` directly when processing the results. Under
// some circumstances, when the query executes without deferring,
// this updating will happen before the lambda object is constructed
// and hence the updates will be visible to the lambda. This will
// result in the merger below trimming the results according to the
// updated (decremented) limits and causing the paging logic to
// declare the query exhausted due to the non-full page. To avoid
// this, save the original values of the limits here and pass these
// to the lambda below.
const auto row_limit = cmd->row_limit;
const auto partition_limit = cmd->partition_limit;
return query_partition_key_range_concurrent(query_options.timeout(*this), std::move(results), cmd, cl, ranges.begin(), std::move(ranges),
concurrency_factor, std::move(query_options.trace_state), cmd->row_limit, cmd->partition_limit)
.then([row_limit = cmd->row_limit, partition_limit = cmd->partition_limit](std::vector<foreign_ptr<lw_shared_ptr<query::result>>> results) {
.then([row_limit, partition_limit](std::vector<foreign_ptr<lw_shared_ptr<query::result>>> results) {
query::result_merger merger(row_limit, partition_limit);
merger.reserve(results.size());
@@ -3682,6 +3712,7 @@ future<> storage_proxy::truncate_blocking(sstring keyspace, sstring cfname) {
std::rethrow_exception(ep);
} catch (rpc::timeout_error& e) {
slogger.trace("Truncation of {} timed out: {}", cfname, e.what());
throw;
} catch (...) {
throw;
}


@@ -143,7 +143,7 @@ public:
};
private:
distributed<database>& _db;
response_id_type _next_response_id = 1; // 0 is reserved for unique_response_handler
response_id_type _next_response_id;
std::unordered_map<response_id_type, rh_entry> _response_handlers;
// This buffer hold ids of throttled writes in case resource consumption goes
// below the threshold and we want to unthrottle some of them. Without this throttled
@@ -263,6 +263,13 @@ public:
return _db;
}
response_id_type get_next_response_id() {
auto next = _next_response_id++;
if (next == 0) { // 0 is reserved for unique_response_handler
next = _next_response_id++;
}
return next;
}
void init_messaging_service();
// Applies mutation on this node.


@@ -303,7 +303,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
gossiper.check_knows_remote_features(local_features, peer_features);
}
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
for (auto ep : loaded_endpoints) {
gossiper.add_saved_endpoint(ep);
}
@@ -317,7 +317,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
slogger.info("Checking remote features with gossip");
gossiper.do_shadow_round().get();
gossiper.check_knows_remote_features(local_features);
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
for (auto ep : loaded_endpoints) {
gossiper.add_saved_endpoint(ep);
}
@@ -419,13 +419,9 @@ void storage_service::join_token_ring(int delay) {
db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::IN_PROGRESS).get();
}
set_mode(mode::JOINING, "waiting for ring information", true);
// first sleep the delay to make sure we see all our peers
for (int i = 0; i < delay; i += 1000) {
// if we see schema, we can proceed to the next check directly
if (_db.local().get_version() != database::empty_version) {
slogger.debug("got schema: {}", _db.local().get_version());
break;
}
auto& gossiper = gms::get_gossiper().local();
// first sleep the delay to make sure we see *at least* one other node
for (int i = 0; i < delay && gossiper.get_live_members().size() < 2; i += 1000) {
sleep(std::chrono::seconds(1)).get();
}
// if our schema hasn't matched yet, keep sleeping until it does
@@ -484,7 +480,6 @@ void storage_service::join_token_ring(int delay) {
for (auto token : _bootstrap_tokens) {
auto existing = _token_metadata.get_endpoint(token);
if (existing) {
auto& gossiper = gms::get_local_gossiper();
auto* eps = gossiper.get_endpoint_state_for_endpoint_ptr(*existing);
if (eps && eps->get_update_timestamp() > gms::gossiper::clk::now() - std::chrono::milliseconds(delay)) {
throw std::runtime_error("Cannot replace a live node...");
@@ -622,6 +617,8 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
db::system_keyspace::update_tokens(tokens).get();
auto& gossiper = gms::get_local_gossiper();
if (!db().local().is_replacing()) {
// Wait until we know tokens of existing node before announcing join status.
gossiper.wait_for_range_setup().get();
// if not an existing token then bootstrap
gossiper.add_local_application_state({
{ gms::application_state::TOKENS, value_factory.tokens(tokens) },
@@ -1541,7 +1538,7 @@ future<> storage_service::check_for_endpoint_collision() {
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while consistent_rangemovement is true (check_for_endpoint_collision)");
} else {
gossiper.goto_shadow_round();
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
found_bootstrapping_node = true;
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
slogger.info("Checking bootstrapping/leaving/moving nodes: node={}, status={}, sleep 1 second and check again ({} seconds elapsed) (check_for_endpoint_collision)", addr, state, elapsed);
@@ -1553,7 +1550,7 @@ future<> storage_service::check_for_endpoint_collision() {
}
} while (found_bootstrapping_node);
slogger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
});
}
@@ -1603,8 +1600,9 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
auto tokens = get_tokens_for(replace_address);
// use the replacee's host Id as our own so we receive hints, etc
return db::system_keyspace::set_local_host_id(host_id).discard_result().then([replace_address, tokens = std::move(tokens)] {
gms::get_local_gossiper().reset_endpoint_state_map(); // clean up since we have what we need
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
return gms::get_local_gossiper().reset_endpoint_state_map().then([tokens = std::move(tokens)] { // clean up since we have what we need
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
});
});
});
}
@@ -2643,14 +2641,20 @@ future<> storage_service::send_replication_notification(inet_address remote) {
// notify the remote token
auto done = make_shared<bool>(false);
auto local = get_broadcast_address();
auto sent = make_lw_shared<int>(0);
slogger.debug("Notifying {} of replication completion", remote);
return do_until(
[done, remote] {
return *done || !gms::get_local_failure_detector().is_alive(remote);
[done, sent, remote] {
// The node can send REPLICATION_FINISHED to itself, in which case
// is_alive will be true. If the messaging_service is stopped,
// REPLICATION_FINISHED can be sent infinitely here. To fix, limit
// the number of retries.
return *done || !gms::get_local_failure_detector().is_alive(remote) || *sent >= 3;
},
[done, remote, local] {
[done, sent, remote, local] {
auto& ms = netw::get_local_messaging_service();
netw::msg_addr id{remote, 0};
(*sent)++;
return ms.send_replication_finished(id, local).then_wrapped([id, done] (auto&& f) {
try {
f.get();


@@ -33,6 +33,7 @@
#include "unimplemented.hh"
#include "stdx.hh"
#include "segmented_compress_params.hh"
#include "utils/class_registrator.hh"
namespace sstables {
@@ -299,7 +300,8 @@ size_t local_compression::compress_max_size(size_t input_len) const {
void compression::set_compressor(compressor_ptr c) {
if (c) {
auto& cn = c->name();
unqualified_name uqn(compressor::namespace_prefix, c->name());
const sstring& cn = uqn;
name.value = bytes(cn.begin(), cn.end());
for (auto& p : c->options()) {
if (p.first != compression_parameters::SSTABLE_COMPRESSION) {


@@ -53,7 +53,8 @@ atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
throw marshal_exception("encountered a local shard in a counter cell");
}
}
auto shard_count = value.size() / shard_size;
auto header_length = (size_t(header_size) + 1) * sizeof(int16_t);
auto shard_count = (value.size() - header_length) / shard_size;
if (shard_count != size_t(header_size)) {
throw marshal_exception("encountered remote shards in a counter cell");
}


@@ -787,6 +787,11 @@ future<> parse(sstable_version_types v, random_access_reader& in, utils::estimat
if (length == 0) {
throw malformed_sstable_exception("Estimated histogram with zero size found. Can't continue!");
}
// Arrays are potentially pre-initialized by the estimated_histogram constructor.
eh.bucket_offsets.clear();
eh.buckets.clear();
eh.bucket_offsets.reserve(length - 1);
eh.buckets.reserve(length);


@@ -235,7 +235,7 @@ if __name__ == "__main__":
mode = 'debug'
xmlout = (args.jenkins + "." + mode + "." +
os.path.basename(path.split()[0]) + ".boost.xml")
boost_args += ['--report_level=no', '--logger=XML,test_suite,' + xmlout]
boost_args += ['--report_level=no', '--logger=HRF,test_suite:XML,test_suite,' + xmlout]
if type == 'boost':
boost_args += ['--']
def report_error(out, report_subcause):


@@ -215,3 +215,22 @@ SEASTAR_TEST_CASE(test_aggregate_count) {
}
});
}
SEASTAR_TEST_CASE(test_reverse_type_aggregation) {
return do_with_cql_env_thread([&] (auto& e) {
e.execute_cql("CREATE TABLE test(p int, c timestamp, v int, primary key (p, c)) with clustering order by (c desc)").get();
e.execute_cql("INSERT INTO test(p, c, v) VALUES (1, 1, 1)").get();
e.execute_cql("INSERT INTO test(p, c, v) VALUES (1, 2, 1)").get();
{
auto tp = db_clock::from_time_t({ 0 }) + std::chrono::milliseconds(1);
auto msg = e.execute_cql("SELECT min(c) FROM test").get0();
assert_that(msg).is_rows().with_size(1).with_row({{timestamp_type->decompose(tp)}});
}
{
auto tp = db_clock::from_time_t({ 0 }) + std::chrono::milliseconds(2);
auto msg = e.execute_cql("SELECT max(c) FROM test").get0();
assert_that(msg).is_rows().with_size(1).with_row({{timestamp_type->decompose(tp)}});
}
});
}


@@ -2076,10 +2076,9 @@ SEASTAR_TEST_CASE(test_in_restriction) {
assert_that(msg).is_rows().with_size(0);
return e.execute_cql("select r1 from tir where p1 in (2, 0, 2, 1);");
}).then([&e] (shared_ptr<cql_transport::messages::result_message> msg) {
assert_that(msg).is_rows().with_rows({
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(4)},
{int32_type->decompose(0)},
{int32_type->decompose(4)},
{int32_type->decompose(1)},
{int32_type->decompose(2)},
{int32_type->decompose(3)},
@@ -2101,6 +2100,58 @@ SEASTAR_TEST_CASE(test_in_restriction) {
{int32_type->decompose(2)},
{int32_type->decompose(1)},
});
return e.prepare("select r1 from tir where p1 in ?");
}).then([&e] (cql3::prepared_cache_key_type prepared_id){
auto my_list_type = list_type_impl::get_instance(int32_type, true);
std::vector<cql3::raw_value> raw_values;
auto in_values_list = my_list_type->decompose(make_list_value(my_list_type,
list_type_impl::native_type{{int(2), int(0), int(2), int(1)}}));
raw_values.emplace_back(cql3::raw_value::make_value(in_values_list));
return e.execute_prepared(prepared_id,raw_values);
}).then([&e] (shared_ptr<cql_transport::messages::result_message> msg) {
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(4)},
{int32_type->decompose(0)},
{int32_type->decompose(1)},
{int32_type->decompose(2)},
{int32_type->decompose(3)},
});
}).then([&e]{
return e.execute_cql("create table tir2 (p1 int, c1 int, r1 int, PRIMARY KEY (p1, c1,r1));").discard_result();
}).then([&e] {
e.require_table_exists("ks", "tir2");
return e.execute_cql("insert into tir2 (p1, c1, r1) values (0, 0, 0);").discard_result();
}).then([&e] {
return e.execute_cql("insert into tir2 (p1, c1, r1) values (1, 0, 1);").discard_result();
}).then([&e] {
return e.execute_cql("insert into tir2 (p1, c1, r1) values (1, 1, 2);").discard_result();
}).then([&e] {
return e.execute_cql("insert into tir2 (p1, c1, r1) values (1, 2, 3);").discard_result();
}).then([&e] {
return e.execute_cql("insert into tir2 (p1, c1, r1) values (2, 3, 4);").discard_result();
}).then([&e]{
return e.execute_cql("select r1 from tir2 where (c1,r1) in ((0, 1),(1,2),(0,1),(1,2),(3,3)) ALLOW FILTERING;");
}).then([&e] (shared_ptr<cql_transport::messages::result_message> msg) {
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(1)},
{int32_type->decompose(2)},
});
return e.prepare("select r1 from tir where p1 in ?");
}).then([&e] (cql3::prepared_cache_key_type prepared_id){
auto my_list_type = list_type_impl::get_instance(int32_type, true);
std::vector<cql3::raw_value> raw_values;
auto in_values_list = my_list_type->decompose(make_list_value(my_list_type,
list_type_impl::native_type{{int(2), int(0), int(2), int(1)}}));
raw_values.emplace_back(cql3::raw_value::make_value(in_values_list));
return e.execute_prepared(prepared_id,raw_values);
}).then([&e] (shared_ptr<cql_transport::messages::result_message> msg) {
assert_that(msg).is_rows().with_rows_ignore_order({
{int32_type->decompose(4)},
{int32_type->decompose(0)},
{int32_type->decompose(1)},
{int32_type->decompose(2)},
{int32_type->decompose(3)},
});
});
});
}
@@ -2613,6 +2664,7 @@ SEASTAR_TEST_CASE(test_select_json_types) {
" r date,"
" s time,"
" u duration,"
" w int,"
");").get();
e.require_table_exists("ks", "all_types").get();
@@ -2640,7 +2692,7 @@ SEASTAR_TEST_CASE(test_select_json_types) {
" 1y2mo3w4d5h6m7s8ms9us10ns"
");").get();
auto msg = e.execute_cql("SELECT JSON a, b, c, d, e, f, \"G\", \"H\", \"I\", j, k, l, m, n, o, p, q, r, s, u, unixtimestampof(k) FROM all_types WHERE a = 'ascii'").get0();
auto msg = e.execute_cql("SELECT JSON a, b, c, d, e, f, \"G\", \"H\", \"I\", j, k, l, m, n, o, p, q, r, s, u, w, unixtimestampof(k) FROM all_types WHERE a = 'ascii'").get0();
assert_that(msg).is_rows().with_rows({
{
utf8_type->decompose(
@@ -2664,6 +2716,7 @@ SEASTAR_TEST_CASE(test_select_json_types) {
"\"r\": \"1970-01-02\", "
"\"s\": 00:00:00.000000001, "
"\"u\": \"1y2mo25d5h6m7s8ms9us10ns\", "
"\"w\": null, "
"\"unixtimestampof(k)\": 1261009589805}"
)
}
@@ -2671,7 +2724,7 @@ SEASTAR_TEST_CASE(test_select_json_types) {
msg = e.execute_cql("SELECT toJson(a), toJson(b), toJson(c), toJson(d), toJson(e), toJson(f),"
"toJson(\"G\"), toJson(\"H\"), toJson(\"I\"), toJson(j), toJson(k), toJson(l), toJson(m), toJson(n),"
"toJson(o), toJson(p), toJson(q), toJson(r), toJson(s), toJson(u),"
"toJson(o), toJson(p), toJson(q), toJson(r), toJson(s), toJson(u), toJson(w),"
"toJson(unixtimestampof(k)), toJson(toJson(toJson(p))) FROM all_types WHERE a = 'ascii'").get0();
assert_that(msg).is_rows().with_rows({
{
@@ -2695,6 +2748,7 @@ SEASTAR_TEST_CASE(test_select_json_types) {
utf8_type->decompose("\"1970-01-02\""),
utf8_type->decompose("00:00:00.000000001"),
utf8_type->decompose("\"1y2mo25d5h6m7s8ms9us10ns\""),
utf8_type->decompose("null"),
utf8_type->decompose("1261009589805"),
utf8_type->decompose("\"\\\"3\\\"\"")
}
@@ -2783,7 +2837,7 @@ SEASTAR_TEST_CASE(test_insert_json_types) {
e.require_table_exists("ks", "all_types").get();
e.execute_cql(
"INSERT INTO all_types (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, u) JSON '"
"INSERT INTO all_types JSON '"
"{\"a\": \"ascii\", "
"\"b\": 123456789, "
"\"c\": \"0xdeadbeef\", "
@@ -2838,13 +2892,41 @@ SEASTAR_TEST_CASE(test_insert_json_types) {
e.execute_cql("UPDATE all_types SET b = fromJson('42') WHERE a = fromJson('\"ascii\"');").get();
e.execute_cql("UPDATE all_types SET \"I\" = fromJson('\"zażółć gęślą jaźń\"') WHERE a = fromJson('\"ascii\"');").get();
e.execute_cql("UPDATE all_types SET n = fromJson('\"2147483648\"') WHERE a = fromJson('\"ascii\"');").get();
e.execute_cql("UPDATE all_types SET o = fromJson('\"3.45\"') WHERE a = fromJson('\"ascii\"');").get();
msg = e.execute_cql("SELECT a, b, \"I\" FROM all_types WHERE a = 'ascii'").get0();
msg = e.execute_cql("SELECT a, b, \"I\", n, o FROM all_types WHERE a = 'ascii'").get0();
assert_that(msg).is_rows().with_rows({
{
ascii_type->decompose(sstring("ascii")),
long_type->decompose(42l),
utf8_type->decompose(sstring("zażółć gęślą jaźń")),
varint_type->decompose(boost::multiprecision::cpp_int(2147483648)),
decimal_type->decompose(big_decimal { 2, boost::multiprecision::cpp_int(345) }),
}
});
e.execute_cql("CREATE TABLE multi_column_pk_table (p1 int, p2 int, p3 int, c1 int, c2 int, v int, PRIMARY KEY((p1, p2, p3), c1, c2));").get();
e.require_table_exists("ks", "multi_column_pk_table").get();
e.execute_cql("INSERT INTO multi_column_pk_table JSON '"
"{\"p1\": 1, "
"\"p2\": 2, "
"\"p3\": 3, "
"\"c1\": 4, "
"\"c2\": 5, "
"\"v\": 6 "
"}'").get();
msg = e.execute_cql("SELECT * FROM multi_column_pk_table").get0();
assert_that(msg).is_rows().with_rows({
{
int32_type->decompose(1),
int32_type->decompose(2),
int32_type->decompose(3),
int32_type->decompose(4),
int32_type->decompose(5),
int32_type->decompose(6)
}
});
});
@@ -2863,7 +2945,7 @@ SEASTAR_TEST_CASE(test_insert_json_collections) {
e.require_table_exists("ks", "collections").get();
e.execute_cql(
"INSERT INTO collections (a, b, c, d) JSON '"
"INSERT INTO collections JSON '"
"{\"a\": \"key\", "
"\"b\": {\"1\": \"abc\", \"2\": \"!\", \"3\": \"de\"}, "
"\"c\": [0, 1.125, 2.25, 4.5], "
@@ -2898,10 +2980,10 @@ SEASTAR_TEST_CASE(test_prepared_json) {
cql3::prepared_cache_key_type prepared_id = e.prepare(
"begin batch \n"
" insert into json_data (k, v) json :named_bound0; \n"
" insert into json_data (k, v) json ?; \n"
" insert into json_data (k, v) json :named_bound1; \n"
" insert into json_data (k, v) json ?; \n"
" insert into json_data json :named_bound0; \n"
" insert into json_data json ?; \n"
" insert into json_data json :named_bound1; \n"
" insert into json_data json ?; \n"
"apply batch;").get0();
std::vector<cql3::raw_value> raw_values;
@@ -2993,3 +3075,81 @@ SEASTAR_TEST_CASE(test_time_conversions) {
});
}
// Corner-case test that checks for the paging code's preparedness for an empty
// range list.
SEASTAR_TEST_CASE(test_empty_partition_range_scan) {
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("create keyspace empty_partition_range_scan with replication = {'class': 'SimpleStrategy', 'replication_factor': 1};").get();
e.execute_cql("create table empty_partition_range_scan.tb (a int, b int, c int, val int, PRIMARY KEY ((a,b),c) );").get();
auto qo = std::make_unique<cql3::query_options>(db::consistency_level::LOCAL_ONE, infinite_timeout_config, std::vector<cql3::raw_value>{},
cql3::query_options::specific_options{1, nullptr, {}, api::new_timestamp()});
auto res = e.execute_cql("select * from empty_partition_range_scan.tb where token (a,b) > 1 and token(a,b) <= 1;", std::move(qo)).get0();
assert_that(res).is_rows().is_empty();
});
}
SEASTAR_TEST_CASE(test_static_multi_cell_static_lists_with_ckey) {
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE t (p int, c int, slist list<int> static, v int, PRIMARY KEY (p, c));").get();
e.execute_cql("INSERT INTO t (p, c, slist, v) VALUES (1, 1, [1], 1); ").get();
{
e.execute_cql("UPDATE t SET slist[0] = 3, v = 3 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({{3}}))) },
{ int32_type->decompose(3) }
});
}
{
e.execute_cql("UPDATE t SET slist = [4], v = 4 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({{4}}))) },
{ int32_type->decompose(4) }
});
}
{
e.execute_cql("UPDATE t SET slist = [3] + slist , v = 5 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4}))) },
{ int32_type->decompose(5) }
});
}
{
e.execute_cql("UPDATE t SET slist = slist + [5] , v = 6 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4, 5}))) },
{ int32_type->decompose(6) }
});
}
{
e.execute_cql("DELETE slist[2] from t WHERE p = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3, 4}))) },
{ int32_type->decompose(6) }
});
}
{
e.execute_cql("UPDATE t SET slist = slist - [4] , v = 7 WHERE p = 1 AND c = 1;").get();
auto msg = e.execute_cql("SELECT slist, v FROM t WHERE p = 1 AND c = 1;").get0();
auto slist_type = list_type_impl::get_instance(int32_type, true);
assert_that(msg).is_rows().with_row({
{ slist_type->decompose(make_list_value(slist_type, list_type_impl::native_type({3}))) },
{ int32_type->decompose(7) }
});
}
});
}


@@ -29,6 +29,9 @@
#include "database.hh"
#include "partition_slice_builder.hh"
#include "frozen_mutation.hh"
#include "mutation_source_test.hh"
#include "schema_registry.hh"
#include "service/migration_manager.hh"
SEASTAR_TEST_CASE(test_querying_with_limits) {
return do_with_cql_env([](cql_test_env& e) {
@@ -74,3 +77,33 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
});
});
}
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source) {
do_with_cql_env([] (cql_test_env& e) {
run_mutation_source_tests([&] (schema_ptr s, const std::vector<mutation>& partitions) -> mutation_source {
try {
e.local_db().find_column_family(s->ks_name(), s->cf_name());
service::get_local_migration_manager().announce_column_family_drop(s->ks_name(), s->cf_name(), true).get();
} catch (const no_such_column_family&) {
// expected
}
service::get_local_migration_manager().announce_new_column_family(s, true).get();
column_family& cf = e.local_db().find_column_family(s);
for (auto&& m : partitions) {
e.local_db().apply(cf.schema(), freeze(m)).get();
}
cf.flush().get();
cf.get_row_cache().invalidate([] {}).get();
return mutation_source([&] (schema_ptr s,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding fwd,
mutation_reader::forwarding fwd_mr) {
return cf.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr);
});
});
return make_ready_future<>();
}).get();
}


@@ -239,6 +239,17 @@ public:
return *this;
}
flat_reader_assertions& produces(const schema& s, const mutation_fragment& mf) {
auto mfopt = read_next();
if (!mfopt) {
BOOST_FAIL(sprint("Expected %s, but got end of stream", mf));
}
if (!mfopt->equal(s, mf)) {
BOOST_FAIL(sprint("Expected %s, but got %s", mf, *mfopt));
}
return *this;
}
flat_reader_assertions& produces_end_of_stream() {
BOOST_TEST_MESSAGE("Expecting end of stream");
auto mfopt = read_next();


@@ -108,14 +108,10 @@ SEASTAR_TEST_CASE(test_flat_mutation_reader_consume_single_partition) {
BOOST_REQUIRE_EQUAL(1, result._consume_new_partition_call_count);
BOOST_REQUIRE_EQUAL(1, result._consume_end_of_partition_call_count);
BOOST_REQUIRE_EQUAL(m.partition().partition_tombstone() ? 1 : 0, result._consume_tombstone_call_count);
auto r2 = flat_mutation_reader_from_mutations({m});
auto start = r2().get0();
BOOST_REQUIRE(start);
BOOST_REQUIRE(start->is_partition_start());
auto r2 = assert_that(flat_mutation_reader_from_mutations({m}));
r2.produces_partition_start(m.decorated_key(), m.partition().partition_tombstone());
for (auto& mf : result._fragments) {
auto mfopt = r2().get0();
BOOST_REQUIRE(mfopt);
BOOST_REQUIRE(mf.equal(*m.schema(), *mfopt));
r2.produces(*m.schema(), mf);
}
}
});


@@ -23,6 +23,8 @@
#include <boost/test/unit_test.hpp>
#include <seastar/util/defer.hh>
#include "tests/test-utils.hh"
#include "message/messaging_service.hh"
#include "gms/failure_detector.hh"
@@ -39,18 +41,23 @@ SEASTAR_TEST_CASE(test_boot_shutdown){
sharded<auth::service> auth_service;
sharded<db::system_distributed_keyspace> sys_dist_ks;
utils::fb_utilities::set_broadcast_address(gms::inet_address("127.0.0.1"));
locator::i_endpoint_snitch::create_snitch("SimpleSnitch").get();
auto stop_snitch = defer([&] { gms::get_failure_detector().stop().get(); });
netw::get_messaging_service().start(gms::inet_address("127.0.0.1"), 7000, false /* don't bind */).get();
auto stop_messaging_service = defer([&] { netw::get_messaging_service().stop().get(); });
service::get_storage_service().start(std::ref(db), std::ref(auth_service), std::ref(sys_dist_ks)).get();
auto stop_ss = defer([&] { service::get_storage_service().stop().get(); });
db.start().get();
netw::get_messaging_service().start(gms::inet_address("127.0.0.1")).get();
auto stop_db = defer([&] { db.stop().get(); });
gms::get_failure_detector().start().get();
auto stop_failure_detector = defer([&] { gms::get_failure_detector().stop().get(); });
gms::get_gossiper().start().get();
gms::get_gossiper().stop().get();
gms::get_failure_detector().stop().get();
db.stop().get();
service::get_storage_service().stop().get();
netw::get_messaging_service().stop().get();
locator::i_endpoint_snitch::stop_snitch().get();
auto stop_gossiper = defer([&] { gms::get_gossiper().stop().get(); });
});
}


@@ -33,6 +33,10 @@
#include "imr/fundamental.hh"
#include "imr/compound.hh"
#include "imr/methods.hh"
#include "imr/utils.hh"
#include "failure_injecting_allocation_strategy.hh"
#include "utils/logalloc.hh"
#include "random-utils.hh"
@@ -717,3 +721,127 @@ BOOST_AUTO_TEST_CASE(test_variant_destructor) {
}
BOOST_AUTO_TEST_SUITE_END();
namespace object_exception_safety {
using nested_structure = imr::structure<
imr::member<A, imr::pod<size_t>>,
imr::member<B, imr::buffer<B>>
>;
using structure = imr::structure<
imr::member<A, imr::pod<size_t>>,
imr::member<C, imr::tagged_type<C, imr::pod<void*>>>,
imr::member<D, imr::tagged_type<C, imr::pod<void*>>>,
imr::member<B, imr::buffer<A>>
>;
struct structue_context {
size_t _size;
structue_context(const uint8_t* ptr)
: _size(imr::pod<size_t>::make_view(ptr).load())
{
BOOST_CHECK_EQUAL(_size, 4);
}
template<typename Tag>
size_t size_of() const noexcept {
return _size;
}
template<typename Tag, typename... Args>
decltype(auto) context_for(Args&&...) const noexcept { return *this; }
};
struct nested_structue_context {
size_t _size;
nested_structue_context(const uint8_t* ptr)
: _size(imr::pod<size_t>::make_view(ptr).load())
{
BOOST_CHECK_NE(_size, 0);
}
template<typename Tag>
size_t size_of() const noexcept {
return _size;
}
template<typename Tag, typename... Args>
decltype(auto) context_for(Args&&...) const noexcept { return *this; }
};
}
namespace imr::methods {
template<>
struct destructor<imr::tagged_type<C, imr::pod<void*>>> {
static void run(uint8_t* ptr, ...) {
using namespace object_exception_safety;
auto obj_ptr = imr::pod<uint8_t*>::make_view(ptr).load();
imr::methods::destroy<nested_structure>(obj_ptr, nested_structue_context(obj_ptr));
current_allocator().free(obj_ptr);
}
};
}
BOOST_AUTO_TEST_CASE(test_object_exception_safety) {
using namespace object_exception_safety;
using context_factory_for_structure = imr::alloc::context_factory<imr::utils::object_context<structue_context>>;
using lsa_migrator_fn_for_structure = imr::alloc::lsa_migrate_fn<imr::utils::object<structure>::structure, context_factory_for_structure>;
auto migrator_for_structure = lsa_migrator_fn_for_structure(context_factory_for_structure());
using context_factory_for_nested_structure = imr::alloc::context_factory<nested_structue_context>;
using lsa_migrator_fn_for_nested_structure = imr::alloc::lsa_migrate_fn<nested_structure, context_factory_for_nested_structure>;
auto migrator_for_nested_structure = lsa_migrator_fn_for_nested_structure(context_factory_for_nested_structure());
auto writer_fn = [&] (auto serializer, auto& allocator) {
return serializer
.serialize(4)
.serialize(allocator.template allocate<nested_structure>(
&migrator_for_nested_structure,
[&] (auto nested_serializer) {
return nested_serializer
.serialize(128)
.serialize(128, [] (auto&&...) { })
.done();
}
))
.serialize(allocator.template allocate<nested_structure>(
&migrator_for_nested_structure,
[&] (auto nested_serializer) {
return nested_serializer
.serialize(1024)
.serialize(1024, [] (auto&&...) { })
.done();
}
))
.serialize(bytes(4, 'a'))
.done();
};
logalloc::region reg;
size_t fail_offset = 0;
auto allocator = failure_injecting_allocation_strategy(reg.allocator());
with_allocator(allocator, [&] {
while (true) {
allocator.fail_after(fail_offset++);
try {
imr::utils::object<structure>::make(writer_fn, &migrator_for_structure);
} catch (const std::bad_alloc&) {
BOOST_CHECK_EQUAL(reg.occupancy().used_space(), 0);
continue;
}
BOOST_CHECK_EQUAL(reg.occupancy().used_space(), 0);
break;
}
});
BOOST_CHECK_EQUAL(fail_offset, 4);
}

tests/json_test.cc Normal file

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#define BOOST_TEST_MODULE json
#include <boost/test/unit_test.hpp>
#include "tests/test-utils.hh"
#include "json.hh"
#include "stdx.hh"
BOOST_AUTO_TEST_CASE(test_value_to_quoted_string) {
std::vector<sstring> input = {
"\"\\\b\f\n\r\t",
sstring(1, '\0') + "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
"regular string",
"mixed\t\t\t\ba\f \007 string \002 fgh",
"chào mọi người 123!",
"ყველას მოგესალმებით 456?;",
"всем привет",
"大家好",
""
};
std::vector<sstring> expected = {
"\"\\\"\\\\\\b\\f\\n\\r\\t\"",
"\"\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F\"",
"\"regular string\"",
"\"mixed\\t\\t\\t\\ba\\f \\u0007 string \\u0002 fgh\"",
"\"chào mọi người 123!\"",
"\"ყველას მოგესალმებით 456?;\"",
"\"всем привет\"",
"\"大家好\"",
"\"\""
};
for (size_t i = 0; i < input.size(); ++i) {
BOOST_CHECK_EQUAL(json::value_to_quoted_string(input[i]), expected[i]);
}
}


@@ -269,7 +269,7 @@ SEASTAR_TEST_CASE(test_loading_cache_loading_different_keys) {
using namespace std::chrono;
std::vector<int> ivec(num_loaders);
load_count = 0;
utils::loading_cache<int, sstring> loading_cache(num_loaders, 1s, test_logger);
utils::loading_cache<int, sstring> loading_cache(num_loaders, 1h, test_logger);
auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
prepare().get();


@@ -26,11 +26,13 @@
#include <boost/test/unit_test.hpp>
#include <query-result-set.hh>
#include <query-result-writer.hh>
#include "tests/test_services.hh"
#include "tests/test-utils.hh"
#include "tests/mutation_assertions.hh"
#include "tests/result_set_assertions.hh"
#include "tests/mutation_source_test.hh"
#include "mutation_query.hh"
#include "core/do_with.hh"
@@ -527,3 +529,22 @@ SEASTAR_TEST_CASE(test_partition_limit) {
}
});
}
SEASTAR_THREAD_TEST_CASE(test_result_size_calculation) {
random_mutation_generator gen(random_mutation_generator::generate_counters::no);
std::vector<mutation> mutations = gen(1);
schema_ptr s = gen.schema();
mutation_source source = make_source(std::move(mutations));
query::result_memory_limiter l(std::numeric_limits<ssize_t>::max());
query::partition_slice slice = make_full_slice(*s);
slice.options.set<query::partition_slice::option::allow_short_read>();
query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, l.new_digest_read(query::result_memory_limiter::maximum_result_size).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(), gc_clock::now(), digest_only_builder).get0();
query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, l.new_data_read(query::result_memory_limiter::maximum_result_size).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(), gc_clock::now(), result_and_digest_builder).get0();
BOOST_REQUIRE_EQUAL(digest_only_builder.memory_accounter().used_memory(), result_and_digest_builder.memory_accounter().used_memory());
}


@@ -659,6 +659,46 @@ void test_mutation_reader_fragments_have_monotonic_positions(populate_fn populat
});
}
static void test_date_tiered_clustering_slicing(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
simple_schema ss;
auto s = schema_builder(ss.schema())
.set_compaction_strategy(sstables::compaction_strategy_type::date_tiered)
.build();
auto pkey = ss.make_pkey();
mutation m1(s, pkey);
ss.add_static_row(m1, "s");
m1.partition().apply(ss.new_tombstone());
ss.add_row(m1, ss.make_ckey(0), "v1");
mutation_source ms = populate(s, {m1});
// query row outside the range of existing rows to exercise sstable clustering key filter
{
auto slice = partition_slice_builder(*s)
.with_range(ss.make_ckey_range(1, 2))
.build();
auto prange = dht::partition_range::make_singular(pkey);
assert_that(ms.make_reader(s, prange, slice))
.produces(m1, slice.row_ranges(*s, pkey.key()))
.produces_end_of_stream();
}
{
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range::make_singular(ss.make_ckey(0)))
.build();
auto prange = dht::partition_range::make_singular(pkey);
assert_that(ms.make_reader(s, prange, slice))
.produces(m1)
.produces_end_of_stream();
}
}
static void test_clustering_slices(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
auto s = schema_builder("ks", "cf")
@@ -1012,6 +1052,7 @@ void test_slicing_with_overlapping_range_tombstones(populate_fn populate) {
}
void run_mutation_reader_tests(populate_fn populate) {
test_date_tiered_clustering_slicing(populate);
test_fast_forwarding_across_partitions_to_empty_range(populate);
test_clustering_slices(populate);
test_mutation_reader_fragments_have_monotonic_positions(populate);


@@ -53,7 +53,7 @@
#include "cell_locking.hh"
#include "flat_mutation_reader_assertions.hh"
#include "service/storage_proxy.hh"
#include "random-utils.hh"
#include "simple_schema.hh"
using namespace std::chrono_literals;
@@ -78,7 +78,7 @@ static atomic_cell make_atomic_cell(data_type dt, T value) {
template<typename T>
static atomic_cell make_collection_member(data_type dt, T value) {
return atomic_cell::make_live(*dt, 0, dt->decompose(std::move(value)));
return atomic_cell::make_live(*dt, 0, dt->decompose(std::move(value)), atomic_cell::collection_member::yes);
};
static mutation_partition get_partition(memtable& mt, const partition_key& key) {
@@ -1603,3 +1603,158 @@ SEASTAR_TEST_CASE(test_continuity_merging) {
}
});
}
class measuring_allocator final : public allocation_strategy {
size_t _allocated_bytes;
public:
virtual void* alloc(migrate_fn mf, size_t size, size_t alignment) override {
_allocated_bytes += size;
return standard_allocator().alloc(mf, size, alignment);
}
virtual void free(void* ptr, size_t size) override {
standard_allocator().free(ptr, size);
}
virtual void free(void* ptr) override {
standard_allocator().free(ptr);
}
virtual size_t object_memory_size_in_allocator(const void* obj) const noexcept override {
return standard_allocator().object_memory_size_in_allocator(obj);
}
size_t allocated_bytes() const { return _allocated_bytes; }
};
SEASTAR_THREAD_TEST_CASE(test_external_memory_usage) {
measuring_allocator alloc;
auto s = simple_schema();
auto generate = [&s] {
size_t data_size = 0;
auto m = mutation(s.schema(), s.make_pkey("pk"));
auto row_count = tests::random::get_int(1, 16);
for (auto i = 0; i < row_count; i++) {
auto ck_value = to_hex(tests::random::get_bytes(tests::random::get_int(1023) + 1));
data_size += ck_value.size();
auto ck = s.make_ckey(ck_value);
auto value = to_hex(tests::random::get_bytes(tests::random::get_int(128 * 1024)));
data_size += value.size();
s.add_row(m, ck, value);
}
return std::pair(std::move(m), data_size);
};
for (auto i = 0; i < 16; i++) {
auto [ m, size ] = generate();
with_allocator(alloc, [&] {
auto before = alloc.allocated_bytes();
auto m2 = m;
auto after = alloc.allocated_bytes();
BOOST_CHECK_EQUAL(m.partition().external_memory_usage(*s.schema()),
m2.partition().external_memory_usage(*s.schema()));
BOOST_CHECK_GE(m.partition().external_memory_usage(*s.schema()), size);
BOOST_CHECK_EQUAL(m.partition().external_memory_usage(*s.schema()), after - before);
});
}
}
SEASTAR_THREAD_TEST_CASE(test_cell_external_memory_usage) {
measuring_allocator alloc;
auto test_live_atomic_cell = [&] (data_type dt, bytes_view bv) {
with_allocator(alloc, [&] {
auto before = alloc.allocated_bytes();
auto ac = atomic_cell_or_collection(atomic_cell::make_live(*dt, 1, bv));
auto after = alloc.allocated_bytes();
BOOST_CHECK_GE(ac.external_memory_usage(*dt), bv.size());
BOOST_CHECK_EQUAL(ac.external_memory_usage(*dt), after - before);
});
};
test_live_atomic_cell(int32_type, { });
test_live_atomic_cell(int32_type, int32_type->decompose(int32_t(1)));
test_live_atomic_cell(bytes_type, { });
test_live_atomic_cell(bytes_type, bytes(1, 'a'));
test_live_atomic_cell(bytes_type, bytes(16, 'a'));
test_live_atomic_cell(bytes_type, bytes(32, 'a'));
test_live_atomic_cell(bytes_type, bytes(1024, 'a'));
test_live_atomic_cell(bytes_type, bytes(64 * 1024 - 1, 'a'));
test_live_atomic_cell(bytes_type, bytes(64 * 1024, 'a'));
test_live_atomic_cell(bytes_type, bytes(64 * 1024 + 1, 'a'));
test_live_atomic_cell(bytes_type, bytes(1024 * 1024, 'a'));
auto test_collection = [&] (bytes_view bv) {
auto collection_type = map_type_impl::get_instance(int32_type, bytes_type, true);
auto m = make_collection_mutation({ }, int32_type->decompose(0), make_collection_member(bytes_type, data_value(bytes(bv))));
auto cell = atomic_cell_or_collection(collection_type->serialize_mutation_form(m));
with_allocator(alloc, [&] {
auto before = alloc.allocated_bytes();
auto cell2 = cell.copy(*collection_type);
auto after = alloc.allocated_bytes();
BOOST_CHECK_GE(cell2.external_memory_usage(*collection_type), bv.size());
BOOST_CHECK_EQUAL(cell2.external_memory_usage(*collection_type), cell.external_memory_usage(*collection_type));
BOOST_CHECK_EQUAL(cell2.external_memory_usage(*collection_type), after - before);
});
};
test_collection({ });
test_collection(bytes(1, 'a'));
test_collection(bytes(16, 'a'));
test_collection(bytes(32, 'a'));
test_collection(bytes(1024, 'a'));
test_collection(bytes(64 * 1024 - 1, 'a'));
test_collection(bytes(64 * 1024, 'a'));
test_collection(bytes(64 * 1024 + 1, 'a'));
test_collection(bytes(1024 * 1024, 'a'));
}
// external_memory_usage() must be invariant to the merging order,
// so that accounting of a clustering_row produced by partition_snapshot_flat_reader
// doesn't give a greater result than what is used by the memtable region, possibly
// after all MVCC versions are merged.
// Overaccounting leads to assertion failure in ~flush_memory_accounter.
SEASTAR_THREAD_TEST_CASE(test_row_size_is_immune_to_application_order) {
auto s = schema_builder("ks", "cf")
.with_column("pk", utf8_type, column_kind::partition_key)
.with_column("v1", utf8_type)
.with_column("v2", utf8_type)
.with_column("v3", utf8_type)
.with_column("v4", utf8_type)
.with_column("v5", utf8_type)
.with_column("v6", utf8_type)
.with_column("v7", utf8_type)
.with_column("v8", utf8_type)
.with_column("v9", utf8_type)
.build();
auto value = utf8_type->decompose(data_value("value"));
row r1;
r1.append_cell(7, make_atomic_cell(value));
row r2;
r2.append_cell(8, make_atomic_cell(value));
auto size1 = [&] {
auto r3 = row(*s, column_kind::regular_column, r1);
r3.apply(*s, column_kind::regular_column, r2);
return r3.external_memory_usage(*s, column_kind::regular_column);
}();
auto size2 = [&] {
auto r3 = row(*s, column_kind::regular_column, r2);
r3.apply(*s, column_kind::regular_column, r1);
return r3.external_memory_usage(*s, column_kind::regular_column);
}();
BOOST_REQUIRE_EQUAL(size1, size2);
}


@@ -30,6 +30,7 @@
#include <map>
#include <iostream>
#include <sstream>
#include <boost/range/algorithm/adjacent_find.hpp>
static logging::logger nlogger("NetworkTopologyStrategyLogger");
@@ -52,6 +53,27 @@ void print_natural_endpoints(double point, const std::vector<inet_address> v) {
nlogger.debug("{}", strm.str());
}
#ifndef SEASTAR_DEBUG
static void verify_sorted(const dht::token_range_vector& trv) {
auto not_strictly_before = [] (const dht::token_range a, const dht::token_range b) {
return !b.start()
|| !a.end()
|| a.end()->value() > b.start()->value()
|| (a.end()->value() == b.start()->value() && a.end()->is_inclusive() && b.start()->is_inclusive());
};
BOOST_CHECK(boost::adjacent_find(trv, not_strictly_before) == trv.end());
}
#endif
static void check_ranges_are_sorted(abstract_replication_strategy* ars, gms::inet_address ep) {
// Too slow in debug mode
#ifndef SEASTAR_DEBUG
verify_sorted(ars->get_ranges(ep));
verify_sorted(ars->get_primary_ranges(ep));
verify_sorted(ars->get_primary_ranges_within_dc(ep));
#endif
}
void strategy_sanity_check(
abstract_replication_strategy* ars_ptr,
const std::map<sstring, sstring>& options) {
@@ -150,6 +172,7 @@ void full_ring_check(const std::vector<ring_point>& ring_points,
auto endpoints2 = ars_ptr->get_natural_endpoints(t2);
endpoints_check(ars_ptr, endpoints2);
check_ranges_are_sorted(ars_ptr, rp.host);
BOOST_CHECK(cache_hit_count + 1 == ars_ptr->get_cache_hits_count());
BOOST_CHECK(endpoints1 == endpoints2);
}
