Compare commits

...

79 Commits

Author SHA1 Message Date
Calle Wilund
1d449b47e1 commit log reader bugfix: fix attempts to read entries across chunk bounds
read_entry did not verify that the current chunk has enough data left
for a minimal entry. Thus we could try to read an entry from the slack
left in a chunk and get lost in the file (pos > next, skip very far
-> eof), and also give false errors about corruption.
Message-Id: <1452517700-599-1-git-send-email-calle@scylladb.com>

(cherry picked from commit 7f4985a017)
2016-01-12 10:29:31 +02:00
Avi Kivity
b7d5fbe967 db: reduce log spam when ignoring an sstable
With 10 sstables/shard and 50 shards, we get ~10*50*50 messages = 25,000
log messages about sstables being ignored.  This is not reasonable.

Reduce the log level to debug, and move the message to database.cc,
because at its original location, the containing function has nothing to
do with the message itself.

Reviewed-by: Raphael S. Carvalho <raphaelsc@cloudius-systems.com>
Message-Id: <1452181687-7665-1-git-send-email-avi@scylladb.com>
2016-01-07 19:26:18 +02:00
Vlad Zolotarov
149aea32e7 database: filter out manifest.json files
Filter out manifest.json files when reading sstables during
bootup and when loading new sstables ('nodetool refresh').

Fixes issue #529

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1451911734-26511-3-git-send-email-vladz@cloudius-systems.com>
2016-01-07 18:00:09 +02:00
Vlad Zolotarov
97c796b26b database: lister: add a filtering option
Add the ability to pass a filter functor that receives the full path
of a directory entry and returns a boolean value: TRUE if the
entry should be enumerated, FALSE if it should be filtered out.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1451911734-26511-2-git-send-email-vladz@cloudius-systems.com>
2016-01-07 18:00:03 +02:00
Avi Kivity
1545fc505b main: wait for API http server to start
Wait for the future returned by the http server start process to resolve,
so we know it is started.  If it doesn't, we'll hit the or_terminate()
further down the line and exit with an error code.
Message-Id: <1452092806-11508-3-git-send-email-avi@scylladb.com>
2016-01-07 16:50:27 +02:00
Avi Kivity
b690eaef38 snitch: intentionally leak snitch singleton
Because our shutdown process is crippled (refs #293), we won't shut down the
snitch correctly, and the sharded<> instance can assert during shutdown.
This interferes with the next patch, which adds orderly shutdown if the http
server fails to start.

Leak it intentionally to work around the problem.
Message-Id: <1452092806-11508-2-git-send-email-avi@scylladb.com>
2016-01-07 16:50:22 +02:00
Benoît Canet
7b5df973fa config: Mark ssl_storage_port as Used
Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <1452082041-6117-1-git-send-email-benoit@scylladb.com>
2016-01-06 20:20:59 +02:00
Pekka Enberg
c33815211f release: prepare for 0.15 2016-01-06 12:16:58 +02:00
Benoît Canet
e80c8b6130 config: Mark previously unused SSL client/server options as used
The previous SSL enablement patches do make use of these
options, but they are still marked as Unused.
Change this and also update the db/config.hh documentation
accordingly.

Syntax is now:

client_encryption_options:
   enabled: true
   certificate: <path-to-PEM-x509-cert> (default conf/scylla.crt)
   keyfile: <path-to-PEM-x509-key> (default conf/scylla.key)

Fixes: #756.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <1452032073-6933-1-git-send-email-benoit@scylladb.com>
2016-01-06 10:32:53 +02:00
Tomasz Grabiec
9d71e4a7eb Merge branch 'fix_to_issue_676_v4' from git@github.com:raphaelsc/scylla.git
Compaction fixes from Raphael:

There were two problems causing issue 676:
1) max_purgeable was being miscalculated (fixed by b7d36af).
2) empty row not being removed by mutation_partition::do_compact
Testcase is added to make sure that a tombstone will be purged under
certain conditions.
2016-01-05 15:19:22 +01:00
Raphael S. Carvalho
a81b660c0d tests: check that tombstone is purged under certain conditions
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-01-05 15:19:21 +01:00
Raphael S. Carvalho
03eee06784 remove empty rows in mutation_partition::do_compact
do_compact() wasn't removing an empty row that is covered by a
tombstone. As a result, an empty partition could be written to an
sstable. To solve this problem, let's make trim_rows remove a
row that is considered to be empty. A row is empty if it has no
tombstone, no marker and no cells.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-01-05 15:19:21 +01:00
Pekka Enberg
800ed6376a Merge "Repair overhaul" from Nadav
"This is another version of the repair overhaul, to avoid streaming *all* the
 data between nodes by sending checksums of token ranges and only streaming
 ranges which contain differing data."
2016-01-05 16:05:44 +02:00
Pekka Enberg
f4bdec4d09 Merge "Support for deleting all snapshots" from Vlad
"Add support for deleting all snapshots of all keyspaces."

Fixes #639.
2016-01-05 15:42:44 +02:00
Nadav Har'El
f90e1c1548 repair: support "hosts" and "dataCenters" parameters
Support the "hosts" and "dataCenters" parameters of repair. The first
specifies the known good hosts to repair this host from (plus this host),
and the second asks to restrict the repair to the local data center (you
must issue the repair to a node in the data center you want to repair -
issuing the command to a data center other than the named one returns
an error).

For example these options are used by nodetool commands like:
nodetool repair -hosts 127.0.0.1,127.0.0.2 keyspace
nodetool repair -dc datacenter1

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
ac4e86d861 repair: use repair_checksum_range
The existing repair code always streamed the entire content of the
database. In this overhaul, we send "repair_checksum_range" messages to
the other nodes to verify whether they have exactly the same data as
this node, and if they do, we avoid streaming the identical data.

We make an attempt to split the token ranges up to contain an estimated
100 keys each, and send these ranges' checksums. Future versions of this
code will need to improve this estimation (and make this "100" a parameter).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
9e65ecf983 repair: convenience function for syncing a range
This patch adds a function sync_range() for synchronizing all partitions
in a given token range between a set of replicas (this node and a list of
neighbors).
Repair will call this function once it has decided that the data the
replicas hold in this range is not identical.

The implementation streams all the data in the given range, from each of
the neighbors to this node - so now this node contains the most up-to-date
data. It then streams the resulting data back to all the neighbors.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
f5b2135a80 repair: repair_checksum_range message
This patch adds a new type of message, "REPAIR_CHECKSUM_RANGE" to scylla's
"messaging_service" RPC mechanism, for the use of repair:

With this message the repair's master host tells a slave host to calculate
the checksum of a column-family's partitions in a given token range, and
return that checksum.

The implementation of this message uses the checksum_range() function
defined in the previous patch.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
e9d266a189 repair: checksum of partitions in range
This patch adds functions for calculating the checksum of all the
partitions in a given token range in the given column-family - either
in the current shard, or across all shards in this node.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
0591fa7089 repair: partition-set checksum
This patch adds a mechanism for calculating a checksum for a set of
partitions. The repair process will use these checksums to compare the
data held by different replicas.

We use a strong checksum (SHA-256) for each individual partition in the set,
and then a simple XOR of those checksums to produce a checksum for the
entire set. XOR is good enough for merging strong checksums, and allows us
to independently calculate the checksums of different subsets of the
original sets - e.g., each shard can calculate its own checksum and we
can XOR the resulting checksums to get the final checksum.

Apache Cassandra uses a very similar checksum scheme, also using SHA-256
and XOR. One small difference in the implementation is that we include the
partition key in its checksum, while Cassandra doesn't; I believe this
difference has no real justification (although it is very unlikely to cause
problems in practice). See further discussion on this in
https://issues.apache.org/jira/browse/CASSANDRA-10728.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Nadav Har'El
faa87b31a8 fix to_partition_range() inclusiveness
A cut-and-paste accident in query::to_partition_range caused the wrong
end's inclusiveness to be tested.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-01-05 15:38:40 +02:00
Shlomi Livne
846bf9644e dist/redhat: Increase scylla-server service start timeout to 15 min
Fixes #749

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2016-01-05 15:30:22 +02:00
Pekka Enberg
67ccd05bbe api/storage_service: Wire up 'compaction_throughput_mb_per_sec'
The API is needed by nodetool compactionstats command.
2016-01-05 13:01:05 +02:00
Pekka Enberg
5db82aa815 Merge "Fix frozen collections" from Paweł
"This series prevents frozen collections from appearing in the schema
comparator.

Fixes #579."
2016-01-05 12:43:06 +02:00
Paweł Dziepak
284162c41b test/cql3: add test for frozen collections
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 11:13:53 +01:00
Paweł Dziepak
a5a744655e schema: do not add frozen collections to compound name
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 10:49:32 +01:00
Paweł Dziepak
ed7d9d4996 schema: change has_collections() to has_multi_column_collections()
None of the users of schema::has_collections() are really interested in
frozen ones.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 10:46:42 +01:00
Tomasz Grabiec
fecb1c92e7 Merge branch 'pdziepak/prepare-for-alter-table/v1' from seastar-dev.git 2016-01-05 10:30:46 +01:00
Paweł Dziepak
70f5ed6c64 cql3: enable ALTER TABLE in cql3 grammar definition
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
3ca4e27dba cql3: convert alter_table_statement to c++
Everything except alter_table_statement::announce_migration() is
translated. announce_migration() has to wait for multi schema support to
be merged.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
35edda76c4 cql3: import AlterTableStatement.java
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
b615bfa47e cql3: add cf_prop_defs::get_default_time_to_live()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
18747b21f2 map_difference: accept std::unordered_map
map_difference implementation doesn't need elements in the container to
be sorted.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Tomasz Grabiec
71d6e73ae3 map_difference: Define default value for key_comp 2016-01-05 09:49:04 +01:00
Paweł Dziepak
3693e77eec schema: add is_cql3_table()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
4bd031d885 schema: add column_definition::is_part_of_cell_name()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Paweł Dziepak
f39f21ce02 schema: add column_definition::is_indexed()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-01-05 09:49:04 +01:00
Glauber Costa
74fbd8fac0 do not call open_file_dma directly
We have an API that wraps open_file_dma, which we use in some places, but in
many other places we call the reactor version directly.

This patch changes the latter to match the former. It has the added benefit
of making it easier to change these interfaces if needed.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <29296e4ec6f5e84361992028fe3f27adc569f139.1451950408.git.glauber@scylladb.com>
2016-01-05 10:37:57 +02:00
Avi Kivity
e9400dfa96 Revert "sstable: Initialize super class of malformed_sstable_exception"
This reverts commit d69dc32c92d63057edf9f84aa57ca53b2a6e37e4; it does nothing
and does not address issue #669.
2016-01-05 10:21:00 +02:00
Benoît Canet
d69dc32c92 sstable: Initialize super class of malformed_sstable_exception
This exception was not caught properly as a std::exception
by report_failed_future's call to report_exception, because the
superclass std::exception was not initialized.

Fixes #669.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
2016-01-05 09:54:36 +02:00
Lucas Meneghel Rodrigues
8ef9a60c09 dist/common/scripts/scylla_prepare: Change message to error
Recently, Scylla was changed to mandate the use of XFS
for its data directories, unless the flag --developer-mode true
is provided. So during the AMI setup stage, if the user
did not provide extra disks for the setup scripts to prepare,
the scylla service will refuse to start. Therefore, the
message in scylla_prepare has to be changed to an actual
error message, and the file name, to be changed to
something that reflects the event that happened.

Signed-off-by: Lucas Meneghel Rodrigues <lmr@scylladb.com>
2016-01-05 09:23:57 +02:00
Tomasz Grabiec
cc6c35d45c Move seastar submodule head
Avi Kivity (1):
      Merge "rpc negotiation fixes" from Gleb

Gleb Natapov (3):
      rpc: fix peer address printing during logging
      rpc: make server send negotiation frame back before closing connection on error
      rpc: fix documentation for negotiation procedure.

Nadav Har'El (1):
      fix operator<<() for std::vector<T>

Tomasz Grabiec (1):
      core/byteorder: Add missing include
2016-01-04 15:07:21 +01:00
Shlomi Livne
f26a75f48b Fixing missing items in move from scylla-ami.sh to scylla_install
scylla-ami.sh moved some AMI-specific files. These parts were
dropped when converging scylla-ami into scylla_install. Fix that.

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2016-01-04 15:23:14 +02:00
Shlomi Livne
cec6e6bc20 Invoke scylla_bootparam_setup with/without ami flag
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2016-01-04 15:23:08 +02:00
Shlomi Livne
aebeb95342 Fix error: no integer expression expected in AMI creation
The script imports /etc/sysconfig/scylla-server for configuration
settings (NR_PAGES). /etc/sysconfig/scylla-server includes an AMI
param which is of string value and is set as a last step in
scylla_install (after scylla_bootparam_setup has been initiated).

The AMI variable is set up in scylla_install and is used in multiple
scripts. To resolve the conflict, move the import of
/etc/sysconfig/scylla-server to after the AMI variable has been compared.

Fixes: #744

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2016-01-04 15:23:01 +02:00
Pekka Enberg
24809de44f dist/docker: Switch to CentOS 7 as base image
Switch to CentOS 7 as the Docker base image. It's more stable and
updated less frequently than Fedora. As a bonus, its Thrift package
doesn't pull in the world as a dependency, which reduces image size from 700
MB to 380 MB.

Suggested by Avi.
Message-Id: <1451911969-26647-1-git-send-email-penberg@scylladb.com>
2016-01-04 14:53:53 +02:00
Tomasz Grabiec
5a9d45935a Merge tag 'asias/fix_cql_query_test/v1' from seastar-dev.git
Fixes for cql_query_test and gossip_test from Asias.
2016-01-04 12:28:49 +01:00
Avi Kivity
c559008915 transport: protect against excessive memory consumption
If requests are delayed downstream from the cql server, and the client is
able to generate unrelated requests without limit, then the transient memory
consumed by the requests will overflow the shard's capacity.

Fix by adding a semaphore to cap the amount of transient memory occupied by
requests.

Fixes #674.
2016-01-04 12:11:00 +01:00
Avi Kivity
78429ad818 types: implement collection compatibility checks
compatible: can be cast, keeps sort order
value-compatible: can be cast, may change sort order

frozen: values participate in sort order
unfrozen: only sort keys participate in sort order

Fixes #740.
2016-01-04 11:02:21 +01:00
Avi Kivity
33fd044609 Merge "Event notification exception safety" from Pekka
"Fix both migration manager and storage service to catch and log
exceptions for listeners to ensure all listeners are notified.

Spotted by Avi."
2016-01-04 11:03:01 +02:00
Pekka Enberg
f646241f1c service/storage_service: Make event notification exception safe
If one of the listeners throws an exception, we must ensure that other
listeners are still notified.

Spotted by Avi.
2016-01-04 10:40:02 +02:00
Pekka Enberg
1e29b07e40 service/migration_manager: Make event notification exception safe
If one of the listeners throws an exception, we must ensure that other
listeners are still notified.

Spotted by Avi.
2016-01-04 10:39:49 +02:00
Asias He
7c9f9f068f cql_server: Do not ignore future in stop
Now that connection.shutdown() returns a future, we cannot ignore it.
Also, add shutdown process info for debugging.

Message-Id: <b2d100bf9c817d7a230c6cd720944ba4fae416e2.1451894645.git.asias@scylladb.com>
2016-01-04 10:17:44 +02:00
Amnon Heiman
6942b41693 API: rename the map of string, double to map_string_double
This replaces the confusing name with a more meaningful one.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1451466952-405-1-git-send-email-amnon@scylladb.com>
2016-01-03 19:10:49 +02:00
Avi Kivity
0d93aa8797 Merge "revert redundant use of gate in messaging_service" from Gleb
"Messaging service inherits from seastar::async_sharded_service, which
guarantees that sharded<>::stop() will not complete until all references
to the service go away. It was done specifically to avoid using the
more verbose gate interface in sharded<> services, since it turned out
that almost all of them need one eventually. Unfortunately, patches that
add a redundant gate to messaging_service sneaked past my review. This series
reverts them."
2016-01-03 16:48:40 +02:00
Gleb Natapov
fae98f5d67 Revert "messaging_service: wait for outstanding requests"
This reverts commit 9661d8936b.
Message-Id: <1450690729-22551-3-git-send-email-gleb@scylladb.com>
2016-01-03 16:06:39 +02:00
Gleb Natapov
de0771f1d1 Revert "messaging_service: restore indentation"
This reverts commit dcbba2303e.
Message-Id: <1450690729-22551-2-git-send-email-gleb@scylladb.com>
2016-01-03 16:06:38 +02:00
Vlad Zolotarov
7bb2b2408b database::clear_snapshot(): added support for deleting all snapshots
When 'nodetool clearsnapshot' is given no parameters it should
remove all existing snapshots.

Fixes issue #639

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-01-03 14:22:25 +02:00
Vlad Zolotarov
d5920705b8 service::storage_service: move clear_snapshot() code to 'database' class
service::storage_service::clear_snapshot() was built around _db.local()
calls so it makes more sense to move its code into the 'database' class
instead of calling _db.local().bla_bla() all the time.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-01-03 14:22:17 +02:00
Vlad Zolotarov
d0cbcce4a2 service::storage_service: remove unused variable ('deleted_keyspaces')
Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-01-03 13:30:46 +02:00
Takuya ASADA
6e8dd00535 dist: apply limits settings correctly on Ubuntu
Fixes #738

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-01-02 12:20:18 +02:00
Asias He
d6e352706a storage_service: Drop duplicated print
We have done that in the logger.
2016-01-01 10:15:17 +08:00
Asias He
4952042fbf tests: Fix cql_test_env.cc
Current service initialization is a total mess in cql_test_env. Start
the services in the same order as in main.cc.

Fixes #715, #716

'./test.py --mode release' passes.
2016-01-01 10:15:17 +08:00
Asias He
ca935bf602 tests: Fix gossip_test
Gossip depends on get_ring_delay from storage_service. storage_service
depends on database. Start them.
2016-01-01 10:15:17 +08:00
Benoît Canet
de508ba6cc README: Add missing build dependencies
It's just a waste of time to find them manually
when compiling ScyllaDB on a fresh install: add them.

Also fix the ninja-build name.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
2015-12-31 13:34:48 +02:00
Shlomi Livne
6316fedc0a Make sure the directory we are writing coredumps to exists
After upgrading an AMI and trying to stop and start a machine,
/var/lib/scylla/coredump is not created. Create the directory if it does
not exist prior to generating a core.

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2015-12-31 13:21:39 +02:00
Pekka Enberg
32d22a1544 cql3: Fix relation grammar rule
There are two untranslated grammar subrules in the 'relation' rule. We
have all the necessary AST classes translated, so translate the
remaining subrules.

Refs #534.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-12-31 11:44:15 +01:00
Tomasz Grabiec
e23adb71cf Merge tag 'asias/streaming/cleanup_verb/v2' from seastar-dev.git
messaging_service cleanups and improvements from Asias
2015-12-31 11:25:09 +01:00
Asias He
1b3d2dee8f streaming: Drop src_cpu_id parameter
Now that we can get src_cpu_id from rpc::client_info, there is
no need to pass it as a verb parameter.
2015-12-31 11:25:09 +01:00
Asias He
3ae21e06b5 messaging_service: Add src_cpu_id to CLIENT_ID verb
It is useful for figuring out which shard on the sender to send
messages back to.
2015-12-31 11:25:09 +01:00
Asias He
22d0525bc0 streaming: Get rid of the _from_ parameter
Get this from cinfo.retrieve_auxiliary inside the rpc handler.
2015-12-31 11:25:08 +01:00
Asias He
89b79d44de streaming: Get rid of the _connecting_ parameter
messaging_service will automatically use the private IP address to connect
to a peer node if possible. There is no need for an upper layer like
streaming to worry about it. Dropping it simplifies things a bit.
2015-12-31 11:25:08 +01:00
Amnon Heiman
8d31c27f7b api: stream_state should include the stream_info
The stream info was created but was left out of the stream state. This
patch adds the created stream_info to the stream state vector.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-31 12:01:03 +02:00
Gleb Natapov
2bcfe02ee6 messaging: remove unused verbs 2015-12-30 15:06:35 +01:00
Gleb Natapov
f0e8b8805c messaging: constify some handlers 2015-12-30 15:06:35 +01:00
Avi Kivity
a318b02335 Merge seastar upstream
* seastar 8b2171e...de112e2 (4):
  > build: disable -fsanitize=vptr harder
  > Merge "Make RPC more robust against protocol changes" from Gleb
  > new when_all implementation
  > apps: memcached: Don't fiddle with the expiration time value in a usable range
2015-12-30 14:56:09 +02:00
Pekka Enberg
500f5b3f27 dist: Increase NOFILE rlimit to 200k
Commit 2ba4910 ("main: verify that the NOFILE rlimit is sufficient")
added a recommendation to set NOFILE rlimit to 200k. Update our release
binaries to do the same.
2015-12-30 11:18:23 +01:00
Avi Kivity
2ba4910385 main: verify that the NOFILE rlimit is sufficient
Require 10k files, recommend 200k.

Allow bypassing via --developer-mode.

Fixes #692.
2015-12-30 11:02:08 +02:00
Avi Kivity
c26689f325 init: bail out if running not on an XFS filesystem
Allow an override via '--developer-mode true', and use it in
the docker setup, since that cannot be expected to use XFS.

Fixes #658.
2015-12-30 10:56:21 +02:00
75 changed files with 1795 additions and 605 deletions

View File

@@ -15,13 +15,13 @@ git submodule update --recursive
* Installing required packages:
```
sudo yum install yaml-cpp-devel lz4-devel zlib-devel snappy-devel jsoncpp-devel thrift-devel antlr3-tool antlr3-C++-devel libasan libubsan
sudo yum install yaml-cpp-devel lz4-devel zlib-devel snappy-devel jsoncpp-devel thrift-devel antlr3-tool antlr3-C++-devel libasan libubsan gcc-c++ gnutls-devel ninja-build ragel libaio-devel cryptopp-devel xfsprogs-devel
```
* Build Scylla
```
./configure.py --mode=release --with=scylla --disable-xen
ninja build/release/scylla -j2 # you can use more cpus if you have tons of RAM
ninja-build build/release/scylla -j2 # you can use more cpus if you have tons of RAM
```

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=development
VERSION=0.15
if test -f version
then

View File

@@ -233,46 +233,27 @@
"verb":{
"type":"string",
"enum":[
"MUTATION",
"BINARY",
"READ_REPAIR",
"READ",
"REQUEST_RESPONSE",
"STREAM_INITIATE",
"STREAM_INITIATE_DONE",
"STREAM_REPLY",
"STREAM_REQUEST",
"RANGE_SLICE",
"BOOTSTRAP_TOKEN",
"TREE_REQUEST",
"TREE_RESPONSE",
"JOIN",
"GOSSIP_DIGEST_SYN",
"GOSSIP_DIGEST_ACK",
"GOSSIP_DIGEST_ACK2",
"DEFINITIONS_ANNOUNCE",
"DEFINITIONS_UPDATE",
"TRUNCATE",
"SCHEMA_CHECK",
"INDEX_SCAN",
"REPLICATION_FINISHED",
"INTERNAL_RESPONSE",
"COUNTER_MUTATION",
"STREAMING_REPAIR_REQUEST",
"STREAMING_REPAIR_RESPONSE",
"SNAPSHOT",
"MIGRATION_REQUEST",
"GOSSIP_SHUTDOWN",
"_TRACE",
"ECHO",
"REPAIR_MESSAGE",
"PAXOS_PREPARE",
"PAXOS_PROPOSE",
"PAXOS_COMMIT",
"PAGED_RANGE",
"UNUSED_1",
"UNUSED_2",
"UNUSED_3"
"CLIENT_ID",
"ECHO",
"MUTATION",
"MUTATION_DONE",
"READ_DATA",
"READ_MUTATION_DATA",
"READ_DIGEST",
"GOSSIP_DIGEST_SYN",
"GOSSIP_DIGEST_ACK2",
"GOSSIP_SHUTDOWN",
"DEFINITIONS_UPDATE",
"TRUNCATE",
"REPLICATION_FINISHED",
"MIGRATION_REQUEST",
"STREAM_INIT_MESSAGE",
"PREPARE_MESSAGE",
"PREPARE_DONE_MESSAGE",
"STREAM_MUTATION",
"STREAM_MUTATION_DONE",
"COMPLETE_MESSAGE",
"LAST"
]
}
}

View File

@@ -425,7 +425,7 @@
"summary":"load value. Keys are IP addresses",
"type":"array",
"items":{
"type":"double_mapper"
"type":"map_string_double"
},
"nickname":"get_load_map",
"produces":[
@@ -2028,8 +2028,8 @@
}
}
},
"double_mapper":{
"id":"double_mapper",
"map_string_double":{
"id":"map_string_double",
"description":"A key value mapping between a string and a double",
"properties":{
"key":{

View File

@@ -34,7 +34,7 @@ namespace api {
using shard_info = messaging_service::shard_info;
using shard_id = messaging_service::shard_id;
static const int32_t num_verb = static_cast<int32_t>(messaging_verb::UNUSED_3) + 1;
static const int32_t num_verb = static_cast<int32_t>(messaging_verb::LAST) + 1;
std::vector<message_counter> map_to_message_counters(
const std::unordered_map<gms::inet_address, unsigned long>& map) {

View File

@@ -169,9 +169,9 @@ void set_storage_service(http_context& ctx, routes& r) {
ss::get_load_map.set(r, [] (std::unique_ptr<request> req) {
return service::get_local_storage_service().get_load_map().then([] (auto&& load_map) {
std::vector<ss::double_mapper> res;
std::vector<ss::map_string_double> res;
for (auto i : load_map) {
ss::double_mapper val;
ss::map_string_double val;
val.key = i.first;
val.value = i.second;
res.push_back(val);
@@ -542,10 +542,9 @@ void set_storage_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(0);
});
ss::get_compaction_throughput_mb_per_sec.set(r, [](std::unique_ptr<request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>(0);
ss::get_compaction_throughput_mb_per_sec.set(r, [&ctx](std::unique_ptr<request> req) {
int value = ctx.db.local().get_config().compaction_throughput_mb_per_sec();
return make_ready_future<json::json_return_type>(value);
});
ss::set_compaction_throughput_mb_per_sec.set(r, [](std::unique_ptr<request> req) {

View File

@@ -72,11 +72,12 @@ static hs::stream_state get_state(
si.peer = boost::lexical_cast<std::string>(info.peer);
si.session_index = info.session_index;
si.state = info.state;
si.connecting = boost::lexical_cast<std::string>(info.connecting);
si.connecting = si.peer;
set_summaries(info.receiving_summaries, si.receiving_summaries);
set_summaries(info.sending_summaries, si.sending_summaries);
set_files(info.receiving_files, si.receiving_files);
set_files(info.sending_files, si.sending_files);
state.sessions.push(si);
}
return state;
}

View File

@@ -824,3 +824,17 @@ commitlog_total_space_in_mb: -1
# reducing overhead from the TCP protocol itself, at the cost of increasing
# latency if you block for cross-datacenter responses.
# inter_dc_tcp_nodelay: false
# Relaxation of environment checks.
#
# Scylla places certain requirements on its environment. If these requirements are
# not met, performance and reliability can be degraded.
#
# These requirements include:
# - A filesystem with good support for asynchronous I/O (AIO). Currently,
# this means XFS.
#
# false: strict environment checks are in place; do not start if they are not met.
# true: relaxed environment checks; performance and reliability may degrade.
#
# developer_mode: false

View File

@@ -292,6 +292,7 @@ scylla_core = (['database.cc',
'cql3/statements/index_target.cc',
'cql3/statements/create_index_statement.cc',
'cql3/statements/truncate_statement.cc',
'cql3/statements/alter_table_statement.cc',
'cql3/update_parameters.cc',
'cql3/ut_name.cc',
'thrift/handler.cc',

View File

@@ -31,6 +31,7 @@ options {
@parser::includes {
#include "cql3/selection/writetime_or_ttl.hh"
#include "cql3/statements/alter_table_statement.hh"
#include "cql3/statements/create_keyspace_statement.hh"
#include "cql3/statements/drop_keyspace_statement.hh"
#include "cql3/statements/create_index_statement.hh"
@@ -269,7 +270,9 @@ cqlStatement returns [shared_ptr<parsed_statement> stmt]
| st12=dropTableStatement { $stmt = st12; }
#if 0
| st13=dropIndexStatement { $stmt = st13; }
#endif
| st14=alterTableStatement { $stmt = st14; }
#if 0
| st15=alterKeyspaceStatement { $stmt = st15; }
| st16=grantStatement { $stmt = st16; }
| st17=revokeStatement { $stmt = st17; }
@@ -768,7 +771,7 @@ alterKeyspaceStatement returns [AlterKeyspaceStatement expr]
: K_ALTER K_KEYSPACE ks=keyspaceName
K_WITH properties[attrs] { $expr = new AlterKeyspaceStatement(ks, attrs); }
;
#endif
/**
* ALTER COLUMN FAMILY <CF> ALTER <column> TYPE <newtype>;
@@ -777,27 +780,29 @@ alterKeyspaceStatement returns [AlterKeyspaceStatement expr]
* ALTER COLUMN FAMILY <CF> WITH <property> = <value>;
* ALTER COLUMN FAMILY <CF> RENAME <column> TO <column>;
*/
alterTableStatement returns [AlterTableStatement expr]
alterTableStatement returns [shared_ptr<alter_table_statement> expr]
@init {
AlterTableStatement.Type type = null;
CFPropDefs props = new CFPropDefs();
Map<ColumnIdentifier.Raw, ColumnIdentifier.Raw> renames = new HashMap<ColumnIdentifier.Raw, ColumnIdentifier.Raw>();
boolean isStatic = false;
alter_table_statement::type type;
auto props = make_shared<cql3::statements::cf_prop_defs>();;
std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>, shared_ptr<cql3::column_identifier::raw>>> renames;
bool is_static = false;
}
: K_ALTER K_COLUMNFAMILY cf=columnFamilyName
( K_ALTER id=cident K_TYPE v=comparatorType { type = AlterTableStatement.Type.ALTER; }
| K_ADD id=cident v=comparatorType ({ isStatic=true; } K_STATIC)? { type = AlterTableStatement.Type.ADD; }
| K_DROP id=cident { type = AlterTableStatement.Type.DROP; }
| K_WITH properties[props] { type = AlterTableStatement.Type.OPTS; }
| K_RENAME { type = AlterTableStatement.Type.RENAME; }
id1=cident K_TO toId1=cident { renames.put(id1, toId1); }
( K_AND idn=cident K_TO toIdn=cident { renames.put(idn, toIdn); } )*
( K_ALTER id=cident K_TYPE v=comparatorType { type = alter_table_statement::type::alter; }
| K_ADD id=cident v=comparatorType ({ is_static=true; } K_STATIC)? { type = alter_table_statement::type::add; }
| K_DROP id=cident { type = alter_table_statement::type::drop; }
| K_WITH properties[props] { type = alter_table_statement::type::opts; }
| K_RENAME { type = alter_table_statement::type::rename; }
id1=cident K_TO toId1=cident { renames.emplace_back(id1, toId1); }
( K_AND idn=cident K_TO toIdn=cident { renames.emplace_back(idn, toIdn); } )*
)
{
$expr = new AlterTableStatement(cf, type, id, v, props, renames, isStatic);
$expr = ::make_shared<alter_table_statement>(std::move(cf), type, std::move(id),
std::move(v), std::move(props), std::move(renames), is_static);
}
;
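The Java grammar collected RENAME pairs in a `HashMap`, while the C++ port uses a `std::vector` of pairs built with `emplace_back`, which preserves declaration order. A minimal standalone sketch of that collection pattern (with a plain `std::string` standing in for `shared_ptr<column_identifier::raw>`, purely for illustration):

```cpp
#include <cassert>
#include <initializer_list>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for shared_ptr<column_identifier::raw>.
using column_name = std::string;
using renames_type = std::vector<std::pair<column_name, column_name>>;

// Collect RENAME pairs in declaration order, as the grammar actions do with
// emplace_back. (A map would lose order and silently collapse duplicate
// source columns; a vector keeps every pair as written.)
renames_type collect_renames(std::initializer_list<std::pair<column_name, column_name>> pairs) {
    renames_type renames;
    for (auto& p : pairs) {
        renames.emplace_back(p.first, p.second);
    }
    return renames;
}
```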
#if 0
/**
* ALTER TYPE <name> ALTER <field> TYPE <newtype>;
* ALTER TYPE <name> ADD <field> <newtype>;
@@ -1243,6 +1248,7 @@ relationType returns [const cql3::operator_type* op = nullptr]
;
relation[std::vector<cql3::relation_ptr>& clauses]
@init{ const cql3::operator_type* rt = nullptr; }
: name=cident type=relationType t=term { $clauses.emplace_back(::make_shared<cql3::single_column_relation>(std::move(name), *type, std::move(t))); }
| K_TOKEN l=tupleOfIdentifiers type=relationType t=term
@@ -1252,11 +1258,9 @@ relation[std::vector<cql3::relation_ptr>& clauses]
{ $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IN, std::move(marker))); }
| name=cident K_IN in_values=singleColumnInValues
{ $clauses.emplace_back(cql3::single_column_relation::create_in_relation(std::move(name), std::move(in_values))); }
#if 0
| name=cident K_CONTAINS { Operator rt = Operator.CONTAINS; } (K_KEY { rt = Operator.CONTAINS_KEY; })?
t=term { $clauses.add(new SingleColumnRelation(name, rt, t)); }
| name=cident '[' key=term ']' type=relationType t=term { $clauses.add(new SingleColumnRelation(name, key, type, t)); }
#endif
| name=cident K_CONTAINS { rt = &cql3::operator_type::CONTAINS; } (K_KEY { rt = &cql3::operator_type::CONTAINS_KEY; })?
t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), *rt, std::move(t))); }
| name=cident '[' key=term ']' type=relationType t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), std::move(key), *type, std::move(t))); }
| ids=tupleOfIdentifiers
( K_IN
( '(' ')'


@@ -134,7 +134,7 @@ public:
* @return <code>true</code> if this selection contains a collection, <code>false</code> otherwise.
*/
bool contains_a_collection() const {
if (!_schema->has_collections()) {
if (!_schema->has_multi_cell_collections()) {
return false;
}


@@ -0,0 +1,300 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cql3/statements/alter_table_statement.hh"
#include "service/migration_manager.hh"
#include "validation.hh"
namespace cql3 {
namespace statements {
alter_table_statement::alter_table_statement(shared_ptr<cf_name> name,
type t,
shared_ptr<column_identifier::raw> column_name,
shared_ptr<cql3_type::raw> validator,
shared_ptr<cf_prop_defs> properties,
renames_type renames,
bool is_static)
: schema_altering_statement(std::move(name))
, _type(t)
, _raw_column_name(std::move(column_name))
, _validator(std::move(validator))
, _properties(std::move(properties))
, _renames(std::move(renames))
, _is_static(is_static)
{
}
void alter_table_statement::check_access(const service::client_state& state)
{
warn(unimplemented::cause::PERMISSIONS);
#if 0
state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.ALTER);
#endif
}
void alter_table_statement::validate(distributed<service::storage_proxy>& proxy, const service::client_state& state)
{
// validated in announce_migration()
}
future<bool> alter_table_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
#if 0
CFMetaData meta = validateColumnFamily(keyspace(), columnFamily());
CFMetaData cfm = meta.copy();
CQL3Type validator = this.validator == null ? null : this.validator.prepare(keyspace());
ColumnIdentifier columnName = null;
ColumnDefinition def = null;
if (rawColumnName != null)
{
columnName = rawColumnName.prepare(cfm);
def = cfm.getColumnDefinition(columnName);
}
switch (oType)
{
case ADD:
assert columnName != null;
if (cfm.comparator.isDense())
throw new InvalidRequestException("Cannot add new column to a COMPACT STORAGE table");
if (isStatic)
{
if (!cfm.comparator.isCompound())
throw new InvalidRequestException("Static columns are not allowed in COMPACT STORAGE tables");
if (cfm.clusteringColumns().isEmpty())
throw new InvalidRequestException("Static columns are only useful (and thus allowed) if the table has at least one clustering column");
}
if (def != null)
{
switch (def.kind)
{
case PARTITION_KEY:
case CLUSTERING_COLUMN:
throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with a PRIMARY KEY part", columnName));
default:
throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with an existing column", columnName));
}
}
// Cannot re-add a dropped counter column. See #7831.
if (meta.isCounter() && meta.getDroppedColumns().containsKey(columnName))
throw new InvalidRequestException(String.format("Cannot re-add previously dropped counter column %s", columnName));
AbstractType<?> type = validator.getType();
if (type.isCollection() && type.isMultiCell())
{
if (!cfm.comparator.supportCollections())
throw new InvalidRequestException("Cannot use non-frozen collections with a non-composite PRIMARY KEY");
if (cfm.isSuper())
throw new InvalidRequestException("Cannot use non-frozen collections with super column families");
// If there used to be a collection column with the same name (that has been dropped), it will
// still appear in the ColumnToCollectionType for reasons explained in #6276. The same
// reasons mean that we can't allow adding a new collection with that name (see the ticket for details).
if (cfm.comparator.hasCollections())
{
CollectionType previous = cfm.comparator.collectionType() == null ? null : cfm.comparator.collectionType().defined.get(columnName.bytes);
if (previous != null && !type.isCompatibleWith(previous))
throw new InvalidRequestException(String.format("Cannot add a collection with the name %s " +
"because a collection with the same name and a different type has already been used in the past", columnName));
}
cfm.comparator = cfm.comparator.addOrUpdateCollection(columnName, (CollectionType)type);
}
Integer componentIndex = cfm.comparator.isCompound() ? cfm.comparator.clusteringPrefixSize() : null;
cfm.addColumnDefinition(isStatic
? ColumnDefinition.staticDef(cfm, columnName.bytes, type, componentIndex)
: ColumnDefinition.regularDef(cfm, columnName.bytes, type, componentIndex));
break;
case ALTER:
assert columnName != null;
if (def == null)
throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
AbstractType<?> validatorType = validator.getType();
switch (def.kind)
{
case PARTITION_KEY:
if (validatorType instanceof CounterColumnType)
throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", columnName));
if (cfm.getKeyValidator() instanceof CompositeType)
{
List<AbstractType<?>> oldTypes = ((CompositeType) cfm.getKeyValidator()).types;
if (!validatorType.isValueCompatibleWith(oldTypes.get(def.position())))
throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
columnName,
oldTypes.get(def.position()).asCQL3Type(),
validator));
List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(oldTypes);
newTypes.set(def.position(), validatorType);
cfm.keyValidator(CompositeType.getInstance(newTypes));
}
else
{
if (!validatorType.isValueCompatibleWith(cfm.getKeyValidator()))
throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
columnName,
cfm.getKeyValidator().asCQL3Type(),
validator));
cfm.keyValidator(validatorType);
}
break;
case CLUSTERING_COLUMN:
if (!cfm.isCQL3Table())
throw new InvalidRequestException(String.format("Cannot alter clustering column %s in a non-CQL3 table", columnName));
AbstractType<?> oldType = cfm.comparator.subtype(def.position());
// Note that CFMetaData.validateCompatibility already validate the change we're about to do. However, the error message it
// sends is a bit cryptic for a CQL3 user, so validating here for a sake of returning a better error message
// Do note that we need isCompatibleWith here, not just isValueCompatibleWith.
if (!validatorType.isCompatibleWith(oldType))
throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are not order-compatible.",
columnName,
oldType.asCQL3Type(),
validator));
cfm.comparator = cfm.comparator.setSubtype(def.position(), validatorType);
break;
case COMPACT_VALUE:
// See below
if (!validatorType.isValueCompatibleWith(cfm.getDefaultValidator()))
throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
columnName,
cfm.getDefaultValidator().asCQL3Type(),
validator));
cfm.defaultValidator(validatorType);
break;
case REGULAR:
case STATIC:
// Thrift allows to change a column validator so CFMetaData.validateCompatibility will let it slide
// if we change to an incompatible type (contrarily to the comparator case). But we don't want to
// allow it for CQL3 (see #5882) so validating it explicitly here. We only care about value compatibility
// though since we won't compare values (except when there is an index, but that is validated by
// ColumnDefinition already).
if (!validatorType.isValueCompatibleWith(def.type))
throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
columnName,
def.type.asCQL3Type(),
validator));
// For collections, if we alter the type, we need to update the comparator too since it includes
// the type too (note that isValueCompatibleWith above has validated that the new type doesn't
// change the underlying sorting order, but we still don't want to have a discrepancy between the type
// in the comparator and the one in the ColumnDefinition as that would be dodgy).
if (validatorType.isCollection() && validatorType.isMultiCell())
cfm.comparator = cfm.comparator.addOrUpdateCollection(def.name, (CollectionType)validatorType);
break;
}
// In any case, we update the column definition
cfm.addOrReplaceColumnDefinition(def.withNewType(validatorType));
break;
case DROP:
assert columnName != null;
if (!cfm.isCQL3Table())
throw new InvalidRequestException("Cannot drop columns from a non-CQL3 table");
if (def == null)
throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
switch (def.kind)
{
case PARTITION_KEY:
case CLUSTERING_COLUMN:
throw new InvalidRequestException(String.format("Cannot drop PRIMARY KEY part %s", columnName));
case REGULAR:
case STATIC:
ColumnDefinition toDelete = null;
for (ColumnDefinition columnDef : cfm.regularAndStaticColumns())
{
if (columnDef.name.equals(columnName))
toDelete = columnDef;
}
assert toDelete != null;
cfm.removeColumnDefinition(toDelete);
cfm.recordColumnDrop(toDelete);
break;
}
break;
case OPTS:
if (cfProps == null)
throw new InvalidRequestException(String.format("ALTER COLUMNFAMILY WITH invoked, but no parameters found"));
cfProps.validate();
if (meta.isCounter() && cfProps.getDefaultTimeToLive() > 0)
throw new InvalidRequestException("Cannot set default_time_to_live on a table with counters");
cfProps.applyToCFMetadata(cfm);
break;
case RENAME:
for (Map.Entry<ColumnIdentifier.Raw, ColumnIdentifier.Raw> entry : renames.entrySet())
{
ColumnIdentifier from = entry.getKey().prepare(cfm);
ColumnIdentifier to = entry.getValue().prepare(cfm);
cfm.renameColumn(from, to);
}
break;
}
MigrationManager.announceColumnFamilyUpdate(cfm, false, isLocalOnly);
return true;
#endif
}
shared_ptr<transport::event::schema_change> alter_table_statement::change_event()
{
return make_shared<transport::event::schema_change>(transport::event::schema_change::change_type::UPDATED,
transport::event::schema_change::target_type::TABLE, keyspace(), column_family());
}
}
}


@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "cql3/statements/schema_altering_statement.hh"
#include "cql3/statements/cf_prop_defs.hh"
#include "cql3/cql3_type.hh"
namespace cql3 {
namespace statements {
class alter_table_statement : public schema_altering_statement {
public:
enum class type {
add,
alter,
drop,
opts,
rename,
};
using renames_type = std::vector<std::pair<shared_ptr<column_identifier::raw>,
shared_ptr<column_identifier::raw>>>;
private:
const type _type;
const shared_ptr<column_identifier::raw> _raw_column_name;
const shared_ptr<cql3_type::raw> _validator;
const shared_ptr<cf_prop_defs> _properties;
const renames_type _renames;
const bool _is_static;
public:
alter_table_statement(shared_ptr<cf_name> name,
type t,
shared_ptr<column_identifier::raw> column_name,
shared_ptr<cql3_type::raw> validator,
shared_ptr<cf_prop_defs> properties,
renames_type renames,
bool is_static);
virtual void check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
};
}
}


@@ -139,6 +139,11 @@ std::map<sstring, sstring> cf_prop_defs::get_compression_options() const {
return std::map<sstring, sstring>{};
}
int32_t cf_prop_defs::get_default_time_to_live() const
{
return get_int(KW_DEFAULT_TIME_TO_LIVE, 0);
}
void cf_prop_defs::apply_to_builder(schema_builder& builder) {
if (has_property(KW_COMMENT)) {
builder.set_comment(get_string(KW_COMMENT, ""));


@@ -100,6 +100,8 @@ public:
return options;
}
#endif
int32_t get_default_time_to_live() const;
void apply_to_builder(schema_builder& builder);
void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const;
};


@@ -358,23 +358,32 @@ column_family::for_all_partitions_slow(std::function<bool (const dht::decorated_
class lister {
public:
using dir_entry_types = std::unordered_set<directory_entry_type, enum_hash<directory_entry_type>>;
using walker_type = std::function<future<> (directory_entry)>;
using filter_type = std::function<bool (const sstring&)>;
private:
file _f;
std::function<future<> (directory_entry de)> _walker;
walker_type _walker;
filter_type _filter;
dir_entry_types _expected_type;
subscription<directory_entry> _listing;
sstring _dirname;
public:
lister(file f, dir_entry_types type, std::function<future<> (directory_entry)> walker, sstring dirname)
lister(file f, dir_entry_types type, walker_type walker, sstring dirname)
: _f(std::move(f))
, _walker(std::move(walker))
, _filter([] (const sstring& fname) { return true; })
, _expected_type(type)
, _listing(_f.list_directory([this] (directory_entry de) { return _visit(de); }))
, _dirname(dirname) {
}
static future<> scan_dir(sstring name, dir_entry_types type, std::function<future<> (directory_entry)> walker);
lister(file f, dir_entry_types type, walker_type walker, filter_type filter, sstring dirname)
: lister(std::move(f), type, std::move(walker), dirname) {
_filter = std::move(filter);
}
static future<> scan_dir(sstring name, dir_entry_types type, walker_type walker, filter_type filter = [] (const sstring& fname) { return true; });
protected:
future<> _visit(directory_entry de) {
@@ -383,6 +392,12 @@ protected:
if ((!_expected_type.count(*(de.type))) || (de.name[0] == '.')) {
return make_ready_future<>();
}
// apply a filter
if (!_filter(_dirname + "/" + de.name)) {
return make_ready_future<>();
}
return _walker(de);
});
@@ -403,9 +418,9 @@ private:
};
future<> lister::scan_dir(sstring name, lister::dir_entry_types type, std::function<future<> (directory_entry)> walker) {
return engine().open_directory(name).then([type, walker = std::move(walker), name] (file f) {
auto l = make_lw_shared<lister>(std::move(f), type, walker, name);
future<> lister::scan_dir(sstring name, lister::dir_entry_types type, walker_type walker, filter_type filter) {
return engine().open_directory(name).then([type, walker = std::move(walker), filter = std::move(filter), name] (file f) {
auto l = make_lw_shared<lister>(std::move(f), type, walker, filter, name);
return l->done().then([l] { });
});
}
@@ -453,6 +468,9 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
return std::move(fut).then([this, sstdir = std::move(sstdir), comps] (range<partition_key> r) {
// Checks whether or not sstable belongs to current shard.
if (!belongs_to_current_shard(*_schema, std::move(r))) {
dblog.debug("sstable {} not relevant for this shard, ignoring",
sstables::sstable::filename(sstdir, _schema->ks_name(), _schema->cf_name(), comps.version, comps.generation, comps.format,
sstables::sstable::component_type::Data));
sstable::mark_sstable_for_deletion(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
return make_ready_future<>();
}
@@ -672,7 +690,7 @@ column_family::reshuffle_sstables(int64_t start) {
// Those SSTables are not known by anyone in the system. So we don't have any kind of
// object describing them. There isn't too much of a choice.
return work.sstables[comps.generation]->read_toc();
}).then([&work] {
}, &manifest_json_filter).then([&work] {
// Note: cannot be parallel because we will be shuffling things around at this stage. Can't race.
return do_for_each(work.sstables, [&work] (auto& pair) {
auto&& comps = std::move(work.descriptors.at(pair.first));
@@ -838,6 +856,17 @@ lw_shared_ptr<sstable_list> column_family::get_sstables() {
return _sstables;
}
inline bool column_family::manifest_json_filter(const sstring& fname) {
using namespace boost::filesystem;
path entry_path(fname);
if (!is_directory(status(entry_path)) && entry_path.filename() == path("manifest.json")) {
return false;
}
return true;
}
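The filter above skips `manifest.json` files during directory scans. A standalone sketch of the same check using `std::filesystem` instead of `boost::filesystem` (simplified: it only inspects the path string, so unlike the real filter it does not call `status()` to exclude directories that happen to be named `manifest.json`):

```cpp
#include <cassert>
#include <filesystem>
#include <string>

// Return false (filter out) for an entry named "manifest.json",
// true (enumerate) for everything else.
bool manifest_json_filter_sketch(const std::string& fname) {
    std::filesystem::path entry_path(fname);
    return entry_path.filename() != "manifest.json";
}
```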
future<> column_family::populate(sstring sstdir) {
// We can catch most errors when we try to load an sstable. But if the TOC
// file is the one missing, we won't try to load the sstable at all. This
@@ -899,7 +928,7 @@ future<> column_family::populate(sstring sstdir) {
futures.push_back(std::move(f));
return make_ready_future<>();
}).then([&futures] {
}, &manifest_json_filter).then([&futures] {
return when_all(futures.begin(), futures.end()).then([] (std::vector<future<>> ret) {
try {
for (auto& f : ret) {
@@ -919,7 +948,7 @@ future<> column_family::populate(sstring sstdir) {
sstables::sstable::format_types format = descriptor->format.value();
if (engine().cpu_id() != 0) {
dblog.info("At directory: {}, partial SSTable with generation {} not relevant for this shard, ignoring", sstdir, v.first);
dblog.debug("At directory: {}, partial SSTable with generation {} not relevant for this shard, ignoring", sstdir, v.first);
return make_ready_future<>();
}
// shard 0 is the responsible for removing a partial sstable.
@@ -1798,6 +1827,36 @@ const sstring& database::get_snitch_name() const {
return _cfg->endpoint_snitch();
}
// For the filesystem operations, this code will assume that all keyspaces are visible in all shards
// (as we have been doing for a lot of the other operations, like the snapshot itself).
future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names) {
std::vector<std::reference_wrapper<keyspace>> keyspaces;
if (keyspace_names.empty()) {
// if keyspace names are not given - apply to all existing local keyspaces
for (auto& ks: _keyspaces) {
keyspaces.push_back(std::reference_wrapper<keyspace>(ks.second));
}
} else {
for (auto& ksname: keyspace_names) {
try {
keyspaces.push_back(std::reference_wrapper<keyspace>(find_keyspace(ksname)));
} catch (no_such_keyspace& e) {
return make_exception_future(std::current_exception());
}
}
}
return parallel_for_each(keyspaces, [this, tag] (auto& ks) {
return parallel_for_each(ks.get().metadata()->cf_meta_data(), [this, tag] (auto& pair) {
auto& cf = this->find_column_family(pair.second);
return cf.clear_snapshot(tag);
}).then_wrapped([] (future<> f) {
dblog.debug("Cleared out snapshot directories");
});
});
}
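The keyspace-selection rule in `clear_snapshot` can be sketched in isolation: an empty name list means "all local keyspaces"; otherwise every name must resolve to an existing keyspace or the operation fails. The types below are simplified stand-ins (a `std::map` for the database's keyspace map), not Scylla's real ones:

```cpp
#include <cassert>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Select keyspaces to clear: empty 'names' selects everything;
// an unknown name raises an error before any work is done.
std::vector<std::string> select_keyspaces(const std::map<std::string, int>& keyspaces,
                                          const std::vector<std::string>& names) {
    std::vector<std::string> selected;
    if (names.empty()) {
        for (auto& ks : keyspaces) {
            selected.push_back(ks.first);
        }
    } else {
        for (auto& name : names) {
            if (!keyspaces.count(name)) {
                throw std::runtime_error("no such keyspace: " + name);
            }
            selected.push_back(name);
        }
    }
    return selected;
}
```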
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy)
{
return db::schema_tables::calculate_schema_digest(proxy).then([&proxy] (utils::UUID uuid) {
@@ -1862,7 +1921,7 @@ seal_snapshot(sstring jsondir) {
dblog.debug("Storing manifest {}", jsonfile);
return recursive_touch_directory(jsondir).then([jsonfile, json = std::move(json)] {
return engine().open_file_dma(jsonfile, open_flags::wo | open_flags::create | open_flags::truncate).then([json](file f) {
return open_file_dma(jsonfile, open_flags::wo | open_flags::create | open_flags::truncate).then([json](file f) {
return do_with(make_file_output_stream(std::move(f)), [json] (output_stream<char>& out) {
return out.write(json.c_str(), json.size()).then([&out] {
return out.flush();


@@ -351,6 +351,9 @@ private:
// one are also complete
future<> seal_active_memtable();
// filter manifest.json files out
static bool manifest_json_filter(const sstring& fname);
seastar::gate _in_flight_seals;
// Iterate over all partitions. Protocol is the same as std::all_of(),
@@ -621,6 +624,7 @@ public:
future<> apply(const frozen_mutation&);
keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
const sstring& get_snitch_name() const;
future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
friend std::ostream& operator<<(std::ostream& out, const database& db);
const std::unordered_map<sstring, keyspace>& get_keyspaces() const {


@@ -887,7 +887,7 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
descriptor d(next_id());
return engine().open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f] () mutable {
auto s = make_lw_shared<segment>(this, d, std::move(f), active);
@@ -1215,7 +1215,7 @@ const db::commitlog::config& db::commitlog::active_config() const {
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off) {
return engine().open_file_dma(filename, open_flags::ro).then([next = std::move(next), off](file f) {
return open_file_dma(filename, open_flags::ro).then([next = std::move(next), off](file f) {
return std::make_unique<subscription<temporary_buffer<char>, replay_position>>(
read_log_file(std::move(f), std::move(next), off));
});
@@ -1350,6 +1350,17 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
}
future<> read_entry() {
static constexpr size_t entry_header_size = segment::entry_overhead_size - sizeof(uint32_t);
/**
* #598 - Must check that data left in chunk is enough to even read an entry.
* If not, this is small slack space in the chunk end, and we should just go
* to the next.
*/
assert(pos <= next);
if ((pos + entry_header_size) >= next) {
return skip(next - pos);
}
return fin.read_exactly(entry_header_size).then([this](temporary_buffer<char> buf) {
replay_position rp(id, position_type(pos));
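The #598 fix boils down to a bounds check on the remaining chunk space: if fewer than `entry_header_size` bytes remain before `next` (the chunk boundary), the remainder is end-of-chunk slack and must be skipped rather than parsed as an entry. A standalone sketch of that arithmetic (the header size here is illustrative, not Scylla's actual value):

```cpp
#include <cassert>
#include <cstddef>

// Illustrative header size; the real value is
// segment::entry_overhead_size - sizeof(uint32_t).
constexpr std::size_t entry_header_size = 8;

// Given the read position 'pos' and the chunk boundary 'next', return how
// many slack bytes to skip; 0 means it is safe to read an entry header.
std::size_t slack_to_skip(std::size_t pos, std::size_t next) {
    assert(pos <= next);
    if (pos + entry_header_size >= next) {
        return next - pos;  // too little room left: skip to the next chunk
    }
    return 0;
}
```

Note the `>=` rather than `>`: a chunk with exactly `entry_header_size` bytes left still cannot hold a header plus any payload, so it is treated as slack too.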


@@ -410,7 +410,7 @@ future<> db::config::read_from_file(file f) {
}
future<> db::config::read_from_file(const sstring& filename) {
return engine().open_file_dma(filename, open_flags::ro).then([this](file f) {
return open_file_dma(filename, open_flags::ro).then([this](file f) {
return read_from_file(std::move(f));
});
}


@@ -679,25 +679,15 @@ public:
"truststore : (Default: <system truststore> ) Location of the truststore containing the trusted certificate for authenticating remote servers.\n" \
"Related information: Node-to-node encryption" \
) \
val(client_encryption_options, string_map, /*none*/, Unused, \
"Enable or disable client-to-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. No custom encryption options are currently enabled. The available options are:\n" \
val(client_encryption_options, string_map, /*none*/, Used, \
"Enable or disable client-to-node encryption. You must also generate keys and provide the appropriate key and certificate. No custom encryption options are currently enabled. The available options are:\n" \
"\n" \
"\tenabled : (Default: false ) To enable, set to true.\n" \
"\tkeystore : (Default: conf/.keystore ) The location of a Java keystore (JKS) suitable for use with Java Secure Socket Extension (JSSE), which is the Java version of the Secure Sockets Layer (SSL), and Transport Layer Security (TLS) protocols. The keystore contains the private key used to encrypt outgoing messages.\n" \
"\tkeystore_password : (Default: cassandra ) Password for the keystore. This must match the password used when generating the keystore and truststore.\n" \
"\trequire_client_auth : (Default: false ) Enables or disables certificate authentication. (Available starting with Cassandra 1.2.3.)\n" \
"\ttruststore : (Default: conf/.truststore ) Set if require_client_auth is true.\n" \
"\ttruststore_password : <truststore_password> Set if require_client_auth is true.\n" \
"\n" \
"The advanced settings are:\n" \
"\n" \
"\tprotocol : (Default: TLS )\n" \
"\talgorithm : (Default: SunX509 )\n" \
"\tstore_type : (Default: JKS )\n" \
"\tcipher_suites : (Default: TLS_RSA_WITH_AES_128_CBC_SHA , TLS_RSA_WITH_AES_256_CBC_SHA )\n" \
"\tcertificate: (Default: conf/scylla.crt) The location of a PEM-encoded x509 certificate used to identify and encrypt the client/server communication.\n" \
"\tkeyfile: (Default: conf/scylla.key) PEM Key file associated with certificate.\n" \
"Related information: Client-to-node encryption" \
) \
val(ssl_storage_port, uint32_t, 7001, Unused, \
val(ssl_storage_port, uint32_t, 7001, Used, \
"The SSL port for encrypted communication. Unused unless enabled in encryption_options." \
) \
val(default_log_level, sstring, "warn", Used, \
@@ -724,7 +714,8 @@ public:
val(replace_address, sstring, "", Used, "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.") \
val(replace_address_first_boot, sstring, "", Used, "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.") \
val(override_decommission, bool, false, Used, "Set true to force a decommissioned node to join the cluster") \
val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
val(ring_delay_ms, uint32_t, 30 * 1000, Used, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.") \
val(developer_mode, bool, false, Used, "Relax environment checks. Setting to true can reduce performance and reliability significantly.") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \


@@ -486,6 +486,10 @@ future<> init_local_cache() {
});
}
future<> deinit_local_cache() {
return _local_cache.stop();
}
void minimal_setup(distributed<database>& db, distributed<cql3::query_processor>& qp) {
qctx = std::make_unique<query_context>(db, qp);
}


@@ -88,6 +88,7 @@ extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
void minimal_setup(distributed<database>& db, distributed<cql3::query_processor>& qp);
future<> init_local_cache();
future<> deinit_local_cache();
future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp);
future<> update_schema_version(utils::UUID version);
future<> update_tokens(std::unordered_set<dht::token> tokens);


@@ -241,12 +241,11 @@ future<streaming::stream_state> range_streamer::fetch_async() {
for (auto& x : fetch.second) {
auto& source = x.first;
auto& ranges = x.second;
auto preferred = net::get_local_messaging_service().get_preferred_ip(source);
/* Send messages to respective folks to stream data over to me */
if (logger.is_enabled(logging::log_level::debug)) {
logger.debug("{}ing from {} ranges {}", _description, source, ranges);
}
_stream_plan.request_ranges(source, preferred, keyspace, ranges);
_stream_plan.request_ranges(source, keyspace, ranges);
}
}


@@ -1,5 +1,5 @@
scylla - core unlimited
scylla - memlock unlimited
scylla - nofile 100000
scylla - nofile 200000
scylla - as unlimited
scylla - nproc 8096


@@ -20,14 +20,14 @@ while getopts a OPT; do
esac
done
. /etc/os-release
. /etc/sysconfig/scylla-server
if [ $AMI -eq 1 ]; then
. /etc/sysconfig/scylla-server
sed -e "s#append #append clocksource=tsc tsc=reliable hugepagesz=2M hugepages=$NR_HUGEPAGES #" /boot/extlinux/extlinux.conf > /tmp/extlinux.conf
mv /tmp/extlinux.conf /boot/extlinux/extlinux.conf
else
. /etc/sysconfig/scylla-server
if [ ! -f /etc/default/grub ]; then
echo "Unsupported bootloader"
exit 1


@@ -31,7 +31,7 @@ if [ "$AMI" = "yes" ]; then
if [ "$DISKS" != "" ]; then
/usr/lib/scylla/scylla_raid_setup -d $DISKS
else
echo "WARN: Scylla is not using XFS to store data. Performance will suffer." > /home/fedora/WARN_PLEASE_READ.TXT
echo "ERROR: Scylla is not using XFS to store data. The scylla service will refuse to start." > /home/fedora/SCYLLA_SETUP_ERROR.LOG
fi
/usr/lib/scylla/scylla-ami/ds2_configure.py


@@ -6,4 +6,5 @@ FILE=$1
TIME=`date --date @$2 +%F-%T`
PID=$3
mkdir -p /var/lib/scylla/coredump
/usr/bin/gzip -c > /var/lib/scylla/coredump/core.$FILE-$TIME-$PID.gz


@@ -1,11 +1,13 @@
FROM fedora:22
FROM centos:7
MAINTAINER Avi Kivity <avi@cloudius-systems.com>
RUN yum -y install epel-release
ADD scylla.repo /etc/yum.repos.d/
RUN dnf -y update
RUN dnf -y install scylla-server hostname
RUN dnf clean all
RUN yum -y update
RUN yum -y remove boost-thread boost-system
RUN yum -y install scylla-server hostname
RUN yum clean all
ADD start-scylla /start-scylla
RUN chown scylla /start-scylla


@@ -1,11 +1,23 @@
[scylla]
name=Scylla for Fedora $releasever - $basearch
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/fedora/$releasever/$basearch/
name=Scylla for Centos $releasever - $basearch
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/centos/$releasever/$basearch/
enabled=1
gpgcheck=0
[scylla-generic]
name=Scylla for Fedora $releasever
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/fedora/$releasever/noarch/
name=Scylla for centos $releasever
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/centos/$releasever/noarch/
enabled=1
gpgcheck=0
[scylla-3rdparty]
name=Scylla 3rdParty for Centos $releasever - $basearch
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/3rdparty/centos/$releasever/$basearch/
enabled=1
gpgcheck=0
[scylla-3rdparty-generic]
name=Scylla 3rdParty for Centos $releasever
baseurl=https://s3.amazonaws.com/downloads.scylladb.com/rpm/3rdparty/centos/$releasever/noarch/
enabled=1
gpgcheck=0


@@ -4,6 +4,7 @@ IP=$(hostname -i)
sed -e "s/seeds:.*/seeds: $IP/g" /var/lib/scylla/conf/scylla.yaml > $HOME/scylla.yaml
/usr/bin/scylla --log-to-syslog 1 \
--log-to-stdout 0 \
--developer-mode true \
--default-log-level info \
--options-file $HOME/scylla.yaml \
--listen-address $IP \


@@ -51,6 +51,7 @@ install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
install -m644 dist/redhat/systemd/scylla-server.service $RPM_BUILD_ROOT%{_unitdir}/
install -m755 dist/common/scripts/* $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 dist/redhat/scripts/* $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 seastar/scripts/posix_net_conf.sh $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 seastar/dpdk/tools/dpdk_nic_bind.py $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
install -m755 build/release/scylla $RPM_BUILD_ROOT%{_bindir}


@@ -5,13 +5,14 @@ After=network.target libvirtd.service
[Service]
Type=simple
LimitMEMLOCK=infinity
LimitNOFILE=100000
LimitNOFILE=200000
LimitAS=infinity
LimitNPROC=8096
EnvironmentFile=/etc/sysconfig/scylla-server
ExecStartPre=/usr/lib/scylla/scylla_prepare
ExecStart=/usr/lib/scylla/scylla_run
ExecStopPost=/usr/lib/scylla/scylla_stop
TimeoutStartSec=900
KillMode=process
Restart=no


@@ -38,6 +38,7 @@ override_dh_auto_install:
mkdir -p $(SCRIPTS) && \
cp $(CURDIR)/seastar/dpdk/tools/dpdk_nic_bind.py $(SCRIPTS)
cp $(CURDIR)/dist/common/scripts/* $(SCRIPTS)
cp $(CURDIR)/dist/ubuntu/scripts/* $(SCRIPTS)
mkdir -p $(SWAGGER) && \
cp -r $(CURDIR)/swagger-ui/dist $(SWAGGER)

dist/ubuntu/scripts/scylla_run (new executable file)

@@ -0,0 +1,19 @@
#!/bin/bash -e
args="--log-to-syslog 1 --log-to-stdout 0 --default-log-level info $SCYLLA_ARGS"
if [ "$NETWORK_MODE" = "posix" ]; then
args="$args --network-stack posix"
elif [ "$NETWORK_MODE" = "virtio" ]; then
args="$args --network-stack native"
elif [ "$NETWORK_MODE" = "dpdk" ]; then
args="$args --network-stack native --dpdk-pmd"
fi
export HOME=/var/lib/scylla
ulimit -c unlimited
ulimit -l unlimited
ulimit -n 200000
ulimit -m unlimited
ulimit -u 8096
exec sudo -E -u $USER /usr/bin/scylla $args


@@ -41,7 +41,7 @@
namespace locator {
future<bool> gossiping_property_file_snitch::property_file_was_modified() {
return engine().open_file_dma(_prop_file_name, open_flags::ro)
return open_file_dma(_prop_file_name, open_flags::ro)
.then([this](file f) {
return do_with(std::move(f), [] (file& f) {
return f.stat();


@@ -2,7 +2,7 @@
namespace locator {
future<> production_snitch_base::load_property_file() {
return engine().open_file_dma(_prop_file_name, open_flags::ro)
return open_file_dma(_prop_file_name, open_flags::ro)
.then([this] (file f) {
return do_with(std::move(f), [this] (file& f) {
return f.size().then([this, &f] (size_t s) {


@@ -149,9 +149,10 @@ public:
virtual void set_local_private_addr(const sstring& addr_str) {};
static distributed<snitch_ptr>& snitch_instance() {
static distributed<snitch_ptr> snitch_inst;
// FIXME: leaked intentionally to avoid shutdown problems, see #293
static distributed<snitch_ptr>* snitch_inst = new distributed<snitch_ptr>();
return snitch_inst;
return *snitch_inst;
}
static snitch_ptr& get_local_snitch_ptr() {

main.cc

@@ -45,6 +45,8 @@
#include "release.hh"
#include <cstdio>
#include <core/file.hh>
#include <sys/time.h>
#include <sys/resource.h>
logging::logger startlog("init");
@@ -100,12 +102,19 @@ static logging::log_level to_loglevel(sstring level) {
}
}
static future<> disk_sanity(sstring path) {
return check_direct_io_support(path).then([path] {
return file_system_at(path).then([path] (auto fs) {
static future<> disk_sanity(sstring path, bool developer_mode) {
return check_direct_io_support(path).then([path, developer_mode] {
return file_system_at(path).then([path, developer_mode] (auto fs) {
if (fs != fs_type::xfs) {
startlog.warn("{} is not on XFS. This is a non-supported setup, and performance is expected to be very bad.\n"
"For better performance, placing your data on XFS-formatted directories is strongly recommended", path);
if (!developer_mode) {
startlog.error("{} is not on XFS. This is a non-supported setup, and performance is expected to be very bad.\n"
"For better performance, placing your data on XFS-formatted directories is required."
" To override this error, see the developer_mode configuration option.", path);
throw std::runtime_error(sprint("invalid configuration: path \"%s\" on unsupported filesystem", path));
} else {
startlog.warn("{} is not on XFS. This is a non-supported setup, and performance is expected to be very bad.\n"
"For better performance, placing your data on XFS-formatted directories is strongly recommended", path);
}
}
});
});
@@ -171,6 +180,28 @@ private:
class bad_configuration_error : public std::exception {};
static
void
verify_rlimit(bool developer_mode) {
struct rlimit lim;
int r = getrlimit(RLIMIT_NOFILE, &lim);
if (r == -1) {
throw std::system_error(errno, std::system_category());
}
auto recommended = 200'000U;
auto min = 10'000U;
if (lim.rlim_cur < min) {
if (developer_mode) {
startlog.warn("NOFILE rlimit too low (recommended setting {}, minimum setting {};"
" you may run out of file descriptors.", recommended, min);
} else {
startlog.error("NOFILE rlimit too low (recommended setting {}, minimum setting {};"
" refusing to start.", recommended, min);
throw std::runtime_error("NOFILE rlimit too low");
}
}
}
int main(int ac, char** av) {
runtime::init_uptime();
std::setvbuf(stdout, nullptr, _IOLBF, 1000);
@@ -210,6 +241,7 @@ int main(int ac, char** av) {
return read_config(opts, *cfg).then([cfg, &db, &qp, &proxy, &mm, &ctx, &opts, &dirs]() {
apply_logger_settings(cfg->default_log_level(), cfg->logger_log_level(),
cfg->log_to_stdout(), cfg->log_to_syslog());
verify_rlimit(cfg->developer_mode());
dht::set_global_partitioner(cfg->partitioner());
auto start_thrift = cfg->start_rpc();
uint16_t api_port = cfg->api_port();
@@ -267,10 +299,10 @@ int main(int ac, char** av) {
return dns::gethostbyname(api_address);
}).then([&db, api_address, api_port, &ctx] (dns::hostent e){
auto ip = e.addresses[0].in.s_addr;
ctx.http_server.start().then([api_address, api_port, ip, &ctx] {
return ctx.http_server.start().then([api_address, api_port, ip, &ctx] {
return set_server(ctx);
}).then([api_address, api_port, ip, &ctx] {
ctx.http_server.listen(ipv4_addr{ip, api_port});
return ctx.http_server.listen(ipv4_addr{ip, api_port});
}).then([api_address, api_port] {
print("Seastar HTTP server listening on %s:%s ...\n", api_address, api_port);
});
@@ -318,6 +350,16 @@ int main(int ac, char** av) {
, phi);
}).then([&db] {
return streaming::stream_session::init_streaming_service(db);
}).then([&db] {
// Start handling REPAIR_CHECKSUM_RANGE messages
return net::get_messaging_service().invoke_on_all([&db] (auto& ms) {
ms.register_repair_checksum_range([&db] (sstring keyspace, sstring cf, query::range<dht::token> range) {
return do_with(std::move(keyspace), std::move(cf), std::move(range),
[&db] (auto& keyspace, auto& cf, auto& range) {
return checksum_range(db, keyspace, cf, range);
});
});
});
}).then([&proxy, &db] {
return proxy.start(std::ref(db)).then([&proxy] {
// #293 - do not stop anything
@@ -347,9 +389,9 @@ int main(int ac, char** av) {
directories.insert(db.local().get_config().data_file_directories().cbegin(),
db.local().get_config().data_file_directories().cend());
directories.insert(db.local().get_config().commitlog_directory());
return do_with(std::move(directories), [] (auto& directories) {
return parallel_for_each(directories, [] (sstring pathname) {
return disk_sanity(pathname);
return do_with(std::move(directories), [&db] (auto& directories) {
return parallel_for_each(directories, [&db] (sstring pathname) {
return disk_sanity(pathname, db.local().get_config().developer_mode());
});
});
}).then([&db] {


@@ -46,16 +46,17 @@ struct map_difference {
{ }
};
template<typename Key,
template<template<typename...> class Map,
typename Key,
typename Tp,
typename Compare = std::less<Key>,
typename Eq = std::equal_to<Tp>,
typename Alloc>
inline
map_difference<Key>
difference(const std::map<Key, Tp, Compare, Alloc>& left,
const std::map<Key, Tp, Compare, Alloc>& right,
Compare key_comp,
difference(const Map<Key, Tp, Compare, Alloc>& left,
const Map<Key, Tp, Compare, Alloc>& right,
Compare key_comp = Compare(),
Eq equals = Eq(),
Alloc alloc = Alloc())
{


@@ -187,8 +187,9 @@ messaging_service::messaging_service(gms::inet_address ip
);
}())
{
register_handler(this, messaging_verb::CLIENT_ID, [] (rpc::client_info& ci, gms::inet_address broadcast_address) {
register_handler(this, messaging_verb::CLIENT_ID, [] (rpc::client_info& ci, gms::inet_address broadcast_address, uint32_t src_cpu_id) {
ci.attach_auxiliary("baddr", broadcast_address);
ci.attach_auxiliary("src_cpu_id", src_cpu_id);
return rpc::no_wait;
});
}
@@ -204,16 +205,14 @@ gms::inet_address messaging_service::listen_address() {
}
future<> messaging_service::stop() {
return _in_flight_requests.close().then([this] {
return when_all(
_server->stop(),
parallel_for_each(_clients, [] (auto& m) {
return parallel_for_each(m, [] (std::pair<const shard_id, shard_info>& c) {
return c.second.rpc_client->stop();
});
})
).discard_result();
});
return when_all(
_server->stop(),
parallel_for_each(_clients, [] (auto& m) {
return parallel_for_each(m, [] (std::pair<const shard_id, shard_info>& c) {
return c.second.rpc_client->stop();
});
})
).discard_result();
}
rpc::no_wait_type messaging_service::no_wait() {
@@ -223,7 +222,6 @@ rpc::no_wait_type messaging_service::no_wait() {
static unsigned get_rpc_client_idx(messaging_verb verb) {
unsigned idx = 0;
if (verb == messaging_verb::GOSSIP_DIGEST_SYN ||
verb == messaging_verb::GOSSIP_DIGEST_ACK ||
verb == messaging_verb::GOSSIP_DIGEST_ACK2 ||
verb == messaging_verb::GOSSIP_SHUTDOWN ||
verb == messaging_verb::ECHO) {
@@ -319,7 +317,8 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
remote_addr, local_addr);
it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
_rpc->make_client<rpc::no_wait_type(gms::inet_address)>(messaging_verb::CLIENT_ID)(*it->second.rpc_client, utils::fb_utilities::get_broadcast_address());
uint32_t src_cpu_id = engine().cpu_id();
_rpc->make_client<rpc::no_wait_type(gms::inet_address, uint32_t)>(messaging_verb::CLIENT_ID)(*it->second.rpc_client, utils::fb_utilities::get_broadcast_address(), src_cpu_id);
return it->second.rpc_client;
}
@@ -357,54 +356,50 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
// Send a message for verb
template <typename MsgIn, typename... MsgOut>
auto send_message(messaging_service* ms, messaging_verb verb, shard_id id, MsgOut&&... msg) {
return seastar::with_gate(ms->requests_gate(), [&] {
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
if (f.failed()) {
ms->increment_dropped_messages(verb);
f.get();
assert(false); // never reached
}
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
throw;
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
if (f.failed()) {
ms->increment_dropped_messages(verb);
f.get();
assert(false); // never reached
}
});
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
throw;
}
});
}
// TODO: Remove duplicated code in send_message
template <typename MsgIn, typename Timeout, typename... MsgOut>
auto send_message_timeout(messaging_service* ms, messaging_verb verb, shard_id id, Timeout timeout, MsgOut&&... msg) {
return seastar::with_gate(ms->requests_gate(), [&] {
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
if (f.failed()) {
ms->increment_dropped_messages(verb);
f.get();
assert(false); // never reached
}
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
throw;
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
try {
if (f.failed()) {
ms->increment_dropped_messages(verb);
f.get();
assert(false); // never reached
}
});
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
throw;
}
});
}
@@ -460,35 +455,36 @@ static constexpr std::chrono::seconds streaming_timeout{30};
static constexpr std::chrono::seconds streaming_wait_before_retry{30};
// STREAM_INIT_MESSAGE
void messaging_service::register_stream_init_message(std::function<future<unsigned> (streaming::messages::stream_init_message msg, unsigned src_cpu_id)>&& func) {
void messaging_service::register_stream_init_message(std::function<future<unsigned> (const rpc::client_info& cinfo,
streaming::messages::stream_init_message msg)>&& func) {
register_handler(this, messaging_verb::STREAM_INIT_MESSAGE, std::move(func));
}
future<unsigned> messaging_service::send_stream_init_message(shard_id id, streaming::messages::stream_init_message msg, unsigned src_cpu_id) {
future<unsigned> messaging_service::send_stream_init_message(shard_id id, streaming::messages::stream_init_message msg) {
return send_message_timeout_and_retry<unsigned>(this, messaging_verb::STREAM_INIT_MESSAGE, id,
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
std::move(msg), src_cpu_id);
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry, std::move(msg));
}
// PREPARE_MESSAGE
void messaging_service::register_prepare_message(std::function<future<streaming::messages::prepare_message> (streaming::messages::prepare_message msg, UUID plan_id,
inet_address from, inet_address connecting, unsigned src_cpu_id, unsigned dst_cpu_id)>&& func) {
void messaging_service::register_prepare_message(std::function<future<streaming::messages::prepare_message> (const rpc::client_info& cinfo,
streaming::messages::prepare_message msg, UUID plan_id,
unsigned dst_cpu_id)>&& func) {
register_handler(this, messaging_verb::PREPARE_MESSAGE, std::move(func));
}
future<streaming::messages::prepare_message> messaging_service::send_prepare_message(shard_id id, streaming::messages::prepare_message msg, UUID plan_id,
inet_address from, inet_address connecting, unsigned src_cpu_id, unsigned dst_cpu_id) {
unsigned dst_cpu_id) {
return send_message_timeout_and_retry<streaming::messages::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
std::move(msg), plan_id, from, connecting, src_cpu_id, dst_cpu_id);
std::move(msg), plan_id, dst_cpu_id);
}
// PREPARE_DONE_MESSAGE
void messaging_service::register_prepare_done_message(std::function<future<> (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func) {
void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
register_handler(this, messaging_verb::PREPARE_DONE_MESSAGE, std::move(func));
}
future<> messaging_service::send_prepare_done_message(shard_id id, UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
future<> messaging_service::send_prepare_done_message(shard_id id, UUID plan_id, unsigned dst_cpu_id) {
return send_message_timeout_and_retry<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
plan_id, from, connecting, dst_cpu_id);
plan_id, dst_cpu_id);
}
// STREAM_MUTATION
@@ -502,23 +498,24 @@ future<> messaging_service::send_stream_mutation(shard_id id, UUID plan_id, froz
}
// STREAM_MUTATION_DONE
void messaging_service::register_stream_mutation_done(std::function<future<> (UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func) {
void messaging_service::register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo,
UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, unsigned dst_cpu_id)>&& func) {
register_handler(this, messaging_verb::STREAM_MUTATION_DONE, std::move(func));
}
future<> messaging_service::send_stream_mutation_done(shard_id id, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
future<> messaging_service::send_stream_mutation_done(shard_id id, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, unsigned dst_cpu_id) {
return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
plan_id, std::move(ranges), cf_id, from, connecting, dst_cpu_id);
plan_id, std::move(ranges), cf_id, dst_cpu_id);
}
// COMPLETE_MESSAGE
void messaging_service::register_complete_message(std::function<future<> (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func) {
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
register_handler(this, messaging_verb::COMPLETE_MESSAGE, std::move(func));
}
future<> messaging_service::send_complete_message(shard_id id, UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
future<> messaging_service::send_complete_message(shard_id id, UUID plan_id, unsigned dst_cpu_id) {
return send_message_timeout_and_retry<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
plan_id, from, connecting, dst_cpu_id);
plan_id, dst_cpu_id);
}
void messaging_service::register_echo(std::function<future<> ()>&& func) {
@@ -610,7 +607,7 @@ void messaging_service::register_read_data(std::function<future<foreign_ptr<lw_s
void messaging_service::unregister_read_data() {
_rpc->unregister_handler(net::messaging_verb::READ_DATA);
}
future<query::result> messaging_service::send_read_data(shard_id id, query::read_command& cmd, query::partition_range& pr) {
future<query::result> messaging_service::send_read_data(shard_id id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<query::result>(this, messaging_verb::READ_DATA, std::move(id), cmd, pr);
}
@@ -620,7 +617,7 @@ void messaging_service::register_read_mutation_data(std::function<future<foreign
void messaging_service::unregister_read_mutation_data() {
_rpc->unregister_handler(net::messaging_verb::READ_MUTATION_DATA);
}
future<reconcilable_result> messaging_service::send_read_mutation_data(shard_id id, query::read_command& cmd, query::partition_range& pr) {
future<reconcilable_result> messaging_service::send_read_mutation_data(shard_id id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<reconcilable_result>(this, messaging_verb::READ_MUTATION_DATA, std::move(id), cmd, pr);
}
@@ -630,7 +627,7 @@ void messaging_service::register_read_digest(std::function<future<query::result_
void messaging_service::unregister_read_digest() {
_rpc->unregister_handler(net::messaging_verb::READ_DIGEST);
}
future<query::result_digest> messaging_service::send_read_digest(shard_id id, query::read_command& cmd, query::partition_range& pr) {
future<query::result_digest> messaging_service::send_read_digest(shard_id id, const query::read_command& cmd, const query::partition_range& pr) {
return send_message<query::result_digest>(this, net::messaging_verb::READ_DIGEST, std::move(id), cmd, pr);
}
@@ -659,4 +656,21 @@ future<> messaging_service::send_replication_finished(shard_id id, inet_address
return send_message_timeout<void>(this, messaging_verb::REPLICATION_FINISHED, std::move(id), 10000ms, std::move(from));
}
// Wrapper for REPAIR_CHECKSUM_RANGE
void messaging_service::register_repair_checksum_range(
std::function<future<partition_checksum> (sstring keyspace,
sstring cf, query::range<dht::token> range)>&& f) {
register_handler(this, messaging_verb::REPAIR_CHECKSUM_RANGE, std::move(f));
}
void messaging_service::unregister_repair_checksum_range() {
_rpc->unregister_handler(messaging_verb::REPAIR_CHECKSUM_RANGE);
}
future<partition_checksum> messaging_service::send_repair_checksum_range(
shard_id id, sstring keyspace, sstring cf, ::range<dht::token> range)
{
return send_message<partition_checksum>(this,
messaging_verb::REPAIR_CHECKSUM_RANGE, std::move(id),
std::move(keyspace), std::move(cf), std::move(range));
}
} // namespace net


@@ -36,7 +36,7 @@
#include "query-request.hh"
#include "db/serializer.hh"
#include "mutation_query.hh"
#include <seastar/core/gate.hh>
#include "repair/repair.hh"
#include <seastar/net/tls.hh>
@@ -66,65 +66,30 @@ namespace net {
/* All verb handler identifiers */
enum class messaging_verb : int32_t {
MUTATION,
MUTATION_DONE,
BINARY, // Deprecated
READ_REPAIR,
READ,
READ_DATA,
READ_MUTATION_DATA, // scylla-only
READ_DIGEST,
REQUEST_RESPONSE, // client-initiated reads and writes
STREAM_INITIATE, // Deprecated
STREAM_INITIATE_DONE, // Deprecated
STREAM_REPLY, // Deprecated
STREAM_REQUEST, // Deprecated
RANGE_SLICE,
BOOTSTRAP_TOKEN, // Deprecated
TREE_REQUEST, // Deprecated
TREE_RESPONSE, // Deprecated
JOIN, // Deprecated
GOSSIP_DIGEST_SYN,
GOSSIP_DIGEST_ACK,
GOSSIP_DIGEST_ACK2,
DEFINITIONS_ANNOUNCE, // Deprecated
DEFINITIONS_UPDATE,
TRUNCATE,
SCHEMA_CHECK,
INDEX_SCAN, // Deprecated
REPLICATION_FINISHED,
INTERNAL_RESPONSE, // responses to internal calls
COUNTER_MUTATION,
STREAMING_REPAIR_REQUEST, // Deprecated
STREAMING_REPAIR_RESPONSE, // Deprecated
SNAPSHOT, // Similar to nt snapshot
MIGRATION_REQUEST,
GOSSIP_SHUTDOWN,
_TRACE,
ECHO,
REPAIR_MESSAGE,
PAXOS_PREPARE,
PAXOS_PROPOSE,
PAXOS_COMMIT,
PAGED_RANGE,
UNUSED_1,
UNUSED_2,
UNUSED_3,
CLIENT_ID = 0,
ECHO = 1,
MUTATION = 2,
MUTATION_DONE = 3,
READ_DATA = 4,
READ_MUTATION_DATA = 5,
READ_DIGEST = 6,
GOSSIP_DIGEST_SYN = 7,
GOSSIP_DIGEST_ACK2 = 8,
GOSSIP_SHUTDOWN = 9,
DEFINITIONS_UPDATE = 10,
TRUNCATE = 11,
REPLICATION_FINISHED = 12,
MIGRATION_REQUEST = 13,
// Used by streaming
STREAM_INIT_MESSAGE,
PREPARE_MESSAGE,
PREPARE_DONE_MESSAGE,
STREAM_MUTATION,
STREAM_MUTATION_DONE,
INCOMING_FILE_MESSAGE,
OUTGOING_FILE_MESSAGE,
RECEIVED_MESSAGE,
RETRY_MESSAGE,
COMPLETE_MESSAGE,
SESSION_FAILED_MESSAGE,
STREAM_INIT_MESSAGE = 14,
PREPARE_MESSAGE = 15,
PREPARE_DONE_MESSAGE = 16,
STREAM_MUTATION = 17,
STREAM_MUTATION_DONE = 18,
COMPLETE_MESSAGE = 19,
REPAIR_CHECKSUM_RANGE = 20,
// end of streaming verbs
CLIENT_ID,
LAST,
LAST = 21,
};
} // namespace net
@@ -418,7 +383,6 @@ private:
std::unique_ptr<rpc_protocol_server_wrapper> _server_tls;
std::array<clients_map, 2> _clients;
uint64_t _dropped_messages[static_cast<int32_t>(messaging_verb::LAST)] = {};
seastar::gate _in_flight_requests;
public:
using clock_type = std::chrono::steady_clock;
public:
@@ -431,35 +395,40 @@ public:
gms::inet_address listen_address();
future<> stop();
static rpc::no_wait_type no_wait();
seastar::gate& requests_gate() { return _in_flight_requests; }
public:
gms::inet_address get_preferred_ip(gms::inet_address ep);
future<> init_local_preferred_ip_cache();
void cache_preferred_ip(gms::inet_address ep, gms::inet_address ip);
// Wrapper for STREAM_INIT_MESSAGE verb
void register_stream_init_message(std::function<future<unsigned> (streaming::messages::stream_init_message msg, unsigned src_cpu_id)>&& func);
future<unsigned> send_stream_init_message(shard_id id, streaming::messages::stream_init_message msg, unsigned src_cpu_id);
void register_stream_init_message(std::function<future<unsigned> (const rpc::client_info& cinfo, streaming::messages::stream_init_message msg)>&& func);
future<unsigned> send_stream_init_message(shard_id id, streaming::messages::stream_init_message msg);
// Wrapper for PREPARE_MESSAGE verb
void register_prepare_message(std::function<future<streaming::messages::prepare_message> (streaming::messages::prepare_message msg, UUID plan_id,
inet_address from, inet_address connecting, unsigned src_cpu_id, unsigned dst_cpu_id)>&& func);
void register_prepare_message(std::function<future<streaming::messages::prepare_message> (const rpc::client_info& cinfo,
streaming::messages::prepare_message msg, UUID plan_id,
unsigned dst_cpu_id)>&& func);
future<streaming::messages::prepare_message> send_prepare_message(shard_id id, streaming::messages::prepare_message msg, UUID plan_id,
inet_address from, inet_address connecting, unsigned src_cpu_id, unsigned dst_cpu_id);
unsigned dst_cpu_id);
// Wrapper for PREPARE_DONE_MESSAGE verb
void register_prepare_done_message(std::function<future<> (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func);
future<> send_prepare_done_message(shard_id id, UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id);
void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
future<> send_prepare_done_message(shard_id id, UUID plan_id, unsigned dst_cpu_id);
// Wrapper for STREAM_MUTATION verb
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation(shard_id id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id);
void register_stream_mutation_done(std::function<future<> (UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation_done(shard_id id, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, inet_address from, inet_address connecting, unsigned dst_cpu_id);
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation_done(shard_id id, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, unsigned dst_cpu_id);
void register_complete_message(std::function<future<> (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id)>&& func);
future<> send_complete_message(shard_id id, UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id);
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
future<> send_complete_message(shard_id id, UUID plan_id, unsigned dst_cpu_id);
// Wrapper for REPAIR_CHECKSUM_RANGE verb
void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, query::range<dht::token> range)>&& func);
void unregister_repair_checksum_range();
future<partition_checksum> send_repair_checksum_range(shard_id id, sstring keyspace, sstring cf, query::range<dht::token> range);
// Wrapper for ECHO verb
void register_echo(std::function<future<> ()>&& func);
@@ -509,17 +478,17 @@ public:
// Note: WTH is future<foreign_ptr<lw_shared_ptr<query::result>>
void register_read_data(std::function<future<foreign_ptr<lw_shared_ptr<query::result>>> (query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_data();
future<query::result> send_read_data(shard_id id, query::read_command& cmd, query::partition_range& pr);
future<query::result> send_read_data(shard_id id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for READ_MUTATION_DATA
void register_read_mutation_data(std::function<future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> (query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_mutation_data();
future<reconcilable_result> send_read_mutation_data(shard_id id, query::read_command& cmd, query::partition_range& pr);
future<reconcilable_result> send_read_mutation_data(shard_id id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for READ_DIGEST
void register_read_digest(std::function<future<query::result_digest> (query::read_command cmd, query::partition_range pr)>&& func);
void unregister_read_digest();
future<query::result_digest> send_read_digest(shard_id id, query::read_command& cmd, query::partition_range& pr);
future<query::result_digest> send_read_digest(shard_id id, const query::read_command& cmd, const query::partition_range& pr);
// Wrapper for TRUNCATE
void register_truncate(std::function<future<>(sstring, sstring)>&& func);

View File

@@ -725,7 +725,12 @@ void mutation_partition::trim_rows(const schema& s,
stop = true;
break;
}
-++last;
+if (e.empty()) {
+last = reversal_traits<reversed>::erase_and_dispose(_rows, last, std::next(last, 1), deleter);
+} else {
+++last;
+}
}
}

View File

@@ -481,6 +481,9 @@ public:
void apply(tombstone t) {
_row.apply(t);
}
bool empty() const {
return _row.empty();
}
struct compare {
clustering_key::less_compare _c;
compare(const schema& s) : _c(s) {}
@@ -605,6 +608,7 @@ private:
// Calls func for each row entry inside row_ranges until func returns stop_iteration::yes.
// Removes all entries for which func didn't return stop_iteration::no or wasn't called at all.
// Removes all entries that are empty; see rows_entry::empty().
// If reversed is true, func will be called on entries in reverse order. In that case row_ranges
// must be already in reverse order.
template<bool reversed, typename Func>

View File

@@ -255,7 +255,7 @@ to_partition_range(query::range<dht::token> r) {
auto end = r.end()
? bound_opt(dht::ring_position(r.end()->value(),
-r.start()->is_inclusive()
+r.end()->is_inclusive()
? dht::ring_position::token_bound::end
: dht::ring_position::token_bound::start))
: bound_opt();

View File

@@ -24,11 +24,15 @@
#include "streaming/stream_plan.hh"
#include "streaming/stream_state.hh"
#include "gms/inet_address.hh"
#include "db/config.hh"
#include "service/storage_service.hh"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <cryptopp/sha.h>
static logging::logger logger("repair");
template <typename T1, typename T2>
@@ -68,21 +72,92 @@ void remove_item(Collection& c, T& item) {
// Return all of the neighbors with whom we share the provided range.
static std::vector<gms::inet_address> get_neighbors(database& db,
-const sstring& ksname, query::range<dht::token> range
-//Collection<String> dataCenters, Collection<String> hosts)
-) {
+const sstring& ksname, query::range<dht::token> range,
+const std::vector<sstring>& data_centers,
+const std::vector<sstring>& hosts) {
keyspace& ks = db.find_keyspace(ksname);
auto& rs = ks.get_replication_strategy();
dht::token tok = range.end() ? range.end()->value() : dht::maximum_token();
auto ret = rs.get_natural_endpoints(tok);
remove_item(ret, utils::fb_utilities::get_broadcast_address());
if (!data_centers.empty()) {
auto dc_endpoints_map = service::get_local_storage_service().get_token_metadata().get_topology().get_datacenter_endpoints();
std::unordered_set<gms::inet_address> dc_endpoints;
for (const sstring& dc : data_centers) {
auto it = dc_endpoints_map.find(dc);
if (it == dc_endpoints_map.end()) {
std::vector<sstring> dcs;
for (const auto& e : dc_endpoints_map) {
dcs.push_back(e.first);
}
throw std::runtime_error(sprint("Unknown data center '%s'. "
"Known data centers: %s", dc, dcs));
}
for (const auto& endpoint : it->second) {
dc_endpoints.insert(endpoint);
}
}
// The resulting list of nodes is the intersection of the nodes in the
// listed data centers, and the (range-dependent) list of neighbors.
std::unordered_set<gms::inet_address> neighbor_set(ret.begin(), ret.end());
ret.clear();
for (const auto& endpoint : dc_endpoints) {
if (neighbor_set.count(endpoint)) {
ret.push_back(endpoint);
}
}
} else if (!hosts.empty()) {
bool found_me = false;
std::unordered_set<gms::inet_address> neighbor_set(ret.begin(), ret.end());
ret.clear();
for (const sstring& host : hosts) {
gms::inet_address endpoint;
try {
endpoint = gms::inet_address(host);
} catch(...) {
throw std::runtime_error(sprint("Unknown host specified: %s", host));
}
if (endpoint == utils::fb_utilities::get_broadcast_address()) {
found_me = true;
} else if (neighbor_set.count(endpoint)) {
ret.push_back(endpoint);
// If same host is listed twice, don't add it again later
neighbor_set.erase(endpoint);
}
// Nodes which aren't neighbors for this range are ignored.
// This allows the user to give a list of "good" nodes, where
// for each different range, only the subset of nodes actually
// holding a replica of the given range is used. This,
// however, means the user is never warned if one of the nodes
// on the list isn't even part of the cluster.
}
// We require, like Cassandra does, that the current host must also
// be listed on the "-hosts" option - even though we don't want it in
// the returned list:
if (!found_me) {
throw std::runtime_error("The current host must be part of the repair");
}
if (ret.size() < 1) {
auto me = utils::fb_utilities::get_broadcast_address();
auto others = rs.get_natural_endpoints(tok);
remove_item(others, me);
throw std::runtime_error(sprint("Repair requires at least two "
"endpoints that are neighbors before it can continue, "
"the endpoint used for this repair is %s, other "
"available neighbors are %s but these neighbors were not "
"part of the supplied list of hosts to use during the "
"repair (%s).", me, others, hosts));
}
}
return ret;
#if 0
-// Origin's ActiveRepairService.getNeighbors() contains a lot of important
-// stuff we need to do, like verifying the requested range fits a local
-// range, and also taking the "datacenters" and "hosts" options.
+// Origin's ActiveRepairService.getNeighbors() also verifies that the
+// requested range fits into a local range
StorageService ss = StorageService.instance;
Map<Range<Token>, List<InetAddress>> replicaSets = ss.getRangeToAddressMap(keyspaceName);
Range<Token> rangeSuperSet = null;
@@ -101,55 +176,6 @@ static std::vector<gms::inet_address> get_neighbors(database& db,
if (rangeSuperSet == null || !replicaSets.containsKey(rangeSuperSet))
return Collections.emptySet();
Set<InetAddress> neighbors = new HashSet<>(replicaSets.get(rangeSuperSet));
neighbors.remove(FBUtilities.getBroadcastAddress());
if (dataCenters != null && !dataCenters.isEmpty())
{
TokenMetadata.Topology topology = ss.getTokenMetadata().cloneOnlyTokenMap().getTopology();
Set<InetAddress> dcEndpoints = Sets.newHashSet();
Multimap<String,InetAddress> dcEndpointsMap = topology.getDatacenterEndpoints();
for (String dc : dataCenters)
{
Collection<InetAddress> c = dcEndpointsMap.get(dc);
if (c != null)
dcEndpoints.addAll(c);
}
return Sets.intersection(neighbors, dcEndpoints);
}
else if (hosts != null && !hosts.isEmpty())
{
Set<InetAddress> specifiedHost = new HashSet<>();
for (final String host : hosts)
{
try
{
final InetAddress endpoint = InetAddress.getByName(host.trim());
if (endpoint.equals(FBUtilities.getBroadcastAddress()) || neighbors.contains(endpoint))
specifiedHost.add(endpoint);
}
catch (UnknownHostException e)
{
throw new IllegalArgumentException("Unknown host specified " + host, e);
}
}
if (!specifiedHost.contains(FBUtilities.getBroadcastAddress()))
throw new IllegalArgumentException("The current host must be part of the repair");
if (specifiedHost.size() <= 1)
{
String msg = "Repair requires at least two endpoints that are neighbours before it can continue, the endpoint used for this repair is %s, " +
"other available neighbours are %s but these neighbours were not part of the supplied list of hosts to use during the repair (%s).";
throw new IllegalArgumentException(String.format(msg, specifiedHost, neighbors, hosts));
}
specifiedHost.remove(FBUtilities.getBroadcastAddress());
return specifiedHost;
}
return neighbors;
#endif
}
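The host-filtering branch above reduces to a set intersection with two extra rules: a host listed twice is added only once, and the local endpoint must appear in the list without being returned. A standalone sketch of just that rule, with endpoints modeled as plain strings standing in for gms::inet_address (the name filter_neighbors is illustrative, not from the Scylla code):

```cpp
#include <cassert>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

// Keep only the requested hosts that are actual neighbors for the range;
// duplicates are added once, and the local endpoint must be listed but is
// not returned.
std::vector<std::string> filter_neighbors(
        const std::vector<std::string>& neighbors,
        const std::vector<std::string>& requested_hosts,
        const std::string& local_endpoint) {
    std::unordered_set<std::string> neighbor_set(neighbors.begin(), neighbors.end());
    std::vector<std::string> ret;
    bool found_me = false;
    for (const auto& host : requested_hosts) {
        if (host == local_endpoint) {
            found_me = true;
        } else if (neighbor_set.erase(host)) {
            // erase() returns 1 only the first time, so a host listed
            // twice is added to the result just once.
            ret.push_back(host);
        }
        // Hosts that aren't neighbors for this range are silently ignored,
        // matching the behavior described in the comments above.
    }
    if (!found_me) {
        throw std::runtime_error("The current host must be part of the repair");
    }
    return ret;
}
```

As in the real code, the caller is never warned about requested hosts that are not part of the cluster at all; they simply fail the neighbor-set test.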
@@ -198,45 +224,296 @@ public:
}
} repair_tracker;
-// repair_start() can run on any cpu; It runs on cpu0 the function
-// do_repair_start(). The benefit of always running that function on the same
-// CPU is that it allows us to keep some state (like a list of ongoing
-// repairs). It is fine to always do this on one CPU, because the function
-// itself does very little (mainly tell other nodes and CPUs what to do).
-// Repair a single range. Comparable to RepairSession in Origin
-// In Origin, this is composed of several "repair jobs", each with one cf,
-// but our streaming already works for several cfs.
-static future<> repair_range(seastar::sharded<database>& db, sstring keyspace,
-query::range<dht::token> range, std::vector<sstring> cfs) {
-auto sp = make_lw_shared<streaming::stream_plan>("repair");
-auto id = utils::UUID_gen::get_time_UUID();
partition_checksum::partition_checksum(const mutation& m) {
auto frozen = freeze(m);
auto bytes = frozen.representation();
CryptoPP::SHA256 hash;
static_assert(CryptoPP::SHA256::DIGESTSIZE == sizeof(_digest),
"digest size");
static_assert(sizeof(char) == sizeof(decltype(*bytes.data())),
"Assumed that chars are bytes");
hash.CalculateDigest(reinterpret_cast<unsigned char*>(_digest),
reinterpret_cast<const unsigned char*>(bytes.data()),
bytes.size());
}
-auto neighbors = get_neighbors(db.local(), keyspace, range);
-logger.info("[repair #{}] new session: will sync {} on range {} for {}.{}", id, neighbors, range, keyspace, cfs);
-for (auto peer : neighbors) {
-// FIXME: obviously, we'll need Merkle trees or another alternative
-// method to decide which parts of the data we need to stream instead
-// of streaming everything like we do now. So this logging is kind of
-// silly, and we never log the corresponding "... is consistent with"
-// message: see SyncTask.run() in Origin for the original messages.
-auto me = utils::fb_utilities::get_broadcast_address();
-for (auto &&cf : cfs) {
-logger.info("[repair #{}] Endpoints {} and {} have {} range(s) out of sync for {}", id, me, peer, 1, cf);
void partition_checksum::add(const partition_checksum& other) {
static_assert(sizeof(_digest) / sizeof(_digest[0]) == 4, "digest size");
_digest[0] ^= other._digest[0];
_digest[1] ^= other._digest[1];
_digest[2] ^= other._digest[2];
_digest[3] ^= other._digest[3];
}
bool partition_checksum::operator==(const partition_checksum& other) const {
static_assert(sizeof(_digest) / sizeof(_digest[0]) == 4, "digest size");
return _digest[0] == other._digest[0] &&
_digest[1] == other._digest[1] &&
_digest[2] == other._digest[2] &&
_digest[3] == other._digest[3];
}
void partition_checksum::serialize(bytes::iterator& out) const {
out = std::copy(
reinterpret_cast<const char*>(&_digest),
reinterpret_cast<const char*>(&_digest) + sizeof(_digest),
out);
}
partition_checksum partition_checksum::deserialize(bytes_view& in) {
partition_checksum ret;
auto v = read_simple_bytes(in, sizeof(ret._digest));
std::copy(v.begin(), v.end(), reinterpret_cast<char*>(ret._digest));
return ret;
}
size_t partition_checksum::serialized_size() const {
return sizeof(_digest);
}
std::ostream& operator<<(std::ostream& out, const partition_checksum& c) {
out << std::hex;
std::copy(c._digest, c._digest + sizeof(c._digest)/sizeof(c._digest[0]),
std::ostream_iterator<decltype(c._digest[0])>(out, "-"));
out << std::dec;
return out;
}
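The XOR-merge scheme implemented by partition_checksum::add() is what lets each shard (and each node) checksum its own subset of partitions independently: XOR is commutative and associative, so the combined digest depends on neither the order of partitions nor on how the set was split. A toy model of this property, with a 64-bit FNV-1a hash standing in for SHA-256 (an assumption for brevity; the real digest is 256-bit and cryptographically strong):

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Per-partition hash: FNV-1a, a stand-in for the SHA-256 used above.
static uint64_t partition_hash(const std::string& partition) {
    uint64_t h = 1469598103934665603ULL;
    for (unsigned char c : partition) {
        h ^= c;
        h *= 1099511628211ULL;
    }
    return h;
}

// Toy 64-bit analogue of partition_checksum: hash each partition, then XOR
// the per-partition hashes together. Because XOR is commutative and
// associative, the result is independent of partition order and of how the
// partition set was divided into subsets.
struct toy_checksum {
    uint64_t digest = 0;
    void add_partition(const std::string& p) { digest ^= partition_hash(p); }
    void add(const toy_checksum& other) { digest ^= other.digest; }
    bool operator==(const toy_checksum& o) const { return digest == o.digest; }
};
```

Two "shards" can each checksum a disjoint subset in any order, and add()-ing the two results reproduces the checksum of a single pass over the whole set.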
// Calculate the checksum of the data held *on this shard* of a column family,
// in the given token range.
// All parameters to this function are constant references, and the caller
// must ensure they live as long as the future returned by this function is
// not resolved.
// FIXME: Both master and slave will typically call this on consecutive ranges
// so it would be useful to have this code cache its stopping point or have
// some object live throughout the operation. Moreover, it makes sense to
// vary the collection of sstables used throughout a long repair.
// FIXME: cf.make_reader() puts all read partitions in the cache. This might
// not be a good idea (see issue #382). Perhaps it is better to read the
// sstables directly (as compaction does) - after flushing memtables first
// (there might be old data in memtables which isn't flushed because no new
// data is coming in).
static future<partition_checksum> checksum_range_shard(database &db,
const sstring& keyspace_name, const sstring& cf_name,
const ::range<dht::token>& range) {
auto& cf = db.find_column_family(keyspace_name, cf_name);
return do_with(query::to_partition_range(range), [&cf] (const auto& partition_range) {
return do_with(cf.make_reader(partition_range), partition_checksum(),
[] (auto& reader, auto& checksum) {
return repeat([&reader, &checksum] () {
return reader().then([&checksum] (auto mopt) {
if (mopt) {
checksum.add(partition_checksum(*mopt));
return stop_iteration::no;
} else {
return stop_iteration::yes;
}
});
}).then([&checksum] {
return checksum;
});
});
});
}
// Calculate the checksum of the data held on all shards of a column family,
// in the given token range.
// In practice, we only need to consider one or two shards which intersect the
// given "range". This is because the token ring has nodes*vnodes tokens,
// dividing the token space into nodes*vnodes ranges, with "range" being one
// of those. This number is big (vnodes = 256 by default). At the same time,
// sharding divides the token space into relatively few large ranges, one per
// thread.
// Watch out: All parameters to this function are constant references, and the
// caller must ensure they live as long as the future returned by this
// function is not resolved.
future<partition_checksum> checksum_range(seastar::sharded<database> &db,
const sstring& keyspace, const sstring& cf,
const ::range<dht::token>& range) {
unsigned shard_begin = range.start() ?
dht::shard_of(range.start()->value()) : 0;
unsigned shard_end = range.end() ?
dht::shard_of(range.end()->value())+1 : smp::count;
return do_with(partition_checksum(), [shard_begin, shard_end, &db, &keyspace, &cf, &range] (auto& result) {
return parallel_for_each(boost::counting_iterator<int>(shard_begin),
boost::counting_iterator<int>(shard_end),
[&db, &keyspace, &cf, &range, &result] (unsigned shard) {
return db.invoke_on(shard, [&keyspace, &cf, &range] (database& db) {
return checksum_range_shard(db, keyspace, cf, range);
}).then([&result] (partition_checksum sum) {
result.add(sum);
});
}).then([&result] {
return make_ready_future<partition_checksum>(result);
});
});
}
static future<> sync_range(seastar::sharded<database>& db,
const sstring& keyspace, const sstring& cf,
const ::range<dht::token>& range,
std::vector<gms::inet_address>& neighbors) {
return do_with(streaming::stream_plan("repair-in"),
streaming::stream_plan("repair-out"),
[&db, &keyspace, &cf, &range, &neighbors]
(auto& sp_in, auto& sp_out) {
for (const auto& peer : neighbors) {
sp_in.request_ranges(peer, keyspace, {range}, {cf});
sp_out.transfer_ranges(peer, keyspace, {range}, {cf});
}
// FIXME: think: if we have several neighbors, perhaps we need to
// request ranges from all of them and only later transfer ranges to
// all of them? Otherwise, we won't necessarily fully repair the
// other nodes, just this one? What does Cassandra do here?
-sp->transfer_ranges(peer, peer, keyspace, {range}, cfs);
-sp->request_ranges(peer, peer, keyspace, {range}, cfs);
return sp_in.execute().discard_result().then([&sp_out] {
return sp_out.execute().discard_result();
}).handle_exception([] (auto ep) {
logger.error("repair's stream failed: {}", ep);
return make_exception_future(ep);
});
});
}
static void split_and_add(std::vector<::range<dht::token>>& ranges,
const range<dht::token>& range,
uint64_t estimated_partitions, uint64_t target_partitions) {
if (estimated_partitions < target_partitions) {
// We're done, the range is small enough to not be split further
ranges.push_back(range);
return;
}
-return sp->execute().discard_result().then([sp, id] {
-logger.info("repair session #{} successful", id);
-}).handle_exception([id] (auto ep) {
-logger.error("repair session #{} stream failed: {}", id, ep);
-return make_exception_future(std::runtime_error("repair_range failed"));
// The use of minimum_token() here twice is not a typo - because wrap-
// around token ranges are supported by midpoint(), the beyond-maximum
// token can also be represented by minimum_token().
auto midpoint = dht::global_partitioner().midpoint(
range.start() ? range.start()->value() : dht::minimum_token(),
range.end() ? range.end()->value() : dht::minimum_token());
auto halves = range.split(midpoint, dht::token_comparator());
ranges.push_back(halves.first);
ranges.push_back(halves.second);
}
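The splitting logic can be sketched independently of the partitioner: halve a range, assume the partition estimate halves with it, and stop once the estimate drops below the target. A minimal version over plain integer tokens (no wrap-around handling; the uniform-distribution assumption and the recursive formulation are simplifications of the code above):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// (start, end] ranges over plain integer tokens; the partition estimate is
// assumed to halve along with the range.
using token_range = std::pair<uint64_t, uint64_t>;

static void split_and_add(std::vector<token_range>& out, token_range r,
                          uint64_t estimated_partitions,
                          uint64_t target_partitions) {
    if (estimated_partitions < target_partitions) {
        out.push_back(r); // small enough; no further splitting
        return;
    }
    // Numeric midpoint stands in for dht::global_partitioner().midpoint().
    uint64_t mid = r.first + (r.second - r.first) / 2;
    split_and_add(out, {r.first, mid}, estimated_partitions / 2, target_partitions);
    split_and_add(out, {mid, r.second}, estimated_partitions / 2, target_partitions);
}
```

For a range estimated at 400 partitions with a target of 100, splitting continues while the estimate is at or above the target, yielding eight equal sub-ranges estimated at 50 partitions each.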
// Repair a single cf in a single local range.
// Comparable to RepairJob in Origin.
static future<> repair_cf_range(seastar::sharded<database>& db,
sstring keyspace, sstring cf, ::range<dht::token> range,
std::vector<gms::inet_address>& neighbors) {
if (neighbors.empty()) {
// Nothing to do in this case...
return make_ready_future<>();
}
// The partition iterating code inside checksum_range_shard does not
// support wrap-around ranges, so we need to break at least wrap-
// around ranges.
std::vector<::range<dht::token>> ranges;
if (range.is_wrap_around(dht::token_comparator())) {
auto unwrapped = range.unwrap();
ranges.push_back(unwrapped.first);
ranges.push_back(unwrapped.second);
} else {
ranges.push_back(range);
}
// Additionally, we want to break up large ranges so they will have
// (approximately) a desired number of rows each.
// FIXME: column_family should have a method to estimate the number of
// partitions (and of course it should use cardinality estimation bitmaps,
// not trivial sum). We shouldn't have this ugly code here...
auto sstables = db.local().find_column_family(keyspace, cf).get_sstables();
uint64_t estimated_partitions = 0;
for (auto sst : *sstables) {
estimated_partitions += sst.second->get_estimated_key_count();
}
// This node contains replicas of rf * vnodes ranges like this one, so
// estimate the number of partitions in just this range:
estimated_partitions /= db.local().get_config().num_tokens();
estimated_partitions /= db.local().find_keyspace(keyspace).get_replication_strategy().get_replication_factor();
// FIXME: we should have an on-the-fly iterator generator here, not
// fill a vector in advance.
std::vector<::range<dht::token>> tosplit;
ranges.swap(tosplit);
for (const auto& range : tosplit) {
// FIXME: this "100" needs to be a parameter.
split_and_add(ranges, range, estimated_partitions, 100);
}
// We don't need to wait for one checksum to finish before we start the
// next, but doing too many of these operations in parallel also doesn't
// make sense, so we limit the number of concurrent ongoing checksum
// requests with a semaphore.
constexpr int parallelism = 100;
return do_with(semaphore(parallelism), true, std::move(keyspace), std::move(cf), std::move(ranges),
[&db, &neighbors, parallelism] (auto& sem, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
return do_for_each(ranges, [&sem, &success, &db, &neighbors, &keyspace, &cf]
(const auto& range) {
return sem.wait(1).then([&sem, &success, &db, &neighbors, &keyspace, &cf, &range] {
// Ask this node, and all neighbors, to calculate checksums in
// this range. When all are done, compare the results, and if
// there are any differences, sync the content of this range.
std::vector<future<partition_checksum>> checksums;
checksums.reserve(1 + neighbors.size());
checksums.push_back(checksum_range(db, keyspace, cf, range));
for (auto&& neighbor : neighbors) {
checksums.push_back(
net::get_local_messaging_service().send_repair_checksum_range(
net::shard_id{neighbor},keyspace, cf, range));
}
when_all(checksums.begin(), checksums.end()).then(
[&db, &keyspace, &cf, &range, &neighbors, &success]
(std::vector<future<partition_checksum>> checksums) {
for (unsigned i = 0; i < checksums.size(); i++) {
if (checksums[i].failed()) {
logger.warn(
"Checksum of range {} on {} failed: {}",
range,
(i ? neighbors[i-1] :
utils::fb_utilities::get_broadcast_address()),
checksums[i].get_exception());
success = false;
// Do not break out of the loop here, so we can log
// (and discard) all the exceptions.
}
}
if (!success) {
return make_ready_future<>();
}
auto checksum0 = checksums[0].get();
for (unsigned i = 1; i < checksums.size(); i++) {
if (checksum0 != checksums[i].get()) {
logger.info("Found differing range {}", range);
return sync_range(db, keyspace, cf, range, neighbors);
}
}
return make_ready_future<>();
}).handle_exception([&success, &range] (std::exception_ptr eptr) {
// Something above (e.g., sync_range) failed. We could
// stop the repair immediately, or let it continue with
// other ranges (at the moment, we do the latter). But in
// any case, we need to remember that the repair failed to
// tell the caller.
success = false;
logger.warn("Failed sync of range {}: {}", range, eptr);
}).finally([&sem] { sem.signal(1); });
});
}).finally([&sem, &success, parallelism] {
return sem.wait(parallelism).then([&success] {
return success ? make_ready_future<>() :
make_exception_future<>(std::runtime_error("Checksum or sync of partial range failed"));
});
});
});
}
// Repair a single local range, multiple column families.
// Comparable to RepairSession in Origin
static future<> repair_range(seastar::sharded<database>& db, sstring keyspace,
::range<dht::token> range, std::vector<sstring>& cfs,
const std::vector<sstring>& data_centers,
const std::vector<sstring>& hosts) {
auto id = utils::UUID_gen::get_time_UUID();
return do_with(get_neighbors(db.local(), keyspace, range, data_centers, hosts), [&db, &cfs, keyspace, id, range] (auto& neighbors) {
logger.info("[repair #{}] new session: will sync {} on range {} for {}.{}", id, neighbors, range, keyspace, cfs);
return do_for_each(cfs.begin(), cfs.end(),
[&db, keyspace, &neighbors, id, range] (auto&& cf) {
return repair_cf_range(db, keyspace, cf, range, neighbors);
});
});
}
@@ -278,11 +555,22 @@ struct repair_options {
// keyspace. If this list is empty (the default), all the column families
// in this keyspace are repaired
std::vector<sstring> column_families;
// hosts specifies the list of known good hosts to repair with this host
// (note that this host is required to also be on this list). For each
// range repaired, only the relevant subset of the hosts (holding a
// replica of this range) is used.
std::vector<sstring> hosts;
// data_centers is used to restrict the repair to the local data center.
// The node starting the repair must be in the data center; issuing a
// repair to a data center other than the named one returns an error.
std::vector<sstring> data_centers;
repair_options(std::unordered_map<sstring, sstring> options) {
bool_opt(primary_range, options, PRIMARY_RANGE_KEY);
ranges_opt(ranges, options, RANGES_KEY);
list_opt(column_families, options, COLUMNFAMILIES_KEY);
list_opt(hosts, options, HOSTS_KEY);
list_opt(data_centers, options, DATACENTERS_KEY);
// We currently do not support incremental repair. We could probably
// ignore this option as it is just an optimization, but for now,
// let's make it an error.
@@ -395,6 +683,40 @@ private:
}
};
// repair_ranges repairs a list of token ranges, each assumed to be a token
// range for which this node holds a replica, and, importantly, each range
// is assumed to be indivisible in the sense that all the tokens in it have
// the same nodes as replicas.
static future<> repair_ranges(seastar::sharded<database>& db, sstring keyspace,
std::vector<query::range<dht::token>> ranges,
std::vector<sstring> cfs, int id,
std::vector<sstring> data_centers, std::vector<sstring> hosts) {
return do_with(std::move(ranges), std::move(keyspace), std::move(cfs),
std::move(data_centers), std::move(hosts),
[&db, id] (auto& ranges, auto& keyspace, auto& cfs, auto& data_centers, auto& hosts) {
#if 1
// repair all the ranges in parallel
return parallel_for_each(ranges.begin(), ranges.end(), [&db, keyspace, &cfs, &data_centers, &hosts, id] (auto&& range) {
#else
// repair all the ranges in sequence
return do_for_each(ranges.begin(), ranges.end(), [&db, keyspace, &cfs, &data_centers, &hosts, id] (auto&& range) {
#endif
return repair_range(db, keyspace, range, cfs, data_centers, hosts);
}).then([id] {
logger.info("repair {} completed successfully", id);
repair_tracker.done(id, true);
}).handle_exception([id] (std::exception_ptr eptr) {
logger.info("repair {} failed - {}", id, eptr);
repair_tracker.done(id, false);
});
});
}
// repair_start() can run on any cpu; It runs on cpu0 the function
// do_repair_start(). The benefit of always running that function on the same
// CPU is that it allows us to keep some state (like a list of ongoing
// repairs). It is fine to always do this on one CPU, because the function
// itself does very little (mainly tell other nodes and CPUs what to do).
static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
std::unordered_map<sstring, sstring> options_map) {
@@ -454,23 +776,8 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
cfs = list_column_families(db.local(), keyspace);
}
-do_with(std::move(ranges), [&db, keyspace, cfs, id] (auto& ranges) {
-#if 1
-// repair all the ranges in parallel
-return parallel_for_each(ranges.begin(), ranges.end(), [&db, keyspace, cfs, id] (auto&& range) {
-#else
-// repair all the ranges in sequence
-return do_for_each(ranges.begin(), ranges.end(), [&db, keyspace, cfs, id] (auto&& range) {
-#endif
-return repair_range(db, keyspace, range, cfs);
-}).then([id] {
-logger.info("repair {} completed sucessfully", id);
-repair_tracker.done(id, true);
-}).handle_exception([id] (std::exception_ptr eptr) {
-logger.info("repair {} failed", id);
-repair_tracker.done(id, false);
-});
-});
+repair_ranges(db, std::move(keyspace), std::move(ranges), std::move(cfs),
+id, options.data_centers, options.hosts);
return id;
}

View File

@@ -57,3 +57,37 @@ enum class repair_status { RUNNING, SUCCESSFUL, FAILED };
// repair_get_status() returns a future because it needs to run code on a
// different CPU (cpu 0) and that might be a deferring operation.
future<repair_status> repair_get_status(seastar::sharded<database>& db, int id);
// The class partition_checksum calculates a 256-bit cryptographically-secure
// checksum of a set of partitions fed to it. The checksum of a partition set
// is calculated by calculating a strong hash function (SHA-256) of each
// individual partition, and then XORing the individual hashes together.
// XOR is good enough for merging strong checksums, and allows us to
// independently calculate the checksums of different subsets of the original
// set, and then combine the results into one checksum with the add() method.
// The hash of an individual partition uses both its key and value.
class partition_checksum {
private:
uint64_t _digest[4]; // 256 bits
public:
constexpr partition_checksum() : _digest{} { }
partition_checksum(const mutation& m);
void add(const partition_checksum& other);
bool operator==(const partition_checksum& other) const;
bool operator!=(const partition_checksum& other) const { return !operator==(other); }
friend std::ostream& operator<<(std::ostream&, const partition_checksum&);
// The following are used to send this object over messaging_service:
void serialize(bytes::iterator& out) const;
static partition_checksum deserialize(bytes_view& in);
size_t serialized_size() const;
};
// Calculate the checksum of the data held on all shards of a column family,
// in the given token range.
// All parameters to this function are constant references, and the caller
// must ensure they live as long as the future returned by this function is
// not resolved.
future<partition_checksum> checksum_range(seastar::sharded<database> &db,
const sstring& keyspace, const sstring& cf,
const ::range<dht::token>& range);

View File

@@ -220,9 +220,9 @@ sstring schema::thrift_key_validator() const {
}
bool
-schema::has_collections() const {
+schema::has_multi_cell_collections() const {
return boost::algorithm::any_of(all_columns_in_select_order(), [] (const column_definition& cdef) {
-return cdef.type->is_collection();
+return cdef.type->is_collection() && cdef.type->is_multi_cell();
});
}
@@ -538,11 +538,11 @@ static sstring compound_name(const schema& s) {
compound += s.regular_column_name_type()->name() + ",";
}
-if (s.has_collections()) {
+if (s.has_multi_cell_collections()) {
compound += _collection_str;
compound += "(";
for (auto& t: s.regular_columns()) {
-if (t.type->is_collection()) {
+if (t.type->is_collection() && t.type->is_multi_cell()) {
auto ct = static_pointer_cast<const collection_type_impl>(t.type);
compound += "00000000:" + ct->name() + ",";
}

View File

@@ -223,6 +223,12 @@ public:
return 0;
}
bool is_on_all_components() const;
bool is_indexed() const {
return idx_info.index_type != index_type::none;
}
bool is_part_of_cell_name() const {
return is_regular() || is_static();
}
friend bool operator==(const column_definition&, const column_definition&);
};
@@ -345,6 +351,10 @@ public:
return _raw._is_compound;
}
bool is_cql3_table() const {
return !is_super() && !is_dense() && is_compound();
}
thrift_schema& thrift() {
return _thrift;
}
@@ -428,7 +438,7 @@ public:
const column_definition& regular_column_at(column_id id) const;
const column_definition& static_column_at(column_id id) const;
bool is_last_partition_key(const column_definition& def) const;
-bool has_collections() const;
+bool has_multi_cell_collections() const;
bool has_static_columns() const;
size_t partition_key_size() const;
size_t clustering_key_size() const;

View File

@@ -83,14 +83,18 @@ else
fi
grep -v ' - mounts' /etc/cloud/cloud.cfg > /tmp/cloud.cfg
mv /tmp/cloud.cfg /etc/cloud/cloud.cfg
mv /home/fedora/scylla-ami /usr/lib/scylla/scylla-ami
chmod a+rx /usr/lib/scylla/scylla-ami/ds2_configure.py
fi
systemctl enable scylla-server.service
systemctl enable scylla-jmx.service
/usr/lib/scylla/scylla_ntp_setup -a
fi
/usr/lib/scylla/scylla_bootparam_setup
/usr/lib/scylla/scylla_coredump_setup
if [ $AMI -eq 0 ]; then
/usr/lib/scylla/scylla_bootparam_setup
/usr/lib/scylla/scylla_raid_setup -d $DISKS -u
else
/usr/lib/scylla/scylla_bootparam_setup -a
fi
/usr/lib/scylla/scylla_sysconfig_setup $SYSCONFIG_SETUP_ARGS

Submodule seastar updated: 8b2171e29f...d0bf6f8d0e

View File

@@ -173,7 +173,11 @@ future<> migration_manager::notify_create_keyspace(const lw_shared_ptr<keyspace_
{
return get_migration_manager().invoke_on_all([name = ksm->name()] (auto&& mm) {
for (auto&& listener : mm._listeners) {
-listener->on_create_keyspace(name);
+try {
+listener->on_create_keyspace(name);
+} catch (...) {
+logger.warn("Create keyspace notification failed {}: {}", name, std::current_exception());
+}
}
});
}
@@ -182,7 +186,11 @@ future<> migration_manager::notify_create_column_family(schema_ptr cfm)
{
return get_migration_manager().invoke_on_all([ks_name = cfm->ks_name(), cf_name = cfm->cf_name()] (auto&& mm) {
for (auto&& listener : mm._listeners) {
-listener->on_create_column_family(ks_name, cf_name);
+try {
+listener->on_create_column_family(ks_name, cf_name);
+} catch (...) {
+logger.warn("Create column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
+}
}
});
}
@@ -211,7 +219,11 @@ future<> migration_manager::notify_update_keyspace(const lw_shared_ptr<keyspace_
{
return get_migration_manager().invoke_on_all([name = ksm->name()] (auto&& mm) {
for (auto&& listener : mm._listeners) {
-listener->on_update_keyspace(name);
+try {
+listener->on_update_keyspace(name);
+} catch (...) {
+logger.warn("Update keyspace notification failed {}: {}", name, std::current_exception());
+}
}
});
}
@@ -220,7 +232,11 @@ future<> migration_manager::notify_update_column_family(schema_ptr cfm)
{
return get_migration_manager().invoke_on_all([ks_name = cfm->ks_name(), cf_name = cfm->cf_name()] (auto&& mm) {
for (auto&& listener : mm._listeners) {
-listener->on_update_column_family(ks_name, cf_name);
+try {
+listener->on_update_column_family(ks_name, cf_name);
+} catch (...) {
+logger.warn("Update column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
+}
}
});
}
@@ -249,7 +265,11 @@ future<> migration_manager::notify_drop_keyspace(sstring ks_name)
{
return get_migration_manager().invoke_on_all([ks_name] (auto&& mm) {
for (auto&& listener : mm._listeners) {
-listener->on_drop_keyspace(ks_name);
+try {
+listener->on_drop_keyspace(ks_name);
+} catch (...) {
+logger.warn("Drop keyspace notification failed {}: {}", ks_name, std::current_exception());
+}
}
});
}
@@ -258,7 +278,11 @@ future<> migration_manager::notify_drop_column_family(schema_ptr cfm)
{
return get_migration_manager().invoke_on_all([ks_name = cfm->ks_name(), cf_name = cfm->cf_name()] (auto&& mm) {
for (auto&& listener : mm._listeners) {
listener->on_drop_column_family(ks_name, cf_name);
try {
listener->on_drop_column_family(ks_name, cf_name);
} catch (...) {
logger.warn("Drop column family notification failed {}.{}: {}", ks_name, cf_name, std::current_exception());
}
}
});
}
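The hunks above all apply the same fix: each listener callback is wrapped in its own try/catch, so one throwing subscriber is logged and skipped instead of aborting the whole notification loop. A minimal standalone sketch of that pattern (plain `std::function` stand-ins, not the Scylla migration_manager types):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using keyspace_listener = std::function<void(const std::string&)>;

// Notify every listener; a failure in one must not hide the event
// from the remaining listeners.
void notify_create_keyspace(const std::vector<keyspace_listener>& listeners,
                            const std::string& name) {
    for (const auto& listener : listeners) {
        try {
            listener(name);
        } catch (...) {
            // Log-and-continue, mirroring the logger.warn() calls above.
            std::cerr << "Create keyspace notification failed " << name << "\n";
        }
    }
}
```

Without the per-iteration try/catch, the first throwing listener would propagate out of the loop and the listeners after it would never be called.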

View File

@@ -196,7 +196,6 @@ void storage_service::prepare_to_join() {
auto& gossiper = gms::get_local_gossiper();
gossiper.register_(this->shared_from_this());
// FIXME: SystemKeyspace.incrementAndGetGeneration()
print("Start gossiper service ...\n");
gossiper.start_gossiping(get_generation_number(), app_states).then([this] {
#if SS_DEBUG
gms::get_local_gossiper().debug_show();
@@ -588,13 +587,21 @@ void storage_service::handle_state_normal(inet_address endpoint) {
_token_metadata.remove_from_moving(endpoint);
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_move(endpoint);
try {
subscriber->on_move(endpoint);
} catch (...) {
logger.warn("Move notification failed {}: {}", endpoint, std::current_exception());
}
}
}).get();
} else {
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_join_cluster(endpoint);
try {
subscriber->on_join_cluster(endpoint);
} catch (...) {
logger.warn("Join cluster notification failed {}: {}", endpoint, std::current_exception());
}
}
}).get();
}
@@ -722,7 +729,11 @@ void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state s
#endif
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_up(endpoint);
try {
subscriber->on_up(endpoint);
} catch (...) {
logger.warn("Up notification failed {}: {}", endpoint, std::current_exception());
}
}
});
}
@@ -785,7 +796,11 @@ void storage_service::on_dead(gms::inet_address endpoint, gms::endpoint_state st
net::get_local_messaging_service().remove_rpc_client(net::shard_id{endpoint, 0});
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_down(endpoint);
try {
subscriber->on_down(endpoint);
} catch (...) {
logger.warn("Down notification failed {}: {}", endpoint, std::current_exception());
}
}
}).get();
}
@@ -1401,27 +1416,8 @@ future<> storage_service::take_column_family_snapshot(sstring ks_name, sstring c
});
}
// For the filesystem operations, this code will assume that all keyspaces are visible in all shards
// (as we have been doing for a lot of the other operations, like the snapshot itself).
future<> storage_service::clear_snapshot(sstring tag, std::vector<sstring> keyspace_names) {
std::vector<std::reference_wrapper<keyspace>> keyspaces;
for (auto& ksname: keyspace_names) {
try {
keyspaces.push_back(std::reference_wrapper<keyspace>(_db.local().find_keyspace(ksname)));
} catch (no_such_keyspace& e) {
return make_exception_future(std::current_exception());
}
}
auto deleted_keyspaces = make_lw_shared<std::vector<sstring>>();
return parallel_for_each(keyspaces, [this, tag, deleted_keyspaces] (auto& ks) {
return parallel_for_each(ks.get().metadata()->cf_meta_data(), [this, tag] (auto& pair) {
auto& cf = _db.local().find_column_family(pair.second);
return cf.clear_snapshot(tag);
}).then_wrapped([] (future<> f) {
logger.debug("Cleared out snapshot directories");
});
});
return _db.local().clear_snapshot(tag, keyspace_names);
}
future<std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>>
@@ -2065,9 +2061,8 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
for (auto& m : maps) {
auto source = m.first;
auto ranges = m.second;
auto preferred = net::get_local_messaging_service().get_preferred_ip(source);
logger.debug("Requesting from {} ranges {}", source, ranges);
sp->request_ranges(source, preferred, keyspace_name, ranges);
sp->request_ranges(source, keyspace_name, ranges);
}
}
return sp->execute().then_wrapped([this, sp, notify_endpoint] (auto&& f) {
@@ -2093,7 +2088,11 @@ void storage_service::excise(std::unordered_set<token> tokens, inet_address endp
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_leave_cluster(endpoint);
try {
subscriber->on_leave_cluster(endpoint);
} catch (...) {
logger.warn("Leave cluster notification failed {}: {}", endpoint, std::current_exception());
}
}
}).get();
@@ -2185,10 +2184,8 @@ storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multim
for (auto& ranges_entry : ranges_per_endpoint) {
auto& ranges = ranges_entry.second;
auto new_endpoint = ranges_entry.first;
auto preferred = net::get_local_messaging_service().get_preferred_ip(new_endpoint);
// TODO each call to transferRanges re-flushes, this is potentially a lot of waste
sp->transfer_ranges(new_endpoint, preferred, keyspace_name, ranges);
sp->transfer_ranges(new_endpoint, keyspace_name, ranges);
}
}
return sp->execute().discard_result().then([sp] {
@@ -2225,7 +2222,6 @@ future<> storage_service::stream_hints() {
snitch->sort_by_proximity(get_broadcast_address(), candidates);
auto hints_destination_host = candidates.front();
auto preferred = net::get_local_messaging_service().get_preferred_ip(hints_destination_host);
// stream all hints -- range list will be a singleton of "the entire ring"
std::vector<range<token>> ranges = {range<token>::make_open_ended_both_sides()};
@@ -2234,7 +2230,7 @@ future<> storage_service::stream_hints() {
auto sp = make_lw_shared<streaming::stream_plan>("Hints");
std::vector<sstring> column_families = { db::system_keyspace::HINTS };
auto keyspace = db::system_keyspace::NAME;
sp->transfer_ranges(hints_destination_host, preferred, keyspace, ranges, column_families);
sp->transfer_ranges(hints_destination_host, keyspace, ranges, column_families);
return sp->execute().discard_result().then([sp] {
logger.info("stream_hints successful");
}).handle_exception([] (auto ep) {
@@ -2553,8 +2549,7 @@ void storage_service::range_relocator::calculate_to_from_streams(std::unordered_
auto& address = x.first;
auto& ranges = x.second;
logger.debug("Will stream range {} of keyspace {} to endpoint {}", ranges , keyspace, address);
auto preferred = net::get_local_messaging_service().get_preferred_ip(address);
_stream_plan.transfer_ranges(address, preferred, keyspace, ranges);
_stream_plan.transfer_ranges(address, keyspace, ranges);
}
// stream requests
@@ -2569,8 +2564,7 @@ void storage_service::range_relocator::calculate_to_from_streams(std::unordered_
auto& address = x.first;
auto& ranges = x.second;
logger.debug("Will request range {} of keyspace {} from endpoint {}", ranges, keyspace, address);
auto preferred = net::get_local_messaging_service().get_preferred_ip(address);
_stream_plan.request_ranges(address, preferred, keyspace, ranges);
_stream_plan.request_ranges(address, keyspace, ranges);
}
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : work) {

View File

@@ -2310,9 +2310,7 @@ public:
};
inline future<> init_storage_service(distributed<database>& db) {
return service::get_storage_service().start(std::ref(db)).then([] {
print("Start Storage service ...\n");
});
return service::get_storage_service().start(std::ref(db));
}
inline future<> deinit_storage_service() {

View File

@@ -654,7 +654,7 @@ future<> sstable::read_toc() {
sstlog.debug("Reading TOC file {} ", file_path);
return engine().open_file_dma(file_path, open_flags::ro).then([this] (file f) {
return open_file_dma(file_path, open_flags::ro).then([this] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto buf = bufptr.get();
@@ -724,7 +724,7 @@ void sstable::write_toc() {
sstlog.debug("Writing TOC file {} ", file_path);
// Writing TOC content to temporary file.
file f = engine().open_file_dma(file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();
file f = open_file_dma(file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();
auto out = file_writer(std::move(f), 4096);
auto w = file_writer(std::move(out));
@@ -764,7 +764,7 @@ void write_crc(const sstring file_path, checksum& c) {
sstlog.debug("Writing CRC file {} ", file_path);
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
file f = engine().open_file_dma(file_path, oflags).get0();
file f = open_file_dma(file_path, oflags).get0();
auto out = file_writer(std::move(f), 4096);
auto w = file_writer(std::move(out));
write(w, c);
@@ -776,7 +776,7 @@ void write_digest(const sstring file_path, uint32_t full_checksum) {
sstlog.debug("Writing Digest file {} ", file_path);
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
auto f = engine().open_file_dma(file_path, oflags).get0();
auto f = open_file_dma(file_path, oflags).get0();
auto out = file_writer(std::move(f), 4096);
auto w = file_writer(std::move(out));
@@ -821,7 +821,7 @@ future<> sstable::read_simple(T& component) {
auto file_path = filename(Type);
sstlog.debug(("Reading " + _component_map[Type] + " file {} ").c_str(), file_path);
return engine().open_file_dma(file_path, open_flags::ro).then([this, &component] (file f) {
return open_file_dma(file_path, open_flags::ro).then([this, &component] (file f) {
auto r = make_lw_shared<file_random_access_reader>(std::move(f), sstable_buffer_size);
auto fut = parse(*r, component);
return fut.finally([r = std::move(r)] {
@@ -842,7 +842,7 @@ template <sstable::component_type Type, typename T>
void sstable::write_simple(T& component) {
auto file_path = filename(Type);
sstlog.debug(("Writing " + _component_map[Type] + " file {} ").c_str(), file_path);
file f = engine().open_file_dma(file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();
file f = open_file_dma(file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();
auto out = file_writer(std::move(f), sstable_buffer_size);
auto w = file_writer(std::move(out));
write(w, component);
@@ -879,8 +879,8 @@ void sstable::write_statistics() {
}
future<> sstable::open_data() {
return when_all(engine().open_file_dma(filename(component_type::Index), open_flags::ro),
engine().open_file_dma(filename(component_type::Data), open_flags::ro)).then([this] (auto files) {
return when_all(open_file_dma(filename(component_type::Index), open_flags::ro),
open_file_dma(filename(component_type::Data), open_flags::ro)).then([this] (auto files) {
_index_file = std::get<file>(std::get<0>(files).get());
_data_file = std::get<file>(std::get<1>(files).get());
return _data_file.size().then([this] (auto size) {
@@ -900,8 +900,8 @@ future<> sstable::open_data() {
future<> sstable::create_data() {
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
return when_all(engine().open_file_dma(filename(component_type::Index), oflags),
engine().open_file_dma(filename(component_type::Data), oflags)).then([this] (auto files) {
return when_all(open_file_dma(filename(component_type::Index), oflags),
open_file_dma(filename(component_type::Data), oflags)).then([this] (auto files) {
// FIXME: If both files could not be created, the first get below will
// throw an exception, and second get() will not be attempted, and
// we'll get a warning about the second future being destructed
@@ -1755,7 +1755,6 @@ sstable::get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring
void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
auto sst = sstable(ks, cf, dir, generation, v, f);
sstlog.info("sstable {} not relevant for this shard, ignoring", sst.get_filename());
sst.mark_for_deletion();
}
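The hunks in this file replace direct `engine().open_file_dma(...)` calls with a local `open_file_dma(...)` helper. One plausible motivation for such a wrapper is to attach shared behaviour — for example, adding the file path to any open error — in a single place rather than at every call site; the real helper in the tree may do something different. A hypothetical sketch of that shape (names and behaviour are illustrative only):

```cpp
#include <stdexcept>
#include <string>

struct file_handle { std::string path; };

// Stand-in for the underlying open primitive.
file_handle raw_open(const std::string& path, bool must_exist) {
    if (must_exist && path.empty()) {
        throw std::runtime_error("open failed");
    }
    return file_handle{path};
}

// The wrapper every component open is routed through: common handling
// lives here instead of being repeated at each call site.
file_handle open_file_dma(const std::string& path, bool must_exist = true) {
    try {
        return raw_open(path, must_exist);
    } catch (const std::exception& e) {
        // Re-throw with the path attached, so the caller's log line
        // identifies which sstable component failed.
        throw std::runtime_error(std::string(e.what()) + ": " + path);
    }
}
```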

View File

@@ -55,7 +55,6 @@ public:
using inet_address = gms::inet_address;
inet_address peer;
int session_index;
inet_address connecting;
/** Immutable collection of receiving summaries */
std::vector<stream_summary> receiving_summaries;
/** Immutable collection of sending summaries*/
@@ -67,12 +66,11 @@ public:
std::map<sstring, progress_info> sending_files;
session_info() = default;
session_info(inet_address peer_, int session_index_, inet_address connecting_,
session_info(inet_address peer_, int session_index_,
std::vector<stream_summary> receiving_summaries_,
std::vector<stream_summary> sending_summaries_,
stream_session_state state_)
: peer(peer_)
, connecting(connecting_)
, receiving_summaries(std::move(receiving_summaries_))
, sending_summaries(std::move(sending_summaries_))
, state(state_) {

View File

@@ -117,11 +117,11 @@ bool stream_coordinator::host_streaming_data::has_active_sessions() {
return false;
}
shared_ptr<stream_session> stream_coordinator::host_streaming_data::get_or_create_next_session(inet_address peer, inet_address connecting) {
shared_ptr<stream_session> stream_coordinator::host_streaming_data::get_or_create_next_session(inet_address peer) {
// create
int size = _stream_sessions.size();
if (size < _connections_per_host) {
auto session = make_shared<stream_session>(peer, connecting, size, _keep_ss_table_level);
auto session = make_shared<stream_session>(peer, size, _keep_ss_table_level);
_stream_sessions.emplace(++_last_returned, session);
return _stream_sessions[_last_returned];
// get
@@ -142,10 +142,10 @@ std::vector<shared_ptr<stream_session>> stream_coordinator::host_streaming_data:
}
shared_ptr<stream_session> stream_coordinator::host_streaming_data::get_or_create_session_by_id(inet_address peer,
int id, inet_address connecting) {
int id) {
auto it = _stream_sessions.find(id);
if (it == _stream_sessions.end()) {
it = _stream_sessions.emplace(id, make_shared<stream_session>(peer, connecting, id, _keep_ss_table_level)).first;
it = _stream_sessions.emplace(id, make_shared<stream_session>(peer, id, _keep_ss_table_level)).first;
}
return it->second;
}

View File

@@ -90,12 +90,12 @@ public:
std::set<inet_address> get_peers();
public:
shared_ptr<stream_session> get_or_create_next_session(inet_address peer, inet_address connecting) {
return get_or_create_host_data(peer).get_or_create_next_session(peer, connecting);
shared_ptr<stream_session> get_or_create_next_session(inet_address peer) {
return get_or_create_host_data(peer).get_or_create_next_session(peer);
}
shared_ptr<stream_session> get_or_create_session_by_id(inet_address peer, int id, inet_address connecting) {
return get_or_create_host_data(peer).get_or_create_session_by_id(peer, id, connecting);
shared_ptr<stream_session> get_or_create_session_by_id(inet_address peer, int id) {
return get_or_create_host_data(peer).get_or_create_session_by_id(peer, id);
}
void update_progress(progress_info info) {
@@ -159,13 +159,13 @@ private:
bool has_active_sessions();
shared_ptr<stream_session> get_or_create_next_session(inet_address peer, inet_address connecting);
shared_ptr<stream_session> get_or_create_next_session(inet_address peer);
void connect_all_stream_sessions();
std::vector<shared_ptr<stream_session>> get_all_stream_sessions();
shared_ptr<stream_session> get_or_create_session_by_id(inet_address peer, int id, inet_address connecting);
shared_ptr<stream_session> get_or_create_session_by_id(inet_address peer, int id);
void update_progress(progress_info info);

View File

@@ -44,28 +44,24 @@ namespace streaming {
extern logging::logger sslog;
stream_plan& stream_plan::request_ranges(inet_address from, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges) {
return request_ranges(from, connecting, keyspace, ranges, {});
stream_plan& stream_plan::request_ranges(inet_address from, sstring keyspace, std::vector<query::range<token>> ranges) {
return request_ranges(from, keyspace, ranges, {});
}
stream_plan& stream_plan::request_ranges(inet_address from, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families) {
stream_plan& stream_plan::request_ranges(inet_address from, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families) {
_range_added = true;
auto session = _coordinator->get_or_create_next_session(from, connecting);
auto session = _coordinator->get_or_create_next_session(from);
session->add_stream_request(keyspace, ranges, std::move(column_families), _repaired_at);
return *this;
}
stream_plan& stream_plan::transfer_ranges(inet_address to, sstring keyspace, std::vector<query::range<token>> ranges) {
return transfer_ranges(to, keyspace, ranges, {});
}
stream_plan& stream_plan::transfer_ranges(inet_address to, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families) {
return transfer_ranges(to, to, keyspace, ranges, column_families);
}
stream_plan& stream_plan::transfer_ranges(inet_address to, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges) {
return transfer_ranges(to, connecting, keyspace, ranges, {});
}
stream_plan& stream_plan::transfer_ranges(inet_address to, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families) {
_range_added = true;
auto session = _coordinator->get_or_create_next_session(to, connecting);
auto session = _coordinator->get_or_create_next_session(to);
session->add_transfer_ranges(keyspace, std::move(ranges), std::move(column_families), _flush_before_transfer, _repaired_at);
return *this;
}
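The API change in this file (and in the storage_service call sites earlier) drops the separately computed `connecting` argument: callers now pass only the peer, and the preferred address is resolved in one place when the session actually starts. A simplified sketch of the before/after shape, with illustrative stand-in types rather than the Scylla ones:

```cpp
#include <string>
#include <vector>

struct address { std::string host; };

// Stand-in for net::get_local_messaging_service().get_preferred_ip();
// here we assume no separate private address is configured.
address get_preferred_ip(const address& peer) {
    return peer;
}

struct stream_plan {
    std::vector<address> peers;

    // New-style signature: no 'connecting' parameter at the call site.
    stream_plan& transfer_ranges(const address& to, const std::string& /*keyspace*/) {
        peers.push_back(to);
        return *this;
    }

    // The preferred ("connecting") address is resolved internally, once,
    // instead of at every caller.
    address connecting_for(std::size_t i) const {
        return get_preferred_ip(peers.at(i));
    }
};
```

This removes the repeated `auto preferred = ...get_preferred_ip(...)` boilerplate that the earlier storage_service hunks delete.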

View File

@@ -104,7 +104,7 @@ public:
* @param ranges ranges to fetch
* @return this object for chaining
*/
stream_plan& request_ranges(inet_address from, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges);
stream_plan& request_ranges(inet_address from, sstring keyspace, std::vector<query::range<token>> ranges);
/**
* Request data in {@code columnFamilies} under {@code keyspace} and {@code ranges} from specific node.
@@ -116,14 +116,7 @@ public:
* @param columnFamilies specific column families
* @return this object for chaining
*/
stream_plan& request_ranges(inet_address from, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
/**
* Add transfer task to send data of specific {@code columnFamilies} under {@code keyspace} and {@code ranges}.
*
* @see #transferRanges(java.net.InetAddress, java.net.InetAddress, String, java.util.Collection, String...)
*/
stream_plan& transfer_ranges(inet_address to, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
stream_plan& request_ranges(inet_address from, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
/**
* Add transfer task to send data of specific keyspace and ranges.
@@ -134,7 +127,7 @@ public:
* @param ranges ranges to send
* @return this object for chaining
*/
stream_plan& transfer_ranges(inet_address to, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges);
stream_plan& transfer_ranges(inet_address to, sstring keyspace, std::vector<query::range<token>> ranges);
/**
* Add transfer task to send data of specific {@code columnFamilies} under {@code keyspace} and {@code ranges}.
@@ -146,7 +139,7 @@ public:
* @param columnFamilies specific column families
* @return this object for chaining
*/
stream_plan& transfer_ranges(inet_address to, inet_address connecting, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
stream_plan& transfer_ranges(inet_address to, sstring keyspace, std::vector<query::range<token>> ranges, std::vector<sstring> column_families);
stream_plan& listeners(std::vector<stream_event_handler*> handlers);
#if 0

View File

@@ -77,29 +77,33 @@ static auto get_stream_result_future(utils::UUID plan_id) {
}
void stream_session::init_messaging_service_handler() {
ms().register_stream_init_message([] (messages::stream_init_message msg, unsigned src_cpu_id) {
ms().register_stream_init_message([] (const rpc::client_info& cinfo, messages::stream_init_message msg) {
const auto& src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto dst_cpu_id = engine().cpu_id();
return smp::submit_to(dst_cpu_id, [msg = std::move(msg), src_cpu_id, dst_cpu_id] () mutable {
sslog.debug("[Stream #{}] GOT STREAM_INIT_MESSAGE", msg.plan_id);
sslog.debug("[Stream #{}] GOT STREAM_INIT_MESSAGE: src_cpu_id={}, dst_cpu_id={}",
msg.plan_id, src_cpu_id, dst_cpu_id);
stream_result_future::init_receiving_side(msg.session_index, msg.plan_id,
msg.description, msg.from, msg.keep_ss_table_level);
return make_ready_future<unsigned>(dst_cpu_id);
});
});
ms().register_prepare_message([] (messages::prepare_message msg, UUID plan_id, inet_address from, inet_address connecting, unsigned src_cpu_id, unsigned dst_cpu_id) {
return smp::submit_to(dst_cpu_id, [msg = std::move(msg), plan_id, from, connecting, src_cpu_id] () mutable {
ms().register_prepare_message([] (const rpc::client_info& cinfo, messages::prepare_message msg, UUID plan_id, unsigned dst_cpu_id) {
const auto& src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [msg = std::move(msg), plan_id, from, src_cpu_id] () mutable {
auto f = get_stream_result_future(plan_id);
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE: from={}, connecting={}", plan_id, from, connecting);
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE: from={}", plan_id, from);
if (f) {
auto coordinator = f->get_coordinator();
assert(coordinator);
auto session = coordinator->get_or_create_next_session(from, from);
auto session = coordinator->get_or_create_next_session(from);
assert(session);
session->init(f);
session->dst_cpu_id = src_cpu_id;
session->start_keep_alive_timer();
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE: get session peer={} connecting={} src_cpu_id={}, dst_cpu_id={}",
session->plan_id(), session->peer, session->connecting, session->src_cpu_id, session->dst_cpu_id);
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE: get session peer={}, dst_cpu_id={}",
session->plan_id(), session->peer, session->dst_cpu_id);
return session->prepare(std::move(msg.requests), std::move(msg.summaries));
} else {
auto err = sprint("[Stream #%s] GOT PREPARE_MESSAGE: Can not find stream_manager", plan_id);
@@ -108,14 +112,15 @@ void stream_session::init_messaging_service_handler() {
}
});
});
ms().register_prepare_done_message([] (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
return smp::submit_to(dst_cpu_id, [plan_id, from, connecting] () mutable {
sslog.debug("[Stream #{}] GOT PREPARE_DONE_MESSAGE: from={}, connecting={}", plan_id, from, connecting);
ms().register_prepare_done_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id) {
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [plan_id, from] () mutable {
sslog.debug("[Stream #{}] GOT PREPARE_DONE_MESSAGE: from={}", plan_id, from);
auto f = get_stream_result_future(plan_id);
if (f) {
auto coordinator = f->get_coordinator();
assert(coordinator);
auto session = coordinator->get_or_create_next_session(from, from);
auto session = coordinator->get_or_create_next_session(from);
assert(session);
session->start_keep_alive_timer();
session->follower_start_sent();
@@ -128,7 +133,7 @@ void stream_session::init_messaging_service_handler() {
});
});
ms().register_stream_mutation([] (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id) {
auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [plan_id, from, fm = std::move(fm)] () mutable {
if (sslog.is_enabled(logging::log_level::debug)) {
auto cf_id = fm.column_family_id();
@@ -138,7 +143,7 @@ void stream_session::init_messaging_service_handler() {
if (f) {
auto coordinator = f->get_coordinator();
assert(coordinator);
auto session = coordinator->get_or_create_next_session(from, from);
auto session = coordinator->get_or_create_next_session(from);
assert(session);
session->start_keep_alive_timer();
return service::get_storage_proxy().local().mutate_locally(fm);
@@ -149,14 +154,15 @@ void stream_session::init_messaging_service_handler() {
}
});
});
ms().register_stream_mutation_done([] (UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
return smp::submit_to(dst_cpu_id, [ranges = std::move(ranges), plan_id, cf_id, from, connecting] () mutable {
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE: cf_id={}, from={}, connecting={}", plan_id, cf_id, from, connecting);
ms().register_stream_mutation_done([] (const rpc::client_info& cinfo, UUID plan_id, std::vector<range<dht::token>> ranges, UUID cf_id, unsigned dst_cpu_id) {
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [ranges = std::move(ranges), plan_id, cf_id, from] () mutable {
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE: cf_id={}, from={}", plan_id, cf_id, from);
auto f = get_stream_result_future(plan_id);
if (f) {
auto coordinator = f->get_coordinator();
assert(coordinator);
auto session = coordinator->get_or_create_next_session(from, from);
auto session = coordinator->get_or_create_next_session(from);
assert(session);
session->start_keep_alive_timer();
session->receive_task_completed(cf_id);
@@ -181,14 +187,15 @@ void stream_session::init_messaging_service_handler() {
});
});
#endif
ms().register_complete_message([] (UUID plan_id, inet_address from, inet_address connecting, unsigned dst_cpu_id) {
return smp::submit_to(dst_cpu_id, [plan_id, from, connecting, dst_cpu_id] () mutable {
sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE, from={}, connecting={}, dst_cpu_id={}", plan_id, from, connecting, dst_cpu_id);
ms().register_complete_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id) {
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(dst_cpu_id, [plan_id, from, dst_cpu_id] () mutable {
sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE, from={}, dst_cpu_id={}", plan_id, from, dst_cpu_id);
auto f = get_stream_result_future(plan_id);
if (f) {
auto coordinator = f->get_coordinator();
assert(coordinator);
auto session = coordinator->get_or_create_next_session(from, from);
auto session = coordinator->get_or_create_next_session(from);
assert(session);
session->start_keep_alive_timer();
session->complete();
@@ -220,9 +227,8 @@ distributed<database>* stream_session::_db;
stream_session::stream_session() = default;
stream_session::stream_session(inet_address peer_, inet_address connecting_, int index_, bool keep_ss_table_level_)
stream_session::stream_session(inet_address peer_, int index_, bool keep_ss_table_level_)
: peer(peer_)
, connecting(connecting_)
, _index(index_)
, _keep_ss_table_level(keep_ss_table_level_) {
//this.metrics = StreamingMetrics.get(connecting);
@@ -282,7 +288,7 @@ future<> stream_session::test(distributed<cql3::query_processor>& qp) {
auto ks = sstring("ks");
std::vector<query::range<token>> ranges = {query::range<token>::make_open_ended_both_sides()};
std::vector<sstring> cfs{tb};
sp.transfer_ranges(to, to, ks, ranges, cfs).request_ranges(to, to, ks, ranges, cfs).execute().then_wrapped([] (auto&& f) {
sp.transfer_ranges(to, ks, ranges, cfs).request_ranges(to, ks, ranges, cfs).execute().then_wrapped([] (auto&& f) {
try {
auto state = f.get0();
sslog.debug("plan_id={} description={} DONE", state.plan_id, state.description);
@@ -308,9 +314,8 @@ future<> stream_session::initiate() {
messages::stream_init_message msg(from, session_index(), plan_id(), description(),
is_for_outgoing, keep_ss_table_level());
auto id = shard_id{this->peer, 0};
this->src_cpu_id = engine().cpu_id();
sslog.debug("[Stream #{}] SEND STREAM_INIT_MESSAGE to {}", plan_id(), id);
return ms().send_stream_init_message(std::move(id), std::move(msg), this->src_cpu_id).then_wrapped([this, id] (auto&& f) {
return ms().send_stream_init_message(std::move(id), std::move(msg)).then_wrapped([this, id] (auto&& f) {
try {
unsigned dst_cpu_id = f.get0();
this->start_keep_alive_timer();
@@ -334,10 +339,9 @@ future<> stream_session::on_initialization_complete() {
prepare.summaries.emplace_back(x.second.get_summary());
}
auto id = shard_id{this->peer, this->dst_cpu_id};
auto from = utils::fb_utilities::get_broadcast_address();
sslog.debug("[Stream #{}] SEND PREPARE_MESSAGE to {}", plan_id(), id);
return ms().send_prepare_message(id, std::move(prepare), plan_id(), from,
this->connecting, this->src_cpu_id, this->dst_cpu_id).then_wrapped([this, id] (auto&& f) {
return ms().send_prepare_message(id, std::move(prepare), plan_id(),
this->dst_cpu_id).then_wrapped([this, id] (auto&& f) {
try {
auto msg = f.get0();
this->start_keep_alive_timer();
@@ -351,10 +355,10 @@ future<> stream_session::on_initialization_complete() {
throw;
}
return make_ready_future<>();
}).then([this, id, from] {
}).then([this, id] {
auto plan_id = this->plan_id();
sslog.debug("[Stream #{}] SEND PREPARE_DONE_MESSAGE to {}", plan_id, id);
return ms().send_prepare_done_message(id, plan_id, from, this->connecting, this->dst_cpu_id).then([this] {
return ms().send_prepare_done_message(id, plan_id, this->dst_cpu_id).then([this] {
this->start_keep_alive_timer();
}).handle_exception([id, plan_id] (auto ep) {
sslog.error("[Stream #{}] Fail to send PREPARE_DONE_MESSAGE to {}, {}", plan_id, id, ep);
@@ -503,7 +507,7 @@ session_info stream_session::get_session_info() {
for (auto& transfer : _transfers) {
transfer_summaries.emplace_back(transfer.second.get_summary());
}
return session_info(peer, _index, connecting, std::move(receiving_summaries), std::move(transfer_summaries), _state);
return session_info(peer, _index, std::move(receiving_summaries), std::move(transfer_summaries), _state);
}
void stream_session::receive_task_completed(UUID cf_id) {
@@ -521,11 +525,10 @@ void stream_session::transfer_task_completed(UUID cf_id) {
}
void stream_session::send_complete_message() {
auto from = utils::fb_utilities::get_broadcast_address();
auto id = shard_id{this->peer, this->dst_cpu_id};
auto plan_id = this->plan_id();
sslog.debug("[Stream #{}] SEND COMPLETE_MESSAGE to {}", plan_id, id);
this->ms().send_complete_message(id, plan_id, from, this->connecting, this->dst_cpu_id).then([session = shared_from_this(), plan_id] {
this->ms().send_complete_message(id, plan_id, this->dst_cpu_id).then([session = shared_from_this(), plan_id] {
sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE Reply", plan_id);
}).handle_exception([plan_id] (auto ep) {
sslog.warn("[Stream #{}] ERROR COMPLETE_MESSAGE Reply: {}", plan_id, ep);
@@ -686,6 +689,7 @@ void stream_session::start() {
close_session(stream_session_state::COMPLETE);
return;
}
auto connecting = net::get_local_messaging_service().get_preferred_ip(peer);
if (peer == connecting) {
sslog.info("[Stream #{}] Starting streaming to {}", plan_id(), peer);
} else {

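The handler rewrites in this file follow one pattern: instead of every verb carrying the sender address and source cpu as explicit RPC arguments, the server stashes them as per-connection auxiliary data and each handler retrieves what it needs from `rpc::client_info`. A simplified model of that lookup (a `std::any` map stand-in, not Seastar's actual class):

```cpp
#include <any>
#include <map>
#include <string>

struct client_info {
    // Per-connection auxiliary data, populated once at connection setup.
    std::map<std::string, std::any> aux;

    template <typename T>
    const T& retrieve_auxiliary(const std::string& key) const {
        return std::any_cast<const T&>(aux.at(key));
    }
};

// Analogous to the handlers above, which do:
//   const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
std::string handle_prepare_done(const client_info& cinfo) {
    const auto& from = cinfo.retrieve_auxiliary<std::string>("baddr");
    return "PREPARE_DONE from " + from;
}
```

Because the address is bound to the connection rather than repeated in each message, the payloads shrink and a client cannot claim a different sender address per message.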
View File

@@ -162,9 +162,6 @@ public:
* Each {@code StreamSession} is identified by this InetAddress which is broadcast address of the node streaming.
*/
inet_address peer;
/** Actual connecting address. Can be the same as {@linkplain #peer}. */
inet_address connecting;
unsigned src_cpu_id;
unsigned dst_cpu_id;
private:
int _index;
@@ -204,7 +201,7 @@ public:
* @param connecting Actual connecting address
* @param factory is used for establishing connection
*/
stream_session(inet_address peer_, inet_address connecting_, int index_, bool keep_ss_table_level_);
stream_session(inet_address peer_, int index_, bool keep_ss_table_level_);
~stream_session();
UUID plan_id();

View File

@@ -149,10 +149,9 @@ void stream_transfer_task::complete(int sequence_number) {
// all file sent, notify session this task is complete.
if (signal_complete) {
using shard_id = net::messaging_service::shard_id;
auto from = utils::fb_utilities::get_broadcast_address();
auto id = shard_id{session->peer, session->dst_cpu_id};
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, seq={}", plan_id, id, sequence_number);
session->ms().send_stream_mutation_done(id, plan_id, std::move(_ranges), this->cf_id, from, session->connecting, session->dst_cpu_id).then_wrapped([this, id, plan_id] (auto&& f) {
session->ms().send_stream_mutation_done(id, plan_id, std::move(_ranges), this->cf_id, session->dst_cpu_id).then_wrapped([this, id, plan_id] (auto&& f) {
try {
f.get();
session->start_keep_alive_timer();

View File

@@ -321,7 +321,7 @@ SEASTAR_TEST_CASE(test_commitlog_reader){
}
static future<> corrupt_segment(sstring seg, uint64_t off, uint32_t value) {
return engine().open_file_dma(seg, open_flags::rw).then([off, value](file f) {
return open_file_dma(seg, open_flags::rw).then([off, value](file f) {
size_t size = align_up<size_t>(off, 4096);
return do_with(std::move(f), [size, off, value](file& f) {
return f.dma_read_exactly<char>(0, size).then([&f, off, value](auto buf) {

View File

@@ -1997,4 +1997,22 @@ SEASTAR_TEST_CASE(test_result_order) {
});
}
SEASTAR_TEST_CASE(test_frozen_collections) {
return do_with_cql_env([] (auto& e) {
auto set_of_ints = set_type_impl::get_instance(int32_type, false);
auto list_of_ints = list_type_impl::get_instance(int32_type, false);
auto frozen_map_of_set_and_list = map_type_impl::get_instance(set_of_ints, list_of_ints, false);
return e.execute_cql("CREATE TABLE tfc (a int, b int, c frozen<map<set<int>, list<int>>> static, d int, PRIMARY KEY (a, b));").discard_result().then([&e] {
return e.execute_cql("INSERT INTO tfc (a, b, c, d) VALUES (0, 0, {}, 0);").discard_result();
}).then([&e] {
return e.execute_cql("SELECT * FROM tfc;");
}).then([&e, frozen_map_of_set_and_list] (auto msg) {
map_type_impl::mutation_view empty_mv{};
assert_that(msg).is_rows().with_rows({
{ int32_type->decompose(0), int32_type->decompose(0), frozen_map_of_set_and_list->to_value(empty_mv, serialization_format::internal()), int32_type->decompose(0) },
});
});
});
}

View File

@@ -92,22 +92,6 @@ static future<> tst_init_ms_fd_gossiper(sstring listen_address, uint16_t port, d
}
// END TODO
future<> init_once(shared_ptr<distributed<database>> db) {
static bool done = false;
if (!done) {
done = true;
// FIXME: we leak db, since we're initializing the global storage_service with it.
new shared_ptr<distributed<database>>(db);
return tst_init_storage_service(*db).then([] {
return tst_init_ms_fd_gossiper("127.0.0.1", 7000, db::config::seed_provider_type());
}).then([] {
return db::system_keyspace::init_local_cache();
});
} else {
return make_ready_future();
}
}
class single_node_cql_env : public cql_test_env {
public:
static auto constexpr ks_name = "ks";
@@ -287,43 +271,49 @@ public:
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
locator::i_endpoint_snitch::create_snitch("SimpleSnitch").get();
auto db = ::make_shared<distributed<database>>();
init_once(db).get();
auto cfg = make_lw_shared<db::config>();
_data_dir = make_lw_shared<tmpdir>();
cfg->data_file_directories() = { _data_dir->path };
cfg->commitlog_directory() = _data_dir->path + "/commitlog.dir";
cfg->num_tokens() = 256;
cfg->ring_delay_ms() = 500;
boost::filesystem::create_directories((_data_dir->path + "/system").c_str());
boost::filesystem::create_directories(cfg->commitlog_directory().c_str());
tst_init_storage_service(*db).get();
db->start(std::move(*cfg)).get();
db->invoke_on_all([this] (database& db) {
return db.init_system_keyspace();
}).get();
auto& ks = db->local().find_keyspace(db::system_keyspace::NAME);
parallel_for_each(ks.metadata()->cf_meta_data(), [&ks] (auto& pair) {
auto cfm = pair.second;
return ks.make_directory_for_column_family(cfm->cf_name(), cfm->id());
}).get();
tst_init_ms_fd_gossiper("127.0.0.1", 7000, db::config::seed_provider_type()).get();
distributed<service::storage_proxy>& proxy = service::get_storage_proxy();
distributed<service::migration_manager>& mm = service::get_migration_manager();
distributed<db::batchlog_manager>& bm = db::get_batchlog_manager();
auto qp = ::make_shared<distributed<cql3::query_processor>>();
proxy.start(std::ref(*db)).get();
mm.start().get();
auto qp = ::make_shared<distributed<cql3::query_processor>>();
qp->start(std::ref(proxy), std::ref(*db)).get();
db::system_keyspace::minimal_setup(*db, *qp);
auto& ss = service::get_local_storage_service();
static bool storage_service_started = false;
if (!storage_service_started) {
storage_service_started = true;
ss.init_server().get();
}
bm.start(std::ref(*qp)).get();
db->invoke_on_all([this] (database& db) {
return db.init_system_keyspace();
}).get();
auto& ks = db->local().find_keyspace(db::system_keyspace::NAME);
parallel_for_each(ks.metadata()->cf_meta_data(), [&ks] (auto& pair) {
auto cfm = pair.second;
return ks.make_directory_for_column_family(cfm->cf_name(), cfm->id());
}).get();
// In main.cc we call db::system_keyspace::setup which calls
// minimal_setup and init_local_cache
db::system_keyspace::minimal_setup(*db, *qp);
db::system_keyspace::init_local_cache().get();
service::get_local_storage_service().init_server().get();
_core_local.start().get();
_db = std::move(db);
_qp = std::move(qp);
@@ -336,12 +326,23 @@ public:
virtual future<> stop() override {
return seastar::async([this] {
_core_local.stop().get();
db::system_keyspace::deinit_local_cache().get();
db::get_batchlog_manager().stop().get();
_qp->stop().get();
db::qctx = {};
service::get_migration_manager().stop().get();
service::get_storage_proxy().stop().get();
gms::get_gossiper().stop().get();
gms::get_failure_detector().stop().get();
net::get_messaging_service().stop().get();
_db->stop().get();
service::get_storage_service().stop().get();
service::get_pending_range_calculator_service().stop().get();
locator::i_endpoint_snitch::stop_snitch().get();
bool old_active = true;
assert(active.compare_exchange_strong(old_active, false));

View File

@@ -29,19 +29,26 @@
#include "gms/failure_detector.hh"
#include "gms/gossiper.hh"
#include "core/reactor.hh"
#include "service/pending_range_calculator_service.hh"
#include "service/storage_service.hh"
#include "core/distributed.hh"
#include "database.hh"
SEASTAR_TEST_CASE(test_boot_shutdown){
return net::get_messaging_service().start(gms::inet_address("127.0.0.1")).then( [] () {
return gms::get_failure_detector().start().then([] {
return gms::get_gossiper().start().then([] {
return gms::get_gossiper().stop().then( [] (){
return gms::get_failure_detector().stop().then( [] (){
return net::get_messaging_service().stop().then ( [] () {
return make_ready_future<>();
});
});
});
});
});
return seastar::async([] {
distributed<database> db;
service::get_pending_range_calculator_service().start(std::ref(db));
service::get_storage_service().start(std::ref(db)).get();
db.start().get();
net::get_messaging_service().start(gms::inet_address("127.0.0.1")).get();
gms::get_failure_detector().start().get();
gms::get_gossiper().start().get();
gms::get_gossiper().stop().get();
gms::get_failure_detector().stop().get();
net::get_messaging_service().stop().get();
db.stop().get();
service::get_storage_service().stop().get();
service::get_pending_range_calculator_service().stop().get();
});
}

View File

@@ -847,3 +847,27 @@ SEASTAR_TEST_CASE(test_large_blobs) {
});
}
SEASTAR_TEST_CASE(test_tombstone_purge) {
auto builder = schema_builder("tests", "tombstone_purge")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(0);
auto s = builder.build();
auto key = partition_key::from_exploded(*s, {to_bytes("key1")});
const column_definition& col = *s->get_column_definition("value");
mutation m(key, s);
m.set_clustered_cell(clustering_key::make_empty(*s), col, make_atomic_cell(int32_type->decompose(1)));
tombstone tomb(api::new_timestamp(), gc_clock::now() - std::chrono::seconds(1));
m.partition().apply(tomb);
BOOST_REQUIRE(!m.partition().empty());
m.partition().compact_for_compaction(*s, api::max_timestamp, gc_clock::now());
// Check that row was covered by tombstone.
BOOST_REQUIRE(m.partition().empty());
// Check that tombstone was purged after compact_for_compaction().
BOOST_REQUIRE(!m.partition().partition_tombstone());
return make_ready_future<>();
}

View File

@@ -90,7 +90,7 @@ SEASTAR_TEST_CASE(datafile_generation_01) {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 1, big, sstable::component_type::Data);
return sst->write_components(*mt).then([mt, sst, s, fname] {
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -156,7 +156,7 @@ SEASTAR_TEST_CASE(datafile_generation_02) {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 2, big, sstable::component_type::Data);
return sst->write_components(*mt).then([mt, sst, s, fname] {
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -224,7 +224,7 @@ SEASTAR_TEST_CASE(datafile_generation_03) {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 3, big, sstable::component_type::Data);
return sst->write_components(*mt).then([mt, sst, s, fname] {
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -295,7 +295,7 @@ SEASTAR_TEST_CASE(datafile_generation_04) {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 4, big, sstable::component_type::Data);
return sst->write_components(*mt).then([mt, sst, s, fname] {
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -366,7 +366,7 @@ SEASTAR_TEST_CASE(datafile_generation_05) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 5, big, sstable::component_type::Data);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -438,7 +438,7 @@ SEASTAR_TEST_CASE(datafile_generation_06) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 6, big, sstable::component_type::Data);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -514,7 +514,7 @@ SEASTAR_TEST_CASE(datafile_generation_07) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 7, big, sstable::component_type::Index);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -571,7 +571,7 @@ SEASTAR_TEST_CASE(datafile_generation_08) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 8, big, sstable::component_type::Summary);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -680,7 +680,7 @@ SEASTAR_TEST_CASE(datafile_generation_10) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 10, big, sstable::component_type::Data);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -691,7 +691,7 @@ SEASTAR_TEST_CASE(datafile_generation_10) {
f.close().finally([f]{});
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 10, big, sstable::component_type::CRC);
return engine().open_file_dma(fname, open_flags::ro).then([adler] (file f) {
return open_file_dma(fname, open_flags::ro).then([adler] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -713,7 +713,7 @@ SEASTAR_TEST_CASE(datafile_generation_10) {
});
}).then([adler] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 10, big, sstable::component_type::Digest);
return engine().open_file_dma(fname, open_flags::ro).then([adler] (file f) {
return open_file_dma(fname, open_flags::ro).then([adler] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -1456,7 +1456,7 @@ SEASTAR_TEST_CASE(datafile_generation_40) {
return sst->write_components(*mt).then([mt, sst, s] {
auto fname = sstable::filename("tests/sstables/tests-temporary", "ks", "cf", la, 40, big, sstable::component_type::Data);
return engine().open_file_dma(fname, open_flags::ro).then([] (file f) {
return open_file_dma(fname, open_flags::ro).then([] (file f) {
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
auto fut = f.dma_read(0, bufptr.get(), 4096);
@@ -2052,3 +2052,107 @@ SEASTAR_TEST_CASE(check_read_indexes) {
});
});
}
SEASTAR_TEST_CASE(tombstone_purge_test) {
// In a column family with gc_grace_seconds set to 0, check that a tombstone
// is purged after compaction.
auto builder = schema_builder("tests", "tombstone_purge")
.with_column("id", utf8_type, column_kind::partition_key)
.with_column("value", int32_type);
builder.set_gc_grace_seconds(0);
auto s = builder.build();
// Create a memtable containing two partitions, alpha and beta.
auto mt1 = make_lw_shared<memtable>(s);
auto insert_data = [&mt1, s] (sstring key_value) {
auto key = partition_key::from_exploded(*s, {to_bytes(key_value)});
mutation m(key, s);
const column_definition& col = *s->get_column_definition("value");
m.set_clustered_cell(clustering_key::make_empty(*s), col, make_atomic_cell(int32_type->decompose(1)));
mt1->apply(std::move(m));
};
insert_data("alpha");
insert_data("beta");
// Create a second memtable containing one tombstone for the partition alpha.
auto mt2 = make_lw_shared<memtable>(s);
auto key = partition_key::from_exploded(*s, {to_bytes("alpha")});
mutation m(key, s);
// gc_clock isn't very precise and tombstone's deletion time has to be lower
// than gc_before for it to be purged. So let's subtract 1 second from now().
tombstone tomb(api::new_timestamp(), gc_clock::now() - std::chrono::seconds(1));
m.partition().apply(tomb);
mt2->apply(std::move(m));
auto memtables = make_lw_shared<std::vector<lw_shared_ptr<memtable>>>();
memtables->push_back(std::move(mt1));
memtables->push_back(std::move(mt2));
auto tmp = make_lw_shared<tmpdir>();
auto gen = make_lw_shared<unsigned>(1);
auto sstables = make_lw_shared<std::vector<shared_sstable>>();
return do_for_each(*memtables, [tmp, gen, sstables, memtables] (lw_shared_ptr<memtable>& mt) {
auto sst = make_lw_shared<sstable>("ks", "cf", tmp->path, (*gen)++, la, big);
return sst->write_components(*mt).then([sstables, sst] {
return sst->open_data().then([sstables, sst] {
sstables->push_back(sst);
});
});
}).then([s, sstables] {
BOOST_REQUIRE(sstables->size() == 2);
// Validate first generated sstable
auto sst = (*sstables)[0];
BOOST_REQUIRE(sst->generation() == 1);
auto reader = make_lw_shared(sstable_reader(sst, s));
return (*reader)().then([s, reader] (mutation_opt m) {
BOOST_REQUIRE(m);
auto beta = partition_key::from_exploded(*s, {to_bytes("beta")});
BOOST_REQUIRE(m->key().equal(*s, beta));
return (*reader)();
}).then([s, reader] (mutation_opt m) {
BOOST_REQUIRE(m);
auto alpha = partition_key::from_exploded(*s, {to_bytes("alpha")});
BOOST_REQUIRE(m->key().equal(*s, alpha));
return (*reader)();
}).then([reader] (mutation_opt m) {
BOOST_REQUIRE(!m);
});
}).then([s, sstables, tomb] {
// Validate second generated sstable
auto sst = (*sstables)[1];
BOOST_REQUIRE(sst->generation() == 2);
auto reader = make_lw_shared(sstable_reader(sst, s));
return (*reader)().then([s, reader, tomb] (mutation_opt m) {
BOOST_REQUIRE(m);
auto alpha = partition_key::from_exploded(*s, {to_bytes("alpha")});
BOOST_REQUIRE(m->key().equal(*s, alpha));
BOOST_REQUIRE(m->partition().partition_tombstone() == tomb);
return (*reader)();
}).then([reader] (mutation_opt m) {
BOOST_REQUIRE(!m);
});
}).then([s, tmp, sstables] {
auto cm = make_lw_shared<compaction_manager>();
auto cf = make_lw_shared<column_family>(s, column_family::config(), column_family::no_commitlog(), *cm);
auto create = [tmp] {
return make_lw_shared<sstable>("ks", "cf", tmp->path, 3, la, big);
};
return sstables::compact_sstables(*sstables, *cf, create, std::numeric_limits<uint64_t>::max(), 0).then([s, tmp, sstables, cf, cm] {
return open_sstable(tmp->path, 3).then([s] (shared_sstable sst) {
auto reader = make_lw_shared(sstable_reader(sst, s)); // reader holds sst and s alive.
return (*reader)().then([s, reader] (mutation_opt m) {
BOOST_REQUIRE(m);
auto beta = partition_key::from_exploded(*s, {to_bytes("beta")});
BOOST_REQUIRE(m->key().equal(*s, beta));
BOOST_REQUIRE(!m->partition().partition_tombstone());
return (*reader)();
}).then([reader] (mutation_opt m) {
BOOST_REQUIRE(!m);
});
});
});
}).then([s, tmp] {});
}

View File

@@ -187,7 +187,7 @@ static future<> write_sst_info(sstring dir, unsigned long generation) {
using bufptr_t = std::unique_ptr<char [], free_deleter>;
static future<std::pair<bufptr_t, size_t>> read_file(sstring file_path)
{
return engine().open_file_dma(file_path, open_flags::rw).then([] (file f) {
return open_file_dma(file_path, open_flags::rw).then([] (file f) {
return f.size().then([f] (auto size) mutable {
auto aligned_size = align_up(size, 512UL);
auto buf = allocate_aligned_buffer<char>(aligned_size, 512UL);
@@ -803,7 +803,7 @@ SEASTAR_TEST_CASE(wrong_range) {
static future<>
test_sstable_exists(sstring dir, unsigned long generation, bool exists) {
auto file_path = sstable::filename(dir, "ks", "cf", la, generation, big, sstable::component_type::Data);
return engine().open_file_dma(file_path, open_flags::ro).then_wrapped([exists] (future<file> f) {
return open_file_dma(file_path, open_flags::ro).then_wrapped([exists] (future<file> f) {
if (exists) {
BOOST_CHECK_NO_THROW(f.get0());
} else {

View File

@@ -577,3 +577,78 @@ BOOST_AUTO_TEST_CASE(test_reversed_type_value_compatibility) {
BOOST_REQUIRE(rb->is_value_compatible_with(*rs));
BOOST_REQUIRE(rb->is_value_compatible_with(*utf8_type));
}
BOOST_AUTO_TEST_CASE(test_collection_type_compatibility) {
auto m__bi = map_type_impl::get_instance(bytes_type, int32_type, true);
auto mf_bi = map_type_impl::get_instance(bytes_type, int32_type, false);
auto m__bb = map_type_impl::get_instance(bytes_type, bytes_type, true);
auto mf_bb = map_type_impl::get_instance(bytes_type, bytes_type, false);
auto m__ii = map_type_impl::get_instance(int32_type, int32_type, true);
auto mf_ii = map_type_impl::get_instance(int32_type, int32_type, false);
auto m__ib = map_type_impl::get_instance(int32_type, bytes_type, true);
auto mf_ib = map_type_impl::get_instance(int32_type, bytes_type, false);
auto s__i = set_type_impl::get_instance(int32_type, true);
auto sf_i = set_type_impl::get_instance(int32_type, false);
auto s__b = set_type_impl::get_instance(bytes_type, true);
auto sf_b = set_type_impl::get_instance(bytes_type, false);
auto l__i = list_type_impl::get_instance(int32_type, true);
auto lf_i = list_type_impl::get_instance(int32_type, false);
auto l__b = list_type_impl::get_instance(bytes_type, true);
auto lf_b = list_type_impl::get_instance(bytes_type, false);
static auto msg = [] (const char* m, data_type x, data_type y) -> std::string {
return sprint("%s(%s, %s)", m, x->name(), y->name());
};
// Sort order does not change
auto verify_compat = [] (data_type to, data_type from) {
BOOST_CHECK_MESSAGE(to->is_compatible_with(*from), msg("verify_compat is_compatible", to, from));
// value compatibility is implied by compatibility
BOOST_CHECK_MESSAGE(to->is_value_compatible_with(*from), msg("verify_compat is_value_compatible", to, from));
};
// Sort order may change
auto verify_value_compat = [] (data_type to, data_type from) {
BOOST_CHECK_MESSAGE(!to->is_compatible_with(*from), msg("verify_value_compat !is_compatible", to, from)); // or verify_compat would be used
BOOST_CHECK_MESSAGE(to->is_value_compatible_with(*from), msg("verify_value_compat is_value_compatible", to, from));
};
// Cannot be cast
auto verify_not_compat = [] (data_type to, data_type from) {
BOOST_CHECK_MESSAGE(!to->is_compatible_with(*from), msg("verify_not_compat !is_compatible", to, from));
BOOST_CHECK_MESSAGE(!to->is_value_compatible_with(*from), msg("verify_not_compat !is_value_compatible", to, from));
};
auto cc = verify_compat;
auto vc = verify_value_compat;
auto nc = verify_not_compat;
struct test_case {
void (*verify)(data_type to, data_type from);
data_type to;
data_type from;
};
test_case tests[] = {
{ nc, m__bi, int32_type }, // collection vs. primitive
{ cc, m__bi, m__bi }, // identity
{ nc, m__bi, m__ib }, // key not compatible
{ nc, mf_bi, mf_ib }, // "
{ nc, m__bb, mf_bb }, // frozen vs. unfrozen
{ nc, mf_ii, mf_bb }, // key not compatible
{ nc, mf_ii, mf_ib }, // frozen, and value not compatible
{ cc, m__ib, m__ii }, // unfrozen so values don't need to sort
{ nc, m__ii, m__bb }, // key not compatible
{ nc, m__ii, m__bi }, // key not compatible
{ nc, m__ii, m__ib }, // values not compatible
{ vc, mf_ib, mf_ii }, // values value-compatible but don't sort
{ nc, l__i, s__i }, // different collection kinds
{ nc, s__b, s__i }, // different sorts
{ nc, sf_b, sf_i }, // different sorts
{ nc, sf_i, s__i }, // different temperature
{ nc, sf_i, sf_b }, // elements not compatible
{ cc, l__b, l__i }, // unfrozen so values don't need to sort
{ vc, lf_b, lf_i }, // values don't sort, so only value-compatible
{ nc, lf_i, l__i }, // different temperature
{ nc, lf_i, lf_b }, // elements not compatible
};
for (auto&& tc : tests) {
tc.verify(tc.to, tc.from);
}
}

View File

@@ -212,6 +212,8 @@ private:
cql_server::cql_server(distributed<service::storage_proxy>& proxy, distributed<cql3::query_processor>& qp, cql_load_balance lb)
: _proxy(proxy)
, _query_processor(qp)
, _max_request_size(memory::stats().total_memory() / 10)
, _memory_available(_max_request_size)
, _collectd_registrations(std::make_unique<scollectd::registrations>(setup_collectd()))
, _lb(lb)
{
@@ -236,20 +238,34 @@ cql_server::setup_collectd() {
scollectd::type_instance_id("transport", scollectd::per_cpu_plugin_instance,
"queue_length", "requests_serving"),
scollectd::make_typed(scollectd::data_type::GAUGE, _requests_serving)),
scollectd::add_polled_metric(
scollectd::type_instance_id("transport", scollectd::per_cpu_plugin_instance,
"queue_length", "requests_blocked_memory"),
scollectd::make_typed(scollectd::data_type::GAUGE, [this] { return _memory_available.waiters(); })),
};
}
future<> cql_server::stop() {
_stopping = true;
size_t nr = 0;
size_t nr_total = _listeners.size();
logger.debug("cql_server: abort accept nr_total={}", nr_total);
for (auto&& l : _listeners) {
l.abort_accept();
logger.debug("cql_server: abort accept {} out of {} done", ++nr, nr_total);
}
for (auto&& c : _connections_list) {
c.shutdown();
}
service::get_local_storage_service().unregister_subscriber(_notifier.get());
service::get_local_migration_manager().unregister_listener(_notifier.get());
return std::move(_stopped);
auto nr_conn = make_lw_shared<size_t>(0);
auto nr_conn_total = _connections_list.size();
logger.debug("cql_server: shutdown connection nr_total={}", nr_conn_total);
return parallel_for_each(_connections_list.begin(), _connections_list.end(), [nr_conn, nr_conn_total] (auto&& c) {
return c.shutdown().then([nr_conn, nr_conn_total] {
logger.debug("cql_server: shutdown connection {} out of {} done", ++(*nr_conn), nr_conn_total);
});
}).then([this] {
service::get_local_storage_service().unregister_subscriber(_notifier.get());
service::get_local_migration_manager().unregister_listener(_notifier.get());
return std::move(_stopped);
});
}
future<>
@@ -490,8 +506,16 @@ future<> cql_server::connection::process_request() {
auto op = f.opcode;
auto stream = f.stream;
auto mem_estimate = f.length * 2 + 8000; // Allow for extra copies and bookkeeping
return _read_buf.read_exactly(f.length).then([this, op, stream] (temporary_buffer<char> buf) {
if (mem_estimate > _server._max_request_size) {
throw exceptions::invalid_request_exception(sprint(
"request size too large (frame size %d; estimate %d; allowed %d)",
f.length, mem_estimate, _server._max_request_size));
}
return with_semaphore(_server._memory_available, mem_estimate, [this, length = f.length, op, stream] {
return _read_buf.read_exactly(length).then([this, op, stream] (temporary_buffer<char> buf) {
++_server._requests_served;
++_server._requests_serving;
@@ -514,6 +538,7 @@ future<> cql_server::connection::process_request() {
});
return make_ready_future<>();
});
});
});
}

View File

@@ -27,6 +27,7 @@
#include "service/storage_proxy.hh"
#include "cql3/query_processor.hh"
#include "core/distributed.hh"
#include <seastar/core/semaphore.hh>
#include <memory>
#include <boost/intrusive/list.hpp>
#include <seastar/net/tls.hh>
@@ -92,6 +93,8 @@ private:
std::vector<server_socket> _listeners;
distributed<service::storage_proxy>& _proxy;
distributed<cql3::query_processor>& _query_processor;
size_t _max_request_size;
semaphore _memory_available;
std::unique_ptr<scollectd::registrations> _collectd_registrations;
std::unique_ptr<event_notifier> _notifier;
private:

View File

@@ -1359,8 +1359,46 @@ collection_type_impl::serialize_for_native_protocol(std::vector<atomic_cell> cel
bool
collection_type_impl::is_compatible_with(const abstract_type& previous) const {
// FIXME: implement
abort();
if (this == &previous) {
return true;
}
if (!previous.is_collection()) {
return false;
}
auto& cprev = static_cast<const collection_type_impl&>(previous);
if (&_kind != &cprev._kind) {
return false;
}
if (is_multi_cell() != cprev.is_multi_cell()) {
return false;
}
if (!is_multi_cell()) {
return is_compatible_with_frozen(cprev);
}
if (!name_comparator()->is_compatible_with(*cprev.name_comparator())) {
return false;
}
// the value comparator is only used for Cell values, so sorting doesn't matter
return value_comparator()->is_value_compatible_with(*cprev.value_comparator());
}
bool
collection_type_impl::is_value_compatible_with_internal(const abstract_type& previous) const {
// for multi-cell collections, compatibility and value-compatibility are the same
if (is_multi_cell() || previous.is_multi_cell()) {
return is_compatible_with(previous);
}
if (!previous.is_collection()) {
return false;
}
auto& cprev = static_cast<const collection_type_impl&>(previous);
if (&_kind != &cprev._kind) {
return false;
}
return is_value_compatible_with_frozen(cprev);
}
shared_ptr<cql3::cql3_type>

View File

@@ -700,6 +700,7 @@ public:
virtual std::vector<bytes> serialized_values(std::vector<atomic_cell> cells) const = 0;
bytes serialize_for_native_protocol(std::vector<atomic_cell> cells, int version) const;
virtual bool is_compatible_with(const abstract_type& previous) const override;
virtual bool is_value_compatible_with_internal(const abstract_type& other) const override;
virtual bool is_compatible_with_frozen(const collection_type_impl& previous) const = 0;
virtual bool is_value_compatible_with_frozen(const collection_type_impl& previous) const = 0;
virtual shared_ptr<cql3::cql3_type> as_cql3_type() const override;