Compare commits

...

163 Commits

Author SHA1 Message Date
Avi Kivity
123eb04136 lsa: disable constant_time_size in binomial_heap implementation
It corrupts the heap on boost < 1.60, and is not needed.

Fixes #698.
2015-12-29 15:12:30 +02:00
Takuya ASADA
37dfe9cfc6 dist: host gcc-5.1.1-4.fc22.src.rpm on our S3 account, since Fedora mirror deleted it
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-17 12:53:19 +02:00
Pekka Enberg
43d3c5f354 release: prepare for 0.13.2 2015-12-16 11:46:14 +02:00
Avi Kivity
59cd785778 Merge scylla-seastar branch-0.13
* seastar d40453b...8a76d06 (3):
  > memory: be less strict about NUMA bindings
  > reactor: let the resource code specify the default memory reserve
  > resource: reserve even more memory when hwloc is compiled in
2015-12-09 18:26:22 +02:00
Avi Kivity
bb24d4324f Redirect seastar submodule to scylla-seastar repository
This allows having a private scylla fork for seastar, for the stable
branches.
2015-12-09 18:23:47 +02:00
Pekka Enberg
eb7ae39f40 release: prepare for 0.13.1
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-12-09 14:19:43 +02:00
Asias He
4281b48472 gossip: Fix endless marking a node down
Commit 56df32ba56 (gossip: Mark node as
dead even if already left) missed a node liveness check.

Fix it up.

Before: (mark a node down multiple times)

[Tue Dec  8 12:16:33 2015] INFO  [shard 0] gossip - InetAddress 127.0.0.3 is now DOWN
[Tue Dec  8 12:16:33 2015] DEBUG [shard 0] storage_service - endpoint=127.0.0.3 on_dead
[Tue Dec  8 12:16:34 2015] INFO  [shard 0] gossip - InetAddress 127.0.0.3 is now DOWN
[Tue Dec  8 12:16:34 2015] DEBUG [shard 0] storage_service - endpoint=127.0.0.3 on_dead
[Tue Dec  8 12:16:35 2015] INFO  [shard 0] gossip - InetAddress 127.0.0.3 is now DOWN
[Tue Dec  8 12:16:35 2015] DEBUG [shard 0] storage_service - endpoint=127.0.0.3 on_dead
[Tue Dec  8 12:16:36 2015] INFO  [shard 0] gossip - InetAddress 127.0.0.3 is now DOWN
[Tue Dec  8 12:16:36 2015] DEBUG [shard 0] storage_service - endpoint=127.0.0.3 on_dead

After: (mark a node down only one time)

[Tue Dec  8 12:28:36 2015] INFO  [shard 0] gossip - InetAddress 127.0.0.3 is now DOWN
[Tue Dec  8 12:28:36 2015] DEBUG [shard 0] storage_service - endpoint=127.0.0.3 on_dead

(cherry picked from commit 5a65d8bcdd)
2015-12-09 14:09:10 +02:00
Pekka Enberg
fef7375beb release: prepare for 0.13
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-12-08 11:28:15 +02:00
Takuya ASADA
8c98e239d0 dist: use /etc/scylla as SCYLLA_CONF directory on AMI
We don't need to copy /var/lib/scylla/conf onto the RAID anymore; it moved to /etc/scylla.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-08 11:09:12 +02:00
Avi Kivity
098136f4ab Merge "Convert serialization of query::result to use db::serializer<>" from Tomasz
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
2015-12-07 16:53:34 +02:00
Amnon Heiman
2086c651ba column_family: get_snapshot_details should return empty map for no snapshots
If there is no snapshot directory for the specific column family,
get_snapshot_details should return an empty map.

This patch checks that the directory exists before trying to iterate over
it.

Fixes #619

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-07 12:51:04 +01:00
Tomasz Grabiec
b43b5af894 Merge tag 'tgrabiec/make-future-values-nothrow-move-constructible-v3' from seastar-dev.git
Seastar's future<> now requires types to be nothrow move
constructible. This series makes Scylla code comply.
2015-12-07 10:43:18 +01:00
Tomasz Grabiec
95f515a6bd Move seastar submodule head
Scylla changes:
  sstable.cc: Remove file_exists() function which conflicts with seastar's

Amnon Heiman (2):
      reactor: Add file_exists method
      Add a wrapper for file_exists

Avi Kivity (2):
      Merge "Introduce shared_future" from Tomasz
      Merge ""scripts: a few fixes in posix_net_conf.sh" from Vlad

Gleb Natapov (3):
      rpc: not stop client in error state
      avoid allocation in parallel_for_each if there is nothing to do
      memory: fix size_to_idx calculation

Nadav Har'El (1):
      test: fix use-after-free in timertest

Paweł Dziepak (1):
      memory: use size instead of old_size to shrink memory block

Tomasz Grabiec (7):
      file: Mark move constructor as noexcept
      core: future: Add static asserts about type's noexcept guarantees
      core: future: Drop now redundant move_noexcept flag
      core: future_state: Make state getters non-destructive for non-rvalue-refs
      core: future: Make get_available_state() noexcept
      core: Introduce shared_future
      Make json_return_type movable

Vlad Zolotarov (8):
      scripts: posix_net_conf.sh: ban NIC IRQs from being moved by irqbalance
      scripts: posix_net_conf.sh: exclude CPU0 siblings from RPS
      scripts: posix_net_conf.sh: Configure XPS
      scripts: posix_net_conf.sh: Add a new mode for MQ NICs
      scripts: posix_net_conf.sh: increase some backlog sizes
      core: to_sstring(): cleanup
      core: to_sstring_strintf(): always use %g(or %lg) format for floating point values
      core: prevent explicit calls for to_sstring_sprintf()
2015-12-07 10:41:39 +01:00
Glauber Costa
79e70568d7 scylla-setup: do not add discard to the command line
In a recent discussion with the XFS developers, Dave Chinner recommended
that we *not* use discard, but rather issue fstrims explicitly. On machines
like Amazon's c3 class, the situation is made worse by the fact that discard
is not supported by the disk. Contrary to my intuition, adding the discard
mount option in such a situation is *not* a nop and will just create load
for no reason.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-12-07 11:22:27 +02:00
Tomasz Grabiec
934d3f06d1 api: Make histogram reduction work on domain value instead of json objects
Objects extending json_base are not movable, so we won't be able to
pass them via future<>, which will assert that types are nothrow move
constructible.

This problem only affects httpd::utils_json::histogram, which is used
in map-reduce. This patch changes the aggregation to work on the domain
value (utils::ihistogram) instead of json objects.
2015-12-07 09:50:28 +01:00
Tomasz Grabiec
c0ac7b3a73 commitlog: Wrap subscription in a unique_ptr<> to make it nothrow movable
future<> will require nothrow move constructible types.
2015-12-07 09:50:28 +01:00
Tomasz Grabiec
657841922a Mark move constructors noexcept when possible 2015-12-07 09:50:27 +01:00
Tomasz Grabiec
fdc28a73f8 thrift: Make with_cob() handle not nothrow move constructible types 2015-12-07 09:50:27 +01:00
Tomasz Grabiec
538de7222a Introduce noexcept_traits 2015-12-07 09:50:27 +01:00
Tomasz Grabiec
bc23ebcbc3 schema_tables: Replace schema_result::value_type with equivalent movable type
future<> requires, and will assert, that types are nothrow move constructible.
2015-12-07 09:50:27 +01:00
Avi Kivity
91c2af2803 Merge "nodetool removenode fix + cleanup" from Asias 2015-12-07 10:41:51 +02:00
Takuya ASADA
2891291ad1 dist: add swagger-ui and api-doc on ubuntu package
Fixes .deb part of #520

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-07 10:39:59 +02:00
Takuya ASADA
3f0ca277e5 dist: add swagger-ui and api-doc on rpm package
Fixes .rpm part of #520

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-07 10:39:59 +02:00
Vlad Zolotarov
564cb2bcd1 gms::versioned_value: don't use to_sstring_sprintf() directly
Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2015-12-06 12:24:54 +02:00
Raphael S. Carvalho
d435ca7da6 enable more logging for leveled compaction strategy
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-12-06 11:36:50 +02:00
Pekka Enberg
a95a7294ef types: Fix 'varint' type value compatibility check
Fixes #575.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-12-04 13:25:34 +01:00
Glauber Costa
5e8249f062 commitlog: fix bug preventing flushing with default max_size value
The config file expresses this number in MB, while total_memory() gives us
a quantity in bytes. This causes the commitlog not to flush until we reach
really sky-high numbers.

While we need this fix for the short term before we cook another release,
I will note that for the mid/long term, it would be really helpful to stop
representing memory amounts as integers, and use an explicit C++ type for
those. That would have prevented this bug.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-12-04 09:29:19 +02:00
Vlad Zolotarov
cd215fc552 types: map::to_string() - non-empty implementation
Print a map in the form of [(]{ key0 : value0 }[, { keyN : valueN }]*[)]
The map is printed inside () brackets if it's frozen.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2015-12-03 18:46:12 +01:00
Amnon Heiman
54b4f26cb0 API: Change the compaction summary to use an object
In origin, there are two APIs to get the information about the currently
running compactions. Both APIs do the string formatting.

This patch changes the API to have a single API, get_compaction, that
returns a list of summary objects.

The JMX side does the string formatting for the two APIs.

This change gives a better API experience, as it's better documented and
will make it easier to support future format changes in origin.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-03 11:57:37 +02:00
Asias He
dcb9b441ab storage_service: Fix debug build
Starting a non-seed node with a debug build, I saw:

==9844==WARNING: ASan is ignoring requested __asan_handle_no_return:
stack top: 0x7ffdabd73000; bottom 0x7fe309218000; size: 0x001aa2b5b000 (114398965760)
False positive error reports may follow
For details see http://code.google.com/p/address-sanitizer/issues/detail?id=189
DEBUG [shard 0] storage_service - Starting shadow gossip round to check for endpoint collision
=================================================================
==9844==ERROR: AddressSanitizer: stack-buffer-overflow on address 0x7fe309219ad0 at pc 0x00000495a88e bp 0x7fe309219960 sp 0x7fe309219950
WRITE of size 8 at 0x7fe309219ad0 thread T0
    #0 0x495a88d in _Head_base<seastar::async(Func&&, Args&& ...)
       [with Func = service::storage_service::check_for_endpoint_collision()::<lambda()>; Args = {};
       futurize_t<typename std::result_of<typename std::decay<_Tp>::type(std::decay_t<Args>...)>::type> = future<>]::work*>
       /usr/include/c++/5.1.1/tuple:115
    #1 0x495a993 in _Tuple_impl<seastar::async(Func&&, Args&& ...)
       [with Func = service::storage_service::check_for_endpoint_collision()::<lambda()>; Args = {};
       futurize_t<typename std::result_of<typename std::decay<_Tp>::type(std::decay_t<Args>...)>::type> = future<>]::work*,
       std::default_delete<seastar::async(Func&&, Args&& ...) [with Func = service::storage_service::check_for_endpoint_collision()::<lambda()>;
       Args = {}; futurize_t<typename std::result_of<typename std::decay<_Tp>::type(std::decay_t<Args>...)>::type> = future<>]::work>, void>
       /usr/include/c++/5.1.1/tuple:213
    #2 0x495aa73 in tuple<seastar::async(Func&&, Args&& ...)
       [with Func = service::storage_service::check_for_endpoint_collision()::<lambda()>; Args = {};
       futurize_t<typename std::result_of<typename std::decay<_Tp>::type(std::decay_t<Args>...)>::type> = future<>]::work*,
       std::default_delete<seastar::async(Func&&, Args&& ...)
       [with Func = service::storage_service::check_for_endpoint_collision()::<lambda()>; Args = {};
       futurize_t<typename std::result_of<typename std::decay<_Tp>::type(std::decay_t<Args>...)>::type> = future<>]::work>, void>
       /usr/include/c++/5.1.1/tuple:613
    #3 0x495ab82 in unique_ptr /usr/include/c++/5.1.1/bits/unique_ptr.h:206
    ...
    #16 0x4d44c8e in _M_invoke /usr/include/c++/5.1.1/functional:1871
    #17 0x5d2fb7 in std::function<void ()>::operator()() const /usr/include/c++/5.1.1/functional:2271
    #18 0x8a1e70 in seastar::thread_context::main() core/thread.cc:139
    #19 0x8a1d89 in seastar::thread_context::s_main(unsigned int, unsigned int) core/thread.cc:130
    #20 0x7fe311b6cf0f  (/lib64/libc.so.6+0x48f0f)

I'm not sure why this patch helps. Perhaps the exception makes ASan unhappy.

Anyway, this patch makes the debug build work again.

Fixes #613.
2015-12-03 10:42:11 +02:00
Tomasz Grabiec
d64db98943 query: Convert serialization of query::result to use db::serializer<>
That's what we're trying to standardize on.

This patch also fixes an issue with the current query::result::serialize()
not being const-qualified, because it modifies the
buffer. messaging_service did a const cast to work around this, which
is not safe.
2015-12-03 09:19:11 +01:00
Tomasz Grabiec
d4d3a5b620 bytes_ostream: Make size_type and value_type public 2015-12-03 09:19:11 +01:00
Tomasz Grabiec
96d215168e Merge tag 'asias/gossip_start_stop/fix/v1' from seastar-dev.git
Fixes for issues in tests from Asias.
2015-12-03 09:10:55 +01:00
Tomasz Grabiec
f0cfa61968 Relax header dependencies 2015-12-03 09:10:02 +01:00
Tomasz Grabiec
9e0c498425 Merge branch 'dev/amnon/latency_clock_v2'
From Amnon:

After this series, an example run of cfhistograms reports a maximal 0.5s latency,
as it should
2015-12-02 19:58:43 +01:00
Amnon Heiman
1812fe9e70 API: Add the get_version to messaging_service swagger definition file
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-02 14:45:44 +02:00
Amnon Heiman
ae53604ed7 API: Add the get_version implementation to messaging service
This patch adds the implementation of get_version.
After this patch, the following URL will be available:
messaging_service/version?addr=127.0.0.1

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-02 13:29:40 +02:00
Avi Kivity
53e3e79349 Merge "API: Stubing the compaction manager" from Amnon
"This series allows the compaction manager to be used by the nodetool as a stub implementation.

It has two changes:
* Add to the compaction manager API a method that returns a compaction info
object

* Stub all the compaction method so that it will create an unimplemented
warning but will not fail, the API implementation will be reverted when the
work on compaction will be completed."
2015-12-02 13:28:34 +02:00
Takuya ASADA
871bfb1c94 dist: generate correct distribution codename on debian/changelog
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-02 12:38:52 +02:00
Takuya ASADA
b61ea247d2 dist: check supported Ubuntu release
Warn if unsupported release.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-02 12:38:52 +02:00
Takuya ASADA
0c66c25250 dist: fix typo on scylla_prepare
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-12-02 11:30:15 +02:00
Asias He
3004866f59 gossip: Rename start to start_gossiping
So that we have more consistent names, start_gossiping() and
stop_gossiping(), and it will not be confused with get_gossiper().start().
2015-12-02 16:50:34 +08:00
Asias He
5c3951b28a gossip: Get rid of the handler helper 2015-12-02 16:50:34 +08:00
Asias He
7a6ad7aec2 gossip: Fix Assertion `local_is_initialized()' failed
This patch fixes the following cql_query_test failure.

   cql_query_test: scylla/seastar/core/sharded.hh:439:
   Service& seastar::sharded<Service>::local() [with Service =
   gms::gossiper]: Assertion `local_is_initialized()' failed.

The problem is that in gossiper::stop() we call gossip::add_local_application_state(),
which will in turn call gms::get_local_gossiper(). In seastar::sharded::stop we have:

 _instances[engine().cpu_id()].service = nullptr;
 return inst->stop().then([this, inst] {
     return _instances[engine().cpu_id()].freed.get_future();
 });

We set the _instances entry to nullptr before we call the stop method, so
local_is_initialized() asserts when we try to access get_local_gossiper()
again.

To fix, we make the stopping of gossiper explicit. In the shutdown
procedure, we call stop_gossiping() explicitly.

This has two more advantages:

1) The API to stop gossip now calls stop_gossiping() instead of
sharing seastar::sharded's stop method.

2) We can now get rid of the _handler seastar::sharded helper.
2015-12-02 16:50:34 +08:00
Asias He
e22972009b gossip: Make log message in mark_dead stick to cassandra 2015-12-02 14:21:26 +08:00
Asias He
ad30cf0faf failure_detector: Use a standalone logger name
Do not share logger with gossip. Sometimes, it is useful to only see one
of them.
2015-12-02 14:21:26 +08:00
Asias He
eb05dc680d storage_service: Warn about data loss when removing a node
If RF = 1 and one node is down, it is possible that data is lost. Warn
about this in the logger.
2015-12-02 14:21:26 +08:00
Asias He
0fe14e2b4b storage_service: Do not ignore future in remove_node 2015-12-02 14:21:26 +08:00
Asias He
5a7f15ba49 storage_service: Run confirm_replication on cpu zero
storage_service::_replicating_nodes is valid on cpu zero only.
2015-12-02 14:21:26 +08:00
Amnon Heiman
7e79d35f85 Estimated histogram: Clean the add interface
The add interface of the estimated histogram is confusing, as it is not
clear what units are used.

This patch removes the general add method and replaces it with an add_nano
method that takes nanoseconds, and an add method that takes a duration.

To be compatible with origin, nanosecond values are translated to
microseconds.
2015-12-01 15:28:06 +02:00
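
To illustrate the disambiguated interface described above, here is a minimal sketch with assumed names (not the actual Scylla code): the strongly-typed overload delegates to the explicitly-named nanosecond entry point, so a bare number can no longer be passed in the wrong unit.

    #include <chrono>
    #include <cstdint>

    struct estimated_histogram {
        // Explicit unit in the name: a bare number must be nanoseconds.
        void add_nano(int64_t ns) {
            (void)ns; // bucket the value (translated to microseconds for origin compatibility)
        }
        // Strongly typed: any std::chrono duration converts without ambiguity.
        template <typename Rep, typename Period>
        void add(std::chrono::duration<Rep, Period> d) {
            add_nano(std::chrono::duration_cast<std::chrono::nanoseconds>(d).count());
        }
    };
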
Amnon Heiman
61abc85eb3 histogram: Add started counter
This patch adds a started counter that is used to record the number of
operations that were started.

This counter serves two purposes: it is a better indication of when to
sample the data, and it indicates how many operations are pending.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-01 15:28:06 +02:00
Amnon Heiman
88dcf2e935 latency: Switch to steady_clock
The system clock is less suitable than steady_clock for measuring time
differences.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-01 15:28:06 +02:00
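
A generic illustration of the difference (not Scylla code): steady_clock is monotonic, so measured intervals are immune to the NTP slews and wall-clock changes that can make system_clock differences negative or wildly wrong.

    #include <chrono>

    void do_work() {}  // placeholder for the operation being timed

    std::chrono::steady_clock::duration time_it() {
        auto t0 = std::chrono::steady_clock::now();
        do_work();
        return std::chrono::steady_clock::now() - t0;  // always >= 0, monotonic
    }
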
Avi Kivity
f667e05e08 Merge "backport gosisp and storage_service fix" from Asias
"This contains most bug fixes from imported version commit
38847a6bd967e4f41bc7b1fc83629161a2c214dc to c* 2.1.11 for gossip and
storage_service."
2015-12-01 14:42:41 +02:00
Asias He
dc6f2157e7 Update ORIGIN for gossip and storage_service 2015-12-01 19:45:04 +08:00
Amnon Heiman
3674ee2fc1 API: get snapshot size
This patch adds the column family API that returns the snapshot size.
The changes in the swagger definition file follow origin, so the same API
will be used for the metric and the column_family.

The implementation is based on the get_snapshot_details in the
column_family.

Fixes #425.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-12-01 11:41:52 +02:00
Asias He
56df32ba56 gossip: Mark node as dead even if already left
Backport: CASSANDRA-10205

484e645 Mark node as dead even if already left
2015-12-01 17:29:25 +08:00
Asias He
59694a8e43 failure_detector: Print versions for gossip states in gossipinfo
Backport: CASSANDRA-10330

ae4cd69 Print versions for gossip states in gossipinfo

For instance, the version for each state, which can be useful for
diagnosing the reason for any missing states. Also instead of just
omitting the TOKENS state, let's indicate whether the state was actually
present or not.
2015-12-01 17:29:25 +08:00
Asias He
af91a8f31b storage_service: Fix transition from write survey to normal mode
Backport: CASSANDRA-9740

52dbc3f Can't transition from write survey to normal mode
2015-12-01 17:29:25 +08:00
Asias He
2f071d9648 storage_service: Refuse to decommission if not in state NORMAL
Backport: CASSANDRA-8741

5bc56c3 refuse to decomission if not in state NORMAL
2015-12-01 17:29:25 +08:00
Asias He
224db2ba37 failure_detector: Don't mark nodes down before the max local pause interval once paused
Backport: CASSANDRA-9446

7fba3d2 Don't mark nodes down before the max local pause interval once paused
2015-12-01 17:29:25 +08:00
Asias He
51fcc48700 failure_detector: Failure detector detects and ignores local pauses
Backport: CASSANDRA-9183

4012134 Failure detector detects and ignores local pauses
2015-12-01 17:29:25 +08:00
Asias He
1b9e350614 gossip: Do not print node is now part of the cluster during gossip shadow round
With

   Node 1 (Seed node, Port 7000 is opened, 10.184.9.144)
   Node 2 (Port 7000 is opened, 10.184.9.145)
   Node 3 (Port 7000 is blocked by firewall)

On Node 3, we saw the following error, which was very confusing: Node 3
saw Node 1 and Node 2 but complained that it cannot contact any seeds.

The message "Node 10.184.9.144 is now part of the cluster" and friends
are actually messages printed during the gossip shadow round where Node
3 connects to Node 1's port 7000 and Node 1 returns all info it knows to
Node 3, so that Node 3 knows Node 1 and Node 2 and we see the "Node
10.184.9.144/145 is now part of the cluster" message.

However, during the normal gossip round, Node 3 will not mark Node 1 and
Node 2 UP until the seed node initiates a gossip round to Node 3 (note that
port 7000 on Node 3 is blocked in this case). So Node 3 will not mark
Node 1 and Node 2 UP, and we see the "Unable to contact any seeds" error.

[shard 0] storage_service - Loading persisted ring state
[shard 0] gossip - Node 10.184.9.144 is now part of the cluster
[shard 0] gossip - inet_address 10.184.9.144 is now UP
[shard 0] gossip - Node 10.184.9.145 is now part of the cluster
[shard 0] gossip - inet_address 10.184.9.145 is now UP
[shard 0] storage_service - Starting up server gossip
scylla_run[12479]: Start gossiper service ...
[shard 0] storage_service - JOINING: waiting for ring information
[shard 0] storage_service - JOINING: schema complete, ready to bootstrap
[shard 0] storage_service - JOINING: waiting for pending range calculation
[shard 0] storage_service - JOINING: calculation complete, ready to bootstrap
[shard 0] storage_service - JOINING: getting bootstrap token
[shard 0] storage_service - JOINING: sleeping 5000 ms for pending range setup
scylla_run[12479]: Exiting on unhandled exception of type 'std::runtime_error': Unable to contact any seeds!
2015-12-01 17:29:25 +08:00
Asias He
f62a6f234b gossip: Add shutdown gossip state
Backported: CASSANDRA-8336 and CASSANDRA-9871

84b2846 remove redundant state
b2c62bb Add shutdown gossip state to prevent timeouts during rolling restarts
8f9ca07 Cannot replace token does not exist - DN node removed as Fat Client

Fixes:

When X is shut down, X sends a SHUTDOWN message to both Y and Z, but for
some reason only Y receives the message and Z does not. If Z has a
higher gossip version for X than Y has for X, Z will initiate a gossip
round with Y, and Y will mark X alive again.

X ------> Y
 \      /
  \    /
    Z
2015-12-01 17:29:25 +08:00
Gleb Natapov
8c02ad0e9e messaging: log connection dropping event 2015-11-30 19:42:04 +02:00
Avi Kivity
b85f3ad130 Merge "Commit log replay - handle corrupted data silently, as non-fatal"
Fixes: #593

"Changes the parser/replayer to treat data corruption as non-fatal,
skipping as little as possible to get the most data out of a segment,
but keeping track of, and reporting, the amount corrupted.

Replayer handles this and reports any non-fatal errors on replay finish.
Also added tests for corruption cases.

This patch series contains a cleanup-patch for commitlog_tests that was
previously submitted, but got lost."
2015-11-30 19:13:31 +02:00
Gleb Natapov
5b9f3bff7d storage_proxy: simplify error handling by using then/handle_exception
It is cleaner to use then/handle_exception instead of then_wrapped if
the normal and error flows do not share any state.
2015-11-30 17:41:32 +02:00
Gleb Natapov
5484f25091 storage_proxy: remove unneeded continuation
The make_ready_future() around when_all() is no longer needed. It was
added to catch mutate_locally() exceptions, but that is now handled at a
lower level.
2015-11-30 17:41:28 +02:00
Gleb Natapov
cf95c3f681 storage_proxy: introduce unique_response_handler object to prevent write request leaks
If something bad happens between write request handler creation and
request execution, the request handler has to be destroyed. Currently the
code tries to do that explicitly in all places where the request may be
abandoned, but it misses some (at least one). This patch replaces that
approach with a unique_response_handler object that removes the handler
automatically if the request is not executed for some reason.
2015-11-30 17:41:27 +02:00
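
The mechanism is classic RAII around a registration; a minimal sketch of the idea (the registry and id types below are hypothetical stand-ins, not Scylla's actual API):

    #include <cstdint>
    #include <functional>
    #include <unordered_map>
    #include <utility>

    struct handler_registry {
        std::unordered_map<uint64_t, std::function<void()>> handlers;
        void remove(uint64_t id) { handlers.erase(id); }  // drop an abandoned handler
    };

    class unique_response_handler {
        handler_registry& _reg;
        uint64_t _id;  // 0 means "no handler owned"
    public:
        unique_response_handler(handler_registry& reg, uint64_t id)
            : _reg(reg), _id(id) {}
        unique_response_handler(const unique_response_handler&) = delete;
        // Hand ownership off once the request is actually submitted.
        uint64_t release() { return std::exchange(_id, uint64_t(0)); }
        // If anything throws before release(), the handler is removed automatically.
        ~unique_response_handler() { if (_id) { _reg.remove(_id); } }
    };
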
Gleb Natapov
d8afc6014e storage_proxy: catch exception thrown by mutate_locally in mutate verb handler
Also simplify error logging.
2015-11-30 17:41:25 +02:00
Avi Kivity
3c9ded27cc Update scylla-ami submodule
* ami/files/scylla-ami 3f37184...07b7118 (1):
  > Use /etc/scylla as SCYLLA_CONF directory
2015-11-30 16:39:49 +02:00
Takuya ASADA
616903de12 dist: use distribution version of antlr3, on Ubuntu 15.10
Rename antlr3-tool to antlr3 (same as distribution package), and use distribution version if it's available

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-11-30 16:37:36 +02:00
Pekka Enberg
0e8f80b5ee Merge "Relax bootstrapping/leaving/moving nodes check" from Asias 2015-11-30 11:53:07 +02:00
Asias He
2022117234 failure_detector: Enable phi_convict_threshold option
Adjusts the sensitivity of the failure detector on an exponential scale.

Use as:

$ scylla --phi-convict-threshold 9

Defaults to 8.
2015-11-30 11:09:36 +02:00
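
For context: this is the phi-accrual failure detector (Hayashibara et al.), which Cassandra also uses. It turns heartbeat inter-arrival statistics into a suspicion level rather than a boolean:

    phi(t) = -log10( P(a heartbeat arrives later than t) )

A node is convicted once phi exceeds the threshold, so each increment of the threshold means roughly a tenfold lower probability of a false DOWN verdict; that is the "exponential scale" mentioned above.
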
Asias He
db70643fe3 failure_detector: Print application_state properly 2015-11-30 11:08:40 +02:00
Asias He
aaca88a1e7 token_metadata: Add print_pending_ranges for debug print
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-11-30 11:07:42 +02:00
Avi Kivity
2c59e2f81f Merge "Fix race between population and update in row cache" from Tomasz
"Before this change, populations could race with update from flushed
memtable, which might result in cache being populated with older
data. Populations started before the flush are not considering the
memtable nor its sstable.

The fix employed here is to make update wait for populations which
were started before the flushed memtable's sstable was added to the
undrelying data source. All populatinos started after that are
guaranteed to see the new data. The update() call will wait only for
current populating reads to complete, it will not wait for readers to
get advanced by the consumer for instance."
2015-11-30 11:06:23 +02:00
Tomasz Grabiec
8d88ece896 schema_tables: Fix "comment" property not being loaded from storage 2015-11-30 10:57:36 +02:00
Pekka Enberg
a64fa3db03 Merge "range_streamer fix and cleanup" from Asias
Do not use hard-coded values for is_replacing and rangemovement.
2015-11-30 10:47:06 +02:00
Asias He
879a4ad4d3 storage_service: Update pending ranges immediately after update of normal tokens
To avoid a race where the natural endpoints were updated to contain node A,
but A was not yet removed from the pending endpoints.

This fixes the root cause of commit d9d8f87c1 (storage_proxy: filter out
natural endpoints from pending endpoint). This patch alone fixes #539,
but we still want commit d9d8f87c1 to be safe.
2015-11-30 10:20:59 +02:00
Asias He
0af7fb5509 range_streamer: Kill FIXME in use_strict_consistency for consistent_rangemovement 2015-11-30 09:15:42 +08:00
Asias He
f80e3d7859 range_streamer: Simplify multiple_map to map conversion in add_ranges 2015-11-30 09:15:42 +08:00
Asias He
21882f5122 range_streamer: Kill one leftover comment 2015-11-30 09:15:42 +08:00
Asias He
6b258f1247 range_streamer: Kill FIXME for is_replacing 2015-11-30 09:15:42 +08:00
Asias He
aa2b11f21b database: Move is_replacing and get_replace_address to database class
So they can be used outside storage_service.
2015-11-30 09:15:42 +08:00
Asias He
80d1d4d161 storage_service: Relax bootstrapping/leaving/moving nodes check in check_for_endpoint_collision
When other bootstrapping/leaving/moving nodes are found during
bootstrap, instead of throwing immediately, sleep and try again for one
minute, hoping the other nodes will finish the operation soon.

Since we are retrying using the shadow gossip round more than once, we need
to put the gossip state back to shadow round after each shadow round, to
make the shadow round work correctly.

This is useful when starting an empty cluster for testing. E.g,

   $ scylla --listen-address 127.0.0.1
   $ sleep 3
   $ scylla --listen-address 127.0.0.2
   $ sleep 3
   $ scylla --listen-address 127.0.0.3

Without this patch, node 3 will hit the check.

   TIME  STATUS
   -----------------------
   Node  1:
   32:00 Starts
   32:00 In NORMAL status

   Node  2:
   32:03 Starts
   32:04 In BOOT status
   32:10 In NORMAL status

   Node  3:
   32:06 Starts
   32:06 Found node 2 in BOOT status, hit the check, sleep and try again
   32:11 Found node 2 in NORMAL status, can keep going now
   32:12 In BOOT status
   32:18 In NORMAL status
2015-11-30 09:07:57 +08:00
Asias He
8b19373536 storage_service: Relax bootstrapping/leaving/moving nodes check in join_token_ring
When other bootstrapping/leaving/moving nodes are found during
bootstrap, instead of throwing immediately, sleep and try again for one
minute, hoping the other nodes will finish the operation soon.

This is useful when starting an empty cluster for testing. E.g,

   $ scylla --listen-address 127.0.0.1
   $ scylla --listen-address 127.0.0.2
   $ scylla --listen-address 127.0.0.3

Without this patch, node 3 will hit the check.

   TIME  STATUS
   -----------------------
   Node  1:
   25:19 Starts
   25:20 In NORMAL status

   Node  2:
   25:19 Starts
   25:23 In BOOT status
   25:28 In NORMAL status

   Node  3:
   25:19 Starts
   25:24 Found node 2 in BOOT status, hit the check, sleep and try again
   25:29 Found node 2 in NORMAL status, can keep going now
   25:29 In BOOT status
   25:34 In NORMAL status
2015-11-30 09:07:57 +08:00
Tomasz Grabiec
df46542832 tests: Add test for populate and update race 2015-11-29 16:25:22 +01:00
Tomasz Grabiec
6f69d4b700 tests: Avoid potential use after free on partition range 2015-11-29 16:25:21 +01:00
Tomasz Grabiec
de75f3fa69 row_cache: Add default value for partition range in make_reader() 2015-11-29 16:25:21 +01:00
Tomasz Grabiec
ab328ead3d mutation: Introduce ring_position() 2015-11-29 16:25:21 +01:00
Tomasz Grabiec
32ac2ccc4a memtable: Introduce apply(memtable&) 2015-11-29 16:25:21 +01:00
Tomasz Grabiec
7c3e6c306b row_cache: Wait for in-flight populations on update
Before this change, populations could race with update from a flushed
memtable, which might result in the cache being populated with older
data. Populations started before the flush do not consider the
memtable nor its sstable.

The fix employed here is to make update wait for populations which
were started before the flushed memtable's sstable was added to the
underlying data source. All populations started after that are
guaranteed to see the new data.
2015-11-29 16:25:21 +01:00
Tomasz Grabiec
a3e3add28a utils: Introduce phased_barrier
Utility for waiting on a group of async actions started before a certain
point in time.
2015-11-29 16:25:21 +01:00
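
A minimal sketch of the concept using std::future as a stand-in (this is not Seastar's implementation): each started operation holds a reference to the current phase, and advancing opens a new phase and waits for the previous phase's last reference to drop.

    #include <future>
    #include <memory>
    #include <utility>

    class phased_barrier {
        struct phase {
            std::promise<void> done;
            ~phase() { done.set_value(); }  // fires when the last holder lets go
        };
        std::shared_ptr<phase> _current = std::make_shared<phase>();
    public:
        // Token for one in-flight operation; keeps its phase alive until destroyed.
        using operation = std::shared_ptr<phase>;
        operation start() { return _current; }
        // Open a new phase; the returned future resolves when all operations
        // started before this call have finished.
        std::future<void> advance_and_await() {
            auto prev = std::exchange(_current, std::make_shared<phase>());
            auto f = prev->done.get_future();
            prev.reset();  // drop our own reference to the old phase
            return f;
        }
    };

    int main() {
        phased_barrier pb;
        auto op = pb.start();                // e.g. a population in flight
        auto done = pb.advance_and_await();  // new phase begins
        op.reset();                          // last old-phase reference dropped...
        done.get();                          // ...so the wait completes
    }
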
Pekka Enberg
a26ffefd53 transport/server: Remove CQL text type from encoding
The text data type is no longer present in CQL binary protocol v3 and
later. We don't need it for encoding earlier versions either because
it's an alias for varchar which is present in all CQL binary protocol
versions.

Fixes #526.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-11-27 09:13:56 +01:00
Pekka Enberg
2599b78583 Merge "CQL notification + storage_service fix" from Asias
"pushed_notifications_test.py dtest is now passing"
2015-11-27 10:09:56 +02:00
Asias He
da0e80a286 storage_service: Fix failed bootstrap/replace attempts being persisted in system.peers
Backported from:

ac46747 Fix failed bootstrap/replace attempts being persisted in system.peers (CASSANDRA-9180)
2015-11-27 15:31:56 +08:00
Asias He
36b2de10ed failure_detector: Improve FD logging when the arrival time is ignored
Backport from:

eb9c5bb Improve FD logging when the arrival time is ignored.
2015-11-27 15:31:56 +08:00
Asias He
ed9cd23a2d transport: Fix duplicate up/down messages sent to native clients
This patch, plus Pekka's previous commit 3c72ea9f96

   "gms: Fix gossiper::handle_major_state_change() restart logic"

fixes CASSANDRA-7816.

Backported from:

   def4835 Add missing follow on fix for 7816 only applied to
           cassandra-2.1 branch in 763130bdbde2f4cec2e8973bcd5203caf51cc89f
   763130b Followup commit for 7816
   2199a87 Fix duplicate up/down messages sent to native clients

Tested by:
   pushed_notifications_test.py:TestPushedNotifications.restart_node_test
2015-11-27 15:31:56 +08:00
Asias He
25bb889c2a transport: Fix wrong message for UP and DOWN event 2015-11-27 15:31:56 +08:00
Asias He
ca8c4f3e77 storage_service: Fix MOVED_NODE client event (CASSANDRA-8516)
Backport from:

b296c55f956c6ef07c8330dc28ef8c351e5bcfe2 (Fix MOVED_NODE client event)

Fixes:

DISABLE_VNODES=true nosetests
pushed_notifications_test.py:TestPushedNotifications.move_single_node_test
2015-11-27 15:31:56 +08:00
Gleb Natapov
ad358300a9 cql server: remove connection from notifiers earlier
Remove the connection from the notifier lists just before closing it, to
prevent attempts to send notifications on an already closed connection.
2015-11-26 18:50:08 +02:00
Pekka Enberg
569d288891 cql3: Add TRUNCATE TABLE alias for TRUNCATE
CQL 3.2.1 introduces a "TRUNCATE TABLE X" alias for "TRUNCATE X":

  4e3555c1d9

Fix our CQL grammar to also support that.

Please note that we don't bump up advertised CQL version yet because our
cqlsh clients won't be able to connect by default until we upgrade them
to C* 2.1.10 or later.

Fixes #576

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-11-26 18:45:50 +02:00
Gleb Natapov
96f40d535e cql server: add missing gate during connection access
cql connection access is protected by a gate, but the event notifiers
omitted taking it. Fix that.
2015-11-26 13:05:59 +02:00
Tomasz Grabiec
a7c11d1e30 db: Fix handling of missing column family
The FIXMEs are no longer valid: we load the schema on bootstrap and don't
support hot-plugging of column families via the file system (nor does
Cassandra).

Handling of missing tables matches Cassandra 2.1: applies log it and
continue, while queries propagate the error.
2015-11-25 16:59:15 +02:00
Tomasz Grabiec
3a402db1be storage_proxy: Remove dead signature 2015-11-25 16:57:03 +02:00
Asias He
d03b452322 storage_service: Remove RPC client in on_dead
When gossip marks a node down, we should close all the RPC connections to
that node.
2015-11-25 16:30:14 +02:00
Gleb Natapov
d9d8f87c1b storage_proxy: filter out natural endpoints from pending endpoint
If a request comes after the natural endpoints were updated to contain node A,
but A was not yet removed from the pending endpoints, it will be in both, and
the write request logic cannot handle this properly. Filter nodes which are
already in the natural endpoints out of the pending endpoints to fix this.

Fixes #539.
2015-11-25 16:28:55 +02:00
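
The filtering itself is just a set difference; a small sketch of the idea (the type and container choices here are stand-ins for illustration):

    #include <algorithm>
    #include <string>
    #include <unordered_set>
    #include <vector>

    using inet_address = std::string;  // stand-in for the real address type

    // Drop pending endpoints that already appear among the natural endpoints,
    // so no node ends up counted twice by the write path.
    void filter_pending(std::vector<inet_address>& pending,
                        const std::unordered_set<inet_address>& natural) {
        pending.erase(std::remove_if(pending.begin(), pending.end(),
                          [&](const inet_address& ep) { return natural.count(ep) > 0; }),
                      pending.end());
    }
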
Pekka Enberg
cf7541020f Merge "Enable more config options" from Asias 2015-11-25 16:09:22 +02:00
Tomasz Grabiec
c3f03d5c96 Merge branch 'pdziepak/random-lsa-patches/v3' from seastar-dev.git
LSA fixes from Paweł.
2015-11-25 10:26:23 +01:00
Paweł Dziepak
89f7f746cb lsa: fix printing object_descriptor::_alignment
object_descriptor::_alignment is of type uint8_t which is actually an
unsigned char.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 20:13:29 +01:00
Paweł Dziepak
65875124b7 lsa: guarantee that segment_heap doesn't throw
boost::heap::binomial_heap allocates a helper object in push() and,
therefore, may throw an exception. This shouldn't happen during
compaction.

The solution is to reserve space for this helper object in
segment_descriptor and use a custom allocator with
boost::heap::binomial_heap.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 19:51:22 +01:00
Paweł Dziepak
273b8daeeb lsa: add no-op default constructor for segment
Zero initialization of segment::data when segment is value initialized
is undesirable.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 16:37:37 +01:00
Paweł Dziepak
e6cf3e915f lsa: add counters for memory used by large objects
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 16:36:27 +01:00
Paweł Dziepak
9396956955 scylla-gdb.py: show lsa statistics and regions
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 16:36:20 +01:00
Paweł Dziepak
aaecf5424c scylla-gdb.py: show free, used and total memory
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 16:36:16 +01:00
Paweł Dziepak
6b113a9a7a lsa: fix eviction of large blobs
The LSA memory reclaimer logic assumes that the amount of memory used by LSA
equals segments_in_use * segment_size. However, LSA is also responsible
for the eviction of large objects, which do not affect the used segment count;
e.g. a region with no used segments may still use a lot of memory for
large objects. The solution is to switch from measuring memory in used
segments to a used-bytes count that also includes large objects.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2015-11-24 16:29:09 +01:00
Takuya ASADA
4a8c79ca0e dist: re-initialize RAID on ephemeral disk when stop/restart AMI instance
Since this doesn't check disk types, it may re-initialize the RAID on EBS
when the first block was lost. But in such a condition, re-initializing the
RAID is probably the only choice we can take, so this is fine.
Fixes #364.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-11-24 10:46:10 +02:00
Asias He
3a9200db03 config: Add more documentation for options
For consistent_rangemovement, join_ring, load_ring_state, etc.
2015-11-24 10:07:31 +08:00
Asias He
7ddf8963f5 config: Enable broadcast_rpc_address option
With this patch, start two nodes

node 1:
scylla --rpc-address 127.0.0.1 --broadcast-rpc-address 127.0.0.11

node 2:
scylla --rpc-address 127.0.0.2 --broadcast-rpc-address 127.0.0.12

On node 1:
cqlsh> SELECT rpc_address from system.peers;

 rpc_address
-------------
  127.0.0.12

which means clients should use this address to connect to node 2 for the CQL
and Thrift protocols.
2015-11-24 10:07:31 +08:00
Asias He
33ef58c5c9 utils: Add get_broadcast_rpc_address and set_broadcast_rpc_address helper 2015-11-24 10:07:31 +08:00
Asias He
1e55aa38c1 storage_service: Implement is_replacing 2015-11-24 10:07:29 +08:00
Asias He
644c226d58 config: Enable replace_address and replace_address_first_boot option
It is the same as

   -Dcassandra.replace_address
   -Dcassandra.replace_address_first_boot

in cassandra.
2015-11-24 10:07:24 +08:00
Asias He
bfe26ea208 config: Enable replace_token option
It is the same as

   -Dcassandra.replace_token

in cassandra.

Use it as:

   $ scylla --replace-token $token1,$token2,$token3
2015-11-24 10:07:20 +08:00
Asias He
730abbc421 config: Enable replace_node option
It is the same as

   -Dcassandra.replace_node

in cassandra.

Use it as:

   $ scylla --replace-node $node_uuid
2015-11-24 10:07:16 +08:00
Asias He
2513d6ddbe config: Enable load_ring_state option
It is the same as

   -Dcassandra.load_ring_state

in cassandra.

Use it as:

   $ scylla --load-ring-state 0

or

   $ scylla --load-ring-state 1
2015-11-24 10:07:12 +08:00
Asias He
6e72e78e0d config: Enable join_ring option
It is the same as

   -Dcassandra.join_ring

in cassandra.

Use it as:

   $ scylla --join-ring 0

or

   $ scylla --join-ring 1
2015-11-24 10:07:07 +08:00
Asias He
505b3e4936 config: Enable consistent_rangemovement option
It is the same as

  -Dcassandra.consistent.rangemovement

in cassandra.

Use it as:

  $ scylla --consistent-rangemovement 0

or

  $ scylla --consistent-rangemovement 1
2015-11-24 10:06:54 +08:00
Gleb Natapov
33e5097090 messaging: do not kill live connection needlessly
The messaging service closes the connection in the rpc call continuation on
closed_error, but the code runs for each outstanding rpc call on the
connection, so the first continuation may destroy a genuinely closed
connection; then the connection is reopened, and the next continuation that
handles the previous error kills a now perfectly healthy connection. Fix
this by closing the connection only in the error state.
2015-11-23 20:16:28 +02:00
Tomasz Grabiec
cb0b56f75f Merge tag 'empty/v3' from https://github.com/avikivity/scylla
From Avi:

Origin supports a notion of empty values for non-container types; these
are serialized as zero-length blobs.  They are mostly useless and only
retained for compatibility.

The implementation here introduces a wrapper maybe_empty<T>, similar to
optional<T> but oriented towards usually-nonempty usage with implicit
conversion.

There is more work needed for full empty support: fixing up deserializers to
create empty values instead of nulls, and splitting up data_value into
data_value and a data_value_nonnull for the cases that require it.

(I chose maybe_empty<> rather than using optional<data_value> for nullable
data_value both because it requires fewer changes, and because
optional<data_value> introduces a lot of control flow when moving or copying,
which would be mostly useless in most cases).
2015-11-23 16:12:06 +01:00
Calle Wilund
b1a0c4b451 commitlog_tests: Add segment corruption tests
Test entry and chunk corruption.
2015-11-23 15:43:33 +01:00
Calle Wilund
d65adef10c commitlog_tests: test cleanup
This cleanup patch got lost in git-space some time ago. It is however sorely
needed...

* Use cleaner wrapper for creating temp dir + commit log, avoiding
  having to clear and clean in every test, etc.
* Remove assertions based on file system checks, since these are not
  valid due to both the async nature of the CL, and more to the point,
  because of pre-allocation of files and file blocks. Use CL
  counters/methods instead
* Fix some race conditions to ensure tests are safe(r)
* Speed up some tests
2015-11-23 15:42:45 +01:00
Calle Wilund
262f44948d commitlog: Add get_flush_count method (for testing) 2015-11-23 15:42:45 +01:00
Calle Wilund
76b43fbf74 commitlog_replayer: Handle replay data errors as non-fatal
Discern fatal and non-fatal exceptions, and handle data corruption
by adding it to stats and reporting it, but continue processing.

Note that "invalid_argument", i.e. attempting to replay origin/old
segments, is still considered fatal, as it is probably better to
signal this strongly to the user/admin
2015-11-23 15:42:45 +01:00
Calle Wilund
2fe2320490 commitlog: Make reading segments with crc/data errors non-fatal
The parser object now attempts to skip past/terminate parsing on corrupted
entries/chunks (as detected by invalid sizes/CRCs). The amount of data
skipped is kept track of (as well as we can estimate - pre-allocation
makes it tricky), and at the end of parsing/reporting, iff errors
occurred, an exception detailing the failures is thrown (since
subscription has little mechanism to deal with this otherwise).

Thus a caller can decide how to deal with data corruption, but will be
given as many entries as possible.
2015-11-23 15:42:45 +01:00
Avi Kivity
23895ac7f5 types: fix up confusion around empty serialized representation
An empty serialized representation means an empty value, not NULL.

Fix up the confusion by converting incorrect make_null() calls to a new
make_empty(), and removing make_null() in empty-capable types like
bytes_type.

Collections don't support empty serialized representations, so remove
the call there.
2015-11-22 12:20:24 +02:00
Tomasz Grabiec
ae9e0c3d41 storage_proxy: Avoid potential use after move on schema_ptr
Parameter evaluation order is unspecified, so it's possible that the
move of 'schema' into the lambda captures would happen before the
construction of the mutation.
2015-11-22 12:15:04 +02:00
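
The hazard generalizes to any call that mixes dereferencing a smart pointer with moving it into a lambda capture; a compact illustration with stand-in types (not the actual Scylla code):

    #include <memory>
    #include <utility>

    struct schema {};
    struct mutation { explicit mutation(const schema&) {} };

    template <typename Func>
    void submit(mutation m, Func f) { (void)m; f(); }

    void broken(std::shared_ptr<schema> s) {
        // Argument evaluation order is unspecified: the capture's std::move(s)
        // may run before mutation(*s), dereferencing a moved-from (null) pointer.
        submit(mutation(*s), [s = std::move(s)] { /* use s */ });
    }

    void fixed(std::shared_ptr<schema> s) {
        mutation m(*s);  // force the dereference to happen first
        submit(std::move(m), [s = std::move(s)] { /* use s */ });
    }
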
Avi Kivity
0799251a9f Merge "optimize the sstable loading step of boot" from Raphael
"To speed up boot, parallelism was introduced to our code that loads
sstables from a column family, a function was implemented to read
the minimum from an sstable to determine whether it belongs to the
current shard, and the buffer size in read_simple is dynamically chosen
based on the size of the file and dma alignment.
The latter is important because filter file can be considerably
large when the respective sstable (data file) is very large.
Before this patchset, scylla took about 5 minutes to boot with a
data directory of 660GB. After this patchset, scylla took about 20
seconds to boot with the same data directory."
2015-11-22 11:27:34 +02:00
Asias He
23723991ed gossip: Fix STATUS field in nodetool gossipinfo
Before:
   === with c* cluster ===
   $ nodetool -p 7100 gossipinfo

   STATUS:NORMAL,-1139428872328849340

   === with scylla ===
   $ nodetool -p 7100 gossipinfo

   0:NORMAL,8251763528961471825;-9147358554612963965;5334343410266177046

After:
   === with scylla ===
   $ nodetool -p 7100 gossipinfo

   0:NORMAL,8251763528961471825

To align with c*, print one token in the STATUS field.

Refs #508.
2015-11-20 10:57:49 +02:00
Raphael S. Carvalho
a5842642fa sstables: change buf size in read_simple to 128k
Avi says:
"A small buffer size will hurt if we read a large file, but
a large buffer size won't hurt if we read a small file, since
we close it immediately."

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:35:25 -02:00
Raphael S. Carvalho
0f3ccc1143 db: optimize the sstable loading process
Currently, we only determine whether an sstable belongs to the current
shard after loading some of its components into memory. For example, the
filter may be considerably big, and its content is irrelevant to
deciding whether an sstable should be included in a given shard.
Start using the functions previously introduced to optimize the
sstable loading process. add_sstable no longer checks if an sstable
is relevant to the current shard.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:34:25 -02:00
Raphael S. Carvalho
0053394ec0 sstables: introduce mark_sstable_for_deletion
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:34:24 -02:00
Raphael S. Carvalho
0ce2b7bc8d db: introduce belongs_to_current_shard
Returns true if the key range belongs to the current shard,
false otherwise.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:34:21 -02:00
Raphael S. Carvalho
f06b72eb18 sstables: introduce function to return sstable key range
Provides a function that returns the sstable key range, reading only
the summary component.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:34:19 -02:00
Raphael S. Carvalho
966e8c7144 db: introduce parallelism to sstable loading
Boot may be slow because the function that loads sstables does so
serially instead of in parallel. In the callback supplied to
lister::scan_dir, let's push the future returned by probe_file
(the function that loads an sstable) into a vector of futures and wait
for all of them at the end.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2015-11-19 13:34:11 -02:00
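
The shape of the change, sketched with std::async standing in for the actual futures (probe_file and the path list are placeholders, not the real signatures):

    #include <future>
    #include <string>
    #include <vector>

    void probe_file(const std::string& path) { (void)path; /* load one sstable */ }

    void load_all(const std::vector<std::string>& paths) {
        std::vector<std::future<void>> futs;
        futs.reserve(paths.size());
        for (const auto& p : paths) {
            futs.push_back(std::async(std::launch::async, probe_file, p));  // start all loads
        }
        for (auto& f : futs) {
            f.get();  // wait for all of them at the end, instead of one by one
        }
    }
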
Takuya ASADA
83c8b3e433 dist: support Ubuntu 15.10
We cannot share some dependency package names between 14.04 and 15.10, so we need to add ifdefs.
Not tested on other versions of Ubuntu.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-11-19 10:57:25 +02:00
Tomasz Grabiec
53e842aaf7 scylla-gdb.py: Fix scylla column_families command 2015-11-19 10:44:00 +02:00
Avi Kivity
0b91b643ba types: empty value support for non-container types
Origin supports (https://issues.apache.org/jira/browse/CASSANDRA-5648) "empty"
values even for non-container types such as int.  Use maybe_empty<> to
encapsulate abstract_type::native_type, adding an empty flag if needed.
2015-11-18 18:38:38 +02:00
Avi Kivity
7257f72fbf types: introduce maybe_empty<T> type alias
- T for container types (that can naturally be empty)
 - emptyable<T> otherwise (adding that property artificially)
2015-11-18 15:25:24 +02:00
Avi Kivity
58d3a3e138 types: introduce emptyable<> template
Similar to optional<>, with the following differences:
 - decays back to the encapsulated type, with an emptiness check;
   this reflects the expectation that the value will rarely be empty
 - avoids conditionals during copy/move (and requires a default constructor),
   again with the same expectation.
2015-11-18 15:25:22 +02:00
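
A minimal sketch of that shape (assumptions for illustration, not the actual Scylla code):

    #include <cassert>
    #include <utility>

    template <typename T>
    class emptyable {
        T _value{};         // note: requires a default constructor
        bool _empty = true;
    public:
        emptyable() = default;                       // the empty state
        emptyable(T v) : _value(std::move(v)), _empty(false) {}
        bool empty() const { return _empty; }
        // Decays back to the encapsulated type, with an emptiness check;
        // copy/move are the defaulted memberwise ones, with no conditionals.
        operator const T&() const { assert(!_empty); return _value; }
    };
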
Gleb Natapov
0870caaea1 cql transport: catch all exceptions
Not all exceptions inherit from std::exception
(std::nested_exception, for instance), so catch and log all of them.
2015-11-18 15:17:43 +02:00
Asias He
242e5ea291 streaming: Ignore remote no_such_column_family for stream_transfer_task
When we start sending mutations for cf_id to a remote node, the remote node
might not have the cf_id anymore, for instance due to the cf being
dropped.

We should not fail the streaming if this happens; since the cf does not
exist anymore, there is no point in streaming it.

Fixes #566
2015-11-18 15:12:23 +02:00
Asias He
3816e35d11 storage_service: Detect other bootstrapping/leaving/moving nodes during bootstrap 2015-11-18 15:11:56 +02:00
Avi Kivity
6390bc3121 README: instructions for contributing 2015-11-18 15:10:37 +02:00
Asias He
3b52033371 gossip: Favor newly added node in do_gossip_to_live_member
When a new node joins a cluster, it starts a gossip round with a seed
node. However, within this round, the seed node will not tell the new
node anything it knows about other nodes in the cluster, because the
digest in the gossip SYN message contains only the new node itself and
no other nodes. The seed node picks randomly from the live nodes,
including the newly added node, in do_gossip_to_live_member to start a
gossip round. If the new node is "lucky", the seed node will talk to it
very soon and tell it all the information it knows about the cluster;
thus the new node will mark the seed node alive and consider the seed
node seen. If there is a considerably large number of live nodes, it might
take a long time before the seed node picks the new node and talks to it.

In bootstrap code, storage_service::bootstrap checks if we see any nodes
after sleep of RING_DELAY milliseconds and throw "Unable to contact any
seeds!" if not, thus the node will fail to bootstrap.

To help the seed node talk to new node faster, we favor new node in
do_gossip_to_live_member.
2015-11-18 15:00:37 +02:00
Amnon Heiman
374414ffd0 API: failure_detector modify the get_all_endpoint_states
In origin, get_all_endpoint_states performs all the information
formatting and returns a string.

This is not a good API approach; this patch replaces the implementation
so the API returns an array of values and the JMX does the
formatting.

This is a better API and will make it simpler in the future to stay in
sync with origin output.

This patch is part of #508

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-11-18 14:59:09 +02:00
Avi Kivity
17f6dc3671 Merge seastar upstream
* seastar 95ddb8e...84cb6df (2):
  > rpc: do not convert EOF into exception
  > reactor: remove debug output in command line option validation
2015-11-18 11:20:27 +02:00
Takuya ASADA
16cd5892f7 dist: setup rps on scylla_prepare, not on scylla_run
All preparation for running scylla should be done in scylla_prepare

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2015-11-18 11:20:17 +02:00
Asias He
269ea7f81b storage_service: Enable is_ready_for_bootstrap in join_token_ring
The goal is to make sure our schema matches that of the other nodes in the
cluster.
2015-11-18 10:46:40 +02:00
Asias He
bb1470f0d4 migration_manager: Introduce is_ready_for_bootstrap
This compares the local schema version with other nodes in the cluster.
Returns true if all of them match with each other.
2015-11-18 10:46:06 +02:00
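
The check reduces to "every remote schema version equals ours"; a sketch with stand-in types (versions shown as strings for illustration):

    #include <algorithm>
    #include <string>
    #include <vector>

    bool schemas_agree(const std::string& local_version,
                       const std::vector<std::string>& remote_versions) {
        return std::all_of(remote_versions.begin(), remote_versions.end(),
                           [&](const std::string& v) { return v == local_version; });
    }
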
Avi Kivity
ba859acb3b big_decimal: add default constructor
Arithmetic types should have a default constructor, and anyway the
following patch wants it.
2015-11-18 10:36:03 +02:00
Takuya ASADA
f0a6c33b6d dist: use /var/lib/scylla instead of /data on ami
Fixes #551.
Change the mountpoint to /var/lib/scylla, and copy conf/ onto it.
Note: conf/ needs to be replaced with a symlink to /etc/scylla when the new rpm is uploaded to the yum repository.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Signed-off-by: Pekka Enberg <penberg@iki.fi>
2015-11-18 10:10:48 +02:00
Amnon Heiman
27737d702b API: Stubbing the compaction manager - workaround
Until the compaction manager API is ready, its failing commands
cause problems with nodetool-related tests.
This patch stubs the compaction manager logic so it will not fail.

It will be replaced by an actual implementation when the equivalent
code in compaction is ready.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2015-11-11 16:58:13 +02:00
Amnon Heiman
66e428799f API: add compaction info object
This patch adds a compaction info object and an API that returns it.
It will be mapped to the JMX getCompactions that returns a map.

The use of an object is more RESTful and will be better documented in
the swagger definition file.
2015-11-11 16:57:44 +02:00
114 changed files with 2609 additions and 1101 deletions

.gitmodules

@@ -1,6 +1,6 @@
 [submodule "seastar"]
 path = seastar
-url = ../seastar
+url = ../scylla-seastar
 ignore = dirty
 [submodule "swagger-ui"]
 path = swagger-ui
ORIGIN

@@ -1 +1,77 @@
http://git-wip-us.apache.org/repos/asf/cassandra.git trunk (bf599fb5b062cbcc652da78b7d699e7a01b949ad)
import = bf599fb5b062cbcc652da78b7d699e7a01b949ad
Y = Already in scylla
$ git log --oneline import..cassandra-2.1.11 -- gms/
Y 484e645 Mark node as dead even if already left
d0c166f Add trampled commit back
ba5837e Merge branch 'cassandra-2.0' into cassandra-2.1
718e47f Forgot a damn c/r
a7282e4 Merge branch 'cassandra-2.0' into cassandra-2.1
Y ae4cd69 Print versions for gossip states in gossipinfo.
Y 7fba3d2 Don't mark nodes down before the max local pause interval once paused.
c2142e6 Merge branch 'cassandra-2.0' into cassandra-2.1
ba9a69e checkForEndpointCollision fails for legitimate collisions, finalized list of statuses and nits, CASSANDRA-9765
54470a2 checkForEndpointCollision fails for legitimate collisions, improved version after CR, CASSANDRA-9765
2c9b490 checkForEndpointCollision fails for legitimate collisions, CASSANDRA-9765
4c15970 Merge branch 'cassandra-2.0' into cassandra-2.1
ad8047a ArrivalWindow should use primitives
Y 4012134 Failure detector detects and ignores local pauses
9bcdd0f Merge branch 'cassandra-2.0' into cassandra-2.1
cefaa4e Close incoming connections when MessagingService is stopped
ea1beda Merge branch 'cassandra-2.0' into cassandra-2.1
08dbbd6 Ignore gossip SYNs after shutdown
3c17ac6 Merge branch 'cassandra-2.0' into cassandra-2.1
a64bc43 lists work better when you initialize them
543a899 change list to arraylist
730d4d4 Merge branch 'cassandra-2.0' into cassandra-2.1
e3e2de0 change list to arraylist
f7884c5 Merge branch 'cassandra-2.0' into cassandra-2.1
Y 84b2846 remove redundant state
4f2c372 Merge branch 'cassandra-2.0' into cassandra-2.1
Y b2c62bb Add shutdown gossip state to prevent timeouts during rolling restarts
Y def4835 Add missing follow on fix for 7816 only applied to cassandra-2.1 branch in 763130bdbde2f4cec2e8973bcd5203caf51cc89f
Y 763130b Followup commit for 7816
1376b8e Merge branch 'cassandra-2.0' into cassandra-2.1
Y 2199a87 Fix duplicate up/down messages sent to native clients
136042e Merge branch 'cassandra-2.0' into cassandra-2.1
Y eb9c5bb Improve FD logging when the arrival time is ignored.
$ git log --oneline import..cassandra-2.1.11 -- service/StorageService.java
92c5787 Keep StorageServiceMBean interface stable
6039d0e Fix DC and Rack in nodetool info
a2f0da0 Merge branch 'cassandra-2.0' into cassandra-2.1
c4de752 Follow-up to CASSANDRA-10238
e889ee4 2i key cache load fails
4b1d59e Merge branch 'cassandra-2.0' into cassandra-2.1
257cdaa Fix consolidating racks violating the RF contract
Y 27754c0 refuse to decomission if not in state NORMAL patch by Jan Karlsson and Stefania for CASSANDRA-8741
Y 5bc56c3 refuse to decomission if not in state NORMAL patch by Jan Karlsson and Stefania for CASSANDRA-8741
Y 8f9ca07 Cannot replace token does not exist - DN node removed as Fat Client
c2142e6 Merge branch 'cassandra-2.0' into cassandra-2.1
54470a2 checkForEndpointCollision fails for legitimate collisions, improved version after CR, CASSANDRA-9765
1eccced Handle corrupt files on startup
2c9b490 checkForEndpointCollision fails for legitimate collisions, CASSANDRA-9765
c4b5260 Merge branch 'cassandra-2.0' into cassandra-2.1
Y 52dbc3f Can't transition from write survey to normal mode
9966419 Make rebuild only run one at a time
d693ca1 Merge branch 'cassandra-2.0' into cassandra-2.1
be9eff5 Add option to not validate atoms during scrub
2a4daaf followup fix for 8564
93478ab Wait for anticompaction to finish
9e9846e Fix for harmless exceptions being logged as ERROR
6d06f32 Fix anticompaction blocking ANTI_ENTROPY stage
4f2c372 Merge branch 'cassandra-2.0' into cassandra-2.1
Y b2c62bb Add shutdown gossip state to prevent timeouts during rolling restarts
Y cba1b68 Fix failed bootstrap/replace attempts being persisted in system.peers
f59df28 Allow takeColumnFamilySnapshot to take a list of tables patch by Sachin Jarin; reviewed by Nick Bailey for CASSANDRA-8348
Y ac46747 Fix failed bootstrap/replace attempts being persisted in system.peers
5abab57 Merge branch 'cassandra-2.0' into cassandra-2.1
0ff9c3c Allow reusing snapshot tags across different column families.
f9c57a5 Merge branch 'cassandra-2.0' into cassandra-2.1
Y b296c55 Fix MOVED_NODE client event
bbb3fc7 Merge branch 'cassandra-2.0' into cassandra-2.1
37eb2a0 Fix NPE in nodetool getendpoints with bad ks/cf
f8b43d4 Merge branch 'cassandra-2.0' into cassandra-2.1
e20810c Remove C* specific class from JMX API

View File

@@ -82,3 +82,15 @@ Run the image with:
```
docker run -p $(hostname -i):9042:9042 -i -t <image name>
```
## Contributing to Scylla
Do not send pull requests.
Send patches to the mailing list address scylladb-dev@googlegroups.com.
Be sure to subscribe.
In order for your patches to be merged, you must sign the Contributor's
License Agreement, which protects both your rights and ours. See
http://www.scylladb.com/opensource/cla/.

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=development
VERSION=0.13.2
if test -f version
then

View File

@@ -579,30 +579,6 @@
}
]
},
{
"path":"/column_family/sstables/snapshots_size/{name}",
"operations":[
{
"method":"GET",
"summary":"the size of SSTables in 'snapshots' subdirectory which aren't live anymore",
"type":"double",
"nickname":"true_snapshots_size",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/column_family/metrics/memtable_columns_count/{name}",
"operations":[
@@ -2041,7 +2017,7 @@
]
},
{
"path":"/column_family/metrics/true_snapshots_size/{name}",
"path":"/column_family/metrics/snapshots_size/{name}",
"operations":[
{
"method":"GET",

View File

@@ -15,7 +15,7 @@
"summary":"get List of running compactions",
"type":"array",
"items":{
"type":"jsonmap"
"type":"summary"
},
"nickname":"get_compactions",
"produces":[
@@ -46,16 +46,16 @@
]
},
{
"path":"/compaction_manager/compaction_summary",
"path":"/compaction_manager/compaction_info",
"operations":[
{
"method":"GET",
"summary":"get compaction summary",
"summary":"get a list of all active compaction info",
"type":"array",
"items":{
"type":"string"
"type":"compaction_info"
},
"nickname":"get_compaction_summary",
"nickname":"get_compaction_info",
"produces":[
"application/json"
],
@@ -174,30 +174,73 @@
}
],
"models":{
"mapper":{
"id":"mapper",
"description":"A key value mapping",
"row_merged":{
"id":"row_merged",
"description":"A row merged information",
"properties":{
"key":{
"type":"string",
"description":"The key"
"type":"int",
"description":"The number of sstable"
},
"value":{
"type":"string",
"description":"The value"
"type":"long",
"description":"The number or row compacted"
}
}
},
"jsonmap":{
"id":"jsonmap",
"description":"A json representation of a map as a list of key value",
"compaction_info" :{
"id": "compaction_info",
"description":"A key value mapping",
"properties":{
"operation_type":{
"type":"string",
"description":"The operation type"
},
"completed":{
"type":"long",
"description":"The current completed"
},
"total":{
"type":"long",
"description":"The total to compact"
},
"unit":{
"type":"string",
"description":"The compacted unit"
}
}
},
"summary":{
"id":"summary",
"description":"A compaction summary object",
"properties":{
"value":{
"type":"array",
"items":{
"type":"mapper"
},
"description":"A list of key, value mapping"
"id":{
"type":"string",
"description":"The UUID"
},
"ks":{
"type":"string",
"description":"The keyspace name"
},
"cf":{
"type":"string",
"description":"The column family name"
},
"completed":{
"type":"long",
"description":"The number of units completed"
},
"total":{
"type":"long",
"description":"The total number of units"
},
"task_type":{
"type":"string",
"description":"The task compaction type"
},
"unit":{
"type":"string",
"description":"The units being used"
}
}
},
@@ -232,7 +275,7 @@
"rows_merged":{
"type":"array",
"items":{
"type":"mapper"
"type":"row_merged"
},
"description":"The merged rows"
}
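Under the new model, get_compaction_info returns an array of compaction_info objects rather than plain strings; an illustrative response (the values are made up):
```
[
  {
    "operation_type": "COMPACTION",
    "completed": 1048576,
    "total": 4194304,
    "unit": "bytes"
  }
]
```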

View File

@@ -48,7 +48,10 @@
{
"method":"GET",
"summary":"Get all endpoint states",
"type":"string",
"type":"array",
"items":{
"type":"endpoint_state"
},
"nickname":"get_all_endpoint_states",
"produces":[
"application/json"
@@ -148,6 +151,53 @@
"description": "The value"
}
}
},
"endpoint_state": {
"id": "states",
"description": "Holds an endpoint state",
"properties": {
"addrs": {
"type": "string",
"description": "The endpoint address"
},
"generation": {
"type": "int",
"description": "The heart beat generation"
},
"version": {
"type": "int",
"description": "The heart beat version"
},
"update_time": {
"type": "long",
"description": "The update timestamp"
},
"is_alive": {
"type": "boolean",
"description": "Is the endpoint alive"
},
"application_state" : {
"type":"array",
"items":{
"type":"version_value"
},
"description": "Is the endpoint alive"
}
}
},
"version_value": {
"id": "version_value",
"description": "Holds a version value for an application state",
"properties": {
"application_state": {
"type": "int",
"description": "The application state enum index"
},
"value": {
"type": "string",
"description": "The version value"
}
}
}
}
}
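With this change, get_all_endpoint_states returns an array of endpoint_state objects instead of a flat string; an illustrative response (addresses and values made up):
```
[
  {
    "addrs": "127.0.0.3",
    "generation": 1449571200,
    "version": 42,
    "update_time": 1449571260000000,
    "is_alive": true,
    "application_state": [
      { "application_state": 0, "value": "NORMAL" }
    ]
  }
]
```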

View File

@@ -184,6 +184,30 @@
]
}
]
},
{
"path":"/messaging_service/version",
"operations":[
{
"method":"GET",
"summary":"Get the version number",
"type":"int",
"nickname":"get_version",
"produces":[
"application/json"
],
"parameters":[
{
"name":"addr",
"description":"Address",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"query"
}
]
}
]
}
],
"models":{

View File

@@ -128,47 +128,54 @@ inline double pow2(double a) {
return a * a;
}
inline httpd::utils_json::histogram add_histogram(httpd::utils_json::histogram res,
// FIXME: Move to utils::ihistogram::operator+=()
inline utils::ihistogram add_histogram(utils::ihistogram res,
const utils::ihistogram& val) {
if (!res.count._set) {
res = val;
return res;
if (res.count == 0) {
return val;
}
if (val.count == 0) {
return res;
return std::move(res);
}
if (res.min() > val.min) {
if (res.min > val.min) {
res.min = val.min;
}
if (res.max() < val.max) {
if (res.max < val.max) {
res.max = val.max;
}
double ncount = res.count() + val.count;
double ncount = res.count + val.count;
// To get an estimated sum we take the estimated mean
// and multiply it by the true count
res.sum = res.sum() + val.mean * val.count;
double a = res.count()/ncount;
res.sum = res.sum + val.mean * val.count;
double a = res.count/ncount;
double b = val.count/ncount;
double mean = a * res.mean() + b * val.mean;
double mean = a * res.mean + b * val.mean;
res.variance = (res.variance() + pow2(res.mean() - mean) )* a +
res.variance = (res.variance + pow2(res.mean - mean) )* a +
(val.variance + pow2(val.mean -mean))* b;
res.mean = mean;
res.count = res.count() + val.count;
res.count = res.count + val.count;
for (auto i : val.sample) {
res.sample.push(i);
res.sample.push_back(i);
}
return res;
}
inline
httpd::utils_json::histogram to_json(const utils::ihistogram& val) {
httpd::utils_json::histogram h;
h = val;
return h;
}
template<class T, class F>
future<json::json_return_type> sum_histogram_stats(distributed<T>& d, utils::ihistogram F::*f) {
return d.map_reduce0([f](const T& p) {return p.get_stats().*f;}, httpd::utils_json::histogram(),
add_histogram).then([](const httpd::utils_json::histogram& val) {
return make_ready_future<json::json_return_type>(val);
return d.map_reduce0([f](const T& p) {return p.get_stats().*f;}, utils::ihistogram(),
add_histogram).then([](const utils::ihistogram& val) {
return make_ready_future<json::json_return_type>(to_json(val));
});
}
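The add_histogram() merge above pools two sets of summary statistics by weighting each side's mean and variance with its share of the combined count. A standalone sanity check of that arithmetic (the moments struct and the sample numbers are illustrative, not Scylla's utils::ihistogram):
```cpp
#include <cassert>
#include <cmath>

// Minimal stand-in for the fields add_histogram() reads; illustrative only.
struct moments {
    double count, mean, variance;
};

// Pooled mean and variance of two populations, as computed above:
//   mean = a*m1 + b*m2
//   var  = a*(v1 + (m1-mean)^2) + b*(v2 + (m2-mean)^2)
// with a = n1/(n1+n2), b = n2/(n1+n2).
static moments combine(moments x, moments y) {
    double n = x.count + y.count;
    double a = x.count / n;
    double b = y.count / n;
    double mean = a * x.mean + b * y.mean;
    double var = a * (x.variance + (x.mean - mean) * (x.mean - mean))
               + b * (y.variance + (y.mean - mean) * (y.mean - mean));
    return {n, mean, var};
}

int main() {
    // {1,2,3} has mean 2, variance 2/3; {4,5,6} has mean 5, variance 2/3.
    // Pooled over {1..6}: mean 3.5, population variance 35/12.
    moments m = combine({3, 2, 2.0 / 3}, {3, 5, 2.0 / 3});
    assert(std::fabs(m.mean - 3.5) < 1e-9);
    assert(std::fabs(m.variance - 35.0 / 12) < 1e-9);
}
```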

View File

@@ -110,23 +110,25 @@ static future<json::json_return_type> get_cf_histogram(http_context& ctx, const
utils::ihistogram column_family::stats::*f) {
utils::UUID uuid = get_uuid(name, ctx.db.local());
return ctx.db.map_reduce0([f, uuid](const database& p) {return p.find_column_family(uuid).get_stats().*f;},
httpd::utils_json::histogram(),
utils::ihistogram(),
add_histogram)
.then([](const httpd::utils_json::histogram& val) {
return make_ready_future<json::json_return_type>(val);
.then([](const utils::ihistogram& val) {
return make_ready_future<json::json_return_type>(to_json(val));
});
}
static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::ihistogram column_family::stats::*f) {
std::function<httpd::utils_json::histogram(const database&)> fun = [f] (const database& db) {
httpd::utils_json::histogram res;
std::function<utils::ihistogram(const database&)> fun = [f] (const database& db) {
utils::ihistogram res;
for (auto i : db.get_column_families()) {
res = add_histogram(res, i.second->get_stats().*f);
}
return res;
};
return ctx.db.map(fun).then([](const std::vector<httpd::utils_json::histogram> &res) {
return make_ready_future<json::json_return_type>(res);
return ctx.db.map(fun).then([](const std::vector<utils::ihistogram> &res) {
std::vector<httpd::utils_json::histogram> r;
boost::copy(res | boost::adaptors::transformed(to_json), std::back_inserter(r));
return make_ready_future<json::json_return_type>(r);
});
}
@@ -589,11 +591,16 @@ void set_column_family(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(0);
});
cf::get_true_snapshots_size.set(r, [] (std::unique_ptr<request> req) {
//TBD
// FIXME
//auto id = get_uuid(req->param["name"], ctx.db.local());
return make_ready_future<json::json_return_type>(0);
cf::get_true_snapshots_size.set(r, [&ctx] (std::unique_ptr<request> req) {
auto uuid = get_uuid(req->param["name"], ctx.db.local());
return ctx.db.local().find_column_family(uuid).get_snapshot_details().then([](
const std::unordered_map<sstring, column_family::snapshot_details>& sd) {
int64_t res = 0;
for (auto i : sd) {
res += i.second.total;
}
return make_ready_future<json::json_return_type>(res);
});
});
cf::get_all_true_snapshots_size.set(r, [] (std::unique_ptr<request> req) {

View File

@@ -26,7 +26,7 @@ namespace api {
using namespace scollectd;
namespace cm = httpd::compaction_manager_json;
using namespace json;
static future<json::json_return_type> get_cm_stats(http_context& ctx,
int64_t compaction_manager::stats::*f) {
@@ -40,27 +40,23 @@ static future<json::json_return_type> get_cm_stats(http_context& ctx,
void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_compactions.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
std::vector<cm::jsonmap> map;
// FIXME
warn(unimplemented::cause::API);
std::vector<cm::summary> map;
return make_ready_future<json::json_return_type>(map);
});
cm::get_compaction_summary.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
std::vector<sstring> res;
return make_ready_future<json::json_return_type>(res);
});
cm::force_user_defined_compaction.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>("");
// FIXME
warn(unimplemented::cause::API);
return make_ready_future<json::json_return_type>(json_void());
});
cm::stop_compaction.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
warn(unimplemented::cause::API);
return make_ready_future<json::json_return_type>("");
});
@@ -81,17 +77,27 @@ void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_bytes_compacted.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
warn(unimplemented::cause::API);
return make_ready_future<json::json_return_type>(0);
});
cm::get_compaction_history.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
warn(unimplemented::cause::API);
std::vector<cm::history> res;
return make_ready_future<json::json_return_type>(res);
});
cm::get_compaction_info.set(r, [] (std::unique_ptr<request> req) {
//TBD
// FIXME
warn(unimplemented::cause::API);
std::vector<cm::compaction_info> res;
return make_ready_future<json::json_return_type>(res);
});
}
}

View File

@@ -22,15 +22,33 @@
#include "failure_detector.hh"
#include "api/api-doc/failure_detector.json.hh"
#include "gms/failure_detector.hh"
#include "gms/application_state.hh"
#include "gms/gossiper.hh"
namespace api {
namespace fd = httpd::failure_detector_json;
void set_failure_detector(http_context& ctx, routes& r) {
fd::get_all_endpoint_states.set(r, [](std::unique_ptr<request> req) {
return gms::get_all_endpoint_states().then([](const sstring& str) {
return make_ready_future<json::json_return_type>(str);
});
std::vector<fd::endpoint_state> res;
for (auto i : gms::get_local_gossiper().endpoint_state_map) {
fd::endpoint_state val;
val.addrs = boost::lexical_cast<std::string>(i.first);
val.is_alive = i.second.is_alive();
val.generation = i.second.get_heart_beat_state().get_generation();
val.version = i.second.get_heart_beat_state().get_heart_beat_version();
val.update_time = i.second.get_update_timestamp().time_since_epoch().count();
for (auto a : i.second.get_application_state_map()) {
fd::version_value version_val;
// We return the enum index and not its name to stay compatible with Origin:
// the state indexes are static, but the names can change.
version_val.application_state = static_cast<std::underlying_type<gms::application_state>::type>(a.first);
version_val.value = a.second.value;
val.application_state.push(version_val);
}
res.push_back(val);
}
return make_ready_future<json::json_return_type>(res);
});
fd::get_up_endpoint_count.set(r, [](std::unique_ptr<request> req) {

View File

@@ -119,6 +119,10 @@ void set_messaging_service(http_context& ctx, routes& r) {
return c.sent_messages;
}));
get_version.set(r, [](const_req req) {
return net::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
});
get_dropped_messages_by_ver.set(r, [](std::unique_ptr<request> req) {
shared_ptr<std::vector<uint64_t>> map = make_shared<std::vector<uint64_t>>(num_verb, 0);

View File

@@ -33,8 +33,10 @@
*
*/
class bytes_ostream {
public:
using size_type = bytes::size_type;
using value_type = bytes::value_type;
private:
static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
struct chunk {
// FIXME: group fragment pointers to reduce pointer chasing when packetizing
@@ -117,13 +119,13 @@ private:
};
}
public:
bytes_ostream()
bytes_ostream() noexcept
: _begin()
, _current(nullptr)
, _size(0)
{ }
bytes_ostream(bytes_ostream&& o)
bytes_ostream(bytes_ostream&& o) noexcept
: _begin(std::move(o._begin))
, _current(o._current)
, _size(o._size)
@@ -148,7 +150,7 @@ public:
return *this;
}
bytes_ostream& operator=(bytes_ostream&& o) {
bytes_ostream& operator=(bytes_ostream&& o) noexcept {
_size = o._size;
_begin = std::move(o._begin);
_current = o._current;

View File

@@ -856,7 +856,7 @@ dropIndexStatement returns [DropIndexStatement expr]
* TRUNCATE <CF>;
*/
truncateStatement returns [::shared_ptr<truncate_statement> stmt]
: K_TRUNCATE cf=columnFamilyName { $stmt = ::make_shared<truncate_statement>(cf); }
: K_TRUNCATE (K_COLUMNFAMILY)? cf=columnFamilyName { $stmt = ::make_shared<truncate_statement>(cf); }
;
#if 0
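With the optional K_COLUMNFAMILY token, both `TRUNCATE ks.cf;` and `TRUNCATE COLUMNFAMILY ks.cf;` now parse to the same truncate_statement, bringing the rule in line with the Cassandra grammar this file is ported from.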

View File

@@ -416,6 +416,23 @@ static std::vector<sstring> parse_fname(sstring filename) {
return comps;
}
static bool belongs_to_current_shard(const schema& s, const partition_key& first, const partition_key& last) {
auto key_shard = [&s] (const partition_key& pk) {
auto token = dht::global_partitioner().get_token(s, pk);
return dht::shard_of(token);
};
auto s1 = key_shard(first);
auto s2 = key_shard(last);
auto me = engine().cpu_id();
return (s1 <= me) && (me <= s2);
}
static bool belongs_to_current_shard(const schema& s, range<partition_key> r) {
assert(r.start());
assert(r.end());
return belongs_to_current_shard(s, r.start()->value(), r.end()->value());
}
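In other words, an sstable is considered relevant if this shard's id falls between the shards owning its first and last partition keys; for example, on a 4-shard machine an sstable whose first key maps to shard 1 and last key to shard 3 passes the check on shards 1, 2 and 3 and is skipped (marked for deletion) elsewhere.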
future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sstring fname) {
using namespace sstables;
@@ -432,12 +449,21 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
update_sstables_known_generation(comps.generation);
assert(_sstables->count(comps.generation) == 0);
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
auto fut = sst->load();
return std::move(fut).then([this, sst = std::move(sst)] () mutable {
add_sstable(std::move(*sst));
return make_ready_future<>();
}).then_wrapped([fname, comps = std::move(comps)] (future<> f) {
auto fut = sstable::get_sstable_key_range(*_schema, _schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
return std::move(fut).then([this, sstdir = std::move(sstdir), comps] (range<partition_key> r) {
// Checks whether or not sstable belongs to current shard.
if (!belongs_to_current_shard(*_schema, std::move(r))) {
sstable::mark_sstable_for_deletion(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
return make_ready_future<>();
}
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
auto fut = sst->load();
return std::move(fut).then([this, sst = std::move(sst)] () mutable {
add_sstable(std::move(*sst));
return make_ready_future<>();
});
}).then_wrapped([fname, comps] (future<> f) {
try {
f.get();
} catch (malformed_sstable_exception& e) {
@@ -462,19 +488,6 @@ void column_family::add_sstable(sstables::sstable&& sstable) {
}
void column_family::add_sstable(lw_shared_ptr<sstables::sstable> sstable) {
auto key_shard = [this] (const partition_key& pk) {
auto token = dht::global_partitioner().get_token(*_schema, pk);
return dht::shard_of(token);
};
auto s1 = key_shard(sstable->get_first_partition_key(*_schema));
auto s2 = key_shard(sstable->get_last_partition_key(*_schema));
auto me = engine().cpu_id();
auto included = (s1 <= me) && (me <= s2);
if (!included) {
dblog.info("sstable {} not relevant for this shard, ignoring", sstable->get_filename());
sstable->mark_for_deletion();
return;
}
auto generation = sstable->generation();
// allow in-progress reads to continue using old list
_sstables = make_lw_shared<sstable_list>(*_sstables);
@@ -745,7 +758,11 @@ column_family::load_new_sstables(std::vector<sstables::entry_descriptor> new_tab
return sst->load().then([this, sst] {
return sst->mutate_sstable_level(0);
}).then([this, sst] {
this->add_sstable(sst);
auto first = sst->get_first_partition_key(*_schema);
auto last = sst->get_last_partition_key(*_schema);
if (belongs_to_current_shard(*_schema, first, last)) {
this->add_sstable(sst);
}
return make_ready_future<>();
});
});
@@ -837,9 +854,10 @@ future<> column_family::populate(sstring sstdir) {
auto verifier = make_lw_shared<std::unordered_map<unsigned long, status>>();
auto descriptor = make_lw_shared<sstable_descriptor>();
return lister::scan_dir(sstdir, { directory_entry_type::regular }, [this, sstdir, verifier, descriptor] (directory_entry de) {
return do_with(std::vector<future<>>(), [this, sstdir, verifier, descriptor] (std::vector<future<>>& futures) {
return lister::scan_dir(sstdir, { directory_entry_type::regular }, [this, sstdir, verifier, descriptor, &futures] (directory_entry de) {
// FIXME: The secondary indexes are in this level, but with a directory type, (starting with ".")
return probe_file(sstdir, de.name).then([verifier, descriptor] (auto entry) {
auto f = probe_file(sstdir, de.name).then([verifier, descriptor] (auto entry) {
if (verifier->count(entry.generation)) {
if (verifier->at(entry.generation) == status::has_toc_file) {
if (entry.component == sstables::sstable::component_type::TOC) {
@@ -870,6 +888,23 @@ future<> column_family::populate(sstring sstdir) {
descriptor->format = entry.format;
}
});
// push future returned by probe_file into an array of futures,
// so that the supplied callback will not block scan_dir() from
// reading the next entry in the directory.
futures.push_back(std::move(f));
return make_ready_future<>();
}).then([&futures] {
return when_all(futures.begin(), futures.end()).then([] (std::vector<future<>> ret) {
try {
for (auto& f : ret) {
f.get();
}
} catch(...) {
throw;
}
});
}).then([verifier, sstdir, descriptor, this] {
return parallel_for_each(*verifier, [sstdir = std::move(sstdir), descriptor, this] (auto v) {
if (v.second == status::has_temporary_toc_file) {
@@ -891,6 +926,7 @@ future<> column_family::populate(sstring sstdir) {
return make_ready_future<>();
});
});
});
}
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
@@ -996,7 +1032,7 @@ template <typename Func>
static future<>
do_parse_system_tables(distributed<service::storage_proxy>& proxy, const sstring& _cf_name, Func&& func) {
using namespace db::schema_tables;
static_assert(std::is_same<future<>, std::result_of_t<Func(schema_result::value_type&)>>::value,
static_assert(std::is_same<future<>, std::result_of_t<Func(schema_result_value_type&)>>::value,
"bad Func signature");
@@ -1031,11 +1067,11 @@ do_parse_system_tables(distributed<service::storage_proxy>& proxy, const sstring
future<> database::parse_system_tables(distributed<service::storage_proxy>& proxy) {
using namespace db::schema_tables;
return do_parse_system_tables(proxy, db::schema_tables::KEYSPACES, [this] (schema_result::value_type &v) {
return do_parse_system_tables(proxy, db::schema_tables::KEYSPACES, [this] (schema_result_value_type &v) {
auto ksm = create_keyspace_from_schema_partition(v);
return create_keyspace(ksm);
}).then([&proxy, this] {
return do_parse_system_tables(proxy, db::schema_tables::COLUMNFAMILIES, [this, &proxy] (schema_result::value_type &v) {
return do_parse_system_tables(proxy, db::schema_tables::COLUMNFAMILIES, [this, &proxy] (schema_result_value_type &v) {
return create_tables_from_tables_partition(proxy, v.second).then([this] (std::map<sstring, schema_ptr> tables) {
for (auto& t: tables) {
auto s = t.second;
@@ -1462,7 +1498,7 @@ column_family::query(const query::read_command& cmd, const std::vector<query::pa
}).finally([lc, this]() mutable {
_stats.reads.mark(lc);
if (lc.is_start()) {
_stats.estimated_read.add(lc.latency_in_nano(), _stats.reads.count);
_stats.estimated_read.add(lc.latency(), _stats.reads.count);
}
});
}
@@ -1476,28 +1512,14 @@ column_family::as_mutation_source() const {
future<lw_shared_ptr<query::result>>
database::query(const query::read_command& cmd, const std::vector<query::partition_range>& ranges) {
static auto make_empty = [] {
return make_ready_future<lw_shared_ptr<query::result>>(make_lw_shared(query::result()));
};
try {
column_family& cf = find_column_family(cmd.cf_id);
return cf.query(cmd, ranges);
} catch (const no_such_column_family&) {
// FIXME: load from sstables
return make_empty();
}
column_family& cf = find_column_family(cmd.cf_id);
return cf.query(cmd, ranges);
}
future<reconcilable_result>
database::query_mutations(const query::read_command& cmd, const query::partition_range& range) {
try {
column_family& cf = find_column_family(cmd.cf_id);
return mutation_query(cf.as_mutation_source(), range, cmd.slice, cmd.row_limit, cmd.timestamp);
} catch (const no_such_column_family&) {
// FIXME: load from sstables
return make_ready_future<reconcilable_result>(reconcilable_result());
}
column_family& cf = find_column_family(cmd.cf_id);
return mutation_query(cf.as_mutation_source(), range, cmd.slice, cmd.row_limit, cmd.timestamp);
}
std::unordered_set<sstring> database::get_initial_tokens() {
@@ -1512,6 +1534,31 @@ std::unordered_set<sstring> database::get_initial_tokens() {
return tokens;
}
std::experimental::optional<gms::inet_address> database::get_replace_address() {
auto& cfg = get_config();
sstring replace_address = cfg.replace_address();
sstring replace_address_first_boot = cfg.replace_address_first_boot();
try {
if (!replace_address.empty()) {
return gms::inet_address(replace_address);
} else if (!replace_address_first_boot.empty()) {
return gms::inet_address(replace_address_first_boot);
}
return std::experimental::nullopt;
} catch (...) {
return std::experimental::nullopt;
}
}
bool database::is_replacing() {
sstring replace_address_first_boot = get_config().replace_address_first_boot();
if (!replace_address_first_boot.empty() && db::system_keyspace::bootstrap_complete()) {
dblog.info("Replace address on first boot requested; this node is already bootstrapped");
return false;
}
return bool(get_replace_address());
}
std::ostream& operator<<(std::ostream& out, const atomic_cell_or_collection& c) {
return out << to_hex(c._data);
}
@@ -1541,8 +1588,7 @@ future<> database::apply_in_memory(const frozen_mutation& m, const db::replay_po
auto& cf = find_column_family(m.column_family_id());
cf.apply(m, rp);
} catch (no_such_column_family&) {
// TODO: log a warning
// FIXME: load keyspace meta-data from storage
dblog.error("Attempting to mutate non-existent table {}", m.column_family_id());
}
return make_ready_future<>();
}
@@ -1975,7 +2021,11 @@ future<> column_family::clear_snapshot(sstring tag) {
future<std::unordered_map<sstring, column_family::snapshot_details>> column_family::get_snapshot_details() {
std::unordered_map<sstring, snapshot_details> all_snapshots;
return do_with(std::move(all_snapshots), [this] (auto& all_snapshots) {
return lister::scan_dir(_config.datadir + "/snapshots", { directory_entry_type::directory }, [this, &all_snapshots] (directory_entry de) {
return engine().file_exists(_config.datadir + "/snapshots").then([this, &all_snapshots](bool file_exists) {
if (!file_exists) {
return make_ready_future<>();
}
return lister::scan_dir(_config.datadir + "/snapshots", { directory_entry_type::directory }, [this, &all_snapshots] (directory_entry de) {
auto snapshot_name = de.name;
auto snapshot = _config.datadir + "/snapshots/" + snapshot_name;
all_snapshots.emplace(snapshot_name, snapshot_details());
@@ -2010,6 +2060,7 @@ future<std::unordered_map<sstring, column_family::snapshot_details>> column_fami
});
});
});
});
}).then([&all_snapshots] {
return std::move(all_snapshots);
});
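The populate() change above follows a common seastar idiom: start per-entry work without blocking the directory scan, collect the futures, and reap them all at the end. A minimal sketch of that idiom, assuming seastar's core headers are on the include path as elsewhere in this tree (process_entry() is a hypothetical stand-in for probe_file()):
```cpp
#include "core/future.hh"
#include "core/future-util.hh"
#include "core/do_with.hh"
#include <vector>

// Hypothetical per-entry work; stands in for probe_file().
future<> process_entry(int entry);

future<> process_all(std::vector<int> entries) {
    return do_with(std::vector<future<>>(), [entries = std::move(entries)] (std::vector<future<>>& futures) {
        for (auto e : entries) {
            // Start the work but do not wait here, so the walk keeps going.
            futures.push_back(process_entry(e));
        }
        // Reap everything together; rethrow the first failure, if any.
        return when_all(futures.begin(), futures.end()).then([] (std::vector<future<>> results) {
            for (auto& f : results) {
                f.get();
            }
        });
    });
}
```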

View File

@@ -648,6 +648,8 @@ public:
}
std::unordered_set<sstring> get_initial_tokens();
std::experimental::optional<gms::inet_address> get_replace_address();
bool is_replacing();
};
// FIXME: stub
@@ -662,7 +664,7 @@ column_family::apply(const mutation& m, const db::replay_position& rp) {
seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency_in_nano(), _stats.writes.count);
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
}
}
@@ -696,7 +698,7 @@ column_family::apply(const frozen_mutation& m, const db::replay_position& rp) {
seal_on_overflow();
_stats.writes.mark(lc);
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency_in_nano(), _stats.writes.count);
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
}
}

View File

@@ -90,7 +90,7 @@ public:
db::commitlog::config::config(const db::config& cfg)
: commit_log_location(cfg.commitlog_directory())
, commitlog_total_space_in_mb(cfg.commitlog_total_space_in_mb() >= 0 ? cfg.commitlog_total_space_in_mb() : memory::stats().total_memory())
, commitlog_total_space_in_mb(cfg.commitlog_total_space_in_mb() >= 0 ? cfg.commitlog_total_space_in_mb() : memory::stats().total_memory() >> 20)
, commitlog_segment_size_in_mb(cfg.commitlog_segment_size_in_mb())
, commitlog_sync_period_in_ms(cfg.commitlog_sync_batch_window_in_ms())
, mode(cfg.commitlog_sync() == "batch" ? sync_mode::BATCH : sync_mode::PERIODIC)
@@ -1097,7 +1097,7 @@ db::commitlog::commitlog(config cfg)
: _segment_manager(new segment_manager(std::move(cfg))) {
}
db::commitlog::commitlog(commitlog&& v)
db::commitlog::commitlog(commitlog&& v) noexcept
: _segment_manager(std::move(v._segment_manager)) {
}
@@ -1173,10 +1173,11 @@ const db::commitlog::config& db::commitlog::active_config() const {
return _segment_manager->cfg;
}
future<subscription<temporary_buffer<char>, db::replay_position>>
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off) {
return engine().open_file_dma(filename, open_flags::ro).then([next = std::move(next), off](file f) {
return read_log_file(std::move(f), std::move(next), off);
return std::make_unique<subscription<temporary_buffer<char>, replay_position>>(
read_log_file(std::move(f), std::move(next), off));
});
}
@@ -1192,6 +1193,8 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
size_t next = 0;
size_t start_off = 0;
size_t skip_to = 0;
size_t file_size = 0;
size_t corrupt_size = 0;
bool eof = false;
bool header = true;
@@ -1289,7 +1292,11 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
auto cs = crc.checksum();
if (cs != checksum) {
throw std::runtime_error("Checksum error in chunk header");
// if a chunk header checksum is broken, we shall just assume that all
// remaining data is as well. We cannot trust the "next" pointer, so...
logger.debug("Checksum error in segment chunk at {}.", pos);
corrupt_size += (file_size - pos);
return stop();
}
this->next = next;
@@ -1315,21 +1322,24 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
auto size = in.read<uint32_t>();
auto checksum = in.read<uint32_t>();
if (size == 0) {
// special scylla case: zero padding due to dma blocks
auto slack = next - pos;
return skip(slack);
}
crc32_nbo crc;
crc.process(size);
if (size < 3 * sizeof(uint32_t)) {
throw std::runtime_error("Invalid entry size");
if (size < 3 * sizeof(uint32_t) || checksum != crc.checksum()) {
auto slack = next - pos;
if (size != 0) {
logger.debug("Segment entry at {} has broken header. Skipping to next chunk ({} bytes)", rp, slack);
corrupt_size += slack;
}
// size == 0 -> special scylla case: zero padding due to dma blocks
return skip(slack);
}
if (start_off > pos) {
return skip(size - entry_header_size);
}
return fin.read_exactly(size - entry_header_size).then([this, size, checksum, rp](temporary_buffer<char> buf) {
return fin.read_exactly(size - entry_header_size).then([this, size, crc = std::move(crc), rp](temporary_buffer<char> buf) mutable {
advance(buf);
data_input in(buf);
@@ -1338,12 +1348,15 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
in.skip(data_size);
auto checksum = in.read<uint32_t>();
crc32_nbo crc;
crc.process(size);
crc.process_bytes(buf.get(), data_size);
if (crc.checksum() != checksum) {
throw std::runtime_error("Checksum error in data entry");
// If we're getting a checksum error here, most likely the rest of
// the file will be corrupt as well. But it does not hurt to retry.
// Just go to the next entry (since "size" in header seemed ok).
logger.debug("Segment entry at {} checksum error. Skipping {} bytes", rp, size);
corrupt_size += size;
return make_ready_future<>();
}
return s.produce(buf.share(0, data_size), rp);
@@ -1351,10 +1364,18 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
});
}
future<> read_file() {
return read_header().then(
[this] {
return do_until(std::bind(&work::end_of_file, this), std::bind(&work::read_chunk, this));
});
return f.size().then([this](uint64_t size) {
file_size = size;
}).then([this] {
return read_header().then(
[this] {
return do_until(std::bind(&work::end_of_file, this), std::bind(&work::read_chunk, this));
}).then([this] {
if (corrupt_size > 0) {
throw segment_data_corruption_error("Data corruption", corrupt_size);
}
});
});
}
};
@@ -1382,6 +1403,10 @@ uint64_t db::commitlog::get_completed_tasks() const {
return _segment_manager->totals.allocation_count;
}
uint64_t db::commitlog::get_flush_count() const {
return _segment_manager->totals.flush_count;
}
uint64_t db::commitlog::get_pending_tasks() const {
return _segment_manager->totals.pending_operations;
}
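Net effect of the reader changes above: a broken chunk-header checksum abandons the rest of the segment (the "next" pointer can no longer be trusted), a broken entry header skips ahead to the next chunk, and a broken data checksum skips just that one entry; every skipped byte is tallied in corrupt_size and surfaced once, at end of file, as segment_data_corruption_error.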

View File

@@ -139,7 +139,7 @@ public:
const uint32_t ver;
};
commitlog(commitlog&&);
commitlog(commitlog&&) noexcept;
~commitlog();
/**
@@ -231,6 +231,7 @@ public:
uint64_t get_total_size() const;
uint64_t get_completed_tasks() const;
uint64_t get_flush_count() const;
uint64_t get_pending_tasks() const;
uint64_t get_num_segments_created() const;
uint64_t get_num_segments_destroyed() const;
@@ -265,8 +266,21 @@ public:
typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;
class segment_data_corruption_error: public std::runtime_error {
public:
segment_data_corruption_error(std::string msg, uint64_t s)
: std::runtime_error(msg), _bytes(s) {
}
uint64_t bytes() const {
return _bytes;
}
private:
uint64_t _bytes;
};
static subscription<temporary_buffer<char>, replay_position> read_log_file(file, commit_load_reader_func, position_type = 0);
static future<subscription<temporary_buffer<char>, replay_position>> read_log_file(const sstring&, commit_load_reader_func, position_type = 0);
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
const sstring&, commit_load_reader_func, position_type = 0);
private:
commitlog(config);
};

View File

@@ -69,6 +69,7 @@ public:
uint64_t invalid_mutations = 0;
uint64_t skipped_mutations = 0;
uint64_t applied_mutations = 0;
uint64_t corrupt_bytes = 0;
};
future<> process(stats*, temporary_buffer<char> buf, replay_position rp);
@@ -166,9 +167,16 @@ db::commitlog_replayer::impl::recover(sstring file) {
return db::commitlog::read_log_file(file,
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
std::placeholders::_2), p).then([](auto s) {
auto f = s.done();
auto f = s->done();
return f.finally([s = std::move(s)] {});
}).then([s] {
}).then_wrapped([s](future<> f) {
try {
f.get();
} catch (commitlog::segment_data_corruption_error& e) {
s->corrupt_bytes += e.bytes();
} catch (...) {
throw;
}
return make_ready_future<stats>(*s);
});
}
@@ -233,7 +241,7 @@ db::commitlog_replayer::commitlog_replayer(seastar::sharded<cql3::query_processo
: _impl(std::make_unique<impl>(qp))
{}
db::commitlog_replayer::commitlog_replayer(commitlog_replayer&& r)
db::commitlog_replayer::commitlog_replayer(commitlog_replayer&& r) noexcept
: _impl(std::move(r._impl))
{}
@@ -250,31 +258,32 @@ future<db::commitlog_replayer> db::commitlog_replayer::create_replayer(seastar::
}
future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
logger.info("Replaying {}", files);
return parallel_for_each(files, [this](auto f) {
return this->recover(f).handle_exception([f](auto ep) {
logger.error("Error recovering {}: {}", f, ep);
try {
std::rethrow_exception(ep);
} catch (std::invalid_argument&) {
logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.");
throw;
} catch (...) {
throw;
}
});
return this->recover(f);
});
}
future<> db::commitlog_replayer::recover(sstring file) {
return _impl->recover(file).then([file](impl::stats stats) {
future<> db::commitlog_replayer::recover(sstring f) {
return _impl->recover(f).then([f](impl::stats stats) {
if (stats.corrupt_bytes != 0) {
logger.warn("Corrupted file: {}. {} bytes skipped.", f, stats.corrupt_bytes);
}
logger.info("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
, file
, f
, stats.applied_mutations
, stats.invalid_mutations
, stats.skipped_mutations
);
});
}).handle_exception([f](auto ep) {
logger.error("Error recovering {}: {}", f, ep);
try {
std::rethrow_exception(ep);
} catch (std::invalid_argument&) {
logger.error("Scylla cannot process {}. Make sure to fully flush all Cassandra commit log files to sstable before migrating.");
throw;
} catch (...) {
throw;
}
});
}

View File

@@ -57,7 +57,7 @@ class commitlog;
class commitlog_replayer {
public:
commitlog_replayer(commitlog_replayer&&);
commitlog_replayer(commitlog_replayer&&) noexcept;
~commitlog_replayer();
static future<commitlog_replayer> create_replayer(seastar::sharded<cql3::query_processor>&);

View File

@@ -290,7 +290,7 @@ public:
"Related information: Configuring compaction" \
) \
/* Common fault detection setting */ \
val(phi_convict_threshold, uint32_t, 8, Unused, \
val(phi_convict_threshold, uint32_t, 8, Used, \
"Adjusts the sensitivity of the failure detector on an exponential scale. Generally this setting never needs adjusting.\n" \
"Related information: Failure detection and recovery" \
) \
@@ -560,7 +560,7 @@ public:
) \
/* RPC (remote procedure call) settings */ \
/* Settings for configuring and tuning client connections. */ \
val(broadcast_rpc_address, sstring, /* unset */, Unused, \
val(broadcast_rpc_address, sstring, /* unset */, Used, \
"RPC address to broadcast to drivers and other Cassandra nodes. This cannot be set to 0.0.0.0. If blank, it is set to the value of the rpc_address or rpc_interface. If rpc_address or rpc_interfaceis set to 0.0.0.0, this property must be set.\n" \
) \
val(rpc_port, uint16_t, 9160, Used, \
@@ -743,6 +743,13 @@ public:
val(api_ui_dir, sstring, "swagger-ui/dist/", Used, "The directory location of the API GUI") \
val(api_doc_dir, sstring, "api/api-doc/", Used, "The API definition file directory") \
val(load_balance, sstring, "none", Used, "CQL request load balancing: 'none' or round-robin'") \
val(consistent_rangemovement, bool, true, Used, "When set to true, range movements will be consistent. It means: 1) it will refuse to bootstrap a new node if other bootstrapping/leaving/moving nodes are detected. 2) data will be streamed to a new node only from the node which is no longer responsible for the token range. Same as -Dcassandra.consistent.rangemovement in cassandra") \
val(join_ring, bool, true, Used, "When set to true, a node will join the token ring. When set to false, a node will not join the token ring. The user can use nodetool join to initiate ring joining later. Same as -Dcassandra.join_ring in cassandra.") \
val(load_ring_state, bool, true, Used, "When set to true, load tokens and host_ids previously saved. Same as -Dcassandra.load_ring_state in cassandra.") \
val(replace_node, sstring, "", Used, "The UUID of the node to replace. Same as -Dcassandra.replace_node in cassandra.") \
val(replace_token, sstring, "", Used, "The tokens of the node to replace. Same as -Dcassandra.replace_token in cassandra.") \
val(replace_address, sstring, "", Used, "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.") \
val(replace_address_first_boot, sstring, "", Used, "Like the replace_address option, but it is ignored if the node has already bootstrapped successfully. Same as -Dcassandra.replace_address_first_boot.") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \
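An illustrative scylla.yaml fragment using the new replace options (the address is made up; per database::is_replacing() above, replace_address_first_boot is ignored once the node has bootstrapped):
```
# Stream data in place of the dead node at 10.0.0.5 (made-up address).
replace_address_first_boot: 10.0.0.5
# Keep range movements consistent while doing so.
consistent_rangemovement: true
```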

View File

@@ -398,18 +398,18 @@ read_schema_for_keyspaces(distributed<service::storage_proxy>& proxy, const sstr
return map_reduce(keyspace_names.begin(), keyspace_names.end(), map, schema_result{}, insert);
}
future<schema_result::value_type>
future<schema_result_value_type>
read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name)
{
auto schema = proxy.local().get_db().local().find_schema(system_keyspace::NAME, schema_table_name);
auto keyspace_key = dht::global_partitioner().decorate_key(*schema,
partition_key::from_singular(*schema, keyspace_name));
return db::system_keyspace::query(proxy, schema_table_name, keyspace_key).then([keyspace_name] (auto&& rs) {
return schema_result::value_type{keyspace_name, std::move(rs)};
return schema_result_value_type{keyspace_name, std::move(rs)};
});
}
future<schema_result::value_type>
future<schema_result_value_type>
read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name, const sstring& table_name)
{
auto schema = proxy.local().get_db().local().find_schema(system_keyspace::NAME, schema_table_name);
@@ -417,7 +417,7 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, cons
partition_key::from_singular(*schema, keyspace_name));
auto clustering_range = query::clustering_range(clustering_key_prefix::from_clustering_prefix(*schema, exploded_clustering_prefix({utf8_type->decompose(table_name)})));
return db::system_keyspace::query(proxy, schema_table_name, keyspace_key, clustering_range).then([keyspace_name] (auto&& rs) {
return schema_result::value_type{keyspace_name, std::move(rs)};
return schema_result_value_type{keyspace_name, std::move(rs)};
});
}
@@ -528,7 +528,7 @@ future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector
future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
{
std::vector<schema_result::value_type> created;
std::vector<schema_result_value_type> created;
std::vector<sstring> altered;
std::set<sstring> dropped;
@@ -552,7 +552,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
for (auto&& key : diff.entries_only_on_right) {
auto&& value = after[key];
if (!value->empty()) {
created.emplace_back(schema_result::value_type{key, std::move(value)});
created.emplace_back(schema_result_value_type{key, std::move(value)});
}
}
for (auto&& key : diff.entries_differing) {
@@ -566,7 +566,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
} else if (!pre->empty()) {
dropped.emplace(keyspace_name);
} else if (!post->empty()) { // a (re)created keyspace
created.emplace_back(schema_result::value_type{key, std::move(post)});
created.emplace_back(schema_result_value_type{key, std::move(post)});
}
}
return do_with(std::move(created), [&proxy, altered = std::move(altered)] (auto& created) {
@@ -899,7 +899,7 @@ std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metada
*
* @param partition Keyspace attributes in serialized form
*/
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result::value_type& result)
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& result)
{
auto&& rs = result.second;
if (rs->empty()) {
@@ -1310,10 +1310,10 @@ void create_table_from_table_row_and_column_rows(schema_builder& builder, const
builder.set_max_compaction_threshold(table_row.get_nonnull<int>("max_compaction_threshold"));
}
#if 0
if (result.has("comment"))
cfm.comment(result.getString("comment"));
#endif
if (table_row.has("comment")) {
builder.set_comment(table_row.get_nonnull<sstring>("comment"));
}
if (table_row.has("memtable_flush_period_in_ms")) {
builder.set_memtable_flush_period(table_row.get_nonnull<int32_t>("memtable_flush_period_in_ms"));
}

View File

@@ -55,6 +55,7 @@ namespace db {
namespace schema_tables {
using schema_result = std::map<sstring, lw_shared_ptr<query::result_set>>;
using schema_result_value_type = std::pair<sstring, lw_shared_ptr<query::result_set>>;
static constexpr auto KEYSPACES = "schema_keyspaces";
static constexpr auto COLUMNFAMILIES = "schema_columnfamilies";
@@ -74,7 +75,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<service::storage_proxy>& proxy);
future<schema_result::value_type>
future<schema_result_value_type>
read_schema_partition_for_keyspace(distributed<service::storage_proxy>& proxy, const sstring& schema_table_name, const sstring& keyspace_name);
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations);
@@ -89,11 +90,11 @@ std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_meta
std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp);
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result::value_type& partition);
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);
future<> merge_tables(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result::value_type& partition);
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result_value_type& partition);
mutation make_create_keyspace_mutation(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);

View File

@@ -26,7 +26,6 @@
#include "utils/data_output.hh"
#include "bytes_ostream.hh"
#include "bytes.hh"
#include "mutation.hh"
#include "keys.hh"
#include "database_fwd.hh"
#include "frozen_mutation.hh"

View File

@@ -45,6 +45,7 @@
#include "log.hh"
#include "streaming/stream_plan.hh"
#include "streaming/stream_state.hh"
#include "service/storage_service.hh"
namespace dht {
@@ -109,7 +110,6 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, st
auto& ks = _db.local().find_keyspace(keyspace_name);
auto& strat = ks.get_replication_strategy();
// std::unordered_multimap<range<token>, inet_address>
auto tm = _metadata.clone_only_token_map();
auto range_addresses = unordered_multimap_to_unordered_map(strat.get_range_addresses(tm));
@@ -205,9 +205,7 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name) {
auto& ks = _db.local().find_keyspace(keyspace_name);
auto& strat = ks.get_replication_strategy();
// FIXME: DatabaseDescriptor.isReplacing()
auto is_replacing = false;
return !is_replacing
return !_db.local().is_replacing()
&& use_strict_consistency()
&& !_tokens.empty()
&& _metadata.get_all_endpoints().size() != strat.get_replication_factor();
@@ -224,25 +222,17 @@ void range_streamer::add_ranges(const sstring& keyspace_name, std::vector<range<
}
}
// TODO: share code with unordered_multimap_to_unordered_map
std::unordered_map<inet_address, std::vector<range<token>>> tmp;
std::unordered_map<inet_address, std::vector<range<token>>> range_fetch_map;
for (auto& x : get_range_fetch_map(ranges_for_keyspace, _source_filters, keyspace_name)) {
auto& addr = x.first;
auto& range_ = x.second;
auto it = tmp.find(addr);
if (it != tmp.end()) {
it->second.push_back(range_);
} else {
tmp.emplace(addr, std::vector<range<token>>{range_});
}
range_fetch_map[x.first].emplace_back(x.second);
}
if (logger.is_enabled(logging::log_level::debug)) {
for (auto& x : tmp) {
for (auto& x : range_fetch_map) {
logger.debug("{} : range {} from source {} for keyspace {}", _description, x.second, x.first, keyspace_name);
}
}
_to_fetch.emplace(keyspace_name, std::move(tmp));
_to_fetch.emplace(keyspace_name, std::move(range_fetch_map));
}
future<streaming::stream_state> range_streamer::fetch_async() {
@@ -272,4 +262,8 @@ range_streamer::get_work_map(const std::unordered_multimap<range<token>, inet_ad
return get_range_fetch_map(ranges_with_source_target, source_filters, keyspace);
}
bool range_streamer::use_strict_consistency() {
return service::get_local_storage_service().db().local().get_config().consistent_rangemovement();
}
} // dht
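The add_ranges() simplification leans on unordered_map::operator[] default-constructing the vector on first access, so the explicit find/emplace branch collapses into a single emplace_back per fetched range.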

View File

@@ -62,10 +62,7 @@ public:
using stream_plan = streaming::stream_plan;
using stream_state = streaming::stream_state;
using i_failure_detector = gms::i_failure_detector;
static bool use_strict_consistency() {
//FIXME: Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement","true"));
return true;
}
static bool use_strict_consistency();
public:
/**
* A filter applied to sources to stream from when constructing a fetch map.

View File

@@ -1,52 +1,56 @@
#!/bin/sh -e
if [ -b /dev/md0 ]; then
RAIDCNT=`grep xvdb /proc/mdstat | wc -l`
RAIDDEV=`grep xvdb /proc/mdstat | awk '{print $1}'`
if [ $RAIDCNT -ge 1 ]; then
echo "RAID already constructed."
exit 1
fi
dnf update -y
DISKS=""
NR=0
for i in xvd{b..z}; do
if [ -b /dev/$i ];then
echo Found disk /dev/$i
DISKS="$DISKS /dev/$i"
NR=$((NR+1))
fi
done
echo Creating RAID0 for scylla using $NR disk\(s\): $DISKS
if [ $NR -ge 1 ]; then
mdadm --create --verbose --force --run /dev/md0 --level=0 -c256 --raid-devices=$NR $DISKS
blockdev --setra 65536 /dev/md0
mkfs.xfs /dev/md0 -f
echo "DEVICE $DISKS" > /etc/mdadm.conf
mdadm --detail --scan >> /etc/mdadm.conf
UUID=`blkid /dev/md0 | awk '{print $2}'`
mkdir /data
echo "$UUID /data xfs noatime,discard 0 0" >> /etc/fstab
mount /data
mount -o noatime /dev/$RAIDDEV /var/lib/scylla
else
echo "WARN: Scylla is not using XFS to store data. Perforamnce will suffer." > /home/fedora/WARN_PLEASE_READ.TXT
echo "RAID does not constructed, going to initialize..."
dnf update -y
DISKS=""
NR=0
for i in xvd{b..z}; do
if [ -b /dev/$i ];then
echo Found disk /dev/$i
DISKS="$DISKS /dev/$i"
NR=$((NR+1))
fi
done
echo Creating RAID0 for scylla using $NR disk\(s\): $DISKS
if [ $NR -ge 1 ]; then
mdadm --create --verbose --force --run /dev/md0 --level=0 -c256 --raid-devices=$NR $DISKS
blockdev --setra 65536 /dev/md0
mkfs.xfs /dev/md0 -f
echo "DEVICE $DISKS" > /etc/mdadm.conf
mdadm --detail --scan >> /etc/mdadm.conf
UUID=`blkid /dev/md0 | awk '{print $2}'`
mount -o noatime /dev/md0 /var/lib/scylla
else
echo "WARN: Scylla is not using XFS to store data. Perforamnce will suffer." > /home/fedora/WARN_PLEASE_READ.TXT
fi
mkdir -p /var/lib/scylla/data
mkdir -p /var/lib/scylla/commitlog
chown scylla:scylla /var/lib/scylla/*
chown scylla:scylla /var/lib/scylla/
CPU_NR=`cat /proc/cpuinfo |grep processor|wc -l`
if [ $CPU_NR -ge 8 ]; then
NR=$((CPU_NR - 1))
grep -v SCYLLA_ARGS /etc/sysconfig/scylla-server | grep -v SET_NIC > /tmp/scylla-server
echo SCYLLA_ARGS=\"--cpuset 1-$NR --smp $NR\" >> /tmp/scylla-server
echo SET_NIC=\"yes\" >> /tmp/scylla-server
mv /tmp/scylla-server /etc/sysconfig/scylla-server
fi
/usr/lib/scylla/scylla-ami/ds2_configure.py
fi
mkdir -p /data/data
mkdir -p /data/commitlog
chown scylla:scylla /data/*
CPU_NR=`cat /proc/cpuinfo |grep processor|wc -l`
if [ $CPU_NR -ge 8 ]; then
NR=$((CPU_NR - 1))
echo SCYLLA_ARGS=\"--cpuset 1-$NR --smp $NR\" >> /etc/sysconfig/scylla-server
echo SET_NIC=\"yes\" >> /etc/sysconfig/scylla-server
fi
/usr/lib/scylla/scylla-ami/ds2_configure.py
systemctl disable scylla-setup.service
systemctl enable scylla-server.service
systemctl start scylla-server.service
systemctl enable scylla-jmx.service
systemctl start scylla-jmx.service

View File

@@ -14,7 +14,5 @@ chmod a+rx /usr/lib/scylla/scylla-setup.sh
mv /home/fedora/scylla-ami /usr/lib/scylla/scylla-ami
chmod a+rx /usr/lib/scylla/scylla-ami/ds2_configure.py
systemctl enable scylla-setup.service
sed -e 's!/var/lib/scylla/data!/data/data!' -e 's!commitlog_directory: /var/lib/scylla/commitlog!commitlog_directory: /data/commitlog!' /var/lib/scylla/conf/scylla.yaml > /tmp/scylla.yaml
mv /tmp/scylla.yaml /var/lib/scylla/conf
grep -v ' - mounts' /etc/cloud/cloud.cfg > /tmp/cloud.cfg
mv /tmp/cloud.cfg /etc/cloud/cloud.cfg

View File

@@ -13,6 +13,10 @@ elif [ "$NETWORK_MODE" = "dpdk" ]; then
for n in /sys/devices/system/node/node?; do
echo $NR_HUGEPAGES > $n/hugepages/hugepages-2048kB/nr_hugepages
done
else # NETWORK_MODE = posix
if [ "$SET_NIC" = "yes" ]; then
sudo sh /usr/lib/scylla/posix_net_conf.sh >/dev/null 2>&1 || true
fi
fi
. /etc/os-release
if [ "$NAME" = "Ubuntu" ]; then

View File

@@ -10,9 +10,5 @@ elif [ "$NETWORK_MODE" = "dpdk" ]; then
args="$args --network-stack native --dpdk-pmd"
fi
if [ "$SET_NIC" == "yes" ]; then
sudo sh /usr/lib/scylla/posix_net_conf.sh >/dev/null 2>&1 || true
fi
export HOME=/var/lib/scylla
exec sudo -E -u $USER /usr/bin/scylla $args

View File

@@ -21,7 +21,7 @@ if [ ! -f isl-0.14-3.fc22.src.rpm ]; then
fi
if [ ! -f gcc-5.1.1-4.fc22.src.rpm ]; then
wget http://download.fedoraproject.org/pub/fedora/linux/updates/22/SRPMS/g/gcc-5.1.1-4.fc22.src.rpm
wget https://s3.amazonaws.com/scylla-centos-dep/gcc-5.1.1-4.fc22.src.rpm
fi
if [ ! -f boost-1.57.0-6.fc22.src.rpm ]; then

View File

@@ -57,6 +57,10 @@ install -m644 licenses/* $RPM_BUILD_ROOT%{_docdir}/scylla/licenses/
install -d -m755 $RPM_BUILD_ROOT%{_sharedstatedir}/scylla/
install -d -m755 $RPM_BUILD_ROOT%{_sharedstatedir}/scylla/data
install -d -m755 $RPM_BUILD_ROOT%{_sharedstatedir}/scylla/commitlog
install -d -m755 $RPM_BUILD_ROOT%{_prefix}/lib/scylla/swagger-ui
cp -r swagger-ui/dist $RPM_BUILD_ROOT%{_prefix}/lib/scylla/swagger-ui
install -d -m755 $RPM_BUILD_ROOT%{_prefix}/lib/scylla/api
cp -r api/api-doc $RPM_BUILD_ROOT%{_prefix}/lib/scylla/api
%pre
/usr/sbin/groupadd scylla 2> /dev/null || :
@@ -73,6 +77,11 @@ if [ -d /var/lib/scylla/conf ] && [ ! -L /var/lib/scylla/conf ]; then
fi
%post
grep -v api_ui_dir /etc/scylla/scylla.yaml | grep -v api_doc_dir > /tmp/scylla.yaml
echo "api_ui_dir: /usr/lib/scylla/swagger-ui/dist/" >> /tmp/scylla.yaml
echo "api_doc_dir: /usr/lib/scylla/api/api-doc/" >> /tmp/scylla.yaml
mv /tmp/scylla.yaml /etc/scylla/scylla.yaml
%systemd_post scylla-server.service
%preun
@@ -113,6 +122,8 @@ rm -rf $RPM_BUILD_ROOT
%{_prefix}/lib/scylla/dpdk_nic_bind.py
%{_prefix}/lib/scylla/dpdk_nic_bind.pyc
%{_prefix}/lib/scylla/dpdk_nic_bind.pyo
%{_prefix}/lib/scylla/swagger-ui/dist/*
%{_prefix}/lib/scylla/api/api-doc/*
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog

View File

@@ -10,6 +10,14 @@ if [ -e debian ] || [ -e build/release ]; then
mkdir build
fi
RELEASE=`lsb_release -r|awk '{print $2}'`
CODENAME=`lsb_release -c|awk '{print $2}'`
if [ `grep -c $RELEASE dist/ubuntu/supported_release` -lt 1 ]; then
echo "Unsupported release: $RELEASE"
echo "Pless any key to continue..."
read input
fi
VERSION=$(./SCYLLA-VERSION-GEN)
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)
@@ -24,14 +32,29 @@ cp dist/common/sysconfig/scylla-server debian/scylla-server.default
cp dist/ubuntu/changelog.in debian/changelog
sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" debian/changelog
sed -i -e "s/@@RELEASE@@/$SCYLLA_RELEASE/g" debian/changelog
sed -i -e "s/@@CODENAME@@/$CODENAME/g" debian/changelog
sudo apt-get -y update
./dist/ubuntu/dep/build_dependency.sh
sudo apt-get -y install libyaml-cpp-dev liblz4-dev libsnappy-dev libcrypto++-dev libboost1.55-dev libjsoncpp-dev libaio-dev ragel ninja-build git libyaml-cpp0.5 liblz4-1 libsnappy1 libcrypto++9 libboost-program-options1.55.0 libboost-program-options1.55-dev libboost-system1.55.0 libboost-system1.55-dev libboost-thread1.55.0 libboost-thread1.55-dev libboost-test1.55.0 libboost-test1.55-dev libjsoncpp0 libaio1 hugepages software-properties-common libboost-filesystem1.55-dev libboost-filesystem1.55.0
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
sudo apt-get -y update
DEP="libyaml-cpp-dev liblz4-dev libsnappy-dev libcrypto++-dev libjsoncpp-dev libaio-dev ragel ninja-build git liblz4-1 libaio1 hugepages software-properties-common"
if [ "$RELEASE" = "14.04" ]; then
DEP="$DEP libboost1.55-dev libboost-program-options1.55.0 libboost-program-options1.55-dev libboost-system1.55.0 libboost-system1.55-dev libboost-thread1.55.0 libboost-thread1.55-dev libboost-test1.55.0 libboost-test1.55-dev libboost-filesystem1.55-dev libboost-filesystem1.55.0 libsnappy1"
else
DEP="$DEP libboost-dev libboost-program-options-dev libboost-system-dev libboost-thread-dev libboost-test-dev libboost-filesystem-dev libboost-filesystem-dev libsnappy1v5"
fi
if [ "$RELEASE" = "15.10" ]; then
DEP="$DEP libjsoncpp0v5 libcrypto++9v5 libyaml-cpp0.5v5 antlr3"
else
DEP="$DEP libjsoncpp0 libcrypto++9 libyaml-cpp0.5"
fi
sudo apt-get -y install $DEP
if [ "$RELEASE" != "15.10" ]; then
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
sudo apt-get -y update
fi
sudo apt-get -y install g++-5
debuild -r fakeroot -us -uc

View File

@@ -1,4 +1,4 @@
scylla-server (@@VERSION@@-@@RELEASE@@-ubuntu1) trusty; urgency=medium
scylla-server (@@VERSION@@-@@RELEASE@@-ubuntu1) @@CODENAME@@; urgency=medium
* Initial release.

View File

@@ -4,7 +4,7 @@ Homepage: http://scylladb.com
Section: database
Priority: optional
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3-tool, antlr3-c++-dev, ragel, g++-5, ninja-build, git, libboost-program-options1.55-dev, libboost-filesystem1.55-dev, libboost-system1.55-dev, libboost-thread1.55-dev, libboost-test1.55-dev
Build-Depends: debhelper (>= 9), libyaml-cpp-dev, liblz4-dev, libsnappy-dev, libcrypto++-dev, libjsoncpp-dev, libaio-dev, libthrift-dev, thrift-compiler, antlr3, antlr3-c++-dev, ragel, g++-5, ninja-build, git, libboost-program-options1.55-dev | libboost-program-options-dev, libboost-filesystem1.55-dev | libboost-filesystem-dev, libboost-system1.55-dev | libboost-system-dev, libboost-thread1.55-dev | libboost-thread-dev, libboost-test1.55-dev | libboost-test-dev
Package: scylla-server
Architecture: amd64

View File

@@ -2,6 +2,8 @@
DOC = $(CURDIR)/debian/scylla-server/usr/share/doc/scylla-server
SCRIPTS = $(CURDIR)/debian/scylla-server/usr/lib/scylla
SWAGGER = $(SCRIPTS)/swagger-ui
API = $(SCRIPTS)/api
LIMITS= $(CURDIR)/debian/scylla-server/etc/security/limits.d
LIBS = $(CURDIR)/debian/scylla-server/usr/lib
CONF = $(CURDIR)/debian/scylla-server/etc/scylla
@@ -33,6 +35,12 @@ override_dh_auto_install:
cp $(CURDIR)/seastar/dpdk/tools/dpdk_nic_bind.py $(SCRIPTS)
cp $(CURDIR)/dist/common/scripts/* $(SCRIPTS)
mkdir -p $(SWAGGER) && \
cp -r $(CURDIR)/swagger-ui/dist $(SWAGGER)
mkdir -p $(API) && \
cp -r $(CURDIR)/api/api-doc $(API)
mkdir -p $(CURDIR)/debian/scylla-server/usr/bin/ && \
cp $(CURDIR)/build/release/scylla \
$(CURDIR)/debian/scylla-server/usr/bin/


@@ -14,4 +14,9 @@ fi
ln -sfT /etc/scylla /var/lib/scylla/conf
grep -v api_ui_dir /etc/scylla/scylla.yaml | grep -v api_doc_dir > /tmp/scylla.yaml
echo "api_ui_dir: /usr/lib/scylla/swagger-ui/dist/" >> /tmp/scylla.yaml
echo "api_doc_dir: /usr/lib/scylla/api/api-doc/" >> /tmp/scylla.yaml
mv /tmp/scylla.yaml /etc/scylla/scylla.yaml
#DEBHELPER#


@@ -1,4 +1,4 @@
antlr3-tool (3.5.2-ubuntu1) trusty; urgency=medium
antlr3 (3.5.2-ubuntu1) trusty; urgency=medium
* Initial release.


@@ -1,12 +1,13 @@
Source: antlr3-tool
Source: antlr3
Maintainer: Takuya ASADA <syuu@scylladb.com>
Section: misc
Priority: optional
Standards-Version: 3.5.2
Build-Depends: debhelper (>= 9)
Package: antlr3-tool
Package: antlr3
Architecture: all
Depends: ${shlibs:Depends}, ${misc:Depends}, openjdk-7-jre-headless
Replaces: antlr3-tool
Description: language tool for constructing recognizers, compilers, etc.
A language tool that provides a framework for constructing recognizers, interpreters, compilers, and translators from grammatical descriptions containing actions in a variety of target languages.

dist/ubuntu/dep/antlr3-3.5.2/debian/rules vendored Executable file

@@ -0,0 +1,12 @@
#!/usr/bin/make -f
override_dh_auto_install:
mkdir -p $(CURDIR)/debian/antlr3/usr/share/java
cp $(CURDIR)/antlr-3.5.2-complete-no-st3.jar \
$(CURDIR)/debian/antlr3/usr/share/java
mkdir -p $(CURDIR)/debian/antlr3/usr/bin
cp $(CURDIR)/antlr3 \
$(CURDIR)/debian/antlr3/usr/bin
%:
dh $@


@@ -1,12 +0,0 @@
#!/usr/bin/make -f
override_dh_auto_install:
mkdir -p $(CURDIR)/debian/antlr3-tool/usr/share/java
cp $(CURDIR)/antlr-3.5.2-complete-no-st3.jar \
$(CURDIR)/debian/antlr3-tool/usr/share/java
mkdir -p $(CURDIR)/debian/antlr3-tool/usr/bin
cp $(CURDIR)/antlr3 \
$(CURDIR)/debian/antlr3-tool/usr/bin
%:
dh $@


@@ -1,15 +1,25 @@
#!/bin/sh -e
sudo apt-get -y install build-essential debhelper openjdk-7-jre-headless build-essential autoconf automake pkg-config libtool bison flex libboost1.55-dev libboost-test1.55-dev libevent-dev libglib2.0-dev libqt4-dev python-dev python-dbg php5-dev devscripts python-support xfslibs-dev
RELEASE=`lsb_release -r|awk '{print $2}'`
DEP="build-essential debhelper openjdk-7-jre-headless build-essential autoconf automake pkg-config libtool bison flex libevent-dev libglib2.0-dev libqt4-dev python-dev python-dbg php5-dev devscripts python-support xfslibs-dev"
if [ ! -f build/antlr3-tool_3.5.2-1_all.deb ]; then
rm -rf build/antlr3-tool-3.5.2
mkdir -p build/antlr3-tool-3.5.2
cp -a dist/ubuntu/dep/antlr3-tool-3.5.2/* build/antlr3-tool-3.5.2
cd build/antlr3-tool-3.5.2
wget http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
debuild -r fakeroot --no-tgz-check -us -uc
cd -
if [ "$RELEASE" = "14.04" ]; then
DEP="$DEP libboost1.55-dev libboost-test1.55-dev"
else
DEP="$DEP libboost-dev libboost-test-dev"
fi
sudo apt-get -y install $DEP
if [ "$RELEASE" = "14.04" ]; then
if [ ! -f build/antlr3_3.5.2-1_all.deb ]; then
rm -rf build/antlr3-3.5.2
mkdir -p build/antlr3-3.5.2
cp -a dist/ubuntu/dep/antlr3-3.5.2/* build/antlr3-3.5.2
cd build/antlr3-3.5.2
wget http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
debuild -r fakeroot --no-tgz-check -us -uc
cd -
fi
fi
if [ ! -f build/antlr3-c++-dev_3.5.2-1_all.deb ]; then

dist/ubuntu/supported_release vendored Normal file

@@ -0,0 +1,2 @@
14.04
15.10


@@ -21,6 +21,8 @@
#include "db/serializer.hh"
#include "frozen_mutation.hh"
#include "mutation_partition.hh"
#include "mutation.hh"
#include "partition_builder.hh"
#include "mutation_partition_serializer.hh"
#include "utils/UUID.hh"


@@ -23,9 +23,9 @@
#include "atomic_cell.hh"
#include "keys.hh"
#include "mutation.hh"
#include "mutation_partition_view.hh"
class mutation;
// Immutable, compact form of mutation.
//


@@ -43,12 +43,15 @@
#include "gms/endpoint_state.hh"
#include "gms/application_state.hh"
#include "gms/inet_address.hh"
#include "log.hh"
#include <iostream>
#include <chrono>
namespace gms {
extern logging::logger logger;
static logging::logger logger("failure_detector");
constexpr std::chrono::milliseconds failure_detector::DEFAULT_MAX_PAUSE;
using clk = arrival_window::clk;
@@ -86,13 +89,13 @@ clk::duration arrival_window::get_max_interval() {
return get_initial_value();
}
void arrival_window::add(clk::time_point value) {
void arrival_window::add(clk::time_point value, const gms::inet_address& ep) {
if (_tlast > clk::time_point::min()) {
auto inter_arrival_time = value - _tlast;
if (inter_arrival_time <= get_max_interval()) {
_arrival_intervals.add(inter_arrival_time.count());
} else {
logger.debug("failure_detector: Ignoring interval time of {}", inter_arrival_time.count());
logger.debug("failure_detector: Ignoring interval time of {} for {}", inter_arrival_time.count(), ep);
}
} else {
// We use a very large initial interval since the "right" average depends on the cluster size
@@ -186,27 +189,28 @@ sstring failure_detector::get_endpoint_state(sstring address) {
void failure_detector::append_endpoint_state(std::stringstream& ss, endpoint_state& state) {
ss << " generation:" << state.get_heart_beat_state().get_generation() << "\n";
ss << " heartbeat:" << state.get_heart_beat_state().get_heart_beat_version() << "\n";
for (auto& entry : state.get_application_state_map()) {
for (const auto& entry : state.get_application_state_map()) {
auto& app_state = entry.first;
auto& value = entry.second;
auto& versioned_val = entry.second;
if (app_state == application_state::TOKENS) {
continue;
}
// FIXME: Add operator<< for application_state
ss << " " << int32_t(app_state) << ":" << value.value << "\n";
ss << " " << app_state << ":" << versioned_val.version << ":" << versioned_val.value << "\n";
}
const auto& app_state_map = state.get_application_state_map();
if (app_state_map.count(application_state::TOKENS)) {
ss << " TOKENS:" << app_state_map.at(application_state::TOKENS).version << ":<hidden>\n";
} else {
ss << " TOKENS: not present" << "\n";
}
}
void failure_detector::set_phi_convict_threshold(double phi) {
// FIXME
// DatabaseDescriptor.setPhiConvictThreshold(phi);
_phi = phi;
}
double failure_detector::get_phi_convict_threshold() {
// FIXME: phi_convict_threshold must be between 5 and 16
// return DatabaseDescriptor.getPhiConvictThreshold();
warn(unimplemented::cause::GOSSIP);
return 8;
return _phi;
}
bool failure_detector::is_alive(inet_address ep) {
@@ -220,10 +224,10 @@ void failure_detector::report(inet_address ep) {
if (it == _arrival_samples.end()) {
// avoid adding an empty ArrivalWindow to the Map
auto heartbeat_window = arrival_window(SAMPLE_SIZE);
heartbeat_window.add(now);
heartbeat_window.add(now, ep);
_arrival_samples.emplace(ep, heartbeat_window);
} else {
it->second.add(now);
it->second.add(now, ep);
}
}
@@ -235,8 +239,20 @@ void failure_detector::interpret(inet_address ep) {
}
arrival_window& hb_wnd = it->second;
auto now = clk::now();
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(now - _last_interpret);
_last_interpret = now;
if (diff > get_max_local_pause()) {
logger.warn("Not marking nodes down due to local pause of {} > {} (milliseconds)", diff.count(), get_max_local_pause().count());
_last_paused = now;
return;
}
if (clk::now() - _last_paused < get_max_local_pause()) {
logger.debug("Still not marking nodes down due to local pause");
return;
}
double phi = hb_wnd.phi(now);
logger.trace("failure_detector: PHI for {} : {}", ep, phi);
logger.trace("failure_detector: phi_convict_threshold={}", _phi);
if (PHI_FACTOR * phi > get_phi_convict_threshold()) {
logger.trace("failure_detector: notifying listeners that {} is down", ep);


@@ -78,7 +78,7 @@ public:
// this value defaults to the same initial value the FD is seeded with
static clk::duration get_max_interval();
void add(clk::time_point value);
void add(clk::time_point value, const gms::inet_address& ep);
double mean();
@@ -105,9 +105,34 @@ private:
static constexpr double PHI_FACTOR{1.0 / std::log(10.0)}; // 0.434...
std::map<inet_address, arrival_window> _arrival_samples;
std::list<i_failure_detection_event_listener*> _fd_evnt_listeners;
double _phi = 8;
static constexpr std::chrono::milliseconds DEFAULT_MAX_PAUSE{5000};
std::chrono::milliseconds get_max_local_pause() {
// FIXME: cassandra.max_local_pause_in_ms
#if 0
if (System.getProperty("cassandra.max_local_pause_in_ms") != null) {
long pause = Long.parseLong(System.getProperty("cassandra.max_local_pause_in_ms"));
logger.warn("Overriding max local pause time to {}ms", pause);
return pause * 1000000L;
} else {
return DEFAULT_MAX_PAUSE;
}
#endif
return DEFAULT_MAX_PAUSE;
}
arrival_window::clk::time_point _last_interpret;
arrival_window::clk::time_point _last_paused;
public:
failure_detector() {
_last_interpret = arrival_window::clk::now();
}
failure_detector(double phi) : _phi(phi) {
_last_interpret = arrival_window::clk::now();
}
future<> stop() {
@@ -188,7 +213,7 @@ inline future<> set_phi_convict_threshold(double phi) {
});
}
inline future<double> get_phi_convict_threshold() {
return smp::submit_to(0, [] {
return get_local_failure_detector().get_phi_convict_threshold();
});


@@ -62,7 +62,7 @@ namespace gms {
using clk = gossiper::clk;
logging::logger logger("gossip");
static logging::logger logger("gossip");
constexpr std::chrono::milliseconds gossiper::INTERVAL;
constexpr std::chrono::hours gossiper::A_VERY_LONG_TIME;
@@ -251,8 +251,8 @@ void gossiper::init_messaging_service_handler() {
logger.debug("Ignoring shutdown message from {} because gossip is disabled", from);
return make_ready_future<>();
}
return seastar::async([from, fd = get_local_failure_detector().shared_from_this()] {
fd->force_conviction(from);
return seastar::async([from] {
gms::get_local_gossiper().mark_as_shutdown(from);
});
}).handle_exception([] (auto ep) {
logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
@@ -423,6 +423,7 @@ void gossiper::remove_endpoint(inet_address endpoint) {
}
_live_endpoints.erase(endpoint);
_live_endpoints_just_added.remove(endpoint);
_unreachable_endpoints.erase(endpoint);
// do not remove endpointState until the quarantine expires
get_local_failure_detector().remove(endpoint);
@@ -663,11 +664,17 @@ void gossiper::convict(inet_address endpoint, double phi) {
return;
}
auto& state = it->second;
// FIXME: Add getGossipStatus
// logger.debug("Convicting {} with status {} - alive {}", endpoint, getGossipStatus(epState), state.is_alive());
if (!state.is_alive()) {
return;
}
logger.trace("convict ep={}, phi={}, is_alive={}, is_dead_state={}", endpoint, phi, state.is_alive(), is_dead_state(state));
if (state.is_alive() && !is_dead_state(state)) {
mark_dead(endpoint, state);
if (is_shutdown(endpoint)) {
mark_as_shutdown(endpoint);
} else {
state.mark_dead();
mark_dead(endpoint, state);
}
}
@@ -868,6 +875,12 @@ future<bool> gossiper::do_gossip_to_live_member(gossip_digest_syn message) {
return make_ready_future<bool>(false);
}
logger.trace("do_gossip_to_live_member: live_endpoint nr={}", _live_endpoints.size());
if (!_live_endpoints_just_added.empty()) {
auto ep = _live_endpoints_just_added.front();
_live_endpoints_just_added.pop_front();
logger.info("do_gossip_to_live_member: Favor newly added node {}", ep);
return send_gossip(message, std::set<inet_address>{ep});
}
return send_gossip(message, _live_endpoints);
}
@@ -937,7 +950,7 @@ clk::time_point gossiper::get_expire_time_for_endpoint(inet_address endpoint) {
}
}
std::experimental::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_address ep) {
std::experimental::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_address ep) const {
auto it = endpoint_state_map.find(ep);
if (it == endpoint_state_map.end()) {
return {};
@@ -950,6 +963,7 @@ void gossiper::reset_endpoint_state_map() {
endpoint_state_map.clear();
_unreachable_endpoints.clear();
_live_endpoints.clear();
_live_endpoints_just_added.clear();
}
std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1059,10 +1073,13 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
local_state.mark_alive();
local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME
_live_endpoints.insert(addr);
_live_endpoints_just_added.push_back(addr);
_unreachable_endpoints.erase(addr);
_expire_time_endpoint_map.erase(addr);
logger.debug("removing expire time for endpoint : {}", addr);
logger.info("inet_address {} is now UP", addr);
if (!_in_shadow_round) {
logger.info("InetAddress {} is now UP", addr);
}
_subscribers.for_each([addr, local_state] (auto& subscriber) {
subscriber->on_alive(addr, local_state);
@@ -1075,8 +1092,9 @@ void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
logger.trace("marking as down {}", addr);
local_state.mark_dead();
_live_endpoints.erase(addr);
_live_endpoints_just_added.remove(addr);
_unreachable_endpoints[addr] = now();
logger.info("inet_address {} is now DOWN", addr);
logger.info("InetAddress {} is now DOWN", addr);
_subscribers.for_each([addr, local_state] (auto& subscriber) {
subscriber->on_dead(addr, local_state);
logger.trace("Notified {}", subscriber.get());
@@ -1089,7 +1107,7 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
if (endpoint_state_map.count(ep) > 0) {
local_ep_state = endpoint_state_map.at(ep);
}
if (!is_dead_state(eps)) {
if (!is_dead_state(eps) && !_in_shadow_round) {
if (endpoint_state_map.count(ep)) {
logger.info("Node {} has restarted, now UP", ep);
} else {
@@ -1117,6 +1135,10 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
_subscribers.for_each([ep, ep_state] (auto& subscriber) {
subscriber->on_join(ep, ep_state);
});
// check this at the end so nodes will learn about the endpoint
if (is_shutdown(ep)) {
mark_as_shutdown(ep);
}
}
bool gossiper::is_dead_state(const endpoint_state& eps) const {
@@ -1136,6 +1158,47 @@ bool gossiper::is_dead_state(const endpoint_state& eps) const {
return false;
}
bool gossiper::is_shutdown(const inet_address& endpoint) const {
auto ep_state = get_endpoint_state_for_endpoint(endpoint);
if (!ep_state) {
return false;
}
auto app_state = ep_state->get_application_state(application_state::STATUS);
if (!app_state) {
return false;
}
auto value = app_state->value;
std::vector<sstring> pieces;
boost::split(pieces, value, boost::is_any_of(","));
assert(pieces.size() > 0);
sstring state = pieces[0];
return state == sstring(versioned_value::SHUTDOWN);
}
bool gossiper::is_silent_shutdown_state(const endpoint_state& ep_state) const {
auto app_state = ep_state.get_application_state(application_state::STATUS);
if (!app_state) {
return false;
}
auto value = app_state->value;
std::vector<sstring> pieces;
boost::split(pieces, value, boost::is_any_of(","));
assert(pieces.size() > 0);
sstring state = pieces[0];
for (auto& deadstate : SILENT_SHUTDOWN_STATES) {
if (state == deadstate) {
return true;
}
}
return false;
}
// Runs inside seastar::async context
void gossiper::apply_new_states(inet_address addr, endpoint_state& local_state, const endpoint_state& remote_state) {
// don't assert here, since if the node restarts the version will go back to zero
@@ -1255,17 +1318,15 @@ void gossiper::examine_gossiper(std::vector<gossip_digest>& g_digest_list,
}
}
future<> gossiper::start(int generation_number) {
return start(generation_number, std::map<application_state, versioned_value>());
future<> gossiper::start_gossiping(int generation_number) {
return start_gossiping(generation_number, std::map<application_state, versioned_value>());
}
future<> gossiper::start(int generation_nbr, std::map<application_state, versioned_value> preload_local_states) {
future<> gossiper::start_gossiping(int generation_nbr, std::map<application_state, versioned_value> preload_local_states) {
// Although the gossiper runs on cpu0 only, we need to listen for incoming gossip
// messages on all cpus and forward them to cpu0 for processing.
return _handlers.start().then([this] {
return _handlers.invoke_on_all([this] (handler& h) {
this->init_messaging_service_handler();
});
return get_gossiper().invoke_on_all([] (gossiper& g) {
g.init_messaging_service_handler();
}).then([this, generation_nbr, preload_local_states] {
build_seeds_list();
/* initialize the heartbeat state for this localEndpoint */
@@ -1382,46 +1443,60 @@ future<> gossiper::add_local_application_state(application_state state, versione
});
}
future<> gossiper::stop() {
logger.debug("gossip::stop on cpu {}", engine().cpu_id());
if (engine().cpu_id() != 0) {
return make_ready_future<>();
}
future<> gossiper::do_stop_gossiping() {
return seastar::async([this, g = this->shared_from_this()] {
_enabled = false;
_scheduled_gossip_task.cancel();
logger.info("Announcing shutdown");
sleep(INTERVAL * 2).get();
for (inet_address addr : _live_endpoints) {
shard_id id = get_shard_id(addr);
logger.trace("Sending a GossipShutdown to {}", id);
ms().send_gossip_shutdown(id, get_broadcast_address()).then_wrapped([id] (auto&&f) {
try {
f.get();
logger.trace("Got GossipShutdown Reply");
} catch (...) {
logger.warn("Fail to send GossipShutdown to {}: {}", id, std::current_exception());
}
return make_ready_future<>();
}).get();
}
_handlers.stop().then([this] () {
logger.debug("gossip::handler::stop on cpu {}", engine().cpu_id());
if (engine().cpu_id() == 0) {
get_local_failure_detector().unregister_failure_detection_event_listener(this);
auto my_ep_state = get_endpoint_state_for_endpoint(get_broadcast_address());
if (my_ep_state && !is_silent_shutdown_state(*my_ep_state)) {
logger.info("Announcing shutdown");
add_local_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true)).get();
for (inet_address addr : _live_endpoints) {
shard_id id = get_shard_id(addr);
logger.trace("Sending a GossipShutdown to {}", id);
ms().send_gossip_shutdown(id, get_broadcast_address()).then_wrapped([id] (auto&&f) {
try {
f.get();
logger.trace("Got GossipShutdown Reply");
} catch (...) {
logger.warn("Fail to send GossipShutdown to {}: {}", id, std::current_exception());
}
return make_ready_future<>();
}).get();
}
uninit_messaging_service_handler();
// FIXME: Integer.getInteger("cassandra.shutdown_announce_in_ms", 2000)
sleep(INTERVAL * 2).get();
} else {
logger.warn("No local state or state is in silent shutdown, not announcing shutdown");
}
_scheduled_gossip_task.cancel();
get_gossiper().invoke_on_all([] (gossiper& g) {
if (engine().cpu_id() == 0) {
get_local_failure_detector().unregister_failure_detection_event_listener(&g);
}
g.uninit_messaging_service_handler();
return make_ready_future<>();
}).get();
});
}
future<> gossiper::stop_gossiping() {
return get_gossiper().invoke_on(0, [] (gossiper& g) {
return g.do_stop_gossiping();
});
}
future<> gossiper::stop() {
return make_ready_future();
}
bool gossiper::is_enabled() {
return _enabled;
}
void gossiper::goto_shadow_round() {
_in_shadow_round = true;
}
void gossiper::finish_shadow_round() {
if (_in_shadow_round) {
_in_shadow_round = false;
@@ -1472,4 +1547,29 @@ bool gossiper::is_alive(inet_address ep) {
}
}
/**
* This method is used to mark a node as shutdown; that is, it gracefully exited on its own and told us about it
* @param endpoint endpoint that has shut itself down
*/
// Runs inside seastar::async context
void gossiper::mark_as_shutdown(const inet_address& endpoint) {
auto it = endpoint_state_map.find(endpoint);
if (it != endpoint_state_map.end()) {
auto& ep_state = it->second;
ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
mark_dead(endpoint, ep_state);
get_local_failure_detector().force_conviction(endpoint);
}
}
void gossiper::force_newer_generation() {
auto it = endpoint_state_map.find(get_broadcast_address());
if (it != endpoint_state_map.end()) {
auto& ep_state = it->second;
ep_state.get_heart_beat_state().force_newer_generation_unsafe();
}
}
} // namespace gms
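Editor's note: as the new is_shutdown() and is_silent_shutdown_state() above show, gossip encodes the STATUS application state as a comma-delimited string whose first piece is the status name; the shutdown(true) factory introduced in this series builds "shutdown,true". A minimal standalone sketch of that parse, using std::string in place of the tree's sstring and boost::split:

#include <cassert>
#include <string>

// Sketch of the STATUS-value parse performed by is_shutdown() above;
// plain std::string keeps the example self-contained.
static std::string status_piece(const std::string& versioned_status_value) {
    auto comma = versioned_status_value.find(',');
    return versioned_status_value.substr(0, comma); // whole string if no delimiter
}

int main() {
    assert(status_piece("shutdown,true") == "shutdown"); // as built by shutdown(true)
    assert(status_piece("NORMAL") == "NORMAL");          // single-piece status
}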


@@ -86,13 +86,6 @@ private:
net::messaging_service& ms() {
return net::get_local_messaging_service();
}
class handler {
public:
future<> stop() {
return make_ready_future<>();
}
};
distributed<handler> _handlers;
void init_messaging_service_handler();
void uninit_messaging_service_handler();
future<gossip_digest_ack> handle_syn_msg(gossip_digest_syn syn_msg);
@@ -121,8 +114,19 @@ public:
/* map where key is the endpoint and value is the state associated with the endpoint */
std::unordered_map<inet_address, endpoint_state> endpoint_state_map;
const std::vector<sstring> DEAD_STATES = { versioned_value::REMOVING_TOKEN, versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT, versioned_value::HIBERNATE };
const std::vector<sstring> DEAD_STATES = {
versioned_value::REMOVING_TOKEN,
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
versioned_value::HIBERNATE
};
const std::vector<sstring> SILENT_SHUTDOWN_STATES = {
versioned_value::REMOVING_TOKEN,
versioned_value::REMOVED_TOKEN,
versioned_value::STATUS_LEFT,
versioned_value::HIBERNATE,
versioned_value::STATUS_BOOTSTRAPPING,
};
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};
@@ -172,6 +176,7 @@ private:
/* live member set */
std::set<inet_address> _live_endpoints;
std::list<inet_address> _live_endpoints_just_added;
/* unreachable member set */
std::map<inet_address, clk::time_point> _unreachable_endpoints;
@@ -366,7 +371,7 @@ private:
public:
clk::time_point get_expire_time_for_endpoint(inet_address endpoint);
std::experimental::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep);
std::experimental::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
// removes ALL endpoint states; should only be called after shadow gossip
void reset_endpoint_state_map();
@@ -437,12 +442,12 @@ public:
std::map<inet_address, endpoint_state>& delta_ep_state_map);
public:
future<> start(int generation_number);
future<> start_gossiping(int generation_number);
/**
* Start the gossiper with the generation number, preloading the map of application states before starting
*/
future<> start(int generation_nbr, std::map<application_state, versioned_value> preload_local_states);
future<> start_gossiping(int generation_nbr, std::map<application_state, versioned_value> preload_local_states);
public:
/**
@@ -465,7 +470,11 @@ public:
future<> add_local_application_state(application_state state, versioned_value value);
// Needed by seastar::sharded
future<> stop();
future<> stop_gossiping();
private:
future<> do_stop_gossiping();
public:
bool is_enabled();
@@ -474,6 +483,8 @@ public:
bool is_in_shadow_round();
void goto_shadow_round();
public:
void add_expire_time_for_endpoint(inet_address endpoint, clk::time_point expire_time);
@@ -481,6 +492,11 @@ public:
public:
void dump_endpoint_state_map();
void debug_show();
public:
bool is_shutdown(const inet_address& endpoint) const;
bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
void mark_as_shutdown(const inet_address& endpoint);
void force_newer_generation();
};
extern distributed<gossiper> _the_gossiper;


@@ -42,6 +42,7 @@
#include "types.hh"
#include "utils/serialization.hh"
#include <ostream>
#include <limits>
namespace gms {
/**
@@ -82,6 +83,10 @@ public:
_generation += 1;
}
void force_highest_possible_version_unsafe() {
_version = std::numeric_limits<int32_t>::max();
}
friend inline std::ostream& operator<<(std::ostream& os, const heart_beat_state& h) {
return os << "{ generation = " << h._generation << ", version = " << h._version << " }";
}


@@ -49,6 +49,7 @@ constexpr const char* versioned_value::STATUS_MOVING;
constexpr const char* versioned_value::REMOVING_TOKEN;
constexpr const char* versioned_value::REMOVED_TOKEN;
constexpr const char* versioned_value::HIBERNATE;
constexpr const char* versioned_value::SHUTDOWN;
constexpr const char* versioned_value::REMOVAL_COORDINATOR;
void versioned_value::serialize(bytes::iterator& out) const {


@@ -83,6 +83,7 @@ public:
static constexpr const char* REMOVED_TOKEN = "removed";
static constexpr const char* HIBERNATE = "hibernate";
static constexpr const char* SHUTDOWN = "shutdown";
// values for ApplicationState.REMOVAL_COORDINATOR
static constexpr const char* REMOVAL_COORDINATOR = "REMOVER";
@@ -134,11 +135,17 @@ public:
class factory {
using token = dht::token;
public:
sstring make_token_string(const std::unordered_set<token>& tokens) {
sstring make_full_token_string(const std::unordered_set<token>& tokens) {
return ::join(";", tokens | boost::adaptors::transformed([] (const token& t) {
return dht::global_partitioner().to_sstring(t); })
);
}
sstring make_token_string(const std::unordered_set<token>& tokens) {
if (tokens.empty()) {
return "";
}
return dht::global_partitioner().to_sstring(*tokens.begin());
}
versioned_value clone_with_higher_version(const versioned_value& value) {
return versioned_value(value.value);
@@ -155,7 +162,7 @@ public:
}
versioned_value load(double load) {
return versioned_value(to_sstring_sprintf(load, "%g"));
return versioned_value(to_sstring(load));
}
versioned_value schema(const utils::UUID &new_version) {
@@ -184,7 +191,7 @@ public:
}
versioned_value tokens(const std::unordered_set<token>& tokens) {
return versioned_value(make_token_string(tokens));
return versioned_value(make_full_token_string(tokens));
}
versioned_value removing_nonlocal(const utils::UUID& host_id) {
@@ -206,6 +213,10 @@ public:
return versioned_value(sstring(HIBERNATE) + sstring(DELIMITER_STR) + (value ? "true" : "false"));
}
versioned_value shutdown(bool value) {
return versioned_value(sstring(SHUTDOWN) + sstring(DELIMITER_STR) + (value ? "true" : "false"));
}
versioned_value datacenter(const sstring& dc_id) {
return versioned_value(dc_id);
}
@@ -231,7 +242,7 @@ public:
}
versioned_value severity(double value) {
return versioned_value(to_sstring_sprintf(value, "%g"));
return versioned_value(to_sstring(value));
}
};


@@ -45,15 +45,15 @@ future<> init_storage_service(distributed<database>& db) {
});
}
future<> init_ms_fd_gossiper(sstring listen_address, uint16_t port, db::seed_provider_type seed_provider, sstring cluster_name) {
future<> init_ms_fd_gossiper(sstring listen_address, uint16_t port, db::seed_provider_type seed_provider, sstring cluster_name, double phi) {
const gms::inet_address listen(listen_address);
// Init messaging_service
return net::get_messaging_service().start(listen, std::move(port)).then([]{
return net::get_messaging_service().start(listen, std::move(port)).then([] {
// #293 - do not stop anything
//engine().at_exit([] { return net::get_messaging_service().stop(); });
}).then([] {
}).then([phi] {
// Init failure_detector
return gms::get_failure_detector().start().then([] {
return gms::get_failure_detector().start(std::move(phi)).then([] {
// #293 - do not stop anything
//engine().at_exit([]{ return gms::get_failure_detector().stop(); });
});


@@ -27,4 +27,4 @@
#include "database.hh"
future<> init_storage_service(distributed<database>& db);
future<> init_ms_fd_gossiper(sstring listen_address, uint16_t storage_port, db::seed_provider_type seed_provider, sstring cluster_name = "Test Cluster");
future<> init_ms_fd_gossiper(sstring listen_address, uint16_t storage_port, db::seed_provider_type seed_provider, sstring cluster_name = "Test Cluster", double phi = 8);


@@ -61,8 +61,7 @@ future<> ec2_multi_region_snitch::start() {
// value to a public address in cassandra.yaml.
//
utils::fb_utilities::set_broadcast_address(local_public_address);
//DatabaseDescriptor.setBroadcastRpcAddress(local_public_address);
//
utils::fb_utilities::set_broadcast_rpc_address(local_public_address);
return aws_api_call(AWS_QUERY_SERVER_ADDR, PRIVATE_IP_QUERY_REQ).then(
[this] (sstring priv_addr) {


@@ -461,10 +461,23 @@ void token_metadata::calculate_pending_ranges(abstract_replication_strategy& str
_pending_ranges[keyspace_name] = std::move(new_pending_ranges);
if (logger.is_enabled(logging::log_level::debug)) {
// TODO: Enable printPendingRanges
// logger.debug("Pending ranges: {}", (_pending_ranges.empty() ? "<empty>" : printPendingRanges()));
logger.debug("Pending ranges: {}", (_pending_ranges.empty() ? "<empty>" : print_pending_ranges()));
}
}
sstring token_metadata::print_pending_ranges() {
std::stringstream ss;
for (auto& x : _pending_ranges) {
auto& keyspace_name = x.first;
ss << "\nkeyspace_name = " << keyspace_name << " {\n";
for (auto& m : x.second) {
ss << m.second << " : " << m.first << "\n";
}
ss << "}\n";
}
return sstring(ss.str());
}
void token_metadata::add_leaving_endpoint(inet_address endpoint) {
_leaving_endpoints.emplace(endpoint);


@@ -865,23 +865,9 @@ public:
return sb.toString();
}
private String printPendingRanges()
{
StringBuilder sb = new StringBuilder();
for (Map.Entry<String, Multimap<Range<Token>, InetAddress>> entry : _pending_ranges.entrySet())
{
for (Map.Entry<Range<Token>, InetAddress> rmap : entry.getValue().entries())
{
sb.append(rmap.getValue()).append(":").append(rmap.getKey());
sb.append(System.getProperty("line.separator"));
}
}
return sb.toString();
}
#endif
sstring print_pending_ranges();
public:
std::vector<gms::inet_address> pending_endpoints_for(const token& token, const sstring& keyspace_name);
#if 0
/**

main.cc

@@ -200,12 +200,14 @@ int main(int ac, char** av) {
uint16_t storage_port = cfg->storage_port();
ctx.api_dir = cfg->api_ui_dir();
ctx.api_doc = cfg->api_doc_dir();
double phi = cfg->phi_convict_threshold();
sstring cluster_name = cfg->cluster_name();
sstring listen_address = cfg->listen_address();
sstring rpc_address = cfg->rpc_address();
sstring api_address = cfg->api_address() != "" ? cfg->api_address() : rpc_address;
auto seed_provider = cfg->seed_provider();
sstring broadcast_address = cfg->broadcast_address();
sstring broadcast_rpc_address = cfg->broadcast_rpc_address();
if (!broadcast_address.empty()) {
utils::fb_utilities::set_broadcast_address(broadcast_address);
@@ -216,6 +218,16 @@ int main(int ac, char** av) {
throw bad_configuration_error();
}
if (!broadcast_rpc_address.empty()) {
utils::fb_utilities::set_broadcast_rpc_address(broadcast_rpc_address);
} else {
if (rpc_address == "0.0.0.0") {
startlog.error("If rpc_address is set to a wildcard address {}, then you must set broadcast_rpc_address to a value other than {}", rpc_address, rpc_address);
throw bad_configuration_error();
}
utils::fb_utilities::set_broadcast_rpc_address(rpc_address);
}
using namespace locator;
return i_endpoint_snitch::create_snitch(cfg->endpoint_snitch()).then([] {
// #293 - do not stop anything
@@ -236,8 +248,8 @@ int main(int ac, char** av) {
});
});
});
}).then([listen_address, storage_port, seed_provider, cluster_name] {
return init_ms_fd_gossiper(listen_address, storage_port, seed_provider, cluster_name);
}).then([listen_address, storage_port, seed_provider, cluster_name, phi] {
return init_ms_fd_gossiper(listen_address, storage_port, seed_provider, cluster_name, phi);
}).then([&db] {
return streaming::stream_session::init_streaming_service(db);
}).then([&proxy, &db] {


@@ -201,6 +201,16 @@ memtable::update(const db::replay_position& rp) {
}
}
future<>
memtable::apply(const memtable& mt) {
return do_with(mt.make_reader(), [this] (auto&& rd) mutable {
return consume(rd, [self = this->shared_from_this(), &rd] (mutation&& m) {
self->apply(m);
return stop_iteration::no;
});
});
}
void
memtable::apply(const mutation& m, const db::replay_position& rp) {
with_allocator(_region.allocator(), [this, &m] {


@@ -107,6 +107,7 @@ public:
explicit memtable(schema_ptr schema, logalloc::region_group* dirty_memory_region_group = nullptr);
~memtable();
schema_ptr schema() const { return _schema; }
future<> apply(const memtable&);
void apply(const mutation& m, const db::replay_position& = db::replay_position());
void apply(const frozen_mutation& m, const db::replay_position& = db::replay_position());
const logalloc::region& region() const {


@@ -36,6 +36,8 @@
namespace net {
static logging::logger logger("messaging_service");
using inet_address = gms::inet_address;
using gossip_digest_syn = gms::gossip_digest_syn;
using gossip_digest_ack = gms::gossip_digest_ack;
@@ -108,17 +110,11 @@ query::read_command net::serializer::read(Input& in, rpc::type<query::read_comma
template <typename Output>
void net::serializer::write(Output& out, const query::result& v) const {
// FIXME: allow const call to query::result::serialize()
uint32_t sz = v.serialized_size();
write(out, sz);
bytes b(bytes::initialized_later(), sz);
auto _out = b.begin();
const_cast<query::result&>(v).serialize(_out);
out.write(reinterpret_cast<const char*>(b.c_str()), sz);
write_serializable(out, v);
}
template <typename Input>
query::result net::serializer::read(Input& in, rpc::type<query::result>) const {
return read_gms<query::result>(in);
return read_serializable<query::result>(in);
}
template <typename Output>
@@ -351,7 +347,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
if (!c->error()) {
return c;
}
remove_rpc_client(verb, id);
remove_error_rpc_client(verb, id);
}
auto remote_addr = ipv4_addr(get_preferred_ip(id.addr).raw_addr(), _port);
@@ -361,9 +357,9 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
return it->second.rpc_client;
}
void messaging_service::remove_rpc_client_one(clients_map& clients, shard_id id) {
void messaging_service::remove_rpc_client_one(clients_map& clients, shard_id id, bool dead_only) {
auto it = clients.find(id);
if (it != clients.end()) {
if (it != clients.end() && (!dead_only || it->second.rpc_client->error())) {
auto client = std::move(it->second.rpc_client);
clients.erase(it);
//
@@ -372,17 +368,19 @@ void messaging_service::remove_rpc_client_one(clients_map& clients, shard_id id)
// This will make sure messaging_service::stop() blocks until
// client->stop() is over.
//
client->stop().finally([c = client, ms = shared_from_this()] {}).discard_result();
client->stop().finally([id, client, ms = shared_from_this()] {
logger.debug("dropped connection to {}", id.addr);
}).discard_result();
}
}
void messaging_service::remove_rpc_client(messaging_verb verb, shard_id id) {
remove_rpc_client_one(_clients[get_rpc_client_idx(verb)], id);
void messaging_service::remove_error_rpc_client(messaging_verb verb, shard_id id) {
remove_rpc_client_one(_clients[get_rpc_client_idx(verb)], id, true);
}
void messaging_service::remove_rpc_client(shard_id id) {
for (auto& c : _clients) {
remove_rpc_client_one(c, id);
remove_rpc_client_one(c, id, false);
}
}
@@ -406,7 +404,7 @@ auto send_message(messaging_service* ms, messaging_verb verb, shard_id id, MsgOu
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_rpc_client(verb, id);
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
@@ -431,7 +429,7 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, shard_id i
return std::move(f);
} catch (rpc::closed_error) {
// This is a transport error
ms->remove_rpc_client(verb, id);
ms->remove_error_rpc_client(verb, id);
throw;
} catch (...) {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.


@@ -550,8 +550,8 @@ public:
public:
// Return rpc::protocol::client for a shard which is a ip + cpuid pair.
shared_ptr<rpc_protocol_client_wrapper> get_rpc_client(messaging_verb verb, shard_id id);
void remove_rpc_client_one(clients_map& clients, shard_id id);
void remove_rpc_client(messaging_verb verb, shard_id id);
void remove_rpc_client_one(clients_map& clients, shard_id id, bool dead_only);
void remove_error_rpc_client(messaging_verb verb, shard_id id);
void remove_rpc_client(shard_id id);
std::unique_ptr<rpc_protocol_wrapper>& rpc();
};


@@ -73,6 +73,7 @@ public:
std::experimental::optional<atomic_cell_or_collection> get_cell(const clustering_key& rkey, const column_definition& def) const;
const partition_key& key() const { return _ptr->_dk._key; };
const dht::decorated_key& decorated_key() const { return _ptr->_dk; };
dht::ring_position ring_position() const { return { decorated_key() }; }
const dht::token& token() const { return _ptr->_dk._token; }
const schema_ptr& schema() const { return _ptr->_schema; }
const mutation_partition& partition() const { return _ptr->_p; }


@@ -21,6 +21,7 @@
*/
#include "mutation_partition_serializer.hh"
#include "mutation_partition.hh"
#include "db/serializer.hh"
//


@@ -25,6 +25,7 @@
#include "db/serializer.hh"
#include "utils/data_input.hh"
#include "mutation_partition_serializer.hh"
#include "mutation_partition.hh"
//
// See mutation_partition_serializer.cc for representation layout.

noexcept_traits.hh Normal file

@@ -0,0 +1,78 @@
/*
* Copyright 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <type_traits>
#include <memory>
#include <seastar/core/future.hh>
#pragma once
//
// Utility for adapting types which are not nothrow move constructible into such
// by wrapping them if necessary.
//
// Example usage:
//
// T val{};
// using traits = noexcept_movable<T>;
// auto f = make_ready_future<typename traits::type>(traits::wrap(std::move(val)));
// T val2 = traits::unwrap(f.get0());
//
template<typename T, typename Enable = void>
struct noexcept_movable;
template<typename T>
struct noexcept_movable<T, std::enable_if_t<std::is_nothrow_move_constructible<T>::value>> {
using type = T;
static type wrap(T&& v) {
return std::move(v);
}
static future<T> wrap(future<T>&& v) {
return std::move(v);
}
static T unwrap(type&& v) {
return std::move(v);
}
static future<T> unwrap(future<type>&& v) {
return std::move(v);
}
};
template<typename T>
struct noexcept_movable<T, std::enable_if_t<!std::is_nothrow_move_constructible<T>::value>> {
using type = std::unique_ptr<T>;
static type wrap(T&& v) {
return std::make_unique<T>(std::move(v));
}
static T unwrap(type&& v) {
return std::move(*v);
}
};
template<typename T>
using noexcept_movable_t = typename noexcept_movable<T>::type;
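Editor's note: the usage comment at the top of this header can be fleshed out into a compilable sketch. Appended after the header above, the following should compile as-is; the throwing_move type is invented for illustration. The point is that noexcept_movable_t<T> is T itself when T is already nothrow move constructible, and std::unique_ptr<T> otherwise.

#include <string>
#include <type_traits>

// Hypothetical type whose move constructor is not noexcept.
struct throwing_move {
    std::string s;
    explicit throwing_move(std::string v) : s(std::move(v)) {}
    throwing_move(throwing_move&& o) : s(std::move(o.s)) {} // deliberately not noexcept
};

static_assert(std::is_same<noexcept_movable_t<int>, int>::value,
              "nothrow-movable types pass through unwrapped");
static_assert(std::is_same<noexcept_movable_t<throwing_move>,
                           std::unique_ptr<throwing_move>>::value,
              "throwing types are wrapped in unique_ptr");

int main() {
    using traits = noexcept_movable<throwing_move>;
    auto wrapped = traits::wrap(throwing_move("x"));         // unique_ptr<throwing_move>
    throwing_move back = traits::unwrap(std::move(wrapped)); // round-trip
    return back.s == "x" ? 0 : 1;
}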


@@ -25,6 +25,7 @@
#include <cryptopp/md5.h>
#include "bytes_ostream.hh"
#include "query-request.hh"
#include "db/serializer.hh"
namespace query {
@@ -139,17 +140,16 @@ public:
return result_digest(std::move(b));
}
sstring pretty_print(schema_ptr, const query::partition_slice&) const;
size_t serialized_size() const { return _w.size(); }
void serialize(bytes::iterator& out) {
auto v = _w.linearize();
out = std::copy(v.begin(), v.end(), out);
}
static result deserialize(bytes_view& in) {
bytes_ostream w;
w.write(in);
in.remove_prefix(in.size());
return result(std::move(w));
}
};
}
namespace db {
template<> serializer<query::result>::serializer(const query::result&);
template<> void serializer<query::result>::write(output&, const query::result&);
template<> query::result serializer<query::result>::read(input&);
extern template class serializer<query::result>;
}


@@ -19,6 +19,7 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <limits>
#include "db/serializer.hh"
#include "query-request.hh"
#include "query-result.hh"
@@ -270,3 +271,34 @@ result::pretty_print(schema_ptr s, const query::partition_slice& slice) const {
}
}
template class db::serializer<query::result>;
using query_result_size_type = uint32_t;
template<>
db::serializer<query::result>::serializer(const query::result& v)
: _item(v)
, _size(sizeof(query_result_size_type) + v.buf().size())
{
static_assert(std::numeric_limits<bytes_ostream::size_type>::max() <=
std::numeric_limits<query_result_size_type>::max(), "query_result_size_type too small");
}
template<>
void
db::serializer<query::result>::write(output& out, const query::result& v) {
const bytes_ostream& buf = v.buf();
out.write<query_result_size_type>(buf.size());
for (bytes_view frag : buf.fragments()) {
out.write(frag.begin(), frag.end());
}
}
template<>
query::result db::serializer<query::result>::read(input& in) {
bytes_ostream buf;
auto size = in.read<query_result_size_type>();
buf.write(in.read_view(size));
return query::result(std::move(buf));
}
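Editor's note: the specialization above is an instance of a simple framing pattern: write a 32-bit size prefix, then the buffer's fragments back to back, and on read reassemble them into one contiguous buffer. Here is a self-contained illustration of the same pattern over standard types only; the tree's input/output streams and bytes_ostream are not reproduced, so every name below is hypothetical (and the literal memcpy ignores the endianness handling a real stream would do).

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Length-prefixed fragment framing, as db::serializer<query::result> does
// above, but over std::string instead of the tree's stream types.
static void write_framed(std::string& out, const std::vector<std::string>& fragments) {
    uint32_t size = 0;
    for (const auto& f : fragments) {
        size += f.size();
    }
    out.append(reinterpret_cast<const char*>(&size), sizeof(size)); // size prefix
    for (const auto& f : fragments) {
        out.append(f); // fragments written back to back
    }
}

static std::string read_framed(const std::string& in, size_t& pos) {
    uint32_t size;
    std::memcpy(&size, in.data() + pos, sizeof(size));
    pos += sizeof(size);
    std::string payload = in.substr(pos, size); // reassembled contiguously
    pos += size;
    return payload;
}

int main() {
    std::string buf;
    write_framed(buf, {"abc", "def"});
    size_t pos = 0;
    return read_framed(buf, pos) == "abcdef" ? 0 : 1;
}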


@@ -164,7 +164,7 @@ public:
{ }
virtual future<mutation_opt> operator()() override {
return _delegate().then([this] (mutation_opt&& mo) {
return _delegate().then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) {
if (mo) {
_cache.populate(*mo);
}
@@ -250,11 +250,13 @@ class scanning_and_populating_reader final : public mutation_reader::impl {
mutation_opt _next_primary;
mutation_source& _underlying;
mutation_reader _secondary;
utils::phased_barrier::phase_type _secondary_phase;
const query::partition_range& _original_range;
query::partition_range _range;
key_source& _underlying_keys;
key_reader _keys;
dht::decorated_key_opt _next_key;
dht::decorated_key_opt _last_secondary_key;
public:
scanning_and_populating_reader(row_cache& cache, const query::partition_range& range)
: _cache(cache), _schema(cache._schema),
@@ -293,6 +295,8 @@ public:
end = _original_range.end();
}
_range = query::partition_range(query::partition_range::bound { std::move(*dk), true }, std::move(end));
_last_secondary_key = {};
_secondary_phase = _cache._populate_phaser.phase();
_secondary = _underlying(_range);
_secondary_only = true;
return next_secondary();
@@ -301,7 +305,14 @@ public:
}
private:
future<mutation_opt> next_secondary() {
return _secondary().then([this] (mutation_opt&& mo) {
if (_secondary_phase != _cache._populate_phaser.phase()) {
assert(_last_secondary_key);
auto cmp = dht::ring_position_comparator(*_schema);
_range = _range.split_after(*_last_secondary_key, cmp);
_secondary_phase = _cache._populate_phaser.phase();
_secondary = _underlying(_range);
}
return _secondary().then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) {
if (!mo && _next_primary) {
auto cmp = dht::ring_position_comparator(*_schema);
_range = _original_range.split_after(_next_primary->decorated_key(), cmp);
@@ -312,6 +323,7 @@ private:
}
if (mo) {
_cache.populate(*mo);
_last_secondary_key = mo->decorated_key();
}
_cache.on_miss();
return std::move(mo);
@@ -397,6 +409,7 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec
m.partitions.clear_and_dispose(current_deleter<partition_entry>());
});
});
_populate_phaser.advance_and_await().get();
while (!m.partitions.empty()) {
with_allocator(_tracker.allocator(), [this, &m, &presence_checker] () {
unsigned quota = 30;


@@ -31,6 +31,7 @@
#include "mutation_partition.hh"
#include "utils/logalloc.hh"
#include "key_reader.hh"
#include "utils/phased_barrier.hh"
namespace scollectd {
@@ -174,6 +175,17 @@ private:
partitions_type _partitions; // Cached partitions are complete.
mutation_source _underlying;
key_source _underlying_keys;
// Synchronizes populating reads with update() to ensure that the cache
// remains consistent with the underlying data source across flushes.
// Readers obtained from the underlying data source in phases earlier than
// the current one must not be used to populate the cache, unless they hold
// a phaser::operation created in the reader's phase of origin. Readers
// should hold on to a phase only briefly, because doing so inhibits progress
// of update(). Phase changes occur only in update(), which can be assumed to
// be asynchronous with respect to invocations of the underlying data source.
utils::phased_barrier _populate_phaser;
logalloc::allocating_section _update_section;
logalloc::allocating_section _populate_section;
logalloc::allocating_section _read_section;
@@ -188,7 +200,7 @@ public:
row_cache(const row_cache&) = delete;
row_cache& operator=(row_cache&&) = default;
public:
mutation_reader make_reader(const query::partition_range&);
mutation_reader make_reader(const query::partition_range& = query::full_partition_range);
const stats& stats() const { return _stats; }
public:
// Populate cache from given mutation. The mutation must contain all


@@ -44,7 +44,7 @@ def cpus():
return int(gdb.parse_and_eval('smp::count'))
def find_db(shard):
return gdb.parse_and_eval('debug::db')['_instances']['_M_impl']['_M_start'][shard]
return gdb.parse_and_eval('debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']
def find_dbs():
return [find_db(shard) for shard in range(cpus())]
@@ -102,9 +102,15 @@ class scylla_memory(gdb.Command):
gdb.Command.__init__(self, 'scylla memory', gdb.COMMAND_USER, gdb.COMPLETE_COMMAND)
def invoke(self, arg, from_tty):
cpu_mem = gdb.parse_and_eval('memory::cpu_mem')
page_size = int(gdb.parse_and_eval('memory::page_size'))
free_mem = int(cpu_mem['nr_free_pages']) * page_size
total_mem = int(cpu_mem['nr_pages']) * page_size
gdb.write('Used memory: {used_mem:>13}\nFree memory: {free_mem:>13}\nTotal memory: {total_mem:>12}\n\n'
.format(used_mem=total_mem-free_mem, free_mem=free_mem, total_mem=total_mem))
gdb.write('Small pools:\n')
small_pools = cpu_mem['small_pools']
nr = small_pools['nr_small_pools']
page_size = int(gdb.parse_and_eval('memory::page_size'))
gdb.write('{objsize:>5} {span_size:>6} {use_count:>10} {memory:>12} {wasted_percent:>5}\n'
.format(objsize='objsz', span_size='spansz', use_count='usedobj', memory='memory', wasted_percent='wst%'))
for i in range(int(nr)):
@@ -133,9 +139,46 @@ class scylla_memory(gdb.Command):
front = int(span['link']['_next'])
gdb.write('{index:5} {size:13} {total}\n'.format(index=index, size=(1<<index)*page_size, total=total*page_size))
class scylla_lsa(gdb.Command):
def __init__(self):
gdb.Command.__init__(self, 'scylla lsa', gdb.COMMAND_USER, gdb.COMPLETE_COMMAND)
def invoke(self, arg, from_tty):
lsa = gdb.parse_and_eval('logalloc::shard_segment_pool')
segment_size = int(gdb.parse_and_eval('logalloc::segment::size'))
lsa_mem = int(lsa['_segments_in_use']) * segment_size
non_lsa_mem = int(lsa['_non_lsa_memory_in_use'])
total_mem = lsa_mem + non_lsa_mem
gdb.write('Log Structured Allocator\n\nLSA memory in use: {lsa_mem:>16}\n'
'Non-LSA memory in use: {non_lsa_mem:>12}\nTotal memory in use: {total_mem:>14}\n\n'
.format(lsa_mem=lsa_mem, non_lsa_mem = non_lsa_mem, total_mem = total_mem))
er_goal = int(lsa['_current_emergency_reserve_goal'])
er_max = int(lsa['_emergency_reserve_max'])
er_current = int(lsa['_emergency_reserve']['_size'])
gdb.write('Emergency reserve goal: {er_goal:>11}\nEmergency reserve max: {er_max:>12}\n'
'Emergency reserve current: {er_current:>8}\n\n'
.format(er_goal=er_goal, er_max=er_max, er_current=er_current))
lsa_tracker = gdb.parse_and_eval('logalloc::tracker_instance._impl')['_M_t']['_M_head_impl']
regions = lsa_tracker['_regions']
region = regions['_M_impl']['_M_start']
gdb.write('LSA regions:\n')
while region != regions['_M_impl']['_M_finish']:
gdb.write(' Region #{r_id}\n - reclaimable: {r_en:>14}\n'
' - evictable: {r_ev:16}\n - non-LSA memory: {r_non_lsa:>11}\n'
' - closed LSA memory: {r_lsa:>8}\n - unused memory: {r_unused:>12}\n'
.format(r_id=int(region['_id']), r_en=bool(region['_reclaiming_enabled']),
r_ev=bool(region['_evictable']),
r_non_lsa=int(region['_non_lsa_occupancy']['_total_space']),
r_lsa=int(region['_closed_occupancy']['_total_space']),
r_unused=int(region['_closed_occupancy']['_free_space'])))
region = region + 1
scylla()
scylla_databases()
scylla_keyspaces()
scylla_column_families()
scylla_memory()
scylla_memory()
scylla_lsa()

Submodule seastar updated: 95ddb8e243...8a76d06797


@@ -88,6 +88,29 @@ future<> migration_manager::schedule_schema_pull(const gms::inet_address& endpoi
return make_ready_future<>();
}
bool migration_manager::is_ready_for_bootstrap() {
auto our_version = get_local_storage_proxy().get_db().local().get_version();
bool match = false;
for (auto& x : gms::get_local_gossiper().endpoint_state_map) {
auto& endpoint = x.first;
auto& eps = x.second;
if (endpoint == utils::fb_utilities::get_broadcast_address() || !eps.is_alive()) {
continue;
}
auto schema = eps.get_application_state(gms::application_state::SCHEMA);
if (!schema) {
return false;
}
utils::UUID remote_version{schema->value};
if (our_version != remote_version) {
return false;
} else {
match = true;
}
}
return match;
}
/**
* If versions differ this node sends request with local migration list to the endpoint
* and expecting to receive a list of migrations to apply locally.


@@ -113,6 +113,8 @@ public:
static future<> passive_announce(utils::UUID version);
future<> stop();
bool is_ready_for_bootstrap();
};
extern distributed<migration_manager> _the_migration_manager;


@@ -68,11 +68,16 @@ future<> pending_range_calculator_service::stop() {
future<> pending_range_calculator_service::update() {
return smp::submit_to(0, [] {
get_local_pending_range_calculator_service()._update_jobs++;
get_local_pending_range_calculator_service().run();
get_local_pending_range_calculator_service().do_update();
});
}
void pending_range_calculator_service::do_update() {
assert(engine().cpu_id() == 0);
get_local_pending_range_calculator_service()._update_jobs++;
get_local_pending_range_calculator_service().run();
}
future<> pending_range_calculator_service::block_until_finished() {
// We want to be sure the job we're blocking for is actually finished and we can't trust the TPE's active job count
return smp::submit_to(0, [] {


@@ -51,6 +51,7 @@ private:
void run();
public:
pending_range_calculator_service(distributed<database>& db) : _db(db) {}
void do_update();
future<> update();
future<> block_until_finished();
future<> stop();


@@ -343,6 +343,19 @@ storage_proxy::storage_proxy(distributed<database>& db) : _db(db) {
storage_proxy::rh_entry::rh_entry(std::unique_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb) : handler(std::move(h)), expire_timer(std::move(cb)) {}
storage_proxy::unique_response_handler::unique_response_handler(storage_proxy& p_, response_id_type id_) : id(id_), p(p_) {}
storage_proxy::unique_response_handler::unique_response_handler(unique_response_handler&& x) : id(x.id), p(x.p) { x.id = 0; };
storage_proxy::unique_response_handler::~unique_response_handler() {
if (id) {
p.remove_response_handler(id);
}
}
storage_proxy::response_id_type storage_proxy::unique_response_handler::release() {
auto r = id;
id = 0;
return r;
}
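Editor's note: unique_response_handler, defined just above, is a small RAII guard. Its destructor unregisters the response handler unless release() transferred ownership, which is what lets the rewritten mutate_prepare() and batchlog path below drop their manual try/catch cleanup. A generic sketch of the same release()-able guard idiom, with a hypothetical removal callback standing in for storage_proxy::remove_response_handler:

#include <functional>
#include <utility>

// Generic form of the guard pattern unique_response_handler implements above.
class scoped_registration {
    std::function<void(int)> _remove; // hypothetical stand-in for the proxy's cleanup
    int _id;                          // 0 means "owns nothing", as in the diff
public:
    scoped_registration(std::function<void(int)> remove, int id)
        : _remove(std::move(remove)), _id(id) {}
    scoped_registration(scoped_registration&& o)
        : _remove(std::move(o._remove)), _id(std::exchange(o._id, 0)) {}
    ~scoped_registration() {
        if (_id) {
            _remove(_id); // clean up unless ownership was released
        }
    }
    int release() { // hand the id to the send path, as mutate_begin() does
        return std::exchange(_id, 0);
    }
};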
#if 0
static
{
@@ -767,6 +780,12 @@ storage_proxy::create_write_response_handler(const mutation& m, db::consistency_
std::vector<gms::inet_address> pending_endpoints =
get_local_storage_service().get_token_metadata().pending_endpoints_for(m.token(), keyspace_name);
// filter out natural_endpoints from pending_endpoints if the latter is not yet updated during node join
auto itend = boost::range::remove_if(pending_endpoints, [&natural_endpoints] (gms::inet_address& p) {
return boost::range::find(natural_endpoints, p) != natural_endpoints.end();
});
pending_endpoints.erase(itend, pending_endpoints.end());
auto all = boost::range::join(natural_endpoints, pending_endpoints);
if (std::find_if(all.begin(), all.end(), std::bind1st(std::mem_fn(&storage_proxy::cannot_hint), this)) != all.end()) {
@@ -804,29 +823,27 @@ storage_proxy::hint_to_dead_endpoints(response_id_type id, db::consistency_level
}
template<typename Range, typename CreateWriteHandler>
future<std::vector<storage_proxy::response_id_type>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
std::vector<response_id_type> ids;
try {
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
// apply is used to convert exceptions into an exceptional future
return futurize<std::vector<storage_proxy::unique_response_handler>>::apply([this] (const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
std::vector<unique_response_handler> ids;
ids.reserve(mutations.size());
for (auto& m : mutations) {
ids.emplace_back(create_handler(m, cl, type));
ids.emplace_back(*this, create_handler(m, cl, type));
}
return make_ready_future<std::vector<response_id_type>>(std::move(ids));
} catch(...) {
boost::for_each(ids, std::bind(&storage_proxy::remove_response_handler, this, std::placeholders::_1));
return make_exception_future<std::vector<response_id_type>>(std::current_exception());
}
return make_ready_future<std::vector<unique_response_handler>>(std::move(ids));
}, mutations, cl, type, std::move(create_handler));
}
future<std::vector<storage_proxy::response_id_type>> storage_proxy::mutate_prepare(std::vector<mutation>& mutations, db::consistency_level cl, db::write_type type) {
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(std::vector<mutation>& mutations, db::consistency_level cl, db::write_type type) {
return mutate_prepare<>(mutations, cl, type, [this] (const mutation& m, db::consistency_level cl, db::write_type type) {
return create_write_response_handler(m, cl, type);
});
}
future<> storage_proxy::mutate_begin(std::vector<storage_proxy::response_id_type> ids, db::consistency_level cl) {
return parallel_for_each(ids, [this, cl] (storage_proxy::response_id_type response_id) {
future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl) {
return parallel_for_each(ids, [this, cl] (unique_response_handler& protected_response) {
auto response_id = protected_response.id;
// it is better to send first and hint afterwards to reduce latency
// but request may complete before hint_to_dead_endpoints() is called and
// response_id handler will be removed, so we will have to do hint with separate
@@ -835,7 +852,7 @@ future<> storage_proxy::mutate_begin(std::vector<storage_proxy::response_id_type
// call before send_to_live_endpoints() for the same reason as above
auto f = response_wait(response_id);
send_to_live_endpoints(response_id);
send_to_live_endpoints(protected_response.release()); // response is now running and it will either complete or timeout
return std::move(f);
});
}
@@ -846,7 +863,7 @@ future<> storage_proxy::mutate_end(future<> mutate_result, utils::latency_counte
assert(mutate_result.available());
_stats.write.mark(lc.stop().latency_in_nano());
if (lc.is_start()) {
_stats.estimated_write.add(lc.latency_in_nano(), _stats.write.count);
_stats.estimated_write.add(lc.latency(), _stats.write.count);
}
try {
mutate_result.get();
@@ -885,7 +902,7 @@ storage_proxy::mutate(std::vector<mutation> mutations, db::consistency_level cl)
utils::latency_counter lc;
lc.start();
return mutate_prepare(mutations, cl, type).then([this, cl] (std::vector<storage_proxy::response_id_type> ids) {
return mutate_prepare(mutations, cl, type).then([this, cl] (std::vector<storage_proxy::unique_response_handler> ids) {
return mutate_begin(std::move(ids), cl);
}).then_wrapped([p = shared_from_this(), lc] (future<> f) {
return p->mutate_end(std::move(f), lc);
@@ -959,7 +976,7 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
return _p.mutate_prepare<>(std::array<mutation, 1>{std::move(m)}, cl, db::write_type::BATCH_LOG, [this] (const mutation& m, db::consistency_level cl, db::write_type type) {
auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name());
return _p.create_write_response_handler(ks, cl, type, freeze(m), _batchlog_endpoints, {}, {});
}).then([this, cl] (std::vector<response_id_type> ids) {
}).then([this, cl] (std::vector<unique_response_handler> ids) {
return _p.mutate_begin(std::move(ids), cl);
});
}
@@ -979,16 +996,9 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
};
future<> run() {
return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH).then([this] (std::vector<response_id_type> ids) {
return sync_write_to_batchlog().then_wrapped([this, ids = std::move(ids)] (future<> f) {
try {
f.get();
return _p.mutate_begin(std::move(ids), _cl);
} catch(...) {
// writing the batchlog failed, remove response handlers that will no longer be used
boost::for_each(ids, std::bind(&storage_proxy::remove_response_handler, &_p, std::placeholders::_1));
throw;
}
return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH).then([this] (std::vector<unique_response_handler> ids) {
return sync_write_to_batchlog().then([this, ids = std::move(ids)] () mutable {
return _p.mutate_begin(std::move(ids), _cl);
}).then(std::bind(&context::async_remove_from_batchlog, this));
});
}
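// Editorial note: the removed try/catch is no longer needed. The ids vector now
// holds unique_response_handler RAII objects, so if sync_write_to_batchlog()
// fails, the continuation owning ids is destroyed and every pending response
// handler is unregistered automatically by its destructor.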
@@ -1295,7 +1305,7 @@ future<> storage_proxy::schedule_repair(std::unordered_map<gms::inet_address, st
return mutate_prepare<>(std::move(i.second), db::consistency_level::ONE, type, [ep = i.first, this] (const mutation& m, db::consistency_level cl, db::write_type type) {
auto& ks = _db.local().find_keyspace(m.schema()->ks_name());
return create_write_response_handler(ks, cl, type, freeze(m), std::unordered_set<gms::inet_address>({ep}, 1), {}, {});
}).then([this] (std::vector<response_id_type> ids) {
}).then([this] (std::vector<unique_response_handler> ids) {
return mutate_begin(std::move(ids), db::consistency_level::ONE);
}).then_wrapped([this, lc] (future<> f) {
return mutate_end(std::move(f), lc);
@@ -1523,7 +1533,7 @@ public:
// reconcile all versions
boost::range::transform(boost::make_iterator_range(versions.begin(), versions.end()), std::back_inserter(reconciled_partitions), [this, schema] (std::vector<version>& v) {
return boost::accumulate(v, mutation(v.front().par.mut().key(*schema), schema), [this, schema = std::move(schema)] (mutation& m, const version& ver) {
return boost::accumulate(v, mutation(v.front().par.mut().key(*schema), schema), [this, schema] (mutation& m, const version& ver) {
m.partition().apply(*schema, ver.par.mut().partition());
return std::move(m);
});
@@ -2086,7 +2096,7 @@ storage_proxy::do_query(schema_ptr s,
return query_singular(cmd, std::move(partition_ranges), cl).finally([lc, p] () mutable {
p->_stats.read.mark(lc.stop().latency_in_nano());
if (lc.is_start()) {
p->_stats.estimated_read.add(lc.latency_in_nano(), p->_stats.read.count);
p->_stats.estimated_read.add(lc.latency(), p->_stats.read.count);
}
});
} catch (const no_such_column_family&) {
@@ -2584,30 +2594,26 @@ void storage_proxy::init_messaging_service() {
});
ms.register_mutation([] (frozen_mutation in, std::vector<gms::inet_address> forward, gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id) {
do_with(std::move(in), get_local_shared_storage_proxy(), [forward = std::move(forward), reply_to, shard, response_id] (const frozen_mutation& m, shared_ptr<storage_proxy>& p) {
return make_ready_future<>().then([&p, &m, reply_to, shard, response_id, forward = std::move(forward)] () mutable {
return when_all(
p->mutate_locally(m).then_wrapped([reply_to, shard, response_id] (future<> f) {
try {
f.get();
auto& ms = net::get_local_messaging_service();
ms.send_mutation_done(net::messaging_service::shard_id{reply_to, shard}, shard, response_id).then_wrapped([] (future<> f) {
f.ignore_ready_future();
});
// return void, no need to wait for send to complete
} catch (std::exception& e){
logger.warn("MUTATION verb handler: {}", e.what());
} catch(...) {
logger.warn("MUTATION verb handler: unknown exception is thrown");
}
}),
parallel_for_each(forward.begin(), forward.end(), [reply_to, shard, response_id, &m] (gms::inet_address forward) {
auto& ms = net::get_local_messaging_service();
return ms.send_mutation(net::messaging_service::shard_id{forward, 0}, m, {}, reply_to, shard, response_id).then_wrapped([] (future<> f) {
f.ignore_ready_future();
});
})
);
});
return when_all(
// mutate_locally() may throw; putting it into apply() converts the exception into a failed future (see the aside at the end of this handler).
futurize<void>::apply([&p, &m] {
return p->mutate_locally(m);
}).then([reply_to, shard, response_id] {
auto& ms = net::get_local_messaging_service();
ms.send_mutation_done(net::messaging_service::shard_id{reply_to, shard}, shard, response_id).then_wrapped([] (future<> f) {
f.ignore_ready_future();
});
// return void, no need to wait for send to complete
}).handle_exception([] (std::exception_ptr eptr) {
logger.warn("MUTATION verb handler: {}", eptr);
}),
parallel_for_each(forward.begin(), forward.end(), [reply_to, shard, response_id, &m] (gms::inet_address forward) {
auto& ms = net::get_local_messaging_service();
return ms.send_mutation(net::messaging_service::shard_id{forward, 0}, m, {}, reply_to, shard, response_id).then_wrapped([] (future<> f) {
f.ignore_ready_future();
});
})
);
}).discard_result();
return net::messaging_service::no_wait();
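// Editorial aside (a sketch, not part of the patch): futurize<T>::apply(f, args...)
// invokes f and, if f throws synchronously, returns a failed future instead of
// letting the exception escape. That is why the single handle_exception()
// above covers both a synchronous throw from mutate_locally() and a failure of
// the future it returns. Assumed minimal usage:
//
//   futurize<void>::apply([] { throw std::runtime_error("boom"); })
//       .handle_exception([] (std::exception_ptr ep) {
//           logger.warn("handled asynchronously: {}", ep);
//       });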
@@ -2642,8 +2648,7 @@ void storage_proxy::init_messaging_service() {
});
ms.register_replication_finished([] (gms::inet_address from) {
get_local_storage_service().confirm_replication(from);
return make_ready_future<>();
return get_local_storage_service().confirm_replication(from);
});
}


@@ -63,6 +63,18 @@ class storage_proxy : public seastar::async_sharded_service<storage_proxy> /*imp
rh_entry(std::unique_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
};
using response_id_type = uint64_t;
struct unique_response_handler {
response_id_type id;
storage_proxy& p;
unique_response_handler(storage_proxy& p_, response_id_type id_);
unique_response_handler(const unique_response_handler&) = delete;
unique_response_handler& operator=(const unique_response_handler&) = delete;
unique_response_handler(unique_response_handler&& x);
~unique_response_handler();
response_id_type release();
};
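// Editorial sketch (assumed; the real definitions live in storage_proxy.cc and
// are not shown in this hunk): the wrapper owns one registered response handler
// and unregisters it on destruction unless release() transferred ownership to
// the send path. Id 0 marks the released/moved-from state, which is why
// _next_response_id starts at 1 below.
//
//   storage_proxy::unique_response_handler::unique_response_handler(storage_proxy& p_, response_id_type id_)
//           : id(id_), p(p_) { }
//   storage_proxy::unique_response_handler::unique_response_handler(unique_response_handler&& x)
//           : id(x.id), p(x.p) {
//       x.id = 0; // the moved-from object must not remove the handler
//   }
//   storage_proxy::unique_response_handler::~unique_response_handler() {
//       if (id) {
//           p.remove_response_handler(id);
//       }
//   }
//   storage_proxy::response_id_type storage_proxy::unique_response_handler::release() {
//       auto r = id;
//       id = 0; // caller (e.g. send_to_live_endpoints()) now owns the handler
//       return r;
//   }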
public:
struct stats {
uint64_t read_timeouts = 0;
@@ -84,10 +96,9 @@ public:
uint64_t reads = 0;
uint64_t background_reads = 0; // client no longer waits for the read
};
using response_id_type = uint64_t;
private:
distributed<database>& _db;
response_id_type _next_response_id = 0;
response_id_type _next_response_id = 1; // 0 is reserved for unique_response_handler
std::unordered_map<response_id_type, rh_entry> _response_handlers;
constexpr static size_t _max_hints_in_progress = 128; // Origin multiplies by FBUtilities.getAvailableProcessors(), but we are already sharded
size_t _total_hints_in_progress = 0;
@@ -136,9 +147,9 @@ private:
std::vector<query::partition_range>&& partition_ranges,
db::consistency_level cl);
template<typename Range, typename CreateWriteHandler>
future<std::vector<storage_proxy::response_id_type>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
future<std::vector<storage_proxy::response_id_type>> mutate_prepare(std::vector<mutation>& mutations, db::consistency_level cl, db::write_type type);
future<> mutate_begin(const std::vector<storage_proxy::response_id_type> ids, db::consistency_level cl);
future<std::vector<unique_response_handler>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
future<std::vector<unique_response_handler>> mutate_prepare(std::vector<mutation>& mutations, db::consistency_level cl, db::write_type type);
future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl);
future<> mutate_end(future<> mutate_result, utils::latency_counter);
future<> schedule_repair(std::unordered_map<gms::inet_address, std::vector<mutation>> diffs);
@@ -197,8 +208,6 @@ public:
std::vector<query::partition_range>&& partition_ranges,
db::consistency_level cl);
future<foreign_ptr<lw_shared_ptr<query::result>>> query_local(lw_shared_ptr<query::read_command> cmd, std::vector<query::partition_range>&& partition_ranges);
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> query_mutations_locally(
lw_shared_ptr<query::read_command> cmd, const query::partition_range&);


@@ -85,11 +85,6 @@ static int get_generation_number() {
return generation_number;
}
bool is_replacing() {
// FIXME: DatabaseDescriptor.isReplacing()
return false;
}
bool storage_service::is_auto_bootstrap() {
return _db.local().get_config().auto_bootstrap();
}
@@ -100,34 +95,46 @@ std::set<inet_address> get_seeds() {
return gossiper.get_seeds();
}
std::set<inet_address> get_replace_tokens() {
// FIXME: DatabaseDescriptor.getReplaceTokens()
return {};
std::unordered_set<token> get_replace_tokens() {
std::unordered_set<token> ret;
std::unordered_set<sstring> tokens;
auto tokens_string = get_local_storage_service().db().local().get_config().replace_token();
try {
boost::split(tokens, tokens_string, boost::is_any_of(sstring(",")));
} catch (...) {
throw std::runtime_error(sprint("Unable to parse replace_token=%s", tokens_string));
}
tokens.erase("");
for (auto token_string : tokens) {
auto token = dht::global_partitioner().from_sstring(token_string);
ret.insert(token);
}
return ret;
}
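// Editorial note (not part of the patch): boost::split() emits an empty
// fragment for an empty input and for adjacent or trailing delimiters, hence
// the tokens.erase("") above. For example, replace_token = "t1,,t2," splits
// into the set {"t1", "", "t2"}, and erasing "" leaves {"t1", "t2"}.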
std::experimental::optional<UUID> get_replace_node() {
// FIXME: DatabaseDescriptor.getReplaceNode()
return {};
}
std::experimental::optional<inet_address> get_replace_address() {
// FIXME: DatabaseDescriptor.getReplaceAddress()
return {};
auto replace_node = get_local_storage_service().db().local().get_config().replace_node();
if (replace_node.empty()) {
return std::experimental::nullopt;
}
try {
return utils::UUID(replace_node);
} catch (...) {
logger.error("Format of host-id = {} is incorrect {}", std::current_exception());
throw;
}
}
bool get_property_join_ring() {
// FIXME: Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true")))
return true;
return get_local_storage_service().db().local().get_config().join_ring();
}
bool get_property_rangemovement() {
// FIXME: Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement", "true")
return true;
return get_local_storage_service().db().local().get_config().consistent_rangemovement();
}
bool get_property_load_ring_state() {
// FIXME: Boolean.parseBoolean(System.getProperty("cassandra.load_ring_state", "true"))
return true;
return get_local_storage_service().db().local().get_config().load_ring_state();
}
bool storage_service::should_bootstrap() {
@@ -141,13 +148,13 @@ future<> storage_service::prepare_to_join() {
auto app_states = make_shared<std::map<gms::application_state, gms::versioned_value>>();
auto f = make_ready_future<>();
if (is_replacing() && !get_property_join_ring()) {
if (db().local().is_replacing() && !get_property_join_ring()) {
throw std::runtime_error("Cannot set both join_ring=false and attempt to replace a node");
}
if (get_replace_tokens().size() > 0 || get_replace_node()) {
throw std::runtime_error("Replace method removed; use cassandra.replace_address instead");
}
if (is_replacing()) {
if (db().local().is_replacing()) {
if (db::system_keyspace::bootstrap_complete()) {
throw std::runtime_error("Cannot replace address with a node that is already bootstrapped");
}
@@ -171,8 +178,7 @@ future<> storage_service::prepare_to_join() {
return db::system_keyspace::get_local_host_id();
}).then([this, app_states] (auto local_host_id) mutable {
_token_metadata.update_host_id(local_host_id, this->get_broadcast_address());
// FIXME: DatabaseDescriptor.getBroadcastRpcAddress()
auto broadcast_rpc_address = this->get_broadcast_address();
auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
app_states->emplace(gms::application_state::NET_VERSION, value_factory.network_version());
app_states->emplace(gms::application_state::HOST_ID, value_factory.host_id(local_host_id));
app_states->emplace(gms::application_state::RPC_ADDRESS, value_factory.rpcaddress(broadcast_rpc_address));
@@ -183,7 +189,7 @@ future<> storage_service::prepare_to_join() {
gossiper.register_(this->shared_from_this());
// FIXME: SystemKeyspace.incrementAndGetGeneration()
print("Start gossiper service ...\n");
return gossiper.start(get_generation_number(), *app_states).then([this] {
return gossiper.start_gossiping(get_generation_number(), *app_states).then([this] {
#if SS_DEBUG
gms::get_local_gossiper().debug_show();
_token_metadata.debug_show();
@@ -244,38 +250,53 @@ void storage_service::join_token_ring(int delay) {
}
sleep(std::chrono::seconds(1)).get();
}
#if 0
// if our schema hasn't matched yet, keep sleeping until it does
// (post CASSANDRA-1391 we don't expect this to be necessary very often, but it doesn't hurt to be careful)
while (!MigrationManager.isReadyForBootstrap())
{
while (!get_local_migration_manager().is_ready_for_bootstrap()) {
set_mode(mode::JOINING, "waiting for schema information to complete", true);
Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
sleep(std::chrono::seconds(1)).get();
}
#endif
set_mode(mode::JOINING, "schema complete, ready to bootstrap", true);
set_mode(mode::JOINING, "waiting for pending range calculation", true);
get_local_pending_range_calculator_service().block_until_finished().get();
set_mode(mode::JOINING, "calculation complete, ready to bootstrap", true);
logger.debug("... got ring + schema info");
#if 0
if (Boolean.parseBoolean(System.getProperty("cassandra.consistent.rangemovement", "true")) &&
(
_token_metadata.getBootstrapTokens().valueSet().size() > 0 ||
_token_metadata.getLeavingEndpoints().size() > 0 ||
_token_metadata.getMovingEndpoints().size() > 0
))
throw new UnsupportedOperationException("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
#endif
if (!is_replacing()) {
auto t = gms::gossiper::clk::now();
while (get_property_rangemovement() &&
(!_token_metadata.get_bootstrap_tokens().empty() ||
!_token_metadata.get_leaving_endpoints().empty() ||
!_token_metadata.get_moving_endpoints().empty())) {
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
logger.info("Checking bootstrapping/leaving/moving nodes: tokens {}, leaving {}, moving {}, sleep 1 second and check again ({} seconds elpased)",
_token_metadata.get_bootstrap_tokens().size(),
_token_metadata.get_leaving_endpoints().size(),
_token_metadata.get_moving_endpoints().size(),
elapsed);
sleep(std::chrono::seconds(1)).get();
if (gms::gossiper::clk::now() > t + std::chrono::seconds(60)) {
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
}
// Check the schema and pending range again
while (!get_local_migration_manager().is_ready_for_bootstrap()) {
set_mode(mode::JOINING, "waiting for schema information to complete", true);
sleep(std::chrono::seconds(1)).get();
}
get_local_pending_range_calculator_service().block_until_finished().get();
}
logger.info("Checking bootstrapping/leaving/moving nodes: ok");
if (!db().local().is_replacing()) {
if (_token_metadata.is_member(get_broadcast_address())) {
throw std::runtime_error("This node is already a member of the token ring; bootstrap aborted. (If replacing a dead node, remove the old one from the ring first.)");
}
set_mode(mode::JOINING, "getting bootstrap token", true);
_bootstrap_tokens = boot_strapper::get_bootstrap_tokens(_token_metadata, _db.local());
} else {
auto replace_addr = get_replace_address();
auto replace_addr = db().local().get_replace_address();
if (replace_addr && *replace_addr != get_broadcast_address()) {
// Sleep additionally to make sure that the server actually is not alive,
// and to give it more time to gossip if it is alive.
@@ -376,7 +397,7 @@ future<> storage_service::join_ring() {
future<bool> storage_service::is_joined() {
return run_with_read_api_lock([] (storage_service& ss) {
return ss._joined;
return ss._joined && !ss._is_survey_mode;
});
}
@@ -387,7 +408,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
// DON'T use set_token, that makes us part of the ring locally which is incorrect until we are done bootstrapping
db::system_keyspace::update_tokens(tokens).get();
auto& gossiper = gms::get_local_gossiper();
if (!is_replacing()) {
if (!db().local().is_replacing()) {
// if not an existing token then bootstrap
gossiper.add_local_application_state(gms::application_state::TOKENS, value_factory.tokens(tokens)).get();
gossiper.add_local_application_state(gms::application_state::STATUS, value_factory.bootstrapping(tokens)).get();
@@ -396,7 +417,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
} else {
// Dont set any state for the node which is bootstrapping the existing token...
_token_metadata.update_normal_tokens(tokens, get_broadcast_address());
auto replace_addr = get_replace_address();
auto replace_addr = db().local().get_replace_address();
if (replace_addr) {
db::system_keyspace::remove_endpoint(*replace_addr).get();
}
@@ -462,10 +483,10 @@ void storage_service::handle_state_normal(inet_address endpoint) {
if (gossiper.uses_host_id(endpoint)) {
auto host_id = gossiper.get_host_id(endpoint);
auto existing = _token_metadata.get_endpoint_for_host_id(host_id);
if (is_replacing() &&
get_replace_address() &&
gossiper.get_endpoint_state_for_endpoint(get_replace_address().value()) &&
(host_id == gossiper.get_host_id(get_replace_address().value()))) {
if (db().local().is_replacing() &&
db().local().get_replace_address() &&
gossiper.get_endpoint_state_for_endpoint(db().local().get_replace_address().value()) &&
(host_id == gossiper.get_host_id(db().local().get_replace_address().value()))) {
logger.warn("Not updating token metadata for {} because I am replacing it", endpoint);
} else {
if (existing && *existing != endpoint) {
@@ -522,11 +543,17 @@ void storage_service::handle_state_normal(inet_address endpoint) {
}
bool is_moving = _token_metadata.is_moving(endpoint); // capture because updateNormalTokens clears moving status
// Update the pending ranges immediately after updating the normal tokens, to
// avoid a race where the natural endpoints already contain node A while A has
// not yet been removed from the pending endpoints.
_token_metadata.update_normal_tokens(tokens_to_update_in_metadata, endpoint);
get_local_pending_range_calculator_service().do_update();
for (auto ep : endpoints_to_remove) {
remove_endpoint(ep);
auto replace_addr = get_replace_address();
if (is_replacing() && replace_addr && *replace_addr == ep) {
auto replace_addr = db().local().get_replace_address();
if (db().local().is_replacing() && replace_addr && *replace_addr == ep) {
gossiper.replacement_quarantine(ep); // quarantine locally longer than normally; see CASSANDRA-8260
}
}
@@ -545,7 +572,7 @@ void storage_service::handle_state_normal(inet_address endpoint) {
db::system_keyspace::update_local_tokens(std::unordered_set<dht::token>(), local_tokens_to_remove).discard_result().get();
}
if (is_moving) {
if (is_moving || _operation_mode == mode::MOVING) {
_token_metadata.remove_from_moving(endpoint);
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
@@ -702,7 +729,8 @@ void storage_service::on_change(inet_address endpoint, application_state state,
sstring move_name = pieces[0];
if (move_name == sstring(versioned_value::STATUS_BOOTSTRAPPING)) {
handle_state_bootstrap(endpoint);
} else if (move_name == sstring(versioned_value::STATUS_NORMAL)) {
} else if (move_name == sstring(versioned_value::STATUS_NORMAL) ||
move_name == sstring(versioned_value::SHUTDOWN)) {
handle_state_normal(endpoint);
} else if (move_name == sstring(versioned_value::REMOVING_TOKEN) ||
move_name == sstring(versioned_value::REMOVED_TOKEN)) {
@@ -721,11 +749,13 @@ void storage_service::on_change(inet_address endpoint, application_state state,
logger.debug("Ignoring state change for dead or unknown endpoint: {}", endpoint);
return;
}
do_update_system_peers_table(endpoint, state, value);
if (state == application_state::SCHEMA) {
get_local_migration_manager().schedule_schema_pull(endpoint, *ep_state).handle_exception([endpoint] (auto ep) {
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
});
if (get_token_metadata().is_member(endpoint)) {
do_update_system_peers_table(endpoint, state, value);
if (state == application_state::SCHEMA) {
get_local_migration_manager().schedule_schema_pull(endpoint, *ep_state).handle_exception([endpoint] (auto ep) {
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
});
}
}
}
replicate_to_all_cores().get();
@@ -740,9 +770,7 @@ void storage_service::on_remove(gms::inet_address endpoint) {
void storage_service::on_dead(gms::inet_address endpoint, gms::endpoint_state state) {
logger.debug("on_dead endpoint={}", endpoint);
#if 0
MessagingService.instance().convict(endpoint);
#endif
net::get_local_messaging_service().remove_rpc_client(net::shard_id{endpoint, 0});
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
for (auto&& subscriber : ss._lifecycle_subscribers) {
subscriber->on_down(endpoint);
@@ -845,11 +873,15 @@ void storage_service::set_tokens(std::unordered_set<token> tokens) {
db::system_keyspace::update_tokens(tokens).get();
_token_metadata.update_normal_tokens(tokens, get_broadcast_address());
auto local_tokens = get_local_tokens();
set_gossip_tokens(local_tokens);
set_mode(mode::NORMAL, "node is now in normal status", true);
replicate_to_all_cores().get();
}
void storage_service::set_gossip_tokens(const std::unordered_set<dht::token>& local_tokens) {
auto& gossiper = gms::get_local_gossiper();
gossiper.add_local_application_state(gms::application_state::TOKENS, value_factory.tokens(local_tokens)).get();
gossiper.add_local_application_state(gms::application_state::STATUS, value_factory.normal(local_tokens)).get();
set_mode(mode::NORMAL, false);
replicate_to_all_cores().get();
}
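// Editorial note: set_tokens() now delegates the gossip announcements to
// set_gossip_tokens(), which start_gossiping() also calls (further down) to
// re-publish the local tokens when gossip is restarted by operator request.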
void storage_service::register_subscriber(endpoint_lifecycle_subscriber* subscriber)
@@ -1037,33 +1069,50 @@ future<> storage_service::check_for_endpoint_collision() {
if (!MessagingService.instance().isListening())
MessagingService.instance().listen(FBUtilities.getLocalAddress());
#endif
auto& gossiper = gms::get_local_gossiper();
return gossiper.do_shadow_round().then([this, &gossiper] {
auto addr = get_broadcast_address();
auto eps = gossiper.get_endpoint_state_for_endpoint(addr);
if (eps && !gossiper.is_dead_state(*eps) && !gossiper.is_gossip_only_member(addr)) {
throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "
"Use cassandra.replace_address if you want to replace this node.", addr));
}
if (dht::range_streamer::use_strict_consistency()) {
for (auto& x : gossiper.get_endpoint_states()) {
auto status = x.second.get_application_state(application_state::STATUS);
if (!status) {
continue;
}
return seastar::async([this] {
auto& gossiper = gms::get_local_gossiper();
auto t = gms::gossiper::clk::now();
bool found_bootstrapping_node = false;
do {
gossiper.do_shadow_round().get();
auto addr = get_broadcast_address();
auto eps = gossiper.get_endpoint_state_for_endpoint(addr);
if (eps && !gossiper.is_dead_state(*eps) && !gossiper.is_gossip_only_member(addr)) {
throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "
"Use cassandra.replace_address if you want to replace this node.", addr));
}
if (dht::range_streamer::use_strict_consistency()) {
found_bootstrapping_node = false;
for (auto& x : gossiper.get_endpoint_states()) {
auto status = x.second.get_application_state(application_state::STATUS);
if (!status) {
continue;
}
std::vector<sstring> pieces;
boost::split(pieces, status.value().value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
assert(pieces.size() > 0);
auto state = pieces[0];
logger.debug("Check node={}, state={}", x.first, state);
if (state == sstring(versioned_value::STATUS_BOOTSTRAPPING) ||
state == sstring(versioned_value::STATUS_LEAVING) ||
state == sstring(versioned_value::STATUS_MOVING)) {
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true");
std::vector<sstring> pieces;
boost::split(pieces, status.value().value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
assert(pieces.size() > 0);
auto state = pieces[0];
logger.debug("Checking node={}, status={} (check_for_endpoint_collision)", x.first, state);
if (state == sstring(versioned_value::STATUS_BOOTSTRAPPING) ||
state == sstring(versioned_value::STATUS_LEAVING) ||
state == sstring(versioned_value::STATUS_MOVING)) {
if (gms::gossiper::clk::now() > t + std::chrono::seconds(60)) {
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while cassandra.consistent.rangemovement is true (check_for_endpoint_collision)");
} else {
gossiper.goto_shadow_round();
gossiper.reset_endpoint_state_map();
found_bootstrapping_node = true;
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
logger.info("Checking node={}, status={}, sleep 1 second and check again ({} seconds elpased) (check_for_endpoint_collision)", x.first, state, elapsed);
sleep(std::chrono::seconds(1)).get();
break;
}
}
}
}
}
} while (found_bootstrapping_node);
logger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
gossiper.reset_endpoint_state_map();
});
}
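// Editorial note: with consistent range movement enabled, the loop above
// discards the shadow-round state (goto_shadow_round() plus
// reset_endpoint_state_map()), sleeps one second and retries while any peer
// still advertises a bootstrapping/leaving/moving status, failing the join
// only after 60 seconds.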
@@ -1083,10 +1132,10 @@ void storage_service::remove_endpoint(inet_address endpoint) {
}
future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
if (!get_replace_address()) {
if (!db().local().get_replace_address()) {
throw std::runtime_error(sprint("replace_address is empty"));
}
auto replace_address = get_replace_address().value();
auto replace_address = db().local().get_replace_address().value();
logger.info("Gathering node replacement information for {}", replace_address);
// if (!MessagingService.instance().isListening())
@@ -1242,13 +1291,16 @@ future<bool> storage_service::is_gossip_running() {
future<> storage_service::start_gossiping() {
return run_with_write_api_lock([] (storage_service& ss) {
if (!ss._initialized) {
logger.warn("Starting gossip by operator request");
return gms::get_local_gossiper().start(get_generation_number()).then([&ss] {
ss._initialized = true;
});
}
return make_ready_future<>();
return seastar::async([&ss] {
if (!ss._initialized) {
logger.warn("Starting gossip by operator request");
ss.set_gossip_tokens(ss.get_local_tokens());
gms::get_local_gossiper().force_newer_generation();
gms::get_local_gossiper().start_gossiping(get_generation_number()).then([&ss] {
ss._initialized = true;
}).get();
}
});
});
}
@@ -1256,7 +1308,7 @@ future<> storage_service::stop_gossiping() {
return run_with_write_api_lock([] (storage_service& ss) {
if (ss._initialized) {
logger.warn("Stopping gossip by operator request");
return gms::get_local_gossiper().stop().then([&ss] {
return gms::get_local_gossiper().stop_gossiping().then([&ss] {
ss._initialized = false;
});
}
@@ -1559,6 +1611,10 @@ future<> storage_service::decommission() {
throw std::runtime_error("no other normal nodes in the ring; decommission would be pointless");
}
if (ss._operation_mode != mode::NORMAL) {
throw std::runtime_error(sprint("Node in %s state; wait for status to become normal or restart", ss._operation_mode));
}
get_local_pending_range_calculator_service().block_until_finished().get();
auto non_system_keyspaces = db.get_non_system_keyspaces();
@@ -1579,7 +1635,7 @@ future<> storage_service::decommission() {
// FIXME: proper shutdown
ss.shutdown_client_servers().get();
gms::get_local_gossiper().stop().get();
gms::get_local_gossiper().stop_gossiping().get();
// MessagingService.instance().shutdown();
// StageManager.shutdownNow();
ss.set_mode(mode::DECOMMISSIONED, true);
@@ -1630,6 +1686,7 @@ future<> storage_service::remove_node(sstring host_id_string) {
auto& ks = ss.db().local().find_keyspace(keyspace_name);
// if the replication factor is 1 the data is lost so we shouldn't wait for confirmation
if (ks.get_replication_strategy().get_replication_factor() == 1) {
logger.warn("keyspace={} has replication factor 1, the data is probably lost", keyspace_name);
continue;
}
@@ -1653,7 +1710,7 @@ future<> storage_service::remove_node(sstring host_id_string) {
// the gossiper will handle spoofing this node's state to REMOVING_TOKEN for us
// we add our own token so other nodes to let us know when they're done
gossiper.advertise_removing(endpoint, host_id, local_host_id);
gossiper.advertise_removing(endpoint, host_id, local_host_id).get();
// kick off streaming commands
ss.restore_replica_count(endpoint, my_address).get();
@@ -1667,7 +1724,7 @@ future<> storage_service::remove_node(sstring host_id_string) {
ss.excise(std::move(tmp), endpoint);
// gossiper will indicate the token has left
gossiper.advertise_token_removed(endpoint, host_id);
gossiper.advertise_token_removed(endpoint, host_id).get();
ss._replicating_nodes.clear();
ss._removing_node = {};
@@ -2034,15 +2091,17 @@ future<> storage_service::send_replication_notification(inet_address remote) {
);
}
void storage_service::confirm_replication(inet_address node) {
// replicatingNodes can be empty in the case where this node used to be a removal coordinator,
// but restarted before all 'replication finished' messages arrived. In that case, we'll
// still go ahead and acknowledge it.
if (!_replicating_nodes.empty()) {
_replicating_nodes.erase(node);
} else {
logger.info("Received unexpected REPLICATION_FINISHED message from {}. Was this node recently a removal coordinator?", node);
}
future<> storage_service::confirm_replication(inet_address node) {
return run_with_no_api_lock([node] (storage_service& ss) {
// replicatingNodes can be empty in the case where this node used to be a removal coordinator,
// but restarted before all 'replication finished' messages arrived. In that case, we'll
// still go ahead and acknowledge it.
if (!ss._replicating_nodes.empty()) {
ss._replicating_nodes.erase(node);
} else {
logger.info("Received unexpected REPLICATION_FINISHED message from {}. Was this node recently a removal coordinator?", node);
}
});
}
// Runs inside seastar::async context


@@ -248,6 +248,7 @@ public:
/** This method updates the local token on disk */
void set_tokens(std::unordered_set<token> tokens);
void set_gossip_tokens(const std::unordered_set<dht::token>& local_tokens);
#if 0
public void registerDaemon(CassandraDaemon daemon)
@@ -768,7 +769,7 @@ private:
*/
std::unordered_multimap<inet_address, range<token>> get_new_source_ranges(const sstring& keyspaceName, const std::vector<range<token>>& ranges);
public:
void confirm_replication(inet_address node);
future<> confirm_replication(inet_address node);
private:


@@ -45,10 +45,13 @@
#include <cmath>
#include <algorithm>
#include <vector>
#include <chrono>
namespace sstables {
struct estimated_histogram {
using clock = std::chrono::steady_clock;
using duration = clock::duration;
/**
* The series of values to which the counts in `buckets` correspond:
* 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 20, etc.
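* (Editorial note: consistent with the series above, each subsequent offset
* appears to be max(previous + 1, round(previous * 1.2)).)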
@@ -126,7 +129,7 @@ public:
if (low == bucket_offsets.end()) {
low--;
}
auto pos = low - bucket_offsets.begin();
auto pos = std::distance(bucket_offsets.begin(), low);
buckets.at(pos)++;
_count++;
}
@@ -138,7 +141,8 @@ public:
* to the new count
* @param n
*/
void add(int64_t n, int64_t new_count) {
void add_nano(int64_t n, int64_t new_count) {
n /= 1000;
if (new_count <= _count) {
return;
}
@@ -146,11 +150,15 @@ public:
if (low == bucket_offsets.end()) {
low--;
}
auto pos = low - bucket_offsets.begin();
auto pos = std::distance(bucket_offsets.begin(), low);
buckets.at(pos)+= new_count - _count;
_count = new_count;
}
void add(duration latency, int64_t new_count) {
add_nano(std::chrono::duration_cast<std::chrono::nanoseconds>(latency).count(), new_count);
}
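// Editorial note: add(duration, n) forwards the latency in nanoseconds and
// add_nano() divides by 1000, so the buckets are effectively binned in
// microseconds. Assumed usage:
//
//   hist.add(std::chrono::milliseconds(3), new_count); // lands near the 3000us offset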
/**
* @return the smallest value that could have been added to this histogram
*/


@@ -202,11 +202,14 @@ public:
auto current_first = current->get_first_decorated_key(s);
if (previous != nullptr && current_first.tri_compare(s, previous->get_last_decorated_key(s)) <= 0) {
#if 0
logger.warn(String.format("At level %d, %s [%s, %s] overlaps %s [%s, %s]. This could be caused by a bug in Cassandra 1.1.0 .. 1.1.3 or due to the fact that you have dropped sstables from another node into the data directory. " +
"Sending back to L0. If you didn't drop in sstables, and have not yet run scrub, you should do so since you may also have rows out-of-order within an sstable",
level, previous, previous.first, previous.last, current, current.first, current.last));
#endif
logger.warn("At level {}, {} [{}, {}] overlaps {} [{}, {}]. This could be caused by a bug in Cassandra 1.1.0 .. 1.1.3 " \
"or due to the fact that you have dropped sstables from another node into the data directory. " \
"Sending back to L0. If you didn't drop in sstables, and have not yet run scrub, you should do so since you may also " \
"have rows out-of-order within an sstable",
level, previous->get_filename(), previous->get_first_partition_key(s), previous->get_last_partition_key(s),
current->get_filename(), current->get_first_partition_key(s), current->get_last_partition_key(s));
out_of_order_sstables.push_back(current);
} else {
previous = &*current;
@@ -605,9 +608,9 @@ public:
std::vector<sstables::shared_sstable> get_candidates_for(int level) {
const schema& s = *_schema;
assert(!get_level(level).empty());
#if 0
logger.debug("Choosing candidates for L{}", level);
logger.debug("Choosing candidates for L{}", level);
#if 0
final Set<SSTableReader> compacting = cfs.getDataTracker().getCompacting();
#endif
if (level == 0) {


@@ -347,8 +347,13 @@ public:
};
data_consume_context::~data_consume_context() = default;
data_consume_context::data_consume_context(data_consume_context&&) = default;
data_consume_context& data_consume_context::operator=(data_consume_context&&) = default;
data_consume_context::data_consume_context(data_consume_context&& o) noexcept
: _pimpl(std::move(o._pimpl))
{ }
data_consume_context& data_consume_context::operator=(data_consume_context&& o) noexcept {
_pimpl = std::move(o._pimpl);
return *this;
}
data_consume_context::data_consume_context(std::unique_ptr<impl> p) : _pimpl(std::move(p)) { }
future<> data_consume_context::read() {
return _pimpl->read();


@@ -39,6 +39,7 @@
#include "index_reader.hh"
#include "remove.hh"
#include "memtable.hh"
#include "range.hh"
#include "downsampling.hh"
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>
@@ -821,7 +822,7 @@ future<> sstable::read_simple(T& component) {
auto file_path = filename(Type);
sstlog.debug(("Reading " + _component_map[Type] + " file {} ").c_str(), file_path);
return engine().open_file_dma(file_path, open_flags::ro).then([this, &component] (file f) {
auto r = make_lw_shared<file_random_access_reader>(std::move(f), 4096);
auto r = make_lw_shared<file_random_access_reader>(std::move(f), sstable_buffer_size);
auto fut = parse(*r, component);
return fut.finally([r = std::move(r)] {
return r->close();
@@ -1701,23 +1702,6 @@ remove_by_toc_name(sstring sstable_toc_name) {
});
}
static future<bool>
file_exists(sstring filename) {
return engine().open_file_dma(filename, open_flags::ro).then([] (file f) {
return f.close().finally([f] {});
}).then_wrapped([] (future<> f) {
bool exists = true;
try {
f.get();
} catch (std::system_error& e) {
if (e.code() == std::error_code(ENOENT, std::system_category())) {
exists = false;
}
}
return make_ready_future<bool>(exists);
});
}
future<>
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
return seastar::async([ks, cf, dir, generation, v, f] {
@@ -1759,4 +1743,21 @@ sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64
});
}
future<range<partition_key>>
sstable::get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
auto sst = std::make_unique<sstable>(ks, cf, dir, generation, v, f);
auto fut = sst->read_summary();
return std::move(fut).then([sst = std::move(sst), &s] () mutable {
auto first = sst->get_first_partition_key(s);
auto last = sst->get_last_partition_key(s);
return make_ready_future<range<partition_key>>(range<partition_key>::make(first, last));
});
}
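// Editorial note (illustrative, assumed): this lets a caller test whether an
// sstable overlaps a shard's key range using only the summary component, e.g.
//
//   sstable::get_sstable_key_range(*s, ks, cf, dir, gen, v, f).then(
//       [] (range<partition_key> r) { /* compare r with the shard's ranges */ });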
void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
auto sst = sstable(ks, cf, dir, generation, v, f);
sstlog.info("sstable {} not relevant for this shard, ignoring", sst.get_filename());
sst.mark_for_deletion();
}
}


@@ -76,8 +76,8 @@ public:
// Define (as defaults) the destructor and move operations in the source
// file, so here we don't need to know the incomplete impl type.
~data_consume_context();
data_consume_context(data_consume_context&&);
data_consume_context& operator=(data_consume_context&&);
data_consume_context(data_consume_context&&) noexcept;
data_consume_context& operator=(data_consume_context&&) noexcept;
};
// mutation_reader is an object returned by sstable::read_rows() et al. which
@@ -507,6 +507,15 @@ public:
future<> mutate_sstable_level(uint32_t);
// Return sstable key range as range<partition_key> reading only the summary component.
static future<range<partition_key>>
get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f);
// Used to mark a sstable for deletion that is not relevant to the current shard.
// It doesn't mean that the sstable will be deleted, but that the sstable is not
// relevant to the current shard, thus can be deleted by the deletion manager.
static void mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f);
// Allow the test cases from sstable_test.cc to test private methods. We use
// a placeholder to avoid cluttering this class too much. The sstable_test class
// will then re-export as public every method it needs.


@@ -93,14 +93,22 @@ void stream_transfer_task::start() {
return stop_iteration::yes;
}
sslog.debug("[Stream #{}] SEND STREAM_MUTATION to {}, cf_id={}", plan_id, id, cf_id);
session->ms().send_stream_mutation(id, session->plan_id(), *fm, session->dst_cpu_id).then_wrapped([&msg, this, plan_id, id, fm] (auto&& f) {
session->ms().send_stream_mutation(id, session->plan_id(), *fm, session->dst_cpu_id).then_wrapped([&msg, this, cf_id, plan_id, id, fm] (auto&& f) {
try {
f.get();
sslog.debug("[Stream #{}] GOT STREAM_MUTATION Reply", plan_id);
msg.mutations_done.signal();
} catch (...) {
sslog.error("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION to {}: {}", plan_id, id, std::current_exception());
msg.mutations_done.broken();
} catch (std::exception& e) {
auto err = std::string(e.what());
// Seastar RPC does not propagate exception type info, so we cannot catch no_such_column_family here;
// compare the error message instead.
if (err.find("Can't find a column family with UUID") != std::string::npos) {
sslog.info("[Stream #{}] remote node {} does not have the cf_id = {}", plan_id, id, cf_id);
msg.mutations_done.signal();
} else {
sslog.error("[Stream #{}] stream_transfer_task: Fail to send STREAM_MUTATION to {}: {}", plan_id, id, err);
msg.mutations_done.broken();
}
}
}).finally([] {
get_local_stream_manager().mutation_send_limiter().signal();


@@ -42,18 +42,24 @@
using namespace db;
typedef std::pair<tmpdir, commitlog> tmplog;
typedef lw_shared_ptr<tmplog> tmplog_ptr;
// create tmp dir + commit log
static future<tmplog_ptr> make_commitlog(commitlog::config cfg =
commitlog::config()) {
template<typename Func>
static future<> cl_test(commitlog::config cfg, Func && f) {
tmpdir tmp;
cfg.commit_log_location = tmp.path;
return commitlog::create_commitlog(cfg).then(
[tmp = std::move(tmp)](commitlog log) mutable {
return make_ready_future<tmplog_ptr>(make_lw_shared<tmplog>(std::move(tmp), std::move(log)));
return commitlog::create_commitlog(cfg).then([f = std::forward<Func>(f)](commitlog log) mutable {
return do_with(std::move(log), [f = std::forward<Func>(f)](commitlog& log) {
return futurize<std::result_of_t<Func(commitlog&)>>::apply(f, log).finally([&log] {
return log.clear();
});
});
}).finally([tmp = std::move(tmp)] {
});
}
template<typename Func>
static future<> cl_test(Func && f) {
commitlog::config cfg;
return cl_test(cfg, std::forward<Func>(f));
}
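// Editorial note: cl_test() owns the tmpdir and the commitlog for the duration
// of the test body; futurize<...>::apply() lets the body throw synchronously or
// return a future, and the finally() blocks guarantee that log.clear() and the
// tmpdir cleanup run on every path. The tests below pass the body as a lambda
// taking commitlog&.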
#if 0
@@ -63,102 +69,14 @@ static int loggo = [] {
}();
#endif
class file_lister {
file _f;
subscription<directory_entry> _listing;
public:
file_lister(file f)
: _f(std::move(f)), _listing(
_f.list_directory(
[this] (directory_entry de) {return report(de);})) {
}
future<> done() {
return _listing.done();
}
const std::vector<directory_entry> & contents() const {
return _contents;
}
private:
std::vector<directory_entry> _contents;
future<> report(directory_entry de) {
_contents.emplace_back(de);
return make_ready_future<>();
}
};
static future<lw_shared_ptr<file_lister>> list_files(sstring path) {
return engine().open_directory(path).then([](auto dir) {
auto l = make_lw_shared<file_lister>(std::move(dir));
return l->done().then([l]() {
return make_ready_future<lw_shared_ptr<file_lister>>(l);
});
});
}
future<std::experimental::optional<directory_entry_type>> entry_type(const sstring & path, const directory_entry & de) {
if (!de.type && !de.name.empty()) {
return engine().file_type(path + "/" + de.name);
}
return make_ready_future<std::experimental::optional<directory_entry_type>>(de.type);
};
static future<size_t> count_files(sstring path) {
return list_files(path).then([path](auto l) {
auto n = make_lw_shared<size_t>(0);
return parallel_for_each(l->contents(), [n, path](auto de) {
return entry_type(path, de).then([n](auto type) {
if (type == directory_entry_type::regular) {
++(*n);
}
});
}).then([n] {
return make_ready_future<size_t>(*n);
});
});
}
static future<size_t> count_files_with_size(sstring path) {
return list_files(path).then([path](auto l) {
auto n = make_lw_shared<size_t>(0);
return parallel_for_each(l->contents().begin(), l->contents().end(), [n, path](directory_entry de) {
return entry_type(path, de).then([n, path, de](auto type) {
if (type == directory_entry_type::regular) {
return engine().open_file_dma(path + "/" + de.name, open_flags::ro).then([n](file f) {
return do_with(std::move(f), [n] (auto& f) {
return f.stat().then([n](struct stat s) {
if (s.st_size > 0) {
++(*n);
}
});
});
});
}
return make_ready_future();
});
}).then([n]() {
return make_ready_future<size_t>(*n);
});
});
}
namespace db {
template<typename... Args>
inline std::basic_ostream<Args...> & operator<<(std::basic_ostream<Args...> & os, const db::replay_position & rp) {
return os << "[" << rp.id << ", " << rp.pos << "]" << std::endl;
}
}
// just write in-memory...
SEASTAR_TEST_CASE(test_create_commitlog){
return make_commitlog().then([](tmplog_ptr log) {
return cl_test([](commitlog& log) {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(utils::UUID_gen::get_time_UUID(), tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(utils::UUID_gen::get_time_UUID(), tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([](db::replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
}).finally([log]() {
return log->second.clear().then([log] {});
});
});
}
@@ -167,39 +85,33 @@ SEASTAR_TEST_CASE(test_create_commitlog){
SEASTAR_TEST_CASE(test_commitlog_written_to_disk_batch){
commitlog::config cfg;
cfg.mode = commitlog::sync_mode::BATCH;
return make_commitlog(cfg).then([](tmplog_ptr log) {
return cl_test(cfg, [](commitlog& log) {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(utils::UUID_gen::get_time_UUID(), tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(utils::UUID_gen::get_time_UUID(), tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log](replay_position rp) {
}).then([&log](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
return count_files_with_size(log->first.path).then([log](size_t n) {
BOOST_REQUIRE(n > 0);
});
}).finally([log]() {
return log->second.clear().then([log] {});
auto n = log.get_flush_count();
BOOST_REQUIRE(n > 0);
});
});
}
SEASTAR_TEST_CASE(test_commitlog_written_to_disk_periodic){
return make_commitlog().then([](tmplog_ptr log) {
return cl_test([](commitlog& log) {
auto state = make_lw_shared(false);
auto uuid = utils::UUID_gen::get_time_UUID();
return do_until([state]() {return *state;},
[log, state, uuid]() {
[&log, state, uuid]() {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log, state](replay_position rp) {
}).then([&log, state](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
return count_files_with_size(log->first.path).then([state](size_t n) {
*state = n > 0;
});
auto n = log.get_flush_count();
*state = n > 0;
});
}).finally([log]() {
return log->second.clear().then([log] {});
});
});
}
@@ -207,34 +119,39 @@ SEASTAR_TEST_CASE(test_commitlog_written_to_disk_periodic){
SEASTAR_TEST_CASE(test_commitlog_new_segment){
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return make_commitlog(cfg).then([](tmplog_ptr log) {
return do_with(std::unordered_set<db::segment_id_type>(), [log](auto& set) {
return cl_test(cfg, [](commitlog& log) {
return do_with(std::unordered_set<db::segment_id_type>(), [&log](auto& set) {
auto uuid = utils::UUID_gen::get_time_UUID();
return do_until([&set]() { return set.size() > 1; }, [log, &set, uuid]() {
return do_until([&set]() { return set.size() > 1; }, [&log, &set, uuid]() {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log, &set](replay_position rp) {
}).then([&set](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
set.insert(rp.id);
});
});
}).then([log] {
return count_files(log->first.path).then([](size_t n) {
BOOST_REQUIRE(n > 1);
});
}).finally([log] {
return log->second.clear().then([log] {});
}).then([&log] {
auto n = log.get_active_segment_names().size();
BOOST_REQUIRE(n > 1);
});
});
}
typedef std::vector<sstring> segment_names;
// Returns the segment names present in "prev" but no longer active, i.e. the
// segments that went away since "prev" was captured.
static segment_names segment_diff(commitlog& log, segment_names prev = {}) {
segment_names now = log.get_active_segment_names();
segment_names diff;
std::set_difference(prev.begin(), prev.end(), now.begin(), now.end(), std::back_inserter(diff));
return diff;
}
SEASTAR_TEST_CASE(test_commitlog_discard_completed_segments){
//logging::logger_registry().set_logger_level("commitlog", logging::log_level::trace);
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return make_commitlog(cfg).then([](tmplog_ptr log) {
return cl_test(cfg, [](commitlog& log) {
struct state_type {
std::vector<utils::UUID> uuids;
std::unordered_map<utils::UUID, replay_position> rps;
@@ -254,57 +171,54 @@ SEASTAR_TEST_CASE(test_commitlog_discard_completed_segments){
auto state = make_lw_shared<state_type>();
return do_until([state]() { return state->ids.size() > 1; },
[log, state]() {
[&log, state]() {
sstring tmp = "hej bubba cow";
auto uuid = state->next_uuid();
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log, state, uuid](replay_position pos) {
}).then([state, uuid](replay_position pos) {
state->ids.insert(pos.id);
state->rps[uuid] = pos;
});
}).then([log, state]() {
return count_files(log->first.path).then([log, state](size_t n) {
BOOST_REQUIRE(n > 1);
// sync all so we have no outstanding async sync ops that
// might prevent discard_completed_segments to actually dispose
// of clean segments (shared_ptr in task)
return log->second.sync_all_segments().then([log, state, n] {
for (auto & p : state->rps) {
log->second.discard_completed_segments(p.first, p.second);
}
size_t nn = log->second.get_num_segments_destroyed();
BOOST_REQUIRE(nn > 0);
BOOST_REQUIRE(nn <= n);
});
});
}).finally([log]() {
return log->second.clear().then([log] {});
}).then([&log, state]() {
auto names = log.get_active_segment_names();
BOOST_REQUIRE(names.size() > 1);
// sync all so we have no outstanding async sync ops that
// might prevent discard_completed_segments to actually dispose
// of clean segments (shared_ptr in task)
return log.sync_all_segments().then([&log, state, names] {
for (auto & p : state->rps) {
log.discard_completed_segments(p.first, p.second);
}
auto diff = segment_diff(log, names);
auto nn = diff.size();
auto dn = log.get_num_segments_destroyed();
BOOST_REQUIRE(nn > 0);
BOOST_REQUIRE(nn <= names.size());
BOOST_REQUIRE(dn <= nn);
});
});
});
}
SEASTAR_TEST_CASE(test_equal_record_limit){
return make_commitlog().then([](tmplog_ptr log) {
auto size = log->second.max_record_size();
return log->second.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
return cl_test([](commitlog& log) {
auto size = log.max_record_size();
return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
dst.write(char(1), size);
}).then([](db::replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
}).finally([log]() {
return log->second.clear().then([log] {});
});
});
}
SEASTAR_TEST_CASE(test_exceed_record_limit){
return make_commitlog().then([](tmplog_ptr log) {
auto size = log->second.max_record_size() + 1;
return log->second.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
return cl_test([](commitlog& log) {
auto size = log.max_record_size() + 1;
return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
dst.write(char(1), size);
}).then([](db::replay_position rp) {
// should not reach.
}).then_wrapped([](future<> f) {
}).then_wrapped([](future<db::replay_position> f) {
try {
f.get();
} catch (...) {
@@ -312,59 +226,65 @@ SEASTAR_TEST_CASE(test_exceed_record_limit){
return make_ready_future();
}
throw std::runtime_error("Did not get expected exception from writing too large record");
}).finally([log]() {
return log->second.clear().then([log] {});
});
});
}
SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit){
SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 2;
cfg.commitlog_total_space_in_mb = 1;
return make_commitlog(cfg).then([](tmplog_ptr log) {
cfg.commitlog_sync_period_in_ms = 1;
return cl_test(cfg, [](commitlog& log) {
auto sem = make_lw_shared<semaphore>(0);
auto segments = make_lw_shared<segment_names>();
// add a flush handler that simply says we're done with the range.
auto r = log->second.add_flush_handler([log, sem](cf_id_type id, replay_position pos) {
log->second.discard_completed_segments(id, pos);
auto r = log.add_flush_handler([&log, sem, segments](cf_id_type id, replay_position pos) {
*segments = log.get_active_segment_names();
log.discard_completed_segments(id, pos);
sem->signal();
});
auto set = make_lw_shared<std::set<segment_id_type>>();
auto uuid = utils::UUID_gen::get_time_UUID();
return do_until([set, sem]() {return set->size() > 1 && sem->try_wait();},
[log, set, uuid]() {
return do_until([set]() {return set->size() > 2;},
[&log, set, uuid]() {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log, set](replay_position rp) {
}).then([set](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
set->insert(rp.id);
});
}).then([log]() {
auto n = log->second.get_active_segment_names().size();
auto d = log->second.get_num_segments_destroyed();
BOOST_REQUIRE(n > 0);
BOOST_REQUIRE(d > 0);
}).finally([log, r = std::move(r)]() {
return log->second.clear().then([log] {});
}).then([&log, sem, segments]() {
auto names = log.get_active_segment_names();
auto diff = segment_diff(log, *segments);
auto nn = diff.size();
auto dn = log.get_num_segments_destroyed();
BOOST_REQUIRE(nn > 0);
BOOST_REQUIRE(nn <= names.size());
BOOST_REQUIRE(dn <= nn);
}).finally([r = std::move(r)] {
});
}).then([]{});
});
}
SEASTAR_TEST_CASE(test_commitlog_reader){
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return make_commitlog(cfg).then([](tmplog_ptr log) {
return cl_test(cfg, [](commitlog& log) {
auto set = make_lw_shared<std::set<segment_id_type>>();
auto count = make_lw_shared<size_t>(0);
auto count2 = make_lw_shared<size_t>(0);
auto uuid = utils::UUID_gen::get_time_UUID();
return do_until([count, set]() {return set->size() > 1;},
[log, uuid, count, set]() {
[&log, uuid, count, set]() {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([log, set, count](replay_position rp) {
}).then([&log, set, count](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
set->insert(rp.id);
if (set->size() == 1) {
@@ -372,34 +292,134 @@ SEASTAR_TEST_CASE(test_commitlog_reader){
}
});
}).then([log]() {
return count_files(log->first.path).then([](size_t n) {
BOOST_REQUIRE(n > 1);
}).then([&log, set, count2]() {
auto segments = log.get_active_segment_names();
BOOST_REQUIRE(segments.size() > 1);
auto id = *set->begin();
auto i = std::find_if(segments.begin(), segments.end(), [id](sstring filename) {
commitlog::descriptor desc(filename);
return desc.id == id;
});
if (i == segments.end()) {
throw std::runtime_error("Did not find expected log file");
}
return db::commitlog::read_log_file(*i, [count2](temporary_buffer<char> buf, db::replay_position rp) {
sstring str(buf.get(), buf.size());
BOOST_CHECK_EQUAL(str, "hej bubba cow");
(*count2)++;
return make_ready_future<>();
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {
return s->done();
});
});
}).then([log, set, count2] {
// TODO, meh, hard coded name...
auto findme = sstring("CommitLog-1-") + std::to_string(*set->begin()) + ".log";
return list_files(log->first.path).then([log, findme, count2](auto l) {
for (auto & de : l->contents()) {
if (de.name == findme) {
auto path = log->first.path + "/" + de.name;
return db::commitlog::read_log_file(path, [count2](temporary_buffer<char> buf, db::replay_position rp) {
sstring str(buf.get(), buf.size());
BOOST_CHECK_EQUAL(str, "hej bubba cow");
(*count2)++;
return make_ready_future<>();
}).then([log](auto s) {
auto ss = make_lw_shared(std::move(s));
return ss->done().then([ss] {});
});
}
}
throw std::runtime_error("Did not find expected log file");
});
}).then([count, count2] {
BOOST_CHECK_EQUAL(*count, *count2);
}).finally([log]() {
return log->second.clear().then([log] {});
});
});
}
// Overwrites the 32-bit word at offset "off" of segment file "seg" with
// "value", simulating on-disk corruption of a commitlog segment.
static future<> corrupt_segment(sstring seg, uint64_t off, uint32_t value) {
return engine().open_file_dma(seg, open_flags::rw).then([off, value](file f) {
// read a DMA-aligned prefix large enough to cover the word being patched
size_t size = align_up<size_t>(off + sizeof(uint32_t), 4096);
return do_with(std::move(f), [size, off, value](file& f) {
return f.dma_read_exactly<char>(0, size).then([&f, off, value](auto buf) {
*reinterpret_cast<uint32_t *>(buf.get_write() + off) = value;
auto dst = buf.get();
auto size = buf.size();
return f.dma_write(0, dst, size).then([buf = std::move(buf)](size_t) {});
});
});
});
}
SEASTAR_TEST_CASE(test_commitlog_entry_corruption){
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return cl_test(cfg, [](commitlog& log) {
auto count = make_lw_shared<size_t>(0);
auto rps = make_lw_shared<std::vector<db::replay_position>>();
return do_until([count]() {return *count > 1;},
[&log, count, rps]() {
auto uuid = utils::UUID_gen::get_time_UUID();
sstring tmp = "hej bubba cow";
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([&log, rps, count](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
rps->push_back(rp);
++(*count);
});
}).then([&log, rps]() {
return log.sync_all_segments();
}).then([&log, rps] {
auto segments = log.get_active_segment_names();
BOOST_REQUIRE(!segments.empty());
auto seg = segments[0];
return corrupt_segment(seg, rps->at(1).pos + 4, 0x451234ab).then([seg, rps, &log] {
return db::commitlog::read_log_file(seg, [rps](temporary_buffer<char> buf, db::replay_position rp) {
BOOST_CHECK_EQUAL(rp, rps->at(0));
return make_ready_future<>();
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {
return s->done();
});
}).then_wrapped([](auto&& f) {
try {
f.get();
BOOST_FAIL("Expected exception");
} catch (commitlog::segment_data_corruption_error& e) {
// ok.
BOOST_REQUIRE(e.bytes() > 0);
}
});
});
});
});
}
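The then_wrapped() step is how the test asserts a controlled failure: the continuation receives the ready future itself, get() rethrows whatever exception it carries, and only the planted segment_data_corruption_error is swallowed. A minimal generic sketch of that pattern (expect_failure and E are illustrative names, not part of the tree):

#include "core/future.hh"
#include <stdexcept>

// Resolves successfully only if 'f' fails with exception type E;
// success, or any other exception, fails the returned future.
template <typename E>
future<> expect_failure(future<> f) {
    return f.then_wrapped([](future<> f) {
        try {
            f.get();   // rethrows if f carried an exception
            throw std::runtime_error("expected failure, but future resolved");
        } catch (const E&) {
            // the expected error: swallow it and report success
        }
    });
}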
SEASTAR_TEST_CASE(test_commitlog_chunk_corruption){
commitlog::config cfg;
cfg.commitlog_segment_size_in_mb = 1;
return cl_test(cfg, [](commitlog& log) {
auto count = make_lw_shared<size_t>(0);
auto rps = make_lw_shared<std::vector<db::replay_position>>();
return do_until([count]() {return *count > 1;},
[&log, count, rps]() {
auto uuid = utils::UUID_gen::get_time_UUID();
sstring tmp = "hej bubba cow";
return log.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
dst.write(tmp.begin(), tmp.end());
}).then([&log, rps, count](replay_position rp) {
BOOST_CHECK_NE(rp, db::replay_position());
rps->push_back(rp);
++(*count);
});
}).then([&log, rps]() {
return log.sync_all_segments();
}).then([&log, rps] {
auto segments = log.get_active_segment_names();
BOOST_REQUIRE(!segments.empty());
auto seg = segments[0];
return corrupt_segment(seg, rps->at(0).pos - 4, 0x451234ab).then([seg, rps, &log] {
return db::commitlog::read_log_file(seg, [rps](temporary_buffer<char> buf, db::replay_position rp) {
BOOST_FAIL("Should not reach");
return make_ready_future<>();
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {
return s->done();
});
}).then_wrapped([](auto&& f) {
try {
f.get();
BOOST_FAIL("Expected exception");
} catch (commitlog::segment_data_corruption_error& e) {
// ok.
BOOST_REQUIRE(e.bytes() > 0);
}
});
});
});
});
}
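Both corruption tests drive their writes with do_until, seastar's asynchronous loop: the body returns a future and is re-invoked until the stop predicate turns true. A hedged standalone sketch (write_records and its counter are illustrative):

#include "core/future-util.hh"
#include "core/shared_ptr.hh"

// Issue n asynchronous writes one at a time; the lw_shared counter
// outlives each iteration, matching the tests' usage above.
static future<> write_records(int n) {
    auto count = make_lw_shared<int>(0);
    return do_until([count, n] { return *count >= n; },  // stop condition
                    [count] {                            // async body
        ++(*count);
        return make_ready_future<>();   // a real body would add a mutation here
    });
}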
@@ -412,9 +432,9 @@ SEASTAR_TEST_CASE(test_commitlog_counters) {
});
};
BOOST_CHECK_EQUAL(count_cl_counters(), 0);
return make_commitlog().then([&](tmplog_ptr log) {
return cl_test([count_cl_counters](commitlog& log) {
BOOST_CHECK_GT(count_cl_counters(), 0);
}).finally([&]() {
}).finally([count_cl_counters] {
BOOST_CHECK_EQUAL(count_cl_counters(), 0);
});
}
@@ -422,9 +442,8 @@ SEASTAR_TEST_CASE(test_commitlog_counters) {
#ifndef DEFAULT_ALLOCATOR
SEASTAR_TEST_CASE(test_allocation_failure){
commitlog::config cfg;
return make_commitlog(cfg).then([](tmplog_ptr log) {
auto size = log->second.max_record_size() - 1;
return cl_test([](commitlog& log) {
auto size = log.max_record_size() - 1;
auto junk = make_lw_shared<std::list<std::unique_ptr<char[]>>>();
@@ -435,11 +454,9 @@ SEASTAR_TEST_CASE(test_allocation_failure){
}
} catch (std::bad_alloc&) {
}
return log->second.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, [size](db::commitlog::output& dst) {
dst.write(char(1), size);
}).then([](db::replay_position rp) {
// should not reach.
}).then_wrapped([junk](future<> f) {
}).then_wrapped([junk](future<db::replay_position> f) {
try {
f.get();
} catch (std::bad_alloc&) {
@@ -447,11 +464,8 @@ SEASTAR_TEST_CASE(test_allocation_failure){
junk->clear();
return make_ready_future();
} catch (...) {
throw std::runtime_error("Did not get expected exception from writing too large record");
}
return make_ready_future<>();
}).finally([log]() {
return log->second.clear().then([log] {});
throw std::runtime_error("Did not get expected exception from writing too large record");
});
});
}
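test_allocation_failure relies on seastar's allocator throwing std::bad_alloc promptly once memory runs out, which is why it is compiled out under DEFAULT_ALLOCATOR (the libc allocator may overcommit instead of throwing). A hedged sketch of the exhaust-then-release trick the test uses:

#include <list>
#include <memory>
#include <new>

// Grab fixed-size chunks until allocation fails, keeping them alive in a
// list; dropping the list releases the ballast in one go. Only sensible
// as a test trick under an allocator that reliably throws.
static std::list<std::unique_ptr<char[]>> exhaust_memory(size_t chunk_size) {
    std::list<std::unique_ptr<char[]>> junk;
    try {
        for (;;) {
            junk.emplace_back(new char[chunk_size]);
        }
    } catch (const std::bad_alloc&) {
        // allocator is now exhausted; return what we grabbed
    }
    return junk;
}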


@@ -283,6 +283,7 @@ public:
}
return seastar::async([this] {
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
locator::i_endpoint_snitch::create_snitch("SimpleSnitch").get();
auto db = ::make_shared<distributed<database>>();
init_once(db).get();


@@ -43,6 +43,7 @@ future<> one_test(const std::string& property_fname, bool exp_result) {
fname /= path(property_fname);
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
return i_endpoint_snitch::create_snitch<const sstring&>(
"EC2Snitch",


@@ -65,6 +65,7 @@ int main(int ac, char ** av) {
logging::logger_registry().set_logger_level("gossip", logging::log_level::trace);
const gms::inet_address listen = gms::inet_address(config["listen-address"].as<std::string>());
utils::fb_utilities::set_broadcast_address(listen);
utils::fb_utilities::set_broadcast_rpc_address(listen);
auto vv = std::make_shared<gms::versioned_value::factory>();
locator::i_endpoint_snitch::create_snitch("SimpleSnitch").then([&db] {
return service::init_storage_service(db);
@@ -96,7 +97,7 @@ int main(int ac, char ** av) {
using namespace std::chrono;
auto now = high_resolution_clock::now().time_since_epoch();
int generation_number = duration_cast<seconds>(now).count();
return gossiper.start(generation_number, app_states);
return gossiper.start_gossiping(generation_number, app_states);
}).then([vv] {
return seastar::async([vv] {
static double load = 0.5;


@@ -43,6 +43,7 @@ future<> one_test(const std::string& property_fname, bool exp_result) {
fname /= path(property_fname);
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
return i_endpoint_snitch::create_snitch<const sstring&>(
"org.apache.cassandra.locator.GossipingPropertyFileSnitch",


@@ -41,3 +41,27 @@ static inline
mutation_assertion assert_that(const mutation& m) {
return { m };
}
class mutation_opt_assertions {
mutation_opt _mo;
public:
mutation_opt_assertions(mutation_opt mo) : _mo(std::move(mo)) {}
mutation_assertion has_mutation() {
if (!_mo) {
BOOST_FAIL("Expected engaged mutation_opt, but found not");
}
return { *_mo };
}
void has_no_mutation() {
if (_mo) {
BOOST_FAIL("Expected disengaged mutation_opt");
}
}
};
static inline
mutation_opt_assertions assert_that(mutation_opt mo) {
return { std::move(mo) };
}
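The new overload makes assertions on optional read results read fluently; the row_cache race test later in this diff uses it exactly this way. A usage fragment ('reader' and 'expected' are placeholders for a mutation_reader and a mutation):

// Engaged optional: unwrap and compare against an expected mutation.
assert_that(reader().get0()).has_mutation().is_equal_to(expected);
// Disengaged optional: the read must have produced nothing.
assert_that(reader().get0()).has_no_mutation();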


@@ -157,6 +157,7 @@ void full_ring_check(const std::vector<ring_point>& ring_points,
future<> simple_test() {
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
// Create the RackInferringSnitch
return i_endpoint_snitch::create_snitch("RackInferringSnitch").then(
@@ -230,6 +231,7 @@ future<> simple_test() {
future<> heavy_origin_test() {
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
// Create the RackInferringSnitch
return i_endpoint_snitch::create_snitch("RackInferringSnitch").then(


@@ -22,6 +22,7 @@
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <seastar/core/sleep.hh>
#include "tests/test-utils.hh"
#include "tests/mutation_assertions.hh"
@@ -33,6 +34,8 @@
#include "core/thread.hh"
#include "memtable.hh"
using namespace std::chrono_literals;
static schema_ptr make_schema() {
return schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
@@ -49,11 +52,6 @@ mutation make_new_mutation(schema_ptr s, partition_key key) {
return m;
}
static
mutation make_key_mutation(schema_ptr s, bytes key) {
return make_new_mutation(s, partition_key::from_single_value(*s, key));
}
static
partition_key new_key(schema_ptr s) {
static thread_local int next = 0;
@@ -116,20 +114,22 @@ struct decorated_key_order {
}
};
static std::vector<mutation> make_ring(schema_ptr s, int n_mutations) {
std::vector<mutation> mutations;
for (int i = 0; i < n_mutations; ++i) {
mutations.push_back(make_new_mutation(s));
}
std::sort(mutations.begin(), mutations.end(), mutation_decorated_key_less_comparator());
return mutations;
}
SEASTAR_TEST_CASE(test_query_of_incomplete_range_goes_to_underlying) {
return seastar::async([] {
auto s = make_schema();
std::vector<mutation> mutations = {
make_key_mutation(s, "key1"),
make_key_mutation(s, "key2"),
make_key_mutation(s, "key3")
};
std::sort(mutations.begin(), mutations.end(), mutation_decorated_key_less_comparator());
std::vector<mutation> mutations = make_ring(s, 3);
auto mt = make_lw_shared<memtable>(s);
for (auto&& m : mutations) {
mt->apply(m);
}
@@ -173,16 +173,10 @@ SEASTAR_TEST_CASE(test_single_key_queries_after_population_in_reverse_order) {
return seastar::async([] {
auto s = make_schema();
std::vector<mutation> mutations = {
make_key_mutation(s, "key1"),
make_key_mutation(s, "key2"),
make_key_mutation(s, "key3")
};
std::sort(mutations.begin(), mutations.end(), mutation_decorated_key_less_comparator());
auto mt = make_lw_shared<memtable>(s);
std::vector<mutation> mutations = make_ring(s, 3);
for (auto&& m : mutations) {
mt->apply(m);
}
@@ -257,7 +251,8 @@ SEASTAR_TEST_CASE(test_eviction) {
}
bool has_key(row_cache& cache, const dht::decorated_key& key) {
auto reader = cache.make_reader(query::partition_range::make_singular(key));
auto range = query::partition_range::make_singular(key);
auto reader = cache.make_reader(range);
auto mo = reader().get0();
return bool(mo);
}
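The changes to has_key() and verify_has() look cosmetic but fix a lifetime hazard: make_reader() evidently keeps a reference to the partition range, so passing a temporary leaves the reader dangling once the full expression ends. A sketch of the hazard, assuming that referencing behavior (which is what this change implies):

// Unsafe: the temporary range dies at the ';', but the reader may still
// reference it on later calls.
// auto reader = cache.make_reader(query::partition_range::make_singular(key));

// Safe: a named local that outlives every use of the reader.
auto range = query::partition_range::make_singular(key);
auto reader = cache.make_reader(range);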
@@ -271,7 +266,8 @@ void verify_does_not_have(row_cache& cache, const dht::decorated_key& key) {
}
void verify_has(row_cache& cache, const mutation& m) {
auto reader = cache.make_reader(query::partition_range::make_singular(m.decorated_key()));
auto range = query::partition_range::make_singular(m.decorated_key());
auto reader = cache.make_reader(range);
auto mo = reader().get0();
BOOST_REQUIRE(bool(mo));
assert_that(*mo).is_equal_to(m);
@@ -359,3 +355,182 @@ SEASTAR_TEST_CASE(test_update) {
}
});
}
class throttle {
unsigned _block_counter = 0;
promise<> _p; // valid when _block_counter != 0, resolves when goes down to 0
public:
future<> enter() {
if (_block_counter) {
promise<> p1;
promise<> p2;
auto f1 = p1.get_future();
p2.get_future().then([p1 = std::move(p1), p3 = std::move(_p)] () mutable {
p1.set_value();
p3.set_value();
});
_p = std::move(p2);
return f1;
} else {
return make_ready_future<>();
}
}
void block() {
++_block_counter;
_p = promise<>();
}
void unblock() {
assert(_block_counter);
if (--_block_counter == 0) {
_p.set_value();
}
}
};
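enter() is the subtle part of throttle: while blocked, each caller chains a fresh promise pair onto _p, so a single unblock() resolves the newest promise, whose continuation resolves both that waiter and the previously chained promise, cascading until every parked caller is released. A hedged usage sketch (demo_throttle is illustrative):

// Parked waiters resume only after unblock() drains the promise chain.
static future<> demo_throttle() {
    auto t = make_lw_shared<throttle>();
    t->block();                        // callers of enter() will now park
    auto waiter = t->enter().then([] {
        // runs only after unblock()
    });
    t->unblock();                      // cascades through the chained promises
    return waiter.finally([t] {});     // keep the throttle alive until done
}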
class throttled_mutation_source {
private:
class impl : public enable_lw_shared_from_this<impl> {
mutation_source _underlying;
::throttle _throttle;
private:
class reader : public mutation_reader::impl {
throttle& _throttle;
mutation_reader _reader;
public:
reader(throttle& t, mutation_reader r)
: _throttle(t)
, _reader(std::move(r))
{}
virtual future<mutation_opt> operator()() override {
return _reader().finally([this] () {
return _throttle.enter();
});
}
};
public:
impl(mutation_source underlying)
: _underlying(std::move(underlying))
{ }
mutation_reader make_reader(const query::partition_range& pr) {
return make_mutation_reader<reader>(_throttle, _underlying(pr));
}
::throttle& throttle() { return _throttle; }
};
lw_shared_ptr<impl> _impl;
public:
throttled_mutation_source(mutation_source underlying)
: _impl(make_lw_shared<impl>(std::move(underlying)))
{ }
void block() {
_impl->throttle().block();
}
void unblock() {
_impl->throttle().unblock();
}
mutation_reader operator()(const query::partition_range& pr) {
return _impl->make_reader(pr);
}
};
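throttled_mutation_source wraps a real source so a test can freeze reads mid-flight; the race test below blocks it, starts cache reads, flushes a second memtable, then unblocks. A condensed usage fragment ('underlying' and 'key' stand in for a real mutation_source and decorated key):

throttled_mutation_source src(underlying);
src.block();
auto range = query::partition_range::make_singular(key);  // named local, see above
auto rd = src(range);
auto pending = rd();   // the future stays unresolved while src is blocked
src.unblock();         // now 'pending' can resolve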
static std::vector<mutation> updated_ring(std::vector<mutation>& mutations) {
std::vector<mutation> result;
for (auto&& m : mutations) {
result.push_back(make_new_mutation(m.schema(), m.key()));
}
return result;
}
SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
return seastar::async([] {
auto s = make_schema();
std::vector<lw_shared_ptr<memtable>> memtables;
auto memtables_data_source = [&] (const query::partition_range& pr) {
std::vector<mutation_reader> readers;
for (auto&& mt : memtables) {
readers.emplace_back(mt->make_reader(pr));
}
return make_combined_reader(std::move(readers));
};
auto memtables_key_source = [&] (const query::partition_range& pr) {
std::vector<key_reader> readers;
for (auto&& mt : memtables) {
readers.emplace_back(mt->as_key_source()(pr));
}
return make_combined_reader(s, std::move(readers));
};
throttled_mutation_source cache_source(memtables_data_source);
cache_tracker tracker;
row_cache cache(s, cache_source, memtables_key_source, tracker);
auto mt1 = make_lw_shared<memtable>(s);
memtables.push_back(mt1);
auto ring = make_ring(s, 3);
for (auto&& m : ring) {
mt1->apply(m);
}
auto mt2 = make_lw_shared<memtable>(s);
auto ring2 = updated_ring(ring);
for (auto&& m : ring2) {
mt2->apply(m);
}
cache_source.block();
auto m0_range = query::partition_range::make_singular(ring[0].ring_position());
auto rd1 = cache.make_reader(m0_range);
auto rd1_result = rd1();
auto rd2 = cache.make_reader();
auto rd2_result = rd2();
sleep(10ms).get();
auto mt2_flushed = make_lw_shared<memtable>(s);
mt2_flushed->apply(*mt2).get();
memtables.push_back(mt2_flushed);
// This update should miss on all partitions
auto update_future = cache.update(*mt2, make_default_partition_presence_checker());
auto rd3 = cache.make_reader();
// rd2, which is in progress, should not prevent forward progress of update()
cache_source.unblock();
update_future.get();
// Reads started before the memtable flush should return the previous value;
// otherwise this test doesn't trigger the conditions it is supposed to protect against.
assert_that(rd1_result.get0()).has_mutation().is_equal_to(ring[0]);
assert_that(rd2_result.get0()).has_mutation().is_equal_to(ring[0]);
assert_that(rd2().get0()).has_mutation().is_equal_to(ring2[1]);
assert_that(rd2().get0()).has_mutation().is_equal_to(ring2[2]);
assert_that(rd2().get0()).has_no_mutation();
// Reads started after update was started but before previous populations completed
// should already see the new data
assert_that(std::move(rd3))
.produces(ring2[0])
.produces(ring2[1])
.produces(ring2[2])
.produces_end_of_stream();
// Reads started after flush should see new data
assert_that(cache.make_reader())
.produces(ring2[0])
.produces(ring2[1])
.produces(ring2[2])
.produces_end_of_stream();
});
}


@@ -38,6 +38,7 @@ future<> one_test(const std::string& property_fname1,
using namespace boost::filesystem;
utils::fb_utilities::set_broadcast_address(gms::inet_address("localhost"));
utils::fb_utilities::set_broadcast_rpc_address(gms::inet_address("localhost"));
printf("Testing %s and %s property files. Expected result is %s\n",
property_fname1.c_str(), property_fname2.c_str(),

Some files were not shown because too many files have changed in this diff.