Compare commits

...

151 Commits

Author SHA1 Message Date
Pekka Enberg
737a9019a4 dist/docker: Add missing "hostname" package
The Fedora base image has changed so we need to add "hostname" that's
used by the Docker-specific launch script to our image.

Fixes Scylla startup.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-15 13:44:38 +03:00
Takuya ASADA
eb1924a4e4 dist: fix file not found error on centos_dep/build_dependency.sh
We don't have boost.diff, and we don't need it, so return to using rpmbuild --rebuild.

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-14 14:12:46 +03:00
Pekka Enberg
2ed34b0e96 Merge seastar upstream
* seastar 1995676...78e3924 (5):
  > fix output stream batching
  > rpc: server connection shutdown fix
  > doc: add Seastar tutorial
  > resource: increase default reserve memory
  > http client: moved http_response_parser.rl from apps/seawreck into http directory

Adjust transport/server.cc for the demise of output_stream::batch_flush()
2015-10-12 16:12:35 +03:00
Glauber Costa
12ac9a1fbd do not calculate truncation time independently
Currently, we are calculating truncated_at during truncate() independently for
each shard. It will work if we're lucky, but it is fairly easy to trigger cases
in which each shard will end up with a slightly different time.

The main problem here is that this time is used as the snapshot name when auto
snapshots are enabled. Prior to my last fixes, this would just generate two
separate directories in this case, which is wrong but not severe.

But after the fix, this means that both shards will wait for one another to
synchronize and this will hang the database.

Fix this by making sure that the truncation time is calculated before
invoke_on_all in all needed places.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-09 17:39:47 +03:00
Glauber Costa
4460f243a3 snapshots: fix json type
We are generating a general object ({}), whereas Cassandra 2.1.x generates an
array ([]). Let's do that as well to avoid surprising parsers.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 19:06:38 +03:00
Glauber Costa
55a5877d82 snapshots: handle jsondir creation for empty files case
We still need to write a manifest when there are no files in the snapshot.
But because we never reach the touch_directory part of the sstables loop
in that case, nobody would have created jsondir.

Since all the file handling is now done in the seal_snapshot phase, we should
just make sure the directory exists before initiating any other disk activity.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 19:06:33 +03:00
Glauber Costa
b03a474ca6 snapshots: get rid of empty tables optimization
We currently have one optimization that returns early when there are no tables
to be snapshotted.

However, because of the way we are writing the manifest now, this will cause
the shard that happens to have tables to be waiting forever. So we should get
rid of it. All shards need to pass through the synchronization point.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 19:06:28 +03:00
Glauber Costa
9ec7b9a213 snapshots: don't hash pending snapshots by snapshot name
If we are hashing more than one CF, the snapshots themselves will all have the
same name. This will cause the files from one of them to spill into the other
when writing the manifest.

The proper hash is the jsondir: that one is unique per manifest file.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 19:06:22 +03:00
Pekka Enberg
fc4e167ffd release: prepare for 0.10 2015-10-08 14:44:36 +03:00
Pekka Enberg
c7c6ebb813 Merge "Switch to gcc-5 on CentOS rpm, with some related fixes" from Takuya 2015-10-08 14:43:29 +03:00
Pekka Enberg
95012793e5 db/schema_tables: Wire up drop keyspace notifications
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-08 13:10:48 +02:00
Pekka Enberg
87d45cc58a service/migration_manager: Simplify notify_drop_keyspace()
There's no need to pass keyspace_metadata to notify_drop_keyspace()
because all we are interested in is the name. The keyspace has been
dropped so there's not much we could do with its metadata either.

Simplifies the next patch that wires up drop keyspace notification.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-08 13:10:48 +02:00
Avi Kivity
e5dca96af3 Merge "snapshots: fix global generation of the manifest file" from Glauber
"snapshotting the files themselves is easy: if more than one CF happens to link
an SSTable twice, all but one will fail, and we will end up with one copy.

The problem for us is that the snapshot procedure is supposed to leave a
manifest file inside its directory.  So if we just call snapshot() from
multiple shards, only the last one will succeed, writing its own SSTables to
the manifest leaving all other shards' SSTables unaccounted for.

Moreover, for things like drop table, the operation should only proceed when
the snapshot is complete. That includes the manifest file being correctly
written, and for this reason we need to wait for all shards to finish their
snapshotting before we can move on."
2015-10-08 13:08:31 +03:00
Glauber Costa
725ae03772 snapshots: write the manifest file from a single shard
Currently, the snapshot code has all shards writing the manifest file. This is
wrong, because all writes before the last one are overwritten. This patch
fixes it by synchronizing all writes and leaving just one of the shards with
the task of closing the manifest.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 11:36:36 +02:00
Glauber Costa
25d24222fe snapshots: separate manifest creation
The way manifest creation is currently done is wrong: instead of a final
manifest containing all files from all shards, the current code writes a
manifest containing just the files from the shard that happens to be the
unlucky loser of the writing race.

In preparation to fix that, separate the manifest creation code from the rest.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 11:36:36 +02:00
Glauber Costa
abc63e4669 snapshots: clarify and fix sync behavior
We do need to sync jsondir after we write the manifest file (previously done,
but with a question), and before we start it (not previously done) to guarantee
that the manifest file won't reference any file that is not visible yet.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 11:36:36 +02:00
Glauber Costa
ca4babdb57 snapshots: close file after flush
We are currently flushing it, but not closing it.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-08 11:36:36 +02:00
Avi Kivity
bd7bf3ea84 Merge seastar upstream
* seastar 6664a83...1995676 (1):
  > introduce sync_directory
2015-10-08 12:29:17 +03:00
Takuya ASADA
3a77188d47 dist: move yum install first
Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-08 06:29:06 +09:00
Takuya ASADA
10dd1781be dist: Stop specify required libraries manually, use AutoReqProv
We don't need to specify dynamically linked libraries here; AutoReqProv detects them.

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-08 06:15:46 +09:00
Takuya ASADA
137fe19ea9 dist: support glob pattern on do_install()
Currently do_install() does not function correctly when passing a glob pattern while the packages are already installed.

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-08 06:15:46 +09:00
Takuya ASADA
9cb2776606 dist: switch CentOS gcc to 5.1.1-4
Since we don't want to require users to upgrade libstdc++, we will link libstdc++ statically, using ./configure.py --static-stdc++

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-08 06:15:46 +09:00
Takuya ASADA
0e13757d92 configure.py: add --static-stdc++ to link libstdc++ statically
Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-08 06:15:46 +09:00
Avi Kivity
bffcbc592f Merge seastar upstream
* seastar fba8ac6...6664a83 (2):
  > do not add failed stream to output stream poller.
  > rpc: wait for all data to be sent before closing
2015-10-07 18:33:04 +03:00
Calle Wilund
42c086a5cd batchlog_manager: Fixup includes + exception handling
* Fix exception handling in batch loop (report + still re-arm)
* Cleanup seastar include reference style
2015-10-07 17:06:34 +03:00
Avi Kivity
19f36cd3cc Merge "Batchlog manager - run loop on only one shard" from Calle
"* Runs the batchlog loop on only main cpu, but round-robins the actual work
   to each available shard in round-robin fashion.
 * Use gate to guard work loop instead of semaphore (better shutdown,
   eventually)
 * Actually _start_ the batch loop (not done previously)
 * Rename logger + add cpu# hint"

Fixes #424
2015-10-07 16:52:10 +03:00
Calle Wilund
a4c14d3d1d batchlog_manager: Add hint of which cpu timer callback is running on 2015-10-07 14:57:55 +02:00
Calle Wilund
6416c62d39 main: Actually start the batchlog_manager service loop
Was not invoked previously.
2015-10-07 14:30:09 +02:00
Calle Wilund
b46496da34 batchlog_manager: Rename logger
* More useful/referrable on command line (--log*)
* Matches class name (though not origin)
2015-10-07 14:30:09 +02:00
Calle Wilund
6f94a3bdad batchlog_manager: Use gate instead of semaphore
Since that exists now.
2015-10-07 14:30:09 +02:00
Calle Wilund
874da0eb67 batchlog_manager: Run timer loop on only one shard
Since replay is a "node global" operation, we should not attempt to
do it in parallel on each shard. It will just overlap/interfere.
We could just run this on cpu 0, but since this _could_ be a
lengthy operation, each timer callback is round-robined across shards just in case...
2015-10-07 14:30:09 +02:00
Avi Kivity
a151268bfe Merge 2015-10-07 14:35:02 +03:00
Calle Wilund
246e8e24f2 replay_position: Make <= comparator simpler and cleaner 2015-10-07 14:34:22 +03:00
Avi Kivity
eccbf85e9d Merge "Truncation records per shard"
Fixes  #423

"Changes the "truncated_at" blob contents of system.local table. It now stores
N replay_positions, where N == # shards.

The system.local table schema remains unchanged, and older truncation data
is accepted, though it will for obvious reasons still be insufficient.

Since the data is opaque to the running instance, blob compatibility with
origin should be irrelevant (and we're not really that now anyway).

Note that technically, changing the shard count in between runs could make
us hold on to RP data "longer than required", but this is
a.) Insignificant data sizes
b.) Data that is valid exactly once: When restarting a failed node and
    replaying. The "shards" only refer to "last run", and after that we don't
    care. At worst, we can get less than fresh data (not all shards manage
    to save truncation records before crash).

It is worth noting (and I've done so in the code) that the system.local table
+ sharding cause some rather silly inefficiencies, since for this (and others)
we store a value for each shard, each save of which causes a global flush of
the systable, in turn delegated to all cores. So the op is N^2 in "db complexity".
At some point we should maybe consider if operations like "drop table" and
"truncate" should not be done on shard level, but on machine level, so it can
coordinate itself. But otoh, it is rare and not _very_ expensive either."
2015-10-07 14:33:22 +03:00
Avi Kivity
c48a826c65 db: fix string type incorrectly unvalidated
We call the conversion function that expects a NUL terminated string,
but provide a string view, which is not NUL terminated.

Fix by using the begin/end variant, which doesn't require a NUL terminator.

Fixes #437.
2015-10-07 12:22:01 +02:00
Calle Wilund
a66c22f1ec commitlog_replayer: Acquire truncation RP:s per replayed shard
I.e. get them in bulk and fill in for all shards
2015-10-07 09:00:22 +02:00
Calle Wilund
17bd18b59c commitlog_replayer: Add logging message for exceptions in multi-file recover 2015-10-07 08:59:54 +02:00
Calle Wilund
3f1fa77979 commitlog_replayer: Fix broken comparison
A commitlog entry should be ignored if its position is <= highest recorded
position, not <.
2015-10-07 08:59:53 +02:00
Calle Wilund
271eb3ba02 replay_position: Add <= comparator 2015-10-07 08:59:53 +02:00
Calle Wilund
6b0ab79ecb system_keyspace: Keep per-shard truncation records
Fixes  #423
* CF ID now maps to a truncation record comprised of a set of 
  per-shard RP:s and a high-mark timestamp
* Retrieving RP:s is done in "bulk"
* Truncation time is calculated as max of all shards.

This version of the patch will accept "old" truncation data, though the 
result of applying it will most likely not be correct (just one shard)

Record is still kept as a blob, "new" format is indicated by 
record size.
2015-10-07 08:59:52 +02:00
Calle Wilund
199b72c6f3 commitlog: fix broken reader "offset" handling + ensure exceptions propagate
Must ensure we still find a chunk/entry boundary even when run
with a start offset, since file navigation is chunk based.
This was not observed as broken previously because
1.) We did not run with offsets
2.) The exception never reached the caller.

Also make the reader silently ignore empty files.
2015-10-07 08:54:49 +02:00
Calle Wilund
024041c752 commitlog: make log message slightly more informative/correct 2015-10-07 08:54:49 +02:00
Calle Wilund
f7151cac61 cql3::untyped_result_set: Allow "get_map" to be explicit about result type

Allow providing both hash/equal etc for resulting map, as well
as explicit data_types for the deserialization.
Also allow direct extraction of kv-pairs to iterator, for more advanced
unpacking.
2015-10-07 08:54:49 +02:00
Avi Kivity
29106ab802 Merge seastar upstream
* seastar 4a3071e...fba8ac6 (3):
  > stream.hh: Fix broken "set_exception".
  > configure.py: fix use of "echo -e"
  > deannoyify touch_directory
2015-10-07 09:44:58 +03:00
Gleb Natapov
358d93112f replace ad-hoc cql connection polling with new batch_flush() output stream API 2015-10-06 19:22:23 +03:00
Pekka Enberg
b40999b504 database: Fix drop_column_family() UUID lookup race
Remove the about to be dropped CF from the UUID lookup table before
truncating and stopping it. This closes a race window where new
operations based on the UUID might be initiated after truncate
completes.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 17:10:17 +02:00
Pekka Enberg
5878f62b18 db/schema_tables: Clean up indentation
Almost the whole file is (accidentally) indented four spaces to the
right for no reason. Fix that up because it's annoying as hell.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 17:09:27 +02:00
Pekka Enberg
1f9e769dd3 db/schema_tables: Remove obsolete ifdef'd code
Remove ifdef'd code that we won't be converting to C++ because of design
differences.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 17:09:27 +02:00
Avi Kivity
75dd123d01 Merge "CQL DROP KEYSPACE support" from Pekka
"This patch series implements support for CQL DROP KEYSPACE and makes the
test_keyspace CQL test in dtest pass:

  [penberg@nero urchin-dtest]$ nosetests -v cql_tests.py:TestCQL.keyspace_test
  keyspace_test (cql_tests.TestCQL) ... ok

  ----------------------------------------------------------------------
  Ran 1 test in 12.166s

  OK

  [penberg@nero urchin-dtest]$ nosetests -v cql_tests.py:TestCQL.table_test
  table_test (cql_tests.TestCQL) ... ok

  ----------------------------------------------------------------------
  Ran 1 test in 23.841s

  OK"
2015-10-06 15:19:33 +03:00
Pekka Enberg
da7b741f64 service/migration_manager: Implement announce_keyspace_drop()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 14:53:35 +03:00
Pekka Enberg
6e304cd58c db/schema_tables: Fix merge_keyspaces() to actually drop keyspaces
When we query schema keyspaces after we have applied a delete mutation,
the dropped keyspace does not exist in the "after" result set. Fix the
merge_keyspaces() algorithm to take that into account.

Makes merge_keyspaces() really call to database::drop_keyspace() when a
keyspace is dropped.

Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 14:53:35 +03:00
Pekka Enberg
5d9d1e28cb db/schema_tables: Implement make_drop_keyspace_mutations()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 14:53:35 +03:00
Pekka Enberg
9576b0ef23 database: Implement drop_keyspace()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 14:53:35 +03:00
Pekka Enberg
b66154e43a cql3: Fix capture-by-reference in drop_keyspace_statement
We need to capture the "is_local_only" boolean by value because it's an
argument to the function. Fixes an annoying bug where we failed to update
the schema version because we accidentally passed "true".

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 14:53:35 +03:00
Tomasz Grabiec
bc1d159c1b Merge branch 'penberg/cql-drop-table/v3' from seastar-dev.git
From Pekka:

This patch series implements support for CQL DROP TABLE. It uses the newly
added truncate infrastructure under the hood. After this series, the
test_table CQL test in dtest passes:

  [penberg@nero urchin-dtest]$ nosetests -v cql_tests.py:TestCQL.table_test
  table_test (cql_tests.TestCQL) ... ok

  ----------------------------------------------------------------------
  Ran 1 test in 23.841s

  OK
2015-10-06 13:39:25 +02:00
Shlomi Livne
f347a024a1 update boost testsuite output
We are generating huge output xml files with the --jenkins flag. Update
the printout from all to test_suite, to reduce size and include the
info we need.

Error messages / failed assertions are still printed

Signed-off-by: Shlomi Livne <shlomi@cloudius-systems.com>
2015-10-06 14:27:19 +03:00
Pekka Enberg
042e9252d5 service/migration_manager: Implement announce_column_family_drop()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
633279415d db/schema_tables: Fix merge_tables() to actually drop tables
When we query schema tables after we have applied a delete mutation, the
dropped table does not exist in the "after" result set. Fix the
merge_tables() algorithm to take that into account.

Makes merge_tables() really call to database::drop_column_family() when
a table is dropped.

Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
82d20dba65 db/schema_tables: Implement make_drop_table_mutations()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
b89b70daa8 db/schema_tables: Wire up drop column notifications
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
b1e6ab144a database: Implement drop_column_family()
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
afbb2f865d database: Add keyspace_metadata::remove_column_family() helper
Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
0651ab6901 database: Futurize drop_column_family() function
Futurize drop_column_family() so that we can call truncate() from it.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 11:28:55 +03:00
Pekka Enberg
85ffaa5330 database: Add truncate() variant that does not look up CF by name
For drop_column_family(), we want to first remove the column_family from
lookup tables and truncate after that to avoid races. Introduce a
truncate() variant that takes keyspace and column_family references.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 11:28:54 +03:00
Pekka Enberg
baff913d91 cql3: Fix capture-by-reference in drop_table_statement
We need to capture the "is_local_only" boolean by value because it's an
argument to the function. Fixes an annoying bug where we failed to update
the schema version because we accidentally passed "true". Spotted by ASan.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-06 11:28:54 +03:00
Avi Kivity
2f56f72466 Merge seastar upstream
* seastar 0c402e1...4a3071e (3):
  > output stream flush batching
  > Update README with compilation issues - OOM
  > resource: fix memory leak in resource::allocate()
2015-10-06 11:17:29 +03:00
Avi Kivity
e342914265 Merge "Fixes for incremental backup" from Glauber
"The control over backups is now moved to the CF itself, from the storage
service. That allows us to simplify the code (while making it correct) for cases
in which the storage service is not available.

With this change, we no longer need the database config passed down to the
storage_service object. So that patch is reverted."
2015-10-05 14:36:26 +03:00
Glauber Costa
651937becf Revert "pass db::config to storage service as well"
This reverts commit c2b981cd82.
2015-10-05 13:21:33 +02:00
Glauber Costa
639ba2b99d incremental backups: move control to the CF level
Currently, we control incremental backups behavior from the storage service.
This creates some very concrete problems, since the storage service is not
always available and initialized.

The solution is to move it to the column family (and to the keyspace so we can
properly propagate the conf file value). When we change this from the api, we will
have to iterate over all of them, changing the value accordingly.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-05 13:16:11 +02:00
Glauber Costa
b619d244e8 storage_service: public access to the database object
Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-05 13:15:27 +02:00
Glauber Costa
69d1358627 database: non const versions of get_keyspaces/column_families
We will need to change some properties of the keyspace / cf, so we need an
accessor that is not marked as const.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-05 13:13:37 +02:00
Pekka Enberg
b74a9d99d5 db/schema_tables: Fix UTF-8 serialization
Use the utf8_type to serialize strings instead of using to_bytes().

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
2015-10-05 09:26:15 +02:00
Avi Kivity
21bb5ea5c7 Add .gitattributes file to classify C++ source
With this, diffs become more pleasant to read, as access specifiers
no longer find their way into the hunk header.
2015-10-05 08:51:51 +02:00
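As a sketch, such a .gitattributes can map C++ extensions to git's built-in `cpp` diff driver, whose funcname patterns pick the enclosing function for the `@@` hunk header instead of the nearest `public:`/`private:` line (the actual file contents in the commit may differ):

```
*.cc diff=cpp
*.hh diff=cpp
```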
Avi Kivity
7c23ec49ae Merge "Support incremental backups" from Glauber
"Generate backups when the configuration file indicates we should;
toggle behavior on/off through the API."
2015-10-04 13:49:20 +03:00
Avi Kivity
4ca4efbc9c Merge "Add cfstats support" from Amnon
"This series adds the functionality that is required so the nodetool cfstats
would work.

It complete the histogram support for read and write latency and add stub for
functionality that is needed but is not supported yet."
2015-10-04 13:38:30 +03:00
Amnon Heiman
a04401d5a4 API: Column family to return sum of the total read and write
This adds the implementation that returns the estimated total latency
of the read and of the write.

First, the method that sums the counts was renamed to get_cf_stats_count
and a method named get_cf_stats_sum was added to sum the estimated
latencies.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Amnon Heiman
4145a48335 API: return estimated sum from histogram
The histograms that are used typically only sample the data, so to get
an estimate of the actual sum, we use the estimated mean multiplied by
the actual count.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Amnon Heiman
7d3a0f0789 histogram: initialization and mean calculation
This patch contains two changes to the histogram implementation. It uses
a simpler method to calculate the estimated mean (simply divide the
estimated sum by the number of samples), and, to make sure that there
will always be values in the histogram, it starts by taking a sample
(when there are no samples) and then uses the mask to decide whether to
sample or not.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Amnon Heiman
1f16765140 column family: setting the read and write latency histogram
This patch contains the following changes: in the definition of the read
and write latency histograms, it removes the mask value so that the
default value will be used.

To support gathering the read latency histogram, the query method
cannot be const, as it modifies the histogram statistics.

The read statistic is sample based and should have no real impact on
performance; if there is an impact, we can always change it in the
future to a lower sampling rate.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Amnon Heiman
8e9729371f API: Add functionality to column family to support nodetool cfstats
This adds the API definitions with stub implementations that make
nodetool cfstats run.

After this patch the nodetool cfstats command works, but with stub
implementations.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Amnon Heiman
2b59bb2d2b API: storage proxy definition cas read and write
This patch adds some missing definitions for CAS read and write; the API
definition is for completeness only, as we do not support CAS yet.

It also changes a part of the definition from storage_service to
storage_proxy, as it should be.

Signed-off-by: Amnon Heiman <amnon@cloudius-systems.com>
2015-10-04 11:52:19 +03:00
Glauber Costa
700b37635f api: incremental backups
GET and POST methods are implemented in the API.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-02 18:23:27 +02:00
Glauber Costa
d4edb82c9e column_family: incremental backups
Only tables that arise from flushes are backed up. Compacted tables are not.
Therefore, the place for that to happen is right after our flush.

Note that due to our sharded architecture, it is possible that in the face of a
value change some shards will backup sstables while others won't.

This is, in theory, possible to mitigate through a rwlock. However, this
doesn't differ from the situation where all tables are coming from a single
shard and the toggle happens in the middle of them.

The code as is guarantees that we'll never partially backup a single sstable,
so that is enough of a guarantee.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-02 18:23:27 +02:00
Glauber Costa
a5fb145084 storage_service: incremental backups
Query and set the state of incremental backups. The initial value comes from
the configuration file through the local db reference. Later on, it can be
changed through the interface.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-02 18:23:27 +02:00
Glauber Costa
c2b981cd82 pass db::config to storage service as well
We would like to access configuration, but don't want to poke other services
in order to do so.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-10-02 18:23:26 +02:00
Takuya ASADA
05db25bfc5 dist: fix yum install error on CentOS dependency rpms
We are not able to install the boost packages one by one, so install them all at once.

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-10-01 18:09:39 +03:00
Pekka Enberg
5e27d476d4 database: Improve exception error messages
When we convert exceptions into CQL server errors, type information is
not preserved. Therefore, improve exception error messages to make
debugging dtest failures, for example, slightly easier.

Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
2015-10-01 11:23:46 +03:00
Avi Kivity
01e01a2bd7 main: fix typo in 'check_direct_io_support()' 2015-09-30 20:16:07 +03:00
Glauber Costa
73a1fab273 sanity check the filesystem
For a lot of users, running Scylla on filesystems that do not support
O_DIRECT is quite frustrating: it will fail at some point, with random
error messages that aren't really meaningful.

We should try to check for that, and fail with a good error message. Also, since our
performance claims won't really hold in anything other than XFS, we should warn the user
if that is not the setup we encounter.

Fixes #409

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-30 17:58:27 +03:00
Avi Kivity
3c72977291 Merge seastar upstream
* seastar 432a771...0c402e1 (1):
  > function to check for direct_io capabilities
2015-09-30 17:58:05 +03:00
Gleb Natapov
2998b891f3 storage_proxy: fix crash during background read repair
Lazy digest calculation code introduced a bug in background read repair.
The problem is that digest_read_resolver::resolve() destroys one data
result (it is moved to a caller to be sent as a reply), so during
background digest match there is no value to calculate a digest from.
Copying the data to the caller would be the most elegant solution, but
also the slowest one, so let's just handle the case where only one
target is queried and skip the digest calculation there, since we know
digest_match() will do nothing.
2015-09-30 16:35:12 +03:00
Avi Kivity
c7be4911f3 Merge seastar upstream
* seastar 283901a...432a771 (2):
  > provide an api to query the filesystem type
  > modernize reactor::stop()
2015-09-30 16:34:30 +03:00
Avi Kivity
c84ed13dac Merge "CQL truncate" from Calle
"First iteration implementation of CQL truncate, transposed from
Origin.

Includes a workable impl. of snapshots, since that is sort of an integral
part of the origin code.

Note: This is still incomplete/incorrect in two ways:

1.) Since we have no way to ensure sstables are finished writing,
    the flush-snapshots are unreliable. Needs basically the same
    fix as correct commitlog management, namely flush queues and
    the ability to wait-force "active" flushes to finish before
    continuing.
2.) System table truncation record saving does not handle sharding.
    This means we basically save the "last" RP from any of the shards
    truncating, and consequently if we have a crash and do commitlog
    replay, we could resurrect truncated data.
    Fix is to have truncation records be per cf+shard just as RP:s
    are per shard.

However, since some people are waiting for at least a semi-functional
truncate, I'm submitting this without fixing the two above issues,
since they can be dealt with in subsequent patches."
2015-09-30 15:46:37 +03:00
Shlomi Livne
9e86b6273c dist: add dependency on xfsprogs
Signed-off-by: Shlomi Livne <shlomi@cloudius-systems.com>
2015-09-30 12:23:59 +03:00
Avi Kivity
9c5a36efd0 logalloc: fix segment free in debug mode
Must match allocation function.
2015-09-30 09:45:25 +02:00
Avi Kivity
489f737351 build: fix order of libasan on link command
gcc 5.1 requires libasan to be first, humor it.
2015-09-30 09:45:25 +02:00
Pekka Enberg
3cb60556e9 cql3: Implement truncate_statement::execute()
Implement the execute() function by using the underlying
truncate_blocking() API from storage proxy.
2015-09-30 09:09:43 +02:00
Pekka Enberg
455d382bac cql3: Implement check_access() and validate() for truncate_statement
Implement the check_access() and validate() functions as stubs to avoid
tripping over the unimplemented exception from cqlsh.
2015-09-30 09:09:43 +02:00
Pekka Enberg
f1fa2ec758 cql3: Move truncate statement implementation to source file
Clean up the truncate_statement class before we start modifying it.
Saves us from recompilation pain.
2015-09-30 09:09:43 +02:00
Calle Wilund
d0864be20f storage_proxy: Implement "truncate_blocking" 2015-09-30 09:09:43 +02:00
Calle Wilund
a8742cd199 to_string: Add << operator for std::set 2015-09-30 09:09:43 +02:00
Calle Wilund
80ade2e2d3 storage_proxy: Add TRUNCATE verb handler 2015-09-30 09:09:43 +02:00
Calle Wilund
37131fcc05 messaging_service: TRUNCATE verb methods 2015-09-30 09:09:42 +02:00
Calle Wilund
68b8d8f48c database: Implement "truncate" for column family
Including snapshotting.
2015-09-30 09:09:42 +02:00
Pekka Enberg
ac4007153d row_cache: Implement clear() helper
We need to clear the row cache for column family truncate operation.
2015-09-30 09:09:42 +02:00
Calle Wilund
7856d7fe02 config: Change "auto_snapshot" to "used" 2015-09-30 09:09:42 +02:00
Calle Wilund
56228fba24 column family: Add "snapshot" operation. 2015-09-30 09:09:42 +02:00
Calle Wilund
428557a66d sstables: add "create_links" method
Adds hard links in the requested directory to all components of the sstable.
Used for snapshotting.
2015-09-30 09:09:42 +02:00
Calle Wilund
cdaafb0505 sstables: Expose directory, max age and all active files 2015-09-30 09:09:42 +02:00
Calle Wilund
c141e15a4a column family: Add "run_with_compaction_disabled" helper
À la origin. Could just as well have been RAII.
2015-09-30 09:09:41 +02:00
Calle Wilund
b3c95ce42d system_keyspace: Change truncation record method to use context qp
Align with the rest of the file (for better or worse). This allows calls from
an entity without a query_processor handy (i.e. storage_proxy).

Added a "minimal" setup method for the "global" state, to facilitate
tests. Doing a full setup either in cql_test_env or after it is created
breaks badly (not sure why), so this is a quick workaround.

Updated the current two users (batchlog_manager and commitlog_replayer)
callsites to conform.
2015-09-30 09:09:41 +02:00
Calle Wilund
3abd8b38b6 query_context: Expose query_processor (local) 2015-09-30 09:09:41 +02:00
Calle Wilund
0444029a16 cql_test_env: expose distributed db and query processor 2015-09-30 09:09:41 +02:00
Calle Wilund
713860602b cql3/maps.cc : implement maps::marker::bind
Needed for system table (truncation pos)
2015-09-30 09:09:41 +02:00
Avi Kivity
4ae6c8c875 Merge seastar upstream
* seastar 5fe596a...283901a (10):
  > Add filesystem "link_file"
  > scripts: posix_net_conf.sh: take care of the case with more than 32 CPUs
  > posix: Add explanatory string to throw_system_error_on()
  > tests: fix memory leak in thread_test test_thread_1
  > tests: fix memory leak in timertest cancellation test
  > README: require xfsprogs-devel
  > file: query dma alignment from OS
  > file: separate disk and memory dma alignment
  > scripts: posix_net_conf.sh: Exclude CPU0 in RDS config for EN.
  > README: Add missing Ubuntu 14.04 dependencies to README.md
2015-09-29 19:06:37 +03:00
Avi Kivity
0ec0e32014 Merge "commitlog: preallocate segments" from Calle
"Modified version of the initial patch (which was reverted), further
reducing the possible delay states in CL allocation and segment management."
2015-09-29 17:02:54 +03:00
Glauber Costa
91408d3cbc warn users on 100 % CPU usage
Although it is technically a seastar problem, most complaints about it
come from the Scylla side. I prefer to keep the message here so we can reference
a Scylla issue.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-29 16:40:24 +03:00
Avi Kivity
c30feb714c Merge "gossip heart_beat_version + time to wait for seed" from Asias 2015-09-29 15:49:24 +03:00
Avi Kivity
d5d271a45b Merge "improvement on code that handles temporary TOC" from Raphael 2015-09-29 11:47:35 +03:00
Tomasz Grabiec
4863d16fb6 Merge tag 'bloom-memory' from git@github.com:glommer/scylla.git
From Glauber:

We will export the total memory used by the filter as its "off heap"
size for the purposes of statistics.
2015-09-29 10:05:17 +02:00
Glauber Costa
22294dd6a0 do not re-read sstable components after write
When we write an SSTable, all its components are already in memory. load() is
too big of a hammer.

We still want to keep the write operation separated from the preparation to
read, but in the case of a newly written SSTable, all we need to do is to open
the index and data file.

Fixes #300

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-29 10:00:26 +02:00
Calle Wilund
0ec50e8d36 create_index_statement: Bugfix. Inverted logic in full index check 2015-09-29 09:47:15 +02:00
Avi Kivity
c52d9f8da4 db: fix circular reference collection_type_impl <-> cql3_type
cql3_type is a simple wrapper around data_type.  But some data_types
(collection_type_impl) contain a cql3_type as a cache to avoid recomputing
it, resulting in a circular reference.  This leaks memory when as_cql3_type()
is called.

Fix by using a static hash table for the cache.
2015-09-29 08:38:15 +02:00
Raphael S. Carvalho
549a9e2ed4 sstable: rename file_existence to file_exists
Signed-off-by: Raphael S. Carvalho <raphaelsc@cloudius-systems.com>
2015-09-28 15:49:57 -03:00
Raphael S. Carvalho
59506eba24 sstable: close file returned by open_file_dma in file_existence
Signed-off-by: Raphael S. Carvalho <raphaelsc@cloudius-systems.com>
2015-09-28 15:49:47 -03:00
Raphael S. Carvalho
da316c982d sstable: fsync cf dir before removing temporary toc
That's important to guarantee that all other components were
deleted before deleting TemporaryTOC.

Signed-off-by: Raphael S. Carvalho <raphaelsc@cloudius-systems.com>
2015-09-28 15:49:24 -03:00
Glauber Costa
5dd0953bb9 api: implement filter off heap memory calculation
For us, everything is "off heap", so this will just be the total amount of
memory used by the filters.

Fixes #339

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-28 16:44:26 +02:00
Glauber Costa
8b3a6f19a1 sstables: export filter memory size
Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-28 16:43:20 +02:00
Glauber Costa
cbedd9ee41 Export Bloom Filter's memory size
Do it so we can estimate how much memory is being used by the filters. This
estimate is not 100% accurate: the implementation of the bloom_filter class
uses a thread-local variable that is common to all filters, which we won't
include in the estimate. Aside from that, it should be quite accurate.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
2015-09-28 16:43:06 +02:00
Pekka Enberg
f43f0d6f04 keys: Add compound_wrapper::from_singular()
Clean up the code by adding a from_singular() helper function to the compound
wrapper and using it.
2015-09-28 16:29:44 +02:00
Asias He
3a36ec33db gossip: Wait longer for seed node during boot up
When starting a cluster on AWS, the seed node might become ready after the
non-seed nodes are ready to contact it. Wait longer for the seed node to make
the boot-up process more robust.
2015-09-28 11:11:11 +08:00
Asias He
e43b2c2c89 api: Add get_current_heart_beat_version
curl -X GET "http://127.0.0.1:10000/gossiper/heart_beat_version/127.0.0.2"

This is useful to check if the gossip code is still running when
debugging.

Now we can get both the generation version and heart beat version of a
node.

curl -X GET "http://127.0.0.1:10000/gossiper/generation_number/127.0.0.2"
2015-09-28 09:38:33 +08:00
Asias He
817c138034 gossip: Add get_current_heart_beat_version interface
HTTP API will use it.
2015-09-28 09:38:22 +08:00
Gleb Natapov
f0c3caa43b Do not ignore exceptions during compaction
As the comment explains, if both the read and the write fail, the write
exception is ignored. To fix that, create one exception that contains both errors.
2015-09-27 14:16:35 +03:00
Gleb Natapov
d53be0a91e Move operator<< for std::exception_ptr to the std namespace and make it take const
If the operator is not in the std namespace, it cannot be found in non-global
contexts.
2015-09-27 14:16:35 +03:00
Takuya ASADA
b2630db514 dist: remove rpm dependency to libvirt
This was only for testing virtio mode; since we don't officially recommend using virtio mode, we should drop it.

Signed-off-by: Takuya ASADA <syuu@cloudius-systems.com>
2015-09-25 17:14:37 -07:00
Gleb Natapov
140641689b messaging: do not use rpc client in error state
Using an rpc client in an error state will result in message loss. Try to
reconnect instead.
2015-09-24 17:50:51 +02:00
Raphael S. Carvalho
ce855577b6 add compaction stats to collectd
With this change, we can see the number and length of compaction
activity per shard from collectd.

Signed-off-by: Raphael S. Carvalho <raphaelsc@cloudius-systems.com>
2015-09-24 16:51:11 +02:00
Asias He
e77cea382e rpm: Improve rpm build scripts
This makes it possible to build in a CentOS container.
2015-09-23 21:42:51 -07:00
Tomasz Grabiec
1b1cfd2cbf tests: Introduce tests/memory_footprint_test 2015-09-23 21:27:44 -07:00
Tomasz Grabiec
d033cdcefe db: Move "Populating Keyspace ..." message from WARN to INFO level
WARN level is for messages which should draw log reader's attention,
journalctl highlights them for example. Populating of keyspace is a
fairly normal thing, so it should be logged on lower level.
2015-09-23 15:28:44 +02:00
Avi Kivity
b3b6fc2f39 Merge branch 'branch-0.9' 2015-09-23 06:27:55 -07:00
Avi Kivity
36c3439fae Merge branch 'branch-0.9' 2015-09-22 05:33:32 -07:00
Calle Wilund
4941d91063 Commitlog: add some more verbosity 2015-09-22 12:57:33 +02:00
Avi Kivity
99e19a9f73 Merge branch 'branch-0.9' 2015-09-21 17:03:47 -07:00
Avi Kivity
37344c19e7 version: update for next cycle 2015-09-22 00:41:57 +03:00
Avi Kivity
eca0228f15 Merge branch 'branch-0.9' 2015-09-22 00:40:52 +03:00
Tomasz Grabiec
83dbea5b3a Merge branch 'branch-0.9'
tests: Fix row_cache_alloc_stress
    dist: remove conflicts with cassandra21 to allow side by side rpm installation
    dist: update ami base image id to one that supports enhanced networking
2015-09-21 23:06:35 +02:00
Tomasz Grabiec
a588c72ef2 Merge branch 'branch-0.9'
Changes:

    transport: fix poller removal
    dist: Add CentOS packaging
    row_cache: Use allocating_section in row_cache::populate()
2015-09-21 20:28:21 +02:00
Calle Wilund
a10745cf0e Commitlog: Delay timer by period/ncpus for each cpu
To avoid having all shards doing sync at the same time.
2015-09-21 13:30:35 +02:00
Calle Wilund
dcabf8c1d2 Commitlog: Pre-allocate "reserve" segments
Refs #356

Pre-allocates N segments from the timer task. N is "adaptive" in that it is
increased (to a max) every time segment acquisition is forced to allocate
a new one instead of picking from the pre-alloc (reserve) list. The idea is that it is
easier to adapt how many segments we consume per timer quantum than the timer
quantum itself.

Also does the disk pressure check and flush from the timer task now. Note that the
check is still only done at most once per new segment.

Also some logging cleanup to make behaviour easier to trace.

Reserve segments start out at zero length, and are still deleted when finished.
This is because otherwise we'd still have to clear the file to be able to
properly parse it later (given that it can be a "half" file due to power fail
etc). This might need revisiting as well.

With this patch, there should be no case (except flush starvation) where
"add_mutation" actually waits for a (potentially) blocking op (disk).
Note that since the amount of reserve is increased as needed, there will
be occasional cases where a new segment is created in the alloc path
until the system finds equilibrium. But this should only be during a brief
warmup.

v2: Fixed timestamp not being reset on reserve acquire
2015-09-21 13:04:39 +02:00
78 changed files with 2601 additions and 1285 deletions

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
*.cc diff=cpp
*.hh diff=cpp

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=0.9
VERSION=0.10
if test -f version
then

View File

@@ -950,6 +950,33 @@
}
]
},
{
"path":"/column_family/metrics/estimated_row_count/{name}",
"operations":[
{
"method":"GET",
"summary":"Get estimated row count",
"type":"array",
"items":{
"type":"long"
},
"nickname":"get_estimated_row_count",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/column_family/metrics/estimated_column_count_histogram/{name}",
"operations":[

View File

@@ -93,6 +93,30 @@
}
]
},
{
"path":"/gossiper/heart_beat_version/{addr}",
"operations":[
{
"method":"GET",
"summary":"Get heart beat version for a node",
"type":"int",
"nickname":"get_current_heart_beat_version",
"produces":[
"application/json"
],
"parameters":[
{
"name":"addr",
"description":"The endpoint address",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
}
]
},
{
"path":"/gossiper/assassinate/{addr}",
"operations":[
@@ -126,4 +150,4 @@
]
}
]
}
}

View File

@@ -546,7 +546,58 @@
]
},
{
"path": "/storage_service/metrics/cas_write/unfinished_commit",
"path":"/storage_proxy/metrics/cas_read/unavailables",
"operations":[
{
"method":"GET",
"summary":"Get CAS read unavailables",
"type":"long",
"nickname":"get_cas_read_unavailables",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/storage_proxy/metrics/cas_write/timeouts",
"operations":[
{
"method":"GET",
"summary":"Get CAS write timeout",
"type":"long",
"nickname":"get_cas_write_timeouts",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/storage_proxy/metrics/cas_write/unavailables",
"operations":[
{
"method":"GET",
"summary":"Get CAS write unavailables",
"type":"long",
"nickname":"get_cas_write_unavailables",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path": "/storage_proxy/metrics/cas_write/unfinished_commit",
"operations": [
{
"method": "GET",
@@ -561,7 +612,7 @@
]
},
{
"path": "/storage_service/metrics/cas_write/contention",
"path": "/storage_proxy/metrics/cas_write/contention",
"operations": [
{
"method": "GET",
@@ -576,7 +627,7 @@
]
},
{
"path": "/storage_service/metrics/cas_write/condition_not_met",
"path": "/storage_proxy/metrics/cas_write/condition_not_met",
"operations": [
{
"method": "GET",
@@ -591,7 +642,7 @@
]
},
{
"path": "/storage_service/metrics/cas_read/unfinished_commit",
"path": "/storage_proxy/metrics/cas_read/unfinished_commit",
"operations": [
{
"method": "GET",
@@ -606,7 +657,7 @@
]
},
{
"path": "/storage_service/metrics/cas_read/contention",
"path": "/storage_proxy/metrics/cas_read/contention",
"operations": [
{
"method": "GET",
@@ -621,7 +672,7 @@
]
},
{
"path": "/storage_service/metrics/cas_read/condition_not_met",
"path": "/storage_proxy/metrics/cas_read/condition_not_met",
"operations": [
{
"method": "GET",
@@ -636,7 +687,7 @@
]
},
{
"path": "/storage_service/metrics/read/timeouts",
"path": "/storage_proxy/metrics/read/timeouts",
"operations": [
{
"method": "GET",
@@ -651,7 +702,7 @@
]
},
{
"path": "/storage_service/metrics/read/unavailables",
"path": "/storage_proxy/metrics/read/unavailables",
"operations": [
{
"method": "GET",
@@ -696,7 +747,7 @@
]
},
{
"path": "/storage_service/metrics/range/timeouts",
"path": "/storage_proxy/metrics/range/timeouts",
"operations": [
{
"method": "GET",
@@ -711,7 +762,7 @@
]
},
{
"path": "/storage_service/metrics/range/unavailables",
"path": "/storage_proxy/metrics/range/unavailables",
"operations": [
{
"method": "GET",
@@ -726,7 +777,7 @@
]
},
{
"path": "/storage_service/metrics/write/timeouts",
"path": "/storage_proxy/metrics/write/timeouts",
"operations": [
{
"method": "GET",
@@ -741,7 +792,7 @@
]
},
{
"path": "/storage_service/metrics/write/unavailables",
"path": "/storage_proxy/metrics/write/unavailables",
"operations": [
{
"method": "GET",

View File

@@ -144,7 +144,9 @@ inline httpd::utils_json::histogram add_histogram(httpd::utils_json::histogram r
res.max = val.max;
}
double ncount = res.count() + val.count;
res.sum = res.sum() + val.sum;
// To get an estimated sum we take the estimated mean
// and multiply it by the true count
res.sum = res.sum() + val.mean * val.count;
double a = res.count()/ncount;
double b = val.count/ncount;

View File

@@ -76,14 +76,30 @@ future<json::json_return_type> get_cf_stats(http_context& ctx,
}, std::plus<int64_t>());
}
static future<json::json_return_type> get_cf_stats_sum(http_context& ctx, const sstring& name,
static future<json::json_return_type> get_cf_stats_count(http_context& ctx, const sstring& name,
utils::ihistogram column_family::stats::*f) {
return map_reduce_cf(ctx, name, 0, [f](const column_family& cf) {
return (cf.get_stats().*f).count;
}, std::plus<int64_t>());
}
static future<json::json_return_type> get_cf_stats_sum(http_context& ctx,
static future<json::json_return_type> get_cf_stats_sum(http_context& ctx, const sstring& name,
utils::ihistogram column_family::stats::*f) {
auto uuid = get_uuid(name, ctx.db.local());
return ctx.db.map_reduce0([uuid, f](database& db) {
// Histograms information is sample of the actual load
// so to get an estimation of sum, we multiply the mean
// with count. The information is gather in nano second,
// but reported in micro
column_family& cf = db.find_column_family(uuid);
return ((cf.get_stats().*f).count/1000.0) * (cf.get_stats().*f).mean;
}, 0.0, std::plus<double>()).then([](double res) {
return make_ready_future<json::json_return_type>((int64_t)res);
});
}
static future<json::json_return_type> get_cf_stats_count(http_context& ctx,
utils::ihistogram column_family::stats::*f) {
return map_reduce_cf(ctx, 0, [f](const column_family& cf) {
return (cf.get_stats().*f).count;
@@ -285,19 +301,26 @@ void set_column_family(http_context& ctx, routes& r) {
sstables::merge, utils_json::estimated_histogram());
});
cf::get_estimated_column_count_histogram.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
std::vector<double> res;
return make_ready_future<json::json_return_type>(res);
cf::get_estimated_row_count.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
uint64_t res = 0;
for (auto i: *cf.get_sstables() ) {
res += i.second->get_stats_metadata().estimated_row_size.count();
}
return res;
},
std::plus<uint64_t>());
});
cf::get_compression_ratio.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
//auto id = get_uuid(req->param["name"], ctx.db.local());
return make_ready_future<json::json_return_type>(0);
cf::get_estimated_column_count_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], sstables::estimated_histogram(0), [](column_family& cf) {
sstables::estimated_histogram res(0);
for (auto i: *cf.get_sstables() ) {
res.merge(i.second->get_stats_metadata().estimated_column_count);
}
return res;
},
sstables::merge, utils_json::estimated_histogram());
});
cf::get_all_compression_ratio.set(r, [] (std::unique_ptr<request> req) {
@@ -315,25 +338,33 @@ void set_column_family(http_context& ctx, routes& r) {
});
cf::get_read.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads);
return get_cf_stats_count(ctx,req->param["name"] ,&column_family::stats::reads);
});
cf::get_all_read.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx, &column_family::stats::reads);
return get_cf_stats_count(ctx, &column_family::stats::reads);
});
cf::get_write.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes);
return get_cf_stats_count(ctx, req->param["name"] ,&column_family::stats::writes);
});
cf::get_all_write.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx, &column_family::stats::writes);
return get_cf_stats_count(ctx, &column_family::stats::writes);
});
cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads);
});
cf::get_read_latency.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads);
});
cf::get_write_latency.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes);
});
cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
return get_cf_histogram(ctx, &column_family::stats::writes);
});
@@ -490,20 +521,20 @@ void set_column_family(http_context& ctx, routes& r) {
}, std::plus<uint64_t>());
});
cf::get_bloom_filter_off_heap_memory_used.set(r, [] (std::unique_ptr<request> req) {
//TBD
// FIXME
// We are missing the off heap memory calculation
// Return 0 is the wrong value. It's a work around
// until the memory calculation will be available
//auto id = get_uuid(req->param["name"], ctx.db.local());
return make_ready_future<json::json_return_type>(0);
cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return sst.second->filter_memory_size();
});
}, std::plus<uint64_t>());
});
cf::get_all_bloom_filter_off_heap_memory_used.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>(0);
cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
return sst.second->filter_memory_size();
});
}, std::plus<uint64_t>());
});
cf::get_index_summary_off_heap_memory_used.set(r, [] (std::unique_ptr<request> req) {
@@ -560,7 +591,7 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_true_snapshots_size.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
//auto id = get_uuid(req->param["name"], ctx.db.local());
return make_ready_future<json::json_return_type>(0);
});
@@ -641,17 +672,30 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_tombstone_scanned_histogram.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
//auto id = get_uuid(req->param["name"], ctx.db.local());
std::vector<double> res;
httpd::utils_json::histogram res;
res.count = 0;
res.mean = 0;
res.max = 0;
res.min = 0;
res.sum = 0;
res.variance = 0;
return make_ready_future<json::json_return_type>(res);
});
cf::get_live_scanned_histogram.set(r, [] (std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
//auto id = get_uuid(req->param["name"], ctx.db.local());
std::vector<double> res;
//std::vector<double> res;
httpd::utils_json::histogram res;
res.count = 0;
res.mean = 0;
res.max = 0;
res.min = 0;
res.sum = 0;
res.variance = 0;
return make_ready_future<json::json_return_type>(res);
});
@@ -741,8 +785,9 @@ void set_column_family(http_context& ctx, routes& r) {
// TBD
// FIXME
// This is a workaround, until there will be an API to return the count
// per level, we return 0
return make_ready_future<json::json_return_type>(0);
// per level, we return an empty array
vector<uint64_t> res;
return make_ready_future<json::json_return_type>(res);
});
}
}

View File

@@ -53,6 +53,13 @@ void set_gossiper(http_context& ctx, routes& r) {
});
});
httpd::gossiper_json::get_current_heart_beat_version.set(r, [](std::unique_ptr<request> req) {
gms::inet_address ep(req->param["addr"]);
return gms::get_current_heart_beat_version(ep).then([](int res) {
return make_ready_future<json::json_return_type>(res);
});
});
httpd::gossiper_json::assassinate_endpoint.set(r, [](std::unique_ptr<request> req) {
if (req->get_query_param("unsafe") != "True") {
return gms::assassinate_endpoint(req->param["addr"]).then([] {

View File

@@ -219,7 +219,29 @@ void set_storage_proxy(http_context& ctx, routes& r) {
sp::get_cas_read_timeouts.set(r, [](std::unique_ptr<request> req) {
//TBD
unimplemented();
// FIXME
// cas is not supported yet, so just return 0
return make_ready_future<json::json_return_type>(0);
});
sp::get_cas_read_unavailables.set(r, [](std::unique_ptr<request> req) {
//TBD
// FIXME
// cas is not supported yet, so just return 0
return make_ready_future<json::json_return_type>(0);
});
sp::get_cas_write_timeouts.set(r, [](std::unique_ptr<request> req) {
//TBD
// FIXME
// cas is not supported yet, so just return 0
return make_ready_future<json::json_return_type>(0);
});
sp::get_cas_write_unavailables.set(r, [](std::unique_ptr<request> req) {
//TBD
// FIXME
// cas is not supported yet, so just return 0
return make_ready_future<json::json_return_type>(0);
});

View File

@@ -513,16 +513,38 @@ void set_storage_service(http_context& ctx, routes& r) {
});
ss::is_incremental_backups_enabled.set(r, [](std::unique_ptr<request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>(false);
// If this is issued in parallel with an ongoing change, we may see values not agreeing.
// Reissuing is asking for trouble, so we will just return true upon seeing any true value.
return service::get_local_storage_service().db().map_reduce(adder<bool>(), [] (database& db) {
for (auto& pair: db.get_keyspaces()) {
auto& ks = pair.second;
if (ks.incremental_backups_enabled()) {
return true;
}
}
return false;
}).then([] (bool val) {
return make_ready_future<json::json_return_type>(val);
});
});
ss::set_incremental_backups_enabled.set(r, [](std::unique_ptr<request> req) {
//TBD
unimplemented();
auto value = req->get_query_param("value");
return make_ready_future<json::json_return_type>(json_void());
auto val_str = req->get_query_param("value");
bool value = (val_str == "True") || (val_str == "true") || (val_str == "1");
return service::get_local_storage_service().db().invoke_on_all([value] (database& db) {
// Change both KS and CF, so they are in sync
for (auto& pair: db.get_keyspaces()) {
auto& ks = pair.second;
ks.set_incremental_backups(value);
}
for (auto& pair: db.get_column_families()) {
auto cf_ptr = pair.second;
cf_ptr->set_incremental_backups(value);
}
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::rebuild.set(r, [](std::unique_ptr<request> req) {

View File

@@ -120,7 +120,7 @@ class Antlr3Grammar(object):
modes = {
'debug': {
'sanitize': '-fsanitize=address -fsanitize=leak -fsanitize=undefined',
'sanitize_libs': '-lubsan -lasan',
'sanitize_libs': '-lasan -lubsan',
'opt': '-O0 -DDEBUG -DDEBUG_SHARED_PTR -DDEFAULT_ALLOCATOR',
'libs': '',
},
@@ -147,6 +147,7 @@ urchin_tests = [
'tests/perf/perf_hash',
'tests/perf/perf_cql_parser',
'tests/perf/perf_simple_query',
'tests/memory_footprint',
'tests/perf/perf_sstable',
'tests/cql_query_test',
'tests/storage_proxy_test',
@@ -213,6 +214,8 @@ arg_parser.add_argument('--dpdk-target', action = 'store', dest = 'dpdk_target',
help = 'Path to DPDK SDK target location (e.g. <DPDK SDK dir>/x86_64-native-linuxapp-gcc)')
arg_parser.add_argument('--debuginfo', action = 'store', dest = 'debuginfo', type = int, default = 1,
help = 'Enable(1)/disable(0)compiler debug information generation')
arg_parser.add_argument('--static-stdc++', dest = 'staticcxx', action = 'store_true',
help = 'Link libgcc and libstdc++ statically')
add_tristate(arg_parser, name = 'hwloc', dest = 'hwloc', help = 'hwloc support')
add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
args = arg_parser.parse_args()
@@ -277,6 +280,7 @@ urchin_core = (['database.cc',
'cql3/statements/index_prop_defs.cc',
'cql3/statements/index_target.cc',
'cql3/statements/create_index_statement.cc',
'cql3/statements/truncate_statement.cc',
'cql3/update_parameters.cc',
'cql3/ut_name.cc',
'thrift/handler.cc',
@@ -379,6 +383,7 @@ urchin_core = (['database.cc',
'partition_slice_builder.cc',
'init.cc',
'repair/repair.cc',
'exceptions/exceptions.cc',
]
+ [Antlr3Grammar('cql3/Cql.g')]
+ [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -448,6 +453,7 @@ tests_not_using_seastar_test_framework = set([
'tests/perf/perf_cql_parser',
'tests/message',
'tests/perf/perf_simple_query',
'tests/memory_footprint',
'tests/test-serialization',
'tests/gossip',
'tests/compound_test',
@@ -548,6 +554,8 @@ for mode in build_modes:
cfg = dict([line.strip().split(': ', 1)
for line in open('seastar/' + pc[mode])
if ': ' in line])
if args.staticcxx:
cfg['Libs'] = cfg['Libs'].replace('-lstdc++ ', '')
modes[mode]['seastar_cflags'] = cfg['Cflags']
modes[mode]['seastar_libs'] = cfg['Libs']
@@ -556,6 +564,9 @@ seastar_deps = 'practically_anything_can_change_so_lets_run_it_every_time_and_re
args.user_cflags += " " + pkg_config("--cflags", "jsoncpp")
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem'
user_cflags = args.user_cflags
user_ldflags = args.user_ldflags
if args.staticcxx:
user_ldflags += " -static-libgcc -static-libstdc++"
outdir = 'build'
buildfile = 'build.ninja'
@@ -597,11 +608,11 @@ with open(buildfile, 'w') as f:
description = CXX $out
depfile = $out.d
rule link.{mode}
command = $cxx $cxxflags_{mode} $ldflags {seastar_libs} -o $out $in $libs $libs_{mode}
command = $cxx $cxxflags_{mode} {sanitize_libs} $ldflags {seastar_libs} -o $out $in $libs $libs_{mode}
description = LINK $out
pool = link_pool
rule link_stripped.{mode}
command = $cxx $cxxflags_{mode} -s $ldflags {seastar_libs} -o $out $in $libs $libs_{mode}
command = $cxx $cxxflags_{mode} -s {sanitize_libs} $ldflags {seastar_libs} -o $out $in $libs $libs_{mode}
description = LINK (stripped) $out
pool = link_pool
rule ar.{mode}

View File

@@ -255,7 +255,14 @@ maps::delayed_value::bind(const query_options& options) {
::shared_ptr<terminal>
maps::marker::bind(const query_options& options) {
throw std::runtime_error("");
auto val = options.get_value_at(_bind_index);
return val ?
::make_shared<maps::value>(
maps::value::from_serialized(*val,
static_pointer_cast<const map_type_impl>(
_receiver->type),
options.get_serialization_format())) :
nullptr;
}
void

View File

@@ -97,7 +97,7 @@ cql3::statements::create_index_statement::validate(distributed<service::storage_
}
} else {
// validateNotFullIndex
if (target->type != index_target::target_type::full) {
if (target->type == index_target::target_type::full) {
throw exceptions::invalid_request_exception("full() indexes can only be created on frozen collections");
}
// validateIsValuesIndexIfTargetColumnNotCollection

View File

@@ -77,7 +77,7 @@ const sstring& drop_keyspace_statement::keyspace() const
future<bool> drop_keyspace_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
return make_ready_future<>().then([&] {
return make_ready_future<>().then([this, is_local_only] {
return service::get_local_migration_manager().announce_keyspace_drop(_keyspace, is_local_only);
}).then_wrapped([this] (auto&& f) {
try {

View File

@@ -76,7 +76,7 @@ void drop_table_statement::validate(distributed<service::storage_proxy>&, const
future<bool> drop_table_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
return make_ready_future<>().then([&] {
return make_ready_future<>().then([this, is_local_only] {
return service::get_local_migration_manager().announce_column_family_drop(keyspace(), column_family(), is_local_only);
}).then_wrapped([this] (auto&& f) {
try {

View File

@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2014 Cloudius Systems
*
* Modified by Cloudius Systems
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cql3/statements/truncate_statement.hh"
#include "cql3/cql_statement.hh"
#include <experimental/optional>
namespace cql3 {
namespace statements {
truncate_statement::truncate_statement(::shared_ptr<cf_name> name)
: cf_statement{std::move(name)}
{
}
uint32_t truncate_statement::get_bound_terms()
{
return 0;
}
::shared_ptr<parsed_statement::prepared> truncate_statement::prepare(database& db)
{
return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
}
bool truncate_statement::uses_function(const sstring& ks_name, const sstring& function_name) const
{
return parsed_statement::uses_function(ks_name, function_name);
}
void truncate_statement::check_access(const service::client_state& state)
{
warn(unimplemented::cause::AUTH);
#if 0
state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.MODIFY);
#endif
}
void truncate_statement::validate(distributed<service::storage_proxy>&, const service::client_state& state)
{
warn(unimplemented::cause::VALIDATION);
#if 0
ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
#endif
}
future<::shared_ptr<transport::messages::result_message>>
truncate_statement::execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options)
{
return service::get_local_storage_proxy().truncate_blocking(keyspace(), column_family()).handle_exception([](auto ep) {
throw exceptions::truncate_exception(ep);
}).then([] {
return ::shared_ptr<transport::messages::result_message>{};
});
}
future<::shared_ptr<transport::messages::result_message>>
truncate_statement::execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options)
{
throw std::runtime_error("unsupported operation");
}
}
}


@@ -52,64 +52,23 @@ namespace statements {
class truncate_statement : public cf_statement, public cql_statement, public ::enable_shared_from_this<truncate_statement> {
public:
truncate_statement(::shared_ptr<cf_name> name)
: cf_statement{std::move(name)}
{ }
truncate_statement(::shared_ptr<cf_name> name);
virtual uint32_t get_bound_terms() override {
return 0;
}
virtual uint32_t get_bound_terms() override;
virtual ::shared_ptr<prepared> prepare(database& db) override {
return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
}
virtual ::shared_ptr<prepared> prepare(database& db) override;
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return parsed_statement::uses_function(ks_name, function_name);
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
virtual void check_access(const service::client_state& state) override {
throw std::runtime_error("not implemented");
#if 0
state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.MODIFY);
#endif
}
virtual void check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override {
throw std::runtime_error("not implemented");
#if 0
ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
#endif
}
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
virtual future<::shared_ptr<transport::messages::result_message>>
execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override {
throw std::runtime_error("not implemented");
#if 0
try
{
StorageProxy.truncateBlocking(keyspace(), columnFamily());
}
catch (UnavailableException e)
{
throw new TruncateException(e);
}
catch (TimeoutException e)
{
throw new TruncateException(e);
}
catch (IOException e)
{
throw new TruncateException(e);
}
return null;
#endif
}
execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override;
virtual future<::shared_ptr<transport::messages::result_message>>
execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override {
throw std::runtime_error("unsupported operation");
}
execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override;
};
}


@@ -70,17 +70,25 @@ public:
}
// this could maybe be done as an overload of get_as (or something), but that just
// muddles things for no real gain. Let user (us) attempt to know what he is doing instead.
template<typename K, typename V>
std::unordered_map<K, V> get_map(const sstring& name) const {
auto vec = boost::any_cast<const map_type_impl::native_type&>(
map_type_impl::get_instance(data_type_for<K>(),
data_type_for<V>(), false)->deserialize(
get_blob(name)));
std::unordered_map<K, V> res;
std::transform(vec.begin(), vec.end(),
std::inserter(res, res.end()), [](auto& p) {
template<typename K, typename V, typename Iter>
void get_map_data(const sstring& name, Iter out, data_type keytype =
data_type_for<K>(), data_type valtype =
data_type_for<V>()) const {
auto vec =
boost::any_cast<const map_type_impl::native_type&>(
map_type_impl::get_instance(keytype, valtype, false)->deserialize(
get_blob(name)));
std::transform(vec.begin(), vec.end(), out,
[](auto& p) {
return std::pair<K, V>(boost::any_cast<const K&>(p.first), boost::any_cast<const V&>(p.second));
});
}
template<typename K, typename V, typename ... Rest>
std::unordered_map<K, V, Rest...> get_map(const sstring& name,
data_type keytype = data_type_for<K>(), data_type valtype =
data_type_for<V>()) const {
std::unordered_map<K, V, Rest...> res;
get_map_data<K, V>(name, std::inserter(res, res.end()), keytype, valtype);
return res;
}
const std::vector<::shared_ptr<column_specification>>& get_columns() const {


@@ -52,6 +52,8 @@
#include "service/storage_service.hh"
#include "mutation_query.hh"
#include "sstable_mutation_readers.hh"
#include <core/fstream.hh>
#include "utils/latency.hh"
using namespace std::chrono_literals;
@@ -496,7 +498,26 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
newtab->set_unshared();
dblog.debug("Flushing to {}", newtab->get_filename());
return newtab->write_components(*old).then([this, newtab, old] {
return newtab->load();
return newtab->open_data().then([this, newtab] {
// Note that due to our sharded architecture, it is possible that
// in the face of a value change some shards will backup sstables
// while others won't.
//
// This is, in theory, possible to mitigate through a rwlock.
// However, this doesn't differ from the situation where all tables
// are coming from a single shard and the toggle happens in the
// middle of them.
//
// The code as is guarantees that we'll never partially backup a
// single sstable, so that is enough of a guarantee.
if (!incremental_backups_enabled()) {
return make_ready_future<>();
}
auto dir = newtab->get_dir() + "/backups/";
return touch_directory(dir).then([dir, newtab] {
return newtab->create_links(dir);
});
});
}).then([this, old, newtab] {
dblog.debug("Flushing done");
// We must add sstable before we call update_cache(), because
@@ -641,8 +662,10 @@ void column_family::start_compaction() {
void column_family::trigger_compaction() {
// Submitting compaction job to compaction manager.
_stats.pending_compactions++;
_compaction_manager.submit(this);
if (!_compaction_disabled) {
_stats.pending_compactions++;
_compaction_manager.submit(this);
}
}
future<> column_family::run_compaction() {
@@ -811,7 +834,7 @@ future<> database::populate_keyspace(sstring datadir, sstring ks_name) {
if (i == _keyspaces.end()) {
dblog.warn("Skipping undefined keyspace: {}", ks_name);
} else {
dblog.warn("Populating Keyspace {}", ks_name);
dblog.info("Populating Keyspace {}", ks_name);
return lister::scan_dir(ksdir, directory_entry_type::directory, [this, ksdir, ks_name] (directory_entry de) {
auto comps = parse_fname(de.name);
if (comps.size() < 2) {
@@ -965,7 +988,7 @@ void database::update_keyspace(const sstring& name) {
}
void database::drop_keyspace(const sstring& name) {
throw std::runtime_error("not implemented");
_keyspaces.erase(name);
}
void database::add_column_family(schema_ptr schema, column_family::config cfg) {
@@ -1005,8 +1028,18 @@ future<> database::update_column_family(const sstring& ks_name, const sstring& c
});
}
void database::drop_column_family(const sstring& ks_name, const sstring& cf_name) {
throw std::runtime_error("not implemented");
future<> database::drop_column_family(db_clock::time_point dropped_at, const sstring& ks_name, const sstring& cf_name) {
auto uuid = find_uuid(ks_name, cf_name);
auto& ks = find_keyspace(ks_name);
auto cf = _column_families.at(uuid);
_column_families.erase(uuid);
ks.metadata()->remove_column_family(cf->schema());
_ks_cf_to_uuid.erase(std::make_pair(ks_name, cf_name));
return truncate(dropped_at, ks, *cf).then([this, cf] {
return cf->stop();
}).then([this, cf] {
return make_ready_future<>();
});
}
const utils::UUID& database::find_uuid(const sstring& ks, const sstring& cf) const throw (std::out_of_range) {
@@ -1051,7 +1084,7 @@ column_family& database::find_column_family(const sstring& ks_name, const sstrin
try {
return find_column_family(find_uuid(ks_name, cf_name));
} catch (...) {
std::throw_with_nested(no_such_column_family("Can't find a column family " + cf_name + " in a keyspace " + ks_name));
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
@@ -1059,7 +1092,7 @@ const column_family& database::find_column_family(const sstring& ks_name, const
try {
return find_column_family(find_uuid(ks_name, cf_name));
} catch (...) {
std::throw_with_nested(no_such_column_family("Can't find a column family " + cf_name + " in a keyspace " + ks_name));
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
@@ -1067,7 +1100,7 @@ column_family& database::find_column_family(const utils::UUID& uuid) throw (no_s
try {
return *_column_families.at(uuid);
} catch (...) {
std::throw_with_nested(no_such_column_family("Can't find a column family with UUID: " + uuid.to_sstring()));
std::throw_with_nested(no_such_column_family(uuid));
}
}
@@ -1075,7 +1108,7 @@ const column_family& database::find_column_family(const utils::UUID& uuid) const
try {
return *_column_families.at(uuid);
} catch (...) {
std::throw_with_nested(no_such_column_family("Can't find a column family with UUID: " + uuid.to_sstring()));
std::throw_with_nested(no_such_column_family(uuid));
}
}
@@ -1116,6 +1149,7 @@ keyspace::make_column_family_config(const schema& s) const {
cfg.enable_cache = _config.enable_cache;
cfg.max_memtable_size = _config.max_memtable_size;
cfg.dirty_memory_region_group = _config.dirty_memory_region_group;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
return cfg;
}
@@ -1132,6 +1166,21 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
return make_directory(column_family_directory(name, uuid));
}
no_such_keyspace::no_such_keyspace(const sstring& ks_name)
: runtime_error{sprint("Can't find a keyspace %s", ks_name)}
{
}
no_such_column_family::no_such_column_family(const utils::UUID& uuid)
: runtime_error{sprint("Can't find a column family with UUID %s", uuid)}
{
}
no_such_column_family::no_such_column_family(const sstring& ks_name, const sstring& cf_name)
: runtime_error{sprint("Can't find a column family %s in keyspace %s", cf_name, ks_name)}
{
}
column_family& database::find_column_family(const schema_ptr& schema) throw (no_such_column_family) {
return find_column_family(schema->id());
}
@@ -1151,7 +1200,7 @@ schema_ptr database::find_schema(const sstring& ks_name, const sstring& cf_name)
try {
return find_schema(find_uuid(ks_name, cf_name));
} catch (std::out_of_range&) {
std::throw_with_nested(no_such_column_family(ks_name + ":" + cf_name));
std::throw_with_nested(no_such_column_family(ks_name, cf_name));
}
}
@@ -1261,7 +1310,9 @@ struct query_state {
};
future<lw_shared_ptr<query::result>>
column_family::query(const query::read_command& cmd, const std::vector<query::partition_range>& partition_ranges) const {
column_family::query(const query::read_command& cmd, const std::vector<query::partition_range>& partition_ranges) {
utils::latency_counter lc;
_stats.reads.set_latency(lc);
return do_with(query_state(cmd, partition_ranges), [this] (query_state& qs) {
return do_until(std::bind(&query_state::done, &qs), [this, &qs] {
auto&& range = *qs.current_partition_range++;
@@ -1284,6 +1335,8 @@ column_family::query(const query::read_command& cmd, const std::vector<query::pa
return make_ready_future<lw_shared_ptr<query::result>>(
make_lw_shared<query::result>(qs.builder.build()));
});
}).finally([lc, this]() mutable {
_stats.reads.mark(lc);
});
}
@@ -1438,6 +1491,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.max_memtable_size = std::numeric_limits<size_t>::max();
}
cfg.dirty_memory_region_group = &_dirty_memory_region_group;
cfg.enable_incremental_backups = _cfg->incremental_backups();
return cfg;
}
@@ -1511,6 +1565,47 @@ future<> database::flush_all_memtables() {
});
}
future<> database::truncate(db_clock::time_point truncated_at, sstring ksname, sstring cfname) {
auto& ks = find_keyspace(ksname);
auto& cf = find_column_family(ksname, cfname);
return truncate(truncated_at, ks, cf);
}
future<> database::truncate(db_clock::time_point truncated_at, const keyspace& ks, column_family& cf)
{
const auto durable = ks.metadata()->durable_writes();
const auto auto_snapshot = get_config().auto_snapshot();
future<> f = make_ready_future<>();
if (durable || auto_snapshot) {
// TODO:
// this is not really a guarantee at all that we've actually
// gotten all things to disk. Again, need queue-ish or something.
f = cf.flush();
} else {
cf.clear();
}
return cf.run_with_compaction_disabled([truncated_at, f = std::move(f), &cf, auto_snapshot, cfname = cf.schema()->cf_name()]() mutable {
return f.then([truncated_at, &cf, auto_snapshot, cfname = std::move(cfname)] {
dblog.debug("Discarding sstable data for truncated CF + indexes");
// TODO: notify truncation
future<> f = make_ready_future<>();
if (auto_snapshot) {
auto name = sprint("%d-%s", truncated_at.time_since_epoch().count(), cfname);
f = cf.snapshot(name);
}
return f.then([&cf, truncated_at] {
return cf.discard_sstables(truncated_at).then([&cf, truncated_at](db::replay_position rp) {
// TODO: indexes.
return db::system_keyspace::save_truncation_record(cf, truncated_at, rp);
});
});
});
});
}
const sstring& database::get_snitch_name() const {
return _cfg->endpoint_snitch();
}
@@ -1529,6 +1624,131 @@ future<> update_schema_version_and_announce(distributed<service::storage_proxy>&
});
}
// Snapshots: snapshotting the files themselves is easy: if more than one CF
// happens to link an SSTable twice, all but one will fail, and we will end up
// with one copy.
//
// The problem for us is that the snapshot procedure is supposed to leave a
// manifest file inside its directory. So if we just call snapshot() from
// multiple shards, only the last one will succeed, writing its own SSTables to
// the manifest leaving all other shards' SSTables unaccounted for.
//
// Moreover, for things like drop table, the operation should only proceed when the
// snapshot is complete. That includes the manifest file being correctly written,
// and for this reason we need to wait for all shards to finish their snapshotting
// before we can move on.
//
// To know which files we must account for in the manifest, we will keep an
// SSTable set. Theoretically, we could just rescan the snapshot directory and
// see what's in there. But we would need to wait for all shards to finish
// before we can do that anyway. That is the hard part, and once that is done
// keeping the files set is not really a big deal.
//
// This code assumes that all shards will be snapshotting at the same time. So
// far this is a safe assumption, but if we ever want to take snapshots from a
// group of shards only, this code will have to be updated to account for that.
struct snapshot_manager {
std::unordered_set<sstring> files;
semaphore requests;
semaphore manifest_write;
snapshot_manager() : requests(0), manifest_write(0) {}
};
static thread_local std::unordered_map<sstring, lw_shared_ptr<snapshot_manager>> pending_snapshots;
static future<>
seal_snapshot(sstring jsondir) {
std::ostringstream ss;
int n = 0;
ss << "{" << std::endl << "\t\"files\" : [ ";
for (auto&& rf: pending_snapshots.at(jsondir)->files) {
if (n++ > 0) {
ss << ", ";
}
ss << "\"" << rf << "\"";
}
ss << " ]" << std::endl << "}" << std::endl;
auto json = ss.str();
auto jsonfile = jsondir + "/manifest.json";
dblog.debug("Storing manifest {}", jsonfile);
return recursive_touch_directory(jsondir).then([jsonfile, json = std::move(json)] {
return engine().open_file_dma(jsonfile, open_flags::wo | open_flags::create | open_flags::truncate).then([json](file f) {
return do_with(make_file_output_stream(std::move(f)), [json] (output_stream<char>& out) {
return out.write(json.c_str(), json.size()).then([&out] {
return out.flush();
}).then([&out] {
return out.close();
});
});
});
}).then([jsondir] {
return sync_directory(std::move(jsondir));
}).finally([jsondir] {
pending_snapshots.erase(jsondir);
return make_ready_future<>();
});
}
future<> column_family::snapshot(sstring name) {
return flush().then([this, name = std::move(name)]() {
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables | boost::adaptors::map_values);
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
auto jsondir = _config.datadir + "/snapshots/" + name;
return parallel_for_each(tables, [name](sstables::shared_sstable sstable) {
auto dir = sstable->get_dir() + "/snapshots/" + name;
return recursive_touch_directory(dir).then([sstable, dir] {
return sstable->create_links(dir);
});
}).then([jsondir, &tables] {
// This is not just an optimization. If we have no files, jsondir may not have been created,
// and sync_directory would throw.
if (tables.size()) {
return sync_directory(std::move(jsondir));
} else {
return make_ready_future<>();
}
}).then([this, &tables, jsondir] {
auto shard = std::hash<sstring>()(jsondir) % smp::count;
std::unordered_set<sstring> table_names;
for (auto& sst : tables) {
auto f = sst->get_filename();
auto rf = f.substr(sst->get_dir().size() + 1);
table_names.insert(std::move(rf));
}
return smp::submit_to(shard, [requester = engine().cpu_id(), jsondir = std::move(jsondir),
tables = std::move(table_names), datadir = _config.datadir] {
if (pending_snapshots.count(jsondir) == 0) {
pending_snapshots.emplace(jsondir, make_lw_shared<snapshot_manager>());
}
auto snapshot = pending_snapshots.at(jsondir);
for (auto&& sst: tables) {
snapshot->files.insert(std::move(sst));
}
snapshot->requests.signal(1);
auto my_work = make_ready_future<>();
if (requester == engine().cpu_id()) {
my_work = snapshot->requests.wait(smp::count).then([jsondir = std::move(jsondir),
snapshot] () mutable {
return seal_snapshot(jsondir).then([snapshot] {
snapshot->manifest_write.signal(smp::count);
return make_ready_future<>();
});
});
}
return my_work.then([snapshot] {
return snapshot->manifest_write.wait(1);
}).then([snapshot] {});
});
});
});
});
}
future<> column_family::flush() {
// FIXME: this will synchronously wait for this write to finish, but doesn't guarantee
// anything about previous writes.
@@ -1558,6 +1778,40 @@ future<> column_family::flush(const db::replay_position& pos) {
return seal_active_memtable();
}
void column_family::clear() {
_cache.clear();
_memtables->clear();
add_memtable();
}
// NOTE: does not need to be futurized, but might eventually, depending on
// if we implement notifications, whatnot.
future<db::replay_position> column_family::discard_sstables(db_clock::time_point truncated_at) {
assert(_stats.pending_compactions == 0);
db::replay_position rp;
auto gc_trunc = to_gc_clock(truncated_at);
auto pruned = make_lw_shared<sstable_list>();
for (auto&p : *_sstables) {
if (p.second->max_data_age() <= gc_trunc) {
rp = std::max(p.second->get_stats_metadata().position, rp);
p.second->mark_for_deletion();
continue;
}
pruned->emplace(p.first, p.second);
}
_sstables = std::move(pruned);
dblog.debug("cleaning out row cache");
_cache.clear();
return make_ready_future<db::replay_position>(rp);
}
std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
os << "org.apache.cassandra.config.UTMetaData@" << &m;
return os;


@@ -106,6 +106,7 @@ public:
bool enable_disk_reads = true;
bool enable_cache = true;
bool enable_commitlog = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
};
@@ -120,8 +121,8 @@ public:
int64_t live_sstable_count = 0;
/** Estimated number of compactions pending for this column family */
int64_t pending_compactions = 0;
utils::ihistogram reads{256, 100};
utils::ihistogram writes{256, 100};
utils::ihistogram reads{256};
utils::ihistogram writes{256};
sstables::estimated_histogram estimated_read;
sstables::estimated_histogram estimated_write;
};
@@ -143,6 +144,7 @@ private:
compaction_manager& _compaction_manager;
// Whether or not a cf is queued by its compaction manager.
bool _compaction_manager_queued = false;
int _compaction_disabled = 0;
private:
void update_stats_for_new_sstable(uint64_t new_sstable_data_size);
void add_sstable(sstables::sstable&& sstable);
@@ -195,7 +197,7 @@ public:
void apply(const mutation& m, const db::replay_position& = db::replay_position());
// Returns at most "cmd.limit" rows
future<lw_shared_ptr<query::result>> query(const query::read_command& cmd, const std::vector<query::partition_range>& ranges) const;
future<lw_shared_ptr<query::result>> query(const query::read_command& cmd, const std::vector<query::partition_range>& ranges);
future<> populate(sstring datadir);
@@ -203,6 +205,8 @@ public:
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
void clear(); // discards memtable(s) without flushing them to disk.
future<db::replay_position> discard_sstables(db_clock::time_point);
// FIXME: this is just an example, should be changed to something more
// general. compact_all_sstables() starts a compaction of all sstables.
@@ -212,6 +216,16 @@ public:
// Compact all sstables provided in the vector.
future<> compact_sstables(std::vector<lw_shared_ptr<sstables::sstable>> sstables);
future<> snapshot(sstring name);
const bool incremental_backups_enabled() const {
return _config.enable_incremental_backups;
}
void set_incremental_backups(bool val) {
_config.enable_incremental_backups = val;
}
lw_shared_ptr<sstable_list> get_sstables();
size_t sstables_count();
int64_t get_unleveled_sstables() const;
@@ -236,6 +250,15 @@ public:
return _stats;
}
template<typename Func, typename Result = futurize_t<std::result_of_t<Func()>>>
Result run_with_compaction_disabled(Func && func) {
++_compaction_disabled;
return _compaction_manager.remove(this).then(std::forward<Func>(func)).finally([this] {
if (--_compaction_disabled == 0) {
trigger_compaction();
}
});
}
private:
// One does not need to wait on this future if all we are interested in, is
// initiating the write. The writes initiated here will eventually
@@ -345,6 +368,9 @@ public:
void add_column_family(const schema_ptr& s) {
_cf_meta_data.emplace(s->cf_name(), s);
}
void remove_column_family(const schema_ptr& s) {
_cf_meta_data.erase(s->cf_name());
}
friend std::ostream& operator<<(std::ostream& os, const keyspace_metadata& m);
};
@@ -356,6 +382,7 @@ public:
bool enable_disk_reads = true;
bool enable_disk_writes = true;
bool enable_cache = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
};
@@ -384,6 +411,14 @@ public:
// FIXME to allow simple registration at boostrap
void set_replication_strategy(std::unique_ptr<locator::abstract_replication_strategy> replication_strategy);
const bool incremental_backups_enabled() const {
return _config.enable_incremental_backups;
}
void set_incremental_backups(bool val) {
_config.enable_incremental_backups = val;
}
const sstring& datadir() const {
return _config.datadir;
}
@@ -393,12 +428,13 @@ private:
class no_such_keyspace : public std::runtime_error {
public:
using runtime_error::runtime_error;
no_such_keyspace(const sstring& ks_name);
};
class no_such_column_family : public std::runtime_error {
public:
using runtime_error::runtime_error;
no_such_column_family(const utils::UUID& uuid);
no_such_column_family(const sstring& ks_name, const sstring& cf_name);
};
// Policy for distributed<database>:
@@ -463,7 +499,7 @@ public:
void add_column_family(schema_ptr schema, column_family::config cfg);
future<> update_column_family(const sstring& ks_name, const sstring& cf_name);
void drop_column_family(const sstring& ks_name, const sstring& cf_name);
future<> drop_column_family(db_clock::time_point changed_at, const sstring& ks_name, const sstring& cf_name);
/* throws std::out_of_range if missing */
const utils::UUID& find_uuid(const sstring& ks, const sstring& cf) const throw (std::out_of_range);
@@ -507,9 +543,19 @@ public:
const std::unordered_map<sstring, keyspace>& get_keyspaces() const {
return _keyspaces;
}
std::unordered_map<sstring, keyspace>& get_keyspaces() {
return _keyspaces;
}
const std::unordered_map<utils::UUID, lw_shared_ptr<column_family>>& get_column_families() const {
return _column_families;
}
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>>& get_column_families() {
return _column_families;
}
const std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash>&
get_column_families_mapping() const {
return _ks_cf_to_uuid;
@@ -520,6 +566,9 @@ public:
}
future<> flush_all_memtables();
/** Truncates the given column family */
future<> truncate(db_clock::time_point truncated_at, sstring ksname, sstring cfname);
future<> truncate(db_clock::time_point truncated_at, const keyspace& ks, column_family& cf);
const logalloc::region_group& dirty_memory_region_group() const {
return _dirty_memory_region_group;


@@ -39,8 +39,8 @@
*/
#include <chrono>
#include <core/future-util.hh>
#include <core/do_with.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/do_with.hh>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/sliced.hpp>
@@ -57,7 +57,7 @@
#include "db/config.hh"
#include "gms/failure_detector.hh"
static logging::logger logger("BatchLog Manager");
static logging::logger logger("batchlog_manager");
const uint32_t db::batchlog_manager::replay_interval;
const uint32_t db::batchlog_manager::page_size;
@@ -68,21 +68,37 @@ db::batchlog_manager::batchlog_manager(cql3::query_processor& qp)
{}
future<> db::batchlog_manager::start() {
_timer.set_callback(
std::bind(&batchlog_manager::replay_all_failed_batches, this));
_timer.arm(
lowres_clock::now()
+ std::chrono::milliseconds(
service::storage_service::RING_DELAY),
std::experimental::optional<lowres_clock::duration> {
std::chrono::milliseconds(replay_interval) });
// Since replay is a "node global" operation, we should not attempt to
// do it in parallel on each shard. It will just overlap/interfere.
// Could just run this on cpu 0 or so, but since this _could_ be a
// lengthy operation, we'll round-robin it between shards just in case...
if (smp::main_thread()) {
auto cpu = engine().cpu_id();
_timer.set_callback(
[this, cpu]() mutable {
auto dest = (cpu++ % smp::count);
return smp::submit_to(dest, [] {
return get_local_batchlog_manager().replay_all_failed_batches();
}).handle_exception([](auto ep) {
logger.error("Exception in batch replay: {}", ep);
}).finally([this] {
_timer.arm(lowres_clock::now()
+ std::chrono::milliseconds(replay_interval)
);
});
});
_timer.arm(
lowres_clock::now()
+ std::chrono::milliseconds(
service::storage_service::RING_DELAY));
}
return make_ready_future<>();
}
future<> db::batchlog_manager::stop() {
_stop = true;
_timer.cancel();
return _sem.wait(std::chrono::milliseconds(60));
return _gate.close();
}
future<size_t> db::batchlog_manager::count_all_batches() const {
@@ -98,7 +114,7 @@ mutation db::batchlog_manager::get_batch_log_mutation_for(const std::vector<muta
mutation db::batchlog_manager::get_batch_log_mutation_for(const std::vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now) {
auto schema = _qp.db().local().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
auto key = partition_key::from_exploded(*schema, {uuid_type->decompose(id)});
auto key = partition_key::from_singular(*schema, id);
auto timestamp = db_clock::now_in_usecs();
auto data = [this, &mutations] {
std::vector<frozen_mutation> fm(mutations.begin(), mutations.end());
@@ -164,7 +180,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
}
auto& fm = fms->front();
auto mid = fm.column_family_id();
return system_keyspace::get_truncated_at(_qp, mid).then([this, &fm, written_at, mutations](db_clock::time_point t) {
return system_keyspace::get_truncated_at(mid).then([this, &fm, written_at, mutations](db_clock::time_point t) {
auto schema = _qp.db().local().find_schema(fm.column_family_id());
if (written_at > t) {
auto schema = _qp.db().local().find_schema(fm.column_family_id());
@@ -206,7 +222,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
}).then([this, id] {
// delete batch
auto schema = _qp.db().local().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
auto key = partition_key::from_exploded(*schema, {uuid_type->decompose(id)});
auto key = partition_key::from_singular(*schema, id);
mutation m(key, schema);
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
m.partition().apply_delete(*schema, {}, tombstone(now, gc_clock::now()));
@@ -214,8 +230,8 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
});
};
return _sem.wait().then([this, batch = std::move(batch)] {
logger.debug("Started replayAllFailedBatches");
return seastar::with_gate(_gate, [this, batch = std::move(batch)] {
logger.debug("Started replayAllFailedBatches (cpu {})", engine().cpu_id());
typedef ::shared_ptr<cql3::untyped_result_set> page_ptr;
sstring query = sprint("SELECT id, data, written_at, version FROM %s.%s LIMIT %d", system_keyspace::NAME, system_keyspace::BATCHLOG, page_size);
@@ -257,8 +273,6 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
}).then([this] {
logger.debug("Finished replayAllFailedBatches");
});
}).finally([this] {
_sem.signal();
});
}


@@ -42,9 +42,11 @@
#pragma once
#include <unordered_map>
#include "core/future.hh"
#include "core/distributed.hh"
#include "core/timer.hh"
#include <seastar/core/future.hh>
#include <seastar/core/distributed.hh>
#include <seastar/core/timer.hh>
#include <seastar/core/gate.hh>
#include "cql3/query_processor.hh"
#include "gms/inet_address.hh"
#include "db_clock.hh"
@@ -61,7 +63,7 @@ private:
size_t _total_batches_replayed = 0;
cql3::query_processor& _qp;
timer<clock_type> _timer;
semaphore _sem;
seastar::gate _gate;
bool _stop = false;
std::random_device _rd;


@@ -193,7 +193,7 @@ public:
cfg.commit_log_location = "/var/lib/scylla/commitlog";
}
logger.trace("Commitlog maximum disk size: {} MB / cpu ({} cpus)",
max_disk_size / (1024*1024));
max_disk_size / (1024*1024), smp::count);
_regs = create_counters();
}
@@ -204,6 +204,8 @@ public:
future<> init();
future<sseg_ptr> new_segment();
future<sseg_ptr> active_segment();
future<sseg_ptr> allocate_segment(bool active);
future<> clear();
future<> sync_all_segments();
future<> shutdown();
@@ -213,9 +215,10 @@ public:
void discard_unused_segments();
void discard_completed_segments(const cf_id_type& id,
const replay_position& pos);
void on_timer();
void sync();
void arm() {
_timer.arm(std::chrono::milliseconds(cfg.commitlog_sync_period_in_ms));
void arm(uint32_t extra = 0) {
_timer.arm(std::chrono::milliseconds(cfg.commitlog_sync_period_in_ms + extra));
}
std::vector<sstring> get_active_names() const;
@@ -241,11 +244,21 @@ public:
private:
segment_id_type _ids = 0;
std::vector<sseg_ptr> _segments;
std::deque<sseg_ptr> _reserve_segments;
std::vector<buffer_type> _temp_buffers;
std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
flush_handler_id _flush_ids = 0;
replay_position _flush_position;
timer<clock_type> _timer;
size_t _reserve_allocating = 0;
// # segments to try to keep available in reserve
// i.e. the number of segments we expect to consume in between timer
// callbacks.
// The idea is that since the files are 0 len at start, and thus cost little,
// it is easier to adapt this value compared to timer freq.
size_t _num_reserve_segments = 0;
seastar::gate _gate;
uint64_t _new_counter = 0;
};
/*
@@ -296,12 +309,12 @@ public:
// TODO : tune initial / default size
static constexpr size_t default_size = align_up<size_t>(128 * 1024, alignment);
segment(segment_manager* m, const descriptor& d, file && f)
segment(segment_manager* m, const descriptor& d, file && f, bool active)
: _segment_manager(m), _desc(std::move(d)), _file(std::move(f)), _sync_time(
clock_type::now())
{
++_segment_manager->totals.segments_created;
logger.debug("Created new segment {}", *this);
logger.debug("Created new {} segment {}", active ? "active" : "reserve", *this);
}
~segment() {
if (is_clean()) {
@@ -324,7 +337,7 @@ public:
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
now - _sync_time).count();
if ((_segment_manager->cfg.commitlog_sync_period_in_ms * 2) < uint64_t(ms)) {
logger.debug("Need sync. {} ms elapsed", ms);
logger.debug("{} needs sync. {} ms elapsed", *this, ms);
return true;
}
return false;
@@ -337,12 +350,16 @@ public:
sync();
return _segment_manager->active_segment();
}
void reset_sync_time() {
_sync_time = clock_type::now();
}
future<sseg_ptr> sync() {
// Note: this is not a marker for when sync was finished.
// It is when it was initiated
_sync_time = clock_type::now();
reset_sync_time();
if (position() <= _flush_pos) {
logger.trace("Sync not needed : ({} / {})", position(), _flush_pos);
logger.trace("Sync not needed {}: ({} / {})", *this, position(), _flush_pos);
return make_ready_future<sseg_ptr>(shared_from_this());
}
return cycle().then([](auto seg) {
@@ -358,7 +375,7 @@ public:
pos = _file_pos;
}
if (pos != 0 && pos <= _flush_pos) {
logger.trace("Already synced! ({} < {})", pos, _flush_pos);
logger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
return make_ready_future<sseg_ptr>(std::move(me));
}
logger.trace("Syncing {} -> {}", _flush_pos, pos);
@@ -370,7 +387,7 @@ public:
_dwrite.write_unlock(); // release it already.
pos = std::max(pos, _file_pos);
if (pos <= _flush_pos) {
logger.trace("Already synced! ({} < {})", pos, _flush_pos);
logger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
return make_ready_future<sseg_ptr>(std::move(me));
}
++_segment_manager->totals.pending_operations;
@@ -389,7 +406,7 @@ public:
}).then([this, pos, me = std::move(me)]() {
_flush_pos = std::max(pos, _flush_pos);
++_segment_manager->totals.flush_count;
logger.trace("Synced to {}", _flush_pos);
logger.trace("{} synced to {}", *this, _flush_pos);
return make_ready_future<sseg_ptr>(std::move(me));
}).finally([this] {
--_segment_manager->totals.pending_operations;
@@ -488,16 +505,13 @@ public:
}
// gah, partial write. should always get here with dma chunk sized
// "bytes", but lets make sure...
logger.debug("Partial write: {}/{} bytes", *written, size);
logger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
*written = align_down(*written, alignment);
return make_ready_future<stop_iteration>(stop_iteration::no);
// TODO: retry/ignore/fail/stop - optional behaviour in origin.
// we fast-fail the whole commit.
} catch (std::exception& e) {
logger.error("Failed to persist commits to disk: {}", e.what());
throw;
} catch (...) {
logger.error("Failed to persist commits to disk.");
logger.error("Failed to persist commits to disk for {}: {}", *this, std::current_exception());
throw;
}
});
@@ -688,11 +702,11 @@ future<> db::commitlog::segment_manager::init() {
// base id counter is [ <shard> | <base> ]
_ids = replay_position(engine().cpu_id(), id).id;
if (cfg.mode != sync_mode::BATCH) {
_timer.set_callback(std::bind(&segment_manager::sync, this));
this->arm();
}
// always run the timer now, since we need to handle segment pre-alloc etc as well.
_timer.set_callback(std::bind(&segment_manager::on_timer, this));
auto delay = engine().cpu_id() * std::ceil(double(cfg.commitlog_sync_period_in_ms) / smp::count);
logger.trace("Delaying timer loop {} ms", delay);
this->arm(delay);
});
}
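The `init()` hunk above staggers the first timer tick per cpu: each shard delays by `cpu_id * ceil(sync_period / smp::count)` milliseconds so all shards do not hit the disk at the same instant. A minimal sketch of that arithmetic, with illustrative values (the function name `first_tick_delay_ms` is not from the patch):

```cpp
#include <cmath>
#include <cstdint>

// Stagger computed as in segment_manager::init(): spread the first tick of
// each shard's timer evenly across one sync period.
static uint64_t first_tick_delay_ms(unsigned cpu_id, uint64_t period_ms, unsigned cpu_count) {
    return cpu_id * static_cast<uint64_t>(std::ceil(double(period_ms) / cpu_count));
}
```

With the default 10 s sync period on 8 cpus, shard 0 starts immediately, shard 3 after 3750 ms, shard 7 after 8750 ms.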
@@ -803,22 +817,37 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
}
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
descriptor d(next_id());
return engine().open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
auto s = make_lw_shared<segment>(this, d, std::move(f), active);
return make_ready_future<sseg_ptr>(s);
});
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
if (_shutdown) {
throw std::runtime_error("Commitlog has been shut down. Cannot add data");
}
descriptor d(next_id());
return engine().open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d](file f) {
_segments.emplace_back(make_lw_shared<segment>(this, d, std::move(f)));
auto max = max_disk_size;
auto cur = totals.total_size_on_disk;
if (max != 0 && cur >= max) {
logger.debug("Size on disk {} MB exceeds local maximum {} MB", cur / (1024 * 1024), max / (1024 * 1024));
flush_segments();
++_new_counter;
if (_reserve_segments.empty()) {
if (_num_reserve_segments < cfg.max_reserve_segments) {
++_num_reserve_segments;
logger.trace("Increased segment reserve count to {}", _num_reserve_segments);
}
}).then([this] {
return make_ready_future<sseg_ptr>(_segments.back());
});
return allocate_segment(true).then([this](sseg_ptr s) {
_segments.push_back(s);
return make_ready_future<sseg_ptr>(s);
});
}
_segments.push_back(_reserve_segments.front());
_reserve_segments.pop_front();
_segments.back()->reset_sync_time();
logger.trace("Acquired segment {} from reserve", _segments.back());
return make_ready_future<sseg_ptr>(_segments.back());
}
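The reworked `new_segment()` above prefers a pre-allocated segment from `_reserve_segments` and only allocates directly (bumping `_num_reserve_segments`) when the reserve has run dry. A minimal single-threaded sketch of that pattern, with simplified types — `segment_pool`, `target_reserve`, and the bare `segment` struct are illustrative stand-ins, not the patch's API:

```cpp
#include <deque>
#include <memory>
#include <vector>

struct segment { int id; };               // stand-in for the commitlog segment
using sseg_ptr = std::shared_ptr<segment>;

struct segment_pool {
    std::deque<sseg_ptr> reserve;         // pre-allocated, zero-length segments
    std::vector<sseg_ptr> active;
    size_t target_reserve = 0;            // grown on a dry hit, like _num_reserve_segments
    int next_id = 0;

    sseg_ptr new_segment() {
        if (reserve.empty()) {
            ++target_reserve;             // adapt reserve size rather than timer frequency
            active.push_back(std::make_shared<segment>(segment{next_id++}));
        } else {
            active.push_back(reserve.front());
            reserve.pop_front();
        }
        return active.back();
    }
    // Plays the on_timer() role: top the reserve back up between ticks.
    void replenish() {
        while (reserve.size() < target_reserve) {
            reserve.push_back(std::make_shared<segment>(segment{next_id++}));
        }
    }
};
```

Since empty files cost little, growing the reserve count on demand is cheaper than tuning the timer period.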
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment() {
@@ -841,7 +870,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
*/
void db::commitlog::segment_manager::discard_completed_segments(
const cf_id_type& id, const replay_position& pos) {
logger.debug("discard completed log segments for {}, table {}", pos, id);
logger.debug("Discard completed segments for {}, table {}", pos, id);
for (auto&s : _segments) {
s->mark_clean(id, pos);
}
@@ -849,7 +878,7 @@ void db::commitlog::segment_manager::discard_completed_segments(
}
std::ostream& db::operator<<(std::ostream& out, const db::commitlog::segment& s) {
return out << "commit log segment (" << s._desc.filename() << ")";
return out << s._desc.filename();
}
std::ostream& db::operator<<(std::ostream& out, const db::commitlog::segment::cf_mark& m) {
@@ -863,10 +892,14 @@ std::ostream& db::operator<<(std::ostream& out, const db::replay_position& p) {
void db::commitlog::segment_manager::discard_unused_segments() {
auto i = std::remove_if(_segments.begin(), _segments.end(), [=](auto& s) {
if (s->is_unused()) {
logger.debug("{} is unused", *s);
logger.debug("Segment {} is unused", *s);
return true;
}
logger.debug("Not safe to delete {}; dirty is {}", s, segment::cf_mark {*s});
if (s->is_still_allocating()) {
logger.debug("Not safe to delete segment {}; still allocating.", s);
} else {
logger.debug("Not safe to delete segment {}; dirty is {}", s, segment::cf_mark {*s});
}
return false;
});
if (i != _segments.end()) {
@@ -878,16 +911,22 @@ future<> db::commitlog::segment_manager::sync_all_segments() {
logger.debug("Issuing sync for all segments");
return parallel_for_each(_segments, [this](sseg_ptr s) {
return s->sync().then([](sseg_ptr s) {
logger.debug("Synced {}", *s);
logger.debug("Synced segment {}", *s);
});
});
}
future<> db::commitlog::segment_manager::shutdown() {
_shutdown = true;
return parallel_for_each(_segments, [this](sseg_ptr s) {
return s->shutdown();
});
if (!_shutdown) {
_shutdown = true;
_timer.cancel();
return _gate.close().then([this] {
return parallel_for_each(_segments, [this](sseg_ptr s) {
return s->shutdown();
});
});
}
return make_ready_future<>();
}
@@ -898,6 +937,8 @@ future<> db::commitlog::segment_manager::shutdown() {
*/
future<> db::commitlog::segment_manager::clear() {
logger.debug("Clearing all segments");
_shutdown = true;
_timer.cancel();
flush_segments(true);
return sync_all_segments().then([this] {
for (auto& s : _segments) {
@@ -913,6 +954,51 @@ void db::commitlog::segment_manager::sync() {
for (auto& s : _segments) {
s->sync(); // we do not care about waiting...
}
}
void db::commitlog::segment_manager::on_timer() {
if (cfg.mode != sync_mode::BATCH) {
sync();
}
// IFF a new segment was put in use since last we checked, and we're
// above threshold, request flush.
if (_new_counter > 0) {
auto max = max_disk_size;
auto cur = totals.total_size_on_disk;
if (max != 0 && cur >= max) {
_new_counter = 0;
logger.debug("Size on disk {} MB exceeds local maximum {} MB", cur / (1024 * 1024), max / (1024 * 1024));
flush_segments();
}
}
// Gate, because we are starting potentially blocking ops
// without waiting for them, so segment_manager could be shut down
// while they are running.
seastar::with_gate(_gate, [this] {
// take outstanding allocations into account. This is paranoid,
// but if for some reason the file::open takes longer than timer period,
// we could flood the reserve list with new segments
auto n = _reserve_segments.size() + _reserve_allocating;
return parallel_for_each(boost::irange(n, _num_reserve_segments), [this, n](auto i) {
++_reserve_allocating;
return this->allocate_segment(false).then([this](sseg_ptr s) {
if (!_shutdown) {
// insertion sort.
auto i = std::upper_bound(_reserve_segments.begin(), _reserve_segments.end(), s, [](auto s1, auto s2) {
const descriptor& d1 = s1->_desc;
const descriptor& d2 = s2->_desc;
return d1.id < d2.id;
});
i = _reserve_segments.emplace(i, std::move(s));
logger.trace("Added reserve segment {}", *i);
}
}).finally([this] {
--_reserve_allocating;
});
});
}).handle_exception([](auto ep) {
logger.warn("Exception in segment reservation: {}", ep);
});
arm();
}
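Because reserve allocations in `on_timer()` can complete out of order, the patch inserts each finished segment into `_reserve_segments` with `std::upper_bound`, keeping the deque sorted by descriptor id. The insertion step can be sketched like this, with the element type simplified to a bare id (`add_sorted` is an illustrative name, not from the patch):

```cpp
#include <algorithm>
#include <deque>

// Insertion sort into an already-sorted deque: find the first element
// greater than `id` and insert before it, preserving ascending order.
static void add_sorted(std::deque<int>& reserve, int id) {
    auto i = std::upper_bound(reserve.begin(), reserve.end(), id);
    reserve.insert(i, id);
}
```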
@@ -944,6 +1030,7 @@ db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acqu
if (a == nullptr) {
throw std::bad_alloc();
}
logger.trace("Allocated {} k buffer", s / 1024);
return buffer_type(reinterpret_cast<char *>(a), s, make_free_deleter(a));
}
@@ -956,6 +1043,7 @@ void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
constexpr const size_t max_temp_buffers = 4;
if (_temp_buffers.size() > max_temp_buffers) {
logger.trace("Deleting {} buffers", _temp_buffers.size() - max_temp_buffers);
_temp_buffers.erase(_temp_buffers.begin() + max_temp_buffers, _temp_buffers.end());
}
totals.buffer_list_bytes = std::accumulate(_temp_buffers.begin(),
@@ -1104,7 +1192,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
}
future<> read_header() {
return fin.read_exactly(segment::descriptor_header_size).then([this](temporary_buffer<char> buf) {
advance(buf);
if (!advance(buf)) {
// zero length file. accept it just to be nice.
return make_ready_future<>();
}
// Will throw if we got eof
data_input in(buf);
auto ver = in.read<uint32_t>();
@@ -1124,9 +1215,6 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
this->id = id;
this->next = 0;
if (start_off > pos) {
return skip(start_off - pos);
}
return make_ready_future<>();
});
}
@@ -1154,6 +1242,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
this->next = next;
if (start_off >= next) {
return skip(next - pos);
}
return do_until(std::bind(&work::end_of_chunk, this), std::bind(&work::read_entry, this));
});
}
@@ -1181,6 +1273,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
throw std::runtime_error("Invalid entry size");
}
if (start_off > pos) {
return skip(size - entry_header_size);
}
return fin.read_exactly(size - entry_header_size).then([this, size, checksum, rp](temporary_buffer<char> buf) {
advance(buf);
@@ -1213,8 +1309,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
auto w = make_lw_shared<work>(std::move(f), off);
auto ret = w->s.listen(std::move(next));
w->s.started().then(std::bind(&work::read_file, w.get())).finally([w] {
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
w->s.close();
}).handle_exception([w](auto ep) {
w->s.set_exception(ep);
});
return ret;
@@ -1236,6 +1334,14 @@ uint64_t db::commitlog::get_pending_tasks() const {
return _segment_manager->totals.pending_operations;
}
uint64_t db::commitlog::get_num_segments_created() const {
return _segment_manager->totals.segments_created;
}
uint64_t db::commitlog::get_num_segments_destroyed() const {
return _segment_manager->totals.segments_destroyed;
}
future<std::vector<db::commitlog::descriptor>> db::commitlog::list_existing_descriptors() const {
return list_existing_descriptors(active_config().commit_log_location);
}


@@ -111,6 +111,9 @@ public:
uint64_t commitlog_total_space_in_mb = 0;
uint64_t commitlog_segment_size_in_mb = 32;
uint64_t commitlog_sync_period_in_ms = 10 * 1000; //TODO: verify default!
// Max number of segments to keep in pre-alloc reserve.
// Not (yet) configurable from scylla.conf.
uint64_t max_reserve_segments = 12;
sync_mode mode = sync_mode::PERIODIC;
};
@@ -229,6 +232,8 @@ public:
uint64_t get_total_size() const;
uint64_t get_completed_tasks() const;
uint64_t get_pending_tasks() const;
uint64_t get_num_segments_created() const;
uint64_t get_num_segments_destroyed() const;
/**
* Returns the largest amount of data that can be written in a single "mutation".


@@ -117,11 +117,16 @@ future<> db::commitlog_replayer::impl::init() {
logger.warn("Could not read sstable metadata {}", std::current_exception());
}
}
// TODO: this is not correct. Truncation does not fully take sharding into consideration
return db::system_keyspace::get_truncated_position(qp, uuid).then([&map, uuid](auto truncated_rp) {
if (truncated_rp != replay_position()) {
auto& pp = map[engine().cpu_id()][uuid];
pp = std::max(pp, truncated_rp);
// We do this on each cpu, for each CF, which technically is a little wasteful, but the values are
// cached, this is only startup, and it makes the code easier.
// Get all truncation records for the CF and initialize max rps if
// present. Cannot do this on demand, as there may be no sstables to
// mark the CF as "needed".
return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
for (auto& p : tpps) {
logger.trace("CF {} truncated at {}", uuid, p);
auto& pp = map[p.shard_id()][uuid];
pp = std::max(pp, p);
}
});
}).then([&map] {
@@ -183,8 +188,8 @@ future<> db::commitlog_replayer::impl::process(stats* s, temporary_buffer<char>
auto uuid = fm.column_family_id();
auto& map = _rpm[shard];
auto i = map.find(uuid);
if (i != map.end() && rp < i->second) {
logger.trace("entry {} at {} is less than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
if (i != map.end() && rp <= i->second) {
logger.trace("entry {} at {} is younger than recorded replay position {}. skipping", fm.column_family_id(), rp, i->second);
s->skipped_mutations++;
return make_ready_future<>();
}
@@ -248,7 +253,10 @@ future<> db::commitlog_replayer::recover(std::vector<sstring> files) {
logger.info("Replaying {}", files);
return parallel_for_each(files, [this](auto f) {
return this->recover(std::move(f));
return this->recover(f).handle_exception([f](auto ep) {
logger.error("Error recovering {}: {}", f, ep);
std::rethrow_exception(ep);
});
});
}


@@ -71,6 +71,9 @@ struct replay_position {
bool operator<(const replay_position & r) const {
return id < r.id ? true : (r.id < id ? false : pos < r.pos);
}
bool operator<=(const replay_position & r) const {
return !(r < *this);
}
bool operator==(const replay_position & r) const {
return id == r.id && pos == r.pos;
}
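The new `operator<=` above is derived from the existing lexicographic `operator<` (compare `id`, then `pos`), so only one comparison is hand-written; it is what lets the replayer skip entries at exactly the recorded truncation point (`rp <= i->second`). A self-contained sketch mirroring the operators shown in the hunk:

```cpp
#include <cstdint>

// Mirrors db::replay_position's comparisons: order by id, then by pos.
struct replay_position {
    uint64_t id;
    uint32_t pos;
    bool operator<(const replay_position& r) const {
        return id < r.id ? true : (r.id < id ? false : pos < r.pos);
    }
    bool operator<=(const replay_position& r) const {
        return !(r < *this);   // derived: a <= b  iff  !(b < a)
    }
    bool operator==(const replay_position& r) const {
        return id == r.id && pos == r.pos;
    }
};
```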


@@ -407,7 +407,7 @@ public:
"The port for inter-node communication." \
) \
/* Advanced automatic backup setting */ \
val(auto_snapshot, bool, true, Unused, \
val(auto_snapshot, bool, true, Used, \
"Enable or disable whether a snapshot is taken of the data before keyspace truncation or dropping of tables. To prevent data loss, using the default setting is strongly advised. If you set to false, you will lose data on truncation or drop." \
) \
/* Key caches and global row properties */ \


@@ -55,6 +55,9 @@ struct query_context {
api::timestamp_type next_timestamp() {
return _qp.local().next_timestamp();
}
cql3::query_processor& qp() {
return _qp.local();
}
};
// This does not have to be thread local, because all cores will share the same context.

File diff suppressed because it is too large.


@@ -87,6 +87,8 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
std::vector<mutation> make_drop_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp);
lw_shared_ptr<keyspace_metadata> create_keyspace_from_schema_partition(const schema_result::value_type& partition);
future<> merge_tables(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);
@@ -100,7 +102,9 @@ std::vector<mutation> make_create_table_mutations(lw_shared_ptr<keyspace_metadat
future<std::map<sstring, schema_ptr>> create_tables_from_tables_partition(distributed<service::storage_proxy>& proxy, const schema_result::mapped_type& result);
void add_table_to_schema_mutation(schema_ptr table, api::timestamp_type timestamp, bool with_columns_and_triggers, const partition_key& pkey, std::vector<mutation>& mutations);
std::vector<mutation> make_drop_table_mutations(lw_shared_ptr<keyspace_metadata> keyspace, schema_ptr table, api::timestamp_type timestamp);
future<schema_ptr> create_table_from_name(distributed<service::storage_proxy>& proxy, const sstring& keyspace, const sstring& table);
future<schema_ptr> create_table_from_table_row(distributed<service::storage_proxy>& proxy, const query::result_set_row& row);
@@ -109,6 +113,8 @@ void create_table_from_table_row_and_column_rows(schema_builder& builder, const
future<schema_ptr> create_table_from_table_partition(distributed<service::storage_proxy>& proxy, lw_shared_ptr<query::result_set>&& partition);
void drop_column_from_schema_mutation(schema_ptr table, const column_definition& column, long timestamp, std::vector<mutation>& mutations);
std::vector<column_definition> create_columns_from_column_rows(const schema_result::mapped_type& rows,
const sstring& keyspace,
const sstring& table,/*,


@@ -40,6 +40,8 @@
#include <boost/range/algorithm_ext/push_back.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <boost/range/adaptor/map.hpp>
#include "system_keyspace.hh"
#include "types.hh"
@@ -50,6 +52,7 @@
#include "cql3/query_options.hh"
#include "cql3/query_processor.hh"
#include "utils/fb_utilities.hh"
#include "utils/hash.hh"
#include "dht/i_partitioner.hh"
#include "version.hh"
#include "thrift/server.hh"
@@ -482,10 +485,14 @@ future<> init_local_cache() {
});
}
future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp) {
void minimal_setup(distributed<database>& db, distributed<cql3::query_processor>& qp) {
auto new_ctx = std::make_unique<query_context>(db, qp);
qctx.swap(new_ctx);
assert(!new_ctx);
}
future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp) {
minimal_setup(db, qp);
return setup_version().then([&db] {
return update_schema_version(db.local().get_version());
}).then([] {
@@ -499,24 +506,40 @@ future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp
}).then([] {
return db::schema_tables::save_system_keyspace_schema();
});
return make_ready_future<>();
}
typedef std::pair<db::replay_position, db_clock::time_point> truncation_entry;
typedef std::unordered_map<utils::UUID, truncation_entry> truncation_map;
typedef std::pair<replay_positions, db_clock::time_point> truncation_entry;
typedef utils::UUID truncation_key;
typedef std::unordered_map<truncation_key, truncation_entry> truncation_map;
static thread_local std::experimental::optional<truncation_map> truncation_records;
future<> save_truncation_record(cql3::query_processor& qp, const column_family& cf, db_clock::time_point truncated_at, const db::replay_position& rp) {
db::serializer<replay_position> rps(rp);
bytes buf(bytes::initialized_later(), sizeof(db_clock::rep) + rps.size());
future<> save_truncation_records(const column_family& cf, db_clock::time_point truncated_at, replay_positions positions) {
auto size =
sizeof(db_clock::rep)
+ positions.size()
* db::serializer<replay_position>(
db::replay_position()).size();
bytes buf(bytes::initialized_later(), size);
data_output out(buf);
rps(out);
// Old version would write a single RP. We write N. Resulting blob size
// will determine how many.
// An external entity reading this blob would get a "correct" RP
// and a garbled time stamp. But an external entity has no business
// reading this data anyway, since it is meaningless outside this
// machine instance.
for (auto& rp : positions) {
db::serializer<replay_position>::write(out, rp);
}
out.write<db_clock::rep>(truncated_at.time_since_epoch().count());
map_type_impl::native_type tmp;
tmp.emplace_back(boost::any{ cf.schema()->id() }, boost::any{ buf });
sstring req = sprint("UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'", LOCAL, LOCAL);
return qp.execute_internal(req, {tmp}).then([&qp](auto rs) {
return qctx->qp().execute_internal(req, {tmp}).then([](auto rs) {
truncation_records = {};
return force_blocking_flush(LOCAL);
});
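As the comment in `save_truncation_records()` notes, the blob is N fixed-size replay positions followed by one timestamp, and the reader recovers N from the blob size alone, consuming positions while more than a timestamp's worth of bytes remains. A minimal sketch of that layout under simplified types — the raw-`memcpy` encoding and the names `encode`/`decode` are illustrative, not the patch's `db::serializer`:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

struct replay_position { uint32_t shard; uint64_t pos; };  // fixed-size record

// Layout: [rp0][rp1]...[rpN-1][truncated_at]; N is implied by total length.
static std::vector<char> encode(const std::vector<replay_position>& rps, int64_t truncated_at) {
    std::vector<char> buf;
    for (auto& rp : rps) {
        const char* p = reinterpret_cast<const char*>(&rp);
        buf.insert(buf.end(), p, p + sizeof(rp));
    }
    const char* t = reinterpret_cast<const char*>(&truncated_at);
    buf.insert(buf.end(), t, t + sizeof(truncated_at));
    return buf;
}

static std::vector<replay_position> decode(const std::vector<char>& buf, int64_t& truncated_at) {
    std::vector<replay_position> rps;
    size_t off = 0;
    // Keep reading positions while more than a timestamp remains,
    // as get_truncation_record() does with data_input::avail().
    while (buf.size() - off > sizeof(int64_t)) {
        replay_position rp;
        std::memcpy(&rp, buf.data() + off, sizeof(rp));
        rps.push_back(rp);
        off += sizeof(rp);
    }
    std::memcpy(&truncated_at, buf.data() + off, sizeof(truncated_at));
    return rps;
}
```

This is why an old single-position blob still decodes as a one-element vector, while an old reader pointed at a new blob would see one "correct" position and garbage for the timestamp, as the comment warns.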
@@ -525,49 +548,84 @@ future<> save_truncation_record(cql3::query_processor& qp, const column_family&
/**
* This method is used to remove information about truncation time for specified column family
*/
future<> remove_truncation_record(cql3::query_processor& qp, utils::UUID id) {
future<> remove_truncation_record(utils::UUID id) {
sstring req = sprint("DELETE truncated_at[?] from system.%s WHERE key = '%s'", LOCAL, LOCAL);
return qp.execute_internal(req, {id}).then([&qp](auto rs) {
return qctx->qp().execute_internal(req, {id}).then([](auto rs) {
truncation_records = {};
return force_blocking_flush(LOCAL);
});
}
static future<truncation_entry> get_truncation_record(cql3::query_processor& qp, utils::UUID cf_id) {
static future<truncation_entry> get_truncation_record(utils::UUID cf_id) {
if (!truncation_records) {
sstring req = sprint("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL, LOCAL);
return qp.execute_internal(req).then([&qp, cf_id](::shared_ptr<cql3::untyped_result_set> rs) {
return qctx->qp().execute_internal(req).then([cf_id](::shared_ptr<cql3::untyped_result_set> rs) {
truncation_map tmp;
if (!rs->empty() && rs->one().has("truncated_set")) {
if (!rs->empty() && rs->one().has("truncated_at")) {
auto map = rs->one().get_map<utils::UUID, bytes>("truncated_at");
for (auto& p : map) {
auto uuid = p.first;
auto buf = p.second;
truncation_entry e;
data_input in(p.second);
e.first = db::serializer<replay_position>::read(in);
data_input in(buf);
while (in.avail() > sizeof(db_clock::rep)) {
e.first.emplace_back(db::serializer<replay_position>::read(in));
}
e.second = db_clock::time_point(db_clock::duration(in.read<db_clock::rep>()));
tmp[p.first] = e;
tmp[uuid] = e;
}
}
truncation_records = std::move(tmp);
return get_truncation_record(qp, cf_id);
return get_truncation_record(cf_id);
});
}
return make_ready_future<truncation_entry>((*truncation_records)[cf_id]);
}
future<db::replay_position> get_truncated_position(cql3::query_processor& qp, utils::UUID cf_id) {
return get_truncation_record(qp, cf_id).then([](truncation_entry e) {
return make_ready_future<db::replay_position>(e.first);
future<> save_truncation_record(const column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
// TODO: this is horribly ineffective, we're doing a full flush of all system tables for all cores
// once, for each core (calling us). But right now, redesigning so that calling here (or, rather,
// save_truncation_records), is done from "somewhere higher, once per machine, not shard" is tricky.
// Mainly because drop_tables also uses truncate. And is run per-core as well. Gah.
return get_truncated_position(cf.schema()->id()).then([&cf, truncated_at, rp](replay_positions positions) {
auto i = std::find_if(positions.begin(), positions.end(), [rp](auto& p) {
return p.shard_id() == rp.shard_id();
});
if (i == positions.end()) {
positions.emplace_back(rp);
} else {
*i = rp;
}
return save_truncation_records(cf, truncated_at, positions);
});
}
future<db_clock::time_point> get_truncated_at(cql3::query_processor& qp, utils::UUID cf_id) {
return get_truncation_record(qp, cf_id).then([](truncation_entry e) {
future<db::replay_position> get_truncated_position(utils::UUID cf_id, uint32_t shard) {
return get_truncated_position(std::move(cf_id)).then([shard](replay_positions positions) {
for (auto& rp : positions) {
if (shard == rp.shard_id()) {
return make_ready_future<db::replay_position>(rp);
}
}
return make_ready_future<db::replay_position>();
});
}
future<replay_positions> get_truncated_position(utils::UUID cf_id) {
return get_truncation_record(cf_id).then([](truncation_entry e) {
return make_ready_future<replay_positions>(e.first);
});
}
future<db_clock::time_point> get_truncated_at(utils::UUID cf_id) {
return get_truncation_record(cf_id).then([](truncation_entry e) {
return make_ready_future<db_clock::time_point>(e.second);
});
}
set_type_impl::native_type prepare_tokens(std::unordered_set<dht::token>& tokens) {
set_type_impl::native_type tset;
for (auto& t: tokens) {

View File

@@ -84,6 +84,9 @@ extern schema_ptr hints();
extern schema_ptr batchlog();
extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
// Only for testing.
void minimal_setup(distributed<database>& db, distributed<cql3::query_processor>& qp);
future<> init_local_cache();
future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp);
future<> update_schema_version(utils::UUID version);
@@ -274,10 +277,14 @@ enum class bootstrap_state {
return CompactionHistoryTabularData.from(queryResultSet);
}
#endif
future<> save_truncation_record(cql3::query_processor&, const column_family&, db_clock::time_point truncated_at, const db::replay_position&);
future<> remove_truncation_record(cql3::query_processor&, utils::UUID);
future<db::replay_position> get_truncated_position(cql3::query_processor&, utils::UUID);
future<db_clock::time_point> get_truncated_at(cql3::query_processor&, utils::UUID);
typedef std::vector<db::replay_position> replay_positions;
future<> save_truncation_record(const column_family&, db_clock::time_point truncated_at, db::replay_position);
future<> save_truncation_records(const column_family&, db_clock::time_point truncated_at, replay_positions);
future<> remove_truncation_record(utils::UUID);
future<replay_positions> get_truncated_position(utils::UUID);
future<db::replay_position> get_truncated_position(utils::UUID, uint32_t shard);
future<db_clock::time_point> get_truncated_at(utils::UUID);
#if 0


@@ -4,7 +4,7 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
ADD scylla.repo /etc/yum.repos.d/
RUN dnf -y update
RUN dnf -y install scylla-server
RUN dnf -y install scylla-server hostname
RUN dnf clean all
ADD start-scylla /start-scylla
RUN chown scylla /start-scylla


@@ -37,6 +37,6 @@ if [ "$OS" = "Fedora" ]; then
rpmbuild -bs --define "_topdir $RPMBUILD" $RPMBUILD/SPECS/scylla-server.spec
mock rebuild --resultdir=`pwd`/build/rpms $RPMBUILD/SRPMS/scylla-server-$VERSION*.src.rpm
else
sudo yum-builddep $RPMBUILD/SPECS/scylla-server.spec
sudo yum-builddep -y $RPMBUILD/SPECS/scylla-server.spec
rpmbuild -ba --define "_topdir $RPMBUILD" $RPMBUILD/SPECS/scylla-server.spec
fi


@@ -1,55 +1,111 @@
#!/bin/sh -e
export RPMBUILD=`pwd`/build/rpmbuild
do_install()
{
pkg=$1
sudo yum install -y $RPMBUILD/RPMS/*/$pkg 2> build/err || if [ "`cat build/err`" != "Error: Nothing to do" ]; then cat build/err; exit 1;fi
echo Install $pkg done
}
sudo yum install -y wget yum-utils rpm-build rpmdevtools gcc gcc-c++ make patch
mkdir -p build/srpms
cd build/srpms
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/b/boost-1.57.0-6.fc22.src.rpm
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/n/ninja-build-1.5.3-2.fc22.src.rpm
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/r/ragel-6.8-3.fc22.src.rpm
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/r/re2c-0.13.5-9.fc22.src.rpm
if [ ! -f binutils-2.25-5.fc22.src.rpm ]; then
wget http://ftp.riken.jp/Linux/fedora/releases/22/Everything/source/SRPMS/b/binutils-2.25-5.fc22.src.rpm
fi
if [ ! -f isl-0.14-3.fc22.src.rpm ]; then
wget http://ftp.riken.jp/Linux/fedora/releases/22/Everything/source/SRPMS/i/isl-0.14-3.fc22.src.rpm
fi
if [ ! -f gcc-5.1.1-4.fc22.src.rpm ]; then
wget http://ftp.riken.jp/Linux/fedora/updates/22/SRPMS/g/gcc-5.1.1-4.fc22.src.rpm
fi
if [ ! -f boost-1.57.0-6.fc22.src.rpm ]; then
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/b/boost-1.57.0-6.fc22.src.rpm
fi
if [ ! -f ninja-build-1.5.3-2.fc22.src.rpm ]; then
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/n/ninja-build-1.5.3-2.fc22.src.rpm
fi
if [ ! -f ragel-6.8-3.fc22.src.rpm ]; then
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/r/ragel-6.8-3.fc22.src.rpm
fi
if [ ! -f re2c-0.13.5-9.fc22.src.rpm ]; then
wget http://download.fedoraproject.org/pub/fedora/linux/releases/22/Everything/source/SRPMS/r/re2c-0.13.5-9.fc22.src.rpm
fi
cd -
sudo yum install -y epel-release
sudo yum install -y cryptopp cryptopp-devel jsoncpp jsoncpp-devel lz4 lz4-devel yaml-cpp yaml-cpp-devel thrift thrift-devel scons gtest gtest-devel python34
sudo ln -sf /usr/bin/python3.4 /usr/bin/python3
sudo yum install -y scl-utils
sudo yum install -y https://www.softwarecollections.org/en/scls/rhscl/devtoolset-3/epel-7-x86_64/download/rhscl-devtoolset-3-epel-7-x86_64.noarch.rpm
sudo yum install -y devtoolset-3-gcc-c++
sudo yum install -y python-devel libicu-devel openmpi-devel mpich-devel libstdc++-devel bzip2-devel zlib-devel
rpmbuild --define "_topdir $RPMBUILD" --without python3 --rebuild build/srpms/boost-1.57.0-6.fc22.src.rpm
sudo yum install -y `ls $RPMBUILD/RPMS/x86_64/boost*|grep -v debuginfo`
rpmbuild --define "_topdir $RPMBUILD" --rebuild build/srpms/re2c-0.13.5-9.fc22.src.rpm
sudo yum install -y $RPMBUILD/RPMS/x86_64/re2c-0.13.5-9.el7.centos.x86_64.rpm
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.5.3-2.fc22.src.rpm
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
sudo yum install -y $RPMBUILD/RPMS/x86_64/ninja-build-1.5.3-2.el7.centos.x86_64.rpm
sudo yum install -y flex bison dejagnu zlib-static glibc-static sharutils bc libstdc++-static gmp-devel texinfo texinfo-tex systemtap-sdt-devel mpfr-devel libmpc-devel elfutils-devel elfutils-libelf-devel glibc-devel.x86_64 glibc-devel.i686 gcc-gnat libgnat doxygen graphviz dblatex texlive-collection-latex docbook5-style-xsl python-sphinx cmake
sudo yum install -y gcc-objc
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-3.fc22.src.rpm
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ragel.spec
sudo yum install -y $RPMBUILD/RPMS/x86_64/ragel-6.8-3.el7.centos.x86_64.rpm
mkdir build/antlr3-tool-3.5.2
cp dist/redhat/centos_dep/antlr3 build/antlr3-tool-3.5.2
cd build/antlr3-tool-3.5.2
wget http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
cd -
cd build
tar cJpf $RPMBUILD/SOURCES/antlr3-tool-3.5.2.tar.xz antlr3-tool-3.5.2
cd -
rpmbuild --define "_topdir $RPMBUILD" -ba dist/redhat/centos_dep/antlr3-tool.spec
sudo yum install -y $RPMBUILD/RPMS/noarch/antlr3-tool-3.5.2-1.el7.centos.noarch.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/binutils-2.25-5.el7.centos.x86_64.rpm ]; then
rpmbuild --define "_topdir $RPMBUILD" --rebuild build/srpms/binutils-2.25-5.fc22.src.rpm
fi
do_install binutils-2.25-5.el7.centos.x86_64.rpm
wget -O build/3.5.2.tar.gz https://github.com/antlr/antlr3/archive/3.5.2.tar.gz
mv build/3.5.2.tar.gz $RPMBUILD/SOURCES
rpmbuild --define "_topdir $RPMBUILD" -ba dist/redhat/centos_dep/antlr3-C++-devel.spec
sudo yum install -y $RPMBUILD/RPMS/x86_64/antlr3-C++-devel-3.5.2-1.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/isl-0.14-3.el7.centos.x86_64.rpm ]; then
rpmbuild --define "_topdir $RPMBUILD" --rebuild build/srpms/isl-0.14-3.fc22.src.rpm
fi
do_install isl-0.14-3.el7.centos.x86_64.rpm
do_install isl-devel-0.14-3.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/gcc-5.1.1-4.el7.centos.x86_64.rpm ]; then
rpmbuild --define "_topdir $RPMBUILD" --define "fedora 21" --rebuild build/srpms/gcc-5.1.1-4.fc22.src.rpm
fi
do_install *5.1.1-4*
if [ ! -f $RPMBUILD/RPMS/x86_64/boost-1.57.0-6.el7.centos.x86_64.rpm ]; then
rpmbuild --define "_topdir $RPMBUILD" --without python3 --rebuild build/srpms/boost-1.57.0-6.fc22.src.rpm
fi
do_install boost*
if [ ! -f $RPMBUILD/RPMS/x86_64/re2c-0.13.5-9.el7.centos.x86_64.rpm ]; then
rpmbuild --define "_topdir $RPMBUILD" --rebuild build/srpms/re2c-0.13.5-9.fc22.src.rpm
fi
do_install re2c-0.13.5-9.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/ninja-build-1.5.3-2.el7.centos.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ninja-build-1.5.3-2.fc22.src.rpm
patch $RPMBUILD/SPECS/ninja-build.spec < dist/redhat/centos_dep/ninja-build.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ninja-build.spec
fi
do_install ninja-build-1.5.3-2.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/ragel-6.8-3.el7.centos.x86_64.rpm ]; then
rpm --define "_topdir $RPMBUILD" -ivh build/srpms/ragel-6.8-3.fc22.src.rpm
patch $RPMBUILD/SPECS/ragel.spec < dist/redhat/centos_dep/ragel.diff
rpmbuild --define "_topdir $RPMBUILD" -ba $RPMBUILD/SPECS/ragel.spec
fi
do_install ragel-6.8-3.el7.centos.x86_64.rpm
if [ ! -f $RPMBUILD/RPMS/noarch/antlr3-tool-3.5.2-1.el7.centos.noarch.rpm ]; then
mkdir build/antlr3-tool-3.5.2
cp dist/redhat/centos_dep/antlr3 build/antlr3-tool-3.5.2
cd build/antlr3-tool-3.5.2
wget http://www.antlr3.org/download/antlr-3.5.2-complete-no-st3.jar
cd -
cd build
tar cJpf $RPMBUILD/SOURCES/antlr3-tool-3.5.2.tar.xz antlr3-tool-3.5.2
cd -
rpmbuild --define "_topdir $RPMBUILD" -ba dist/redhat/centos_dep/antlr3-tool.spec
fi
do_install antlr3-tool-3.5.2-1.el7.centos.noarch.rpm
if [ ! -f $RPMBUILD/RPMS/x86_64/antlr3-C++-devel-3.5.2-1.el7.centos.x86_64.rpm ];then
wget -O build/3.5.2.tar.gz https://github.com/antlr/antlr3/archive/3.5.2.tar.gz
mv build/3.5.2.tar.gz $RPMBUILD/SOURCES
rpmbuild --define "_topdir $RPMBUILD" -ba dist/redhat/centos_dep/antlr3-C++-devel.spec
fi
do_install antlr3-C++-devel-3.5.2-1.el7.centos.x86_64.rpm


@@ -8,13 +8,10 @@ License: AGPLv3
URL: http://www.scylladb.com/
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
BuildRequires: libaio-devel boost-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel ninja-build ragel antlr3-tool antlr3-C++-devel make
BuildRequires: libaio-devel boost-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel ninja-build ragel antlr3-tool antlr3-C++-devel xfsprogs-devel make
%{?fedora:BuildRequires: python3 gcc-c++ libasan libubsan}
%{?rhel:BuildRequires: python34 devtoolset-3-gcc-c++}
Requires: libaio boost-program-options boost-system libstdc++ boost-thread cryptopp hwloc-libs numactl-libs libpciaccess libxml2 zlib thrift yaml-cpp lz4 snappy jsoncpp boost-filesystem systemd-libs xz-libs openssl-libs libcap libselinux libgcrypt libgpg-error elfutils-libs krb5-libs libcom_err libattr pcre elfutils-libelf bzip2-libs keyutils-libs
# TODO: create our own bridge device for virtio
Requires: libvirt-daemon
%{?rhel:BuildRequires: python34 gcc-c++ >= 5.1.1}
Requires: systemd-libs xfsprogs
%description
@@ -26,7 +23,7 @@ Requires: libvirt-daemon
./configure.py --with scylla --disable-xen --enable-dpdk --mode=release
%endif
%if 0%{?rhel}
./configure.py --with scylla --disable-xen --enable-dpdk --mode=release --compiler=/opt/rh/devtoolset-3/root/usr/bin/g++
./configure.py --with scylla --disable-xen --enable-dpdk --mode=release --static-stdc++
%endif
ninja-build -j2

exceptions/exceptions.cc

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2015 Cloudius Systems
*
* Modified by Cloudius Systems
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sstream>
#include "exceptions.hh"
#include "log.hh"
exceptions::truncate_exception::truncate_exception(std::exception_ptr ep)
: request_execution_exception(exceptions::exception_code::PROTOCOL_ERROR, sprint("Error during truncate: %s", ep))
{}


@@ -107,6 +107,19 @@ struct unavailable_exception : cassandra_exception {
{}
};
class request_execution_exception : public cassandra_exception {
public:
request_execution_exception(exception_code code, sstring msg)
: cassandra_exception(code, std::move(msg))
{ }
};
class truncate_exception : public request_execution_exception
{
public:
truncate_exception(std::exception_ptr ep);
};
class request_timeout_exception : public cassandra_exception {
public:
db::consistency_level consistency;


@@ -840,6 +840,10 @@ int gossiper::get_current_generation_number(inet_address endpoint) {
return endpoint_state_map.at(endpoint).get_heart_beat_state().get_generation();
}
int gossiper::get_current_heart_beat_version(inet_address endpoint) {
return endpoint_state_map.at(endpoint).get_heart_beat_state().get_heart_beat_version();
}
future<bool> gossiper::do_gossip_to_live_member(gossip_digest_syn message) {
size_t size = _live_endpoints.size();
if (size == 0) {
@@ -1280,11 +1284,11 @@ future<> gossiper::do_shadow_round() {
return make_ready_future<>();
}).get();
}
if (clk::now() > t + storage_service_ring_delay()) {
if (clk::now() > t + storage_service_ring_delay() * 60) {
throw std::runtime_error(sprint("Unable to gossip with any seeds (ShadowRound)"));
}
if (this->_in_shadow_round) {
logger.trace("Sleep 1 second and retry ...");
logger.info("Sleep 1 second and connect seeds again ... ({} seconds passed)", std::chrono::duration_cast<std::chrono::seconds>(clk::now() - t).count());
sleep(std::chrono::seconds(1)).get();
}
}
@@ -1477,6 +1481,12 @@ future<int> get_current_generation_number(inet_address ep) {
});
}
future<int> get_current_heart_beat_version(inet_address ep) {
return smp::submit_to(0, [ep] {
return get_local_gossiper().get_current_heart_beat_version(ep);
});
}
future<> unsafe_assassinate_endpoint(sstring ep) {
return smp::submit_to(0, [ep] {
return get_local_gossiper().unsafe_assassinate_endpoint(ep);


@@ -305,6 +305,7 @@ public:
bool is_known_endpoint(inet_address endpoint);
int get_current_generation_number(inet_address endpoint);
int get_current_heart_beat_version(inet_address endpoint);
bool is_gossip_only_member(inet_address endpoint);
private:
@@ -462,6 +463,7 @@ future<std::set<inet_address>> get_unreachable_members();
future<std::set<inet_address>> get_live_members();
future<int64_t> get_endpoint_downtime(inet_address ep);
future<int> get_current_generation_number(inet_address ep);
future<int> get_current_heart_beat_version(inet_address ep);
future<> unsafe_assassinate_endpoint(sstring ep);
future<> assassinate_endpoint(sstring ep);

keys.hh

@@ -160,6 +160,17 @@ public:
return TopLevel::from_bytes(get_compound_type(s)->serialize_single(std::move(v)));
}
template <typename T>
static
TopLevel from_singular(const schema& s, const T& v) {
auto ct = get_compound_type(s);
if (!ct->is_singular()) {
throw std::invalid_argument("compound is not singular");
}
auto type = ct->types()[0];
return from_single_value(s, type->decompose(v));
}
TopLevelView view() const {
return TopLevelView::from_bytes(_bytes);
}

log.cc

@@ -214,8 +214,8 @@ logging::log_level lexical_cast(const std::string& source) {
}
std::ostream& operator<<(std::ostream&out, std::exception_ptr eptr) {
namespace std {
std::ostream& operator<<(std::ostream&out, const std::exception_ptr eptr) {
if (!eptr) {
out << "<no exception>";
return out;
@@ -241,3 +241,4 @@ std::ostream& operator<<(std::ostream&out, std::exception_ptr eptr) {
}
return out;
}
}
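The log.cc/log.hh change above moves the exception_ptr pretty-printer into namespace std so that argument-dependent lookup can find it wherever an exception_ptr is streamed. A self-contained sketch of that pattern; `format_eptr` is a hypothetical helper added here only to exercise the operator:

```cpp
#include <exception>
#include <ostream>
#include <sstream>
#include <stdexcept>
#include <string>

namespace std {
// Declared in namespace std (as the patch does) so ADL finds the
// overload for std::exception_ptr from any calling namespace.
ostream& operator<<(ostream& out, const exception_ptr eptr) {
    if (!eptr) {
        return out << "<no exception>";
    }
    try {
        rethrow_exception(eptr);  // the only portable way to inspect it
    } catch (const exception& e) {
        out << e.what();
    } catch (...) {
        out << "<unknown exception>";
    }
    return out;
}
}

// Hypothetical helper, not part of the patch: render an exception_ptr
// to a string via the streaming operator above.
std::string format_eptr(std::exception_ptr ep) {
    std::ostringstream os;
    os << ep;
    return os.str();
}
```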

log.hh

@@ -179,6 +179,8 @@ logger::do_log(log_level level, const char* fmt, Args&&... args) {
}
// Pretty-printer for exceptions to be logged, e.g., std::current_exception().
std::ostream& operator<<(std::ostream&, std::exception_ptr);
namespace std {
std::ostream& operator<<(std::ostream&, const std::exception_ptr);
}
#endif /* LOG_HH_ */

main.cc

@@ -43,6 +43,7 @@
#include "init.hh"
#include "release.hh"
#include <cstdio>
#include <core/file.hh>
logging::logger startlog("init");
@@ -89,6 +90,17 @@ static logging::log_level to_loglevel(sstring level) {
}
}
static future<> disk_sanity(sstring path) {
return check_direct_io_support(path).then([path] {
return file_system_at(path).then([path] (auto fs) {
if (fs != fs_type::xfs) {
startlog.warn("{} is not on XFS. This is a non-supported setup, and performance is expected to be very bad.\n"
"For better performance, placing your data on XFS-formatted directories is strongly recommended", path);
}
});
});
};
static void apply_logger_settings(sstring default_level, db::config::string_map levels,
bool log_to_stdout, bool log_to_syslog) {
logging::logger_registry().set_all_loggers_level(to_loglevel(default_level));
@@ -246,6 +258,12 @@ int main(int ac, char** av) {
return dirs.touch_and_lock(db.local().get_config().data_file_directories());
}).then([&db, &dirs] {
return dirs.touch_and_lock(db.local().get_config().commitlog_directory());
}).then([&db] {
return parallel_for_each(db.local().get_config().data_file_directories(), [] (sstring pathname) {
return disk_sanity(pathname);
}).then([&db] {
return disk_sanity(db.local().get_config().commitlog_directory());
});
}).then([&db] {
return db.invoke_on_all([] (database& db) {
return db.init_system_keyspace();
@@ -282,6 +300,10 @@ int main(int ac, char** av) {
}).then([] {
auto& ss = service::get_local_storage_service();
return ss.init_server();
}).then([] {
return db::get_batchlog_manager().invoke_on_all([] (db::batchlog_manager& b) {
return b.start();
});
}).then([rpc_address] {
return dns::gethostbyname(rpc_address);
}).then([&db, &proxy, &qp, rpc_address, cql_port, thrift_port, start_thrift] (dns::hostent e) {
@@ -319,6 +341,8 @@ int main(int ac, char** av) {
}).then([api_address, api_port] {
print("Seastar HTTP server listening on %s:%s ...\n", api_address, api_port);
});
}).then([] {
startlog.warn("Polling mode enabled. ScyllaDB will use 100% of all your CPUs.\nSee https://github.com/scylladb/scylla/issues/417 for a more detailed explanation");
});
}).or_terminate();
});


@@ -178,6 +178,9 @@ public:
// FIXME: not really true, a previous stop could be in progress?
return make_ready_future<>();
}
bool error() {
return _p->error();
}
operator rpc_protocol::client&() { return *_p; }
};
@@ -295,14 +298,19 @@ static unsigned get_rpc_client_idx(messaging_verb verb) {
shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::get_rpc_client(messaging_verb verb, shard_id id) {
auto idx = get_rpc_client_idx(verb);
auto it = _clients[idx].find(id);
if (it == _clients[idx].end()) {
auto remote_addr = ipv4_addr(id.addr.raw_addr(), _port);
auto client = make_shared<rpc_protocol_client_wrapper>(*_rpc, remote_addr, ipv4_addr{_listen_address.raw_addr(), 0});
it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
return it->second.rpc_client;
} else {
return it->second.rpc_client;
if (it != _clients[idx].end()) {
auto c = it->second.rpc_client;
if (!c->error()) {
return c;
}
remove_rpc_client(verb, id);
}
auto remote_addr = ipv4_addr(id.addr.raw_addr(), _port);
auto client = make_shared<rpc_protocol_client_wrapper>(*_rpc, remote_addr, ipv4_addr{_listen_address.raw_addr(), 0});
it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
return it->second.rpc_client;
}
void messaging_service::remove_rpc_client(messaging_verb verb, shard_id id) {
@@ -536,4 +544,18 @@ future<query::result_digest> messaging_service::send_read_digest(shard_id id, qu
return send_message<query::result_digest>(this, net::messaging_verb::READ_DIGEST, std::move(id), cmd, pr);
}
// Wrapper for TRUNCATE
void messaging_service::register_truncate(std::function<future<> (sstring, sstring)>&& func) {
register_handler(this, net::messaging_verb::TRUNCATE, std::move(func));
}
void messaging_service::unregister_truncate() {
_rpc->unregister_handler(net::messaging_verb::TRUNCATE);
}
future<> messaging_service::send_truncate(shard_id id, std::chrono::milliseconds timeout, sstring ks, sstring cf) {
return send_message_timeout<void>(this, net::messaging_verb::TRUNCATE, std::move(id), std::move(timeout), std::move(ks), std::move(cf));
}
} // namespace net
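The get_rpc_client() change above reuses a cached client only while it reports no error, and otherwise evicts it before building a fresh connection. That cache-with-staleness-check shape can be sketched with an ordinary map; `client` here is a hypothetical stand-in for rpc_protocol_client_wrapper:

```cpp
#include <map>
#include <memory>
#include <string>

// Hypothetical stand-in for rpc_protocol_client_wrapper: only the
// error() probe matters for this sketch.
struct client {
    bool failed = false;
    bool error() const { return failed; }
};

std::map<std::string, std::shared_ptr<client>> clients;

std::shared_ptr<client> get_client(const std::string& id) {
    auto it = clients.find(id);
    if (it != clients.end()) {
        auto c = it->second;
        if (!c->error()) {
            return c;       // healthy cached client: reuse it
        }
        clients.erase(it);  // broken client: evict before reconnecting
    }
    auto c = std::make_shared<client>();
    clients.emplace(id, c);
    return c;
}
```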


@@ -528,6 +528,11 @@ public:
void unregister_read_digest();
future<query::result_digest> send_read_digest(shard_id id, query::read_command& cmd, query::partition_range& pr);
// Wrapper for TRUNCATE
void register_truncate(std::function<future<>(sstring, sstring)>&& func);
void unregister_truncate();
future<> send_truncate(shard_id, std::chrono::milliseconds, sstring, sstring);
public:
// Return rpc::protocol::client for a shard which is a ip + cpuid pair.
shared_ptr<rpc_protocol_client_wrapper> get_rpc_client(messaging_verb verb, shard_id id);


@@ -519,6 +519,7 @@ class mutation_partition final {
boost::intrusive::compare<row_tombstones_entry::compare>>;
friend rows_entry;
friend row_tombstones_entry;
friend class size_calculator;
private:
tombstone _tombstone;
row _static_row;


@@ -210,12 +210,7 @@ row_cache::make_reader(const query::partition_range& range) {
}
row_cache::~row_cache() {
with_allocator(_tracker.allocator(), [this] {
_partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
_tracker.on_erase();
deleter(p);
});
});
clear();
}
void row_cache::populate(const mutation& m) {
@@ -235,6 +230,15 @@ void row_cache::populate(const mutation& m) {
});
}
void row_cache::clear() {
with_allocator(_tracker.allocator(), [this] {
_partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
_tracker.on_erase();
deleter(p);
});
});
}
future<> row_cache::update(memtable& m, partition_presence_checker presence_checker) {
_tracker.region().merge(m._region); // Now all data in memtable belongs to cache
auto attr = seastar::thread_attributes();


@@ -56,6 +56,7 @@ class cache_entry {
mutation_partition _p;
lru_link_type _lru_link;
cache_link_type _cache_link;
friend class size_calculator;
public:
friend class row_cache;
friend class cache_tracker;
@@ -182,6 +183,9 @@ public:
// information there is for its partition in the underlying data sources.
void populate(const mutation& m);
// Clears the cache.
void clear();
// Synchronizes cache with the underlying data source from a memtable which
// has just been flushed to the underlying data source.
// The memtable can be queried during the process, but must not be written.

Submodule seastar updated: 5fe596a764...78e3924fcf


@@ -222,11 +222,11 @@ public void notifyUpdateAggregate(UDAggregate udf)
}
#endif
future<> migration_manager::notify_drop_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm)
future<> migration_manager::notify_drop_keyspace(sstring ks_name)
{
return get_migration_manager().invoke_on_all([name = ksm->name()] (auto&& mm) {
return get_migration_manager().invoke_on_all([ks_name] (auto&& mm) {
for (auto&& listener : mm._listeners) {
listener->on_drop_keyspace(name);
listener->on_drop_keyspace(ks_name);
}
});
}
@@ -381,13 +381,12 @@ future<> migration_manager::announce_keyspace_drop(const sstring& ks_name, bool
{
try {
auto& db = get_local_storage_proxy().get_db().local();
/*auto&& keyspace = */db.find_keyspace(ks_name);
auto& keyspace = db.find_keyspace(ks_name);
#if 0
logger.info(String.format("Drop Keyspace '%s'", oldKsm.name));
announce(LegacySchemaTables.makeDropKeyspaceMutation(oldKsm, FBUtilities.timestampMicros()), announceLocally);
#endif
// FIXME
throw std::runtime_error("not implemented");
auto&& mutations = db::schema_tables::make_drop_keyspace_mutations(keyspace.metadata(), db_clock::now_in_usecs());
return announce(std::move(mutations), announce_locally);
} catch (const no_such_keyspace& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing keyspace '%s'.", ks_name));
}
@@ -399,14 +398,11 @@ future<> migration_manager::announce_column_family_drop(const sstring& ks_name,
{
try {
auto& db = get_local_storage_proxy().get_db().local();
/*auto&& cfm = */db.find_schema(ks_name, cf_name);
/*auto&& ksm = */db.find_keyspace(ks_name);
#if 0
logger.info(String.format("Drop table '%s/%s'", oldCfm.ksName, oldCfm.cfName));
announce(LegacySchemaTables.makeDropTableMutation(ksm, oldCfm, FBUtilities.timestampMicros()), announceLocally);
#endif
// FIXME
throw std::runtime_error("not implemented");
auto&& old_cfm = db.find_schema(ks_name, cf_name);
auto&& keyspace = db.find_keyspace(ks_name);
logger.info("Drop table '{}/{}'", old_cfm->ks_name(), old_cfm->cf_name());
auto mutations = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), old_cfm, db_clock::now_in_usecs());
return announce(std::move(mutations), announce_locally);
} catch (const no_such_column_family& e) {
throw exceptions::configuration_exception(sprint("Cannot drop non existing table '%s' in keyspace '%s'.", cf_name, ks_name));
}


@@ -79,7 +79,7 @@ public:
static future<> notify_update_column_family(schema_ptr cfm);
static future<> notify_drop_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm);
static future<> notify_drop_keyspace(sstring ks_name);
static future<> notify_drop_column_family(schema_ptr cfm);


@@ -61,7 +61,6 @@
#include <boost/range/adaptor/transformed.hpp>
#include <boost/iterator/counting_iterator.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/algorithm/count_if.hpp>
#include <boost/range/algorithm/find.hpp>
#include <boost/range/algorithm/find_if.hpp>
@@ -1293,21 +1292,22 @@ class digest_read_resolver : public abstract_read_resolver {
_digest_results.clear();
}
virtual size_t response_count() const override {
return _digest_results.size() + _data_results.size();
return _digest_results.size();
}
bool digests_match() const {
assert(response_count());
if (response_count() == 1) {
return true;
}
auto digests = boost::range::join(_digest_results, _data_results | boost::adaptors::indirected | boost::adaptors::transformed(std::mem_fn(&query::result::digest)));
const query::result_digest& first = *digests.begin();
return std::find_if(digests.begin() + 1, digests.end(), [&first] (const query::result_digest& digest) { return digest != first; }) == digests.end();
auto& first = *_digest_results.begin();
return std::find_if(_digest_results.begin() + 1, _digest_results.end(), [&first] (query::result_digest digest) { return digest != first; }) == _digest_results.end();
}
public:
digest_read_resolver(db::consistency_level cl, size_t block_for, std::chrono::high_resolution_clock::time_point timeout) : abstract_read_resolver(cl, 0, timeout), _block_for(block_for) {}
void add_data(gms::inet_address from, foreign_ptr<lw_shared_ptr<query::result>> result) {
if (!_timedout) {
// if only one target was queried digest_check() will be skipped so we can also skip digest calculation
_digest_results.emplace_back(_targets_count == 1 ? query::result_digest(bytes()) : result->digest());
_data_results.emplace_back(std::move(result));
got_response(from);
}
@@ -2225,6 +2225,42 @@ bool storage_proxy::should_hint(gms::inet_address ep) {
#endif
}
future<> storage_proxy::truncate_blocking(sstring keyspace, sstring cfname) {
logger.debug("Starting a blocking truncate operation on keyspace {}, CF {}", keyspace, cfname);
auto& gossiper = gms::get_local_gossiper();
if (!gossiper.get_unreachable_token_owners().empty()) {
logger.info("Cannot perform truncate, some hosts are down");
// Since the truncate operation is so aggressive and is typically only
// invoked by an admin, for simplicity we require that all nodes are up
// to perform the operation.
auto live_members = gossiper.get_live_members().size();
throw exceptions::unavailable_exception(db::consistency_level::ALL,
live_members + gossiper.get_unreachable_members().size(),
live_members);
}
auto all_endpoints = gossiper.get_live_token_owners();
auto& ms = net::get_local_messaging_service();
auto timeout = std::chrono::milliseconds(_db.local().get_config().truncate_request_timeout_in_ms());
logger.trace("Enqueuing truncate messages to hosts {}", all_endpoints);
return parallel_for_each(all_endpoints, [keyspace, cfname, &ms, timeout](auto ep) {
return ms.send_truncate(net::messaging_service::shard_id{ep, 0}, timeout, keyspace, cfname);
}).handle_exception([cfname](auto ep) {
try {
std::rethrow_exception(ep);
} catch (rpc::timeout_error& e) {
logger.trace("Truncation of {} timed out: {}", cfname, e.what());
} catch (...) {
throw;
}
});
}
#if 0
/**
* Performs the truncate operation, which effectively deletes all data from
@@ -2529,6 +2565,12 @@ void storage_proxy::init_messaging_service() {
return p->query_singular_local_digest(cmd, pr);
});
});
ms.register_truncate([](sstring ksname, sstring cfname) {
const auto truncated_at = db_clock::now();
return get_storage_proxy().invoke_on_all([truncated_at, ksname, cfname](storage_proxy& sp) {
return sp._db.local().truncate(truncated_at, ksname, cfname);
});
});
}
void storage_proxy::uninit_messaging_service() {
@@ -2540,6 +2582,7 @@ void storage_proxy::uninit_messaging_service() {
ms.unregister_read_data();
ms.unregister_read_mutation_data();
ms.unregister_read_digest();
ms.unregister_truncate();
}
// Merges reconcilable_result:s from different shards into one
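The simplified digests_match() in the hunk above reduces to "every collected digest equals the first one", now that data-result digests are no longer mixed into the comparison. A minimal sketch, with `result_digest` as a stand-in type for query::result_digest:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Stand-in for query::result_digest; any equality-comparable type works.
using result_digest = std::uint64_t;

bool digests_match(const std::vector<result_digest>& digests) {
    // A single response trivially matches (the comparison is skipped).
    if (digests.size() <= 1) {
        return true;
    }
    const auto& first = digests.front();
    return std::find_if(digests.begin() + 1, digests.end(),
                        [&](const result_digest& d) { return d != first; })
           == digests.end();
}
```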


@@ -160,6 +160,14 @@ public:
*/
future<> mutate_atomically(std::vector<mutation> mutations, db::consistency_level cl);
/**
* Performs the truncate operation, which effectively deletes all data from
* the column family cfname
* @param keyspace
* @param cfname
*/
future<> truncate_blocking(sstring keyspace, sstring cfname);
/*
* Executes data query on the whole cluster.
*


@@ -96,6 +96,9 @@ public:
void gossip_snitch_info();
distributed<database>& db() {
return _db;
}
private:
bool is_auto_bootstrap();
inet_address get_broadcast_address() {


@@ -215,7 +215,7 @@ future<> compact_sstables(std::vector<shared_sstable> sstables,
future<> write_done = newtab->write_components(
std::move(mutation_queue_reader), estimated_partitions, schema).then([newtab, stats, start_time] {
return newtab->load().then([newtab, stats, start_time] {
return newtab->open_data().then([newtab, stats, start_time] {
uint64_t endsize = newtab->data_size();
double ratio = (double) endsize / (double) stats->start_size;
auto end_time = std::chrono::high_resolution_clock::now();
@@ -237,10 +237,29 @@ future<> compact_sstables(std::vector<shared_sstable> sstables,
});
// Wait for both read_done and write_done fibers to finish.
// FIXME: if write_done throws an exception, we get a broken pipe
// exception on read_done, and then we don't handle write_done's
// exception, causing a warning message of "ignored exceptional future".
return read_done.then([write_done = std::move(write_done)] () mutable { return std::move(write_done); });
return when_all(std::move(read_done), std::move(write_done)).then([] (std::tuple<future<>, future<>> t) {
sstring ex;
try {
std::get<0>(t).get();
} catch(...) {
ex += "read exception: ";
ex += sprint("%s", std::current_exception());
}
try {
std::get<1>(t).get();
} catch(...) {
if (ex.size()) {
ex += ", ";
}
ex += "write exception: ";
ex += sprint("%s", std::current_exception());
}
if (ex.size()) {
throw std::runtime_error(ex);
}
});
}
class compaction_strategy_impl {
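The compaction change above waits for both the read and write fibers and folds any exceptions from either into a single error, instead of letting one go unhandled. The same aggregation shape can be sketched with plain std::future standing in for seastar futures:

```cpp
#include <future>
#include <stdexcept>
#include <string>

// Wait on both tasks; if either threw, combine the messages into one
// runtime_error, mirroring the read_done/write_done handling above.
std::string wait_both(std::future<void> read_done,
                      std::future<void> write_done) {
    std::string ex;
    try {
        read_done.get();
    } catch (const std::exception& e) {
        ex += "read exception: ";
        ex += e.what();
    }
    try {
        write_done.get();
    } catch (const std::exception& e) {
        if (!ex.empty()) {
            ex += ", ";
        }
        ex += "write exception: ";
        ex += e.what();
    }
    if (!ex.empty()) {
        throw std::runtime_error(ex);
    }
    return "ok";
}
```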


@@ -41,6 +41,7 @@
#include "memtable.hh"
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/range/adaptor/map.hpp>
#include <regex>
#include <core/align.hh>
@@ -877,7 +878,11 @@ future<> sstable::open_data() {
_index_file = std::get<file>(std::get<0>(files).get());
_data_file = std::get<file>(std::get<1>(files).get());
return _data_file.size().then([this] (auto size) {
_data_file_size = size;
if (this->has_component(sstable::component_type::CompressionInfo)) {
_compression.update(size);
} else {
_data_file_size = size;
}
}).then([this] {
return _index_file.size().then([this] (auto size) {
_index_file_size = size;
@@ -911,12 +916,6 @@ future<> sstable::load() {
return read_summary();
}).then([this] {
return open_data();
}).then([this] {
// After we have _compression and _data_file_size, we can update
// _compression with additional information it needs:
if (has_component(sstable::component_type::CompressionInfo)) {
_compression.update(_data_file_size);
}
});
}
@@ -1386,7 +1385,7 @@ future<uint64_t> sstable::bytes_on_disk() {
});
}
const bool sstable::has_component(component_type f) {
const bool sstable::has_component(component_type f) const {
return _components.count(f);
}
@@ -1394,6 +1393,16 @@ const sstring sstable::filename(component_type f) const {
return filename(_dir, _ks, _cf, _version, _generation, _format, f);
}
std::vector<sstring> sstable::component_filenames() const {
std::vector<sstring> res;
for (auto c : _component_map | boost::adaptors::map_keys) {
if (has_component(c)) {
res.emplace_back(filename(c));
}
}
return res;
}
sstring sstable::toc_filename() const {
return filename(component_type::TOC);
}
@@ -1413,6 +1422,21 @@ const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_typ
return dir + "/" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component));
}
future<> sstable::create_links(sstring dir) const {
return parallel_for_each(component_filenames(), [this, dir](sstring f) {
auto sdir = get_dir();
auto name = f.substr(sdir.size());
auto dst = dir + name;
return ::link_file(f, dst);
}).then([dir] {
// sync dir
return ::open_directory(dir).then([](file df) {
auto f = df.flush();
return f.finally([df = std::move(df)] {});
});
});
}
entry_descriptor entry_descriptor::make_descriptor(sstring fname) {
static std::regex la("la-(\\d+)-(\\w+)-(.*)");
static std::regex ka("(\\w+)-(\\w+)-ka-(\\d+)-(.*)");
@@ -1584,9 +1608,9 @@ remove_by_toc_name(sstring sstable_toc_name) {
}
static future<bool>
file_existence(sstring filename) {
file_exists(sstring filename) {
return engine().open_file_dma(filename, open_flags::ro).then([] (file f) {
return make_ready_future<>();
return f.close().finally([f] {});
}).then_wrapped([] (future<> f) {
bool exists = true;
try {
@@ -1603,11 +1627,11 @@ file_existence(sstring filename) {
future<>
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, unsigned long generation, version_types v, format_types f) {
return seastar::async([ks, cf, dir, generation, v, f] {
auto toc = file_existence(filename(dir, ks, cf, v, generation, f, component_type::TOC)).get0();
auto toc = file_exists(filename(dir, ks, cf, v, generation, f, component_type::TOC)).get0();
// assert that toc doesn't exist for sstable with temporary toc.
assert(toc == false);
auto tmptoc = file_existence(filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get0();
auto tmptoc = file_exists(filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get0();
// assert that temporary toc exists for this sstable.
assert(tmptoc == true);
@@ -1627,12 +1651,13 @@ sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, unsig
auto file_path = filename(dir, ks, cf, v, generation, f, entry.first);
// Skip component that doesn't exist.
auto exists = file_existence(file_path).get0();
auto exists = file_exists(file_path).get0();
if (!exists) {
continue;
}
remove_file(file_path).get();
}
fsync_directory(dir).get();
// Removing temporary
remove_file(filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get();
// Fsync'ing column family dir to guarantee that deletion completed.
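The renamed file_exists() above probes for a file by opening it read-only, and the fix also closes the descriptor on success instead of leaking it. A plain POSIX sketch of the same idea:

```cpp
#include <fcntl.h>
#include <unistd.h>

#include <string>

// Probe existence by opening read-only; crucially (the leak the patch
// fixes), close the descriptor when the open succeeds.
bool file_exists(const std::string& path) {
    int fd = ::open(path.c_str(), O_RDONLY);
    if (fd < 0) {
        return false;  // treat any open failure (e.g. ENOENT) as absent
    }
    ::close(fd);
    return true;
}
```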


@@ -183,6 +183,7 @@ public:
version_types v, format_types f);
future<> load();
future<> open_data();
void set_generation(unsigned long generation) {
_generation = generation;
@@ -259,20 +260,40 @@ public:
return _filter_file_size;
}
uint64_t filter_memory_size() {
return _filter->memory_size();
}
// Returns the total bytes of all components.
future<uint64_t> bytes_on_disk();
partition_key get_first_partition_key(const schema& s) const;
partition_key get_last_partition_key(const schema& s) const;
const sstring get_filename() {
const sstring get_filename() const {
return filename(component_type::Data);
}
const sstring& get_dir() const {
return _dir;
}
sstring toc_filename() const;
metadata_collector& get_metadata_collector() {
return _collector;
}
future<> create_links(sstring dir) const;
/**
* Note. This is using the Origin definition of
* max_data_age, which is load time. This could maybe
* be improved upon.
*/
gc_clock::time_point max_data_age() const {
return _now;
}
std::vector<sstring> component_filenames() const;
private:
sstable(size_t wbuffer_size, sstring ks, sstring cf, sstring dir, unsigned long generation, version_types v, format_types f, gc_clock::time_point now = gc_clock::now())
: sstable_buffer_size(wbuffer_size)
@@ -328,7 +349,7 @@ private:
gc_clock::time_point _now;
-const bool has_component(component_type f);
+const bool has_component(component_type f) const;
const sstring filename(component_type f) const;
@@ -360,7 +381,6 @@ private:
future<> read_statistics();
void write_statistics();
-future<> open_data();
future<> create_data();
future<index_list> read_indexes(uint64_t summary_idx);

View File

@@ -127,7 +127,7 @@ if __name__ == "__main__":
if test[0].startswith(os.path.join('build','debug')):
mode = 'debug'
xmlout = args.jenkins+"."+mode+"."+os.path.basename(test[0])+".boost.xml"
-path = path + " --output_format=XML --log_level=all --report_level=no --log_sink=" + xmlout
+path = path + " --output_format=XML --log_level=test_suite --report_level=no --log_sink=" + xmlout
print(path)
proc = subprocess.Popen(path.split(' '), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env,preexec_fn=os.setsid)
signal.alarm(args.timeout)

View File

@@ -43,6 +43,7 @@ static atomic_cell make_atomic_cell(bytes value) {
SEASTAR_TEST_CASE(test_execute_batch) {
return do_with_cql_env([] (auto& e) {
db::system_keyspace::minimal_setup(e.db(), e.qp());
auto& qp = e.local_qp();
auto bp = make_lw_shared<db::batchlog_manager>(qp);

View File

@@ -310,13 +310,15 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit){
cfg.commitlog_segment_size_in_mb = 2;
cfg.commitlog_total_space_in_mb = 1;
return make_commitlog(cfg).then([](tmplog_ptr log) {
auto sem = make_lw_shared<semaphore>(0);
// add a flush handler that simply says we're done with the range.
-auto r = log->second.add_flush_handler([log](cf_id_type id, replay_position pos) {
+auto r = log->second.add_flush_handler([log, sem](cf_id_type id, replay_position pos) {
log->second.discard_completed_segments(id, pos);
sem->signal();
});
auto set = make_lw_shared<std::set<segment_id_type>>();
auto uuid = utils::UUID_gen::get_time_UUID();
-return do_until([set]() {return set->size() > 1;},
+return do_until([set, sem]() {return set->size() > 1 && sem->try_wait();},
[log, set, uuid]() {
sstring tmp = "hej bubba cow";
return log->second.add_mutation(uuid, tmp.size(), [tmp](db::commitlog::output& dst) {
@@ -327,8 +329,9 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit){
});
}).then([log]() {
auto n = log->second.get_active_segment_names().size();
+auto d = log->second.get_num_segments_destroyed();
BOOST_REQUIRE(n > 0);
-BOOST_REQUIRE(n < 2);
+BOOST_REQUIRE(d > 0);
}).finally([log, r = std::move(r)]() {
return log->second.clear().then([log] {});
});

View File

@@ -250,6 +250,14 @@ public:
return _qp->local();
}
distributed<database>& db() override {
return *_db;
}
distributed<cql3::query_processor>& qp() override {
return *_qp;
}
future<> start() {
return seastar::async([this] {
locator::i_endpoint_snitch::create_snitch("SimpleSnitch").get();

View File

@@ -24,6 +24,7 @@
#include <functional>
#include <vector>
#include <core/distributed.hh>
#include "core/sstring.hh"
#include "core/future.hh"
#include "core/shared_ptr.hh"
@@ -71,6 +72,10 @@ public:
virtual database& local_db() = 0;
virtual cql3::query_processor& local_qp() = 0;
virtual distributed<database>& db() = 0;
virtual distributed<cql3::query_processor> & qp() = 0;
};
future<::shared_ptr<cql_test_env>> make_env_for_test();

tests/memory_footprint.cc Normal file
View File

@@ -0,0 +1,227 @@
/*
* Copyright (C) 2015 Cloudius Systems, Ltd.
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/irange.hpp>
#include <seastar/util/defer.hh>
#include <seastar/core/app-template.hh>
#include <seastar/core/thread.hh>
#include "schema_builder.hh"
#include "memtable.hh"
#include "row_cache.hh"
#include "frozen_mutation.hh"
#include "tmpdir.hh"
#include "sstables/sstables.hh"
class size_calculator {
class nest {
public:
static thread_local int level;
nest() { ++level; }
~nest() { --level; }
};
static std::string prefix() {
std::string s(" ");
for (int i = 0; i < nest::level; ++i) {
s += "-- ";
}
return s;
}
public:
static void print_cache_entry_size() {
std::cout << prefix() << "sizeof(cache_entry) = " << sizeof(cache_entry) << "\n";
{
nest n;
std::cout << prefix() << "sizeof(decorated_key) = " << sizeof(dht::decorated_key) << "\n";
std::cout << prefix() << "sizeof(lru_link_type) = " << sizeof(cache_entry::lru_link_type) << "\n";
std::cout << prefix() << "sizeof(cache_link_type) = " << sizeof(cache_entry::cache_link_type) << "\n";
print_mutation_partition_size();
}
std::cout << "\n";
std::cout << prefix() << "sizeof(rows_entry) = " << sizeof(rows_entry) << "\n";
std::cout << prefix() << "sizeof(deletable_row) = " << sizeof(deletable_row) << "\n";
std::cout << prefix() << "sizeof(row) = " << sizeof(row) << "\n";
std::cout << prefix() << "sizeof(atomic_cell_or_collection) = " << sizeof(atomic_cell_or_collection) << "\n";
}
static void print_mutation_partition_size() {
std::cout << prefix() << "sizeof(mutation_partition) = " << sizeof(mutation_partition) << "\n";
{
nest n;
std::cout << prefix() << "sizeof(_static_row) = " << sizeof(mutation_partition::_static_row) << "\n";
std::cout << prefix() << "sizeof(_rows) = " << sizeof(mutation_partition::_rows) << "\n";
std::cout << prefix() << "sizeof(_row_tombstones) = " << sizeof(mutation_partition::_row_tombstones) <<
"\n";
}
}
};
thread_local int size_calculator::nest::level = 0;
static schema_ptr cassandra_stress_schema() {
return schema_builder("ks", "cf")
.with_column("KEY", bytes_type, column_kind::partition_key)
.with_column("C0", bytes_type)
.with_column("C1", bytes_type)
.with_column("C2", bytes_type)
.with_column("C3", bytes_type)
.with_column("C4", bytes_type)
.build();
}
[[gnu::unused]]
static mutation make_cs_mutation() {
auto s = cassandra_stress_schema();
mutation m(partition_key::from_single_value(*s, bytes_type->from_string("4b343050393536353531")), s);
for (auto&& col : s->regular_columns()) {
m.set_clustered_cell(clustering_key::make_empty(*s), col,
atomic_cell::make_live(1, bytes_type->from_string("8f75da6b3dcec90c8a404fb9a5f6b0621e62d39c69ba5758e5f41b78311fbb26cc7a")));
}
return m;
}
bytes random_bytes(size_t size) {
bytes result(bytes::initialized_later(), size);
for (size_t i = 0; i < size; ++i) {
result[i] = std::rand() % std::numeric_limits<uint8_t>::max();
}
return result;
}
sstring random_string(size_t size) {
sstring result(sstring::initialized_later(), size);
static const char chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
for (size_t i = 0; i < size; ++i) {
result[i] = chars[std::rand() % (sizeof(chars) - 1)]; // exclude the trailing '\0'
}
return result;
}
struct mutation_settings {
size_t column_count;
size_t column_name_size;
size_t row_count;
size_t partition_key_size;
size_t clustering_key_size;
size_t data_size;
};
static mutation make_mutation(mutation_settings settings) {
auto builder = schema_builder("ks", "cf")
.with_column("pk", bytes_type, column_kind::partition_key)
.with_column("ck", bytes_type, column_kind::clustering_key);
for (size_t i = 0; i < settings.column_count; ++i) {
builder.with_column(to_bytes(random_string(settings.column_name_size)), bytes_type);
}
auto s = builder.build();
mutation m(partition_key::from_single_value(*s, bytes_type->decompose(random_bytes(settings.partition_key_size))), s);
for (size_t i = 0; i < settings.row_count; ++i) {
auto ck = clustering_key::from_single_value(*s, bytes_type->decompose(random_bytes(settings.clustering_key_size)));
for (auto&& col : s->regular_columns()) {
m.set_clustered_cell(ck, col,
atomic_cell::make_live(1,
bytes_type->decompose(random_bytes(settings.data_size))));
}
}
return m;
}
struct sizes {
size_t memtable;
size_t cache;
size_t sstable;
size_t frozen;
};
static sizes calculate_sizes(const mutation& m) {
sizes result;
auto s = m.schema();
auto mt = make_lw_shared<memtable>(s);
cache_tracker tracker;
row_cache cache(s, mt->as_data_source(), tracker);
assert(tracker.region().occupancy().used_space() == 0);
assert(mt->occupancy().used_space() == 0);
mt->apply(m);
cache.populate(m);
result.memtable = mt->occupancy().used_space();
result.cache = tracker.region().occupancy().used_space();
result.frozen = freeze(m).representation().size();
tmpdir sstable_dir;
auto sst = make_lw_shared<sstables::sstable>(s->ks_name(), s->cf_name(),
sstable_dir.path,
1 /* generation */,
sstables::sstable::version_types::la,
sstables::sstable::format_types::big);
sst->write_components(*mt).get();
sst->load().get();
result.sstable = sst->data_size();
return result;
}
int main(int argc, char** argv) {
namespace bpo = boost::program_options;
app_template app;
app.add_options()
("column-count", bpo::value<size_t>()->default_value(5), "column count")
("column-name-size", bpo::value<size_t>()->default_value(2), "column name size")
("row-count", bpo::value<size_t>()->default_value(1), "row count")
("partition-key-size", bpo::value<size_t>()->default_value(10), "partition key size")
("clustering-key-size", bpo::value<size_t>()->default_value(10), "clustering key size")
("data-size", bpo::value<size_t>()->default_value(32), "cell data size");
return app.run(argc, argv, [&] {
return seastar::async([&] {
mutation_settings settings;
settings.column_count = app.configuration()["column-count"].as<size_t>();
settings.column_name_size = app.configuration()["column-name-size"].as<size_t>();
settings.row_count = app.configuration()["row-count"].as<size_t>();
settings.partition_key_size = app.configuration()["partition-key-size"].as<size_t>();
settings.clustering_key_size = app.configuration()["clustering-key-size"].as<size_t>();
settings.data_size = app.configuration()["data-size"].as<size_t>();
auto m = make_mutation(settings);
auto sizes = calculate_sizes(m);
std::cout << "mutation footprint:" << "\n";
std::cout << " - in cache: " << sizes.cache << "\n";
std::cout << " - in memtable: " << sizes.memtable << "\n";
std::cout << " - in sstable: " << sizes.sstable << "\n";
std::cout << " - frozen: " << sizes.frozen << "\n";
std::cout << "\n";
size_calculator::print_cache_entry_size();
});
});
}

View File

@@ -273,6 +273,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
column_family::config cfg;
cfg.enable_disk_reads = false;
cfg.enable_disk_writes = false;
cfg.enable_incremental_backups = false;
return with_column_family(s, cfg, [s] (column_family& cf) {
const column_definition& r1_col = *s->get_column_definition("r1");
auto key = partition_key::from_exploded(*s, {to_bytes("key1")});
@@ -319,6 +320,7 @@ SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
cfg.enable_disk_reads = true;
cfg.enable_disk_writes = true;
cfg.enable_cache = true;
cfg.enable_incremental_backups = false;
return with_column_family(s, cfg, [s](column_family& cf) {
return seastar::async([s, &cf] {
@@ -391,6 +393,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
column_family::config cfg;
cfg.enable_disk_reads = false;
cfg.enable_disk_writes = false;
cfg.enable_incremental_backups = false;
auto cm = make_lw_shared<compaction_manager>();
return do_with(make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm), [s, cm] (auto& cf_ptr) mutable {
column_family& cf = *cf_ptr;

View File

@@ -980,6 +980,7 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
column_family::config cfg;
cfg.datadir = tmp->path;
cfg.enable_commitlog = false;
cfg.enable_incremental_backups = false;
auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm);
cf->start();
cf->set_compaction_strategy(sstables::compaction_strategy_type::size_tiered);

View File

@@ -65,3 +65,9 @@ std::ostream& operator<<(std::ostream& os, const std::unordered_set<T>& items) {
os << "{" << join(", ", items) << "}";
return os;
}
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::set<T>& items) {
os << "{" << join(", ", items) << "}";
return os;
}
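The hunk above adds a `std::set` printer next to the existing `std::unordered_set` one. A self-contained sketch of the same operator, with a hypothetical `join()` helper standing in for Scylla's (which is defined elsewhere in the tree):

```cpp
#include <ostream>
#include <set>
#include <sstream>
#include <string>

// Hypothetical stand-in for Scylla's join(): elements joined by a separator.
template <typename Container>
std::string join(const std::string& sep, const Container& items) {
    std::ostringstream os;
    bool first = true;
    for (const auto& item : items) {
        if (!first) { os << sep; }
        os << item;
        first = false;
    }
    return os.str();
}

// Mirrors the operator<< added in the diff: sets print as "{a, b, c}".
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::set<T>& items) {
    os << "{" << join(", ", items) << "}";
    return os;
}

std::string format_set(const std::set<int>& s) {
    std::ostringstream os;
    os << s;
    return os.str();
}
```

Since `std::set` iterates in key order, the printed elements come out sorted, unlike the unordered_set overload.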

View File

@@ -27,7 +27,6 @@
#include <boost/assign.hpp>
#include <boost/locale/encoding_utf.hpp>
#include <boost/range/adaptor/sliced.hpp>
-#include <boost/range/algorithm/remove.hpp>
#include "cql3/statements/batch_statement.hh"
#include "service/migration_manager.hh"
@@ -206,17 +205,6 @@ cql_server::cql_server(distributed<service::storage_proxy>& proxy, distributed<c
{
}
-bool
-cql_server::poll_pending_responders() {
-while (!_pending_responders.empty()) {
-auto c = _pending_responders.front();
-_pending_responders.pop_front();
-c->do_flush();
-c->_flush_requested = false;
-}
-return false;
-}
scollectd::registrations
cql_server::setup_collectd() {
return {
@@ -431,16 +419,8 @@ future<> cql_server::connection::process()
}
}).finally([this] {
return _pending_requests_gate.close().then([this] {
-// Remove ourselves from poll list
-auto i = std::remove(_server._pending_responders.begin(), _server._pending_responders.end(), this);
-if (i != _server._pending_responders.end()) {
-_server._pending_responders.pop_back();
-}
-// prevent the connection from being added to the poller
-_flush_requested = true;
-return std::move(_ready_to_respond).then([this] {
-// do the final flush here since poller was disabled for the connection
-return _write_buf.flush();
+return _ready_to_respond.finally([this] {
+return _write_buf.close();
});
});
});
@@ -829,22 +809,12 @@ future<> cql_server::connection::write_response(shared_ptr<cql_server::response>
{
_ready_to_respond = _ready_to_respond.then([this, response = std::move(response)] () mutable {
return response->output(_write_buf, _version).then([this, response] {
-if (!_flush_requested) {
-_flush_requested = true;
-_server._pending_responders.push_back(this);
-}
return _write_buf.flush();
});
});
return make_ready_future<>();
}
-void
-cql_server::connection::do_flush() {
-_ready_to_respond = _ready_to_respond.then([this] {
-return _write_buf.flush();
-});
-}
void cql_server::connection::check_room(temporary_buffer<char>& buf, size_t n)
{
if (buf.size() < n) {

View File

@@ -67,7 +67,6 @@ struct [[gnu::packed]] cql_binary_frame_v3 {
class cql_server {
class event_notifier;
-class connection;
static constexpr int current_version = 3;
@@ -76,10 +75,7 @@ class cql_server {
distributed<cql3::query_processor>& _query_processor;
std::unique_ptr<scollectd::registrations> _collectd_registrations;
std::unique_ptr<event_notifier> _notifier;
-circular_buffer<connection*> _pending_responders;
-reactor::poller _poller{[this] { return poll_pending_responders(); }}; // FIXME: register before tcp poller
private:
-bool poll_pending_responders();
scollectd::registrations setup_collectd();
uint64_t _connects = 0;
uint64_t _connections = 0;
@@ -92,6 +88,7 @@ public:
future<> stop();
private:
class fmt_visitor;
+class connection;
class response;
friend class type_codec;
};
@@ -153,13 +150,11 @@ class cql_server::connection {
serialization_format _serialization_format = serialization_format::use_16_bit();
service::client_state _client_state;
std::unordered_map<uint16_t, cql_query_state> _query_states;
-bool _flush_requested = false;
public:
connection(cql_server& server, connected_socket&& fd, socket_address addr);
~connection();
future<> process();
future<> process_request();
-void do_flush();
private:
future<> process_request_one(temporary_buffer<char> buf,
@@ -215,7 +210,6 @@ private:
void init_serialization_format();
friend event_notifier;
friend class cql_server;
};
}

View File

@@ -225,7 +225,7 @@ struct string_type_impl : public abstract_type {
}
} else {
try {
-boost::locale::conv::utf_to_utf<char>(v.data(), boost::locale::conv::stop);
+boost::locale::conv::utf_to_utf<char>(v.data(), v.end(), boost::locale::conv::stop);
} catch (const boost::locale::conv::conversion_error& ex) {
throw marshal_exception(ex.what());
}
@@ -1182,6 +1182,8 @@ struct empty_type_impl : abstract_type {
logging::logger collection_type_impl::_logger("collection_type_impl");
const size_t collection_type_impl::max_elements;
+thread_local std::unordered_map<data_type, shared_ptr<cql3::cql3_type>> collection_type_impl::_cql3_type_cache;
const collection_type_impl::kind collection_type_impl::kind::map(
[] (shared_ptr<cql3::column_specification> collection, bool is_key) -> shared_ptr<cql3::column_specification> {
// FIXME: implement
@@ -1241,14 +1243,16 @@ collection_type_impl::is_compatible_with(const abstract_type& previous) const {
shared_ptr<cql3::cql3_type>
collection_type_impl::as_cql3_type() const {
-if (!_cql3_type) {
+auto ret = _cql3_type_cache[shared_from_this()];
+if (!ret) {
auto name = cql3_type_name();
if (!is_multi_cell()) {
name = "frozen<" + name + ">";
}
-_cql3_type = make_shared<cql3::cql3_type>(name, shared_from_this(), false);
+ret = make_shared<cql3::cql3_type>(name, shared_from_this(), false);
+_cql3_type_cache[shared_from_this()] = ret;
}
-return _cql3_type;
+return ret;
}
bytes
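The change above replaces a `mutable` per-instance cache member with a `thread_local` map keyed by the type, so the cached `cql3_type` is built lazily per thread instead of mutating shared state. A minimal sketch of that memoization pattern; the names (`cql3_type_stub`, `as_cql3_type`) are illustrative, not Scylla's actual API:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

// Stand-in for the cql3_type object being cached.
struct cql3_type_stub {
    std::string name;
};

// Thread-local memoization: each thread (shard) keeps its own cache,
// mirroring the _cql3_type_cache added in the diff.
std::shared_ptr<cql3_type_stub> as_cql3_type(const std::string& type_key, bool multi_cell) {
    static thread_local std::unordered_map<std::string,
                                           std::shared_ptr<cql3_type_stub>> cache;
    auto& slot = cache[type_key];   // default-constructed (null) on first lookup
    if (!slot) {
        auto name = type_key;
        if (!multi_cell) {
            name = "frozen<" + name + ">";   // same frozen<> wrapping as the diff
        }
        slot = std::make_shared<cql3_type_stub>(cql3_type_stub{name});
    }
    return slot;
}
```

Repeated calls with the same key return the same shared object, and no cross-thread sharing occurs, which is what the `thread_local` buys over a `mutable` member on a type instance shared between shards.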

View File

@@ -387,7 +387,7 @@ bool equal(data_type t, bytes_view e1, bytes_view e2) {
class collection_type_impl : public abstract_type {
static logging::logger _logger;
-mutable shared_ptr<cql3::cql3_type> _cql3_type; // initialized on demand, so mutable
+static thread_local std::unordered_map<data_type, shared_ptr<cql3::cql3_type>> _cql3_type_cache; // initialized on demand
public:
static constexpr size_t max_elements = 65535;

View File

@@ -96,6 +96,10 @@ public:
}
virtual void close() override { }
virtual size_t memory_size() override {
return sizeof(_hash_count) + _bitset.memory_size();
}
};
struct murmur3_bloom_filter: public bloom_filter {
@@ -118,6 +122,10 @@ struct always_present_filter: public i_filter {
virtual void clear() override { }
virtual void close() override { }
virtual size_t memory_size() override {
return 0;
}
};
filter_ptr create_filter(int hash, large_bitset&& bitset);

View File

@@ -21,6 +21,7 @@
#include "compaction_manager.hh"
#include "database.hh"
#include "core/scollectd.hh"
static logging::logger cmlog("compaction_manager");
@@ -46,6 +47,7 @@ void compaction_manager::task_start(lw_shared_ptr<compaction_manager::task>& tas
_stats.pending_tasks--;
}
_stats.active_tasks++;
return task->compacting_cf->run_compaction().then([this, task] {
// If compaction completed successfully, let's reset
// sleep time of compaction_retry.
@@ -64,6 +66,8 @@ void compaction_manager::task_start(lw_shared_ptr<compaction_manager::task>& tas
task->compacting_cf = nullptr;
_stats.completed_tasks++;
}).finally([this] {
_stats.active_tasks--;
});
});
}).then_wrapped([this, task] (future<> f) {
@@ -139,9 +143,22 @@ compaction_manager::~compaction_manager() {
assert(_stopped == true);
}
void compaction_manager::register_collectd_metrics() {
auto add = [this] (auto type_name, auto name, auto data_type, auto func) {
_registrations.push_back(
scollectd::add_polled_metric(scollectd::type_instance_id("compaction_manager",
scollectd::per_cpu_plugin_instance,
type_name, name),
scollectd::make_typed(data_type, func)));
};
add("objects", "compactions", scollectd::data_type::GAUGE, [&] { return _stats.active_tasks; });
}
void compaction_manager::start(int task_nr) {
_stopped = false;
_tasks.reserve(task_nr);
register_collectd_metrics();
for (int i = 0; i < task_nr; i++) {
auto task = make_lw_shared<compaction_manager::task>();
task_start(task);
@@ -150,6 +167,7 @@ void compaction_manager::start(int task_nr) {
}
future<> compaction_manager::stop() {
_registrations.clear();
return do_for_each(_tasks, [this] (auto& task) {
return this->task_stop(task);
}).then([this] {
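The compaction_manager hunks above follow a register-at-start / clear-at-stop pattern: polled collectd metrics capture `this` in a callback, so the registrations must be dropped (in `stop()`) before the manager goes away. A minimal sketch with illustrative types standing in for scollectd's:

```cpp
#include <functional>
#include <string>
#include <vector>

// Stand-in for a scollectd registration: a named polling callback.
struct registration {
    std::string name;
    std::function<long()> poll;
};

struct manager_stub {
    long active_tasks = 0;
    std::vector<registration> registrations;

    // As in register_collectd_metrics(): the callback samples live state.
    void register_metrics() {
        registrations.push_back({"compactions", [this] { return active_tasks; }});
    }

    // As in compaction_manager::stop(): drop callbacks first, so nothing
    // polls this object while (or after) it shuts down.
    void stop() {
        registrations.clear();
    }
};
```

The ordering matters: clearing `registrations` before tearing down tasks is what keeps a poller from invoking a callback into a half-destroyed manager.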

View File

@@ -42,6 +42,7 @@ public:
struct stats {
int64_t pending_tasks = 0;
int64_t completed_tasks = 0;
uint64_t active_tasks = 0; // Number of compactions in progress.
};
private:
struct task {
@@ -64,6 +65,7 @@ private:
bool _stopped = true;
stats _stats;
std::vector<scollectd::registration> _registrations;
private:
void task_start(lw_shared_ptr<task>& task);
future<> task_stop(lw_shared_ptr<task>& task);
@@ -73,6 +75,8 @@ public:
compaction_manager();
~compaction_manager();
void register_collectd_metrics();
// Creates N fibers that will allow N compaction jobs to run in parallel.
// Defaults to only one fiber.
void start(int task_nr = 1);

View File

@@ -58,7 +58,7 @@ public:
double old_m = mean;
double old_s = variance;
-mean = old_m + ((value - old_m) / (total + 1));
+mean = ((double)(sum + value)) / (total + 1);
variance = old_s + ((value - old_m) * (value - mean));
}
sum += value;
@@ -81,7 +81,7 @@ public:
* Call set_latency, that would start a latency object if needed.
*/
bool should_sample() const {
-return total & sample_mask;
+return total == 0 || (count & sample_mask);
}
/**
* Set the latency according to the sample rate.
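The changed line above derives the mean from the running `sum` rather than nudging the previous mean, while keeping the Welford-style variance accumulator. A simplified, self-contained sketch of the fixed update (not the actual Scylla histogram class; field names are illustrative):

```cpp
// Running statistics with the fixed mean update: recompute from the sum,
// keep the Welford-style M2 accumulator for variance.
struct running_stats {
    double sum = 0;
    double mean = 0;
    double variance = 0;   // sum of squared deviations (M2), as in the diff
    long total = 0;

    void add(double value) {
        double old_m = mean;
        double old_s = variance;
        mean = (sum + value) / (total + 1);              // the corrected line
        variance = old_s + (value - old_m) * (value - mean);
        sum += value;
        ++total;
    }
};
```

For values {2, 4, 6} this yields mean 4 and M2 = 8, matching the direct computation; dividing M2 by `total` or `total - 1` gives the population or sample variance.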

View File

@@ -58,6 +58,8 @@ struct i_filter {
virtual void clear() = 0;
virtual void close() = 0;
virtual size_t memory_size() = 0;
/**
* @return The smallest bloom_filter that can provide the given false
* positive probability rate for the given number of elements.

View File

@@ -52,6 +52,11 @@ public:
size_t size() const {
return _nr_bits;
}
size_t memory_size() const {
return block_size() * _storage.size() + sizeof(_nr_bits);
}
bool test(size_t idx) const {
auto idx1 = idx / bits_per_block();
idx %= bits_per_block();

View File

@@ -378,7 +378,7 @@ public:
auto i = _segments.find(seg);
assert(i != _segments.end());
_segments.erase(i);
-delete seg;
+::free(seg);
}
segment* containing_segment(void* obj) const {
uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
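The one-line fix above pairs the segment's release with `::free()`, on the assumption that segments come from a C allocator (malloc-family memory must never be released with `delete`, which is undefined behavior). A minimal sketch of the matching allocate/free discipline; the `segment` type and helpers here are illustrative, not the actual LSA code:

```cpp
#include <cstdlib>
#include <new>

// Illustrative segment: a fixed-size block of raw storage.
struct segment {
    char data[4096];
};

segment* allocate_segment() {
    // C allocation, as a log-structured allocator might use under the hood.
    void* p = std::malloc(sizeof(segment));
    if (!p) {
        throw std::bad_alloc();
    }
    return new (p) segment();   // placement-new: constructs, does not allocate
}

void free_segment(segment* seg) {
    seg->~segment();   // run the destructor explicitly...
    ::free(seg);       // ...then release with the matching deallocator
}
```

The rule is symmetry: `malloc`/`free`, `new`/`delete`, placement-new/explicit-destructor-plus-original-deallocator; mixing families, as the removed `delete seg;` did, is the bug.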