release: prepare for 1.3.5

main: fix exception handling when initializing data or commitlog dirs
Exception handling was broken because, after the io checker, a storage_io_error exception is wrapped around system error exceptions. Also, the message printed when handling the exception wasn't precise enough for all cases, for example, lack of permission to write to an existing data directory. Fixes #883. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Message-Id: <b2dc75010a06f16ab1b676ce905ae12e930a700a.1478542388.git.raphaelsc@scylladb.com> (cherry picked from commit 9a9f0d3a0f)
2016-11-29 09:47:38 +02:00 · 2016-11-16 15:13:44 +02:00 · 2016-11-16 13:08:41 +00:00 · 2016-11-16 15:04:24 +02:00 · 2016-11-16 12:54:16 +00:00 · 2016-11-11 10:16:14 +02:00
397 changed files with 29117 additions and 6725 deletions
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,9 +1,9 @@
-## Installation details
+*Installation details*
 Scylla version (or git commit hash):
 Cluster size:
 OS (RHEL/CentOS/Ubuntu/AWS AMI):

-## Hardware details (for performance issues)
+*Hardware details (for performance issues)*          Delete if unneeded
 Platform (physical/VM/cloud instance type/docker):
 Hardware: sockets= cores= hyperthreading= memory=
 Disks: (SSD/HDD, count)
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-#Scylla
+# Scylla

-##Building Scylla
+## Building Scylla

 In addition to required packages by Seastar, the following packages are required by Scylla.

@@ -15,7 +15,7 @@ git submodule update --recursive
 * Installing required packages:

 ```
-sudo yum install yaml-cpp-devel lz4-devel zlib-devel snappy-devel jsoncpp-devel thrift-devel antlr3-tool antlr3-C++-devel libasan libubsan gcc-c++ gnutls-devel ninja-build ragel libaio-devel cryptopp-devel xfsprogs-devel numactl-devel hwloc-devel libpciaccess-devel libxml2-devel python3-pyparsing
+sudo yum install yaml-cpp-devel lz4-devel zlib-devel snappy-devel jsoncpp-devel thrift-devel antlr3-tool antlr3-C++-devel libasan libubsan gcc-c++ gnutls-devel ninja-build ragel libaio-devel cryptopp-devel xfsprogs-devel numactl-devel hwloc-devel libpciaccess-devel libxml2-devel python3-pyparsing lksctp-tools-devel
 ```

 * Build Scylla
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=1.1.4
+VERSION=1.3.5

 if test -f version
 then
--- a/api/api-doc/cache_service.json
+++ b/api/api-doc/cache_service.json
@@ -487,6 +487,36 @@
        }
      ]
    },
+    {
+      "path": "/cache_service/metrics/row/hits_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get row hits moving average",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_row_hits_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/cache_service/metrics/row/requests_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get row requests moving average",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_row_requests_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/cache_service/metrics/row/size",
      "operations": [
--- a/api/api-doc/collectd.json
+++ b/api/api-doc/collectd.json
@@ -55,6 +55,57 @@
                     "paramType":"query"
                  }
               ]
+            },
+            {
+               "method":"POST",
+               "summary":"Start reporting on one or more collectd metric",
+               "type":"void",
+               "nickname":"enable_collectd",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"pluginid",
+                     "description":"The plugin ID, describes the component the metric belongs to. Examples are cache, thrift, etc'. Regex are supported.",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"instance",
+                     "description":"The plugin instance typically #CPU indicating per CPU metric. Regex are supported. Omit for all",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"type",
+                     "description":"The plugin type, the type of the information. Examples are total_operations, bytes, etc'. Regex are supported. Omit for all",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"type_instance",
+                     "description":"The plugin type instance, the specific metric. Examples are total_writes, total_size, zones, etc'. Regex are supported. Omit for all",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"enable",
+                     "description":"set to true to enable all, anything else or omit to disable",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
            }
         ]
      },
@@ -63,10 +114,10 @@
         "operations":[
            {
               "method":"GET",
-               "summary":"Get a collectd value",
+               "summary":"Get a list of all collectd metrics and their status",
               "type":"array",
               "items":{
-                  "type":"type_instance_id"
+                  "type":"collectd_metric_status"
               },
               "nickname":"get_collectd_items",
               "produces":[
@@ -74,6 +125,25 @@
               ],
               "parameters":[
               ]
+            },
+            {
+               "method":"POST",
+               "summary":"Enable or disable all collectd metrics",
+               "type":"void",
+               "nickname":"enable_all_collectd",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"enable",
+                     "description":"set to true to enable all, anything else or omit to disable",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
            }
         ]
      }
@@ -113,6 +183,20 @@
               }
            }
         }
+      },
+      "collectd_metric_status":{
+         "id":"collectd_metric_status",
+         "description":"Holds a collectd id and an enable flag",
+         "properties":{
+            "id":{
+               "description":"The metric ID",
+               "type":"type_instance_id"
+            },
+            "enable":{
+               "description":"Is the metric enabled",
+               "type":"boolean"
+            }
+         }
      }
   }
 }
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -1094,7 +1094,7 @@
               "method":"GET",
               "summary":"Get read latency histogram",
               "$ref": "#/utils/histogram",
-               "nickname":"get_read_latency_histogram",
+               "nickname":"get_read_latency_histogram_depricated",
               "produces":[
                  "application/json"
               ],
@@ -1121,6 +1121,49 @@
               "items":{
                  "$ref": "#/utils/histogram"
               },
+               "nickname":"get_all_read_latency_histogram_depricated",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/column_family/metrics/read_latency/moving_average_histogram/{name}",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get read latency moving average histogram",
+               "$ref": "#/utils/rate_moving_average_and_histogram",
+               "nickname":"get_read_latency_histogram",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The column family name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/column_family/metrics/read_latency/moving_average_histogram/",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get read latency moving average histogram from all column families",
+               "type":"array",
+               "items":{
+                  "$ref": "#/utils/rate_moving_average_and_histogram"
+               },
               "nickname":"get_all_read_latency_histogram",
               "produces":[
                  "application/json"
@@ -1260,7 +1303,7 @@
               "method":"GET",
               "summary":"Get write latency histogram",
               "$ref": "#/utils/histogram",
-               "nickname":"get_write_latency_histogram",
+               "nickname":"get_write_latency_histogram_depricated",
               "produces":[
                  "application/json"
               ],
@@ -1287,6 +1330,49 @@
               "items":{
                  "$ref": "#/utils/histogram"
               },
+               "nickname":"get_all_write_latency_histogram_depricated",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/column_family/metrics/write_latency/moving_average_histogram/{name}",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get write latency moving average histogram",
+               "$ref": "#/utils/rate_moving_average_and_histogram",
+               "nickname":"get_write_latency_histogram",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The column family name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/column_family/metrics/write_latency/moving_average_histogram/",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get write latency moving average histogram of all column family",
+               "type":"array",
+               "items":{
+                  "$ref": "#/utils/rate_moving_average_and_histogram"
+               },
               "nickname":"get_all_write_latency_histogram",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -716,6 +716,36 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/read/timeouts_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get read metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_read_metrics_timeouts_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/read/unavailables_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get read metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_read_metrics_unavailables_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/read/histogram",
      "operations": [
@@ -723,7 +753,7 @@
          "method": "GET",
          "summary": "Get read metrics",
          "$ref": "#/utils/histogram",
-          "nickname": "get_read_metrics_latency_histogram",
+          "nickname": "get_read_metrics_latency_histogram_depricated",
          "produces": [
            "application/json"
          ],
@@ -738,6 +768,36 @@
          "method": "GET",
          "summary": "Get range metrics",
          "$ref": "#/utils/histogram",
+          "nickname": "get_range_metrics_latency_histogram_depricated",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/read/moving_avrage_histogram",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get read metrics",
+          "$ref": "#/utils/rate_moving_average_and_histogram",
+          "nickname": "get_read_metrics_latency_histogram",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/range/moving_avrage_histogram",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get range metrics rate and histogram",
+          "$ref": "#/utils/rate_moving_average_and_histogram",
          "nickname": "get_range_metrics_latency_histogram",
          "produces": [
            "application/json"
@@ -776,6 +836,36 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/range/timeouts_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get range metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_range_metrics_timeouts_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/range/unavailables_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get range metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_range_metrics_unavailables_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/write/timeouts",
      "operations": [
@@ -806,6 +896,36 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/write/timeouts_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get write metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_write_metrics_timeouts_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/write/unavailables_rates",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get write metrics rates",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_write_metrics_unavailables_rates",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/write/histogram",
      "operations": [
@@ -813,6 +933,21 @@
          "method": "GET",
          "summary": "Get write metrics",
          "$ref": "#/utils/histogram",
+          "nickname": "get_write_metrics_latency_histogram_depricated",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/storage_proxy/metrics/write/moving_avrage_histogram",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get write metrics",
+          "$ref": "#/utils/rate_moving_average_and_histogram",
          "nickname": "get_write_metrics_latency_histogram",
          "produces": [
            "application/json"
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -177,6 +177,22 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/scylla_release_version",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Fetch a string representation of the Scylla version.",
+               "type":"string",
+               "nickname":"get_scylla_release_version",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/schema_version",
         "operations":[
--- a/api/api-doc/utils.json
+++ b/api/api-doc/utils.json
@@ -65,6 +65,41 @@
               "description":"The series of values to which the counts in `buckets` correspond"
            }
         }
-      }
-   }
+      },
+    "rate_moving_average": {
+         "id":"rate_moving_average",
+         "description":"A meter metric which measures mean throughput and one, five, and fifteen-minute exponentially-weighted moving average throughputs",
+         "properties":{
+             "rates": {
+               "type":"array",
+               "items":{
+                  "type":"double"
+               },
+               "description":"One, five and fifteen minutes rates"
+            },
+            "mean_rate": {
+               "type":"double",
+               "description":"The mean rate from startup"
+            },
+            "count": {
+               "type":"long",
+               "description":"Total number of events from startup"
+            }
+         }
+    },
+    "rate_moving_average_and_histogram": {
+         "id":"rate_moving_average_and_histogram",
+         "description":"A timer metric which aggregates timing durations and provides duration statistics, plus throughput statistics",
+         "properties":{
+            "meter": {
+               "type":"rate_moving_average",
+               "description":"The metric rate moving average"
+            },
+            "hist": {
+               "type":"histogram",
+               "description":"The metric histogram"
+            }
+         }
+    }
+  }
 }
--- a/api/api.cc
+++ b/api/api.cc
@@ -83,6 +83,10 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

+future<> set_server_snitch(http_context& ctx) {
+    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
+}
+
 future<> set_server_gossip(http_context& ctx) {
    return register_api(ctx, "gossiper",
                "The gossiper API", set_gossiper);
@@ -118,10 +122,6 @@ future<> set_server_gossip_settle(http_context& ctx) {
        rb->register_function(r, "cache_service",
                "The cache service API");
        set_cache_service(ctx,r);
-
-        rb->register_function(r, "endpoint_snitch_info",
-                "The endpoint snitch info API");
-        set_endpoint_snitch(ctx, r);
    });
 }

--- a/api/api.hh
+++ b/api/api.hh
@@ -110,44 +110,7 @@ future<json::json_return_type>  sum_stats(distributed<T>& d, V F::*f) {
    });
 }

-inline double pow2(double a) {
-    return a * a;
-}

-// FIXME: Move to utils::ihistogram::operator+=()
-inline utils::ihistogram add_histogram(utils::ihistogram res,
-        const utils::ihistogram& val) {
-    if (res.count == 0) {
-        return val;
-    }
-    if (val.count == 0) {
-        return std::move(res);
-    }
-    if (res.min > val.min) {
-        res.min = val.min;
-    }
-    if (res.max < val.max) {
-        res.max = val.max;
-    }
-    double ncount = res.count + val.count;
-    // To get an estimated sum we take the estimated mean
-    // and multiply it by the true count
-    res.sum = res.sum + val.mean * val.count;
-    double a = res.count/ncount;
-    double b = val.count/ncount;
-
-    double mean =  a * res.mean + b * val.mean;
-
-    res.variance = (res.variance + pow2(res.mean - mean) )* a +
-            (val.variance + pow2(val.mean -mean))* b;
-
-    res.mean = mean;
-    res.count = res.count + val.count;
-    for (auto i : val.sample) {
-        res.sample.push_back(i);
-    }
-    return res;
-}

 inline
 httpd::utils_json::histogram to_json(const utils::ihistogram& val) {
@@ -156,15 +119,39 @@ httpd::utils_json::histogram to_json(const utils::ihistogram& val) {
    return h;
 }

-template<class T, class F>
-future<json::json_return_type>  sum_histogram_stats(distributed<T>& d, utils::ihistogram F::*f) {
+inline
+httpd::utils_json::rate_moving_average meter_to_json(const utils::rate_moving_average& val) {
+    httpd::utils_json::rate_moving_average m;
+    m = val;
+    return m;
+}

-    return d.map_reduce0([f](const T& p) {return p.get_stats().*f;}, utils::ihistogram(),
-            add_histogram).then([](const utils::ihistogram& val) {
+inline
+httpd::utils_json::rate_moving_average_and_histogram timer_to_json(const utils::rate_moving_average_and_histogram& val) {
+    httpd::utils_json::rate_moving_average_and_histogram h;
+    h.hist = val.hist;
+    h.meter = meter_to_json(val.rate);
+    return h;
+}
+
+template<class T, class F>
+future<json::json_return_type>  sum_histogram_stats(distributed<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return d.map_reduce0([f](const T& p) {return (p.get_stats().*f).hist;}, utils::ihistogram(),
+            std::plus<utils::ihistogram>()).then([](const utils::ihistogram& val) {
        return make_ready_future<json::json_return_type>(to_json(val));
    });
 }

+template<class T, class F>
+future<json::json_return_type>  sum_timer_stats(distributed<T>& d, utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return d.map_reduce0([f](const T& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average_and_histogram(),
+            std::plus<utils::rate_moving_average_and_histogram>()).then([](const utils::rate_moving_average_and_histogram& val) {
+        return make_ready_future<json::json_return_type>(timer_to_json(val));
+    });
+}
+
 inline int64_t min_int64(int64_t a, int64_t b) {
    return std::min(a,b);
 }
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -38,6 +38,7 @@ struct http_context {
 };

 future<> set_server_init(http_context& ctx);
+future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -194,30 +194,46 @@ void set_cache_service(http_context& ctx, routes& r) {
    });

    cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
            return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
        }, std::plus<uint64_t>());
    });

    cs::get_row_hits.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().stats().hits;
-        }, std::plus<int64_t>());
+        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.count();
+        }, std::plus<uint64_t>());
    });

    cs::get_row_requests.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().stats().hits + cf.get_row_cache().stats().misses;
-        }, std::plus<int64_t>());
+        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.count() + cf.get_row_cache().stats().misses.count();
+        }, std::plus<uint64_t>());
    });

    cs::get_row_hit_rate.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, ratio_holder(), [](const column_family& cf) {
-            return ratio_holder(cf.get_row_cache().stats().hits + cf.get_row_cache().stats().misses,
-                    cf.get_row_cache().stats().hits);
+            return ratio_holder(cf.get_row_cache().stats().hits.count() + cf.get_row_cache().stats().misses.count(),
+                    cf.get_row_cache().stats().hits.count());
        }, std::plus<ratio_holder>());
    });

+    cs::get_row_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });
+    });
+
+    cs::get_row_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.rate() + cf.get_row_cache().stats().misses.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });
+    });
+
    cs::get_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        // In origin row size is the weighted size.
        // We currently do not support weights, so we use num entries instead
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -25,10 +25,14 @@
 #include "core/scollectd_api.hh"
 #include "endian.h"
 #include <boost/range/irange.hpp>
+#include <regex>

 namespace api {

 using namespace scollectd;
+using namespace httpd;
+
+using namespace json;
 namespace cd = httpd::collectd_json;

 static auto transformer(const std::vector<collectd_value>& values) {
@@ -49,6 +53,14 @@ static auto transformer(const std::vector<collectd_value>& values) {
    return collected_value;
 }

+
+static const char* str_to_regex(const sstring& v) {
+    if (v != "") {
+        return v.c_str();
+    }
+    return ".*";
+}
+
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

@@ -72,7 +84,7 @@ void set_collectd(http_context& ctx, routes& r) {
    });

    cd::get_collectd_items.set(r, [](const_req req) {
-        std::vector<cd::type_instance_id> res;
+        std::vector<cd::collectd_metric_status> res;
        auto ids = scollectd::get_collectd_ids();
        for (auto i: ids) {
            cd::type_instance_id id;
@@ -80,10 +92,44 @@ void set_collectd(http_context& ctx, routes& r) {
            id.plugin_instance = i.plugin_instance();
            id.type = i.type();
            id.type_instance = i.type_instance();
-            res.push_back(id);
+            cd::collectd_metric_status it;
+            it.id = id;
+            it.enable = scollectd::is_enabled(i);
+            res.push_back(it);
        }
        return res;
    });
+
+    cd::enable_collectd.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        std::regex plugin(req->param["pluginid"].c_str());
+        std::regex instance(str_to_regex(req->get_query_param("instance")));
+        std::regex type(str_to_regex(req->get_query_param("type")));
+        std::regex type_instance(str_to_regex(req->get_query_param("type_instance")));
+        bool enable = strcasecmp(req->get_query_param("enable").c_str(), "true") == 0;
+        return smp::invoke_on_all([enable, plugin, instance, type, type_instance]() {
+            for (auto id: scollectd::get_collectd_ids()) {
+                if (std::regex_match(std::string(id.plugin()), plugin) &&
+                        std::regex_match(std::string(id.plugin_instance()), instance) &&
+                        std::regex_match(std::string(id.type()), type) &&
+                        std::regex_match(std::string(id.type_instance()), type_instance)) {
+                    scollectd::enable(id, enable);
+                }
+            }
+        }).then([] {
+            return json::json_return_type(json_void());
+        });
+    });
+
+    cd::enable_all_collectd.set(r, [](std::unique_ptr<request> req) -> future<json::json_return_type> {
+        bool enable = strcasecmp(req->get_query_param("enable").c_str(), "true") == 0;
+        return smp::invoke_on_all([enable] {
+            for (auto id: scollectd::get_collectd_ids()) {
+                scollectd::enable(id, enable);
+            }
+        }).then([] {
+            return json::json_return_type(json_void());
+        });
+    });
 }

 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -77,14 +77,14 @@ future<json::json_return_type>  get_cf_stats(http_context& ctx,
 }

 static future<json::json_return_type>  get_cf_stats_count(http_context& ctx, const sstring& name,
-        utils::ihistogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
    return map_reduce_cf(ctx, name, int64_t(0), [f](const column_family& cf) {
-        return (cf.get_stats().*f).count;
+        return (cf.get_stats().*f).hist.count;
    }, std::plus<int64_t>());
 }

 static future<json::json_return_type>  get_cf_stats_sum(http_context& ctx, const sstring& name,
-        utils::ihistogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
    auto uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([uuid, f](database& db) {
        // Histograms information is sample of the actual load
@@ -92,7 +92,7 @@ static future<json::json_return_type>  get_cf_stats_sum(http_context& ctx, const
        // with count. The information is gather in nano second,
        // but reported in micro
        column_family& cf = db.find_column_family(uuid);
-        return ((cf.get_stats().*f).count/1000.0) * (cf.get_stats().*f).mean;
+        return ((cf.get_stats().*f).hist.count/1000.0) * (cf.get_stats().*f).hist.mean;
    }, 0.0, std::plus<double>()).then([](double res) {
        return make_ready_future<json::json_return_type>((int64_t)res);
    });
@@ -100,28 +100,29 @@ static future<json::json_return_type>  get_cf_stats_sum(http_context& ctx, const


 static future<json::json_return_type>  get_cf_stats_count(http_context& ctx,
-        utils::ihistogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
    return map_reduce_cf(ctx, int64_t(0), [f](const column_family& cf) {
-        return (cf.get_stats().*f).count;
+        return (cf.get_stats().*f).hist.count;
    }, std::plus<int64_t>());
 }

 static future<json::json_return_type>  get_cf_histogram(http_context& ctx, const sstring& name,
-        utils::ihistogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
    utils::UUID uuid = get_uuid(name, ctx.db.local());
-    return ctx.db.map_reduce0([f, uuid](const database& p) {return p.find_column_family(uuid).get_stats().*f;},
+    return ctx.db.map_reduce0([f, uuid](const database& p) {
+        return (p.find_column_family(uuid).get_stats().*f).hist;},
            utils::ihistogram(),
-            add_histogram)
+            std::plus<utils::ihistogram>())
            .then([](const utils::ihistogram& val) {
                return make_ready_future<json::json_return_type>(to_json(val));
    });
 }

-static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::ihistogram column_family::stats::*f) {
+static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
    std::function<utils::ihistogram(const database&)> fun = [f] (const database& db)  {
        utils::ihistogram res;
        for (auto i : db.get_column_families()) {
-            res = add_histogram(res, i.second->get_stats().*f);
+            res += (i.second->get_stats().*f).hist;
        }
        return res;
    };
@@ -132,6 +133,33 @@ static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils:
    });
 }

+static future<json::json_return_type>  get_cf_rate_and_histogram(http_context& ctx, const sstring& name, // rate + histogram of one stats member for the column family called `name`
+        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) { // f: pointer-to-member selecting which stat to read
+    utils::UUID uuid = get_uuid(name, ctx.db.local());
+    return ctx.db.map_reduce0([f, uuid](const database& p) {
+        return (p.find_column_family(uuid).get_stats().*f).rate();}, // map step: rate() of the selected stat on each database instance
+            utils::rate_moving_average_and_histogram(),
+            std::plus<utils::rate_moving_average_and_histogram>()) // reduce step: combine the per-instance values with operator+
+            .then([](const utils::rate_moving_average_and_histogram& val) {
+                return make_ready_future<json::json_return_type>(timer_to_json(val)); // convert the combined value with timer_to_json
+    });
+}
+
+static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family::stats::*f) { // same stat as the named overload, but taken over every column family
+    std::function<utils::rate_moving_average_and_histogram(const database&)> fun = [f] (const database& db)  {
+        utils::rate_moving_average_and_histogram res;
+        for (auto i : db.get_column_families()) {
+            res += (i.second->get_stats().*f).rate(); // accumulate the selected stat of each column family in this database instance
+        }
+        return res;
+    };
+    return ctx.db.map(fun).then([](const std::vector<utils::rate_moving_average_and_histogram> &res) {
+        std::vector<httpd::utils_json::rate_moving_average_and_histogram> r;
+        boost::copy(res | boost::adaptors::transformed(timer_to_json), std::back_inserter(r)); // one JSON entry per element returned by map(); the entries are not merged
+        return make_ready_future<json::json_return_type>(r);
+    });
+}
+
 static future<json::json_return_type> get_cf_unleveled_sstables(http_context& ctx, const sstring& name) {
    return map_reduce_cf(ctx, name, int64_t(0), [](const column_family& cf) {
        return cf.get_unleveled_sstables();
@@ -141,7 +169,7 @@ static future<json::json_return_type> get_cf_unleveled_sstables(http_context& ct
 static int64_t min_row_size(column_family& cf) {
    int64_t res = INT64_MAX;
    for (auto i: *cf.get_sstables() ) {
-        res = std::min(res, i.second->get_stats_metadata().estimated_row_size.min());
+        res = std::min(res, i->get_stats_metadata().estimated_row_size.min());
    }
    return (res == INT64_MAX) ? 0 : res;
 }
@@ -149,7 +177,7 @@ static int64_t min_row_size(column_family& cf) {
 static int64_t max_row_size(column_family& cf) {
    int64_t res = 0;
    for (auto i: *cf.get_sstables() ) {
-        res = std::max(i.second->get_stats_metadata().estimated_row_size.max(), res);
+        res = std::max(i->get_stats_metadata().estimated_row_size.max(), res);
    }
    return res;
 }
@@ -166,13 +194,58 @@ static double update_ratio(double acc, double f, double total) {
 static ratio_holder mean_row_size(column_family& cf) {
    ratio_holder res;
    for (auto i: *cf.get_sstables() ) {
-        auto c = i.second->get_stats_metadata().estimated_row_size.count();
-        res.sub += i.second->get_stats_metadata().estimated_row_size.mean() * c;
+        auto c = i->get_stats_metadata().estimated_row_size.count();
+        res.sub += i->get_stats_metadata().estimated_row_size.mean() * c;
        res.total += c;
    }
    return res;
 }

+static std::unordered_map<sstring, uint64_t> merge_maps(std::unordered_map<sstring, uint64_t> a, // returns `a` extended with the entries of `b`
+        const std::unordered_map<sstring, uint64_t>& b) {
+    a.insert(b.begin(), b.end()); // insert() skips keys already in `a`, so a key present in both keeps a's value
+    return a;
+}
+
+static json::json_return_type sum_map(const std::unordered_map<sstring, uint64_t>& val) { // total of all mapped values; keys are ignored
+    uint64_t res = 0;
+    for (const auto& entry : val) { // by reference: avoids copying each (sstring, uint64_t) pair
+        res += entry.second;
+    }
+    return res;
+}
+
+static future<json::json_return_type>  sum_sstable(http_context& ctx, const sstring& name, bool total) { // summed bytes_on_disk() of the sstables of column family `name`
+    auto uuid = get_uuid(name, ctx.db.local());
+    return ctx.db.map_reduce0([uuid, total](database& db) {
+        std::unordered_map<sstring, uint64_t> m; // file name -> bytes on disk
+        auto sstables = (total) ? db.find_column_family(uuid).get_sstables_including_compacted_undeleted() : // total: use the list that includes compacted-but-undeleted sstables
+                db.find_column_family(uuid).get_sstables();
+        for (const auto& t : *sstables) { // by reference: no copy of each list element
+            m[t->get_filename()] = t->bytes_on_disk();
+        }
+        return m;
+    }, std::unordered_map<sstring, uint64_t>(), merge_maps). // merge_maps keeps a single entry per file name
+            then([](const std::unordered_map<sstring, uint64_t>& val) {
+        return sum_map(val);
+    });
+}
+
+
+static future<json::json_return_type> sum_sstable(http_context& ctx, bool total) { // summed bytes_on_disk() of the sstables of every column family
+    return map_reduce_cf_raw(ctx, std::unordered_map<sstring, uint64_t>(), [total](column_family& cf) {
+        std::unordered_map<sstring, uint64_t> m; // file name -> bytes on disk
+        auto sstables = (total) ? cf.get_sstables_including_compacted_undeleted() : // total: use the list that includes compacted-but-undeleted sstables
+                cf.get_sstables();
+        for (const auto& t : *sstables) { // by reference: no copy of each list element
+            m[t->get_filename()] = t->bytes_on_disk();
+        }
+        return m;
+    },merge_maps).then([](const std::unordered_map<sstring, uint64_t>& val) {
+        return sum_map(val);
+    });
+}
+
 template <typename T>
 class sum_ratio {
    uint64_t _n = 0;
@@ -194,7 +267,7 @@ public:
 static double get_compression_ratio(column_family& cf) {
    sum_ratio<double> result;
    for (auto i : *cf.get_sstables()) {
-        auto compression_ratio = i.second->get_compression_ratio();
+        auto compression_ratio = i->get_compression_ratio();
        if (compression_ratio != sstables::metadata_collector::NO_COMPRESSION_RATIO) {
            result(compression_ratio);
        }
@@ -202,6 +275,14 @@ static double get_compression_ratio(column_family& cf) {
    return std::move(result).get();
 }

+static std::vector<uint64_t> concat_sstable_count_per_level(std::vector<uint64_t> a, std::vector<uint64_t>&& b) { // despite the name this adds element-wise; it does not append
+    a.resize(std::max(a.size(), b.size()), 0UL); // grow `a` with zeros so every index of `b` is valid
+    for (auto i = 0U; i < b.size(); i++) {
+        a[i] += b[i];
+    }
+    return a;
+}
+
 void set_column_family(http_context& ctx, routes& r) {
    cf::get_column_family_name.set(r, [&ctx] (const_req req){
        vector<sstring> res;
@@ -325,7 +406,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return map_reduce_cf(ctx, req->param["name"], sstables::estimated_histogram(0), [](column_family& cf) {
            sstables::estimated_histogram res(0);
            for (auto i: *cf.get_sstables() ) {
-                res.merge(i.second->get_stats_metadata().estimated_row_size);
+                res.merge(i->get_stats_metadata().estimated_row_size);
            }
            return res;
        },
@@ -336,7 +417,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](column_family& cf) {
            uint64_t res = 0;
            for (auto i: *cf.get_sstables() ) {
-                res += i.second->get_stats_metadata().estimated_row_size.count();
+                res += i->get_stats_metadata().estimated_row_size.count();
            }
            return res;
        },
@@ -347,7 +428,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return map_reduce_cf(ctx, req->param["name"], sstables::estimated_histogram(0), [](column_family& cf) {
            sstables::estimated_histogram res(0);
            for (auto i: *cf.get_sstables() ) {
-                res.merge(i.second->get_stats_metadata().estimated_column_count);
+                res.merge(i->get_stats_metadata().estimated_column_count);
            }
            return res;
        },
@@ -384,10 +465,14 @@ void set_column_family(http_context& ctx, routes& r) {
        return get_cf_stats_count(ctx, &column_family::stats::writes);
    });

-    cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
+    cf::get_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads);
    });

+    cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) { // rate-and-histogram reply for one column family; the plain-histogram form is registered under the _depricated name
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family::stats::reads);
+    });
+
    cf::get_read_latency.set(r, [&ctx] (std::unique_ptr<request> req) {
        return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads);
    });
@@ -396,24 +481,40 @@ void set_column_family(http_context& ctx, routes& r) {
        return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes);
    });

-    cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
+    cf::get_all_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
        return get_cf_histogram(ctx, &column_family::stats::writes);
    });

-    cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
+    cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) { // read-latency rate and histogram over all column families
+        return get_cf_rate_and_histogram(ctx, &column_family::stats::reads); // was stats::writes, which made this read endpoint report write latency
+    });
+
+    cf::get_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::writes);
    });

-    cf::get_all_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
+    cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) { // rate-and-histogram reply for one column family; the plain-histogram form is registered under the _depricated name
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family::stats::writes);
+    });
+
+    cf::get_all_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
        return get_cf_histogram(ctx, &column_family::stats::writes);
    });

+    cf::get_all_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) { // write-latency rate and histogram over all column families
+        return get_cf_rate_and_histogram(ctx, &column_family::stats::writes);
+    });
+
    cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, req->param["name"], &column_family::stats::pending_compactions);
+        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](column_family& cf) {
+            return cf.get_compaction_strategy().estimated_pending_compactions(cf);
+        }, std::plus<int64_t>());
    });

    cf::get_all_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::pending_compactions);
+        return map_reduce_cf(ctx, int64_t(0), [](column_family& cf) {
+            return cf.get_compaction_strategy().estimated_pending_compactions(cf);
+        }, std::plus<int64_t>());
    });

    cf::get_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -429,19 +530,19 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, req->param["name"], &column_family::stats::live_disk_space_used);
+        return sum_sstable(ctx, req->param["name"], false);
    });

    cf::get_all_live_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::live_disk_space_used);
+        return sum_sstable(ctx, false);
    });

    cf::get_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, req->param["name"], &column_family::stats::total_disk_space_used);
+        return sum_sstable(ctx, req->param["name"], true);
    });

    cf::get_all_total_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::total_disk_space_used);
+        return sum_sstable(ctx, true);
    });

    cf::get_min_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -471,7 +572,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return s + sst.second->filter_get_false_positive();
+                return s + sst->filter_get_false_positive();
            });
        }, std::plus<uint64_t>());
    });
@@ -479,7 +580,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return s + sst.second->filter_get_false_positive();
+                return s + sst->filter_get_false_positive();
            });
        }, std::plus<uint64_t>());
    });
@@ -487,7 +588,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_recent_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return s + sst.second->filter_get_recent_false_positive();
+                return s + sst->filter_get_recent_false_positive();
            });
        }, std::plus<uint64_t>());
    });
@@ -495,7 +596,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_recent_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return s + sst.second->filter_get_recent_false_positive();
+                return s + sst->filter_get_recent_false_positive();
            });
        }, std::plus<uint64_t>());
    });
@@ -503,8 +604,8 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst.second->filter_get_false_positive();
-                return update_ratio(s, f, f + sst.second->filter_get_true_positive());
+                double f = sst->filter_get_false_positive();
+                return update_ratio(s, f, f + sst->filter_get_true_positive());
            });
        }, std::plus<double>());
    });
@@ -512,8 +613,8 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst.second->filter_get_false_positive();
-                return update_ratio(s, f, f + sst.second->filter_get_true_positive());
+                double f = sst->filter_get_false_positive();
+                return update_ratio(s, f, f + sst->filter_get_true_positive());
            });
        }, std::plus<double>());
    });
@@ -521,8 +622,8 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst.second->filter_get_recent_false_positive();
-                return update_ratio(s, f, f + sst.second->filter_get_recent_true_positive());
+                double f = sst->filter_get_recent_false_positive();
+                return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
            });
        }, std::plus<double>());
    });
@@ -530,8 +631,8 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst.second->filter_get_recent_false_positive();
-                return update_ratio(s, f, f + sst.second->filter_get_recent_true_positive());
+                double f = sst->filter_get_recent_false_positive();
+                return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
            });
        }, std::plus<double>());
    });
@@ -539,7 +640,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->filter_size();
+                return s + sst->filter_size(); // add to the running total; returning only this sstable's size discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -547,7 +648,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->filter_size();
+                return s + sst->filter_size(); // add to the running total; returning only this sstable's size discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -555,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->filter_memory_size();
+                return s + sst->filter_memory_size(); // add to the running total; returning only this sstable's size discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -563,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->filter_memory_size();
+                return s + sst->filter_memory_size(); // add to the running total; returning only this sstable's size discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -571,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint(); // add to the running total; returning only this sstable's footprint discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -579,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst.second->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint(); // add to the running total; returning only this sstable's footprint discarded the sum so far
            });
        }, std::plus<uint64_t>());
    });
@@ -652,27 +753,35 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_row_cache_hit.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().stats().hits;
-        }, std::plus<int64_t>());
+        return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });
    });

    cf::get_all_row_cache_hit.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, int64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().stats().hits;
-        }, std::plus<int64_t>());
+        return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().hits.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });
    });

    cf::get_row_cache_miss.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], int64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().stats().misses;
-        }, std::plus<int64_t>());
+        return map_reduce_cf_raw(ctx, req->param["name"], utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().misses.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });
    });

    cf::get_all_row_cache_miss.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, int64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().stats().misses;
-        }, std::plus<int64_t>());
+        return map_reduce_cf_raw(ctx, utils::rate_moving_average(), [](const column_family& cf) {
+            return cf.get_row_cache().stats().misses.rate();
+        }, std::plus<utils::rate_moving_average>()).then([](const utils::rate_moving_average& m) {
+            return make_ready_future<json::json_return_type>(meter_to_json(m));
+        });

    });

@@ -799,12 +908,11 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_sstable_count_per_level.set(r, [&ctx](std::unique_ptr<request> req) {
-        // TBD
-        // FIXME
-        // This is a workaround, until there will be an API to return the count
-        // per level, we return an empty array
-        vector<uint64_t> res;
-        return make_ready_future<json::json_return_type>(res);
+        return map_reduce_cf_raw(ctx, req->param["name"], std::vector<uint64_t>(), [](const column_family& cf) { // replaces the empty-array placeholder with counts read from the column family
+            return cf.sstable_count_per_level();
+        }, concat_sstable_count_per_level).then([](const std::vector<uint64_t>& res) { // the reducer adds the per-level vectors element-wise
+            return make_ready_future<json::json_return_type>(res);
+        });
    });
 }
 }
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -34,31 +34,44 @@ future<> foreach_column_family(http_context& ctx, const sstring& name, std::func


 template<class Mapper, class I, class Reducer>
-future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
+future<I> map_reduce_cf_raw(http_context& ctx, const sstring& name, I init, // "raw": yields the reduced value itself, with no conversion to json_return_type
        Mapper mapper, Reducer reducer) {
    auto uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([mapper, uuid](database& db) {
        return mapper(db.find_column_family(uuid));
-    }, init, reducer).then([](const I& res) {
+    }, init, reducer); // the future<I> from map_reduce0 is returned as is
+}
+
+
+template<class Mapper, class I, class Reducer>
+future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
+        Mapper mapper, Reducer reducer) {
+    return map_reduce_cf_raw(ctx, name, init, mapper, reducer).then([](const I& res) {
        return make_ready_future<json::json_return_type>(res);
    });
 }

 template<class Mapper, class I, class Reducer, class Result>
-future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
+future<I> map_reduce_cf_raw(http_context& ctx, const sstring& name, I init, // NOTE(review): `result` is accepted but never used in this overload's body
        Mapper mapper, Reducer reducer, Result result) {
    auto uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([mapper, uuid](database& db) {
        return mapper(db.find_column_family(uuid));
-    }, init, reducer).then([result](const I& res) mutable {
+    }, init, reducer); // the future<I> from map_reduce0 is returned as is
+}
+
+
+template<class Mapper, class I, class Reducer, class Result>
+future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& name, I init,
+        Mapper mapper, Reducer reducer, Result result) {
+    return map_reduce_cf_raw(ctx, name, init, mapper, reducer, result).then([result](const I& res) mutable {
        result = res;
        return make_ready_future<json::json_return_type>(result);
    });
 }

-
 template<class Mapper, class I, class Reducer>
-future<json::json_return_type> map_reduce_cf(http_context& ctx, I init,
+future<I> map_reduce_cf_raw(http_context& ctx, I init,
        Mapper mapper, Reducer reducer) {
    return ctx.db.map_reduce0([mapper, init, reducer](database& db) {
        auto res = init;
@@ -66,10 +79,18 @@ future<json::json_return_type> map_reduce_cf(http_context& ctx, I init,
            res = reducer(res, mapper(*i.second.get()));
        }
        return res;
-    }, init, reducer).then([](const I& res) {
+    }, init, reducer);
+}
+
+
+template<class Mapper, class I, class Reducer>
+future<json::json_return_type> map_reduce_cf(http_context& ctx, I init,
+        Mapper mapper, Reducer reducer) {
+    return map_reduce_cf_raw(ctx, init, mapper, reducer).then([](const I& res) {
        return make_ready_future<json::json_return_type>(res);
    });
 }
+
 future<json::json_return_type>  get_cf_stats(http_context& ctx, const sstring& name,
        int64_t column_family::stats::*f);

--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -22,6 +22,7 @@
 #include "compaction_manager.hh"
 #include "api/api-doc/compaction_manager.json.hh"
 #include "db/system_keyspace.hh"
+#include "column_family.hh"

 namespace api {

@@ -78,7 +79,9 @@ void set_compaction_manager(http_context& ctx, routes& r) {
    });

    cm::get_pending_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cm_stats(ctx, &compaction_manager::stats::pending_tasks);
+        return map_reduce_cf(ctx, int64_t(0), [](column_family& cf) {
+            return cf.get_compaction_strategy().estimated_pending_compactions(cf);
+        }, std::plus<int64_t>());
    });

    cm::get_completed_tasks.set(r, [&ctx] (std::unique_ptr<request> req) {
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -33,6 +33,25 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) { // adds up the rate() of one stats member over the proxy instances in `d`
+    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(), // f: pointer-to-member selecting which stat to read
+            std::plus<utils::rate_moving_average>());
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) { // summed rate returned as a rate_moving_average JSON object
+    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
+        httpd::utils_json::rate_moving_average m;
+        m = val; // conversion to the JSON type happens through its assignment operator
+        return make_ready_future<json::json_return_type>(m);
+    });
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) { // summed rate reduced to its `count` field only
+    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
+        return make_ready_future<json::json_return_type>(val.count); // the other fields of the moving average are dropped
+    });
+}
+
 static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, sstables::estimated_histogram proxy::stats::*f) {
    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, sstables::estimated_histogram(),
            sstables::merge).then([](const sstables::estimated_histogram& val) {
@@ -42,8 +61,8 @@ static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::ihistogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).mean * (p.get_stats().*f).count;}, 0.0,
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
+    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
            std::plus<double>()).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
@@ -291,41 +310,77 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
    });

-    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
+    });
+
+    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
+    });
+
+    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
+    });
+
+    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
+    });
+
+    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
+    });
+
+    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
+    });
+
+    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
    });

-    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
    });

-    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
    });

+    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+    });
+
+    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::write);
+    });
+
+    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+    });
+
    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
    });
@@ -342,7 +397,7 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats(ctx.sp, &proxy::stats::read);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -31,6 +31,7 @@
 #include "locator/snitch_base.hh"
 #include "column_family.hh"
 #include "log.hh"
+#include "release.hh"

 namespace api {

@@ -121,6 +122,9 @@ void set_storage_service(http_context& ctx, routes& r) {
        return service::get_local_storage_service().get_release_version();
    });

+    ss::get_scylla_release_version.set(r, [](const_req req) {
+        return scylla_version();
+    });
    ss::get_schema_version.set(r, [](const_req req) {
        return service::get_local_storage_service().get_schema_version();
    });
@@ -659,16 +663,22 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto probability = req->get_query_param("probability");
-        return make_ready_future<json::json_return_type>(json_void());
+        try {
+            double real_prob = std::stod(probability.c_str());
+            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
+                local_tracing.set_trace_probability(real_prob);
+            }).then([] {
+                return make_ready_future<json::json_return_type>(json_void());
+            });
+        } catch (...) {
+            throw httpd::bad_param_exception(sprint("Bad format of a probability value: \"%s\"", probability.c_str()));
+        }
+
    });

    ss::get_trace_probability.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+        return make_ready_future<json::json_return_type>(tracing::tracing::get_local_tracing_instance().get_trace_probability());
    });

    ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
--- a/atomic_cell_or_collection.hh
+++ b/atomic_cell_or_collection.hh
@@ -63,5 +63,8 @@ public:
            ::feed_hash(as_collection_mutation(), h, def.type);
        }
    }
+    size_t memory_usage() const {
+        return _data.memory_usage();
+    }
    friend std::ostream& operator<<(std::ostream&, const atomic_cell_or_collection&);
 };
--- a/auth/auth.cc
+++ b/auth/auth.cc
@@ -47,7 +47,7 @@
 #include "authorizer.hh"
 #include "database.hh"
 #include "cql3/query_processor.hh"
-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/statements/create_table_statement.hh"
 #include "db/config.hh"
 #include "service/migration_manager.hh"
@@ -348,8 +348,8 @@ future<> auth::auth::setup_table(const sstring& name, const sstring& cql) {
        return make_ready_future();
    }

-    ::shared_ptr<cql3::statements::cf_statement> parsed = static_pointer_cast<
-                    cql3::statements::cf_statement>(cql3::query_processor::parse_statement(cql));
+    ::shared_ptr<cql3::statements::raw::cf_statement> parsed = static_pointer_cast<
+                    cql3::statements::raw::cf_statement>(cql3::query_processor::parse_statement(cql));
    parsed->prepare_keyspace(AUTH_KS);
    ::shared_ptr<cql3::statements::create_table_statement> statement =
                    static_pointer_cast<cql3::statements::create_table_statement>(
--- a/auth/data_resource.cc
+++ b/auth/data_resource.cc
@@ -47,11 +47,8 @@
 const sstring auth::data_resource::ROOT_NAME("data");

 auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
-    : _ks(ks), _cf(cf)
+    : _level(l), _ks(ks), _cf(cf)
 {
-    if (l != get_level()) {
-        throw std::invalid_argument("level/keyspace/column mismatch");
-    }
 }

 auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
 {}

 auth::data_resource::level auth::data_resource::get_level() const {
-    if (!_cf.empty()) {
-        assert(!_ks.empty());
-        return level::COLUMN_FAMILY;
-    }
-    if (!_ks.empty()) {
-        return level::KEYSPACE;
-    }
-    return level::ROOT;
+    return _level;
 }

 auth::data_resource auth::data_resource::from_name(
--- a/auth/data_resource.hh
+++ b/auth/data_resource.hh
@@ -56,6 +56,7 @@ private:

    static const sstring ROOT_NAME;

+    level _level;
    sstring _ks;
    sstring _cf;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -218,12 +218,12 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
    // obsolete prepared statements pretty quickly.
    // Rely on query processing caching statements instead, and lets assume
    // that a map lookup string->statement is not gonna kill us much.
-    auto& qp = cql3::get_local_query_processor();
-    return qp.process(
-                    sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
-                                    auth::AUTH_KS, CREDENTIALS_CF, USER_NAME),
-                    consistency_for_user(username), { username }, true).then_wrapped(
-                    [=](future<::shared_ptr<cql3::untyped_result_set>> f) {
+    return futurize_apply([this, username, password] {
+        auto& qp = cql3::get_local_query_processor();
+        return qp.process(sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
+                                        auth::AUTH_KS, CREDENTIALS_CF, USER_NAME),
+                        consistency_for_user(username), {username}, true);
+    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
            if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
@@ -234,6 +234,8 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
            std::throw_with_nested(exceptions::authentication_exception("Could not verify password"));
        } catch (exceptions::request_execution_exception& e) {
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
+        } catch (...) {
+            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
    });
 }
--- a/auth/permission.cc
+++ b/auth/permission.cc
@@ -40,6 +40,7 @@
 */

 #include <unordered_map>
+#include <boost/algorithm/string.hpp>
 #include "permission.hh"

 const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
 }

 auth::permission auth::permissions::from_string(const sstring& s) {
-    return permission_names.at(s);
+    sstring upper(s);
+    boost::to_upper(upper);
+    return permission_names.at(upper);
 }

 std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -28,7 +28,11 @@ class checked_file_impl : public file_impl {
 public:

    checked_file_impl(disk_error_signal_type& s, file f)
-            : _signal(s) , _file(f) {}
+            : _signal(s) , _file(f) {
+        _memory_dma_alignment = f.memory_dma_alignment();
+        _disk_read_dma_alignment = f.disk_read_dma_alignment();
+        _disk_write_dma_alignment = f.disk_write_dma_alignment();
+    }

    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
        return do_io_check(_signal, [&] {
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -0,0 +1,127 @@
+
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "keys.hh"
+#include "schema.hh"
+#include "range.hh"
+
+/**
+ * Represents the kind of bound in a range tombstone.
+ */
+enum class bound_kind : uint8_t {
+    excl_end = 0, // end bound that does not include its prefix
+    incl_start = 1, // start bound that includes its prefix
+    // values 2 to 5 are reserved for forward Origin compatibility
+    incl_end = 6, // end bound that includes its prefix
+    excl_start = 7, // start bound that does not include its prefix
+};
+
+std::ostream& operator<<(std::ostream& out, const bound_kind k);
+
+bound_kind invert_kind(bound_kind k);
+int32_t weight(bound_kind k);
+
+static inline bound_kind flip_bound_kind(bound_kind bk)
+{
+    switch (bk) { // start <-> end; inclusiveness is kept
+    case bound_kind::incl_start: return bound_kind::incl_end;
+    case bound_kind::excl_start: return bound_kind::excl_end;
+    case bound_kind::incl_end: return bound_kind::incl_start;
+    case bound_kind::excl_end: return bound_kind::excl_start;
+    }
+    abort(); // reached only when bk holds a value that is not one of the four enumerators
+}
+
+class bound_view { // a (prefix, kind) pair; the prefix is held by reference only
+    const static thread_local clustering_key empty_prefix; // prefix used by bottom() and top()
+public:
+    const clustering_key_prefix& prefix; // not owned: the referenced key must outlive this view
+    bound_kind kind;
+    bound_view(const clustering_key_prefix& prefix, bound_kind kind)
+        : prefix(prefix)
+        , kind(kind)
+    { }
+    struct compare { // "less than" over bounds and bare prefixes
+        // To make it assignable and to avoid taking a schema_ptr, we
+        // wrap the schema reference.
+        std::reference_wrapper<const schema> _s;
+        compare(const schema& s) : _s(s)
+        { }
+        bool operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const { // core form: orders (prefix, weight) pairs
+            auto type = _s.get().clustering_key_prefix_type();
+            auto res = prefix_equality_tri_compare(type->types().begin(),
+                type->begin(p1), type->end(p1),
+                type->begin(p2), type->end(p2),
+                tri_compare); // presumably compares only the components both prefixes have - TODO confirm
+            if (res) { // the prefixes themselves differ: that ordering decides
+                return res < 0;
+            }
+            auto d1 = p1.size(_s);
+            auto d2 = p2.size(_s);
+            if (d1 == d2) { // same number of components: the weights break the tie
+                return w1 < w2;
+            }
+            return d1 < d2 ? w1 <= 0 : w2 > 0; // otherwise the weight of the shorter prefix decides
+        }
+        bool operator()(const bound_view b, const clustering_key_prefix& p) const {
+            return operator()(b.prefix, weight(b.kind), p, 0); // a bare prefix is given weight 0
+        }
+        bool operator()(const clustering_key_prefix& p, const bound_view b) const {
+            return operator()(p, 0, b.prefix, weight(b.kind)); // a bare prefix is given weight 0
+        }
+        bool operator()(const bound_view b1, const bound_view b2) const {
+            return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
+        }
+    };
+    bool equal(const schema& s, const bound_view other) const { // same kind and equal prefixes
+        return kind == other.kind && prefix.equal(s, other.prefix);
+    }
+    bool adjacent(const schema& s, const bound_view other) const { // equal prefixes, and kind equals invert_kind(other.kind)
+        return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
+    }
+    static bound_view bottom() { // the empty prefix as an inclusive start
+        return {empty_prefix, bound_kind::incl_start};
+    }
+    static bound_view top() { // the empty prefix as an inclusive end
+        return {empty_prefix, bound_kind::incl_end};
+    }
+    /*
+    template<template<typename> typename T, typename U>
+    concept bool Range() {
+        return requires (T<U> range) {
+            { range.start() } -> stdx::optional<U>;
+            { range.end() } -> stdx::optional<U>;
+        };
+    };*/
+    template<template<typename> typename Range>
+    static std::pair<bound_view, bound_view> from_range(const Range<clustering_key_prefix>& range) { // returns {start view, end view}
+        return { // NOTE(review): the views presumably refer to keys stored inside `range`, so it must outlive them - confirm value() returns a reference
+            range.start() ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start) : bottom(), // no start -> bottom()
+            range.end() ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end) : top(), // no end -> top()
+        };
+    }
+    friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
+        return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
+    }
+};
--- a/clustering_key_filter.cc
+++ b/clustering_key_filter.cc
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "clustering_key_filter.hh"
+#include "keys.hh"
+#include "query-request.hh"
+#include "range.hh"
+
+namespace query {
+
+const clustering_row_ranges&
+clustering_key_filtering_context::get_ranges(const partition_key& key) const {
+    static thread_local clustering_row_ranges full_range = {{}}; // holds one default-constructed range; returned when no factory is set
+    return _factory ? _factory->get_ranges(key) : full_range; // otherwise the factory decides
+}
+
+clustering_key_filtering_context clustering_key_filtering_context::create_no_filtering() {
+    return clustering_key_filtering_context{}; // default-constructed: no factory attached
+}
+
+const clustering_key_filtering_context no_clustering_key_filtering = // shared context that carries no factory
+    clustering_key_filtering_context::create_no_filtering();
+
+class stateless_clustering_key_filter_factory : public clustering_key_filter_factory { // answers every partition key with the same filter and ranges
+    clustering_key_filter _filter;
+    clustering_row_ranges _ranges;
+public:
+    stateless_clustering_key_filter_factory(clustering_row_ranges&& ranges,
+                                    clustering_key_filter&& filter)
+        : _filter(std::move(filter)), _ranges(std::move(ranges)) {} // both arguments are moved into the members
+
+    virtual clustering_key_filter get_filter(const partition_key& key) override { // key is ignored
+        return _filter;
+    }
+
+    virtual clustering_key_filter get_filter_for_sorted(const partition_key& key) override { // key is ignored; same filter as get_filter()
+        return _filter;
+    }
+
+    virtual const clustering_row_ranges& get_ranges(const partition_key& key) override { // key is ignored
+        return _ranges;
+    }
+
+    virtual bool want_static_columns(const partition_key& key) override { // always true
+        return true;
+    }
+};
+
+class partition_slice_clustering_key_filter_factory : public clustering_key_filter_factory { // filters and ranges come from a partition_slice
+    schema_ptr _schema;
+    const partition_slice& _slice; // stored by reference: the slice has to outlive this factory
+    clustering_key_prefix::prefix_equal_tri_compare _cmp;
+    clustering_row_ranges _ck_ranges; // scratch copy handed out by get_ranges() for reversed slices
+public:
+    partition_slice_clustering_key_filter_factory(schema_ptr s, const partition_slice& slice)
+        : _schema(std::move(s)), _slice(slice), _cmp(*_schema) {}
+
+    virtual clustering_key_filter get_filter(const partition_key& key) override {
+        const clustering_row_ranges& ranges = _slice.row_ranges(*_schema, key);
+        return [this, &ranges] (const clustering_key& key) { // accepts a key contained in at least one of the ranges
+            return std::any_of(std::begin(ranges), std::end(ranges),
+                [this, &key] (const clustering_range& r) { return r.contains(key, _cmp); });
+        }; // NOTE(review): the closure refers to this factory and to `ranges`; it must not outlive either
+    }
+
+    virtual clustering_key_filter get_filter_for_sorted(const partition_key& key) override { // same predicate as get_filter()
+        const clustering_row_ranges& ranges = _slice.row_ranges(*_schema, key);
+        return [this, &ranges] (const clustering_key& key) {
+            return std::any_of(std::begin(ranges), std::end(ranges),
+                [this, &key] (const clustering_range& r) { return r.contains(key, _cmp); });
+        }; // NOTE(review): same lifetime caveat as in get_filter()
+    }
+
+    virtual const clustering_row_ranges& get_ranges(const partition_key& key) override {
+        if (_slice.options.contains(query::partition_slice::option::reversed)) {
+            _ck_ranges = _slice.row_ranges(*_schema, key); // copy, so that the order can be changed
+            std::reverse(_ck_ranges.begin(), _ck_ranges.end());
+            return _ck_ranges; // NOTE(review): this member is overwritten by the next call that takes this branch
+        }
+        return _slice.row_ranges(*_schema, key); // not reversed: the slice's ranges are returned as they are
+    }
+
+    virtual bool want_static_columns(const partition_key& key) override { // always true
+        return true;
+    }
+};
+
+static shared_ptr<clustering_key_filter_factory> // plain by-value return: a const-qualified return type would only prevent moving from the result
+create_partition_slice_filter(schema_ptr s, const partition_slice& slice) { // wraps (s, slice) in a partition_slice_clustering_key_filter_factory
+    return ::make_shared<partition_slice_clustering_key_filter_factory>(std::move(s), slice); // the factory keeps a reference to slice, so slice must outlive it
+}
+
+const clustering_key_filtering_context
+clustering_key_filtering_context::create(schema_ptr schema, const partition_slice& slice) { // chooses a context for slice; the result may keep a reference to slice
+    static thread_local clustering_key_filtering_context accept_all = clustering_key_filtering_context( // one default-constructed range, filter always true
+        ::make_shared<stateless_clustering_key_filter_factory>(clustering_row_ranges{{}},
+                                                       [](const clustering_key&) { return true; }));
+    static thread_local clustering_key_filtering_context reject_all = clustering_key_filtering_context( // no ranges, filter always false
+        ::make_shared<stateless_clustering_key_filter_factory>(clustering_row_ranges{},
+                                                       [](const clustering_key&) { return false; }));
+
+    if (slice.get_specific_ranges()) { // specific ranges are present: the answer has to come from the slice
+        return clustering_key_filtering_context(create_partition_slice_filter(std::move(schema), slice)); // last use of schema on this path, so it is moved
+    }
+
+    const clustering_row_ranges& ranges = slice.default_row_ranges();
+
+    if (ranges.empty()) { // nothing can match
+        return reject_all;
+    }
+
+    if (ranges.size() == 1 && ranges[0].is_full()) { // a single full range matches everything
+        return accept_all;
+    }
+    return clustering_key_filtering_context(create_partition_slice_filter(std::move(schema), slice)); // last use of schema, so it is moved
+}
+
+}
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <functional>
+#include <vector>
+
+#include "core/shared_ptr.hh"
+#include "database_fwd.hh"
+#include "schema.hh"
+
+template<typename T> class range;
+
+namespace query {
+
+class partition_slice;
+
+// A predicate that tells if a clustering key should be accepted.
+using clustering_key_filter = std::function<bool(const clustering_key&)>;
+
+// A factory for clustering key filter which can be reused for multiple clustering keys.
+class clustering_key_filter_factory {
+public:
+    // Create a clustering key filter that can be used for multiple clustering keys with no restrictions.
+    virtual clustering_key_filter get_filter(const partition_key&) = 0;
+    // Create a clustering key filter that can be used for multiple clustering keys but they have to be sorted.
+    virtual clustering_key_filter get_filter_for_sorted(const partition_key&) = 0;
+    virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key&) = 0; // returned by reference, so the implementation has to keep the vector alive
+    // Whether we want to get the static row, in addition to the desired clustering rows
+    virtual bool want_static_columns(const partition_key&) = 0;
+
+    virtual ~clustering_key_filter_factory() = default; // virtual: implementations are held through a pointer to this base
+};
+
+class clustering_key_filtering_context {
+private:
+    shared_ptr<clustering_key_filter_factory> _factory; // may be unset; the inline accessors below then accept everything
+    clustering_key_filtering_context() {}; // private: used by create_no_filtering()
+    clustering_key_filtering_context(shared_ptr<clustering_key_filter_factory> factory) : _factory(factory) {} // private: used by create()
+public:
+    // Create a clustering key filter that can be used for multiple clustering keys with no restrictions.
+    clustering_key_filter get_filter(const partition_key& key) const {
+        return _factory ? _factory->get_filter(key) : [] (const clustering_key&) { return true; }; // no factory: always-true predicate
+    }
+    // Create a clustering key filter that can be used for multiple clustering keys but they have to be sorted.
+    clustering_key_filter get_filter_for_sorted(const partition_key& key) const {
+        return _factory ? _factory->get_filter_for_sorted(key) : [] (const clustering_key&) { return true; }; // no factory: always-true predicate
+    }
+    const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) const; // defined out of line
+
+    bool want_static_columns(const partition_key& key)  const {
+        return _factory ? _factory->want_static_columns(key) : true; // no factory: static columns are wanted
+    }
+
+    static const clustering_key_filtering_context create(schema_ptr, const partition_slice&); // context derived from a partition_slice
+
+    static clustering_key_filtering_context create_no_filtering(); // context without a factory
+};
+
+extern const clustering_key_filtering_context no_clustering_key_filtering;
+
+}
--- a/compaction_strategy.hh
+++ b/compaction_strategy.hh
@@ -22,6 +22,8 @@
 #pragma once

 class column_family;
+class schema;
+using schema_ptr = lw_shared_ptr<const schema>;

 namespace sstables {

@@ -30,11 +32,12 @@ enum class compaction_strategy_type {
    major,
    size_tiered,
    leveled,
-    // FIXME: Add support to DateTiered.
+    date_tiered,
 };

 class compaction_strategy_impl;
 class sstable;
+class sstable_set;
 struct compaction_descriptor;

 class compaction_strategy {
@@ -51,6 +54,16 @@ public:
    // Return a list of sstables to be compacted after applying the strategy.
    compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<lw_shared_ptr<sstable>> candidates);

+    // Some strategies may look at the compacted and resulting sstables to
+    // get some useful information for subsequent compactions.
+    void notify_completion(schema_ptr schema, const std::vector<lw_shared_ptr<sstable>>& removed, const std::vector<lw_shared_ptr<sstable>>& added);
+
+    // Return whether parallel compaction is allowed by the strategy.
+    bool parallel_compaction() const;
+
+    // An estimate of the number of compactions needed for the strategy to be satisfied.
+    int64_t estimated_pending_compactions(column_family& cf) const;
+
    static sstring name(compaction_strategy_type type) {
        switch (type) {
        case compaction_strategy_type::null:
@@ -61,6 +74,8 @@ public:
            return "SizeTieredCompactionStrategy";
        case compaction_strategy_type::leveled:
            return "LeveledCompactionStrategy";
+        case compaction_strategy_type::date_tiered:
+            return "DateTieredCompactionStrategy";
        default:
            throw std::runtime_error("Invalid Compaction Strategy");
        }
@@ -77,6 +92,8 @@ public:
            return compaction_strategy_type::size_tiered;
        } else if (short_name == "LeveledCompactionStrategy") {
            return compaction_strategy_type::leveled;
+        } else if (short_name == "DateTieredCompactionStrategy") {
+            return compaction_strategy_type::date_tiered;
        } else {
            throw exceptions::configuration_exception(sprint("Unable to find compaction strategy class '%s'", name));
        }
@@ -87,6 +104,8 @@ public:
    sstring name() const {
        return name(type());
    }
+
+    sstable_set make_sstable_set(schema_ptr schema) const;
 };

 // Creates a compaction_strategy object from one of the strategies available.
--- a/compatible_ring_position.hh
+++ b/compatible_ring_position.hh
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "query-request.hh"
+#include <experimental/optional>
+
+// Wraps dht::ring_position so that it can be used where a default constructor
+// and stateless comparison operators are required.
+class compatible_ring_position {
+    const schema* _schema = nullptr; // not owned; null only in a default-constructed object
+    // optional only so that the class can have a default constructor
+    std::experimental::optional<dht::ring_position> _rp;
+public:
+    compatible_ring_position() noexcept = default; // leaves _schema null and _rp disengaged
+    compatible_ring_position(const schema& s, const dht::ring_position& rp)
+            : _schema(&s), _rp(rp) { // keeps a pointer to s: the schema must outlive this object
+    }
+    compatible_ring_position(const schema& s, dht::ring_position&& rp)
+            : _schema(&s), _rp(std::move(rp)) { // keeps a pointer to s: the schema must outlive this object
+    }
+    friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return x._rp->tri_compare(*x._schema, *y._rp); // NOTE(review): _rp and _schema are dereferenced unchecked, so neither operand may be default-constructed; only x's schema is used
+    }
+    friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) { // all six operators are defined through tri_compare
+        return tri_compare(x, y) < 0;
+    }
+    friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return tri_compare(x, y) <= 0;
+    }
+    friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return tri_compare(x, y) > 0;
+    }
+    friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return tri_compare(x, y) >= 0;
+    }
+    friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return tri_compare(x, y) == 0;
+    }
+    friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
+        return tri_compare(x, y) != 0;
+    }
+};
+
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -21,7 +21,10 @@

 #pragma once

+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/adaptor/transformed.hpp>
 #include "compound.hh"
+#include "schema.hh"

 //
 // This header provides adaptors between the representation used by our compound_type<>
@@ -180,3 +183,348 @@ bytes to_legacy(CompoundType& type, bytes_view packed) {
    std::copy(lv.begin(), lv.end(), legacy_form.begin());
    return legacy_form;
 }
+
+// Represents a value serialized according to Origin's CompositeType.
+// If is_compound is true, then the value is one or more components encoded as:
+//
+//   <representation> ::= ( <component> )+
+//   <component>      ::= <length> <value> <EOC>
+//   <length>         ::= <uint16_t>
+//   <EOC>            ::= <uint8_t>
+//
+// If false, then it encodes a single value, without a prefix length or a suffix EOC.
+class composite final {
+    bytes _bytes; // the serialized representation
+    bool _is_compound; // true: <length><value><EOC> components; false: a single bare value
+public:
+    composite(bytes&& b, bool is_compound)
+            : _bytes(std::move(b))
+            , _is_compound(is_compound)
+    { }
+
+    composite(bytes&& b) // implicit on purpose: the factories below return serialized bytes directly
+            : _bytes(std::move(b))
+            , _is_compound(true)
+    { }
+
+    composite() // empty compound value
+            : _bytes()
+            , _is_compound(true)
+    { }
+
+    using size_type = uint16_t;
+    using eoc_type = int8_t;
+
+    /*
+     * The 'end-of-component' byte should always be 0 for an actual column name.
+     * However, it can be set to 1 for query bounds. This allows to query for the
+     * equivalent of 'give me the full range'. That is, if a slice query is:
+     *   start = <3><"foo".getBytes()><0>
+     *   end   = <3><"foo".getBytes()><1>
+     * then we'll return *all* the columns whose first component is "foo".
+     * If for a component, the 'end-of-component' is != 0, there should not be any
+     * following component. The end-of-component can also be -1 to allow
+     * non-inclusive query. For instance:
+     *   end = <3><"foo".getBytes()><-1>
+     * allows to query everything that is smaller than <3><"foo".getBytes()>, but
+     * not <3><"foo".getBytes()> itself.
+     */
+    enum class eoc : eoc_type {
+        start = -1,
+        none = 0,
+        end = 1
+    };
+
+    using component = std::pair<bytes, eoc>; // owning (value, marker) pair
+    using component_view = std::pair<bytes_view, eoc>; // non-owning (value, marker) pair
+private:
+    template<typename Value, typename = std::enable_if_t<!std::is_same<data_value, std::decay_t<Value>>::value>>
+    static size_t size(Value& val) { // byte length of an already-serialized component; data_value uses the overload below
+        return val.size();
+    }
+    static size_t size(const data_value& val) {
+        return val.serialized_size();
+    }
+    template<typename Value, typename = std::enable_if_t<!std::is_same<data_value, std::decay_t<Value>>::value>>
+    static void write_value(Value&& val, bytes::iterator& out) { // copies the component's bytes and advances out
+        out = std::copy(val.begin(), val.end(), out);
+    }
+    static void write_value(const data_value& val, bytes::iterator& out) {
+        val.serialize(out);
+    }
+    template<typename RangeOfSerializedComponents>
+    static void serialize_value(RangeOfSerializedComponents&& values, bytes::iterator& out, bool is_compound) {
+        if (!is_compound) {
+            auto it = values.begin();
+            if (it != values.end()) { write_value(std::forward<decltype(*it)>(*it), out); } // empty range: nothing to write
+            return;
+        }
+
+        for (auto&& val : values) {
+            write<size_type>(out, static_cast<size_type>(size(val)));
+            write_value(std::forward<decltype(val)>(val), out);
+            // Range tombstones are not keys. For collections, only frozen
+            // values can be keys. Therefore, for as long as it is safe to
+            // assume that this code will be used to create keys, it is safe
+            // to assume the trailing byte is always zero.
+            write<eoc_type>(out, eoc_type(eoc::none));
+        }
+    }
+    template <typename RangeOfSerializedComponents>
+    static size_t serialized_size(RangeOfSerializedComponents&& values, bool is_compound) { // throws std::runtime_error on an oversized component
+        size_t len = 0;
+        auto it = values.begin();
+        if (it != values.end()) {
+            // CQL3 uses a specific prefix (0xFFFF) to encode "static columns"
+            // (CASSANDRA-6561). This does mean the maximum size of the first component of a
+            // composite is 65534, not 65535 (or we wouldn't be able to detect if the first 2
+            // bytes is the static prefix or not).
+            auto value_size = size(*it);
+            if (value_size > static_cast<size_type>(std::numeric_limits<size_type>::max() - uint8_t(is_compound))) {
+                throw std::runtime_error(sprint("First component size too large: %d > %d", value_size, std::numeric_limits<size_type>::max() - is_compound));
+            }
+            if (!is_compound) {
+                return value_size; // a non-compound value is the bare first component, without length or EOC
+            }
+            len += sizeof(size_type) + value_size + sizeof(eoc_type);
+            ++it;
+        }
+        for ( ; it != values.end(); ++it) {
+            auto value_size = size(*it);
+            if (value_size > std::numeric_limits<size_type>::max()) {
+                throw std::runtime_error(sprint("Component size too large: %d > %d", value_size, std::numeric_limits<size_type>::max()));
+            }
+            len += sizeof(size_type) + value_size + sizeof(eoc_type);
+        }
+        return len;
+    }
+public:
+    template <typename Describer>
+    auto describe_type(Describer f) const {
+        return f(const_cast<bytes&>(_bytes)); // the describer receives the buffer as a mutable reference
+    }
+
+    template<typename RangeOfSerializedComponents>
+    static bytes serialize_value(RangeOfSerializedComponents&& values, bool is_compound = true) {
+        auto size = serialized_size(values, is_compound); // sizing pass first, so the buffer is allocated once
+        bytes b(bytes::initialized_later(), size);
+        auto i = b.begin();
+        serialize_value(std::forward<decltype(values)>(values), i, is_compound);
+        return b;
+    }
+
+    class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
+        bytes_view _v; // bytes not yet consumed; a null view marks the end position
+        component_view _current;
+    private:
+        static eoc to_eoc(eoc_type eoc_byte) { // negative -> start, zero -> none, positive -> end
+            return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
+        }
+
+        void read_current() {
+            size_type len;
+            {
+                if (_v.empty()) {
+                    _v = bytes_view(nullptr, 0); // exhausted: become equal to the end iterator
+                    return;
+                }
+                len = read_simple<size_type>(_v);
+                if (_v.size() < len) {
+                    throw marshal_exception(); // declared length exceeds the remaining bytes
+                }
+            }
+            auto value = bytes_view(_v.begin(), len);
+            _v.remove_prefix(len);
+            _current = component_view(std::move(value), to_eoc(read_simple<eoc_type>(_v)));
+        }
+    public:
+        struct end_iterator_tag {};
+
+        iterator(const bytes_view& v, bool is_compound, bool is_static)
+                : _v(v) {
+            if (is_static) {
+                _v.remove_prefix(2); // skip the two-byte static marker
+            }
+            if (is_compound) {
+                read_current();
+            } else {
+                _current = component_view(_v, eoc::none); // the whole buffer is the single component
+                _v.remove_prefix(_v.size());
+            }
+        }
+
+        iterator(end_iterator_tag) : _v(nullptr, 0) {}
+
+        iterator& operator++() {
+            read_current();
+            return *this;
+        }
+
+        iterator operator++(int) {
+            iterator i(*this);
+            ++(*this);
+            return i;
+        }
+
+        const value_type& operator*() const { return _current; }
+        const value_type* operator->() const { return &_current; }
+        bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); } // compares the start of the unread remainder
+        bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
+    };
+
+    iterator begin() const {
+        return iterator(_bytes, _is_compound, is_static());
+    }
+
+    iterator end() const {
+        return iterator(iterator::end_iterator_tag());
+    }
+
+    boost::iterator_range<iterator> components() const & { // views into _bytes; lvalue composites only
+        return { begin(), end() };
+    }
+
+    auto values() const & {
+        return components() | boost::adaptors::transformed([](auto&& c) { return c.first; });
+    }
+
+    std::vector<component> components() const && { // rvalue composite: return owning copies instead of views
+        std::vector<component> result;
+        std::transform(begin(), end(), std::back_inserter(result), [](auto&& p) {
+            return component(bytes(p.first.begin(), p.first.end()), p.second);
+        });
+        return result;
+    }
+
+    std::vector<bytes> values() const && { // rvalue composite: owning copies of the component values
+        std::vector<bytes> result;
+        boost::copy(components() | boost::adaptors::transformed([](auto&& c) { return to_bytes(c.first); }), std::back_inserter(result));
+        return result;
+    }
+
+    const bytes& get_bytes() const {
+        return _bytes;
+    }
+
+    size_t size() const { // length of the serialized form in bytes, not the number of components
+        return _bytes.size();
+    }
+
+    bool empty() const {
+        return _bytes.empty();
+    }
+
+    static bool is_static(bytes_view bytes, bool is_compound) {
+        return is_compound && bytes.size() > 2 && (bytes[0] & bytes[1] & 0xff) == 0xff; // two leading 0xff bytes followed by more data
+    }
+
+    bool is_static() const {
+        return is_static(_bytes, _is_compound);
+    }
+
+    bool is_compound() const {
+        return _is_compound;
+    }
+
+    // The following factory functions assume this composite is a compound value.
+    template <typename ClusteringElement>
+    static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
+        return serialize_value(ce.components(s));
+    }
+
+    static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
+        if (v.size() == 0) {
+            return bytes(size_t(1), bytes::value_type(marker)); // no components: a single byte holding the marker
+        }
+        auto b = serialize_value(v);
+        b.back() = eoc_type(marker); // overwrite the EOC byte of the last component
+        return composite(std::move(b));
+    }
+
+    static composite static_prefix(const schema& s) {
+        static bytes static_marker(size_t(2), bytes::value_type(0xff));
+
+        std::vector<bytes_view> sv(s.clustering_key_size()); // one empty component per clustering column
+        return static_marker + serialize_value(sv);
+    }
+
+    explicit operator bytes_view() const {
+        return _bytes;
+    }
+
+    template <typename Component>
+    friend inline std::ostream& operator<<(std::ostream& os, const std::pair<Component, eoc>& c) {
+        return os << "{value=" << c.first << "; eoc=" << sprint("0x%02x", eoc_type(c.second) & 0xff) << "}";
+    }
+};
+
+class composite_view final { // non-owning counterpart of composite: same encoding over a bytes_view
+    bytes_view _bytes;
+    bool _is_compound;
+public:
+    composite_view(bytes_view b, bool is_compound = true)
+            : _bytes(b)
+            , _is_compound(is_compound)
+    { }
+
+    composite_view(const composite& c) // views c's buffer; c must outlive this view
+            : composite_view(static_cast<bytes_view>(c), c.is_compound())
+    { }
+
+    composite_view()
+            : _bytes(nullptr, 0)
+            , _is_compound(true)
+    { }
+
+    std::vector<bytes> explode() const { // owning copies of the component values, in order
+        if (!_is_compound) {
+            return { to_bytes(_bytes) };
+        }
+
+        std::vector<bytes> ret;
+        for (auto it = begin(), e = end(); it != e; ) {
+            ret.push_back(to_bytes(it->first));
+            auto marker = it->second;
+            ++it;
+            if (it != e && marker != composite::eoc::none) { // only the last component may carry a non-zero marker
+                throw runtime_exception(sprint("non-zero component divider found (0x%02x) mid", composite::eoc_type(marker) & 0xff));
+            }
+        }
+        return ret;
+    }
+
+    composite::iterator begin() const {
+        return composite::iterator(_bytes, _is_compound, is_static());
+    }
+
+    composite::iterator end() const {
+        return composite::iterator(composite::iterator::end_iterator_tag());
+    }
+
+    boost::iterator_range<composite::iterator> components() const {
+        return { begin(), end() };
+    }
+
+    auto values() const { // lazily yields each component's value view
+        return components() | boost::adaptors::transformed([](auto&& c) { return c.first; });
+    }
+
+    size_t size() const { // length of the viewed bytes, not the number of components
+        return _bytes.size();
+    }
+
+    bool empty() const {
+        return _bytes.empty();
+    }
+
+    bool is_static() const {
+        return composite::is_static(_bytes, _is_compound);
+    }
+
+    explicit operator bytes_view() const {
+        return _bytes;
+    }
+
+    bool operator==(const composite_view& k) const { return k._bytes == _bytes && k._is_compound == _is_compound; }
+    bool operator!=(const composite_view& k) const { return !(k == *this); }
+};
--- a/compress.hh
+++ b/compress.hh
@@ -32,7 +32,7 @@ enum class compressor {

 class compression_parameters {
 public:
-    static constexpr int32_t DEFAULT_CHUNK_LENGTH = 64 * 1024;
+    static constexpr int32_t DEFAULT_CHUNK_LENGTH = 4 * 1024;
    static constexpr double DEFAULT_CRC_CHECK_CHANCE = 1.0;

    static constexpr auto SSTABLE_COMPRESSION = "sstable_compression";
--- a/conf/housekeeping.cfg
+++ b/conf/housekeeping.cfg
@@ -0,0 +1,2 @@
+[housekeeping]
+check-version: True
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -784,7 +784,7 @@ commitlog_total_space_in_mb: -1
 # can be:  all  - all traffic is compressed
 #          dc   - traffic between different datacenters is compressed
 #          none - nothing is compressed.
-# internode_compression: all
+# internode_compression: none

 # Enable or disable tcp_nodelay for inter-dc communication.
 # Disabling it will result in larger (but fewer) network packets being sent,
@@ -805,3 +805,11 @@ commitlog_total_space_in_mb: -1
 # true: relaxed environment checks; performance and reliability may degraade.
 #
 # developer_mode: false
+
+
+# Idle-time background processing
+#
+# Scylla can perform certain jobs in the background while the system is otherwise idle,
+# freeing processor resources when there is other work to be done.
+#
+# defragment_memory_on_idle: true
--- a/configure.py
+++ b/configure.py
@@ -162,6 +162,7 @@ modes = {

 scylla_tests = [
    'tests/mutation_test',
+    'tests/streamed_mutation_test',
    'tests/schema_registry_test',
    'tests/canonical_mutation_test',
    'tests/range_test',
@@ -216,6 +217,9 @@ scylla_tests = [
    'tests/dynamic_bitset_test',
    'tests/auth_test',
    'tests/idl_test',
+    'tests/range_tombstone_list_test',
+    'tests/anchorless_list_test',
+    'tests/database_test',
 ]

 apps = [
@@ -278,6 +282,8 @@ scylla_core = (['database.cc',
                 'schema_registry.cc',
                 'bytes.cc',
                 'mutation.cc',
+                 'streamed_mutation.cc',
+                 'partition_version.cc',
                 'row_cache.cc',
                 'canonical_mutation.cc',
                 'frozen_mutation.cc',
@@ -293,16 +299,15 @@ scylla_core = (['database.cc',
                 'mutation_query.cc',
                 'key_reader.cc',
                 'keys.cc',
+                 'clustering_key_filter.cc',
                 'sstables/sstables.cc',
                 'sstables/compress.cc',
                 'sstables/row.cc',
-                 'sstables/key.cc',
                 'sstables/partition.cc',
                 'sstables/filter.cc',
                 'sstables/compaction.cc',
                 'sstables/compaction_strategy.cc',
                 'sstables/compaction_manager.cc',
-                 'log.cc',
                 'transport/event.cc',
                 'transport/event_notifier.cc',
                 'transport/server.cc',
@@ -351,6 +356,7 @@ scylla_core = (['database.cc',
                 'cql3/statements/grant_statement.cc',
                 'cql3/statements/revoke_statement.cc',
                 'cql3/statements/alter_type_statement.cc',
+                 'cql3/statements/alter_keyspace_statement.cc',
                 'cql3/update_parameters.cc',
                 'cql3/ut_name.cc',
                 'cql3/user_options.cc',
@@ -369,6 +375,7 @@ scylla_core = (['database.cc',
                 'cql3/operator.cc',
                 'cql3/relation.cc',
                 'cql3/column_identifier.cc',
+                 'cql3/column_specification.cc',
                 'cql3/constants.cc',
                 'cql3/query_processor.cc',
                 'cql3/query_options.cc',
@@ -384,6 +391,7 @@ scylla_core = (['database.cc',
                 'cql3/selection/selection.cc',
                 'cql3/selection/selector.cc',
                 'cql3/restrictions/statement_restrictions.cc',
+                 'cql3/result_set.cc',
                 'db/consistency_level.cc',
                 'db/system_keyspace.cc',
                 'db/schema_tables.cc',
@@ -470,6 +478,12 @@ scylla_core = (['database.cc',
                 'auth/data_resource.cc',
                 'auth/password_authenticator.cc',
                 'auth/permission.cc',
+                 'tracing/tracing.cc',
+                 'tracing/trace_keyspace_helper.cc',
+                 'tracing/trace_state.cc',
+                 'range_tombstone.cc',
+                 'range_tombstone_list.cc',
+                 'db/size_estimates_recorder.cc'
                 ]
                + [Antlr3Grammar('cql3/Cql.g')]
                + [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -529,6 +543,7 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/query.idl.hh',
        'idl/idl_test.idl.hh',
        'idl/commitlog.idl.hh',
+        'idl/tracing.idl.hh',
        ]

 scylla_tests_dependencies = scylla_core + api + idls + [
@@ -551,8 +566,6 @@ tests_not_using_seastar_test_framework = set([
    'tests/keys_test',
    'tests/partitioner_test',
    'tests/map_difference_test',
-    'tests/frozen_mutation_test',
-    'tests/canonical_mutation_test',
    'tests/perf/perf_mutation',
    'tests/lsa_async_eviction_test',
    'tests/lsa_sync_eviction_test',
@@ -573,6 +586,8 @@ tests_not_using_seastar_test_framework = set([
    'tests/managed_vector_test',
    'tests/dynamic_bitset_test',
    'tests/idl_test',
+    'tests/range_tombstone_list_test',
+    'tests/anchorless_list_test',
 ])

 for t in tests_not_using_seastar_test_framework:
@@ -589,7 +604,8 @@ deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc']
 deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc']
 deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc']
 deps['tests/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'tests/murmur_hash_test.cc']
-deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'log.cc', 'utils/dynamic_bitset.cc']
+deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']

 warnings = [
    '-Wno-mismatched-tags',  # clang-only
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -35,7 +35,7 @@ class converting_mutation_partition_applier : public mutation_partition_visitor
    deletable_row* _current_row;
 private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
-        return new_def.kind == kind && new_def.type->is_value_compatible_with(*old_type);
+        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
    void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {
@@ -90,8 +90,8 @@ public:
        }
    }

-    virtual void accept_row_tombstone(clustering_key_prefix_view prefix, tombstone t) override {
-        _p.apply_row_tombstone(_p_schema, prefix, t);
+    virtual void accept_row_tombstone(const range_tombstone& rt) override {
+        _p.apply_row_tombstone(_p_schema, rt);
    }

    virtual void accept_row(clustering_key_view key, tombstone deleted_at, const row_marker& rm) override {
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -32,6 +32,9 @@ options {

@parser::includes {
 #include "cql3/selection/writetime_or_ttl.hh"
+#include "cql3/statements/raw/parsed_statement.hh"
+#include "cql3/statements/raw/select_statement.hh"
+#include "cql3/statements/alter_keyspace_statement.hh"
 #include "cql3/statements/alter_table_statement.hh"
 #include "cql3/statements/create_keyspace_statement.hh"
 #include "cql3/statements/drop_keyspace_statement.hh"
@@ -43,12 +46,12 @@ options {
 #include "cql3/statements/property_definitions.hh"
 #include "cql3/statements/drop_table_statement.hh"
 #include "cql3/statements/truncate_statement.hh"
-#include "cql3/statements/select_statement.hh"
-#include "cql3/statements/update_statement.hh"
-#include "cql3/statements/delete_statement.hh"
+#include "cql3/statements/raw/update_statement.hh"
+#include "cql3/statements/raw/insert_statement.hh"
+#include "cql3/statements/raw/delete_statement.hh"
 #include "cql3/statements/index_prop_defs.hh"
-#include "cql3/statements/use_statement.hh"
-#include "cql3/statements/batch_statement.hh"
+#include "cql3/statements/raw/use_statement.hh"
+#include "cql3/statements/raw/batch_statement.hh"
 #include "cql3/statements/create_user_statement.hh"
 #include "cql3/statements/alter_user_statement.hh"
 #include "cql3/statements/drop_user_statement.hh"
@@ -294,11 +297,11 @@ struct uninitialized {

 /** STATEMENTS **/

-query returns [shared_ptr<parsed_statement> stmnt]
+query returns [shared_ptr<raw::parsed_statement> stmnt]
    : st=cqlStatement (';')* EOF { $stmnt = st; }
    ;

-cqlStatement returns [shared_ptr<parsed_statement> stmt]
+cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
    @after{ if (stmt) { stmt->set_bound_variables(_bind_variables); } }
    : st1= selectStatement             { $stmt = st1; }
    | st2= insertStatement             { $stmt = st2; }
@@ -316,9 +319,7 @@ cqlStatement returns [shared_ptr<parsed_statement> stmt]
    | st13=dropIndexStatement          { $stmt = st13; }
 #endif
    | st14=alterTableStatement         { $stmt = st14; }
-#if 0
    | st15=alterKeyspaceStatement      { $stmt = st15; }
-#endif
    | st16=grantStatement              { $stmt = st16; }
    | st17=revokeStatement             { $stmt = st17; }
    | st18=listPermissionsStatement    { $stmt = st18; }
@@ -344,8 +345,8 @@ cqlStatement returns [shared_ptr<parsed_statement> stmt]
 /*
 * USE <KEYSPACE>;
 */
-useStatement returns [::shared_ptr<use_statement> stmt]
-    : K_USE ks=keyspaceName { $stmt = ::make_shared<use_statement>(ks); }
+useStatement returns [::shared_ptr<raw::use_statement> stmt]
+    : K_USE ks=keyspaceName { $stmt = ::make_shared<raw::use_statement>(ks); }
    ;

 /**
@@ -354,11 +355,11 @@ useStatement returns [::shared_ptr<use_statement> stmt]
 * WHERE KEY = "key1" AND COL > 1 AND COL < 100
 * LIMIT <NUMBER>;
 */
-selectStatement returns [shared_ptr<select_statement::raw_statement> expr]
+selectStatement returns [shared_ptr<raw::select_statement> expr]
    @init {
        bool is_distinct = false;
        ::shared_ptr<cql3::term::raw> limit;
-        select_statement::parameters::orderings_type orderings;
+        raw::select_statement::parameters::orderings_type orderings;
        bool allow_filtering = false;
    }
    : K_SELECT ( ( K_DISTINCT { is_distinct = true; } )?
@@ -371,8 +372,8 @@ selectStatement returns [shared_ptr<select_statement::raw_statement> expr]
      ( K_LIMIT rows=intValue { limit = rows; } )?
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
      {
-          auto params = ::make_shared<select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering);
-          $expr = ::make_shared<select_statement::raw_statement>(std::move(cf), std::move(params),
+          auto params = ::make_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering);
+          $expr = ::make_shared<raw::select_statement>(std::move(cf), std::move(params),
            std::move(sclause), std::move(wclause), std::move(limit));
      }
    ;
@@ -426,7 +427,7 @@ whereClause returns [std::vector<cql3::relation_ptr> clause]
    : relation[$clause] (K_AND relation[$clause])*
    ;

-orderByClause[select_statement::parameters::orderings_type& orderings]
+orderByClause[raw::select_statement::parameters::orderings_type& orderings]
    @init{
        bool reversed = false;
    }
@@ -439,7 +440,7 @@ orderByClause[select_statement::parameters::orderings_type& orderings]
 * USING TIMESTAMP <long>;
 *
 */
-insertStatement returns [::shared_ptr<update_statement::parsed_insert> expr]
+insertStatement returns [::shared_ptr<raw::insert_statement> expr]
    @init {
        auto attrs = ::make_shared<cql3::attributes::raw>();
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
@@ -454,7 +455,7 @@ insertStatement returns [::shared_ptr<update_statement::parsed_insert> expr]
        ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
        ( usingClause[attrs] )?
      {
-          $expr = ::make_shared<update_statement::parsed_insert>(std::move(cf),
+          $expr = ::make_shared<raw::insert_statement>(std::move(cf),
                                                   std::move(attrs),
                                                   std::move(column_names),
                                                   std::move(values),
@@ -477,7 +478,7 @@ usingClauseObjective[::shared_ptr<cql3::attributes::raw> attrs]
 * SET name1 = value1, name2 = value2
 * WHERE key = value;
 */
-updateStatement returns [::shared_ptr<update_statement::parsed_update> expr]
+updateStatement returns [::shared_ptr<raw::update_statement> expr]
    @init {
        auto attrs = ::make_shared<cql3::attributes::raw>();
        std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>, ::shared_ptr<cql3::operation::raw_update>>> operations;
@@ -488,7 +489,7 @@ updateStatement returns [::shared_ptr<update_statement::parsed_update> expr]
      K_WHERE wclause=whereClause
      ( K_IF conditions=updateConditions )?
      {
-          return ::make_shared<update_statement::parsed_update>(std::move(cf),
+          return ::make_shared<raw::update_statement>(std::move(cf),
                                                  std::move(attrs),
                                                  std::move(operations),
                                                  std::move(wclause),
@@ -507,7 +508,7 @@ updateConditions returns [conditions_type conditions]
 * WHERE KEY = keyname
   [IF (EXISTS | name = value, ...)];
 */
-deleteStatement returns [::shared_ptr<delete_statement::parsed> expr]
+deleteStatement returns [::shared_ptr<raw::delete_statement> expr]
    @init {
        auto attrs = ::make_shared<cql3::attributes::raw>();
        std::vector<::shared_ptr<cql3::operation::raw_deletion>> column_deletions;
@@ -519,7 +520,7 @@ deleteStatement returns [::shared_ptr<delete_statement::parsed> expr]
      K_WHERE wclause=whereClause
      ( K_IF ( K_EXISTS { if_exists = true; } | conditions=updateConditions ))?
      {
-          return ::make_shared<delete_statement::parsed>(cf,
+          return ::make_shared<raw::delete_statement>(cf,
                                            std::move(attrs),
                                            std::move(column_deletions),
                                            std::move(wclause),
@@ -566,11 +567,11 @@ usingClauseDelete[::shared_ptr<cql3::attributes::raw> attrs]
 *   ...
 * APPLY BATCH
 */
-batchStatement returns [shared_ptr<cql3::statements::batch_statement::parsed> expr]
+batchStatement returns [shared_ptr<cql3::statements::raw::batch_statement> expr]
    @init {
-        using btype = cql3::statements::batch_statement::type; 
+        using btype = cql3::statements::raw::batch_statement::type; 
        btype type = btype::LOGGED;
-        std::vector<shared_ptr<cql3::statements::modification_statement::parsed>> statements;
+        std::vector<shared_ptr<cql3::statements::raw::modification_statement>> statements;
        auto attrs = make_shared<cql3::attributes::raw>();
    }
    : K_BEGIN
@@ -579,11 +580,11 @@ batchStatement returns [shared_ptr<cql3::statements::batch_statement::parsed> ex
          ( s=batchStatementObjective ';'? { statements.push_back(std::move(s)); } )*
      K_APPLY K_BATCH
      {
-          $expr = ::make_shared<cql3::statements::batch_statement::parsed>(type, std::move(attrs), std::move(statements));
+          $expr = ::make_shared<cql3::statements::raw::batch_statement>(type, std::move(attrs), std::move(statements));
      }
    ;

-batchStatementObjective returns [shared_ptr<cql3::statements::modification_statement::parsed> statement]
+batchStatementObjective returns [shared_ptr<cql3::statements::raw::modification_statement> statement]
    : i=insertStatement  { $statement = i; }
    | u=updateStatement  { $statement = u; }
    | d=deleteStatement  { $statement = d; }
@@ -809,15 +810,18 @@ dropTriggerStatement returns [DropTriggerStatement expr]
      { $expr = new DropTriggerStatement(cf, name.toString(), ifExists); }
    ;

+#endif
+
 /**
 * ALTER KEYSPACE <KS> WITH <property> = <value>;
 */
-alterKeyspaceStatement returns [AlterKeyspaceStatement expr]
-    @init { KSPropDefs attrs = new KSPropDefs(); }
+alterKeyspaceStatement returns [shared_ptr<cql3::statements::alter_keyspace_statement> expr]
+    @init {
+        auto attrs = make_shared<cql3::statements::ks_prop_defs>();
+    }
    : K_ALTER K_KEYSPACE ks=keyspaceName
-        K_WITH properties[attrs] { $expr = new AlterKeyspaceStatement(ks, attrs); }
+        K_WITH properties[attrs] { $expr = make_shared<cql3::statements::alter_keyspace_statement>(ks, attrs); }
    ;
-#endif

 /**
 * ALTER COLUMN FAMILY <CF> ALTER <column> TYPE <newtype>;
--- a/cql3/column_specification.cc
+++ b/cql3/column_specification.cc
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2016 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cql3/column_specification.hh"
+
+namespace cql3 {
+
+bool column_specification::all_in_same_table(const std::vector<::shared_ptr<column_specification>>& names)
+{
+    // Reports whether every specification names the same keyspace and table as the first one.
+    assert(!names.empty()); // there must be a first entry to compare the rest against
+    const auto& head = names.front();
+    return std::all_of(std::next(names.begin()), names.end(), [&head] (const auto& candidate) {
+        return candidate->ks_name == head->ks_name && candidate->cf_name == head->cf_name;
+    });
+}
+
+}
--- a/cql3/column_specification.hh
+++ b/cql3/column_specification.hh
@@ -75,6 +75,8 @@ public:
    bool is_reversed_type() const {
        return ::dynamic_pointer_cast<const reversed_type_impl>(type) != nullptr;
    }
+
+    static bool all_in_same_table(const std::vector<::shared_ptr<column_specification>>& names);
 };

 }
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -58,6 +58,9 @@ class result_message;

 namespace cql3 {

+class metadata;
+shared_ptr<const metadata> make_empty_metadata();
+
 class cql_statement {
 public:
    virtual ~cql_statement()
@@ -102,6 +105,15 @@ public:
    virtual bool depends_on_keyspace(const sstring& ks_name) const = 0;

    virtual bool depends_on_column_family(const sstring& cf_name) const = 0;
+
+    virtual shared_ptr<const metadata> get_result_metadata() const = 0;
+};
+
+class cql_statement_no_metadata : public cql_statement { // cql_statement whose result metadata is always the empty one
+public:
+    virtual shared_ptr<const metadata> get_result_metadata() const override {
+        return make_empty_metadata(); // supplies the pure virtual with the value of make_empty_metadata()
+    }
 };

 }
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -47,23 +47,23 @@ namespace cql3 {
 thread_local const query_options::specific_options query_options::specific_options::DEFAULT{-1, {}, {}, api::missing_timestamp};

 thread_local query_options query_options::DEFAULT{db::consistency_level::ONE, std::experimental::nullopt,
-    {}, false, query_options::specific_options::DEFAULT, cql_serialization_format::latest()};
+    std::vector<bytes_view_opt>(), false, query_options::specific_options::DEFAULT, cql_serialization_format::latest()};

 query_options::query_options(db::consistency_level consistency,
                             std::experimental::optional<std::vector<sstring_view>> names,
                             std::vector<bytes_opt> values,
-                             std::vector<bytes_view_opt> value_views,
                             bool skip_metadata,
                             specific_options options,
                             cql_serialization_format sf)
    : _consistency(consistency)
    , _names(std::move(names))
    , _values(std::move(values))
-    , _value_views(std::move(value_views))
+    , _value_views()
    , _skip_metadata(skip_metadata)
    , _options(std::move(options))
    , _cql_serialization_format(sf)
 {
+    fill_value_views();
 }

 query_options::query_options(db::consistency_level consistency,
@@ -72,15 +72,13 @@ query_options::query_options(db::consistency_level consistency,
                             bool skip_metadata,
                             specific_options options,
                             cql_serialization_format sf)
-    : query_options(
-          consistency,
-          std::move(names),
-          {},
-          std::move(value_views),
-          skip_metadata,
-          std::move(options),
-          sf
-      )
+    : _consistency(consistency)
+    , _names(std::move(names))
+    , _values()
+    , _value_views(std::move(value_views))
+    , _skip_metadata(skip_metadata)
+    , _options(std::move(options))
+    , _cql_serialization_format(sf)
 {
 }

@@ -100,19 +98,11 @@ query_options::query_options(db::consistency_level cl, std::vector<bytes_opt> va
          cl,
          {},
          std::move(values),
-          {},
          false,
          query_options::specific_options::DEFAULT,
          cql_serialization_format::latest()
      )
 {
-    for (auto&& value : _values) {
-        if (value) {
-            _value_views.emplace_back(bytes_view{*value});
-        } else {
-            _value_views.emplace_back(std::experimental::nullopt);
-        }
-    }
 }

 query_options::query_options(std::vector<bytes_opt> values)
@@ -214,6 +204,18 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
        }
    }
    _values = std::move(ordered_values);
+    fill_value_views();
+}
+
+void query_options::fill_value_views()
+{
+    // Rebuilds _value_views so it mirrors _values one-to-one (an empty optional for unset values).
+    // Clear first: prepare() calls this again after replacing _values, and views left over
+    // from construction would otherwise stay in front, pointing into the old buffers.
+    _value_views.clear();
+    for (auto&& value : _values) {
+        _value_views.emplace_back(value ? bytes_view_opt(bytes_view{*value}) : bytes_view_opt());
+    }
 }

 }
--- a/cql3/query_options.hh
+++ b/cql3/query_options.hh
@@ -83,7 +83,6 @@ public:
    explicit query_options(db::consistency_level consistency,
                           std::experimental::optional<std::vector<sstring_view>> names,
                           std::vector<bytes_opt> values,
-                           std::vector<bytes_view_opt> value_views,
                           bool skip_metadata,
                           specific_options options,
                           cql_serialization_format sf);
@@ -132,6 +131,8 @@ public:
    const specific_options& get_specific_options() const;
    const query_options& for_statement(size_t i) const;
    void prepare(const std::vector<::shared_ptr<column_specification>>& specs);
+private:
+    void fill_value_views();
 };

 }
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -58,7 +58,7 @@ logging::logger log("query_processor");

 distributed<query_processor> _the_query_processor;

-const sstring query_processor::CQL_VERSION = "3.2.0";
+const sstring query_processor::CQL_VERSION = "3.2.1";

 class query_processor::internal_state {
    service::query_state _qs;
@@ -115,6 +115,7 @@ future<::shared_ptr<result_message>>
 query_processor::process(const sstring_view& query_string, service::query_state& query_state, query_options& options)
 {
    log.trace("process: \"{}\"", query_string);
+    tracing::trace(query_state.get_trace_state(), "Parsing a statement");
    auto p = get_statement(query_string, query_state.get_client_state());
    options.prepare(p->bound_names);
    auto cql_statement = p->statement;
@@ -127,6 +128,7 @@ query_processor::process(const sstring_view& query_string, service::query_state&
        if (!queryState.getClientState().isInternal)
            metrics.regularStatementsExecuted.inc();
 #endif
+    tracing::trace(query_state.get_trace_state(), "Processing a statement");
    return process_statement(std::move(cql_statement), query_state, options);
 }

@@ -144,7 +146,7 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement, servic
        statement->validate(_proxy, client_state);

        future<::shared_ptr<transport::messages::result_message>> fut = make_ready_future<::shared_ptr<transport::messages::result_message>>();
-        if (client_state._is_internal) {
+        if (client_state.is_internal()) {
            fut = statement->execute_internal(_proxy, query_state, options);
        } else  {
            fut = statement->execute(_proxy, query_state, options);
@@ -187,25 +189,25 @@ query_processor::prepare(const std::experimental::string_view& query_string, con
 query_processor::get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift)
 {
    if (for_thrift) {
-        throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
-#if 0
-        Integer thriftStatementId = computeThriftId(queryString, keyspace);
-        ParsedStatement.Prepared existing = thriftPreparedStatements.get(thriftStatementId);
-        return existing == null ? null : ResultMessage.Prepared.forThrift(thriftStatementId, existing.boundNames);
-#endif
+        auto statement_id = compute_thrift_id(query_string, keyspace);
+        auto it = _thrift_prepared_statements.find(statement_id);
+        if (it == _thrift_prepared_statements.end()) {
+            return ::shared_ptr<result_message::prepared>();
+        }
+        return ::make_shared<result_message::prepared::thrift>(statement_id, it->second);
    } else {
        auto statement_id = compute_id(query_string, keyspace);
        auto it = _prepared_statements.find(statement_id);
        if (it == _prepared_statements.end()) {
            return ::shared_ptr<result_message::prepared>();
        }
-        return ::make_shared<result_message::prepared>(statement_id, it->second);
+        return ::make_shared<result_message::prepared::cql>(statement_id, it->second);
    }
 }

 future<::shared_ptr<transport::messages::result_message::prepared>>
 query_processor::store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace,
-        ::shared_ptr<statements::parsed_statement::prepared> prepared, bool for_thrift)
+        ::shared_ptr<statements::prepared_statement> prepared, bool for_thrift)
 {
 #if 0
    // Concatenate the current keyspace so we don't mix prepared statements between keyspace (#5352).
@@ -217,26 +219,20 @@ query_processor::store_prepared_statement(const std::experimental::string_view&
                                                        statementSize,
                                                        MAX_CACHE_PREPARED_MEMORY));
 #endif
+    prepared->raw_cql_statement = sstring(query_string.data(), query_string.size()); // a string_view is not NUL-terminated: copy exactly size() chars
    if (for_thrift) {
-        throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
-#if 0
-        Integer statementId = computeThriftId(queryString, keyspace);
-        thriftPreparedStatements.put(statementId, prepared);
-        return ResultMessage.Prepared.forThrift(statementId, prepared.boundNames);
-#endif
+        auto statement_id = compute_thrift_id(query_string, keyspace);
+        _thrift_prepared_statements.emplace(statement_id, prepared);
+        auto msg = ::make_shared<result_message::prepared::thrift>(statement_id, prepared);
+        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
    } else {
        auto statement_id = compute_id(query_string, keyspace);
        _prepared_statements.emplace(statement_id, prepared);
-        auto msg = ::make_shared<result_message::prepared>(statement_id, prepared);
+        auto msg = ::make_shared<result_message::prepared::cql>(statement_id, prepared);
        return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
    }
 }

-void query_processor::invalidate_prepared_statement(bytes statement_id)
-{
-    _prepared_statements.erase(statement_id);
-}
-
 static bytes md5_calculate(const std::experimental::string_view& s)
 {
    constexpr size_t size = CryptoPP::Weak1::MD5::DIGESTSIZE;
@@ -246,26 +242,35 @@ static bytes md5_calculate(const std::experimental::string_view& s)
    return std::move(bytes{reinterpret_cast<const int8_t*>(digest), size});
 }

-bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
-{
-    sstring to_hash;
-    if (!keyspace.empty()) {
-        to_hash += keyspace;
-    }
-    to_hash += query_string.to_string();
-    return md5_calculate(to_hash);
+static sstring hash_target(const std::experimental::string_view& query_string, const sstring& keyspace) { // text that the statement-id functions hash
+    return keyspace + query_string.to_string(); // keyspace first; an empty keyspace leaves just the query text
 }

-::shared_ptr<parsed_statement::prepared>
+bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+{
+    return md5_calculate(hash_target(query_string, keyspace)); // id = MD5 digest of keyspace + query text
+}
+
+int32_t query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
+{
+    const auto target = hash_target(query_string, keyspace); // keyspace + query text, built once and reused by the loop
+    uint32_t h = 0; // unsigned accumulator, so overflow wraps instead of being undefined
+    for (auto&& c : target) {
+        h = 31*h + c; // same polynomial shape as the Java String.hashCode() this replaces
+    }
+    return static_cast<int32_t>(h);
+}
+
+::shared_ptr<prepared_statement>
 query_processor::get_statement(const sstring_view& query, const service::client_state& client_state)
 {
 #if 0
        Tracing.trace("Parsing {}", queryStr);
 #endif
-    ::shared_ptr<parsed_statement> statement = parse_statement(query);
+    ::shared_ptr<raw::parsed_statement> statement = parse_statement(query);

    // Set keyspace for statement that require login
-    auto cf_stmt = dynamic_pointer_cast<cf_statement>(statement);
+    auto cf_stmt = dynamic_pointer_cast<raw::cf_statement>(statement);
    if (cf_stmt) {
        cf_stmt->prepare_keyspace(client_state);
    }
@@ -276,7 +281,7 @@ query_processor::get_statement(const sstring_view& query, const service::client_
    return statement->prepare(_db.local());
 }

-::shared_ptr<parsed_statement>
+::shared_ptr<raw::parsed_statement>
 query_processor::parse_statement(const sstring_view& query)
 {
    try {
@@ -309,7 +314,7 @@ query_processor::parse_statement(const sstring_view& query)
 }

 query_options query_processor::make_internal_options(
-                ::shared_ptr<statements::parsed_statement::prepared> p,
+                ::shared_ptr<statements::prepared_statement> p,
                const std::initializer_list<data_value>& values,
                db::consistency_level cl) {
    if (p->bound_names.size() != values.size()) {
@@ -330,7 +335,7 @@ query_options query_processor::make_internal_options(
    return query_options(cl, bound_values);
 }

-::shared_ptr<statements::parsed_statement::prepared> query_processor::prepare_internal(
+::shared_ptr<statements::prepared_statement> query_processor::prepare_internal(
        const sstring& query_string) {
    auto& p = _internal_statements[query_string];
    if (p == nullptr) {
@@ -352,7 +357,7 @@ future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
 }

 future<::shared_ptr<untyped_result_set>> query_processor::execute_internal(
-        ::shared_ptr<statements::parsed_statement::prepared> p,
+        ::shared_ptr<statements::prepared_statement> p,
        const std::initializer_list<data_value>& values) {
    auto opts = make_internal_options(p, values);
    return do_with(std::move(opts),
@@ -376,7 +381,7 @@ future<::shared_ptr<untyped_result_set>> query_processor::process(
 }

 future<::shared_ptr<untyped_result_set>> query_processor::process(
-                ::shared_ptr<statements::parsed_statement::prepared> p,
+                ::shared_ptr<statements::prepared_statement> p,
                db::consistency_level cl, const std::initializer_list<data_value>& values)
 {
    auto opts = make_internal_options(p, values, cl);
@@ -475,17 +480,9 @@ void query_processor::migration_subscriber::on_drop_aggregate(const sstring& ks_

 void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::experimental::optional<sstring> cf_name)
 {
-    std::vector<bytes> invalid;
-    for (auto& kv : _qp->_prepared_statements) {
-        auto id   = kv.first;
-        auto stmt = kv.second;
-        if (should_invalidate(ks_name, cf_name, stmt->statement)) {
-            invalid.emplace_back(id);
-        }
-    }
-    for (auto& id : invalid) {
-        _qp->invalidate_prepared_statement(id);
-    }
+    _qp->invalidate_prepared_statements([&] (::shared_ptr<cql_statement> stmt) {
+        return this->should_invalidate(ks_name, cf_name, stmt);
+    });
 }

 bool query_processor::migration_subscriber::should_invalidate(sstring ks_name, std::experimental::optional<sstring> cf_name, ::shared_ptr<cql_statement> statement)
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -47,11 +47,13 @@
 #include "core/shared_ptr.hh"
 #include "exceptions/exceptions.hh"
 #include "cql3/query_options.hh"
-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/parsed_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "service/migration_manager.hh"
 #include "service/query_state.hh"
 #include "log.hh"
 #include "core/distributed.hh"
+#include "statements/prepared_statement.hh"
 #include "transport/messages/result_message.hh"
 #include "untyped_result_set.hh"

@@ -118,10 +120,10 @@ private:
    };
 #endif

-    std::unordered_map<bytes, ::shared_ptr<statements::parsed_statement::prepared>> _prepared_statements;
-    std::unordered_map<sstring, ::shared_ptr<statements::parsed_statement::prepared>> _internal_statements;
+    std::unordered_map<bytes, ::shared_ptr<statements::prepared_statement>> _prepared_statements;
+    std::unordered_map<int32_t, ::shared_ptr<statements::prepared_statement>> _thrift_prepared_statements;
+    std::unordered_map<sstring, ::shared_ptr<statements::prepared_statement>> _internal_statements;
 #if 0
-    private static final ConcurrentLinkedHashMap<Integer, ParsedStatement.Prepared> thriftPreparedStatements;

    // A map for prepared statements used internally (which we don't want to mix with user statement, in particular we don't
    // bother with expiration on those.
@@ -211,20 +213,22 @@ private:
    }
 #endif
 public:
-    ::shared_ptr<statements::parsed_statement::prepared> get_prepared(const bytes& id) {
+    ::shared_ptr<statements::prepared_statement> get_prepared(const bytes& id) {
        auto it = _prepared_statements.find(id);
        if (it == _prepared_statements.end()) {
-            return ::shared_ptr<statements::parsed_statement::prepared>{};
+            return ::shared_ptr<statements::prepared_statement>{};
        }
        return it->second;
    }

-#if 0
-    public ParsedStatement.Prepared getPreparedForThrift(Integer id)
-    {
-        return thriftPreparedStatements.get(id);
+    ::shared_ptr<statements::prepared_statement> get_prepared_for_thrift(int32_t id) {
+        // Thrift counterpart of get_prepared(): looks the statement up by its int32 id.
+        // A default-constructed (null) pointer is returned when the id is not cached.
+        const auto found = _thrift_prepared_statements.find(id);
+        const bool cached = found != _thrift_prepared_statements.end();
+        return cached ? found->second : ::shared_ptr<statements::prepared_statement>{};
    }
-
+#if 0
    public static void validateKey(ByteBuffer key) throws InvalidRequestException
    {
        if (key == null || key.remaining() == 0)
@@ -328,23 +332,23 @@ public:
    }
 #endif
 private:
-    query_options make_internal_options(::shared_ptr<statements::parsed_statement::prepared>, const std::initializer_list<data_value>&, db::consistency_level = db::consistency_level::ONE);
+    query_options make_internal_options(::shared_ptr<statements::prepared_statement>, const std::initializer_list<data_value>&, db::consistency_level = db::consistency_level::ONE);
 public:
    future<::shared_ptr<untyped_result_set>> execute_internal(
            const sstring& query_string,
            const std::initializer_list<data_value>& = { });

-    ::shared_ptr<statements::parsed_statement::prepared> prepare_internal(const sstring& query);
+    ::shared_ptr<statements::prepared_statement> prepare_internal(const sstring& query);

    future<::shared_ptr<untyped_result_set>> execute_internal(
-            ::shared_ptr<statements::parsed_statement::prepared>,
+            ::shared_ptr<statements::prepared_statement>,
            const std::initializer_list<data_value>& = { });

    future<::shared_ptr<untyped_result_set>> process(
                    const sstring& query_string,
                    db::consistency_level, const std::initializer_list<data_value>& = { }, bool cache = false);
    future<::shared_ptr<untyped_result_set>> process(
-                    ::shared_ptr<statements::parsed_statement::prepared>,
+                    ::shared_ptr<statements::prepared_statement>,
                    db::consistency_level, const std::initializer_list<data_value>& = { });

    /*
@@ -429,22 +433,35 @@ public:
    prepare(const std::experimental::string_view& query_string, const service::client_state& client_state, bool for_thrift);

    static bytes compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
+    static int32_t compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);

-#if 0
-    private static Integer computeThriftId(String queryString, String keyspace)
-    {
-        String toHash = keyspace == null ? queryString : keyspace + queryString;
-        return toHash.hashCode();
-    }
-#endif
 private:
    ::shared_ptr<transport::messages::result_message::prepared>
    get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);

    future<::shared_ptr<transport::messages::result_message::prepared>>
-    store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, ::shared_ptr<statements::parsed_statement::prepared> prepared, bool for_thrift);
+    store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, ::shared_ptr<statements::prepared_statement> prepared, bool for_thrift);

-    void invalidate_prepared_statement(bytes statement_id);
+    // Erases the statements for which filter returns true.
+    // Both caches are swept: _prepared_statements (keyed by bytes) and
+    // _thrift_prepared_statements (keyed by int32_t).
+    template <typename Pred>
+    void invalidate_prepared_statements(Pred filter) {
+        static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value,
+                      "bad Pred signature");
+        auto sweep = [&filter] (auto& cache) {
+            auto it = cache.begin();
+            while (it != cache.end()) {
+                if (filter(it->second->statement)) {
+                    it = cache.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        };
+        sweep(_prepared_statements);
+        sweep(_thrift_prepared_statements);
+    }

 #if 0
    public ResultMessage processPrepared(CQLStatement statement, QueryState queryState, QueryOptions options)
@@ -475,9 +492,9 @@ public:
    future<::shared_ptr<transport::messages::result_message>> process_batch(::shared_ptr<statements::batch_statement>,
            service::query_state& query_state, query_options& options);

-    ::shared_ptr<statements::parsed_statement::prepared> get_statement(const std::experimental::string_view& query,
+    ::shared_ptr<statements::prepared_statement> get_statement(const std::experimental::string_view& query,
            const service::client_state& client_state);
-    static ::shared_ptr<statements::parsed_statement> parse_statement(const std::experimental::string_view& query);
+    static ::shared_ptr<statements::raw::parsed_statement> parse_statement(const std::experimental::string_view& query);

 #if 0
    private static long measure(Object key)
--- a/cql3/restrictions/multi_column_restriction.hh
+++ b/cql3/restrictions/multi_column_restriction.hh
@@ -394,7 +394,11 @@ public:
            return bounds_range_type::bound(prefix, is_inclusive(b));
        };
        auto range = bounds_range_type(read_bound(statements::bound::START), read_bound(statements::bound::END));
-        return { range };
+        auto bounds = bound_view::from_range(range);
+        if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
+            return { };
+        }
+        return { std::move(range) };
    }
 #if 0
        @Override
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -46,6 +46,8 @@
 #include "cartesian_product.hh"
 #include "cql3/restrictions/primary_key_restrictions.hh"
 #include "cql3/restrictions/single_column_restrictions.hh"
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/adaptor/filtered.hpp>

 namespace cql3 {

@@ -340,7 +342,7 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
        if (!r.is_singular()) {
            throw exceptions::invalid_request_exception("Range queries on partition key values not supported.");
        }
-        ranges.emplace_back(std::move(r).transform<query::ring_position>(
+        ranges.emplace_back(std::move(r).transform(
            [this] (partition_key&& k) -> query::ring_position {
                auto token = dht::global_partitioner().get_token(*_schema, k);
                return { std::move(token), std::move(k) };
@@ -352,7 +354,14 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
 template<>
 std::vector<query::clustering_range>
 single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(const query_options& options) const {
-    auto bounds = compute_bounds(options);
+    auto wrapping_bounds = compute_bounds(options);
+    auto bounds = boost::copy_range<query::clustering_row_ranges>(wrapping_bounds
+            | boost::adaptors::filtered([&](auto&& r) {
+                auto bounds = bound_view::from_range(r);
+                return !bound_view::compare(*_schema)(bounds.second, bounds.first);
+              })
+            | boost::adaptors::transformed([&](auto&& r) { return query::clustering_range(std::move(r));
+    }));
    auto less_cmp = clustering_key_prefix::less_compare(*_schema);
    std::sort(bounds.begin(), bounds.end(), [&] (query::clustering_range& x, query::clustering_range& y) {
        if (!x.start() && !y.start()) {
--- a/cql3/result_set.cc
+++ b/cql3/result_set.cc
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cql3/result_set.hh"
+
+namespace cql3 {
+
+metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
+    : _flags(), names(std::move(names_)), _column_count(0), _paging_state() // no flags set, no paging state
+{ _column_count = names.size(); } // counted on the member: passing std::move(names_) and names_.size() in one call could read size() after the move
+
+metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
+        ::shared_ptr<const service::pager::paging_state> paging_state)
+    : _flags(flags)
+    , names(std::move(names_))
+    , _column_count(column_count) // stored as given, independently of names.size()
+    , _paging_state(std::move(paging_state))
+{ }
+
+// The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
+uint32_t metadata::value_count() const {
+    return _flags.contains<flag::NO_METADATA>() ? _column_count : names.size(); // NO_METADATA set: the stored count; otherwise the number of names
+}
+
+void metadata::add_non_serialized_column(::shared_ptr<column_specification> name) {
+    // See the comment on value_count(). Because _column_count does not account for the
+    // newly added name, it won't be serialized.
+    names.emplace_back(std::move(name));
+}
+
+bool metadata::all_in_same_cf() const {
+    // Always false when NO_METADATA is set. Otherwise the question is forwarded to
+    // column_specification::all_in_same_table(); the short-circuit below keeps that
+    // call from happening in the NO_METADATA case, exactly as an early return would.
+    const bool metadata_skipped = _flags.contains<flag::NO_METADATA>();
+    return !metadata_skipped && column_specification::all_in_same_table(names);
+}
+
+void metadata::set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state) {
+    // A null paging state changes nothing: neither the flags nor the stored state are touched.
+    // Otherwise HAS_MORE_PAGES is raised and the state is kept.
+    if (paging_state) {
+        _flags.set<flag::HAS_MORE_PAGES>();
+        _paging_state = std::move(paging_state);
+    }
+}
+
+void metadata::set_skip_metadata() {
+    _flags.set<flag::NO_METADATA>(); // from now on value_count() reports _column_count rather than names.size()
+}
+
+metadata::flag_enum_set metadata::flags() const {
+    return _flags; // returned by value
+}
+
+uint32_t metadata::column_count() const {
+    return _column_count; // the stored count; add_non_serialized_column() does not change it
+}
+
+::shared_ptr<const service::pager::paging_state> metadata::paging_state() const {
+    return _paging_state; // may be null: only a constructor argument or set_has_more_pages() assigns it
+}
+
+const std::vector<::shared_ptr<column_specification>>& metadata::get_names() const {
+    return names; // reference to the member, including names added by add_non_serialized_column()
+}
+
+prepared_metadata::prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
+                                     const std::vector<uint16_t>& partition_key_bind_indices)
+    : _names{names} // both vectors are copied into the object
+    , _partition_key_bind_indices{partition_key_bind_indices}
+{
+    if (!names.empty() && column_specification::all_in_same_table(_names)) { // all_in_same_table() asserts on an empty list, hence the guard
+        _flags.set<flag::GLOBAL_TABLES_SPEC>(); // every bound name belongs to one keyspace/table pair
+    }
+}
+
+prepared_metadata::flag_enum_set prepared_metadata::flags() const {
+    return _flags; // GLOBAL_TABLES_SPEC is set by the constructor when all names share one table
+}
+
+const std::vector<::shared_ptr<column_specification>>& prepared_metadata::names() const {
+    return _names; // the copy of the specs taken at construction
+}
+
+const std::vector<uint16_t>& prepared_metadata::partition_key_bind_indices() const {
+    return _partition_key_bind_indices; // the copy of the indices taken at construction
+}
+
+result_set::result_set(std::vector<::shared_ptr<column_specification>> metadata_)
+    : _metadata(::make_shared<metadata>(std::move(metadata_))) // wraps the column specs in a newly created metadata object
+{ }
+
+result_set::result_set(::shared_ptr<metadata> metadata)
+    : _metadata(std::move(metadata)) // keeps the metadata object handed in by the caller
+{ }
+
+size_t result_set::size() const {
+    return _rows.size(); // number of rows, not of individual values
+}
+
+bool result_set::empty() const {
+    return _rows.empty(); // true when there are no rows
+}
+
+void result_set::add_row(std::vector<bytes_opt> row) {
+    assert(row.size() == _metadata->value_count()); // a complete row carries exactly value_count() values
+    _rows.emplace_back(std::move(row));
+}
+
+void result_set::add_column_value(bytes_opt value) {
+    if (_rows.empty() || _rows.back().size() == _metadata->value_count()) { // no row yet, or the last row is already full
+        std::vector<bytes_opt> row;
+        row.reserve(_metadata->value_count());
+        _rows.emplace_back(std::move(row)); // open a new, still empty row
+    }
+
+    _rows.back().emplace_back(std::move(value)); // append to the last row
+}
+
+void result_set::reverse() {
+    std::reverse(_rows.begin(), _rows.end()); // reverses the row order in place; values inside a row keep their order
+}
+
+void result_set::trim(size_t limit) {
+    if (_rows.size() > limit) { // only ever shrinks; a shorter result set is left alone
+        _rows.resize(limit); // keeps the first limit rows
+    }
+}
+
+metadata& result_set::get_metadata() {
+    return *_metadata; // mutable access; _metadata is dereferenced without a null check
+}
+
+const metadata& result_set::get_metadata() const {
+    return *_metadata; // read-only access; _metadata is dereferenced without a null check
+}
+
+const std::deque<std::vector<bytes_opt>>& result_set::rows() const {
+    return _rows; // read-only reference to the stored rows
+}
+
+shared_ptr<const cql3::metadata>
+make_empty_metadata() {
+    static thread_local shared_ptr<const metadata> empty_metadata_cache = [] { // initialized once per thread, on first use
+        auto result = ::make_shared<metadata>(std::vector<::shared_ptr<cql3::column_specification>>{}); // no column names
+        result->set_skip_metadata(); // sets NO_METADATA
+        return result;
+    }();
+    return empty_metadata_cache; // every call on a thread returns the same cached object
+}
+
+}
--- a/cql3/result_set.hh
+++ b/cql3/result_set.hh
@@ -50,15 +50,11 @@
 namespace cql3 {

 class metadata {
-#if 0
-        public static final CBCodec<Metadata> codec = new Codec();
-        public static final Metadata EMPTY = new Metadata(EnumSet.of(Flag.NO_METADATA), null, 0, null);
-#endif
 public:
    enum class flag : uint8_t {
-        GLOBAL_TABLES_SPEC,
-        HAS_MORE_PAGES,
-        NO_METADATA
+        GLOBAL_TABLES_SPEC = 0,
+        HAS_MORE_PAGES = 1,
+        NO_METADATA = 2,
    };

    using flag_enum = super_enum<flag,
@@ -83,434 +79,90 @@ private:
    ::shared_ptr<const service::pager::paging_state> _paging_state;

 public:
-    metadata(std::vector<::shared_ptr<column_specification>> names_)
-        : metadata(flag_enum_set(), std::move(names_), names_.size(), {})
-    { }
+    metadata(std::vector<::shared_ptr<column_specification>> names_);

    metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
-            ::shared_ptr<const service::pager::paging_state> paging_state)
-        : _flags(flags)
-        , names(std::move(names_))
-        , _column_count(column_count)
-        , _paging_state(std::move(paging_state))
-    { }
+            ::shared_ptr<const service::pager::paging_state> paging_state);

    // The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
-    uint32_t value_count() {
-        return _flags.contains<flag::NO_METADATA>() ? _column_count : names.size();
-    }
+    uint32_t value_count() const;

-    void add_non_serialized_column(::shared_ptr<column_specification> name) {
-        // See comment above. Because columnCount doesn't account the newly added name, it
-        // won't be serialized.
-        names.emplace_back(std::move(name));
-    }
+    void add_non_serialized_column(::shared_ptr<column_specification> name);

 private:
-    bool all_in_same_cf() const {
-        if (_flags.contains<flag::NO_METADATA>()) {
-            return false;
-        }
-
-        assert(!names.empty());
-
-        auto first = names.front();
-        return std::all_of(std::next(names.begin()), names.end(), [first] (auto&& spec) {
-            return spec->ks_name == first->ks_name && spec->cf_name == first->cf_name;
-        });
-    }
+    bool all_in_same_cf() const;

 public:
-    void set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state) {
-        if (!paging_state) {
-            return;
-        }
+    void set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state);

-        _flags.set<flag::HAS_MORE_PAGES>();
-        _paging_state = std::move(paging_state);
-    }
+    void set_skip_metadata();

-    void set_skip_metadata() {
-        _flags.set<flag::NO_METADATA>();
-    }
+    flag_enum_set flags() const;

-    flag_enum_set flags() const {
-        return _flags;
-    }
+    uint32_t column_count() const;

-    uint32_t column_count() const {
-        return _column_count;
-    }
-
-    auto paging_state() const {
-        return _paging_state;
-    }
-
-    auto const& get_names() const {
-        return names;
-    }
-
-#if 0
-    @Override
-    public String toString()
-    {
-        StringBuilder sb = new StringBuilder();
-
-        if (names == null)
-        {
-            sb.append("[").append(columnCount).append(" columns]");
-        }
-        else
-        {
-            for (ColumnSpecification name : names)
-            {
-                sb.append("[").append(name.name);
-                sb.append("(").append(name.ksName).append(", ").append(name.cfName).append(")");
-                sb.append(", ").append(name.type).append("]");
-            }
-        }
-        if (flags.contains(Flag.HAS_MORE_PAGES))
-            sb.append(" (to be continued)");
-        return sb.toString();
-    }
-
-    private static class Codec implements CBCodec<Metadata>
-    {
-        public Metadata decode(ByteBuf body, int version)
-        {
-            // flags & column count
-            int iflags = body.readInt();
-            int columnCount = body.readInt();
-
-            EnumSet<Flag> flags = Flag.deserialize(iflags);
-
-            PagingState state = null;
-            if (flags.contains(Flag.HAS_MORE_PAGES))
-                state = PagingState.deserialize(CBUtil.readValue(body));
-
-            if (flags.contains(Flag.NO_METADATA))
-                return new Metadata(flags, null, columnCount, state);
-
-            boolean globalTablesSpec = flags.contains(Flag.GLOBAL_TABLES_SPEC);
-
-            String globalKsName = null;
-            String globalCfName = null;
-            if (globalTablesSpec)
-            {
-                globalKsName = CBUtil.readString(body);
-                globalCfName = CBUtil.readString(body);
-            }
-
-            // metadata (names/types)
-            List<ColumnSpecification> names = new ArrayList<ColumnSpecification>(columnCount);
-            for (int i = 0; i < columnCount; i++)
-            {
-                String ksName = globalTablesSpec ? globalKsName : CBUtil.readString(body);
-                String cfName = globalTablesSpec ? globalCfName : CBUtil.readString(body);
-                ColumnIdentifier colName = new ColumnIdentifier(CBUtil.readString(body), true);
-                AbstractType type = DataType.toType(DataType.codec.decodeOne(body, version));
-                names.add(new ColumnSpecification(ksName, cfName, colName, type));
-            }
-            return new Metadata(flags, names, names.size(), state);
-        }
-
-        public void encode(Metadata m, ByteBuf dest, int version)
-        {
-            boolean noMetadata = m.flags.contains(Flag.NO_METADATA);
-            boolean globalTablesSpec = m.flags.contains(Flag.GLOBAL_TABLES_SPEC);
-            boolean hasMorePages = m.flags.contains(Flag.HAS_MORE_PAGES);
-
-            assert version > 1 || (!m.flags.contains(Flag.HAS_MORE_PAGES) && !noMetadata): "version = " + version + ", flags = " + m.flags;
-
-            dest.writeInt(Flag.serialize(m.flags));
-            dest.writeInt(m.columnCount);
-
-            if (hasMorePages)
-                CBUtil.writeValue(m.pagingState.serialize(), dest);
-
-            if (!noMetadata)
-            {
-                if (globalTablesSpec)
-                {
-                    CBUtil.writeString(m.names.get(0).ksName, dest);
-                    CBUtil.writeString(m.names.get(0).cfName, dest);
-                }
-
-                for (int i = 0; i < m.columnCount; i++)
-                {
-                    ColumnSpecification name = m.names.get(i);
-                    if (!globalTablesSpec)
-                    {
-                        CBUtil.writeString(name.ksName, dest);
-                        CBUtil.writeString(name.cfName, dest);
-                    }
-                    CBUtil.writeString(name.name.toString(), dest);
-                    DataType.codec.writeOne(DataType.fromType(name.type, version), dest, version);
-                }
-            }
-        }
-
-        public int encodedSize(Metadata m, int version)
-        {
-            boolean noMetadata = m.flags.contains(Flag.NO_METADATA);
-            boolean globalTablesSpec = m.flags.contains(Flag.GLOBAL_TABLES_SPEC);
-            boolean hasMorePages = m.flags.contains(Flag.HAS_MORE_PAGES);
-
-            int size = 8;
-            if (hasMorePages)
-                size += CBUtil.sizeOfValue(m.pagingState.serialize());
-
-            if (!noMetadata)
-            {
-                if (globalTablesSpec)
-                {
-                    size += CBUtil.sizeOfString(m.names.get(0).ksName);
-                    size += CBUtil.sizeOfString(m.names.get(0).cfName);
-                }
-
-                for (int i = 0; i < m.columnCount; i++)
-                {
-                    ColumnSpecification name = m.names.get(i);
-                    if (!globalTablesSpec)
-                    {
-                        size += CBUtil.sizeOfString(name.ksName);
-                        size += CBUtil.sizeOfString(name.cfName);
-                    }
-                    size += CBUtil.sizeOfString(name.name.toString());
-                    size += DataType.codec.oneSerializedSize(DataType.fromType(name.type, version), version);
-                }
-            }
-            return size;
-        }
-    }
-#endif
+    ::shared_ptr<const service::pager::paging_state> paging_state() const;

+    const std::vector<::shared_ptr<column_specification>>& get_names() const;
 };

-inline ::shared_ptr<cql3::metadata> make_empty_metadata()
-{
-    auto result = ::make_shared<cql3::metadata>(std::vector<::shared_ptr<cql3::column_specification>>{});
-    result->set_skip_metadata();
-    return result;
-}
+::shared_ptr<const cql3::metadata> make_empty_metadata();
+
+class prepared_metadata { // holds a flag set, column specifications and partition-key bind indices behind read-only accessors
+public:
+    enum class flag : uint8_t {
+        GLOBAL_TABLES_SPEC = 0, // the only flag declared for this class
+    };
+
+    using flag_enum = super_enum<flag,
+        flag::GLOBAL_TABLES_SPEC>;
+
+    using flag_enum_set = enum_set<flag_enum>; // the type of _flags
+private:
+    flag_enum_set _flags;
+    std::vector<::shared_ptr<column_specification>> _names;
+    std::vector<uint16_t> _partition_key_bind_indices; // NOTE(review): presumably bind-marker positions of the partition key columns - confirm in the .cc
+public:
+    prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names, // defined out of line; body not visible here
+                      const std::vector<uint16_t>& partition_key_bind_indices);
+
+    flag_enum_set flags() const; // returned by value
+    const std::vector<::shared_ptr<column_specification>>& names() const;
+    const std::vector<uint16_t>& partition_key_bind_indices() const;
+};

 class result_set {
-#if 0
-    private static final ColumnIdentifier COUNT_COLUMN = new ColumnIdentifier("count", false);
-#endif
-
 public:
    ::shared_ptr<metadata> _metadata;
    std::deque<std::vector<bytes_opt>> _rows;
 public:
-    result_set(std::vector<::shared_ptr<column_specification>> metadata_)
-        : _metadata(::make_shared<metadata>(std::move(metadata_)))
-    { }
+    result_set(std::vector<::shared_ptr<column_specification>> metadata_);

-    result_set(::shared_ptr<metadata> metadata)
-        : _metadata(std::move(metadata))
-    { }
+    result_set(::shared_ptr<metadata> metadata);

-    size_t size() const {
-        return _rows.size();
-    }
+    size_t size() const;

-    bool empty() const {
-        return _rows.empty();
-    }
+    bool empty() const;

-    void add_row(std::vector<bytes_opt> row) {
-        assert(row.size() == _metadata->value_count());
-        _rows.emplace_back(std::move(row));
-    }
+    void add_row(std::vector<bytes_opt> row);

-    void add_column_value(bytes_opt value) {
-        if (_rows.empty() || _rows.back().size() == _metadata->value_count()) {
-            std::vector<bytes_opt> row;
-            row.reserve(_metadata->value_count());
-            _rows.emplace_back(std::move(row));
-        }
+    void add_column_value(bytes_opt value);

-        _rows.back().emplace_back(std::move(value));
-    }
+    void reverse();

-    void reverse() {
-        std::reverse(_rows.begin(), _rows.end());
-    }
-
-    void trim(size_t limit) {
-        if (_rows.size() > limit) {
-            _rows.resize(limit);
-        }
-    }
+    void trim(size_t limit);

    template<typename RowComparator>
    void sort(RowComparator&& cmp) {
        std::sort(_rows.begin(), _rows.end(), std::forward<RowComparator>(cmp));
    }

-    metadata& get_metadata() {
-        return *_metadata;
-    }
+    metadata& get_metadata();

-    const metadata& get_metadata() const {
-        return *_metadata;
-    }
+    const metadata& get_metadata() const;

    // Returns a range of rows. A row is a range of bytes_opt.
-    auto const& rows() const {
-        return _rows;
-    }
-#if 0
-    public CqlResult toThriftResult()
-    {
-        assert metadata.names != null;
-
-        String UTF8 = "UTF8Type";
-        CqlMetadata schema = new CqlMetadata(new HashMap<ByteBuffer, String>(),
-                new HashMap<ByteBuffer, String>(),
-                // The 2 following ones shouldn't be needed in CQL3
-                UTF8, UTF8);
-
-        for (int i = 0; i < metadata.columnCount; i++)
-        {
-            ColumnSpecification spec = metadata.names.get(i);
-            ByteBuffer colName = ByteBufferUtil.bytes(spec.name.toString());
-            schema.name_types.put(colName, UTF8);
-            AbstractType<?> normalizedType = spec.type instanceof ReversedType ? ((ReversedType)spec.type).baseType : spec.type;
-            schema.value_types.put(colName, normalizedType.toString());
-
-        }
-
-        List<CqlRow> cqlRows = new ArrayList<CqlRow>(rows.size());
-        for (List<ByteBuffer> row : rows)
-        {
-            List<Column> thriftCols = new ArrayList<Column>(metadata.columnCount);
-            for (int i = 0; i < metadata.columnCount; i++)
-            {
-                Column col = new Column(ByteBufferUtil.bytes(metadata.names.get(i).name.toString()));
-                col.setValue(row.get(i));
-                thriftCols.add(col);
-            }
-            // The key of CqlRow shoudn't be needed in CQL3
-            cqlRows.add(new CqlRow(ByteBufferUtil.EMPTY_BYTE_BUFFER, thriftCols));
-        }
-        CqlResult res = new CqlResult(CqlResultType.ROWS);
-        res.setRows(cqlRows).setSchema(schema);
-        return res;
-    }
-
-    @Override
-    public String toString()
-    {
-        try
-        {
-            StringBuilder sb = new StringBuilder();
-            sb.append(metadata).append('\n');
-            for (List<ByteBuffer> row : rows)
-            {
-                for (int i = 0; i < row.size(); i++)
-                {
-                    ByteBuffer v = row.get(i);
-                    if (v == null)
-                    {
-                        sb.append(" | null");
-                    }
-                    else
-                    {
-                        sb.append(" | ");
-                        if (metadata.flags.contains(Flag.NO_METADATA))
-                            sb.append("0x").append(ByteBufferUtil.bytesToHex(v));
-                        else
-                            sb.append(metadata.names.get(i).type.getString(v));
-                    }
-                }
-                sb.append('\n');
-            }
-            sb.append("---");
-            return sb.toString();
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public static class Codec implements CBCodec<ResultSet>
-    {
-        /*
-         * Format:
-         *   - metadata
-         *   - rows count (4 bytes)
-         *   - rows
-         */
-        public ResultSet decode(ByteBuf body, int version)
-        {
-            Metadata m = Metadata.codec.decode(body, version);
-            int rowCount = body.readInt();
-            ResultSet rs = new ResultSet(m, new ArrayList<List<ByteBuffer>>(rowCount));
-
-            // rows
-            int totalValues = rowCount * m.columnCount;
-            for (int i = 0; i < totalValues; i++)
-                rs.addColumnValue(CBUtil.readValue(body));
-
-            return rs;
-        }
-
-        public void encode(ResultSet rs, ByteBuf dest, int version)
-        {
-            Metadata.codec.encode(rs.metadata, dest, version);
-            dest.writeInt(rs.rows.size());
-            for (List<ByteBuffer> row : rs.rows)
-            {
-                // Note that we do only want to serialize only the first columnCount values, even if the row
-                // as more: see comment on Metadata.names field.
-                for (int i = 0; i < rs.metadata.columnCount; i++)
-                    CBUtil.writeValue(row.get(i), dest);
-            }
-        }
-
-        public int encodedSize(ResultSet rs, int version)
-        {
-            int size = Metadata.codec.encodedSize(rs.metadata, version) + 4;
-            for (List<ByteBuffer> row : rs.rows)
-            {
-                for (int i = 0; i < rs.metadata.columnCount; i++)
-                    size += CBUtil.sizeOfValue(row.get(i));
-            }
-            return size;
-        }
-    }
-
-    public static enum Flag
-    {
-        // The order of that enum matters!!
-        GLOBAL_TABLES_SPEC,
-        HAS_MORE_PAGES,
-        NO_METADATA;
-
-        public static EnumSet<Flag> deserialize(int flags)
-        {
-            EnumSet<Flag> set = EnumSet.noneOf(Flag.class);
-            Flag[] values = Flag.values();
-            for (int n = 0; n < values.length; n++)
-            {
-                if ((flags & (1 << n)) != 0)
-                    set.add(values[n]);
-            }
-            return set;
-        }
-
-        public static int serialize(EnumSet<Flag> flags)
-        {
-            int i = 0;
-            for (Flag flag : flags)
-                i |= 1 << flag.ordinal();
-            return i;
-        }
-    }
-#endif
+    const std::deque<std::vector<bytes_opt>>& rows() const;
 };

 }
--- a/cql3/selection/selection.cc
+++ b/cql3/selection/selection.cc
@@ -232,7 +232,7 @@ uint32_t selection::add_column_for_ordering(const column_definition& c) {
            raw_selector::to_selectables(raw_selectors, schema), db, schema, defs);

    auto metadata = collect_metadata(schema, raw_selectors, *factories);
-    if (processes_selection(raw_selectors)) {
+    if (processes_selection(raw_selectors) || raw_selectors.size() != defs.size()) {
        return ::make_shared<selection_with_processing>(schema, std::move(defs), std::move(metadata), std::move(factories));
    } else {
        return ::make_shared<simple_selection>(schema, std::move(defs), std::move(metadata), false);
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -161,7 +161,7 @@ public:
        return std::find(_columns.begin(), _columns.end(), &def) != _columns.end();
    }

-    ::shared_ptr<metadata> get_result_metadata() const {
+    ::shared_ptr<const metadata> get_result_metadata() const {
        return _metadata;
    }

--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "alter_keyspace_statement.hh"
+#include "service/migration_manager.hh"
+#include "db/system_keyspace.hh"
+#include "database.hh"
+
+cql3::statements::alter_keyspace_statement::alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs)
+    : _name(std::move(name)) // `name` is a by-value sink parameter: move it rather than copying the string a second time
+    , _attrs(std::move(attrs))
+{}
+
+const sstring& cql3::statements::alter_keyspace_statement::keyspace() const {
+    return _name; // the keyspace name this statement was constructed with
+}
+
+future<> cql3::statements::alter_keyspace_statement::check_access(const service::client_state& state) {
+    return state.has_keyspace_access(_name, auth::permission::ALTER); // delegates the ALTER permission check on this keyspace to client_state
+}
+
+// Rejects the statement if the keyspace is unknown, is the system keyspace, or the new attributes are invalid.
+void cql3::statements::alter_keyspace_statement::validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) {
+    try {
+        service::get_local_storage_proxy().get_db().local().find_keyspace(_name); // throws on failure
+        auto tmp = _name;
+        // tolower() has undefined behaviour for negative char values, so feed it unsigned char.
+        std::transform(tmp.begin(), tmp.end(), tmp.begin(), [] (unsigned char c) { return ::tolower(c); });
+        if (tmp == db::system_keyspace::NAME) {
+            throw exceptions::invalid_request_exception("Cannot alter system keyspace");
+        }
+
+        _attrs->validate();
+
+        if (!bool(_attrs->get_replication_strategy_class()) && !_attrs->get_replication_options().empty()) {
+            throw exceptions::configuration_exception("Missing replication strategy class");
+        }
+#if 0
+        // The strategy is validated through KSMetaData.validate() in announceKeyspaceUpdate below.
+        // However, for backward compatibility with thrift, this doesn't validate unexpected options yet,
+        // so doing proper validation here.
+        AbstractReplicationStrategy.validateReplicationStrategy(name,
+                                                                AbstractReplicationStrategy.getClass(attrs.getReplicationStrategyClass()),
+                                                                StorageService.instance.getTokenMetadata(),
+                                                                DatabaseDescriptor.getEndpointSnitch(),
+                                                                attrs.getReplicationOptions());
+#endif
+    } catch (no_such_keyspace& e) {
+        std::throw_with_nested(exceptions::invalid_request_exception("Unknown keyspace " + _name));
+    }
+}
+
+future<bool> cql3::statements::alter_keyspace_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) {
+    // Derive the updated definition from the keyspace's current metadata, then announce it.
+    auto current = service::get_local_storage_proxy().get_db().local().find_keyspace(_name).metadata();
+    auto updated = _attrs->as_ks_metadata_update(current);
+    return service::get_local_migration_manager().announce_keyspace_update(std::move(updated), is_local_only).then([] { return true; });
+}
+
+shared_ptr<transport::event::schema_change> cql3::statements::alter_keyspace_statement::change_event() {
+    // Altering a keyspace is reported as an UPDATED schema change that names this keyspace.
+    using schema_change = transport::event::schema_change;
+    return make_shared<schema_change>(schema_change::change_type::UPDATED, keyspace());
+}
--- a/cql3/statements/alter_keyspace_statement.hh
+++ b/cql3/statements/alter_keyspace_statement.hh
@@ -41,80 +41,29 @@

 #pragma once

+#include <memory>
+
 #include "cql3/statements/schema_altering_statement.hh"
 #include "cql3/statements/ks_prop_defs.hh"

-#include <memory>
-
 namespace cql3 {

 namespace statements {

 class alter_keyspace_statement : public schema_altering_statement {
    sstring _name;
-    std::unique_ptr<ks_prop_defs> _attrs;
+    ::shared_ptr<ks_prop_defs> _attrs;

 public:
-    alter_keyspace_statement(sstring name, std::unique_ptr<ks_prop_defs>&& attrs)
-        : _name{name}
-        , _attrs{std::move(attrs)}
-    { }
+    alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs);

-    virtual const sstring& keyspace() const override {
-        return _name;
-    }
+    const sstring& keyspace() const override;

-#if 0
-    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
-    {
-        state.hasKeyspaceAccess(name, Permission.ALTER);
-    }
-
-    public void validate(ClientState state) throws RequestValidationException
-    {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name);
-        if (ksm == null)
-            throw new InvalidRequestException("Unknown keyspace " + name);
-        if (ksm.name.equalsIgnoreCase(SystemKeyspace.NAME))
-            throw new InvalidRequestException("Cannot alter system keyspace");
-
-        attrs.validate();
-
-        if (attrs.getReplicationStrategyClass() == null && !attrs.getReplicationOptions().isEmpty())
-        {
-            throw new ConfigurationException("Missing replication strategy class");
-        }
-        else if (attrs.getReplicationStrategyClass() != null)
-        {
-            // The strategy is validated through KSMetaData.validate() in announceKeyspaceUpdate below.
-            // However, for backward compatibility with thrift, this doesn't validate unexpected options yet,
-            // so doing proper validation here.
-            AbstractReplicationStrategy.validateReplicationStrategy(name,
-                                                                    AbstractReplicationStrategy.getClass(attrs.getReplicationStrategyClass()),
-                                                                    StorageService.instance.getTokenMetadata(),
-                                                                    DatabaseDescriptor.getEndpointSnitch(),
-                                                                    attrs.getReplicationOptions());
-        }
-    }
-
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
-    {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name);
-        // In the (very) unlikely case the keyspace was dropped since validate()
-        if (ksm == null)
-            throw new InvalidRequestException("Unknown keyspace " + name);
-
-        MigrationManager.announceKeyspaceUpdate(attrs.asKSMetadataUpdate(ksm), isLocalOnly);
-        return true;
-    }
-
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, keyspace());
-    }
-#endif
+    future<> check_access(const service::client_state& state) override;
+    void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
+    future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
+    shared_ptr<transport::event::schema_change> change_event() override;
 };

 }
-
 }
--- a/cql3/statements/authentication_statement.cc
+++ b/cql3/statements/authentication_statement.cc
@@ -46,9 +46,9 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() {
    return 0;
 }

-::shared_ptr<cql3::statements::parsed_statement::prepared> cql3::statements::authentication_statement::prepare(
+::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authentication_statement::prepare(
                database& db) {
-    return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
+    return ::make_shared<prepared>(this->shared_from_this());
 }

 bool cql3::statements::authentication_statement::uses_function(
--- a/cql3/statements/authentication_statement.hh
+++ b/cql3/statements/authentication_statement.hh
@@ -41,15 +41,16 @@

 #pragma once

-#include "parsed_statement.hh"
 #include "cql3/cql_statement.hh"
+#include "prepared_statement.hh"
+#include "raw/parsed_statement.hh"
 #include "transport/messages_fwd.hh"

 namespace cql3 {

 namespace statements {

-class authentication_statement : public parsed_statement, public cql_statement, public ::enable_shared_from_this<authentication_statement> {
+class authentication_statement : public raw::parsed_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<authentication_statement> {
 public:
    uint32_t get_bound_terms() override;

--- a/cql3/statements/authorization_statement.cc
+++ b/cql3/statements/authorization_statement.cc
@@ -46,7 +46,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() {
    return 0;
 }

-::shared_ptr<cql3::statements::parsed_statement::prepared> cql3::statements::authorization_statement::prepare(
+::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authorization_statement::prepare(
                database& db) {
    return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
 }
--- a/cql3/statements/authorization_statement.hh
+++ b/cql3/statements/authorization_statement.hh
@@ -41,8 +41,9 @@

 #pragma once

-#include "parsed_statement.hh"
 #include "cql3/cql_statement.hh"
+#include "prepared_statement.hh"
+#include "raw/parsed_statement.hh"
 #include "transport/messages_fwd.hh"

 namespace auth {
@@ -53,7 +54,7 @@ namespace cql3 {

 namespace statements {

-class authorization_statement : public parsed_statement, public cql_statement, public ::enable_shared_from_this<authorization_statement> {
+class authorization_statement : public raw::parsed_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<authorization_statement> {
 public:
    uint32_t get_bound_terms() override;

--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -38,6 +38,7 @@
 */

 #include "batch_statement.hh"
+#include "raw/batch_statement.hh"
 #include "db/config.hh"

 namespace cql3 {
@@ -68,7 +69,7 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
        void accept_static_cell(column_id, collection_mutation_view v) override {
            size += v.data.size();
        }
-        void accept_row_tombstone(clustering_key_prefix_view, tombstone) override {}
+        void accept_row_tombstone(const range_tombstone&) override {}
        void accept_row(clustering_key_view, tombstone, const row_marker&) override {}
        void accept_row_cell(column_id, atomic_cell_view v) override {
            size += v.value().size();
@@ -100,6 +101,30 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
    }
 }

+namespace raw {
+
+shared_ptr<prepared_statement>
+batch_statement::prepare(database& db) { // builds a cql3::statements::batch_statement from the parsed statements and wraps it in `prepared`
+    auto&& bound_names = get_bound_variables();
+    // Every child statement is prepared, in order, against the same bound_names object.
+    std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
+    for (auto&& parsed : _parsed_statements) {
+        statements.push_back(parsed->prepare(db, bound_names));
+    }
+    // The batch-level attributes are prepared after the children; "[batch]" is passed for both name arguments.
+    auto&& prep_attrs = _attrs->prepare(db, "[batch]", "[batch]");
+    prep_attrs->collect_marker_specification(bound_names); // NOTE(review): presumably registers the attributes' bind markers in bound_names - confirm
+    // Assemble the batch from the prepared pieces and validate it before it is moved into a shared pointer.
+    cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs));
+    batch_statement_.validate();
+    // The result pairs the batch with the specifications held by bound_names.
+    return ::make_shared<prepared>(make_shared(std::move(batch_statement_)),
+                                                     bound_names->get_specifications());
+}
+
+}
+
+
 }

 }
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -39,6 +39,8 @@

 #include "cql3/cql_statement.hh"
 #include "modification_statement.hh"
+#include "raw/modification_statement.hh"
+#include "raw/batch_statement.hh"
 #include "service/storage_proxy.hh"
 #include "transport/messages/result_message.hh"
 #include "timestamp.hh"
@@ -60,12 +62,10 @@ namespace statements {
 * A <code>BATCH</code> statement parsed from a CQL query.
 *
 */
-class batch_statement : public cql_statement {
+class batch_statement : public cql_statement_no_metadata {
    static logging::logger _logger;
 public:
-    enum class type {
-        LOGGED, UNLOGGED, COUNTER
-    };
+    using type = raw::batch_statement::type;
 private:
    int _bound_terms;
 public:
@@ -168,16 +168,16 @@ public:
        return _statements;
    }
 private:
-    future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now) {
+    future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
        // Do not process in parallel because operations like list append/prepend depend on execution order.
-        return do_with(std::vector<mutation>(), [this, &storage, &options, now, local] (auto&& result) {
+        return do_with(std::vector<mutation>(), [this, &storage, &options, now, local, trace_state] (auto&& result) {
            return do_for_each(boost::make_counting_iterator<size_t>(0),
                               boost::make_counting_iterator<size_t>(_statements.size()),
-                               [this, &storage, &options, now, local, &result] (size_t i) {
+                               [this, &storage, &options, now, local, &result, trace_state] (size_t i) {
                auto&& statement = _statements[i];
                auto&& statement_options = options.for_statement(i);
                auto timestamp = _attrs->get_timestamp(now, statement_options);
-                return statement->get_mutations(storage, statement_options, local, timestamp).then([&result] (auto&& more) {
+                return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
                    std::move(more.begin(), more.end(), std::back_inserter(result));
                });
            }).then([&result] {
@@ -213,8 +213,8 @@ private:
            return execute_with_conditions(storage, options, query_state);
        }

-        return get_mutations(storage, options, local, now).then([this, &storage, &options] (std::vector<mutation> ms) {
-            return execute_without_conditions(storage, std::move(ms), options.get_consistency());
+        return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
+            return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
        }).then([] {
            return make_ready_future<shared_ptr<transport::messages::result_message>>(
                    make_shared<transport::messages::result_message::void_message>());
@@ -224,7 +224,8 @@ private:
    future<> execute_without_conditions(
            distributed<service::storage_proxy>& storage,
            std::vector<mutation> mutations,
-            db::consistency_level cl) {
+            db::consistency_level cl,
+            tracing::trace_state_ptr tr_state) {
        // FIXME: do we need to do this?
 #if 0
        // Extract each collection of cfs from it's IMutation and then lazily concatenate all of them into a single Iterable.
@@ -239,7 +240,7 @@ private:
        verify_batch_size(mutations);

        bool mutate_atomic = _type == type::LOGGED && mutations.size() > 1;
-        return storage.local().mutate_with_triggers(std::move(mutations), cl, mutate_atomic);
+        return storage.local().mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
    }

    future<shared_ptr<transport::messages::result_message>> execute_with_conditions(
@@ -317,46 +318,6 @@ public:
        return sprint("BatchStatement(type=%s, statements=%s)", _type, join(", ", _statements));
    }
 #endif
-
-    class parsed : public cf_statement {
-        type _type;
-        shared_ptr<attributes::raw> _attrs;
-        std::vector<shared_ptr<modification_statement::parsed>> _parsed_statements;
-    public:
-        parsed(
-            type type_,
-            shared_ptr<attributes::raw> attrs,
-            std::vector<shared_ptr<modification_statement::parsed>> parsed_statements)
-                : cf_statement(nullptr)
-                , _type(type_)
-                , _attrs(std::move(attrs))
-                , _parsed_statements(std::move(parsed_statements)) {
-        }
-
-        virtual void prepare_keyspace(const service::client_state& state) override {
-            for (auto&& s : _parsed_statements) {
-                s->prepare_keyspace(state);
-            }
-        }
-
-        virtual shared_ptr<parsed_statement::prepared> prepare(database& db) override {
-            auto&& bound_names = get_bound_variables();
-
-            std::vector<shared_ptr<modification_statement>> statements;
-            for (auto&& parsed : _parsed_statements) {
-                statements.push_back(parsed->prepare(db, bound_names));
-            }
-
-            auto&& prep_attrs = _attrs->prepare(db, "[batch]", "[batch]");
-            prep_attrs->collect_marker_specification(bound_names);
-
-            batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs));
-            batch_statement_.validate();
-
-            return ::make_shared<parsed_statement::prepared>(make_shared(std::move(batch_statement_)),
-                                                             bound_names->get_specifications());
-        }
-    };
 };

 }
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -92,9 +92,9 @@ void cf_prop_defs::validate() {
            throw exceptions::configuration_exception(sstring("Missing sub-option '") + COMPACTION_STRATEGY_CLASS_KEY + "' for the '" + KW_COMPACTION + "' option.");
        }
        _compaction_strategy_class = sstables::compaction_strategy::type(strategy->second);
-#if 0
-       compactionOptions.remove(COMPACTION_STRATEGY_CLASS_KEY);
+        remove_from_map_if_exists(KW_COMPACTION, COMPACTION_STRATEGY_CLASS_KEY);

+#if 0
       CFMetaData.validateCompactionOptions(compactionStrategyClass, compactionOptions);
 #endif
    }
--- a/cql3/statements/cf_statement.cc
+++ b/cql3/statements/cf_statement.cc
@@ -39,12 +39,15 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "cql3/statements/cf_statement.hh"
+#include "raw/cf_statement.hh"
+#include "service/client_state.hh"

 namespace cql3 {

 namespace statements {

+namespace raw {
+
 cf_statement::cf_statement(::shared_ptr<cf_name> cf_name)
    : _cf_name(std::move(cf_name))
 {
@@ -81,3 +84,5 @@ const sstring& cf_statement::column_family() const
 }

 }
+
+}
--- a/cql3/statements/create_index_statement.hh
+++ b/cql3/statements/create_index_statement.hh
@@ -44,7 +44,7 @@
 #include "schema_altering_statement.hh"
 #include "index_prop_defs.hh"
 #include "index_target.hh"
-#include "cf_statement.hh"
+#include "raw/cf_statement.hh"

 #include "cql3/index_name.hh"
 #include "cql3/cql3_type.hh"
--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -47,6 +47,7 @@
 #include <boost/range/algorithm/adjacent_find.hpp>

 #include "cql3/statements/create_table_statement.hh"
+#include "cql3/statements/prepared_statement.hh"

 #include "schema_builder.hh"

@@ -160,7 +161,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
    , _if_not_exists{if_not_exists}
 { }

-::shared_ptr<parsed_statement::prepared> create_table_statement::raw_statement::prepare(database& db) {
+::shared_ptr<prepared_statement> create_table_statement::raw_statement::prepare(database& db) {
    // Column family name
    const sstring& cf_name = _cf_name->get_column_family();
    std::regex name_regex("\\w+");
@@ -347,7 +348,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
        }
    }

-    return ::make_shared<parsed_statement::prepared>(stmt);
+    return ::make_shared<prepared>(stmt);
 }

 data_type create_table_statement::raw_statement::get_type_and_remove(column_map_type& columns, ::shared_ptr<column_identifier> t)
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -43,7 +43,7 @@

 #include "cql3/statements/schema_altering_statement.hh"
 #include "cql3/statements/cf_prop_defs.hh"
-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/cql3_type.hh"

 #include "service/migration_manager.hh"
@@ -116,7 +116,7 @@ private:
    void add_column_metadata_from_aliases(schema_builder& builder, std::vector<bytes> aliases, const std::vector<data_type>& types, column_kind kind);
 };

-class create_table_statement::raw_statement : public cf_statement {
+class create_table_statement::raw_statement : public raw::cf_statement {
 private:
    using defs_type = std::unordered_map<::shared_ptr<column_identifier>,
                                         ::shared_ptr<cql3_type::raw>,
--- a/cql3/statements/delete_statement.cc
+++ b/cql3/statements/delete_statement.cc
@@ -40,6 +40,7 @@
 */

 #include "delete_statement.hh"
+#include "raw/delete_statement.hh"

 namespace cql3 {

@@ -76,11 +77,13 @@ void delete_statement::add_update_for_key(mutation& m, const exploded_clustering
    }
 }

-::shared_ptr<modification_statement>
-delete_statement::parsed::prepare_internal(database& db, schema_ptr schema, ::shared_ptr<variable_specifications> bound_names,
-        std::unique_ptr<attributes> attrs) {
+namespace raw {

-    auto stmt = ::make_shared<delete_statement>(statement_type::DELETE, bound_names->size(), schema, std::move(attrs));
+::shared_ptr<cql3::statements::modification_statement>
+delete_statement::prepare_internal(database& db, schema_ptr schema, ::shared_ptr<variable_specifications> bound_names,
+        std::unique_ptr<attributes> attrs) {
+    using statement_type = cql3::statements::modification_statement::statement_type;
+    auto stmt = ::make_shared<cql3::statements::delete_statement>(statement_type::DELETE, bound_names->size(), schema, std::move(attrs));

    for (auto&& deletion : _deletions) {
        auto&& id = deletion->affected_column()->prepare_column_identifier(schema);
@@ -104,13 +107,13 @@ delete_statement::parsed::prepare_internal(database& db, schema_ptr schema, ::sh
    return stmt;
 }

-delete_statement::parsed::parsed(::shared_ptr<cf_name> name,
+delete_statement::delete_statement(::shared_ptr<cf_name> name,
                                 ::shared_ptr<attributes::raw> attrs,
                                 std::vector<::shared_ptr<operation::raw_deletion>> deletions,
                                 std::vector<::shared_ptr<relation>> where_clause,
                                 conditions_vector conditions,
                                 bool if_exists)
-    : modification_statement::parsed(std::move(name), std::move(attrs), std::move(conditions), false, if_exists)
+    : raw::modification_statement(std::move(name), std::move(attrs), std::move(conditions), false, if_exists)
    , _deletions(std::move(deletions))
    , _where_clause(std::move(where_clause))
 { }
@@ -118,3 +121,5 @@ delete_statement::parsed::parsed(::shared_ptr<cf_name> name,
 }

 }
+
+}
--- a/cql3/statements/delete_statement.hh
+++ b/cql3/statements/delete_statement.hh
@@ -42,6 +42,7 @@
 #pragma once

 #include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
 #include "cql3/attributes.hh"
 #include "cql3/operation.hh"
 #include "database_fwd.hh"
@@ -79,22 +80,6 @@ public:

    }
 #endif
-
-    class parsed : public modification_statement::parsed {
-    private:
-        std::vector<::shared_ptr<operation::raw_deletion>> _deletions;
-        std::vector<::shared_ptr<relation>> _where_clause;
-    public:
-        parsed(::shared_ptr<cf_name> name,
-               ::shared_ptr<attributes::raw> attrs,
-               std::vector<::shared_ptr<operation::raw_deletion>> deletions,
-               std::vector<::shared_ptr<relation>> where_clause,
-               conditions_vector conditions,
-               bool if_exists);
-    protected:
-        virtual ::shared_ptr<modification_statement> prepare_internal(database& db, schema_ptr schema,
-            ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs);
-    };
 };

 }
--- a/cql3/statements/ks_prop_defs.cc
+++ b/cql3/statements/ks_prop_defs.cc
@@ -79,6 +79,18 @@ lw_shared_ptr<keyspace_metadata> ks_prop_defs::as_ks_metadata(sstring ks_name) {
    return keyspace_metadata::new_keyspace(ks_name, get_replication_strategy_class().value(), options, get_boolean(KW_DURABLE_WRITES, true));
 }

+// Builds the metadata for an ALTER KEYSPACE: every property the statement omits is inherited from `old`.
+lw_shared_ptr<keyspace_metadata> ks_prop_defs::as_ks_metadata_update(lw_shared_ptr<keyspace_metadata> old) {
+    auto options = get_replication_options();
+    options.erase(REPLICATION_STRATEGY_CLASS_KEY); // the class is passed separately, not as a strategy option
+    auto sc = get_replication_strategy_class();
+    if (!sc) {
+        sc = old->strategy_name();
+        options = old->strategy_options();
+    }
+    return keyspace_metadata::new_keyspace(old->name(), *sc, options, get_boolean(KW_DURABLE_WRITES, old->durable_writes()));
+}
+
 }

 }
--- a/cql3/statements/ks_prop_defs.hh
+++ b/cql3/statements/ks_prop_defs.hh
@@ -66,6 +66,7 @@ public:
    std::map<sstring, sstring> get_replication_options() const;
    std::experimental::optional<sstring> get_replication_strategy_class() const;
    lw_shared_ptr<keyspace_metadata> as_ks_metadata(sstring ks_name);
+    lw_shared_ptr<keyspace_metadata> as_ks_metadata_update(lw_shared_ptr<keyspace_metadata> old);

 #if 0
    public KSMetaData asKSMetadataUpdate(KSMetaData old) throws RequestValidationException
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -40,6 +40,8 @@
 */

 #include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
+#include "cql3/statements/prepared_statement.hh"
 #include "cql3/restrictions/single_column_restriction.hh"
 #include "cql3/single_column_relation.hh"
 #include "validation.hh"
@@ -111,11 +113,11 @@ uint32_t modification_statement::get_bound_terms() {
    return _bound_terms;
 }

-sstring modification_statement::keyspace() const {
+const sstring& modification_statement::keyspace() const {
    return s->ks_name();
 }

-sstring modification_statement::column_family() const {
+const sstring& modification_statement::column_family() const {
    return s->cf_name();
 }

@@ -146,10 +148,10 @@ future<> modification_statement::check_access(const service::client_state& state
 }

 future<std::vector<mutation>>
-modification_statement::get_mutations(distributed<service::storage_proxy>& proxy, const query_options& options, bool local, int64_t now) {
+modification_statement::get_mutations(distributed<service::storage_proxy>& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
    auto keys = make_lw_shared(build_partition_keys(options));
    auto prefix = make_lw_shared(create_exploded_clustering_prefix(options));
-    return make_update_parameters(proxy, keys, prefix, options, local, now).then(
+    return make_update_parameters(proxy, keys, prefix, options, local, now, std::move(trace_state)).then(
            [this, keys, prefix, now] (auto params_ptr) {
                std::vector<mutation> mutations;
                mutations.reserve(keys->size());
@@ -169,8 +171,9 @@ modification_statement::make_update_parameters(
        lw_shared_ptr<exploded_clustering_prefix> prefix,
        const query_options& options,
        bool local,
-        int64_t now) {
-    return read_required_rows(proxy, std::move(keys), std::move(prefix), local, options.get_consistency()).then(
+        int64_t now,
+        tracing::trace_state_ptr trace_state) {
+    return read_required_rows(proxy, std::move(keys), std::move(prefix), local, options.get_consistency(), std::move(trace_state)).then(
            [this, &options, now] (auto rows) {
                return make_ready_future<std::unique_ptr<update_parameters>>(
                        std::make_unique<update_parameters>(s, options,
@@ -253,7 +256,8 @@ modification_statement::read_required_rows(
        lw_shared_ptr<std::vector<partition_key>> keys,
        lw_shared_ptr<exploded_clustering_prefix> prefix,
        bool local,
-        db::consistency_level cl) {
+        db::consistency_level cl,
+        tracing::trace_state_ptr trace_state) {
    if (!requires_read()) {
        return make_ready_future<update_parameters::prefetched_rows_type>(
                update_parameters::prefetched_rows_type{});
@@ -289,7 +293,7 @@ modification_statement::read_required_rows(
    }
    query::read_command cmd(s->id(), s->version(), ps, std::numeric_limits<uint32_t>::max());
    // FIXME: ignoring "local"
-    return proxy.local().query(s, make_lw_shared(std::move(cmd)), std::move(pr), cl).then([this, ps] (auto result) {
+    return proxy.local().query(s, make_lw_shared(std::move(cmd)), std::move(pr), cl, std::move(trace_state)).then([this, ps] (auto result) {
        return query::result_view::do_with(*result, [&] (query::result_view v) {
            auto prefetched_rows = update_parameters::prefetched_rows_type({update_parameters::prefetch_data(s)});
            v.consume(ps, prefetch_data_builder(s, prefetched_rows.value(), ps));
@@ -462,11 +466,12 @@ modification_statement::execute_without_condition(distributed<service::storage_p
        db::validate_for_write(s->ks_name(), cl);
    }

-    return get_mutations(proxy, options, false, options.get_timestamp(qs)).then([cl, &proxy] (auto mutations) {
+    return get_mutations(proxy, options, false, options.get_timestamp(qs), qs.get_trace_state()).then([cl, &proxy, &qs] (auto mutations) {
        if (mutations.empty()) {
            return now();
        }
-        return proxy.local().mutate_with_triggers(std::move(mutations), cl, false);
+
+        return proxy.local().mutate_with_triggers(std::move(mutations), cl, false, qs.get_trace_state());
    });
 }

@@ -503,7 +508,7 @@ modification_statement::execute_internal(distributed<service::storage_proxy>& pr
    if (has_conditions()) {
        throw exceptions::unsupported_operation_exception();
    }
-    return get_mutations(proxy, options, true, options.get_timestamp(qs)).then(
+    return get_mutations(proxy, options, true, options.get_timestamp(qs), qs.get_trace_state()).then(
            [&proxy] (auto mutations) {
                return proxy.local().mutate_locally(std::move(mutations));
            }).then(
@@ -560,21 +565,23 @@ modification_statement::process_where_clause(database& db, std::vector<relation_
    }
 }

-::shared_ptr<parsed_statement::prepared>
-modification_statement::parsed::prepare(database& db) {
+namespace raw {
+// Top-level prepare: builds the statement and wraps it together with its bound-variable specifications.
+::shared_ptr<prepared_statement>
+modification_statement::prepare(database& db) {
    auto bound_names = get_bound_variables();
    auto statement = prepare(db, bound_names);
-    return ::make_shared<parsed_statement::prepared>(std::move(statement), *bound_names);
+    return ::make_shared<prepared>(std::move(statement), *bound_names);
 }

-::shared_ptr<modification_statement>
-modification_statement::parsed::prepare(database& db, ::shared_ptr<variable_specifications> bound_names) {
+::shared_ptr<cql3::statements::modification_statement>
+modification_statement::prepare(database& db, ::shared_ptr<variable_specifications> bound_names) {
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());

    auto prepared_attributes = _attrs->prepare(db, keyspace(), column_family());
    prepared_attributes->collect_marker_specification(bound_names);

-    ::shared_ptr<modification_statement> stmt = prepare_internal(db, schema, bound_names, std::move(prepared_attributes));
+    ::shared_ptr<cql3::statements::modification_statement> stmt = prepare_internal(db, schema, bound_names, std::move(prepared_attributes));

    if (_if_not_exists || _if_exists || !_conditions.empty()) {
        if (stmt->is_counter()) {
@@ -616,6 +623,8 @@ modification_statement::parsed::prepare(database& db, ::shared_ptr<variable_spec
    return stmt;
 }

+}
+
 void
 modification_statement::validate(distributed<service::storage_proxy>&, const service::client_state& state) {
    if (has_conditions() && attrs->is_timestamp_set()) {
@@ -688,7 +697,9 @@ void modification_statement::validate_where_clause_for_conditions() {
    //  no-op by default
 }

-modification_statement::parsed::parsed(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, conditions_vector conditions, bool if_not_exists, bool if_exists)
+namespace raw {
+
+modification_statement::modification_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, conditions_vector conditions, bool if_not_exists, bool if_exists)
    : cf_statement{std::move(name)}
    , _attrs{std::move(attrs)}
    , _conditions{std::move(conditions)}
@@ -699,3 +710,5 @@ modification_statement::parsed::parsed(::shared_ptr<cf_name> name, ::shared_ptr<
 }

 }
+
+}
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -42,7 +42,7 @@
 #pragma once

 #include "cql3/restrictions/restriction.hh"
-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/column_identifier.hh"
 #include "cql3/update_parameters.hh"
 #include "cql3/column_condition.hh"
@@ -66,10 +66,13 @@ namespace cql3 {

 namespace statements {

+
+namespace raw { class modification_statement; }
+
 /*
 * Abstract parent class of individual modifications, i.e. INSERT, UPDATE and DELETE.
 */
-class modification_statement : public cql_statement {
+class modification_statement : public cql_statement_no_metadata {
 private:
    static thread_local const ::shared_ptr<column_identifier> CAS_RESULT_COLUMN;

@@ -117,9 +120,9 @@ public:

    virtual uint32_t get_bound_terms() override;

-    virtual sstring keyspace() const;
+    virtual const sstring& keyspace() const;

-    virtual sstring column_family() const;
+    virtual const sstring& column_family() const;

    virtual bool is_counter() const;

@@ -184,7 +187,8 @@ protected:
                lw_shared_ptr<std::vector<partition_key>> keys,
                lw_shared_ptr<exploded_clustering_prefix> prefix,
                bool local,
-                db::consistency_level cl);
+                db::consistency_level cl,
+                tracing::trace_state_ptr trace_state);

 public:
    bool has_conditions();
@@ -327,7 +331,7 @@ public:
     * @return vector of the mutations
     * @throws invalid_request_exception on invalid requests
     */
-    future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& proxy, const query_options& options, bool local, int64_t now);
+    future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state);

 public:
    future<std::unique_ptr<update_parameters>> make_update_parameters(
@@ -336,7 +340,8 @@ public:
                lw_shared_ptr<exploded_clustering_prefix> prefix,
                const query_options& options,
                bool local,
-                int64_t now);
+                int64_t now,
+                tracing::trace_state_ptr trace_state);

 protected:
    /**
@@ -345,27 +350,7 @@ protected:
     * @throws InvalidRequestException
     */
    virtual void validate_where_clause_for_conditions();
-
-public:
-    class parsed : public cf_statement {
-    public:
-        using conditions_vector = std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<column_condition::raw>>>;
-    protected:
-        const ::shared_ptr<attributes::raw> _attrs;
-        const std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<column_condition::raw>>> _conditions;
-    private:
-        const bool _if_not_exists;
-        const bool _if_exists;
-    protected:
-        parsed(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, conditions_vector conditions, bool if_not_exists, bool if_exists);
-
-    public:
-        virtual ::shared_ptr<parsed_statement::prepared> prepare(database& db) override;
-        ::shared_ptr<modification_statement> prepare(database& db, ::shared_ptr<variable_specifications> bound_names);;
-    protected:
-        virtual ::shared_ptr<modification_statement> prepare_internal(database& db, schema_ptr schema,
-            ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) = 0;
-    };
+    friend class raw::modification_statement;
 };

 std::ostream& operator<<(std::ostream& out, modification_statement::statement_type t);
--- a/cql3/statements/parsed_statement.cc
+++ b/cql3/statements/parsed_statement.cc
@@ -39,12 +39,16 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "cql3/statements/parsed_statement.hh"
+#include "raw/parsed_statement.hh"
+
+#include "prepared_statement.hh"

 namespace cql3 {

 namespace statements {

+namespace raw {
+
 parsed_statement::~parsed_statement()
 { }

@@ -61,21 +65,23 @@ bool parsed_statement::uses_function(const sstring& ks_name, const sstring& func
    return false;
 }

-parsed_statement::prepared::prepared(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_)
+}
+
+prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_)
    : statement(std::move(statement_))
    , bound_names(std::move(bound_names_))
 { }

-parsed_statement::prepared::prepared(::shared_ptr<cql_statement> statement_, const variable_specifications& names)
-    : prepared(statement_, names.get_specifications())
+prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names)
+    : prepared_statement(std::move(statement_), names.get_specifications()) // statement_ is a by-value sink: move, don't copy
 { }

-parsed_statement::prepared::prepared(::shared_ptr<cql_statement> statement_, variable_specifications&& names)
-    : prepared(statement_, std::move(names).get_specifications())
+prepared_statement::prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names)
+    : prepared_statement(std::move(statement_), std::move(names).get_specifications()) // both arguments are sinks: move each
 { }

-parsed_statement::prepared::prepared(::shared_ptr<cql_statement>&& statement_)
-    : prepared(statement_, std::vector<::shared_ptr<column_specification>>())
+prepared_statement::prepared_statement(::shared_ptr<cql_statement>&& statement_)
+    : prepared_statement(std::move(statement_), std::vector<::shared_ptr<column_specification>>()) // named rvalue ref is an lvalue: move explicitly
 { }

 }
--- a/cql3/statements/prepared_statement.hh
+++ b/cql3/statements/prepared_statement.hh
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2016 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/variable_specifications.hh"
+#include "cql3/column_specification.hh"
+#include "cql3/column_identifier.hh"
+#include "cql3/cql_statement.hh"
+
+#include "core/shared_ptr.hh"
+
+#include <experimental/optional>
+#include <vector>
+
+namespace cql3 {
+
+namespace statements {
+// A cql_statement bundled with the column specifications of its bound variables.
+class prepared_statement {
+public:
+    sstring raw_cql_statement; // not set by any constructor declared here; NOTE(review): presumably the source CQL text - confirm
+    const ::shared_ptr<cql_statement> statement;
+    const std::vector<::shared_ptr<column_specification>> bound_names;
+
+    prepared_statement(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_);
+    // Overloads below: take the specifications from `names`, or (last one) use an empty list.
+    prepared_statement(::shared_ptr<cql_statement> statement_, const variable_specifications& names);
+
+    prepared_statement(::shared_ptr<cql_statement> statement_, variable_specifications&& names);
+
+    prepared_statement(::shared_ptr<cql_statement>&& statement_);
+};
+
+}
+
+}
--- a/cql3/statements/property_definitions.cc
+++ b/cql3/statements/property_definitions.cc
@@ -181,6 +181,21 @@ long property_definitions::to_long(sstring key, std::experimental::optional<sstr
    }
 }

+// Erases `key` from the map-valued property `name`; no-op if the property is unset, syntax_exception if it is not a map.
+void property_definitions::remove_from_map_if_exists(const sstring& name, const sstring& key)
+{
+    auto it = _properties.find(name);
+    if (it == _properties.end()) {
+        return;
+    }
+    // Pointer-form any_cast yields nullptr on a type mismatch, so the stored map is edited in place (no copy, no re-lookup).
+    auto* map = boost::any_cast<std::map<sstring, sstring>>(&it->second);
+    if (!map) {
+        throw exceptions::syntax_exception(sprint("Invalid value for property '%s'. It should be a map.", name));
+    }
+    map->erase(key);
+}
+
 }

 }
--- a/cql3/statements/property_definitions.hh
+++ b/cql3/statements/property_definitions.hh
@@ -79,6 +79,7 @@ protected:

    std::experimental::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;

+    void remove_from_map_if_exists(const sstring& name, const sstring& key);
 public:
    bool has_property(const sstring& name) const;

--- a/cql3/statements/raw/batch_statement.hh
+++ b/cql3/statements/raw/batch_statement.hh
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Modified by ScyllaDB
+ * Copyright (C) 2015 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cql3/cql_statement.hh"
+#include "modification_statement.hh"
+#include "service/storage_proxy.hh"
+#include "transport/messages/result_message.hh"
+#include "timestamp.hh"
+#include "log.hh"
+#include "to_string.hh"
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/range/adaptor/uniqued.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+
+#pragma once
+
+namespace cql3 {
+
+namespace statements {
+
+namespace raw {
+// Raw (not yet prepared) BATCH: a batch type, raw attributes and the raw modification statements it groups.
+class batch_statement : public raw::cf_statement {
+public:
+    enum class type {
+        LOGGED, UNLOGGED, COUNTER
+    };
+private:
+    type _type;
+    shared_ptr<attributes::raw> _attrs;
+    std::vector<shared_ptr<raw::modification_statement>> _parsed_statements;
+public:
+    batch_statement(
+        type type_,
+        shared_ptr<attributes::raw> attrs,
+        std::vector<shared_ptr<raw::modification_statement>> parsed_statements)
+            : cf_statement(nullptr) // the batch itself is constructed without a cf_name
+            , _type(type_)
+            , _attrs(std::move(attrs))
+            , _parsed_statements(std::move(parsed_statements)) {
+    }
+
+    virtual void prepare_keyspace(const service::client_state& state) override {
+        for (auto&& s : _parsed_statements) { // forwarded to every contained statement
+            s->prepare_keyspace(state);
+        }
+    }
+
+    virtual shared_ptr<prepared> prepare(database& db) override;
+};
+
+}
+
+}
+}
--- a/cql3/statements/raw/cf_statement.hh
+++ b/cql3/statements/raw/cf_statement.hh
@@ -41,15 +41,20 @@

 #pragma once

-#include "cql3/statements/parsed_statement.hh"
 #include "cql3/cf_name.hh"

 #include <experimental/optional>

+#include "parsed_statement.hh"
+
+namespace service { class client_state; }
+
 namespace cql3 {

 namespace statements {

+namespace raw {
+
 /**
 * Abstract class for statements that apply on a given column family.
 */
@@ -72,3 +77,5 @@ public:
 }

 }
+
+}
--- a/cql3/statements/raw/delete_statement.hh
+++ b/cql3/statements/raw/delete_statement.hh
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
+#include "cql3/attributes.hh"
+#include "cql3/operation.hh"
+#include "database_fwd.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+namespace raw {
+
+class delete_statement : public modification_statement { // raw DELETE: raw deletions plus the relations of its where clause
+private:
+    std::vector<::shared_ptr<operation::raw_deletion>> _deletions; // raw deletion operations
+    std::vector<::shared_ptr<relation>> _where_clause; // relations making up the where clause
+public:
+    delete_statement(::shared_ptr<cf_name> name,
+           ::shared_ptr<attributes::raw> attrs,
+           std::vector<::shared_ptr<operation::raw_deletion>> deletions,
+           std::vector<::shared_ptr<relation>> where_clause,
+           conditions_vector conditions,
+           bool if_exists);
+protected: // implements the pure virtual prepare_internal() declared by raw::modification_statement
+    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
+        ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) override;
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/raw/insert_statement.hh
+++ b/cql3/statements/raw/insert_statement.hh
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
+#include "cql3/column_identifier.hh"
+#include "cql3/term.hh"
+
+#include "database_fwd.hh"
+
+#include <vector>
+#include "unimplemented.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+namespace raw {
+
+class insert_statement : public raw::modification_statement {
+private:
+    const std::vector<::shared_ptr<column_identifier::raw>> _column_names;
+    const std::vector<::shared_ptr<term::raw>> _column_values;
+public:
+    /**
+     * A parsed <code>INSERT</code> statement.
+     * @param name column family being operated on
+     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
+     * @param column_names list of column names
+     * @param column_values list of column values (corresponds to names)
+     * @param if_not_exists presumably true when the statement carries IF NOT EXISTS (TODO confirm in the .cc)
+     */
+    insert_statement(::shared_ptr<cf_name> name,
+                  ::shared_ptr<attributes::raw> attrs,
+                  std::vector<::shared_ptr<column_identifier::raw>> column_names,
+                  std::vector<::shared_ptr<term::raw>> column_values,
+                  bool if_not_exists);
+
+    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
+                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) override;
+
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/raw/modification_statement.hh
+++ b/cql3/statements/raw/modification_statement.hh
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/restrictions/restriction.hh"
+#include "cql3/statements/raw/cf_statement.hh"
+#include "cql3/column_identifier.hh"
+#include "cql3/update_parameters.hh"
+#include "cql3/column_condition.hh"
+#include "cql3/cql_statement.hh"
+#include "cql3/attributes.hh"
+#include "cql3/operation.hh"
+#include "cql3/relation.hh"
+
+#include "db/consistency_level.hh"
+
+#include "core/shared_ptr.hh"
+#include "core/future-util.hh"
+
+#include "unimplemented.hh"
+#include "validation.hh"
+#include "service/storage_proxy.hh"
+
+#include <memory>
+
+namespace cql3 {
+
+namespace statements {
+
+class modification_statement;
+
+namespace raw {
+
+class modification_statement : public cf_statement { // common base of the raw modification statements; concrete ones implement prepare_internal()
+public:
+    using conditions_vector = std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<column_condition::raw>>>; // (raw column, raw condition) pairs
+protected:
+    const ::shared_ptr<attributes::raw> _attrs; // raw statement attributes
+    const conditions_vector _conditions; // same type as before, now spelled through the alias above
+private:
+    const bool _if_not_exists; // NOTE(review): presumably the IF NOT EXISTS flag -- constructor body not shown here
+    const bool _if_exists; // NOTE(review): presumably the IF EXISTS flag -- constructor body not shown here
+protected:
+    modification_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, conditions_vector conditions, bool if_not_exists, bool if_exists);
+
+public:
+    virtual ::shared_ptr<prepared> prepare(database& db) override; // parsed_statement entry point
+    ::shared_ptr<cql3::statements::modification_statement> prepare(database& db, ::shared_ptr<variable_specifications> bound_names); // overload taking caller-supplied bound variable specifications
+protected: // hook each concrete raw statement must implement; it returns the non-raw modification_statement
+    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
+        ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) = 0;
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/raw/parsed_statement.hh
+++ b/cql3/statements/raw/parsed_statement.hh
@@ -17,7 +17,7 @@
 */

 /*
- * Copyright (C) 2014 ScyllaDB
+ * Copyright (C) 2016 ScyllaDB
 *
 * Modified by ScyllaDB
 */
@@ -44,9 +44,8 @@
 #include "cql3/variable_specifications.hh"
 #include "cql3/column_specification.hh"
 #include "cql3/column_identifier.hh"
-#include "cql3/cql_statement.hh"

-#include "core/shared_ptr.hh"
+#include <seastar/core/shared_ptr.hh>

 #include <experimental/optional>
 #include <vector>
@@ -55,31 +54,22 @@ namespace cql3 {

 namespace statements {

+class prepared_statement;
+
+namespace raw {
+
 class parsed_statement {
 private:
    ::shared_ptr<variable_specifications> _variables;

 public:
+    using prepared = statements::prepared_statement;
    virtual ~parsed_statement();

    shared_ptr<variable_specifications> get_bound_variables();

    void set_bound_variables(const std::vector<::shared_ptr<column_identifier>>& bound_names);

-    class prepared {
-    public:
-        const ::shared_ptr<cql_statement> statement;
-        const std::vector<::shared_ptr<column_specification>> bound_names;
-
-        prepared(::shared_ptr<cql_statement> statement_, std::vector<::shared_ptr<column_specification>> bound_names_);
-
-        prepared(::shared_ptr<cql_statement> statement_, const variable_specifications& names);
-
-        prepared(::shared_ptr<cql_statement> statement_, variable_specifications&& names);
-
-        prepared(::shared_ptr<cql_statement>&& statement_);
-    };
-
    virtual ::shared_ptr<prepared> prepare(database& db) = 0;

    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const;
@@ -88,3 +78,5 @@ public:
 }

 }
+
+}
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/statements/raw/cf_statement.hh"
+#include "cql3/cql_statement.hh"
+#include "cql3/selection/selection.hh"
+#include "cql3/selection/raw_selector.hh"
+#include "cql3/restrictions/statement_restrictions.hh"
+#include "cql3/result_set.hh"
+#include "exceptions/unrecognized_entity_exception.hh"
+#include "service/client_state.hh"
+#include "core/shared_ptr.hh"
+#include "core/distributed.hh"
+#include "validation.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+namespace raw {
+
+/**
+ * Encapsulates a completely parsed SELECT query, including the target
+ * column family, expression, result count, and ordering clause.
+ *
+ */
+class select_statement : public cf_statement // raw SELECT; prepare() below returns the prepared form
+{
+public:
+    class parameters final { // immutable bundle: orderings plus the is_distinct / allow_filtering flags (all members are const)
+    public:
+        using orderings_type = std::vector<std::pair<shared_ptr<column_identifier::raw>, bool>>; // (raw column, bool) pairs; NOTE(review): the bool presumably marks a reversed ordering -- confirm
+    private:
+        const orderings_type _orderings;
+        const bool _is_distinct;
+        const bool _allow_filtering;
+    public:
+        parameters();
+        parameters(orderings_type orderings,
+            bool is_distinct,
+            bool allow_filtering);
+        bool is_distinct(); // accessors are declared non-const; definitions live outside this header
+        bool allow_filtering();
+        orderings_type const& orderings();
+    };
+    template<typename T>
+    using compare_fn = std::function<bool(const T&, const T&)>; // binary predicate over two values of T
+
+    using result_row_type = std::vector<bytes_opt>; // one result row
+    using ordering_comparator_type = compare_fn<result_row_type>; // comparator over result rows
+private:
+    ::shared_ptr<parameters> _parameters; // orderings and flags, see parameters above
+    std::vector<::shared_ptr<selection::raw_selector>> _select_clause; // raw selectors of the select clause
+    std::vector<::shared_ptr<relation>> _where_clause; // relations of the where clause
+    ::shared_ptr<term::raw> _limit; // raw limit term; see prepare_limit() for the "no limit" case
+public:
+    select_statement(::shared_ptr<cf_name> cf_name,
+            ::shared_ptr<parameters> parameters,
+            std::vector<::shared_ptr<selection::raw_selector>> select_clause,
+            std::vector<::shared_ptr<relation>> where_clause,
+            ::shared_ptr<term::raw> limit);
+
+    virtual ::shared_ptr<prepared> prepare(database& db) override; // parsed_statement entry point
+private: // helpers below are private; presumably used while preparing -- definitions are not in this header
+    ::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
+        database& db,
+        schema_ptr schema,
+        ::shared_ptr<variable_specifications> bound_names,
+        ::shared_ptr<selection::selection> selection);
+
+    /** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
+    ::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);
+
+    static void verify_ordering_is_allowed(::shared_ptr<restrictions::statement_restrictions> restrictions);
+
+    static void validate_distinct_selection(schema_ptr schema,
+        ::shared_ptr<selection::selection> selection,
+        ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
+    void handle_unrecognized_ordering_column(::shared_ptr<column_identifier> column);
+
+    select_statement::ordering_comparator_type get_ordering_comparator(schema_ptr schema,
+        ::shared_ptr<selection::selection> selection,
+        ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
+    bool is_reversed(schema_ptr schema);
+
+    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
+    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
+
+    bool contains_alias(::shared_ptr<column_identifier> name);
+
+    ::shared_ptr<column_specification> limit_receiver();
+
+#if 0 // disabled to_string(); note the block carries its own "};" and would need fixing before being re-enabled
+    public:
+        virtual sstring to_string() override {
+            return sstring("raw_statement(")
+                + "name=" + cf_name->to_string()
+                + ", selectClause=" + to_string(_select_clause)
+                + ", whereClause=" + to_string(_where_clause)
+                + ", isDistinct=" + to_string(_parameters->is_distinct())
+                + ")";
+        }
+    };
+#endif
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/raw/update_statement.hh
+++ b/cql3/statements/raw/update_statement.hh
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
+#include "cql3/column_identifier.hh"
+#include "cql3/term.hh"
+
+#include "database_fwd.hh"
+
+#include <vector>
+#include "unimplemented.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+class update_statement;
+
+namespace raw {
+
+class update_statement : public raw::modification_statement {
+private:
+    // Provided for an UPDATE
+    std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<operation::raw_update>>> _updates;
+    std::vector<relation_ptr> _where_clause;
+public:
+    /**
+     * Creates a new raw UPDATE statement from a column family name, attributes,
+     * column operations, a where clause and conditions.
+     * @param name column family being operated on
+     * @param attrs additional attributes for statement (timestamp, timeToLive)
+     * @param updates list of (raw column, raw update operation) pairs to perform
+     * @param where_clause the where clause
+     * @param conditions (raw column, raw condition) pairs, see conditions_vector in the base class
+     */
+    update_statement(::shared_ptr<cf_name> name,
+        ::shared_ptr<attributes::raw> attrs,
+        std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<operation::raw_update>>> updates,
+        std::vector<relation_ptr> where_clause,
+        conditions_vector conditions);
+protected: // implements the pure virtual prepare_internal() declared by raw::modification_statement
+    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
+                ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) override;
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/raw/use_statement.hh
+++ b/cql3/statements/raw/use_statement.hh
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2014 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "cql3/statements/raw/parsed_statement.hh"
+
+namespace cql3 {
+
+namespace statements {
+
+class prepared_statement;
+
+namespace raw {
+
+class use_statement : public parsed_statement { // raw USE statement: carries only a keyspace name
+private:
+    const sstring _keyspace; // keyspace name; NOTE(review): presumably set from the constructor argument -- definition not shown here
+
+public:
+    use_statement(sstring keyspace); // defined out of line
+
+    virtual ::shared_ptr<prepared> prepare(database& db) override; // parsed_statement entry point, defined out of line
+};
+
+}
+
+}
+
+}
--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -86,9 +86,9 @@ void schema_altering_statement::prepare_keyspace(const service::client_state& st
    }
 }

-::shared_ptr<parsed_statement::prepared> schema_altering_statement::prepare(database& db)
+::shared_ptr<prepared_statement> schema_altering_statement::prepare(database& db)
 {
-    return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
+    return ::make_shared<prepared>(this->shared_from_this()); // the prepared statement wraps this very object; db is not used
 }

 future<::shared_ptr<messages::result_message>>
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -44,7 +44,7 @@
 #include "transport/messages_fwd.hh"
 #include "transport/event.hh"

-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/cql_statement.hh"

 #include "core/shared_ptr.hh"
@@ -60,7 +60,7 @@ namespace messages = transport::messages;
 /**
 * Abstract class for statements that alter the schema.
 */
-class schema_altering_statement : public cf_statement, public cql_statement, public ::enable_shared_from_this<schema_altering_statement> {
+class schema_altering_statement : public raw::cf_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<schema_altering_statement> {
 private:
    const bool _is_column_family_level;

--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -40,6 +40,7 @@
 */

 #include "cql3/statements/select_statement.hh"
+#include "cql3/statements/raw/select_statement.hh"

 #include "transport/messages/result_message.hh"
 #include "cql3/selection/selection.hh"
@@ -60,8 +61,8 @@ select_statement::parameters::parameters()
 { }

 select_statement::parameters::parameters(orderings_type orderings,
-    bool is_distinct,
-    bool allow_filtering)
+                                         bool is_distinct,
+                                         bool allow_filtering)
    : _orderings{std::move(orderings)}
    , _is_distinct{is_distinct}
    , _allow_filtering{allow_filtering}
@@ -80,21 +81,21 @@ select_statement::parameters::orderings_type const& select_statement::parameters
 }

 select_statement::select_statement(schema_ptr schema,
-    uint32_t bound_terms,
-    ::shared_ptr<parameters> parameters,
-    ::shared_ptr<selection::selection> selection,
-    ::shared_ptr<restrictions::statement_restrictions> restrictions,
-    bool is_reversed,
-    ordering_comparator_type ordering_comparator,
-    ::shared_ptr<term> limit)
-        : _schema(schema)
-        , _bound_terms(bound_terms)
-        , _parameters(std::move(parameters))
-        , _selection(std::move(selection))
-        , _restrictions(std::move(restrictions))
-        , _is_reversed(is_reversed)
-        , _limit(std::move(limit))
-        , _ordering_comparator(std::move(ordering_comparator))
+                                   uint32_t bound_terms,
+                                   ::shared_ptr<parameters> parameters,
+                                   ::shared_ptr<selection::selection> selection,
+                                   ::shared_ptr<restrictions::statement_restrictions> restrictions,
+                                   bool is_reversed,
+                                   ordering_comparator_type ordering_comparator,
+                                   ::shared_ptr<term> limit)
+    : _schema(schema)
+    , _bound_terms(bound_terms)
+    , _parameters(std::move(parameters))
+    , _selection(std::move(selection))
+    , _restrictions(std::move(restrictions))
+    , _is_reversed(is_reversed)
+    , _limit(std::move(limit))
+    , _ordering_comparator(std::move(ordering_comparator))
 {
    _opts = _selection->get_query_options();
 }
@@ -117,7 +118,7 @@ select_statement::for_selection(schema_ptr schema, ::shared_ptr<selection::selec
        ::shared_ptr<term>{});
 }

-::shared_ptr<cql3::metadata> select_statement::get_result_metadata() const {
+::shared_ptr<const cql3::metadata> select_statement::get_result_metadata() const {
    // FIXME: COUNT needs special result metadata handling.
    return _selection->get_result_metadata();
 }
@@ -151,7 +152,8 @@ const sstring& select_statement::column_family() const {
 }

 query::partition_slice
-select_statement::make_partition_slice(const query_options& options) {
+select_statement::make_partition_slice(const query_options& options)
+{
    std::vector<column_id> static_columns;
    std::vector<column_id> regular_columns;

@@ -212,7 +214,10 @@ bool select_statement::needs_post_query_ordering() const {
 }

 future<shared_ptr<transport::messages::result_message>>
-select_statement::execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) {
+select_statement::execute(distributed<service::storage_proxy>& proxy,
+                          service::query_state& state,
+                          const query_options& options)
+{
    auto cl = options.get_consistency();

    validate_for_read(_schema->ks_name(), cl);
@@ -221,7 +226,7 @@ select_statement::execute(distributed<service::storage_proxy>& proxy, service::q
    auto now = db_clock::now();

    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
-        make_partition_slice(options), limit, to_gc_clock(now));
+        make_partition_slice(options), limit, to_gc_clock(now), tracing::make_trace_info(state.get_trace_state()), query::max_partitions, options.get_timestamp(state));

    int32_t page_size = options.get_page_size();

@@ -281,9 +286,13 @@ select_statement::execute(distributed<service::storage_proxy>& proxy, service::q
 }

 future<shared_ptr<transport::messages::result_message>>
-select_statement::execute(distributed<service::storage_proxy>& proxy, lw_shared_ptr<query::read_command> cmd, std::vector<query::partition_range>&& partition_ranges,
-        service::query_state& state, const query_options& options, db_clock::time_point now) {
-
+select_statement::execute(distributed<service::storage_proxy>& proxy,
+                          lw_shared_ptr<query::read_command> cmd,
+                          std::vector<query::partition_range>&& partition_ranges,
+                          service::query_state& state,
+                          const query_options& options,
+                          db_clock::time_point now)
+{
    // If this is a query with IN on partition key, ORDER BY clause and LIMIT
    // is specified we need to get "limit" rows from each partition since there
    // is no way to tell which of these rows belong to the query result before
@@ -294,26 +303,28 @@ select_statement::execute(distributed<service::storage_proxy>& proxy, lw_shared_
            return map_reduce(prs.begin(), prs.end(), [this, &proxy, &state, &options, cmd] (auto pr) {
                std::vector<query::partition_range> prange { pr };
                auto command = ::make_lw_shared<query::read_command>(*cmd);
-                return proxy.local().query(_schema, command, std::move(prange), options.get_consistency());
+                return proxy.local().query(_schema, command, std::move(prange), options.get_consistency(), state.get_trace_state());
            }, std::move(merger));
        }).then([this, &options, now, cmd] (auto result) {
            return this->process_results(std::move(result), cmd, options, now);
        });
    } else {
-        return proxy.local().query(_schema, cmd, std::move(partition_ranges), options.get_consistency())
+        return proxy.local().query(_schema, cmd, std::move(partition_ranges), options.get_consistency(), state.get_trace_state())
            .then([this, &options, now, cmd] (auto result) {
                return this->process_results(std::move(result), cmd, options, now);
            });
    }
 }

-
 future<::shared_ptr<transport::messages::result_message>>
-select_statement::execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) {
+select_statement::execute_internal(distributed<service::storage_proxy>& proxy,
+                                   service::query_state& state,
+                                   const query_options& options)
+{
    int32_t limit = get_limit(options);
    auto now = db_clock::now();
    auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
-        make_partition_slice(options), limit);
+        make_partition_slice(options), limit, to_gc_clock(now), std::experimental::nullopt, query::max_partitions, options.get_timestamp(state));
    auto partition_ranges = _restrictions->get_partition_key_ranges(options);

    if (needs_post_query_ordering() && _limit) {
@@ -322,23 +333,24 @@ select_statement::execute_internal(distributed<service::storage_proxy>& proxy, s
            return map_reduce(prs.begin(), prs.end(), [this, &proxy, &state, command] (auto pr) {
                std::vector<query::partition_range> prange { pr };
                auto cmd = ::make_lw_shared<query::read_command>(*command);
-                return proxy.local().query(_schema, cmd, std::move(prange), db::consistency_level::ONE);
+                return proxy.local().query(_schema, cmd, std::move(prange), db::consistency_level::ONE, state.get_trace_state());
            }, std::move(merger));
        }).then([command, this, &options, now] (auto result) {
            return this->process_results(std::move(result), command, options, now);
        }).finally([command] { });
    } else {
-        return proxy.local().query(_schema, command, std::move(partition_ranges), db::consistency_level::ONE).then([command, this, &options, now] (auto result) {
+        return proxy.local().query(_schema, command, std::move(partition_ranges), db::consistency_level::ONE, state.get_trace_state()).then([command, this, &options, now] (auto result) {
            return this->process_results(std::move(result), command, options, now);
        }).finally([command] {});
    }
 }

-shared_ptr<transport::messages::result_message> select_statement::process_results(
-        foreign_ptr<lw_shared_ptr<query::result>> results,
-        lw_shared_ptr<query::read_command> cmd, const query_options& options,
-        db_clock::time_point now) {
-
+shared_ptr<transport::messages::result_message>
+select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> results,
+                                  lw_shared_ptr<query::read_command> cmd,
+                                  const query_options& options,
+                                  db_clock::time_point now)
+{
    cql3::selection::result_set_builder builder(*_selection, now,
            options.get_cql_serialization_format());
    query::result_view::consume(*results, cmd->slice,
@@ -356,11 +368,13 @@ shared_ptr<transport::messages::result_message> select_statement::process_result
    return ::make_shared<transport::messages::result_message::rows>(std::move(rs));
 }

-select_statement::raw_statement::raw_statement(::shared_ptr<cf_name> cf_name,
-                                               ::shared_ptr<parameters> parameters,
-                                               std::vector<::shared_ptr<selection::raw_selector>> select_clause,
-                                               std::vector<::shared_ptr<relation>> where_clause,
-                                               ::shared_ptr<term::raw> limit)
+namespace raw {
+
+select_statement::select_statement(::shared_ptr<cf_name> cf_name,
+                                   ::shared_ptr<parameters> parameters,
+                                   std::vector<::shared_ptr<selection::raw_selector>> select_clause,
+                                   std::vector<::shared_ptr<relation>> where_clause,
+                                   ::shared_ptr<term::raw> limit)
    : cf_statement(std::move(cf_name))
    , _parameters(std::move(parameters))
    , _select_clause(std::move(select_clause))
@@ -368,8 +382,7 @@ select_statement::raw_statement::raw_statement(::shared_ptr<cf_name> cf_name,
    , _limit(std::move(limit))
 { }

-::shared_ptr<parsed_statement::prepared>
-select_statement::raw_statement::prepare(database& db) {
+::shared_ptr<prepared_statement> select_statement::prepare(database& db) {
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
    auto bound_names = get_bound_variables();

@@ -394,7 +407,7 @@ select_statement::raw_statement::prepare(database& db) {

    check_needs_filtering(restrictions);

-    auto stmt = ::make_shared<select_statement>(schema,
+    auto stmt = ::make_shared<cql3::statements::select_statement>(schema,
        bound_names->size(),
        _parameters,
        std::move(selection),
@@ -403,13 +416,14 @@ select_statement::raw_statement::prepare(database& db) {
        std::move(ordering_comparator),
        prepare_limit(db, bound_names));

-    return ::make_shared<parsed_statement::prepared>(std::move(stmt), std::move(*bound_names));
+    return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names));
 }

 ::shared_ptr<restrictions::statement_restrictions>
-select_statement::raw_statement::prepare_restrictions(database& db, schema_ptr schema,
-    ::shared_ptr<variable_specifications> bound_names,
-    ::shared_ptr<selection::selection> selection)
+select_statement::prepare_restrictions(database& db,
+                                       schema_ptr schema,
+                                       ::shared_ptr<variable_specifications> bound_names,
+                                       ::shared_ptr<selection::selection> selection)
 {
    try {
        return ::make_shared<restrictions::statement_restrictions>(db, schema, std::move(_where_clause), bound_names,
@@ -424,7 +438,8 @@ select_statement::raw_statement::prepare_restrictions(database& db, schema_ptr s

 /** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
 ::shared_ptr<term>
-select_statement::raw_statement::prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names) {
+select_statement::prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names)
+{
    if (!_limit) {
        return {};
    }
@@ -434,8 +449,7 @@ select_statement::raw_statement::prepare_limit(database& db, ::shared_ptr<variab
    return prep_limit;
 }

-void select_statement::raw_statement::verify_ordering_is_allowed(
-    ::shared_ptr<restrictions::statement_restrictions> restrictions)
+void select_statement::verify_ordering_is_allowed(::shared_ptr<restrictions::statement_restrictions> restrictions)
 {
    if (restrictions->uses_secondary_indexing()) {
        throw exceptions::invalid_request_exception("ORDER BY with 2ndary indexes is not supported.");
@@ -445,9 +459,9 @@ void select_statement::raw_statement::verify_ordering_is_allowed(
    }
 }

-void select_statement::raw_statement::validate_distinct_selection(schema_ptr schema,
-    ::shared_ptr<selection::selection> selection,
-    ::shared_ptr<restrictions::statement_restrictions> restrictions)
+void select_statement::validate_distinct_selection(schema_ptr schema,
+                                                   ::shared_ptr<selection::selection> selection,
+                                                   ::shared_ptr<restrictions::statement_restrictions> restrictions)
 {
    for (auto&& def : selection->get_columns()) {
        if (!def->is_partition_key() && !def->is_static()) {
@@ -471,8 +485,7 @@ void select_statement::raw_statement::validate_distinct_selection(schema_ptr sch
    }
 }

-void select_statement::raw_statement::handle_unrecognized_ordering_column(
-    ::shared_ptr<column_identifier> column)
+void select_statement::handle_unrecognized_ordering_column(::shared_ptr<column_identifier> column)
 {
    if (contains_alias(column)) {
        throw exceptions::invalid_request_exception(sprint("Aliases are not allowed in order by clause ('%s')", *column));
@@ -481,9 +494,9 @@ void select_statement::raw_statement::handle_unrecognized_ordering_column(
 }

 select_statement::ordering_comparator_type
-select_statement::raw_statement::get_ordering_comparator(schema_ptr schema,
-    ::shared_ptr<selection::selection> selection,
-    ::shared_ptr<restrictions::statement_restrictions> restrictions)
+select_statement::get_ordering_comparator(schema_ptr schema,
+                                          ::shared_ptr<selection::selection> selection,
+                                          ::shared_ptr<restrictions::statement_restrictions> restrictions)
 {
    if (!restrictions->key_is_in_relation()) {
        return {};
@@ -530,8 +543,7 @@ select_statement::raw_statement::get_ordering_comparator(schema_ptr schema,
    };
 }

-bool select_statement::raw_statement::is_reversed(schema_ptr schema) {
-
+bool select_statement::is_reversed(schema_ptr schema) {
    assert(_parameters->orderings().size() > 0);
    parameters::orderings_type::size_type i = 0;
    bool is_reversed_ = false;
@@ -576,8 +588,7 @@ bool select_statement::raw_statement::is_reversed(schema_ptr schema) {
 }

 /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
-void select_statement::raw_statement::check_needs_filtering(
-    ::shared_ptr<restrictions::statement_restrictions> restrictions)
+void select_statement::check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions)
 {
    // non-key-range non-indexed queries cannot involve filtering underneath
    if (!_parameters->allow_filtering() && (restrictions->is_key_range() || restrictions->uses_secondary_indexing())) {
@@ -593,16 +604,19 @@ void select_statement::raw_statement::check_needs_filtering(
    }
 }

-bool select_statement::raw_statement::contains_alias(::shared_ptr<column_identifier> name) {
+bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
    return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
        return raw->alias && *name == *raw->alias;
    });
 }

-::shared_ptr<column_specification> select_statement::raw_statement::limit_receiver() {
+::shared_ptr<column_specification> select_statement::limit_receiver() {
    return ::make_shared<column_specification>(keyspace(), column_family(), ::make_shared<column_identifier>("[limit]", true),
        int32_type);
 }

 }
+
+}
+
 }
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -41,7 +41,8 @@

 #pragma once

-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
+#include "cql3/statements/raw/select_statement.hh"
 #include "cql3/cql_statement.hh"
 #include "cql3/selection/selection.hh"
 #include "cql3/selection/raw_selector.hh"
@@ -64,22 +65,7 @@ namespace statements {
 */
 class select_statement : public cql_statement {
 public:
-    class parameters final {
-    public:
-        using orderings_type = std::vector<std::pair<shared_ptr<column_identifier::raw>, bool>>;
-    private:
-        const orderings_type _orderings;
-        const bool _is_distinct;
-        const bool _allow_filtering;
-    public:
-        parameters();
-        parameters(orderings_type orderings,
-            bool is_distinct,
-            bool allow_filtering);
-        bool is_distinct();
-        bool allow_filtering();
-        orderings_type const& orderings();
-    };
+    using parameters = raw::select_statement::parameters;
 private:
    static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
    static thread_local const ::shared_ptr<parameters> _default_parameters;
@@ -92,10 +78,10 @@ private:
    ::shared_ptr<term> _limit;

    template<typename T>
-    using compare_fn = std::function<bool(const T&, const T&)>;
+    using compare_fn = raw::select_statement::compare_fn<T>;

-    using result_row_type = std::vector<bytes_opt>;
-    using ordering_comparator_type = compare_fn<result_row_type>;
+    using result_row_type = raw::select_statement::result_row_type;
+    using ordering_comparator_type = raw::select_statement::ordering_comparator_type;

    /**
     * The comparator used to orders results when multiple keys are selected (using IN).
@@ -121,7 +107,7 @@ public:
    static ::shared_ptr<select_statement> for_selection(
        schema_ptr schema, ::shared_ptr<selection::selection> selection);

-    ::shared_ptr<cql3::metadata> get_result_metadata() const;
+    virtual ::shared_ptr<const cql3::metadata> get_result_metadata() const override;
    virtual uint32_t get_bound_terms() override;
    virtual future<> check_access(const service::client_state& state) override;
    virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
@@ -430,69 +416,6 @@ private:
        result.add(row.getColumn(def.name));
    }
 #endif
-
-public:
-    class raw_statement;
-};
-
-class select_statement::raw_statement : public cf_statement
-{
-private:
-    ::shared_ptr<parameters> _parameters;
-    std::vector<::shared_ptr<selection::raw_selector>> _select_clause;
-    std::vector<::shared_ptr<relation>> _where_clause;
-    ::shared_ptr<term::raw> _limit;
-public:
-    raw_statement(::shared_ptr<cf_name> cf_name,
-            ::shared_ptr<parameters> parameters,
-            std::vector<::shared_ptr<selection::raw_selector>> select_clause,
-            std::vector<::shared_ptr<relation>> where_clause,
-            ::shared_ptr<term::raw> limit);
-
-    virtual ::shared_ptr<prepared> prepare(database& db) override;
-private:
-    ::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
-        database& db,
-        schema_ptr schema,
-        ::shared_ptr<variable_specifications> bound_names,
-        ::shared_ptr<selection::selection> selection);
-
-    /** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
-    ::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);
-
-    static void verify_ordering_is_allowed(::shared_ptr<restrictions::statement_restrictions> restrictions);
-
-    static void validate_distinct_selection(schema_ptr schema,
-        ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
-    void handle_unrecognized_ordering_column(::shared_ptr<column_identifier> column);
-
-    select_statement::ordering_comparator_type get_ordering_comparator(schema_ptr schema,
-        ::shared_ptr<selection::selection> selection,
-        ::shared_ptr<restrictions::statement_restrictions> restrictions);
-
-    bool is_reversed(schema_ptr schema);
-
-    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
-    void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
-
-    bool contains_alias(::shared_ptr<column_identifier> name);
-
-    ::shared_ptr<column_specification> limit_receiver();
-
-#if 0
-    public:
-        virtual sstring to_string() override {
-            return sstring("raw_statement(")
-                + "name=" + cf_name->to_string()
-                + ", selectClause=" + to_string(_select_clause)
-                + ", whereClause=" + to_string(_where_clause)
-                + ", isDistinct=" + to_string(_parameters->is_distinct())
-                + ")";
-        }
-    };
-#endif
 };

 }
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -40,6 +40,7 @@
 */

 #include "cql3/statements/truncate_statement.hh"
+#include "cql3/statements/prepared_statement.hh"
 #include "cql3/cql_statement.hh"

 #include <experimental/optional>
@@ -58,9 +59,9 @@ uint32_t truncate_statement::get_bound_terms()
    return 0;
 }

-::shared_ptr<parsed_statement::prepared> truncate_statement::prepare(database& db)
+::shared_ptr<prepared_statement> truncate_statement::prepare(database& db)
 {
-    return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
+    return ::make_shared<prepared>(this->shared_from_this());
 }

 bool truncate_statement::uses_function(const sstring& ks_name, const sstring& function_name) const
--- a/cql3/statements/truncate_statement.hh
+++ b/cql3/statements/truncate_statement.hh
@@ -41,7 +41,7 @@

 #pragma once

-#include "cql3/statements/cf_statement.hh"
+#include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/cql_statement.hh"

 #include <experimental/optional>
@@ -50,7 +50,7 @@ namespace cql3 {

 namespace statements {

-class truncate_statement : public cf_statement, public cql_statement, public ::enable_shared_from_this<truncate_statement> {
+class truncate_statement : public raw::cf_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<truncate_statement> {
 public:
    truncate_statement(::shared_ptr<cf_name> name);

--- a/cql3/statements/update_statement.cc
+++ b/cql3/statements/update_statement.cc
@@ -40,6 +40,8 @@
 */

 #include "update_statement.hh"
+#include "raw/update_statement.hh"
+#include "raw/insert_statement.hh"
 #include "unimplemented.hh"

 #include "cql3/operation_impl.hh"
@@ -108,21 +110,24 @@ void update_statement::add_update_for_key(mutation& m, const exploded_clustering
 #endif
 }

-update_statement::parsed_insert::parsed_insert(::shared_ptr<cf_name> name,
+namespace raw {
+
+insert_statement::insert_statement(            ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               std::vector<::shared_ptr<column_identifier::raw>> column_names,
                                               std::vector<::shared_ptr<term::raw>> column_values,
                                               bool if_not_exists)
-    : modification_statement::parsed{std::move(name), std::move(attrs), conditions_vector{}, if_not_exists, false}
+    : raw::modification_statement{std::move(name), std::move(attrs), conditions_vector{}, if_not_exists, false}
    , _column_names{std::move(column_names)}
    , _column_values{std::move(column_values)}
 { }

-::shared_ptr<modification_statement>
-update_statement::parsed_insert::prepare_internal(database& db, schema_ptr schema,
+::shared_ptr<cql3::statements::modification_statement>
+insert_statement::prepare_internal(database& db, schema_ptr schema,
    ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs)
 {
-    auto stmt = ::make_shared<update_statement>(statement_type::INSERT, bound_names->size(), schema, std::move(attrs));
+    using statement_type = cql3::statements::modification_statement::statement_type;
+    auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::INSERT, bound_names->size(), schema, std::move(attrs));

    // Created from an INSERT
    if (stmt->is_counter()) {
@@ -164,21 +169,22 @@ update_statement::parsed_insert::prepare_internal(database& db, schema_ptr schem
    return stmt;
 }

-update_statement::parsed_update::parsed_update(::shared_ptr<cf_name> name,
+update_statement::update_statement(            ::shared_ptr<cf_name> name,
                                               ::shared_ptr<attributes::raw> attrs,
                                               std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<operation::raw_update>>> updates,
                                               std::vector<relation_ptr> where_clause,
                                               conditions_vector conditions)
-    : modification_statement::parsed(std::move(name), std::move(attrs), std::move(conditions), false, false)
+    : raw::modification_statement(std::move(name), std::move(attrs), std::move(conditions), false, false)
    , _updates(std::move(updates))
    , _where_clause(std::move(where_clause))
 { }

-::shared_ptr<modification_statement>
-update_statement::parsed_update::prepare_internal(database& db, schema_ptr schema,
+::shared_ptr<cql3::statements::modification_statement>
+update_statement::prepare_internal(database& db, schema_ptr schema,
    ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs)
 {
-    auto stmt = ::make_shared<update_statement>(statement_type::UPDATE, bound_names->size(), schema, std::move(attrs));
+    using statement_type = cql3::statements::modification_statement::statement_type;
+    auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::UPDATE, bound_names->size(), schema, std::move(attrs));

    for (auto&& entry : _updates) {
        auto id = entry.first->prepare_column_identifier(schema);
@@ -203,3 +209,5 @@ update_statement::parsed_update::prepare_internal(database& db, schema_ptr schem
 }

 }
+
+}
--- a/cql3/statements/update_statement.hh
+++ b/cql3/statements/update_statement.hh
@@ -42,6 +42,7 @@
 #pragma once

 #include "cql3/statements/modification_statement.hh"
+#include "cql3/statements/raw/modification_statement.hh"
 #include "cql3/column_identifier.hh"
 #include "cql3/term.hh"

@@ -69,55 +70,6 @@ private:
    virtual bool require_full_clustering_key() const override;

    virtual void add_update_for_key(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
-public:
-    class parsed_insert : public modification_statement::parsed {
-    private:
-        const std::vector<::shared_ptr<column_identifier::raw>> _column_names;
-        const std::vector<::shared_ptr<term::raw>> _column_values;
-    public:
-        /**
-         * A parsed <code>INSERT</code> statement.
-         *
-         * @param name column family being operated on
-         * @param columnNames list of column names
-         * @param columnValues list of column values (corresponds to names)
-         * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
-         */
-        parsed_insert(::shared_ptr<cf_name> name,
-                      ::shared_ptr<attributes::raw> attrs,
-                      std::vector<::shared_ptr<column_identifier::raw>> column_names,
-                      std::vector<::shared_ptr<term::raw>> column_values,
-                      bool if_not_exists);
-
-        virtual ::shared_ptr<modification_statement> prepare_internal(database& db, schema_ptr schema,
-                    ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) override;
-
-    };
-
-    class parsed_update : public modification_statement::parsed {
-    private:
-        // Provided for an UPDATE
-        std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<operation::raw_update>>> _updates;
-        std::vector<relation_ptr> _where_clause;
-    public:
-        /**
-         * Creates a new UpdateStatement from a column family name, columns map, consistency
-         * level, and key term.
-         *
-         * @param name column family being operated on
-         * @param attrs additional attributes for statement (timestamp, timeToLive)
-         * @param updates a map of column operations to perform
-         * @param whereClause the where clause
-         */
-        parsed_update(::shared_ptr<cf_name> name,
-            ::shared_ptr<attributes::raw> attrs,
-            std::vector<std::pair<::shared_ptr<column_identifier::raw>, ::shared_ptr<operation::raw_update>>> updates,
-            std::vector<relation_ptr> where_clause,
-            conditions_vector conditions);
-    protected:
-        virtual ::shared_ptr<modification_statement> prepare_internal(database& db, schema_ptr schema,
-                    ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs);
-    };
 };

 }
--- a/cql3/statements/use_statement.cc
+++ b/cql3/statements/use_statement.cc
@@ -40,6 +40,7 @@
 */

 #include "cql3/statements/use_statement.hh"
+#include "cql3/statements/raw/use_statement.hh"

 #include "transport/messages/result_message.hh"

@@ -57,14 +58,23 @@ uint32_t use_statement::get_bound_terms()
    return 0;
 }

-::shared_ptr<parsed_statement::prepared> use_statement::prepare(database& db)
+namespace raw {
+
+use_statement::use_statement(sstring keyspace)
+    : _keyspace(keyspace)
 {
-    return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
+}
+
+::shared_ptr<prepared_statement> use_statement::prepare(database& db)
+{
+    return ::make_shared<prepared>(make_shared<cql3::statements::use_statement>(_keyspace));
+}
+
 }

 bool use_statement::uses_function(const sstring& ks_name, const sstring& function_name) const
 {
-    return parsed_statement::uses_function(ks_name, function_name);
+    return false;
 }

 bool use_statement::depends_on_keyspace(const sstring& ks_name) const
--- a/cql3/statements/use_statement.hh
+++ b/cql3/statements/use_statement.hh
@@ -41,15 +41,16 @@

 #pragma once

-#include "cql3/statements/parsed_statement.hh"
 #include "transport/messages_fwd.hh"
 #include "cql3/cql_statement.hh"
+#include "cql3/statements/raw/parsed_statement.hh"
+#include "prepared_statement.hh"

 namespace cql3 {

 namespace statements {

-class use_statement : public parsed_statement, public cql_statement, public ::enable_shared_from_this<use_statement> {
+class use_statement : public cql_statement_no_metadata {
 private:
    const sstring _keyspace;

@@ -58,8 +59,6 @@ public:

    virtual uint32_t get_bound_terms() override;

-    virtual ::shared_ptr<prepared> prepare(database& db) override;
-
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;

    virtual bool depends_on_keyspace(const sstring& ks_name) const override;
--- a/cql3/term.hh
+++ b/cql3/term.hh
@@ -103,7 +103,7 @@ public:
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;

    virtual sstring to_string() const {
-        return sprint("term@%p", this);
+        return sprint("term@%p", static_cast<const void*>(this));
    }

    friend std::ostream& operator<<(std::ostream& out, const term& t) {
--- a/database.cc
+++ b/database.cc
--- a/database.hh
+++ b/database.hh
@@ -69,6 +69,7 @@
 #include "utils/histogram.hh"
 #include "sstables/estimated_histogram.hh"
 #include "sstables/compaction.hh"
+#include "sstables/sstable_set.hh"
 #include "key_reader.hh"
 #include <seastar/core/rwlock.hh>
 #include <seastar/core/shared_future.hh>
@@ -98,35 +99,97 @@ void make(database& db, bool durable, bool volatile_testing_only);
 }
 }

-class throttle_state {
-    size_t _max_space;
-    logalloc::region_group& _region_group;
-    throttle_state* _parent;
+class replay_position_reordered_exception : public std::exception {};

-    circular_buffer<promise<>> _throttled_requests;
-    timer<> _throttling_timer{[this] { unthrottle(); }};
-    void unthrottle();
-    bool should_throttle() const {
-        if (_region_group.memory_used() > _max_space) {
-            return true;
-        }
-        if (_parent) {
-            return _parent->should_throttle();
-        }
-        return false;
-    }
+using shared_memtable = lw_shared_ptr<memtable>;
+class memtable_list;
+
+class dirty_memory_manager: public logalloc::region_group_reclaimer {
+    // We need a separate boolean, because from the LSA point of view, pressure may still be
+    // mounting, in which case the pressure flag could be set back on if we force it off.
+    bool _db_shutdown_requested = false;
+
+    database* _db;
+    logalloc::region_group _region_group;
+
+    // We would like to serialize the flushing of memtables. While flushing many memtables
+    // simultaneously can sustain high levels of throughput, the memory is not freed until the
+    // memtable is totally gone. That means that if we have throttled requests, they will stay
+    // throttled for a long time. Even when we have virtual dirty, that only provides a rough
+    // estimate, and we can't release requests that early.
+    //
+    // Ideally, we'd allow one memtable flush per shard (or per database object), and write-behind
+    // would take care of the rest. But that still has issues, so we'll limit parallelism to some
+    // number (4), that we will hopefully reduce to 1 when write behind works.
+    //
+    // When streaming is going on, we'll separate half of that for the streaming code, which
+    // effectively increases the total to 6. That is a bit ugly and a bit redundant with the I/O
+    // Scheduler, but it's the easiest way not to hurt the common case (no streaming) and will have
+    // to do for the moment. Hopefully we can set both to 1 soon (with write behind)
+    //
+    // FIXME: enable write behind and set both to 1. Right now we will take advantage of the fact
+    // that memtables and streaming will use different specialized classes here and set them as
+    // default values here.
+    size_t _concurrency;
+    semaphore _flush_serializer;
+
+    seastar::gate _waiting_flush_gate;
+    std::vector<shared_memtable> _pending_flushes;
+    void maybe_do_active_flush();
+protected:
+    virtual memtable_list& get_memtable_list(column_family& cf) = 0;
+    virtual void start_reclaiming() override;
 public:
-    throttle_state(size_t max_space, logalloc::region_group& region, throttle_state* parent = nullptr)
-        : _max_space(max_space)
-        , _region_group(region)
-        , _parent(parent)
-    {}
+    future<> shutdown();

-    future<> throttle();
+    dirty_memory_manager(database* db, size_t threshold, size_t concurrency)
+                                           : logalloc::region_group_reclaimer(threshold)
+                                           , _db(db)
+                                           , _region_group(*this)
+                                           , _concurrency(concurrency)
+                                           , _flush_serializer(concurrency) {}
+
+    dirty_memory_manager(database* db, dirty_memory_manager *parent, size_t threshold, size_t concurrency)
+                                                                         : logalloc::region_group_reclaimer(threshold)
+                                                                         , _db(db)
+                                                                         , _region_group(&parent->_region_group, *this)
+                                                                         , _concurrency(concurrency)
+                                                                         , _flush_serializer(concurrency) {}
+    logalloc::region_group& region_group() {
+        return _region_group;
+    }
+
+    const logalloc::region_group& region_group() const {
+        return _region_group;
+    }
+
+    template <typename Func>
+    future<> serialize_flush(Func&& func) {
+        return seastar::with_gate(_waiting_flush_gate,  [this, func] () mutable {
+            return with_semaphore(_flush_serializer, 1, func).finally([this] {
+                maybe_do_active_flush();
+            });
+        });
+    }
 };

+class streaming_dirty_memory_manager: public dirty_memory_manager {
+    virtual memtable_list& get_memtable_list(column_family& cf) override;
+public:
+    streaming_dirty_memory_manager(database& db, dirty_memory_manager *parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold, 2) {}
+};

-class replay_position_reordered_exception : public std::exception {};
+class memtable_dirty_memory_manager: public dirty_memory_manager {
+    virtual memtable_list& get_memtable_list(column_family& cf) override;
+public:
+    memtable_dirty_memory_manager(database& db, dirty_memory_manager* parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold, 4) {}
+    // This constructor will be called for the system tables (no parent). Its flushes are usually driven by us
+    // and not the user, and tend to be small in size. So we'll allow only two slots.
+    memtable_dirty_memory_manager(database& db, size_t threshold) : dirty_memory_manager(&db, threshold, 2) {}
+    memtable_dirty_memory_manager() : dirty_memory_manager(nullptr, std::numeric_limits<size_t>::max(), 4) {}
+};
+
+extern thread_local memtable_dirty_memory_manager default_dirty_memory_manager;

 // We could just add all memtables, regardless of types, to a single list, and
 // then filter them out when we read them. Here's why I have chosen not to do
@@ -147,19 +210,21 @@ class replay_position_reordered_exception : public std::exception {};
 // If we are going to have different methods, better have different instances
 // of a common class.
 class memtable_list {
-    using shared_memtable = lw_shared_ptr<memtable>;
+public:
+    enum class flush_behavior { delayed, immediate };
+private:
    std::vector<shared_memtable> _memtables;
-    std::function<future<> ()> _seal_fn;
+    std::function<future<> (flush_behavior)> _seal_fn;
    std::function<schema_ptr()> _current_schema;
    size_t _max_memtable_size;
-    logalloc::region_group* _dirty_memory_region_group;
+    dirty_memory_manager* _dirty_memory_manager;
 public:
-    memtable_list(std::function<future<> ()> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, logalloc::region_group* region_group)
+    memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, dirty_memory_manager* dirty_memory_manager)
        : _memtables({})
        , _seal_fn(seal_fn)
        , _current_schema(cs)
        , _max_memtable_size(max_memtable_size)
-        , _dirty_memory_region_group(region_group) {
+        , _dirty_memory_manager(dirty_memory_manager) {
        add_memtable();
    }

@@ -179,8 +244,8 @@ public:
        return _memtables.size();
    }

-    future<> seal_active_memtable() {
-        return _seal_fn();
+    future<> seal_active_memtable(flush_behavior behavior) {
+        return _seal_fn(behavior);
    }

    auto begin() noexcept {
@@ -215,12 +280,12 @@ public:
        if (should_flush()) {
            // FIXME: if sparse, do some in-memory compaction first
            // FIXME: maybe merge with other in-memory memtables
-            _seal_fn();
+            seal_active_memtable(flush_behavior::immediate);
        }
    }
 private:
    lw_shared_ptr<memtable> new_memtable() {
-        return make_lw_shared<memtable>(_current_schema(), _dirty_memory_region_group);
+        return make_lw_shared<memtable>(_current_schema(), &(_dirty_memory_manager->region_group()));
    }
 };

@@ -247,10 +312,12 @@ public:
        bool enable_incremental_backups = false;
        size_t max_memtable_size = 5'000'000;
        size_t max_streaming_memtable_size = 5'000'000;
-        logalloc::region_group* dirty_memory_region_group = nullptr;
-        logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
+        ::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
+        ::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
        restricted_mutation_reader_config read_concurrency_config;
+        restricted_mutation_reader_config streaming_read_concurrency_config;
        ::cf_stats* cf_stats = nullptr;
+        uint64_t max_cached_partition_size_in_bytes;
    };
    struct no_commitlog {};
    struct stats {
@@ -263,13 +330,13 @@ public:
        int64_t live_sstable_count = 0;
        /** Estimated number of compactions pending for this column family */
        int64_t pending_compactions = 0;
-        utils::ihistogram reads{256};
-        utils::ihistogram writes{256};
+        utils::timed_rate_moving_average_and_histogram reads{256};
+        utils::timed_rate_moving_average_and_histogram writes{256};
        sstables::estimated_histogram estimated_read;
        sstables::estimated_histogram estimated_write;
        sstables::estimated_histogram estimated_sstable_per_read;
-        utils::ihistogram tombstone_scanned;
-        utils::ihistogram live_scanned;
+        utils::timed_rate_moving_average_and_histogram tombstone_scanned;
+        utils::timed_rate_moving_average_and_histogram live_scanned;
    };

    struct snapshot_details {
@@ -280,6 +347,7 @@ private:
    schema_ptr _schema;
    config _config;
    stats _stats;
+
    lw_shared_ptr<memtable_list> _memtables;

    // In older incarnations, we simply commited the mutations to memtables.
@@ -300,12 +368,37 @@ private:
    // memory throttling mechanism, guaranteeing we will not overload the
    // server.
    lw_shared_ptr<memtable_list> _streaming_memtables;
+    utils::phased_barrier _streaming_flush_phaser;

+    friend class memtable_dirty_memory_manager;
+    friend class streaming_dirty_memory_manager;
+
+    // If mutations are fragmented during streaming the sstables cannot be made
+    // visible immediately after memtable flush, because that could cause
+    // readers to see only a part of a partition thus violating isolation
+    // guarantees.
+    // Mutations that are sent in fragments are kept separately in per-streaming
+    // plan memtables and the resulting sstables are not made visible until
+    // the streaming is complete.
+    struct streaming_memtable_big {
+        lw_shared_ptr<memtable_list> memtables; // per-streaming-plan memtables holding fragmented mutations
+        std::vector<sstables::shared_sstable> sstables; // resulting sstables, not made visible until streaming completes
+        seastar::gate flush_in_progress; // NOTE(review): presumably held while a flush for this plan is running — confirm
+    };
+    std::unordered_map<utils::UUID, lw_shared_ptr<streaming_memtable_big>> _streaming_memtables_big;
+
+    future<> flush_streaming_big_mutations(utils::UUID plan_id);
+    void apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m);
+    future<> seal_active_streaming_memtable_big(streaming_memtable_big& smb);
+
+    lw_shared_ptr<memtable_list> make_memory_only_memtable_list();
    lw_shared_ptr<memtable_list> make_memtable_list();
    lw_shared_ptr<memtable_list> make_streaming_memtable_list();
+    lw_shared_ptr<memtable_list> make_streaming_memtable_big_list(streaming_memtable_big& smb);

+    sstables::compaction_strategy _compaction_strategy;
    // generation -> sstable. Ordered by key so we can easily get the most recent.
-    lw_shared_ptr<sstable_list> _sstables;
+    lw_shared_ptr<sstables::sstable_set> _sstables;
    // sstables that have been compacted (so don't look up in query) but
    // have not been deleted yet, so must not GC any tombstones in other sstables
    // that may delete data in these sstables:
@@ -326,10 +419,7 @@ private:
    db::replay_position _highest_flushed_rp;
    // Provided by the database that owns this commitlog
    db::commitlog* _commitlog;
-    sstables::compaction_strategy _compaction_strategy;
    compaction_manager& _compaction_manager;
-    // Whether or not a cf is queued by its compaction manager.
-    bool _compaction_manager_queued = false;
    int _compaction_disabled = 0;
    class memtable_flush_queue;
    std::unique_ptr<memtable_flush_queue> _flush_queue;
@@ -349,7 +439,7 @@ private:
    lw_shared_ptr<memtable> new_memtable();
    lw_shared_ptr<memtable> new_streaming_memtable();
    future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
-    future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables);
+    future<> update_cache(memtable&, sstables::shared_sstable exclude_sstable);
    struct merge_comparator;

    // update the sstable generation, making sure that new new sstables don't overwrite this one.
@@ -376,11 +466,14 @@ private:
    // Caller needs to ensure that column_family remains live (FIXME: relax this).
    // The 'range' parameter must be live as long as the reader is used.
    // Mutations returned by the reader will all have given schema.
-    mutation_reader make_sstable_reader(schema_ptr schema, const query::partition_range& range, const io_priority_class& pc) const;
+    mutation_reader make_sstable_reader(schema_ptr schema,
+                                        const query::partition_range& range,
+                                        query::clustering_key_filtering_context ck_filtering,
+                                        const io_priority_class& pc) const;

    mutation_source sstables_as_mutation_source();
    key_source sstables_as_key_source() const;
-    partition_presence_checker make_partition_presence_checker(lw_shared_ptr<sstable_list> old_sstables);
+    partition_presence_checker make_partition_presence_checker(sstables::shared_sstable exclude_sstable);
    std::chrono::steady_clock::time_point _sstable_writes_disabled_at;
    void do_trigger_compaction();
 public:
@@ -414,6 +507,7 @@ public:
    // will be scheduled under the priority class given by pc.
    mutation_reader make_reader(schema_ptr schema,
            const query::partition_range& range = query::full_partition_range,
+            const query::clustering_key_filtering_context& ck_filtering = query::no_clustering_key_filtering,
            const io_priority_class& pc = default_priority_class()) const;

    mutation_source as_mutation_source() const;
@@ -434,9 +528,13 @@ public:
    }

    logalloc::occupancy_stats occupancy() const;
+private:
+    column_family(schema_ptr schema, config cfg, db::commitlog* cl, compaction_manager&);
 public:
-    column_family(schema_ptr schema, config cfg, db::commitlog& cl, compaction_manager&);
-    column_family(schema_ptr schema, config cfg, no_commitlog, compaction_manager&);
+    column_family(schema_ptr schema, config cfg, db::commitlog& cl, compaction_manager& cm)
+        : column_family(schema, std::move(cfg), &cl, cm) {}
+    column_family(schema_ptr schema, config cfg, no_commitlog, compaction_manager& cm)
+        : column_family(schema, std::move(cfg), nullptr, cm) {}
    column_family(column_family&&) = delete; // 'this' is being captured during construction
    ~column_family();
    const schema_ptr& schema() const { return _schema; }
@@ -449,7 +547,7 @@ public:
    // The mutation is always upgraded to current schema.
    void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
    void apply(const mutation& m, const db::replay_position& = db::replay_position());
-    void apply_streaming_mutation(schema_ptr, const frozen_mutation&);
+    void apply_streaming_mutation(schema_ptr, utils::UUID plan_id, const frozen_mutation&, bool fragmented);

    // Returns at most "cmd.limit" rows
    future<lw_shared_ptr<query::result>> query(schema_ptr,
@@ -462,7 +560,8 @@ public:
    future<> stop();
    future<> flush();
    future<> flush(const db::replay_position&);
-    future<> flush_streaming_mutations(std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
+    future<> flush_streaming_mutations(utils::UUID plan_id, std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
+    future<> fail_streaming_mutations(utils::UUID plan_id);
    future<> clear(); // discards memtable(s) without flushing them to disk.
    future<db::replay_position> discard_sstables(db_clock::time_point);

@@ -473,10 +572,14 @@ public:
    future<int64_t> disable_sstable_write() {
        _sstable_writes_disabled_at = std::chrono::steady_clock::now();
        return _sstables_lock.write_lock().then([this] {
-            if (_sstables->empty()) {
+            if (_sstables->all()->empty()) {
                return make_ready_future<int64_t>(0);
            }
-            return make_ready_future<int64_t>((*_sstables->rbegin()).first);
+            int64_t max = 0;
+            for (auto&& s : *_sstables->all()) {
+                max = std::max(max, s->generation());
+            }
+            return make_ready_future<int64_t>(max);
        });
    }

@@ -490,6 +593,17 @@ public:
        return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
    }

+    // This function will iterate through upload directory in column family,
+    // and will do the following for each sstable found:
+    // 1) Mutate sstable level to 0.
+    // 2) Create hard links to its components in column family dir.
+    // 3) Remove all of its components in upload directory.
+    // At the end, it's expected that upload dir is empty and all of its
+    // previous content was moved to column family dir.
+    //
+    // Returns a vector containing the descriptors of the sstables to be loaded.
+    future<std::vector<sstables::entry_descriptor>> flush_upload_dir();
+
    // Make sure the generation numbers are sequential, starting from "start".
    // Generations before "start" are left untouched.
    //
@@ -536,9 +650,11 @@ public:
        _config.enable_incremental_backups = val;
    }

-    lw_shared_ptr<sstable_list> get_sstables();
-    lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted();
-    size_t sstables_count();
+    lw_shared_ptr<sstable_list> get_sstables() const;
+    lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted() const;
+    std::vector<sstables::shared_sstable> select_sstables(const query::partition_range& range) const;
+    size_t sstables_count() const;
+    std::vector<uint64_t> sstable_count_per_level() const;
    int64_t get_unleveled_sstables() const;

    void start_compaction();
@@ -553,10 +669,6 @@ public:
        return _compaction_strategy;
    }

-    bool compaction_manager_queued() const;
-    void set_compaction_manager_queued(bool compaction_manager_queued);
-    bool pending_compactions() const;
-
    const stats& get_stats() const {
        return _stats;
    }
@@ -569,9 +681,7 @@ public:
    Result run_with_compaction_disabled(Func && func) {
        ++_compaction_disabled;
        return _compaction_manager.remove(this).then(std::forward<Func>(func)).finally([this] {
-            // #934. The pending counter is actually a great indicator into whether we
-            // actually need to trigger a compaction again.
-            if (--_compaction_disabled == 0 && _stats.pending_compactions > 0) {
+            if (--_compaction_disabled == 0) {
                // we're turning if on again, use function that does not increment
                // the counter further.
                do_trigger_compaction();
@@ -587,7 +697,7 @@ private:
    // But it is possible to synchronously wait for the seal to complete by
    // waiting on this future. This is useful in situations where we want to
    // synchronously flush data to disk.
-    future<> seal_active_memtable();
+    future<> seal_active_memtable(memtable_list::flush_behavior behavior = memtable_list::flush_behavior::delayed);

    // I am assuming here that the repair process will potentially send ranges containing
    // few mutations, definitely not enough to fill a memtable. It wants to know whether or
@@ -610,9 +720,19 @@ private:
    // repair can now choose whatever strategy - small or big ranges - it wants, resting assure
    // that the incoming memtables will be coalesced together.
    shared_promise<> _waiting_streaming_flushes;
-    timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable(); }};
-    future<> seal_active_streaming_memtable();
+    timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable_immediate(); }};
    future<> seal_active_streaming_memtable_delayed();
+    future<> seal_active_streaming_memtable_immediate();
+    // Seals the active streaming memtable using the delayed or immediate path selected by `behavior`.
+    future<> seal_active_streaming_memtable(memtable_list::flush_behavior behavior) {
+        if (behavior == memtable_list::flush_behavior::delayed) {
+            return seal_active_streaming_memtable_delayed();
+        }
+        // flush_behavior has only two enumerators, so anything else is a bug. Asserting and then
+        // returning keeps every path returning a future even when assert() is compiled out.
+        assert(behavior == memtable_list::flush_behavior::immediate);
+        return seal_active_streaming_memtable_immediate();
+    }

    // filter manifest.json files out
    static bool manifest_json_filter(const sstring& fname);
@@ -707,8 +827,8 @@ public:
    const lw_shared_ptr<user_types_metadata>& user_types() const {
        return _user_types;
    }
-    void add_column_family(const schema_ptr& s) {
-        _cf_meta_data.emplace(s->cf_name(), s);
+    void add_or_update_column_family(const schema_ptr& s) {
+        _cf_meta_data[s->cf_name()] = s;
    }
    void remove_column_family(const schema_ptr& s) {
        _cf_meta_data.erase(s->cf_name());
@@ -733,9 +853,10 @@ public:
        bool enable_incremental_backups = false;
        size_t max_memtable_size = 5'000'000;
        size_t max_streaming_memtable_size = 5'000'000;
-        logalloc::region_group* dirty_memory_region_group = nullptr;
-        logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
+        ::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
+        ::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
        restricted_mutation_reader_config read_concurrency_config;
+        restricted_mutation_reader_config streaming_read_concurrency_config;
        ::cf_stats* cf_stats = nullptr;
    };
 private:
@@ -747,16 +868,30 @@ public:
        : _metadata(std::move(metadata))
        , _config(std::move(cfg))
    {}
-    const lw_shared_ptr<keyspace_metadata>& metadata() const {
+
+    void update_from(lw_shared_ptr<keyspace_metadata>);
+
+    /** Note: returned by shared pointer value, since the metadata is
+     * semi-volatile. I.e. an ALTER KEYSPACE could happen at any time, and
+     * boom, it is replaced.
+     */
+    lw_shared_ptr<keyspace_metadata> metadata() const {
        return _metadata;
    }
    void create_replication_strategy(const std::map<sstring, sstring>& options);
+    /**
+     * This should not really be returned by reference, since the replication
+     * strategy is also volatile in that it could be replaced at "any" time.
+     * However, all current uses at least are "instantaneous", i.e. they do not
+     * carry it across a continuation. So it is sort of safe for now, but
+     * should eventually be refactored.
+     */
    locator::abstract_replication_strategy& get_replication_strategy();
    const locator::abstract_replication_strategy& get_replication_strategy() const;
-    column_family::config make_column_family_config(const schema& s) const;
+    column_family::config make_column_family_config(const schema& s, const db::config& db_config) const;
    future<> make_directory_for_column_family(const sstring& name, utils::UUID uuid);
-    void add_column_family(const schema_ptr& s) {
-        _metadata->add_column_family(s);
+    void add_or_update_column_family(const schema_ptr& s) {
+        _metadata->add_or_update_column_family(s);
    }
    void add_user_type(const user_type ut) {
        _metadata->add_user_type(ut);
@@ -779,7 +914,7 @@ public:
    const sstring& datadir() const {
        return _config.datadir;
    }
-private:
+
    sstring column_family_directory(const sstring& name, utils::UUID uuid) const;
 };

@@ -811,9 +946,12 @@ class database {

    lw_shared_ptr<db_stats> _stats;

-    logalloc::region_group _dirty_memory_region_group;
-    logalloc::region_group _streaming_dirty_memory_region_group;
-
+    std::unique_ptr<db::config> _cfg;
+    size_t _memtable_total_space = 500 << 20;
+    size_t _streaming_memtable_total_space = 500 << 20;
+    memtable_dirty_memory_manager _system_dirty_memory_manager;
+    memtable_dirty_memory_manager _dirty_memory_manager;
+    streaming_dirty_memory_manager _streaming_dirty_memory_manager;
    semaphore _read_concurrency_sem{max_concurrent_reads()};
    restricted_mutation_reader_config _read_concurrency_config;
    semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
@@ -823,9 +961,6 @@ class database {
    std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
    std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
    std::unique_ptr<db::commitlog> _commitlog;
-    std::unique_ptr<db::config> _cfg;
-    size_t _memtable_total_space = 500 << 20;
-    size_t _streaming_memtable_total_space = 500 << 20;
    utils::UUID _version;
    // compaction_manager object is referenced by all column families of a database.
    compaction_manager _compaction_manager;
@@ -833,7 +968,7 @@ class database {
    bool _enable_incremental_backups = false;

    future<> init_commitlog();
-    future<> apply_in_memory(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position&);
+    future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position);
    future<> populate(sstring datadir);
    future<> populate_keyspace(sstring datadir, sstring ks_name);

@@ -845,9 +980,6 @@ private:
    friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
    void setup_collectd();

-    throttle_state _memtables_throttler;
-    throttle_state _streaming_throttler;
-
    future<> do_apply(schema_ptr, const frozen_mutation&);
 public:
    static utils::UUID empty_version;
@@ -894,7 +1026,7 @@ public:
    keyspace& find_keyspace(const sstring& name);
    const keyspace& find_keyspace(const sstring& name) const;
    bool has_keyspace(const sstring& name) const;
-    void update_keyspace(const sstring& name);
+    future<> update_keyspace(const sstring& name);
    void drop_keyspace(const sstring& name);
    const auto& keyspaces() const { return _keyspaces; }
    std::vector<sstring> get_non_system_keyspaces() const;
@@ -916,7 +1048,7 @@ public:
    future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges);
    future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range);
    future<> apply(schema_ptr, const frozen_mutation&);
-    future<> apply_streaming_mutation(schema_ptr, const frozen_mutation&);
+    future<> apply_streaming_mutation(schema_ptr, utils::UUID plan_id, const frozen_mutation&, bool fragmented);
    keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
    const sstring& get_snitch_name() const;
    future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
@@ -938,6 +1070,8 @@ public:
        return _column_families;
    }

+    std::vector<lw_shared_ptr<column_family>> get_non_system_column_families() const;
+
    const std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash>&
    get_column_families_mapping() const {
        return _ks_cf_to_uuid;
@@ -960,7 +1094,7 @@ public:
    future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func);

    const logalloc::region_group& dirty_memory_region_group() const {
-        return _dirty_memory_region_group;
+        return _dirty_memory_manager.region_group();
    }

    std::unordered_set<sstring> get_initial_tokens();
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -257,9 +257,26 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
            // FIXME: verify that the above is reasonably true.
            return limiter->reserve(size).then([this, mutations = std::move(mutations), id] {
                _stats.write_attempts += mutations.size();
-                return _qp.proxy().local().mutate(mutations, db::consistency_level::ANY);
+                // #1222 - change cl level to ALL, emulating origin's behaviour of sending/hinting
+                // to all natural end points.
+                // Note however that origin uses hints here, and actually allows for this
+                // send to partially or wholly fail in actually sending stuff. Since we don't
+                // have hints (yet), send with CL=ALL, and hope we can re-do this soon.
+                // See below, we use retry on write failure.
+                return _qp.proxy().local().mutate(mutations, db::consistency_level::ALL, nullptr);
            });
-        }).then([this, id] {
+        }).then_wrapped([this, id](future<> batch_result) {
+            try {
+                batch_result.get();
+            } catch (no_such_keyspace& ex) {
+                // should probably ignore and drop the batch
+            } catch (...) {
+                // timeout, overload etc.
+                // Do _not_ remove the batch, assuming we got a node write error.
+                // Since we don't have hints (which origin is satisfied with),
+                // we have to resort to keeping this batch until the next lap.
+                return make_ready_future<>();
+            }
            // delete batch
            auto schema = _qp.db().local().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
            auto key = partition_key::from_singular(*schema, id);
--- a/db/commitlog/commitlog.cc
+++ b/db/commitlog/commitlog.cc
@@ -63,6 +63,7 @@
 #include "utils/data_input.hh"
 #include "utils/crc.hh"
 #include "utils/runtime.hh"
+#include "utils/flush_queue.hh"
 #include "log.hh"
 #include "commitlog_entry.hh"
 #include "service/priority_manager.hh"
@@ -194,7 +195,6 @@ public:
    stats totals;

    future<> begin_write() {
-        _gate.enter();
        ++totals.pending_writes; // redundant, given semaphore. but easier to read
        if (totals.pending_writes >= cfg.max_active_writes) {
            ++totals.write_limit_exceeded;
@@ -205,11 +205,9 @@ public:
    void end_write() {
        _write_semaphore.signal();
        --totals.pending_writes;
-        _gate.leave();
    }

    future<> begin_flush() {
-        _gate.enter();
        ++totals.pending_flushes;
        if (totals.pending_flushes >= cfg.max_active_flushes) {
            ++totals.flush_limit_exceeded;
@@ -220,11 +218,10 @@ public:
    void end_flush() {
        _flush_semaphore.signal();
        --totals.pending_flushes;
-        _gate.leave();
    }

    bool should_wait_for_write() const {
-        return _write_semaphore.waiters() > 0 || _flush_semaphore.waiters() > 0;
+        return cfg.mode == sync_mode::BATCH || _write_semaphore.waiters() > 0 || _flush_semaphore.waiters() > 0;
    }

    segment_manager(config c)
@@ -276,7 +273,7 @@ public:
    future<sseg_ptr> allocate_segment(bool active);

    future<> clear();
-    future<> sync_all_segments();
+    future<> sync_all_segments(bool shutdown = false);
    future<> shutdown();

    scollectd::registrations create_counters();
@@ -391,18 +388,21 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
    uint64_t _buf_pos = 0;
    bool _closed = false;

+    size_t _needed_size = 0;
+
    using buffer_type = segment_manager::buffer_type;
    using sseg_ptr = segment_manager::sseg_ptr;
    using clock_type = segment_manager::clock_type;
    using time_point = segment_manager::time_point;

    buffer_type _buffer;
-    rwlock _dwrite; // used as a barrier between write & flush
    std::unordered_map<cf_id_type, position_type> _cf_dirty;
    time_point _sync_time;
    seastar::gate _gate;
    uint64_t _write_waiters = 0;
-    semaphore _queue;
+    utils::flush_queue<replay_position> _pending_ops;
+
+    uint64_t _num_allocs = 0;

    std::unordered_set<table_schema_version> _known_schema_versions;

@@ -413,9 +413,7 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
        // This is maintaining the semantica of only using the write-lock
        // as a gate for flushing, i.e. once we've begun a flush for position X
        // we are ok with writes to positions > X
-        return _segment_manager->begin_flush().then(std::bind(&rwlock::write_lock, &_dwrite)).finally([this] {
-            _dwrite.write_unlock();
-        });
+        return _segment_manager->begin_flush();
    }

    void end_flush() {
@@ -426,11 +424,10 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
        // This is maintaining the semantica of only using the write-lock
        // as a gate for flushing, i.e. once we've begun a flush for position X
        // we are ok with writes to positions > X
-        return _segment_manager->begin_write().then(std::bind(&rwlock::read_lock, &_dwrite));
+        return _segment_manager->begin_write();
    }

    void end_write() {
-        _dwrite.read_unlock();
        _segment_manager->end_write();
    }

@@ -456,7 +453,7 @@ public:
    segment(::shared_ptr<segment_manager> m, const descriptor& d, file && f, bool active)
            : _segment_manager(std::move(m)), _desc(std::move(d)), _file(std::move(f)),
        _file_name(_segment_manager->cfg.commit_log_location + "/" + _desc.filename()), _sync_time(
-                    clock_type::now()), _queue(0)
+                    clock_type::now())
    {
        ++_segment_manager->totals.segments_created;
        logger.debug("Created new {} segment {}", active ? "active" : "reserve", *this);
@@ -513,21 +510,30 @@ public:
        _sync_time = clock_type::now();
    }
    // See class comment for info
-    future<sseg_ptr> sync() {
+    future<sseg_ptr> sync(bool shutdown = false) {
+        /**
+         * If we are shutting down, we first
+         * close the allocation gate, thus no new
+         * data can be appended. Then we just issue a
+         * flush, which will wait for any queued ops
+         * to complete as well. Then we close the ops
+         * queue, just to be sure.
+         */
+        if (shutdown) {
+            auto me = shared_from_this();
+            return _gate.close().then([me] {
+                return me->sync().finally([me] {
+                    // When we get here, nothing should add ops,
+                    // and we should have waited out all pending.
+                    return me->_pending_ops.close();
+                });
+            });
+        }
+
        // Note: this is not a marker for when sync was finished.
        // It is when it was initiated
        reset_sync_time();
-
-        if (position() <= _flush_pos) {
-            logger.trace("Sync not needed {}: ({} / {})", *this, position(), _flush_pos);
-            return make_ready_future<sseg_ptr>(shared_from_this());
-        }
-        return cycle().then([](sseg_ptr seg) {
-            return seg->flush();
-        });
-    }
-    future<> shutdown() {
-        return _gate.close();
+        return cycle(true);
    }
    // See class comment for info
    future<sseg_ptr> flush(uint64_t pos = 0) {
@@ -536,46 +542,56 @@ public:
        if (pos == 0) {
            pos = _file_pos;
        }
-        if (pos != 0 && pos <= _flush_pos) {
-            logger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
-            return make_ready_future<sseg_ptr>(std::move(me));
-        }
-        logger.trace("Syncing {} {} -> {}", *this, _flush_pos, pos);
-        // Make sure all disk writes are done.
-        // This is not 100% neccesary, we really only need the ones below our flush pos,
-        // but since we pretty much assume that task ordering will make this the case anyway...

-        return begin_flush().then(
-                [this, me, pos]() mutable {
-                    pos = std::max(pos, _file_pos);
-                    if (pos <= _flush_pos) {
-                        logger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
-                        return make_ready_future<sseg_ptr>(std::move(me));
-                    }
-                    return _file.flush().then_wrapped([this, pos, me](future<> f) {
-                                try {
-                                    f.get();
-                                    // TODO: retry/ignore/fail/stop - optional behaviour in origin.
-                                    // we fast-fail the whole commit.
-                                    _flush_pos = std::max(pos, _flush_pos);
-                                    ++_segment_manager->totals.flush_count;
-                                    logger.trace("{} synced to {}", *this, _flush_pos);
-                                    return make_ready_future<sseg_ptr>(std::move(me));
-                                } catch (...) {
-                                    logger.error("Failed to flush commits to disk: {}", std::current_exception());
-                                    throw;
-                                }
-                            });
-        }).finally([this, me] {
-            end_flush();
+        logger.trace("Syncing {} {} -> {}", *this, _flush_pos, pos);
+
+        // Only run the flush when all write ops at lower rp:s
+        // have completed.
+        replay_position rp(_desc.id, position_type(pos));
+
+        // Run like this to ensure flush ordering, and making flushes "waitable"
+        return _pending_ops.run_with_ordered_post_op(rp, [] { return make_ready_future<>(); }, [this, pos, me, rp] {
+            assert(_pending_ops.has_operation(rp));
+            return do_flush(pos);
        });
    }
+
+    future<sseg_ptr> do_flush(uint64_t pos) {
+        auto me = shared_from_this();
+        return begin_flush().then([this, pos]() {
+            if (pos <= _flush_pos) {
+                logger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
+                return make_ready_future<>();
+            }
+            return _file.flush().then_wrapped([this, pos](future<> f) {
+                try {
+                    f.get();
+                    // TODO: retry/ignore/fail/stop - optional behaviour in origin.
+                    // we fast-fail the whole commit.
+                    _flush_pos = std::max(pos, _flush_pos);
+                    ++_segment_manager->totals.flush_count;
+                    logger.trace("{} synced to {}", *this, _flush_pos);
+                } catch (...) {
+                    logger.error("Failed to flush commits to disk: {}", std::current_exception());
+                    throw;
+                }
+            });
+        }).finally([this] {
+            end_flush();
+        }).then([me] {
+            return make_ready_future<sseg_ptr>(me);
+        });
+    }
+
    /**
     * Allocate a new buffer
     */
    void new_buffer(size_t s) {
        assert(_buffer.empty());

+        s += _needed_size;
+        _needed_size = 0;
+
        auto overhead = segment_overhead_size;
        if (_file_pos == 0) {
            overhead += descriptor_header_size;
@@ -604,25 +620,32 @@ public:
        _segment_manager->totals.total_size += k;
    }

+    bool buffer_is_empty() const {
+        return _buf_pos <= segment_overhead_size
+                        || (_file_pos == 0 && _buf_pos <= (segment_overhead_size + descriptor_header_size));
+    }
    /**
     * Send any buffer contents to disk and get a new tmp buffer
     */
    // See class comment for info
-    future<sseg_ptr> cycle() {
+    future<sseg_ptr> cycle(bool flush_after = false) {
+        if (_buffer.empty()) {
+            return flush_after ? flush() : make_ready_future<sseg_ptr>(shared_from_this());
+        }
+
        auto size = clear_buffer_slack();
        auto buf = std::move(_buffer);
        auto off = _file_pos;
+        auto top = off + size;
+        auto num = _num_allocs;

-        _file_pos += size;
+        _file_pos = top;
        _buf_pos = 0;
+        _num_allocs = 0;

        auto me = shared_from_this();
        assert(!me.owned());

-        if (size == 0) {
-            return make_ready_future<sseg_ptr>(std::move(me));
-        }
-
        auto * p = buf.get_write();
        assert(std::count(p, p + 2 * sizeof(uint32_t), 0) == 2 * sizeof(uint32_t));

@@ -654,41 +677,50 @@ public:

        forget_schema_versions();

-        // acquire read lock
-        return begin_write().then([this, size, off, buf = std::move(buf)]() mutable {
-            auto written = make_lw_shared<size_t>(0);
-            auto p = buf.get();
-            return repeat([this, size, off, written, p]() mutable {
-                auto&& priority_class = service::get_local_commitlog_priority();
-                return _file.dma_write(off + *written, p + *written, size - *written, priority_class).then_wrapped([this, size, written](future<size_t>&& f) {
-                    try {
-                        auto bytes = std::get<0>(f.get());
-                        *written += bytes;
-                        _segment_manager->totals.bytes_written += bytes;
-                        _segment_manager->totals.total_size_on_disk += bytes;
-                        ++_segment_manager->totals.cycle_count;
-                        if (*written == size) {
-                            return make_ready_future<stop_iteration>(stop_iteration::yes);
+        replay_position rp(_desc.id, position_type(off));
+
+        logger.trace("Writing {} entries, {} k in {} -> {}", num, size, off, off + size);
+
+        // The write will be allowed to start now, but flush (below) must wait for not only this,
+        // but all previous write/flush pairs.
+        return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
+            // This could "block", if we have too many pending writes.
+            return begin_write().then([this, size, off, buf = std::move(buf)]() mutable {
+                auto written = make_lw_shared<size_t>(0);
+                auto p = buf.get();
+                return repeat([this, size, off, written, p]() mutable {
+                    auto&& priority_class = service::get_local_commitlog_priority();
+                    return _file.dma_write(off + *written, p + *written, size - *written, priority_class).then_wrapped([this, size, written](future<size_t>&& f) {
+                        try {
+                            auto bytes = std::get<0>(f.get());
+                            *written += bytes;
+                            _segment_manager->totals.bytes_written += bytes;
+                            _segment_manager->totals.total_size_on_disk += bytes;
+                            ++_segment_manager->totals.cycle_count;
+                            if (*written == size) {
+                                return make_ready_future<stop_iteration>(stop_iteration::yes);
+                            }
+                            // gah, partial write. should always get here with dma chunk sized
+                            // "bytes", but let's make sure...
+                            logger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
+                            *written = align_down(*written, alignment);
+                            return make_ready_future<stop_iteration>(stop_iteration::no);
+                            // TODO: retry/ignore/fail/stop - optional behaviour in origin.
+                            // we fast-fail the whole commit.
+                        } catch (...) {
+                            logger.error("Failed to persist commits to disk for {}: {}", *this, std::current_exception());
+                            throw;
                        }
-                        // gah, partial write. should always get here with dma chunk sized
-                        // "bytes", but lets make sure...
-                        logger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
-                        *written = align_down(*written, alignment);
-                        return make_ready_future<stop_iteration>(stop_iteration::no);
-                        // TODO: retry/ignore/fail/stop - optional behaviour in origin.
-                        // we fast-fail the whole commit.
-                    } catch (...) {
-                        logger.error("Failed to persist commits to disk for {}: {}", *this, std::current_exception());
-                        throw;
-                    }
+                    });
+                }).finally([this, buf = std::move(buf)]() mutable {
+                    _segment_manager->release_buffer(std::move(buf));
                });
-            }).finally([this, buf = std::move(buf)]() mutable {
-                _segment_manager->release_buffer(std::move(buf));
+            }).finally([this]() {
+                end_write(); // release
            });
-        }).then([me] {
-            return make_ready_future<sseg_ptr>(std::move(me));
-        }).finally([me, this]() {
-            end_write(); // release
+        }, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
+            assert(me->_pending_ops.has_operation(rp));
+            return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
        });
    }

@@ -697,9 +729,7 @@ public:
            ++_write_waiters;
            logger.trace("Too many pending writes. Must wait.");
            return f.finally([this] {
-                if (--_write_waiters == 0) {
-                    _queue.signal(_queue.waiters());
-                }
+                --_write_waiters;
            });
        }
        return make_ready_future<sseg_ptr>(shared_from_this());
@@ -719,19 +749,52 @@ public:
     * buffer memory usage might grow...
     */
    bool must_wait_for_alloc() {
-        return _write_waiters > 0;
+        // Note: write_waiters is decremented _after_ both semaphores and
+        // flush queue might be cleared. So we should not look only at it.
+        // But we still don't want to look at "should_wait_for_write" directly,
+        // since that is "global" and includes other segments, and we want to
+        // know if _this_ segment has blocking write ops pending.
+        // So we also check that the flush queue is non-empty.
+        return _write_waiters > 0 && !_pending_ops.empty();
    }

    future<sseg_ptr> wait_for_alloc() {
        auto me = shared_from_this();
        ++_segment_manager->totals.pending_allocations;
        logger.trace("Previous allocation is blocking. Must wait.");
-        return _queue.wait().then([me] { // TODO: do we need a finally?
+        return _pending_ops.wait_for_pending().then([me] { // TODO: do we need a finally?
            --me->_segment_manager->totals.pending_allocations;
            return make_ready_future<sseg_ptr>(me);
        });
    }

+    future<sseg_ptr> batch_cycle() {
+        /**
+         * For batch mode we force a write "immediately".
+         * However, we first wait for all previous writes/flushes
+         * to complete.
+         *
+         * This has the benefit of allowing several allocations to
+         * queue up in a single buffer.
+         */
+        auto me = shared_from_this();
+        auto fp = _file_pos;
+        return _pending_ops.wait_for_pending().then([me = std::move(me), fp] {
+            if (fp != me->_file_pos) {
+                // some other request already wrote this buffer.
+                // If so, wait for the operation at our intended file offset
+                // to finish, then we know the flush is complete and we
+                // are in accord.
+                // (Note: wait_for_pending(pos) waits for operation _at_ pos (and before).)
+                replay_position rp(me->_desc.id, position_type(fp));
+                return me->_pending_ops.wait_for_pending(rp).then([me, fp] {
+                    assert(me->_flush_pos > fp);
+                    return make_ready_future<sseg_ptr>(me);
+                });
+            }
+            return me->sync();
+        });
+    }
    /**
     * Add a "mutation" to the segment.
     */
@@ -758,7 +821,16 @@ public:
        } else if (_buffer.empty()) {
            new_buffer(s);
        } else if (s > (_buffer.size() - _buf_pos)) { // enough data?
-            op = maybe_wait_for_write(cycle());
+            _needed_size += s; // hint to next new_buffer, in case we are not first.
+            if (_segment_manager->cfg.mode == sync_mode::BATCH) {
+                // TODO: this could cause starvation if we're really unlucky.
+                // If we run batch mode and find ourselves not fit in a non-empty
+                // buffer, we must force a cycle and wait for it (to keep flush order)
+                // This will most likely cause parallel writes, and consecutive flushes.
+                op = cycle(true);
+            } else {
+                op = maybe_wait_for_write(cycle());
+            }
        }

        if (op) {
@@ -793,11 +865,12 @@ public:
        out.write(crc.checksum());

        ++_segment_manager->totals.allocation_count;
+        ++_num_allocs;

        _gate.leave();

        if (_segment_manager->cfg.mode == sync_mode::BATCH) {
-            return sync().then([rp](sseg_ptr) {
+            return batch_cycle().then([rp](auto s) {
                return make_ready_future<replay_position>(rp);
            });
        }
@@ -1123,18 +1196,22 @@ void db::commitlog::segment_manager::discard_completed_segments(
    discard_unused_segments();
 }

-std::ostream& db::operator<<(std::ostream& out, const db::commitlog::segment& s) {
+namespace db {
+
+std::ostream& operator<<(std::ostream& out, const db::commitlog::segment& s) {
    return out << s._desc.filename();
 }

-std::ostream& db::operator<<(std::ostream& out, const db::commitlog::segment::cf_mark& m) {
+std::ostream& operator<<(std::ostream& out, const db::commitlog::segment::cf_mark& m) {
    return out << (m.s._cf_dirty | boost::adaptors::map_keys);
 }

-std::ostream& db::operator<<(std::ostream& out, const db::replay_position& p) {
+std::ostream& operator<<(std::ostream& out, const db::replay_position& p) {
    return out << "{" << p.shard_id() << ", " << p.base_id() << ", " << p.pos << "}";
 }

+}
+
 void db::commitlog::segment_manager::discard_unused_segments() {
    logger.trace("Checking for unused segments ({} active)", _segments.size());

@@ -1157,10 +1234,10 @@ void db::commitlog::segment_manager::discard_unused_segments() {
    }
 }

-future<> db::commitlog::segment_manager::sync_all_segments() {
+future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
    logger.debug("Issuing sync for all segments");
-    return parallel_for_each(_segments, [this](sseg_ptr s) {
-        return s->sync().then([](sseg_ptr s) {
+    return parallel_for_each(_segments, [this, shutdown](sseg_ptr s) {
+        return s->sync(shutdown).then([](sseg_ptr s) {
            logger.debug("Synced segment {}", *s);
        });
    });
@@ -1170,11 +1247,9 @@ future<> db::commitlog::segment_manager::shutdown() {
    if (!_shutdown) {
        _shutdown = true; // no re-arm, no create new segments.
        _timer.cancel(); // no more timer calls
-        return parallel_for_each(_segments, [this](sseg_ptr s) {
-            return s->shutdown(); // close each segment (no more alloc)
-        }).then(std::bind(&segment_manager::sync_all_segments, this)).then([this] { // flush all
-            return _gate.close(); // wait for any pending ops
-        });
+        // Now first wait for periodic task to finish, then sync and close all
+        // segments, flushing out any remaining data.
+        return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
    }
    return make_ready_future<>();
 }
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -114,22 +114,22 @@ future<> db::commitlog_replayer::impl::init() {
                auto& pp = _rpm[p1.first][p2.first];
                pp = std::max(pp, p2.second);

-                auto& min = _min_pos[p1.first];
-                min = (min == replay_position()) ? p2.second : std::min(p2.second, min);
+                auto i = _min_pos.find(p1.first);
+                if (i == _min_pos.end() || p2.second < i->second) {
+                    _min_pos[p1.first] = p2.second;
+                }
            }
        }
    }, [this](cql3::query_processor& qp) {
        return do_with(shard_rpm_map{}, [this, &qp](shard_rpm_map& map) {
            return parallel_for_each(qp.db().local().get_column_families(), [&map, &qp](auto& cfp) {
                auto uuid = cfp.first;
-                for (auto& sst : *cfp.second->get_sstables() | boost::adaptors::map_values) {
+                for (auto& sst : *cfp.second->get_sstables()) {
                    try {
                        auto p = sst->get_stats_metadata().position;
                        logger.trace("sstable {} -> rp {}", sst->get_filename(), p);
-                        if (p != replay_position()) {
-                            auto& pp = map[p.shard_id()][uuid];
-                            pp = std::max(pp, p);
-                        }
+                        auto& pp = map[p.shard_id()][uuid];
+                        pp = std::max(pp, p);
                    } catch (...) {
                        logger.warn("Could not read sstable metadata {}", std::current_exception());
                    }
@@ -151,6 +151,25 @@ future<> db::commitlog_replayer::impl::init() {
            });
        });
    }).finally([this] {
+        // bugfix: the above map-reduce will _not_ detect if sstables
+        // are _missing_ from a CF. And because of re-sharding, we can't
+        // just insert initial zeros into the maps, because we don't know
+        // how many shards there were last time.
+        // However, this only affects global min pos, since
+        // for each CF, the worst that happens is that we have a missing
+        // entry -> empty replay_pos == min value. But calculating
+        // global min pos will be off, since we will only base it on
+        // existing sstables-per-shard.
+        // So, go through all CF:s and check, if a shard mapping does not
+        // have data for it, assume we must set global pos to zero.
+        for (auto&p : _qp.local().db().local().get_column_families()) {
+            for (auto&p1 : _rpm) { // for each shard
+                if (!p1.second.count(p.first)) {
+                    _min_pos[p1.first] = replay_position();
+                }
+            }
+        }
+
        for (auto&p : _min_pos) {
            logger.debug("minimum position for shard {}: {}", p.first, p.second);
        }
--- a/db/config.cc
+++ b/db/config.cc
@@ -47,27 +47,28 @@ db::config::config()

 namespace bpo = boost::program_options;

+namespace db {
 // Special "validator" for boost::program_options to allow reading options
 // into an unordered_map<string, string> (we have in config.hh a bunch of
 // those). This validator allows the parameter of each option to look like
 // 'key=value'. It also allows multiple occurrences of this option to add
 // multiple entries into the map. "String" can be any time which can be
 // converted from std::string, e.g., sstring.
-template<typename String>
 static void validate(boost::any& out, const std::vector<std::string>& in,
-        std::unordered_map<String, String>*, int) {
+        db::string_map*, int) {
    if (out.empty()) {
-        out = boost::any(std::unordered_map<String, String>());
+        out = boost::any(db::string_map());
    }
-    auto* p = boost::any_cast<std::unordered_map<String, String>>(&out);
+    auto* p = boost::any_cast<db::string_map>(&out);
    for (const auto& s : in) {
        auto i = s.find_first_of('=');
        if (i == std::string::npos) {
            throw boost::program_options::invalid_option_value(s);
        }
-        (*p)[String(s.substr(0, i))] = String(s.substr(i+1));
+        (*p)[sstring(s.substr(0, i))] = sstring(s.substr(i+1));
    }
 }
+}

 namespace YAML {
 /*
@@ -114,26 +115,27 @@ struct convert<db::config::string_list> {
    }
 };

-template<typename K, typename V>
-struct convert<std::unordered_map<K, V>> {
-    static Node encode(const std::unordered_map<K, V>& rhs) {
+template<>
+struct convert<db::string_map> {
+    static Node encode(const db::string_map& rhs) {
        Node node(NodeType::Map);
        for (auto& p : rhs) {
            node.force_insert(p.first, p.second);
        }
        return node;
    }
-    static bool decode(const Node& node, std::unordered_map<K, V>& rhs) {
+    static bool decode(const Node& node, db::string_map& rhs) {
        if (!node.IsMap()) {
            return false;
        }
        rhs.clear();
        for (auto& n : node) {
-            rhs[n.first.as<K>()] = n.second.as<V>();
+            rhs[n.first.as<sstring>()] = n.second.as<sstring>();
        }
        return true;
    }
 };
+
 template<>
 struct convert<db::config::seed_provider_type> {
    static Node encode(const db::config::seed_provider_type& rhs) {
@@ -167,6 +169,7 @@ struct convert<db::config::seed_provider_type> {

 }

+namespace db {
 template<typename... Args>
 std::basic_ostream<Args...> & operator<<(std::basic_ostream<Args...> & os, const db::config::string_map & map) {
    int n = 0;
@@ -198,6 +201,7 @@ std::basic_istream<Args...> & operator>>(std::basic_istream<Args...> & is, db::c

    return is;
 }
+}

 /*
 * Helper type to do compile time exclusion of Unused/invalid options from
--- a/Show More
+++ b/Show More