Compare commits


581 Commits

Author SHA1 Message Date
Avi Kivity
bd807660e6 Update seastar submodule
* seastar 63079b7...725269f (1):
  > build: work around ragel 7 generated code bug
2017-08-09 14:22:28 +03:00
Avi Kivity
49894b2504 Update seastar submodule
* seastar 4a31f27...63079b7 (2):
  > build: export full cflags in pkgconfig file
  > build: disable -Wattributes when gcc -fvisibility=hidden bug strikes

Allows building with gcc 6.4.
2017-08-09 13:08:55 +03:00
Avi Kivity
79990908f4 Revert "sstable: close sstable_writer's file if writing of sstable fails."
This reverts commit 86cecd0d7d. Was already there as
3edcee4821.
2017-07-19 17:47:23 +03:00
Gleb Natapov
86cecd0d7d sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying file's object causes
crashes.

[tgrabiec: fixed typo]

Message-Id: <20170221144858.GG11471@scylladb.com>
(cherry picked from commit 0977f4fdf8)
2017-07-19 17:28:42 +03:00
Raphael S. Carvalho
dd7052023b tests/sstable_test: fix compaction_manager_test
After 'compaction: make major compaction go through compaction manager',
the test fails because the task is preempted in debug mode before it
reaches the instruction that increases the stat.

Backport from 8dfb5f9c33
Included single-line change in compaction_manager_test from
8b0e358d73 that fixes release mode too.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170714194946.12467-1-raphaelsc@scylladb.com>
2017-07-15 08:05:35 +03:00
Pekka Enberg
f8b81c2565 release: prepare for 1.6.6 2017-07-14 12:12:32 +03:00
Duarte Nunes
cb55afd824 storage_proxy: Preserve replica order across mutations
In storage_proxy we arrange the mutations sent by the replicas in a
vector of vectors, such that each row corresponds to a partition key
and each column contains the mutation, possibly empty, as sent by a
particular replica.

There is reconciliation-related code that assumes that all the
mutations sent by a particular replica can be found in a single
column, but that isn't guaranteed by the way we initially arrange the
mutations.

This patch fixes this and enforces the expected order.

Fixes #2531
Fixes #2593

Signed-off-by: Gleb Natapov <gleb@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170713162400.15968-1-duarte@scylladb.com>
2017-07-14 10:12:01 +02:00
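The arrangement described above can be sketched as follows; this is a hypothetical Python sketch of the idea, not Scylla's actual C++ code, and the names (arrange_by_replica, replica_order) are invented for illustration:

```python
# Arrange per-partition replica responses so that each column of the
# resulting matrix always holds mutations from the same replica.
def arrange_by_replica(replica_order, responses_per_partition):
    # responses_per_partition: one dict per partition key, mapping
    # replica -> mutation (a replica that sent nothing is simply absent).
    # Each row is a partition key; missing responses become None, so
    # reconciliation code scanning a column sees only one replica's data.
    return [[responses.get(replica) for replica in replica_order]
            for responses in responses_per_partition]

rows = arrange_by_replica(
    ["replica_a", "replica_b"],
    [{"replica_a": "m1", "replica_b": "m2"},  # partition key 1
     {"replica_b": "m3"}])                    # partition key 2: a sent nothing
```

Without a fixed replica order, a column could mix mutations from different replicas, which is the invariant the reconciliation code relies on.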
Raphael S. Carvalho
d8e86faf95 compaction: make major compaction go through compaction manager
From now on, major compaction will go through compaction manager.
Major compaction is serialized to reduce the disk space requirement.
Each column family will be running either minor or major compaction
at a given time. The only issue is the number of small sstables growing
while major compaction is running, but major compaction itself will
reduce the number of sstables considerably. If this turns out to be
an issue, we can allow minor compaction to start in parallel with major,
but not the other way around.

Fixes #1156.

NOTE: resolved a conflict because in this version the manager doesn't
stop transportation services for I/O errors.

(cherry picked from commit 3286f7aaa6)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170614211923.9955-2-raphaelsc@scylladb.com>
2017-06-15 12:42:48 +03:00
Raphael S. Carvalho
159fa22082 database: serialize sstable cleanup
We're cleaning up sstables in parallel. That means cleanup may need
almost twice the disk space used by all sstables being cleaned up,
if almost all sstables need cleanup and each one discards only an
insignificant portion of its data.
Given that cleanup is frequently issued when a node is running out of
disk space, we should serialize cleanups in every shard to decrease
the disk space requirement.

Fixes #192.

(cherry picked from commit 7deeffc953)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170614211923.9955-1-raphaelsc@scylladb.com>
2017-06-15 12:42:40 +03:00
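The disk-space argument above can be sketched with a back-of-the-envelope model; the numbers are hypothetical and this models only the rewrite overhead, not the actual compaction_manager code:

```python
# While an sstable is being rewritten during cleanup, old and new copies
# coexist on disk, so peak extra space depends on how many rewrites run
# at once.
def peak_extra_disk(sstable_sizes, serialized):
    if serialized:
        # one rewrite in flight per shard at any moment
        return max(sstable_sizes)
    # all rewrites in flight at once
    return sum(sstable_sizes)

sizes_mib = [100, 80, 120]  # hypothetical per-sstable sizes, MiB
```

With these assumed sizes, parallel cleanup transiently needs an extra 300 MiB, while serialized cleanup peaks at 120 MiB.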
Calle Wilund
8ef146d6ca commitlog_test: Fix test_commitlog_delete_when_over_disk_limit
Test should
a.) Wait for the flush semaphore
b.) Only compare segment sets between start and end, not start,
    end and in between. I.e. the test sort of assumed we started
    with < 2 (or so) segments. Not always the case (timing)

Message-Id: <1496828317-14375-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 0c598e5645)
2017-06-13 19:53:32 +03:00
Paweł Dziepak
1b9e0766b3 commitlog: avoid copying column_mapping
It is safe to copy column_mapping across shards, but that guarantee
comes at the cost of performance.

This patch makes commitlog_entry_writer use IDL generated writer to
serialise commitlog_entry so that column_mapping is not copied. This
also simplifies commitlog_entry itself.

Performance difference tested with:
perf_simple_query -c4 --write --duration 60
(medians)
          before       after      diff
write   79434.35    89247.54    +12.3%

(cherry picked from commit 374c8a56ac)

Also: Fixes #2468.
2017-06-11 15:56:48 +03:00
Paweł Dziepak
67f7932a35 idl: fix generated writers when member functions are used
When using a member name in an identifier of a generated class or
method, the idl compiler should strip the trailing '()'.

(cherry picked from commit 4df4994b71)

(part of #2468)
2017-06-11 15:56:45 +03:00
Paweł Dziepak
991d6bdb77 idl: add start_frame() overload for seastar::simple_output_stream
(cherry picked from commit 018d16d315)

(part of #2468)
2017-06-11 15:56:43 +03:00
Pekka Enberg
9699319194 release: prepare for 1.6.5 2017-06-09 07:53:34 +03:00
Raphael S. Carvalho
788b7bfb46 db: fix computation of live disk usage stat after compaction
sstable::data_size() is used by rebuild_statistics(), but it only
returns the uncompressed data size, while the function called by it
expects the actual disk space used by all components.
Boot uses add_sstable(), which correctly updates the stat with
sstable::bytes_on_disk(). That's what needs to be used by
rebuild_statistics() too.

Fixes #1592

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525210055.6391-1-raphaelsc@scylladb.com>
(cherry picked from commit 3b5ad23532)
2017-05-28 11:32:01 +03:00
Raphael S. Carvalho
535e9ff95f db: remove partial sstable created by memtable flush which failed
Partial sstable files aren't being removed after each failed attempt
to flush a memtable, which happens periodically. If the cause of the
failure is ENOSPC, the memtable flush will be retried forever, and
as a result, a column family may be left with a huge number of partial
files which will overwhelm a subsequent boot when removing temporary
TOCs. In the past, this led to OOM because removal of temporary TOCs
took place in parallel.

Fixes #2407.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525015455.23776-1-raphaelsc@scylladb.com>
(cherry picked from commit b7e1575ad4)
2017-05-25 11:50:32 +03:00
Asias He
1648301411 repair: Fix partition estimation
We estimate the number of partitions for a given range of a column family
and split the range into sub-ranges containing fewer partitions as a
checksum unit.

The estimation is wrong, because we need to count the partitions on all
the shards, instead of only counting the local shard.

Fixes #2299

Message-Id: <7876285bd26cfaf65563d6e03ec541626814118a.1493817339.git.asias@scylladb.com>
(cherry picked from commit 66e3b73b9c)
Signed-off-by: Glauber Costa <glauber@scylladb.com>

Conflicts:
	repair/repair.cc

Glauber: conflict is due to the fact that dht::token_range is still called nonwrapping_range
Message-Id: <20170522015356.4236-1-glauber@scylladb.com>
2017-05-22 08:31:43 +03:00
Pekka Enberg
43a62ab5e1 Revert "repair: Fix partition estimation"
This reverts commit 54db1b5452. Glauber writes:

 "I forgot one git ammend, as it seems. The token type had to be changed
  in two places for it to compile, and I ended up including just one. I
  will send another fixed version soon."
2017-05-22 08:31:12 +03:00
Tomasz Grabiec
42a4fbe528 row_cache: Fix undefined behavior in read_wide()
_underlying is created with _range, which is captured by
reference. But range_and_underlying_reader is moved after being
constructed by do_with(), so _range reference is invalidated.

Fixes #2377.
Message-Id: <1494492025-18091-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0351ab8bc6)
2017-05-21 19:14:23 +03:00
Gleb Natapov
38e9683753 database: remove temporary sstables sequentially
The code that removes each sstable runs in a thread. Removing
a lot of sstables in parallel may start a lot of threads, each of which
takes 128k for its stack. There is not much benefit in running the
deletion in parallel anyway, so fix it by deleting sstables sequentially.

Fixes #2384.

Message-Id: <20170517111722.GY3874@scylladb.com>
2017-05-21 17:12:19 +03:00
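The memory argument above can be modeled roughly (this is not Scylla code; it only illustrates why sequential deletion bounds peak stack memory):

```python
# Each deletion runs in a thread with a 128 KiB stack, so peak stack
# memory is proportional to how many deletions run concurrently.
STACK_KIB = 128

def peak_stack_kib(n_sstables, sequential):
    concurrent = 1 if sequential else n_sstables
    return concurrent * STACK_KIB
```

For 1000 temporary sstables, parallel removal would need ~125 MiB of stacks at once, while sequential removal needs a single 128 KiB stack.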
Asias He
54db1b5452 repair: Fix partition estimation
We estimate the number of partitions for a given range of a column family
and split the range into sub-ranges containing fewer partitions as a
checksum unit.

The estimation is wrong, because we need to count the partitions on all
the shards, instead of only counting the local shard.

Fixes #2299

Message-Id: <7876285bd26cfaf65563d6e03ec541626814118a.1493817339.git.asias@scylladb.com>
(cherry picked from commit 66e3b73b9c)
Signed-off-by: Glauber Costa <glauber@scylladb.com>

Conflicts:
	repair/repair.cc

Glauber: conflict is due to the fact that dht::token_range is still called nonwrapping_range
2017-05-21 17:04:09 +03:00
Tomasz Grabiec
f8f1d1d300 Update seastar submodule head
* dist/ami/files/scylla-ami d5a4397...407e8f3 (4):
  > scylla_create_devices: check block device is exists
  > Rewrite disk discovery to handle EBS and NVMEs.
  > add --developer-mode option
  > trivial cleanup: replace tab in indent

* seastar 5810d50...4a31f27 (6):
  > tls: make shutdown/close do "clean" handshake shutdown in background
  > tls: Make sink/source (i.e. streams) first class channel owners
  > tls.cc: Fix shutdown_input/output to conform with expected socket behaviour
  > net/*: qualify net::* refs better -> ::net::* etc
  > native-stack: Make sink/source (i.e. streams) first class channel owners
  > posix-stack: Make sink/source (i.e. streams) first class channel owners
2017-05-19 14:19:40 +02:00
Pekka Enberg
c257850e25 Merge "gossip mark alive fixes" from Asias
"This series fixes the user after free issue in gossip and elimates the
duplicated / unnecessary mark alive operations.

Fixes #2341"

* tag 'asias/gossip_fix_mark_alive/v1' of github.com:cloudius-systems/seastar-dev:
  gossip: Ignore callbacks and mark alive operation in shadow round
  gossip: Ingore the duplicated mark alive operation
  gossip: Fix user after free in mark_alive

(cherry picked from commit 1e04731fa0)
2017-05-09 01:57:46 +03:00
Pekka Enberg
e8e5c88a02 Update seastar submodule
* seastar 8e54a9b...5810d50 (1):
  > scripts: posix_net_conf.sh: get rid of funny syntax

Fixes #2269
2017-05-03 10:34:50 +03:00
Avi Kivity
4f4edec8e1 Merge "Fix problems with slicing using sstable's promoted index" from Tomasz
"Fixes #2327.
Fixes #2326."

* 'tgrabiec/fix-promoted-index-parsing-1.7' of github.com:cloudius-systems/seastar-dev:
  sstables: Fix incorrect parsing of cell names in promoted index
  sstables: Fix find_disk_ranges() to not miss relevant range tombstones

(cherry picked from commit ea0591ad3d)
2017-04-30 17:04:21 +03:00
Tomasz Grabiec
65862c4221 sstables: Fix usage of wrong comparator in find_disk_ranges()
This made a difference if clustering restriction bounds were not full
keys but prefixes.

Fixes #2272.

Message-Id: <1493058357-24156-1-git-send-email-tgrabiec@scylladb.com>
2017-04-24 21:56:33 +03:00
Pekka Enberg
7672c59873 release: prepare for 1.6.4 2017-04-24 19:39:09 +03:00
Duarte Nunes
19212a059f alter_type_statement: Fix signed to unsigned conversion
This could allow us to alter a non-existing field of a UDT.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170419114254.5582-1-duarte@scylladb.com>
(cherry picked from commit e06bafdc6c)
2017-04-19 14:48:38 +03:00
Raphael S. Carvalho
bbbb4dffbd partitioned_sstable_set: fix quadratic space complexity
Streaming generates lots of small sstables with a large token range,
which triggers O(N^2) space usage in the interval map.
Level-0 sstables will now be stored in a structure that has O(N)
space complexity and which will be included in every read.

Fixes #2287.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170417185509.6633-1-raphaelsc@scylladb.com>
(cherry picked from commit 11b74050a1)
2017-04-18 13:14:13 +03:00
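A simplified sketch of the data-structure split described above (the class and method names here are invented for illustration; the real partitioned_sstable_set is C++ and token-based):

```python
class PartitionedSstableSet:
    """Level-0 sstables live in a flat O(N)-space list that every read
    consults; higher levels keep (first, last) token bounds so a read
    only touches sstables whose range overlaps the queried token."""
    def __init__(self):
        self.level0 = []
        self.leveled = []  # (first_token, last_token, name)

    def insert(self, first, last, name, level):
        if level == 0:
            self.level0.append(name)
        else:
            self.leveled.append((first, last, name))

    def select(self, token):
        hits = list(self.level0)
        hits += [name for first, last, name in self.leveled
                 if first <= token <= last]
        return hits

sset = PartitionedSstableSet()
sset.insert(0, 50, "l0-a", level=0)
sset.insert(10, 20, "l1-a", level=1)
sset.insert(60, 90, "l1-b", level=1)
```

The trade-off: level-0 sstables cost every read a lookup, but storing them in a flat list avoids the interval map's quadratic blow-up for many small, wide-range sstables.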
Asias He
2f0970e83c gossip: Fix possible use-after-free of entry in endpoint_state_map
We take a reference to an endpoint_state entry in endpoint_state_map and
access it again after code which defers; the reference can be invalid
after the defer if someone deletes the entry during the defer.

Fix this by taking the reference again after the deferring code.

I also audited the code to remove unsafe references to endpoint_state_map
entries as much as possible.

Fixes the following SIGSEGV:

Core was generated by `/usr/bin/scylla --log-to-syslog 1 --log-to-stdout
0 --default-log-level info --'.
Program terminated with signal SIGSEGV, Segmentation fault.
(this=<optimized out>) at /usr/include/c++/5/bits/stl_pair.h:127
127     in /usr/include/c++/5/bits/stl_pair.h
[Current thread is 1 (Thread 0x7f1448f39bc0 (LWP 107308))]

Fixes #2271

Message-Id: <529ec8ede6da884e844bc81d408b93044610afd2.1491960061.git.asias@scylladb.com>
(cherry picked from commit d27b47595b)
2017-04-13 13:18:54 +03:00
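The fix pattern can be illustrated with a Python analogue (the real bug is a C++ reference into endpoint_state_map invalidated across a preemption point; mark_alive and the map shape here are simplified stand-ins):

```python
def mark_alive(endpoint_state_map, addr, deferring_code):
    state = endpoint_state_map.get(addr)
    if state is None:
        return None
    deferring_code()  # may yield; someone may delete map entries meanwhile
    # Take the reference again after the deferring code, instead of
    # reusing the possibly-invalidated one.
    state = endpoint_state_map.get(addr)
    if state is None:
        return None   # entry was removed during the defer
    state["alive"] = True
    return state
```

If the deferring code removes the entry, the re-lookup observes that and bails out instead of touching freed state.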
Pekka Enberg
5ab8601f64 release: prepare for 1.6.3 2017-04-04 14:10:48 +03:00
Takuya ASADA
993d216f23 dist/debian/debian/scylla-server.upstart: export SCYLLA_CONF, SCYLLA_HOME
We are sourcing the sysconfig file on upstart, but forgot to load its
values as environment variables.
So export them.

Fixes #2236

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491209505-32293-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit b087616a6c)
2017-04-04 11:00:17 +03:00
Calle Wilund
630425f513 messaging_service: Move log printout to actual listen start
Fixes #1845
The log printout happened before we had actually evaluated the endpoint
to create, and thus never included the SSL info.
Message-Id: <1487766738-27797-1-git-send-email-calle@scylladb.com>

(cherry picked from commit d5f57bd047)
(cherry picked from commit 8c0488bce9)
2017-03-30 18:29:32 +03:00
Calle Wilund
1626c9ce5e commitlog/replayer: Bugfix: minimum rp broken, and cl reader offset too
The previous fix removed the additional insertion of "min rp" per source
shard based on whether we had processed existing CF:s or not (i.e. if
a CF does not exist as an sstable at all, we must tag it as zero-rp, and
make the whole shard for it start at the same zero).

This is bad in itself, because it can cause data loss. It does not cause
crashing however. But it did uncover another, old old lingering bug,
namely the commitlog reader initiating its stream wrongly when reading
from an actual offset (i.e. not processing the whole file).
We opened the file stream from the file offset, then tried
to read the file header and magic number from there -> boom, error.

Also, rp-to-file mapping was potentially suboptimal due to using
bucket iterator instead of actual range.

I.e. three fixes:
* Reinstate min position guarding for unencountered CF:s
* Fix stream creation in the CL reader
* Fix segment map iterator use.

v2:
* Fix typo
Message-Id: <1490611637-12220-1-git-send-email-calle@scylladb.com>

(cherry picked from commit b12b65db92)
2017-03-28 11:16:17 +02:00
Calle Wilund
bc6def4a0f commitlog_replayer: Do proper const-lookup of min positions for shards
Fixes #2173

Per-shard min positions can be unset if we never collected any
sstable/truncation info for it, yet replay segments of that id.

Wrap the lookups to handle "missing data -> default", which should have been
there in the first place.

Message-Id: <1490185101-12482-1-git-send-email-calle@scylladb.com>
(cherry picked from commit c3a510a08d)
2017-03-22 18:46:17 +02:00
Vlad Zolotarov
1e77bcd60f Don't report a Tracing session ID unless the current query had a Tracing bit in its flags
Although the current master's behaviour is legal, it's suboptimal, and some clients are sensitive to that.
Let's fix that.

Fixes #2179

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490115157-4657-1-git-send-email-vladz@scylladb.com>
2017-03-22 14:56:20 +02:00
Pekka Enberg
f9bcedfbf7 dist/docker: Expose Prometheus port by default
This patch exposes Scylla's Prometheus port by default. You can now use
the Scylla Monitoring project with the Docker image:

  https://github.com/scylladb/scylla-grafana-monitoring

To configure the IP addresses, use the 'docker inspect' command to
determine Scylla's IP address (assuming your running container is called
'some-scylla'):

  docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla

and then use that IP address in the prometheus/scylla_servers.yml
configuration file.

Fixes #1827

Message-Id: <1490008357-19627-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 85a127bc78)
2017-03-20 15:30:28 +02:00
Calle Wilund
c814cb3433 commitlog_replayer: Make replay parallel per shard
Fixes #2098

Replay previously did all segments in parallel on shard 0, which
caused heavy memory load. To reduce this and spread footprint
across shards, instead do X segments per shard, sequential per shard.

v2:
* Fixed whitespace errors

Message-Id: <1489503382-830-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 078589c508)
2017-03-20 12:50:43 +02:00
Amos Kong
62143d26b4 scylla_setup: match '-p' option of lsblk with strict pattern
On Ubuntu 14.04, lsblk doesn't have the '-p' option, but
`scylla_setup` tries to get the block list with `lsblk -pnr` and
triggers an error.

The current simple pattern will match anywhere in the help content, so it
might match the wrong options.
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e -p
   -m, --perms          output info about permissions
   -P, --pairs          use key="value" output format

Let's use strict pattern to only match option at the head. Example:
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e '^\s*-D'
   -D, --discard        print discard capabilities

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <4f0f318353a43664e27da8a66855f5831457f061.1489712867.git.amos@scylladb.com>
(cherry picked from commit 468df7dd5f)
2017-03-20 11:16:21 +02:00
Amnon Heiman
7a1ceadec4 storage_proxy: metrics should have unique name
Metrics should have a unique name. This patch renames the queue-length
metric throttled_writes to current_throttled_writes.

Without it, metrics will be reported twice under the same name, which
may cause errors in the prometheus server.

This could be related to scylladb/seastar#250

Fixes #2163.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314081456.6392-1-amnon@scylladb.com>
(cherry picked from commit 295a981c61)
2017-03-19 18:11:58 +02:00
Pekka Enberg
d41ba3ba78 release: prepare for 1.6.2 2017-03-10 11:24:19 +02:00
Asias He
0e11c6218a repair: Fix midpoint is not contained in the split range assertion in split_and_add
We have:

  auto halves = range.split(midpoint, dht::token_comparator());

We saw a case where midpoint == range.start; as a result, range.split
will assert because range.start is marked non-inclusive, so the
midpoint doesn't appear to be contain()ed in the range - hence the
assertion failure.

Fixes #2148

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <93af2697637c28fbca261ddfb8375a790824df65.1489023933.git.asias@scylladb.com>
(cherry picked from commit 39d2e59e7e)
2017-03-09 10:38:54 +02:00
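The failure mode can be reproduced with a toy range type (a hypothetical Python class, not dht::token_range): with a non-inclusive start bound, a midpoint equal to the start is not contain()ed, so split() must not be attempted there.

```python
class Range:
    def __init__(self, start, end, start_inclusive=False):
        self.start, self.end = start, end
        self.start_inclusive = start_inclusive

    def contains(self, token):
        if self.start_inclusive:
            above = token >= self.start
        else:
            above = token > self.start  # the start itself is excluded
        return above and token <= self.end

r = Range(10, 20, start_inclusive=False)
```

Here a split at token 10 would fail the containment check, which is exactly the case split_and_add hit when midpoint == range.start.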
Paweł Dziepak
a096a173bf Merge "Avoid loosing changes to keyspace parameters of system_auth and tracing keyspaces" form Tomek
"If a node is bootstrapped with auto_boostrap disabled, it will not
wait for schema sync before creating global keyspaces for auth and
tracing. When such schema changes are then reconciled with schema on
other nodes, they may overwrite changes made by the user before the
node was started, because they will have higher timestamp.

To prevent that, let's use minimum timestamp so that default schema
always looses with manual modifications. This is what Cassandra does.

Fixes #2129."

* tag 'tgrabiec/prevent-keyspace-metadata-loss-v1' of github.com:scylladb/seastar-dev:
  db: Create default auth and tracing keyspaces using lowest timestamp
  migration_manager: Append actual keyspace mutations with schema notifications

(cherry picked from commit 6db6d25f66)
2017-03-08 20:37:10 +01:00
Nadav Har'El
13d52a8172 sstable decompression: fix skip() to end of file
The skip() implementation for the compressed file input stream incorrectly
handled the case of skipping to the end of file: In that case we just need
to update the file pointer, but not skip anywhere in the compressed disk
file; In particular, we must NOT call locate() to find the relevant on-disk
compressed chunk, because there is none - locate() can only be called on
actual positions of bytes, not on the one-past-end-of-file position.

Fixes #2143

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170308100057.23316-1-nyh@scylladb.com>
(cherry picked from commit 506e074ba4)
2017-03-08 12:35:47 +02:00
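The corrected control flow can be sketched like this (a hypothetical reader class; the real code is the compressed-file input stream in C++):

```python
class CompressedReader:
    def __init__(self, uncompressed_len, chunk_starts):
        self.len = uncompressed_len
        self.pos = 0                  # logical (uncompressed) position
        self.chunks = chunk_starts    # start offset of each on-disk chunk

    def locate(self, pos):
        # Only valid for actual byte positions, never one-past-end.
        assert pos < self.len
        return max(start for start in self.chunks if start <= pos)

    def skip(self, n):
        self.pos += n
        if self.pos < self.len:
            return self.locate(self.pos)  # find the covering chunk
        return None  # at EOF: only the file pointer is updated

reader = CompressedReader(100, [0, 64])
```

Skipping to position 100 (one past the last byte of a 100-byte file) returns without calling locate(), which would otherwise fail because no chunk covers that offset.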
Gleb Natapov
14b8a49b3f memtable: do not open code logalloc::reclaim_lock use
logalloc::reclaim_lock prevents reclaim from running, which may cause a
regular allocation to fail although there is enough free memory.
To solve that there is allocation_section, which acquires reclaim_lock
and, if the allocation fails, runs the reclaimer outside of the lock and
retries the allocation. The patch makes use of allocation_section instead of
direct use of reclaim_lock in the memtable code.

Fixes #2138.

Message-Id: <20170306160050.GC5902@scylladb.com>
(cherry picked from commit d7bdf16a16)
2017-03-07 11:16:47 +02:00
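The allocation_section pattern described above can be sketched as follows (hypothetical helper signatures and a tiny simulated allocator, not the logalloc API):

```python
import threading

def allocation_section(alloc, reclaim, lock, attempts=3):
    """Try to allocate with reclaim locked out; on failure, release the
    lock, run the reclaimer, and retry the allocation."""
    for _ in range(attempts):
        lock.acquire()         # reclaim cannot run while we allocate
        try:
            return alloc()
        except MemoryError:
            pass
        finally:
            lock.release()
        reclaim()              # run the reclaimer outside the lock
    raise MemoryError

class _Env:  # simulated allocator: first attempt fails, reclaim frees space
    def __init__(self):
        self.free_units = 0
        self.calls = 0
    def alloc(self):
        self.calls += 1
        if self.free_units < 1:
            raise MemoryError
        return "buffer"
    def reclaim(self):
        self.free_units += 1

env = _Env()
result = allocation_section(env.alloc, env.reclaim, threading.Lock())
```

The key point is that the reclaimer runs only after the lock is released, so reclaim and allocation never race while still letting a failed allocation recover.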
Gleb Natapov
e437edafe6 memtable: do not yield while holding reclaim_lock
Holding reclaim_lock while yielding may cause memory allocations to
fail.

Fixes #2139

Message-Id: <20170306153151.GA5902@scylladb.com>
(cherry picked from commit 5c4158daac)
2017-03-06 18:36:20 +02:00
Takuya ASADA
b91456da90 dist/debian/dep: fix broken link of gcc-5, update it to 5.4.1-5
Since gcc-5/stretch=5.4.1-2 was removed from the apt repository, we are no
longer able to build gcc-5.

To avoid the dead link, use launchpad.net archives instead of apt-get source.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488189378-5607-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit ba323e2074)
2017-03-06 16:32:22 +02:00
Tomasz Grabiec
ebb404275d db: Fix overflow of gc_clock time point
If query_time is time_point::min(), which is used by
to_data_query_result(), the result of subtraction of
gc_grace_seconds() from query_time will overflow.

I don't think this bug would currently have user-perceivable
effects. This affects which tombstones are dropped, but in case of
to_data_query_result() uses, tombstones are not present in the final
data query result, and mutation_partition::do_compact() takes
tombstones into consideration while compacting before expiring them.

Fixes the following UBSAN report:

  /usr/include/c++/5.3.1/chrono:399:55: runtime error: signed integer overflow: -2147483648 - 604800 cannot be represented in type 'int'

Message-Id: <1488385429-14276-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 4b6e77e97e)
2017-03-01 18:50:42 +02:00
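The arithmetic in question, with a clamping fix sketched in Python for illustration (the int32 limits come from the UBSAN report; gc_clock is a C++ clock with 32-bit signed seconds, and saturating_sub is an invented name, not the actual patch):

```python
INT32_MIN, INT32_MAX = -2**31, 2**31 - 1

def saturating_sub(query_time_s, gc_grace_s):
    # In 32-bit signed arithmetic, INT32_MIN - 604800 would overflow
    # (exactly the UBSAN report above); clamp to the representable range.
    return max(query_time_s - gc_grace_s, INT32_MIN)
```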
Tomasz Grabiec
92e6c62c1c query: Fix invalid initialization of _memory_tracker by moving-from-self
Fixes the following UBSAN warning:

  core/semaphore.hh:293:74: runtime error: reference binding to misaligned address 0x0000006c55d7 for type 'struct basic_semaphore', which requires 8 byte alignment

Since the field was not initialized properly, this probably also fixes some
user-visible bug.
Message-Id: <1488368222-32009-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 0c84f00b16)
2017-03-01 11:57:21 +00:00
Gleb Natapov
3edcee4821 sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying file's object causes
crashes.

[tgrabiec: fixed typo]

Message-Id: <20170221144858.GG11471@scylladb.com>
(cherry picked from commit 0977f4fdf8)
2017-02-28 11:10:17 +02:00
Avi Kivity
9c148d6e4c Update seastar submodule
* seastar ee84851...8e54a9b (1):
  > fix append_challenged_posix_file_impl::process_queue() to handle recursion

Fixes #2121.
2017-02-28 10:57:22 +02:00
Shlomi Livne
2ea4da28e8 release: prepare for 1.6.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-02-16 22:37:07 +02:00
Takuya ASADA
a04722aeb2 dist/redhat: stop backporting ninja-build from Fedora, install it from EPEL instead
ninja-build-1.6.0-2.fc23.src.rpm was deleted from the Fedora web site for some
reason, but there is ninja-build-1.7.2-2 on EPEL, so we don't need to
backport from Fedora anymore.

Fixes #2087

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487155729-13257-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 9c8515eeed)
2017-02-15 13:01:19 +02:00
Avi Kivity
41b8dc3b84 Update seastar submodule
* seastar 0bfd7fe...ee84851 (1):
  > prometheus: send one MetricFamily per unique metric name

Fixes #2077.
Fixes #2078.
2017-02-13 16:12:26 +02:00
Avi Kivity
b6ebe2e20b Merge "Avoid avalanche of tasks after memtable flush" from Tomasz
"Before, the logic for releasing writes blocked on dirty worked like this:

  1) When region group size changes and it is not under pressure and there
     are some requests blocked, then schedule request releasing task

  2) request releasing task, if no pressure, runs one request and if there are
     still blocked requests, schedules next request releasing task

If requests don't change the size of the region group, then either some request
executes or there is a request releasing task scheduled. The amount of scheduled
tasks is at most 1, there is a single releasing thread.

However, if requests themselves would change the size of the group, then each
such change would schedule yet another request releasing thread, growing the task
queue size by one.

The group size can also change when memory is reclaimed from the groups (e.g.
when they contain sparse segments). Compaction may start many request releasing
threads due to group size updates.

Such behavior is detrimental for performance and stability if there are a lot
of blocked requests. This can happen on 1.5 even with modest concurrency
because timed out requests stay in the queue. This is less likely on 1.6 where
they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in the
system. When the amount of scheduled tasks reaches 1000, polling stops and
server becomes unresponsive until all of the released requests are done, which
is either when they start to block on dirty memory again or run out of blocked
requests. It may take a while to reach pressure condition after memtable flush
if it brings virtual dirty much below the threshold, which is currently the
case for workloads with overwrites producing sparse regions.

I saw this happening in a write workload from issue #2021 where the number of
request releasing threads grew into thousands.

Fix by ensuring there is at most one request releasing thread at a time. There
will be one releasing fiber per region group which is woken up when pressure is
lifted. It executes blocked requests until pressure occurs."

* tag 'tgrabiec/lsa-single-threaded-releasing-v2' of github.com:cloudius-systems/seastar-dev:
  tests: lsa: Add test for reclaimer starting and stopping
  tests: lsa: Add request releasing stress test
  lsa: Avoid avalanche releasing of requests
  lsa: Move definitions to .cc
  lsa: Simplify hard pressure notification management
  lsa: Do not start or stop reclaiming on hard pressure
  tests: lsa: Adjust to take into account that reclaimers are run synchronously
  lsa: Document and annotate reclaimer notification callbacks
  tests: lsa: Use with_timeout() in quiesce()

(cherry picked from commit 7a00dd6985)
2017-02-02 22:19:25 +01:00
Pekka Enberg
7e1b245887 release: prepare for 1.6.0 2017-02-01 13:58:06 +02:00
Takuya ASADA
83fc7de65f dist: add lspci to dependencies, since it is used by dpdk-devbind.py
On a minimum setup environment, scylla_sysconfig_setup will fail because the lspci command is not installed, so install it at package installation time.

Fixes #2035

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1485327435-20543-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit bce0fb3fa2)
2017-01-25 17:37:18 +02:00
Pekka Enberg
d4781f2de3 release: prepare for 1.6.rc2 2017-01-24 14:33:12 +02:00
Amos Kong
2193a83a82 dist/redhat: fix path of housekeeping.cfg
scylla-housekeeping[3857]: Config file /etc/scylla.d/housekeeping.cfg is missing, terminating

Housekeeping failed to execute for missing the config file,
the config file should be in /etc/scylla.d/.

Fixes #2020

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <e63f2f8cb94410a6dca4e6193932f0079755ad47.1484724328.git.amos@scylladb.com>
(cherry picked from commit b880bdccef)
2017-01-19 11:09:41 +02:00
Pekka Enberg
78d74bf23a dist/docker: Use Scylla 1.6 RPM repository 2017-01-18 11:58:42 +02:00
Pekka Enberg
642a479c73 release: prepare for 1.6.rc1 2017-01-16 19:02:05 +02:00
Tomasz Grabiec
8a6d0ad2fa storage_proxy: Fix capturing of on-stack variable by reference
partition_range_count was accepted by do_with callback by value and
then captured by reference by async code, thus invoking use after
destroy.

Message-Id: <1484317846-14485-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3c3a4358ae)
2017-01-16 11:49:34 +02:00
Tomasz Grabiec
37f73781ee storage_proxy: Add missing initialization of _short_read_allowed
Dropped by a1cafed370 ("storage_proxy:
handle range scans of sparsely populated tables").

Fixes the failure in update_cluster_layout_tests.TestUpdateClusterLayout test.

Message-Id: <1484317450-13525-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 66547e7d7c)
2017-01-13 16:49:30 +02:00
Takuya ASADA
6fd5442fb7 scylla-housekeeping: move uuid file to /var/lib/scylla-housekeeping
Since scylla-housekeeping runs as the scylla user, it doesn't have permission
to create a file in /etc/scylla.d.
So introduce /var/lib/scylla-housekeeping, which is owned by the scylla user,
and place the uuid file in that directory.

Fixes #2009

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1484235946-12463-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit bee7f549a9)
2017-01-13 16:28:36 +02:00
Avi Kivity
e2777e508c Update seastar submodule
* seastar 2909f6c...0bfd7fe (1):
  > io_queue: remove owner number from metric name
2017-01-12 16:46:25 +02:00
Tomasz Grabiec
8cf7bbf208 storage_proxy: Fix use-after-free on one_or_two_partition_ranges
query_mutations_locally() takes one_or_two_partition_ranges by
reference and requires, indirectly, that it is kept alive until the
operation resolves. However, we were passing an expiring value to it, the
result of unwrap().

Fixes dtest failure in consistent_bootstrap_test.py:TestBootstrapConsistency.consistent_reads_after_bootstrap_test

Another potential problem was that we were dereferencing "s" in the same
expression which move-constructs an argument out of it.

Message-Id: <1484222759-4967-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 1e8151b4f2)
2017-01-12 15:12:06 +02:00
Vlad Zolotarov
4b5742d3a6 tracing::trace_keyspace_helper: use generate_legacy_id() for CF IDs generation
Explicitly generate the IDs of tables from the system_traces KS using
generate_legacy_id() in order to ensure all nodes create these tables with
the same IDs.

This is going to prevent hitting issue #420.

Fixes #1976

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1484153725-31030-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit ca0a0f1458)
2017-01-12 11:36:52 +02:00
Avi Kivity
cf27d44412 config: disable new sharding algorithm
It still has problems:
 - while resharding a very large leveled compaction strategy table, a huge
   amount of tiny sstables are generated, overwhelming the file descriptor
   limits
 - there is a large impact on read latency while resharding is going on
2017-01-12 10:15:51 +02:00
Avi Kivity
7b40e19561 Update seastar submodule
* seastar 0b49f28...2909f6c (2):
  > file/dup: don't decrease refcnt twice when file is explicitly closed
  > file: add dup() support

Preparing for file descriptor reduction during resharding backport.
2017-01-10 16:59:26 +02:00
Duarte Nunes
210e66b2b8 query_pagers: Fix over-counting of rows
This patch fixes a regression introduced in 0518895, where we counted
one extra row per partition when it contained live, non-static rows.

We also simplify the visitor logic further, since now we don't need to
count rows one by one. Also remove a bunch of unused fields.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1482234083-2447-1-git-send-email-duarte@scylladb.com>
(cherry picked from commit d7e607ff51)
2017-01-10 16:54:09 +02:00
Amnon Heiman
6af51c1b1d scylla_setup: remove the uuid file creation
Scylla housekeeping can create a uuid file if it is missing. There is no
longer a need to create one for it.

Fixes #2004

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1483866553-13855-3-git-send-email-amnon@scylladb.com>
(cherry picked from commit 8cd3d7445c)
2017-01-09 17:00:49 +02:00
Amnon Heiman
05f2bf5bd5 scylla-housekeeping: Create a uuid file if one is missing
This patch gets housekeeping to create a uuid file if a path to a uuid
file is supplied but the file is missing.

Because it imports the uuid lib, uuid parameters were renamed.

Fixes #1987

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1483866553-13855-2-git-send-email-amnon@scylladb.com>
(cherry picked from commit 32888fc0aa)
2017-01-09 15:27:04 +02:00
Avi Kivity
8002326f80 storage_proxy: prevent short read due to buffer size limit from being swallowed during range scan
mutation_result_merger::get() assumes that the merged result may be a
short read if at least one of the partial results is a short read (in
other words, if none of the partial results is a short read, then the
merged result is also not a short read). However this is not true;
because we update the memory accounter incrementally, we may stop
scanning early. All the partial results are full; but we did not scan
the entire range.

Fix by changing the short_read variable initialization from `no`
(which assumes we'll encounter a short read indication when processing
one of the batches) to `this->short_read()`, which also takes into
account the memory accounter.

Fixes #2001.
Message-Id: <20170108111315.17877-1-avi@scylladb.com>

(cherry picked from commit 8f36dca6f1)
2017-01-09 09:27:56 +00:00
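The fixed invariant can be sketched in a few lines; this is a toy model with invented names, not the actual mutation_result_merger code:

```cpp
#include <cassert>
#include <vector>

// Toy sketch of the invariant fixed above: the merged result is a short
// read if any partial result was short *or* the merger's own memory
// accounter stopped the scan early.
bool merged_short_read(const std::vector<bool>& partial_short_reads,
                       bool accounter_stopped_early) {
    // The bug was initializing this to `false`, ignoring the accounter.
    bool short_read = accounter_stopped_early;
    for (bool partial : partial_short_reads) {
        short_read = short_read || partial;
    }
    return short_read;
}
```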
Tomasz Grabiec
a00a1a1044 db: Make system tables use the commitlog
Before this patch system table writes were not writing to commit log
because database::add_column_family() disables writes to commit log
for the table which is added if _commitlog is not set at that
time. Fix by initializing commit log before system tables are created.

Fixes #1986.

Fixes recent regression in
batch_test.py:TestBatch.replay_after_schema_change_test after
scylla-jmx was updated to not flush system tables on nodetool flush.

Could cause system keyspace writes to be delayed for more than before
under heavy write workload. Refs #1926.

Message-Id: <1483618117-4535-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit cd630fece6)
2017-01-05 14:54:11 +02:00
Avi Kivity
337d6fb2cf storage_proxy: fix result ordering for parallel partition range scans
During a range scan, we try to avoid sorting according to partition range
when we can do so.  This is when we scan fewer than smp::count shards --
each shard's range is strictly ordered with respect to the others.

However, we use the wrong key for the sort -- we use the shard number.  But
if we started at shard s > 0 and wrapped around to shard 0, then shard 0's
range will be after the range belonging to shard s, but will sort before it.

Fix by storing the iteration order as the sort key.  We use that when we
know that shards do not overlap (shards < smp::count) and the index within
the source partition range vector when they do.

Fixes #1998.
Message-Id: <20170105114253.17492-1-avi@scylladb.com>

(cherry picked from commit eb520e7352)
2017-01-05 12:56:33 +01:00
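A toy model of the sort-key change (invented types; the real code sorts partial query results, not ints): with a wrapped shard walk such as 2, 3, 0, 1, sorting by shard number would move shard 0's range ahead of shard 2's even though it was scanned later, so the iteration order is stored and used as the key instead.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct partial_result {
    unsigned shard;           // shard that produced this result
    unsigned iteration_order; // position in the scan; the correct sort key
    int data;                 // stand-in for the actual rows
};

std::vector<int> merge_in_scan_order(std::vector<partial_result> parts) {
    // Sorting by `iteration_order` preserves ring order even when the
    // scan started at a shard s > 0 and wrapped around to shard 0.
    std::sort(parts.begin(), parts.end(), [](const auto& a, const auto& b) {
        return a.iteration_order < b.iteration_order;
    });
    std::vector<int> merged;
    for (const auto& p : parts) {
        merged.push_back(p.data);
    }
    return merged;
}
```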
Avi Kivity
1a8211e573 result_memory_tracker: fix too-short short reads
1.6 truncates paged queries early to avoid overrunning server memory
with too-large query results, but in the case of partition range queries,
this terminates too early due to an uninitialized variable holding the
maximum result size.  This results in slow performance due to additional
round trips.

Fix by initializing the maximum result size from the result_memory_tracker
running on the coordinating shard.

Fixes #1995.
Message-Id: <20170105103915.10633-1-avi@scylladb.com>

(cherry picked from commit 4667641f5f)
2017-01-05 11:04:03 +00:00
Takuya ASADA
a1a6c10964 dist/redhat: add python-setuptools as a dependency since it is required by scylla-housekeeping
scylla-housekeeping breaks when python-setuptools isn't installed, so
add it as a dependency.

Fixes #1884

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1483525828-7507-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 43655512e1)
2017-01-04 14:32:52 +02:00
Avi Kivity
c692824786 Update seastar submodule
* seastar 1e45fc8...0b49f28 (2):
  > metrics: Metrics function should take variable as a refernce
  > collectd: create metrics with the right format
2017-01-04 12:41:33 +02:00
Gleb Natapov
0ccdbbf1af storage_proxy: do not deref unengaged stdx:optional
Fixes intentional short reads.

Message-Id: <20161227142133.GE1829@scylladb.com>
(cherry picked from commit 4ca58959ad)
2017-01-01 12:16:45 +02:00
Takuya ASADA
138ad64cbc dist/ubuntu: check lsb_release existence since it's not included in a minimal Debian installation
Ubuntu has it in a minimal installation but Debian doesn't, so check that it exists.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1483003565-2753-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit e48cc9cf01)
2016-12-29 16:40:20 +02:00
Pekka Enberg
01737a51a9 tracing: Add seastar/core/scollectd.hh include
Fix the following build breakage:

FAILED: build/release/gen/cql3/CqlParser.o
g++ -MMD -MT build/release/gen/cql3/CqlParser.o -MF build/release/gen/cql3/CqlParser.o.d -std=gnu++1y -g  -Wall -Werror -fvisibility=hidden -pthread -I/home/penberg/scylla/seastar -I/home/penberg/scylla/seastar/fmt -I/home/penberg/scylla/seastar/build/release/gen  -march=nehalem -Ifmt -DBOOST_TEST_DYN_LINK -Wno-overloaded-virtual -DFMT_HEADER_ONLY -DHAVE_HWLOC -DHAVE_NUMA -DHAVE_LZ4_COMPRESS_DEFAULT  -O2 -DBOOST_TEST_DYN_LINK  -Wno-maybe-uninitialized -DHAVE_LIBSYSTEMD=1 -I. -I build/release/gen -I seastar -I seastar/build/release/gen -c -o build/release/gen/cql3/CqlParser.o build/release/gen/cql3/CqlParser.cpp
In file included from ./query-request.hh:31:0,
                 from ./locator/token_metadata.hh:51,
                 from ./locator/abstract_replication_strategy.hh:29,
                 from ./database.hh:26,
                 from ./service/storage_proxy.hh:44,
                 from ./db/schema_tables.hh:43,
                 from ./db/system_keyspace.hh:46,
                 from ./cql3/functions/function_name.hh:45,
                 from ./cql3/selection/selectable.hh:48,
                 from ./cql3/selection/writetime_or_ttl.hh:45,
                 from build/release/gen/cql3/CqlParser.hpp:63,
                 from build/release/gen/cql3/CqlParser.cpp:44:
./tracing/tracing.hh:357:5: error: ‘scollectd’ does not name a type
     scollectd::registrations _registrations;
     ^~~~~~~~~

Message-Id: <1482939751-8756-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit a443dfa95e)
2016-12-28 19:16:30 +02:00
Pekka Enberg
9753a39284 Update seastar submodule
* seastar 0b98024...1e45fc8 (1):
  > Merge "migrate network related seastar collectd metrics to the new metrics registration API" from Vlad
2016-12-28 17:06:06 +02:00
Avi Kivity
42a76567b7 dht: use nonwrapping_ranges in ring_position_range_sharder
It was the observation that ring_position_range_sharder doesn't support
wrapping ranges that started the nonwrapping_range madness, but that
class still has some leftover wrapping ranges.  Close the circle by
removing them.
Message-Id: <20161123153113.8944-1-avi@scylladb.com>

(cherry picked from commit 8686a59ea5)
2016-12-27 19:16:26 +02:00
Avi Kivity
725949e8bf Merge "Fixes for intentional short reads" from Paweł
"This patchset contains fixes for the changes introduced in "Query result
size limiting". It also improves handling of short data reads.

In order to minimise chances of a digest mismatch during data queries, replicas
that were asked just to return a digest also keep track of the size of the
data (in the IDL representation) so that they would stop at the same point
nodes doing full data queries would. Moreover, data queries are not
affected by per-shard memory limit and the coordinator sends individual
result size limits to replicas in order not to depend on hardcoded values.

It is still possible to get digest mismatches if the IDL changes (e.g. a
new field is added), but, hopefully, that won't be a serious problem."

* 'pdziepak/short-read-fixes/v4' of github.com:cloudius-systems/seastar-dev:
  query: introduce result_memory_accounter::foreign_state
  storage_proxy: fix short reads in parallel range queries
  storage_proxy: pass maximum result size to replicas
  mutation_partition: use result limiter for digest reads
  query: make result_memory_limiter constants available for linker
  result_memory_limiter: add accounter for digest reads
  idl: allow writers to use any output stream
  result_memory_limiter: split new_read() to new_{data, mutation}_read()
  idl: is_short_read() was added in 1.6
  mutation_partition: honour allowed_short_read for static rows
  storage_proxy: fix _is_short_read computation
  storage_proxy: disallow short reads if got no live rows
  storage_proxy: don't stop after result with no live rows

(cherry picked from commit 868b4d110c)
2016-12-27 19:16:15 +02:00
Amnon Heiman
231cf22c0e Set the prometheus prefix to scylla
This patch makes the prometheus prefix configurable and sets the default
value to scylla.

Fixes #1964

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1482671970-21487-1-git-send-email-amnon@scylladb.com>
(cherry picked from commit 70b2a1bfd4)
2016-12-27 17:57:08 +02:00
Takuya ASADA
ef0ffd1cbb dist/common/scripts/scylla_setup: improve the message of disk selection prompt
To avoid confusing users, state that we only list unmounted disks.

Fixes #1841

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479720708-6021-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 7c3b98806d)
2016-12-27 17:56:11 +02:00
Gleb Natapov
2b67c65eb6 messaging_service: move MUTATION_DONE messages to separate connection
If a node gets more MUTATION requests than it can handle via RPC it will
stop reading from this RPC connection, but this will prevent it from
getting MUTATION_DONE responses for requests it coordinates, because
currently MUTATION and MUTATION_DONE messages share the same connection.

To solve this problem this patch moves MUTATION_DONE messages to a
separate connection.

Fixes: #1843

Message-Id: <20161201155942.GC11581@scylladb.com>
(cherry picked from commit 0a2dd39c75)
2016-12-27 17:55:01 +02:00
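The idea reduces to a verb-to-connection mapping; a hypothetical sketch with invented names, not the actual messaging_service API:

```cpp
#include <cassert>

// Route response-like verbs to a different connection index than
// throttleable request verbs, so back-pressure on MUTATION reads can
// never starve MUTATION_DONE delivery.
enum class verb { mutation, mutation_done };

unsigned connection_index(verb v) {
    switch (v) {
    case verb::mutation:      return 0; // may stop being read under overload
    case verb::mutation_done: return 1; // must always make progress
    }
    return 0;
}
```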
Piotr Jastrzebski
5b0971f82f mutation_partition: don't use unique_ptr to manage LSA objects
std::unique_ptr won't destroy them correctly.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <5b49bb25a962432a178fe75554dd010c3cdea41d.1482261888.git.piotr@scylladb.com>
(cherry picked from commit 3e502de153)
2016-12-27 17:54:45 +02:00
Raphael S. Carvalho
da843239bf sstables: fix calculation of memory footprint for summary
The size of keys wasn't taken into account, so the value reported
via collectd was much smaller than the actual footprint.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <3ca24612e4e84d1cbdea4f2d79e431a4f4479291.1482255327.git.raphaelsc@scylladb.com>
(cherry picked from commit e28537b56f)
2016-12-27 17:54:35 +02:00
Avi Kivity
5c96b04f4d Revert "config, dht: reduce default msb ignore bits to 4"
This reverts commit b81a57e8eb.

With exponential range scanning, we should now be able to survive
msb ignore bits of 12, which allows better sharding on large clusters.

(cherry picked from commit 3989e4ed15)
2016-12-27 17:54:09 +02:00
Avi Kivity
a1d463900f storage_proxy: handle range scans of sparsely populated tables
When murmur3_partitioner_ignore_msb_bits = 12 (which we'd like to be the
default), a scan range can be split into a large number of subranges, each
going to a separate shard.  With the current implementation, subranges were
queried sequentially, resulting in very long latency when the table was empty
or nearly empty.

Switch to an exponential retry mechanism, where the number of subranges
queried doubles each time, dropping the latency from O(number of subranges)
to O(log(number of subranges)).

If, during an iteration of a retry, we read at most one range
from each shard, then partial results are merged by concatenation.  This
optimizes for the dense(r) case, where few partial results are required.

If, during an iteration of a retry, we need more than one range per
shard, then we collapse all of a shard's ranges into just one range,
and merge partial results by sorting decorated keys.  This reduces
the number of sstable read creations we need to make, and optimizes for
the sparse table case, where we need many partial results, most of which
are empty.

We don't merge subranges that come from different partition ranges,
because those need to be sorted in request order, not decorated key order.

[tgrabiec: trivial conflicts]

Message-Id: <20161220170532.25173-1-avi@scylladb.com>
(cherry picked from commit a1cafed370)
2016-12-27 16:57:18 +02:00
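The latency claim follows from the doubling schedule; a minimal sketch of just the round counting (assumed structure, not the actual storage_proxy code):

```cpp
#include <cassert>
#include <cstddef>

// Query one subrange in the first round, then double the number of
// concurrently queried subranges on each retry. For a sparse or empty
// table, where every round may come back empty, the number of rounds is
// O(log n) instead of O(n).
size_t rounds_needed(size_t n_subranges) {
    size_t queried = 0;
    size_t batch = 1;
    size_t rounds = 0;
    while (queried < n_subranges) {
        // Querying `batch` subranges in parallel is omitted here.
        queried += batch;
        batch *= 2; // exponential growth bounds the round count
        ++rounds;
    }
    return rounds;
}
```

For instance, with 1024 subranges (as a large murmur3_partitioner_ignore_msb_bits setting can produce) only 11 rounds are needed.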
Avi Kivity
66598a68d5 tests: adjust mutation_query_test for partition and row limits
Won't build otherwise.

(cherry picked from commit b740aff777)
2016-12-27 16:53:35 +02:00
Avi Kivity
8ad0e96025 Merge "storage_proxy: Enforce row limit" from Duarte
"This patchset ensures the partition limit is enforced at
the storage_proxy level. Uppers layers like the pager may
already be depending on this behavior."

* 'enforce-row-limit/v3' of https://github.com/duarten/scylla:
  query_pagers: Don't trim returned rows
  select_statement: Don't always trim result set
  query_result_merger: Limit rows
  mutation_query: to_data_query_result enforces row limit

(cherry picked from commit 3421ebe8be)
2016-12-27 16:36:41 +02:00
Avi Kivity
1a2a63787a Merge "storage_proxy: Enforce partition limit" from Duarte
"This patchset ensures the partition limit is enforced at
the storage_proxy level. To achieve this, we add the partition
count to query::result, and allow the result_merger to trim
excess partitions."

* 'enforce-partition-limit/v3' of https://github.com/duarten/scylla:
  storage_proxy: Decrease limits when retrying command
  storage_proxy: Don't fetch superfluous partitions
  query::result: Add partition count
  column_family: Use counters in query::result::builder
  query_result_builder: Use the underlying counters
  mutation_partition: Count partitions in query_compacted
  mutation_partition: Remove tabs in query_compacted
  query::result::builder: Add partition count
  query_result_merger: Limit partitions

(cherry picked from commit 6bb875bdb7)
2016-12-27 16:36:27 +02:00
Avi Kivity
52e5706147 Point seastar submodule at scylla-seastar.git
Allow separate management of Scylla 1.6's version of seastar.
2016-12-27 16:33:22 +02:00
Pekka Enberg
cec07ea366 release: prepare for 1.6.rc0 2016-12-27 12:41:34 +02:00
Benoît Canet
cbe729415e scylla_setup: Use blkid or ls to list potentials block devices
blkid does not list root raw device.

Revert to lsblk while taking care of having a fallback
path in case the -p option is not supported.

Fixes #1963.

Suggested-by: Avi Kivity <avi@scylladb.com>
Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <20161225100204.13297-1-benoit@scylladb.com>
(cherry picked from commit a24ff47c63)
2016-12-27 12:34:56 +02:00
Raphael S. Carvalho
b5190f9971 db: avoid excessive disk usage during sstable resharding
Shared sstables will now be resharded in the same order to guarantee
that all shards owning an sstable will agree on its deletion at nearly
the same time, thereby reducing the disk space requirement.
That's done by picking which column family to reshard in UUID order,
and each individual column family will reshard its shared sstables
in generation order.

Fixes #1952.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <87ff649ed24590c55c00cbb32bffd8fa2743e36e.1482342754.git.raphaelsc@scylladb.com>
(cherry picked from commit 27fb8ec512)
2016-12-27 12:19:25 +02:00
Tomasz Grabiec
7739456ec2 sstables: Fix double close on index and data files when writing fails
File output streams take responsibility for closing the file: they
will close the file as part of closing the stream.

During sstable writing we create sstable object and keep file
references there as well. Sstable object also has responsibility for
closing the files, and does so from sstable::~sstable().

Double close was supposed to be avoided by a construct like this:

  writer.close().get();
  _file = {};

However if close() failed, which can happen when write-ahead failed,
_file would not be cleared, and both the writer and sstable would
close the file. This will result in a crash in
append_challenged_posix_file_impl::close(), which is not prepared to
be closed twice.

Another problem is that if an exception happened before we reached that
construct, we should still close the writer. Currently we don't, so
there's no double close on the file, but that's a bug which needs to
be fixed and once that's fixed double close on _file will be even more
likely.

The fix employed here is to not keep files inside sstable object when
writing. As soon as the writer is constructed, it's the only owner of
the file.

Fixes #1764.

Message-Id: <1482428648-22553-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit f2a63270d1)
2016-12-27 11:11:51 +02:00
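The ownership rule adopted here can be modeled in a few lines (toy types with invented names; the real code transfers a seastar file into the output stream):

```cpp
#include <cassert>
#include <memory>
#include <utility>

struct file_handle {
    int* close_count; // counts close() calls so a double close is detectable
    explicit file_handle(int* c) : close_count(c) {}
    void close() { ++*close_count; }
};

// Once the writer is constructed, it is the file's only owner; the
// sstable-like object keeps no second reference that could close the
// file again from its destructor.
struct stream_writer {
    std::unique_ptr<file_handle> f;
    explicit stream_writer(std::unique_ptr<file_handle> fh) : f(std::move(fh)) {}
    // Idempotent: a repeated close() is a no-op instead of a second close.
    void close() {
        if (f) {
            f->close();
            f.reset();
        }
    }
};
```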
Takuya ASADA
e4da0167d4 dist/redhat: don't try to adduser when user is already exists
Currently we get "failed adding user 'scylla'" on .rpm installation when user is already exists, we can skip it to prevent error.

Fixes #1958

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1482550075-27939-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit f3e45bc9ef)
2016-12-27 09:47:52 +02:00
Vlad Zolotarov
a3266060e3 tracing: don't start tracing until a Tracing service is fully initialized
RPC messaging service is initialized before the Tracing service, so
we should prevent creation of tracing spans before the service is
fully initialized.

We will use an already existing "_down" state and extend it in a way
that !_down equals "started", where "started" is TRUE when the local
service is fully initialized.

We will also split the Tracing service initialization into two parts:
   1) Initialize the sharded object.
   2) Start the tracing service:
      - Create the I/O backend service.
      - Enable tracing.

Fixes issue #1939

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1481836429-28478-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 62cad0f5f5)
2016-12-21 12:49:41 +02:00
Glauber Costa
f6c83f73ef track streaming and system virtual dirty memory
A case could be made that we should have counters for them no matter
what, since it can help us reason about the distribution of memory among
the groups. But with the hierarchy being broken in 1.5 it becomes even
more important. Now by looking solely at dirty, we will have no idea
about how much memory we are using in those groups.

After this patch, the dirty_memory_manager will register its metrics
for the 3 groups that we have, and the legacy names will be used to show
totals.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <0d04ca4c7e8472097f16a5dc950b77c73766049e.1481831644.git.glauber@scylladb.com>
(cherry picked from commit 7133583797)
2016-12-21 12:44:30 +02:00
Avi Kivity
293876c72f Merge "Limit number of readers streaming uses" from Paweł
"The original, naive db::make_streaming_reader() implementation created a set
of memtable and sstable readers for every partition range. This caused
a bad interaction with the code limiting sstable reader concurrency and
was suboptimal.

This series introduces multi range mutation reader that takes mutation
source and a sorted, disjoint vector of ranges. It creates only a single
set of memtable and sstable readers and fast forwards it to the next
range once the current one is completed."

* 'pdziepak/multi-range-reader/v1' of github.com:cloudius-systems/seastar-dev:
  db: use multi range reader for streaming readers
  dht: describe split_range[s]_to_shards() guarantees
  repair: remove outdated fixme
  test/mutation_reader_test: add multi_range_reader test
  tests/mutation_reader: extract key creation code
  mutation_reader: add multi_range_reader
2016-12-15 17:48:31 +02:00
Paweł Dziepak
cf679a413c db: use multi range reader for streaming readers
A naive approach was to create a set of readers for each range and pass
them all to combining reader. This however performed badly if the number
of ranges was high.

The solution is to use multi range reader which uses only a single set
of readers and fast forwards from range to range when necessary. This
adds another requirement that the ranges passed to
make_streaming_reader() are sorted and disjoint.
2016-12-15 13:54:43 +00:00
Paweł Dziepak
b86a826baf dht: describe split_range[s]_to_shards() guarantees
We are going to require these functions to return sorted and disjoint
ranges. They already do so (provided that the input ranges are sorted
and disjoint), but if the guarantee is not explicitly stated it may
disappear some day.
2016-12-15 13:07:32 +00:00
Paweł Dziepak
5287417136 repair: remove outdated fixme 2016-12-15 13:07:32 +00:00
Paweł Dziepak
5b0cf20f75 test/mutation_reader_test: add multi_range_reader test 2016-12-15 13:07:32 +00:00
Paweł Dziepak
787a976c2b tests/mutation_reader: extract key creation code 2016-12-15 13:07:32 +00:00
Paweł Dziepak
52a4e79210 mutation_reader: add multi_range_reader
So far, the only way to combine outputs of multiple readers was to use
combining reader. It is very general and, in particular, supports the case
where the readers emit mutations from overlapping ranges.

However, we have cases (e.g. streaming) when we need to read from
several disjoint ranges. Combining reader is a suboptimal solution as it
requires creating a reader for each range and ignores the fact that
they do not overlap.

This patch introduces multi_range_mutation_reader which takes a
mutation_source and a sorted set of disjoint ranges. Internally, it uses
mutation_reader::fast_forward_to() to move to the next range once the
current one is completed.
2016-12-15 13:07:31 +00:00
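A toy model of the single-reader-plus-fast-forward approach (invented types standing in for mutation_reader and dht ranges):

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// One underlying reader over a sorted key stream; fast_forward_to()
// skips ahead instead of constructing a new reader per range.
struct toy_reader {
    std::vector<int> keys; // sorted keys in the mutation source
    size_t pos = 0;
    void fast_forward_to(int begin) {
        while (pos < keys.size() && keys[pos] < begin) ++pos;
    }
    bool next(int end, int& out) {
        if (pos < keys.size() && keys[pos] < end) {
            out = keys[pos++];
            return true;
        }
        return false; // current range exhausted
    }
};

// Ranges must be sorted and disjoint, mirroring the requirement this
// series places on make_streaming_reader()'s input.
std::vector<int> read_ranges(toy_reader r,
                             const std::vector<std::pair<int, int>>& ranges) {
    std::vector<int> out;
    for (auto [b, e] : ranges) {
        r.fast_forward_to(b);
        for (int k; r.next(e, k);) out.push_back(k);
    }
    return out;
}
```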
Pekka Enberg
06c5216c9d Merge "Improve gossip feature logging" from Asias 2016-12-15 10:36:54 +02:00
Asias He
e578e65103 gossip: Log feature enabled message on shard zero only
A feature is per node. There is no need to log it once per shard.
2016-12-15 16:33:11 +08:00
Asias He
4137fab91b gossip: Make log in check_features debug level
We saw the message twice for the same feature check. This is a bit
confusing.

INFO  2016-12-15 11:26:23,993 [shard 0] gossip - Checking if need_features {RANGE_TOMBSTONES} in features {}
INFO  2016-12-15 11:26:23,993 [shard 0] gossip - Checking if need_features {RANGE_TOMBSTONES} in features {}
INFO  2016-12-15 11:26:23,993 [shard 0] gossip - Checking if need_features {LARGE_PARTITIONS} in features {}
INFO  2016-12-15 11:26:23,993 [shard 0] gossip - Checking if need_features {LARGE_PARTITIONS} in features {}

This is because

   ss._range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
   ss._large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);

The first message is printed when gms::feature(RANGE_TOMBSTONES_FEATURE)
is constructed. The second message is printed when the
ss._range_tombstones_feature is copy-constructed.
2016-12-15 16:33:10 +08:00
Asias He
2b1ebc4719 gossip: Introduce gms:features::enable helper
Add a helper function to enable a feature and log that the feature is
enabled.

When a feature is enabled, we see

INFO  2016-12-15 11:29:32,443 [shard 0] gossip - Feature LARGE_PARTITIONS is enabled
INFO  2016-12-15 11:29:32,443 [shard 0] gossip - Feature RANGE_TOMBSTONES is enabled

in the log.
2016-12-15 16:33:10 +08:00
Paweł Dziepak
b70e5d2089 Merge seastar upstream
Submodule seastar 6fbd792..0b98024:
  > fstream: fix read ahead byte metric types
  > fstream: add read-ahead metrics
  > future-util: make stop_iteration use bool_class<>
  > util: introduce bool_class<Tag>
2016-12-14 15:01:13 +00:00
Avi Kivity
57f4910832 Merge "Query result size limiting" from Paweł
"This series makes Scylla limit size of query results it produces in case they
grow unreasonably large. This is possible because CQL paging queries do not
guarantee that the returned page is going to have page_size rows and pages
smaller than tha *do not* indicate end of stream. Non-paged queries and Thrift
requests do not have such flexibility and they also get all the requested data
(though their memory usage is still accounted for and may limit paged queries).

There is a maximum result size (1 MB) and all results builders will stop after
reaching it. Moreover, there is a per-shard limitation on the amount of memory
used by all results combined (10%). To avoid tiny results a query has to
reserve (wait if necessary) 4 kB before starting executing, after that it can
consume more memory without any additional waiting provided it is below
individual and shard-local limits.

Enabling the cluster to return less rows than requested also means some changes
for the coordinator. Firstly, if it receives such short result from a replica
retrying it with a larger limit obviously makes no sense whatsoever. Instead,
in such cases the coordinator removes the clustering rows it has incomplete
information about and sends a short result back to the client. Moreover, even
if no replica returned a short response, reconciliation may have made it so. In
this case, the coordinator does not necessarily need to retry the query either.
Unfortunately, with the current implementation short responses ruin data
queries since they will cause a digest mismatch.

Three new metrics were added:
 * database_bytes_total_result_memory -- total memory used by query results
 * database_total_operations_short_data_queries -- data queries that were
   limited by size, particulary bad as it basically forces coordinator to
   retry them as mutation queries
 * database_total_operations_short_mutation_queries -- mutation queries limited
   by size"

* 'pdziepak/short-paged-reads/v4' of github.com:cloudius-systems/seastar-dev:
  storage_proxy: clean up after primary_key introduction
  cql3: allow short reads with paged queries
  storage_proxy: handle intentional short reads
  storage_proxy: make sure coordinator has complete data
  storage_proxy: honour partition limit
  storage_proxy: use cmd limits to determine that replica reached end
  db: add metrics for short reads and memory used for results
  data_query: limit result size
  mutation_query: limit result size
  db: create result_memory_accounters when starting query
  query_builder: add partition_slice getter
  reconcilable_result: keep result_memory_tracker object
  mutation_compactor: honour stop_iteration from consumers
  db: add result_memory_limiter
  query: add result size limiter
  reconcilable_result: properly propagate short_read flag
  query_pagers: handle short reads properly
  query: allow short reads
  serializer_impl: add serializer for bool_class<Tag>
2016-12-14 16:53:07 +02:00
Paweł Dziepak
4c69d7e2fe storage_proxy: clean up after primary_key introduction
primary_key was introduced as a replacement for
std::pair<dht::decorated_key, std::optional<clustering_key>>. In order
to simplify the patch introducing it, its fields were named 'first' and
'second'. This patch changes the names to something more meaningful,
removes old row_address alias and removes is_missing_rows() in favour of
primary_key::less_compare_clustering comparator.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:37 +00:00
Paweł Dziepak
dde4bd5051 cql3: allow short reads with paged queries
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:37 +00:00
Paweł Dziepak
3c173d87b5 storage_proxy: handle intentional short reads
If the result is going to be too large the replica may decide to make it
shorter and coordinator should handle this properly (i.e. do not retry).
Moreover, coordinator could avoid some retries by setting the short_read
flag itself.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:37 +00:00
Paweł Dziepak
dd67de7218 storage_proxy: make sure coordinator has complete data
got_incomplete_information() ensures that the coordinator has received
all required data from all replicas.
(see 77dbe3c12f "storage_proxy: fix
reconciliation with limits" for the examples when that may not be the
case).

However, this function is called only if reconciled result has at least
as much rows as the user asked for. This was correct when we had only
total row limit: if the result was shorter than that either all replicas
sent all data they have or the coordinator will retry anyway. However,
since then we got partition limit and per partition row limit and a
request may be limited by one of these while being still below the total
row limit.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:36 +00:00
Paweł Dziepak
2ff5308d8e storage_proxy: honour partition limit
At the moment the coordinator does not care much for the partition
limit. In particular it doesn't check whether after reconciliation the
result still contains enough partitions.

This patch makes it honour the partition limit and increase it in the
retried queries if necessary.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:36 +00:00
Paweł Dziepak
7bed7aa7de storage_proxy: use cmd limits to determine that replica reached end
Coordinator may retry a query with larger limits. However, the code
determining whether a replica has no more data always used the original
limits. This may cause a livelock.

For example, consider cluster having the following partitions (deletions
cover live cells):

node1:
pk=0, v=0
pk=1, v=1

node2
delete pk=0
delete pk=1
pk=2, v=2
pk=3, v=3

Now, if there is a query SELECT * FROM cf LIMIT 2 the first node is
going to send partitions 0 and 1 while the second node is going to send 2
and 3 + tombstones for 0 and 1. The coordinator will decide that it
needs to retry the request with larger row limit since node1 may have
some information about partitions 2 and 3 that are newer than what node2
has sent.

However, when the second response arrives node1 will still send only two
rows since it has no more data. Because the coordinator uses the original
row limit it will not notice that this node reached the end and we are
going to get another retry without making any progress.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:36 +00:00
Paweł Dziepak
cfd4d0f680 db: add metrics for short reads and memory used for results
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:28:36 +00:00
Paweł Dziepak
ba51e7e8db data_query: limit result size
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
f1b9f49f2b mutation_query: limit result size
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
6c33a4f177 db: create result_memory_accounters when starting query
This patch ensures that when we start executing a query, a minimum result
size is reserved from result_memory_limiter.

Moreover, range queries need a way of merging memory usage information
from different shards.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
0bce4047bd query_builder: add partition_slice getter
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
15de8de9e5 reconcilable_result: keep result_memory_tracker object
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
34f9eb4cbd mutation_compactor: honour stop_iteration from consumers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
5d7185fd39 db: add result_memory_limiter
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
ee89d80d5c query: add result size limiter
This patch introduces an infrastructure for limiting result size.

There is a shard-local limit which makes sure that all results combined
do not use more than 10% of the shard memory.
There is also an individual limit which restricts a result to 4 MB.

In order to avoid sending tiny results there is a minimum guaranteed size
(4 kB), which the query needs to reserve before it starts producing the
result.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
43fe3439ca reconcilable_result: properly propagate short_read flag
reconcilable_result can be merged with another or transformed into
query::result. Make sure that short_read information is never lost.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
837d24f1b2 query_pagers: handle short reads properly
Currently, the paging implementation assumes that the server returns
either as many rows as it was asked for or has reached the end. Soon,
that's not going to be true, so instead of making any assumptions about
the number of rows returned, use the new "short read" flag to
determine whether there is going to be more data.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:02 +00:00
Paweł Dziepak
da7ca85040 query: allow short reads
When paging is used the cluster is allowed to return fewer rows than the
client asked for. However, if that possibility is used we need a way of
telling that to the coordinator and the paging implementation so that
they can differentiate between short reads caused by the replica running
out of data to send and short reads caused by any other means.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:01 +00:00
Paweł Dziepak
7a15c89b1d serializer_impl: add serializer for bool_class<Tag>
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-12-14 14:10:01 +00:00
Takuya ASADA
8918a4be57 dist/common/scripts/scylla_setup: don't abort scylla_setup when each setup script failed
Instead of aborting scylla_setup, print a warning message and continue with the next setup step.

Fixes #1357

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481713664-18429-1-git-send-email-syuu@scylladb.com>
2016-12-14 13:31:50 +02:00
Tomasz Grabiec
c9344826e9 tests: Remove unintentional enablement of trace-level logging
Sneaked in by mistake.
2016-12-14 10:58:07 +01:00
Tomasz Grabiec
fe6a70dba1 tests: commitlog: Fix assumption about write visibility
The test assumed that mutations added to the commitlog are visible to
reads as soon as a new segment is opened. That's not true because
buffers are written back in the background, and a new segment may be
active while the previous one is still being written or not yet
synced.

Fix the test so that it expects that the number of mutations read
this way is <= the number of mutations added, and that after all
segments are synced, the two numbers are equal.

Message-Id: <1481630481-19395-1-git-send-email-tgrabiec@scylladb.com>
2016-12-14 11:29:33 +02:00
Avi Kivity
a61ff53150 Merge "rework flush criteria" from Glauber
"The current criteria for memtable flush is not being respected.  The
problem is demonstrated to happen when the dirty memory group is over
limit, and so is the system table extra allowance. In that situation,
both the normal region and the system table region will be under
pressure and try to flush.

More specifically, because the normal region inherits from the system
region, if the normal region is under pressure (over the soft limit
threshold), the system region will certainly be as well, even though it
has an extra allowance. This is because after virtual dirty, we start
blocking when we reach half the region, but memory itself can grow up to
100 % of the region. So the total amount of memory used will be
certainly bigger than the system pressure threshold, which is now 50 %
plus the allowance.

To fix that, this patch reworks the flush logic so that the regions are
not dependent on each other.

Fixes #1918"

* 'flush-criteria-v6' of github.com:glommer/scylla:
  config: get rid of memtable_total_space
  database: rework dirty memory hierarchy
  system keyspace: write batchlog mutation in user memory
  database: remove flush_token
  database: abstract pressure condition notification
  database: encapsulate semaphore_units into a flush_permit
  database: remove friendship declaration
  database: simplify flush_one
  database: make memtable_list aware in cases it can't flush
2016-12-14 11:24:10 +02:00
Takuya ASADA
c18a95cddf dist/redhat: add scylla_lib.sh to scylla.spec
Fix .rpm build error.

Fixes #1932

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481703992-9596-1-git-send-email-syuu@scylladb.com>
2016-12-14 10:27:37 +02:00
Glauber Costa
56df53f51e compaction_manager: fix shutdown sequence
By the time we are able to acquire this semaphore, we may be stopped
already. So we need to test it before we go ahead. I can see shutdown
hangs before this patch that are fixed with it applied.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <e5b378893128d086d584ffbb2acd3fb687648e5c.1481655433.git.glauber@scylladb.com>
2016-12-14 09:26:24 +01:00
Glauber Costa
2aa6514667 config: get rid of memtable_total_space
Those values are now statically set.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 17:05:12 -05:00
Glauber Costa
80440c0d79 database: rework dirty memory hierarchy
Issue #1918 describes a problem, in which we are generating smaller
memtables than we could, and therefore not respecting the flush
criteria.

That happens because group sizes (and limits) are inherited through the
hierarchy for pressure purposes, and the soft threshold is currently at
40 %. This causes the system group's
soft threshold to be way below regular's virtual dirty limit and close
to regular group's soft threshold. The system group was very likely to
become under soft pressure when regular was because writes to regular
group are not yet throttled when they cross both soft thresholds.

This is a direct consequence of the linear hierarchy between the regions,
and to guarantee that it won't happen we would have to acquire the semaphore
of all ancestor regions when flushing from a child region. While that
works, it can lead to problems of its own, like priority inversion if
the regions have different priorities - like streaming and regular - and
groups lower in the hierarchy, like user, blocking explicit flushes
from their ancestors.

To fix that, this patch reorganizes the dirty memory region groups so
that groups are now completely independent. As a disadvantage, when
streaming happens we will draw some memory from the cache, but we will
live with it for the time being.

Fixes #1918

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 14:07:53 -05:00
Glauber Costa
db7cc3cba8 system keyspace: write batchlog mutation in user memory
Batchlog is a potentially memory-intensive table whose workload is
driven by user needs, not system's. Move it to the user dirty memory
manager.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:35 -05:00
Glauber Costa
be9e4c71ad database: remove flush_token
We had a flush_token structure in addition to the flush_permit because
we needed to keep a pointer to the dirty_memory_manager and apply
changes to the region group upon the region destruction. Since Tomek's
latest series, this is no longer needed and now this structure doesn't
have a place in the world anymore. Simplify the code by removing it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Glauber Costa
98030ad66c database: abstract pressure condition notification
Done in a separate patch to reduce clutter in the main patch.
Soon we'll be testing for one more condition.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Glauber Costa
c9a8b03311 database: encapsulate semaphore_units into a flush_permit
We will soon need to hold more than a semaphore_units<> object per
flush, potentially.

Preparation patch for that.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Glauber Costa
2e8c7d2c62 database: remove friendship declaration
Not needed anymore since memtable started having a direct pointer to the
memtable list.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Glauber Costa
bb1509c21e database: simplify flush_one
flush_one has to make sure that we're using the correct
dirty_memory_manager object, because we could be flushing from a region
group different from the one the flush request originated in.

It's simpler to just assume flush_one will be dealing with the right
object, and use a different object instead of "this" when calling it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Glauber Costa
8ab7c04caa database: make memtable_list aware in cases it can't flush
Some of our CFs can't be flushed. Those are the ones that are not marked
as having durable writes. We treat them just the same from the point of
view of the flush logic, but they provide a function that doesn't do
anything and just returns right away.

We already had troubles with that in the past, and that also poses a
problem for an upcoming patch reworking the flush memtable pick
criteria.

It's easier, simpler, and cleaner to just make the memtable_list aware
that it can't flush. Achieving that is also not very complicated: we just
need a special constructor that doesn't take a seal function, and then we
make sure that it is initialized to an empty std::function.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-12-13 13:59:34 -05:00
Takuya ASADA
0a6312d254 dist/common/scripts/scylla_ntp_setup: fix incorrect usage of is_debian_variant
Use it as "if is_debian_variant; then".
Fixes #1931

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481644262-29383-1-git-send-email-syuu@scylladb.com>
2016-12-13 18:29:42 +02:00
Takuya ASADA
ed4cd1908f dist/common/scripts/scylla_selinux_setup: correct CentOS/RHEL detection
CentOS/RHEL is using SELinux, and it's NOT a Debian variant, so fixed from
"is_debian_variant" to "! is_debian_variant".

Fixes #1930

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481643873-28984-1-git-send-email-syuu@scylladb.com>
2016-12-13 18:29:29 +02:00
Takuya ASADA
6c0dc55495 dist/common/scripts/scylla_selinux_setup: to use is_debian_variant(), need to source /usr/lib/scylla/scylla_lib.sh
This fixes the following command-not-found error:
```
/usr/sbin/scylla_selinux_setup: line 7: is_debian_variant: command not found
```

Fixes #1929

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481643308-28637-1-git-send-email-syuu@scylladb.com>
2016-12-13 18:29:13 +02:00
Takuya ASADA
3b74c50546 dist/ubuntu: add uuidgen to package dependency
We haven't added uuidgen to the Ubuntu/Debian package dependencies, so the
scylla_setup script may abort with a command-not-found error.

Fixes #1928

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1481642385-27941-1-git-send-email-syuu@scylladb.com>
2016-12-13 18:28:48 +02:00
Duarte Nunes
1e75a4950e database: Complete query when hitting partition limit
Currently, we weren't completing a query as early as possible when it
reached the partition limit; instead we had to wait until reaching the
end of the specified partition ranges. This patch fixes that by
including a check of the partition limit in the termination condition.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>

Message-Id: <20161213114559.26438-1-duarte@scylladb.com>
2016-12-13 14:53:46 +02:00
Tomasz Grabiec
f451014785 schema: Implement operator<< for column_mapping
Message-Id: <1481310679-14074-1-git-send-email-tgrabiec@scylladb.com>
2016-12-13 12:20:46 +02:00
Tomasz Grabiec
059a1a4f22 db: Fix commitlog replay to not drop cell mutations with older schema
column_mapping is not safe to access across shards, because data_type
is not safe to access. One of the manifestations of this is that
abstract_type::is_value_compatible_with() always fails if the two
types belong to different shards.

During replay, column_mapping lives on the replaying shard, and is
used by converting_mutation_partition_applier against the schema on
the target shard. Since types in the mapping will be considered
incompatible with types in the schema, all cells will be dropped.

Fix by using column_mapping in a safe way, by copying it to the target
shard if necessary. Each shard maintains its own cache of column
mappings.

Fixes #1924.
Message-Id: <1481310463-13868-1-git-send-email-tgrabiec@scylladb.com>
2016-12-13 12:19:32 +02:00
Avi Kivity
32d55bbb4c Merge seastar upstream
* seastar 0773e98...6fbd792 (2):
  > tls: Only run our "verify" function in client session
  > Merge "Clean the metric definition" from Amnon

Includes patch from Amnon adjusting the metrics registration due to seastar
API changes.
2016-12-13 12:17:14 +02:00
Avi Kivity
6f9c317b91 Merge "Use uuid file in housekeeping" from Amnon
"This patch adds the use of uuid file to the housekeeping daily version check.
uuid file are optional, if a file is missing no uuid will be used."
2016-12-13 10:52:44 +02:00
Avi Kivity
c67782f169 Merge seastar upstream
* seastar 0a74317...0773e98 (6):
  > tls: Add support for client cetrificate verification & priority strings
  > semaphore: add consume_units
  > semaphore: add available_units()
  > thread: check need_preempt for threads in a scheduling group as well
  > tutorial: fix semaphore example, and text
  > stop_iteration: add && and || operators
2016-12-12 18:06:19 +02:00
Avi Kivity
c801cc4bd1 Merge "streaming and repair updates" from Asias
"This series:
- We can make reader with ranges
- Fix possible use after free of 'si'
- Streaming ranges now are sorted and merged
- Fix shard_begin shard_end end loop in both streaming and repair"
2016-12-12 11:32:42 +02:00
Asias He
ba54654af3 streaming: Use interval_set to sort and merge ranges
So that the ranges are sorted and have no overlaps. We then have fewer
ranges to deal with, and it can help the mutation readers optimize.

Here is an example of ranges generated by repair:

Before:

    INFO  2016-12-07 17:44:21,185 [shard 0] stream_session - cf_id =
    dec9fa90-bc3b-11e6-af78-000000000001,
    before ranges = {(-3383928698815274642, -3376937163195039606],
    (-7260764223708720005, -7251657821052234309], (-4767213984179237293,
    -4747032371925842389], (-7645879646119667643, -7589962743703481776],
    (-2340199306656526861, -2320523117224780931], (-576028861239229331,
    -560973674020019962], (-4070378863644120252, -3987599893827407860],
    (-2551584407739673151, -2498779102482524711], (-5416061903556353312,
    -5354212455975869358], (37594980457713898, 67885601051654285],
    (3083778975065200884, 3091232478835418439], (3131345970514528877,
    3187922544267434961], (5765437476661317163, 5778671293583720541],
    (5960610072466058818, 5972289771228014343], (7749618183851698485,
    7758080813117351135], (-3987599893827407860, -3899198931034439776],
    (-7251657821052234309, -7131649010279865221], (-3576581915808403133,
    -3383928698815274642], (-417850207760366422, -327959672080599465],
    (-2671876682129336880, -2551584407739673151], (-1305178847032904465,
    -1137497074548854552], (8540448858050275827, 8610171849752115483],
    (-560973674020019962, -417850207760366422], (-2498779102482524711,
    -2340199306656526861], (2394447940525988167, 2523396860109747637],
    (-6703329224557608009, -6517757811218772762], (-3675103288021821677,
    -3576581915808403133], (-5622185785296846551, -5416061903556353312],
    (8610171849752115483, 8742605005068551458], (8068079250973315241,
    8185655671734937642], (560264964510741191, 790641981923757238],
    (5581202487214475094, 5765437476661317163], (8742605005068551458,
    8923908282731801645], (-6038176423022601107, -5622185785296846551],
    (5778671293583720541, 5960610072466058818], (-3899198931034439776,
    -3675103288021821677], (8356739976149429222, 8540448858050275827],
    (-6517757811218772762, -6038176423022601107], (-8052600134279395253,
    -7645879646119667643], (-327959672080599465, 37594980457713898],
    (7758080813117351135, 8019254284118543066], (4781565016737645510,
    5067070718000527886], (2523396860109747637, 3083778975065200884],
    (-5354212455975869358, -4767213984179237293], (6784138025918878582,
    7190719703944308372], (67885601051654285, 447405341661896387],
    (-2190610927722759275, -1305178847032904465], (-4747032371925842389,
    -4070378863644120252]}, size=48

After:

    INFO  2016-12-07 17:44:21,185 [shard 0] stream_session - cf_id =
    dec9fa90-bc3b-11e6-af78-000000000001,
    after  ranges = {(-8052600134279395253, -7589962743703481776],
    (-7260764223708720005, -7131649010279865221], (-6703329224557608009,
    -3376937163195039606], (-2671876682129336880, -2320523117224780931],
    (-2190610927722759275, -1137497074548854552], (-576028861239229331,
    447405341661896387], (560264964510741191, 790641981923757238],
    (2394447940525988167, 3091232478835418439], (3131345970514528877,
    3187922544267434961], (4781565016737645510, 5067070718000527886],
    (5581202487214475094, 5972289771228014343], (6784138025918878582,
    7190719703944308372], (7749618183851698485, 8019254284118543066],
    (8068079250973315241, 8185655671734937642], (8356739976149429222,
    8923908282731801645]}, size=15
2016-12-12 11:09:26 +08:00
Asias He
e523803a5d token_metadata: Introduce interval_to_range helper
It is used to convert a boost::icl::interval<token> interval back to a
range<token>.
2016-12-12 11:09:26 +08:00
Asias He
af3d76e6ac repair: Fix a typo in the log
sucessfully -> successfully
2016-12-12 11:09:26 +08:00
Asias He
374324e6fb repair: Fix shard_begin and shard_end
A range now alternates between different shards: the first part of the
range goes to shard X, the next to shard X+1, but after a while we go
back to shard X. So we can't do a simple loop between shard_begin and
shard_end.

Fix by using the newly introduced dht::split_range_to_shards

Use the cf.make_streaming_reader with ranges to simplify the code a bit.
2016-12-12 11:09:26 +08:00
Asias He
1987264beb streaming: Make streaming reader with ranges
Now that we have the new interface to make readers with ranges, we can
simplify the code a lot.

1) Fewer readers are needed
before: one reader per range
after: at most smp::count readers

2) No foreign_ptr is needed
There is no need to forward to a shard to make the foreign_ptr for
send_info in the first phase and forward to that shard to execute the
send_info in the second phase.

3) No do_with is needed in send_mutations since si now is a
lw_shared_ptr

4) Fix possible use after free of 'si' in do_send_mutations
We need to take a reference to 'si' when sending the mutation with the
send_stream_mutation rpc call, otherwise:
   msg1 got exception
   si->mutations_done.broken()
   si is freed
   msg2 got exception
   si is used again
The issue is introduced in dc50ce0ce5 (streaming: Make the mutation
readers when streaming starts) which is master only, branch 1.5 is not
affected.
2016-12-12 09:04:21 +08:00
Asias He
463cc4fbde dht: Introduce split_ranges_to_shards
Split ranges into a shard ranges map with the ring_position_range_sharder
helper.
2016-12-12 09:04:21 +08:00
Asias He
044c4ff44c dht: Introduce split_range_to_shards
Split a range into a shard ranges map with the ring_position_range_sharder
helper.
2016-12-12 09:04:21 +08:00
Asias He
cd2105b8bd database: make_streaming_reader for ranges
Allow making a streaming reader with a vector of ranges in addition to
a single range. This will be used soon in a following streaming patch.

We can make the reader more efficient later.
2016-12-12 09:04:21 +08:00
Duarte Nunes
ada2f1092e dht: Make i_partitioner::tri_compare pure virtual
This patch makes the i_partitioner::tri_compare() function pure
virtual as it is overridden by all partitioners.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161211172037.16496-1-duarte@scylladb.com>
2016-12-11 19:29:37 +02:00
Duarte Nunes
bb66b051ed dht: Make i_partitioner::tri_compare memory safe
This patch fixes a typo in i_partitioner::tri_compare() where we were
using std::max instead of std::min, which caused us to access random
memory and get random results.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161211165043.17816-1-duarte@scylladb.com>
2016-12-11 18:58:10 +02:00
Amnon Heiman
08dcd8cb4a scylla housekeeping ubuntu service: use uuid file
This patch adds uuid file support for ubuntu system. It also split the
behaviour between restart and daily checks. The first run in r mode and
the second in d mode.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-12-11 16:35:07 +02:00
Amnon Heiman
6fef24aaf0 housekeeping systemd service: use uuid file
This set the housekeeping systemd service to use a uuid file and use
daily mode.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-12-11 16:02:16 +02:00
Amnon Heiman
17b8306bc4 scylla-housekeeping support uuid file
Allows scylla-housekeeping getting the uuid from a file instead of the
command line.

If the file is missing no uuid will be used.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-12-11 16:00:34 +02:00
Avi Kivity
299d1fad0b Merge "reduce bloom filter overhead in compaction" from Raphael
"Function to calculate maximum purgeable timestamp is made 10 times faster when
compacting sstables that overlap with 10% of all sstables.
That's possible with an incremental selector that will incrementally select
sstables based on key being compacted.
Currently, we iterate through all non-compacting sstables and consult their
bloom filter to determine max purgeable timestamp, and that will be very
expensive for compactions that are frequently deciding whether or not to purge
tombstones."

* 'filter_overhead_fix_v4' of github.com:raphaelsc/scylla:
  compaction: reduce bloom filter overhead with incremental selector
  tests: add test for sstable set's incremental selector
  sstable_set: introduce incremental selector
  compatible_ring_position: add function to return token
2016-12-11 09:46:58 +02:00
Glauber Costa
5803957ab5 compaction: fix build
Commit 732ee275 moved tracking of one statistics value inside a lambda
without capturing this in that lambda. Compilation fails as a result.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <68860640f4533dd43e43f341f1620e25464b700b.1481313455.git.glauber@scylladb.com>
2016-12-10 09:00:20 +02:00
Raphael S. Carvalho
fcfc84e836 compaction: reduce bloom filter overhead with incremental selector
The procedure to calculate max purgeable timestamp is optimized
by only visiting sstables that overlap with key being currently
compacted. That's done using incremental sstable selector.

The function to calculate the maximum purgeable timestamp is made 10 times
faster when compacting sstables that overlap with 10% of all sstables.

Fixes #1322.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-12-09 16:17:17 -02:00
Raphael S. Carvalho
548f6066c5 tests: add test for sstable set's incremental selector
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-12-09 16:17:17 -02:00
Raphael S. Carvalho
02541e15c1 sstable_set: introduce incremental selector
Incrementally select sstables from sstable set using token
in ascending order.
For leveled strategy, it returns all sstables that belong
to the current interval. For other strategies, it just returns
all sstables from the set.
Useful for compaction which needs all sstables that overlap
with key being currently compacted to calculate maximum
purgeable timestamp.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-12-09 16:17:16 -02:00
Glauber Costa
9b5e6d6bd8 commitlog: correctly report requests blocked
The semaphore future may be unavailable for many reasons. Specifically,
if the task quota is depleted right between sem.wait() and the .then()
clause in get_units() the resulting future won't be available.

That is particularly visible if we decrease the task quota, since those
events will be more frequent: we can in those cases clearly see this
counter going up, even though there aren't more requests pending than
usual.

This patch improves the situation by replacing that check. We now verify
whether or not there are waiters in the semaphore.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <113c0d6b43cd6653ce972541baf6920e5765546b.1481222621.git.glauber@scylladb.com>
2016-12-09 15:02:26 +02:00
Raphael S. Carvalho
732ee275f8 compaction: fix running compaction counter when splitting sstables
The counter was being increased before taking the semaphore, so
every pending split would count as a running compaction, which
misleads the user.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <f2050cc3599cee7af29d4579368a154708b37731.1481248048.git.raphaelsc@scylladb.com>
2016-12-09 15:01:43 +02:00
Raphael S. Carvalho
453620a316 compatible_ring_position: add function to return token
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-12-08 14:25:29 -02:00
Avi Kivity
872b5ef5f0 sstables: fix probe with Unknown component
Commit 53b7b7def3 ("sstables: handle unrecognized sstable component")
ignores unrecognized components, but misses one code path during probe_file().

Ignore unrecognized components there too.

Fixes #1922.
Message-Id: <20161208131027.28939-1-avi@scylladb.com>
2016-12-08 15:24:25 +01:00
Glauber Costa
733d87fcc6 database: try to acquire semaphore before we start flush
As Tomek pointed out, since we are starting the flush before we acquire the
semaphore, we are not really limiting parallelism, but only delaying the
end of the flush instead.

Fixes #1919

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <6cbf9ec2f3a341c76becf94f794cfa16539c5192.1481120410.git.glauber@scylladb.com>
2016-12-08 12:18:32 +01:00
Tomasz Grabiec
3511bf4a81 Merge branch 'tgrabiec/memtable-gentle-clearing' from seastar-dev.git
When row cache is disabled, update_cache() will do nothing to the
memtable. Active readers may keep the memtable alive for an unbounded
amount of time, preventing it from going away. This doesn't play well
with virtual dirty accounting. Soon before calling update_cache(), the
memory which was subtracted during flush is added back to the amount
of virtual dirty memory. If there was write pressure all along, we
will be at the dirty memory limit. When we give back subtracted memory
this will put virtual dirty way above the limit. This will stall all
writes until another memtable flush drags virtual dirty down or
readers finally release the memtable. We want to prevent upward
jumps of virtual dirty.

First part of the fix is to ensure that as long as the memtable's
region is in the dirty group, we will not revert flushed memory. This
must happen synchronously from region's memory being removed from the
group in order to prevent upward virtual dirty jumps. To make this
easier, tracking of flushed memory was moved to the memtable object.

Another part of the fix is to gradually clear the memtable when cache
is disabled in a similar fashion as when it's moved to cache. This
ensures that the actual memory held by the memtable's region is released
before the memtable itself dies.

Refs #1879
2016-12-08 12:18:32 +01:00
Gleb Natapov
a05516f14c storage_proxy: wire up range_slice_timeouts, range_slice_unavailables and read_unavailables counters
Message-Id: <20161206105154.GL1866@scylladb.com>
2016-12-08 11:42:52 +02:00
Avi Kivity
5530a61975 stables: fix build with older boost (boost::variant::get<T&>)
Older boost doesn't support boost::variant::get<T&> (where the type
parameter is reference qualified); remove (unneeded anyway).
2016-12-08 10:56:05 +02:00
Pekka Enberg
0bc3ce7e09 Merge "sstables: remove sharding metadata from Statistics component" from Avi
"Due to my misreading of Cassandra code, I thought it would ignore new
components in the Statistics component; however, it doesn't, and the change
(introduced in bdd11648ac ("sstables: add
intra-node sharding metadata") breaks sstable2json and likely any
Cassandra code that touches sstables.

To fix, move the sharding data into a new component ("Scylla.db"), which
Cassandra does ignore.  The new component is designed to be extensible so
we don't experience the same issue later on."
2016-12-08 10:14:07 +02:00
Avi Kivity
7f26f9c0f9 Merge "repair refactor and fix" from Asias
* tag 'asias/repair/subranges/refactor_fix/v1' of github.com:cloudius-systems/seastar-dev:
  repair: Limit the number of sub ranges
  repair: Use estimated_keys_for_range in repair_cf_range
  repair: Extract the target_partitions into repair_info class
  repair: Put request_transfer_ranges into repair_info class
  repair: Introduce check_failed_ranges helper
  repair: Introduce do_streaming helper
  repair: Make the neighbors const reference
  repair: Introduce repair_info
  repair: Attach the repair id in the stream plan name
2016-12-08 10:06:39 +02:00
Tomasz Grabiec
f7197dabf8 commitlog: Fix replay to not delete dirty segments
The problem is that replay will unlink any segments which were on disk
at the time the replay starts. However, some of those segments may
have been created by the current node since boot. If a segment is part
of reserve for example, it will be unlinked by replay, but we will
still use that segment to log mutations. Those mutations will not be
visible to replay after a crash though.

The fix is to record preexisting segments before any new segments
have a chance to be created, and use that as the replay list.

Introduced in abe7358767.

dtest failure:

 commitlog_test.py:TestCommitLog.test_commitlog_replay_on_startup

Message-Id: <1481117436-6243-1-git-send-email-tgrabiec@scylladb.com>
2016-12-07 15:54:47 +02:00
Avi Kivity
4fedbf8430 Merge "service::storage_proxy: rework collectd counters registration" from Vlad
- Add "coordinator" and "replica" categories
   - Use a new seastar/metrics_registration framework

* 'rearrange-storage-proxy-stats-v4' of github.com:cloudius-systems/seastar-dev:
  service::storage_proxy: rework the collectd counters registration
  service/storage_proxy: regroup collectd statistics
2016-12-07 15:38:40 +02:00
Avi Kivity
3c3a18f222 sstables: move sharding metadata from Statistics component to a new Scylla component
The Cassandra derived sstable tools (and likely Cassandra itself) object to
a new sub-component in the Statistics component; create a new Scylla
component instead to host this data.
2016-12-07 15:20:13 +02:00
Avi Kivity
24140ec8c6 sstables: add support for sets of discriminated union types
Allow declaring discriminated unions (with an enum type as the
discriminant and any sstable serializable type as a value) and sets
of these unions, with the discriminant as the key.  Parsers and writers
are auto-generated.
2016-12-07 13:27:52 +02:00
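The shape being described can be sketched in plain C++ (this is an illustration of the concept, not the generated Scylla serialization code): a union discriminated by an enum, and a set of such unions keyed by the discriminant.

```cpp
#include <cassert>
#include <map>
#include <string>
#include <variant>

// Hypothetical discriminant and union: each enumerator selects one
// serializable value type.
enum class type_tag { integer, text };

using value = std::variant<int, std::string>;

// "Set" of discriminated unions: the discriminant is the key, so there
// is at most one value per discriminant.
using union_set = std::map<type_tag, value>;
```

In the sstable code the parsers and writers for such types are auto-generated; here the map/variant pair just shows the data model.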
Avi Kivity
e0cce9d299 Merge "streaming: Improve logging" from Asias
"This series adds streaming bandwidth and streaming plan name to the log when
streaming is finished."
2016-12-07 12:21:47 +02:00
Amos Kong
f32f7993cc systemd: reset housekeeping timer at each start
Currently the housekeeping timer isn't reset when we restart scylla-server.
We expect it to run at each start, consistent with the upstart script on
Ubuntu 14.04.

With this change, restarting scylla-server also restarts the housekeeping
timer; to achieve this, replace "OnBootSec" with "OnActiveSec".

Fixes: #1601

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <a22943cc11a3de23db266c52fd476c08014098c4.1480607401.git.amos@scylladb.com>
2016-12-06 18:33:37 +02:00
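The change amounts to swapping the trigger in the timer unit. A minimal sketch of such a unit (illustrative values, not the exact file Scylla ships):

```ini
[Timer]
# OnBootSec fires relative to boot time, so restarting scylla-server
# does not reset it. OnActiveSec fires relative to when the timer unit
# became active, matching the Ubuntu 14.04 upstart behaviour.
OnActiveSec=5min
OnUnitActiveSec=1d
```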
Takuya ASADA
5a5ab51254 dist/ubuntu/dep: fix incorrect file path in check for previously built .deb existence
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480667672-9453-4-git-send-email-syuu@scylladb.com>
2016-12-06 12:06:30 +02:00
Takuya ASADA
6dd6b868a6 scripts/scylla_install_pkg: support Debian
Support Debian in the installation script.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480667672-9453-3-git-send-email-syuu@scylladb.com>
2016-12-06 12:06:30 +02:00
Takuya ASADA
7f2df8f86e dist/common/scripts: introduce scylla_lib.sh
To reduce duplicated code and simplify the scripts, introduce
scylla_lib.sh, a shell library which provides functions to classify
distributions and to load all sysconfig files.

This also fixes script bugs that misdetected Debian and RHEL.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480667672-9453-2-git-send-email-syuu@scylladb.com>
2016-12-06 12:06:30 +02:00
Takuya ASADA
8464903021 dist/common/systemd/scylla-housekeeping.timer: workaround to avoid crash of systemd on RHEL 7.3
RHEL 7.3's systemd contains a known bug in timer.c:
https://github.com/systemd/systemd/issues/2632

This is a workaround to avoid hitting the bug.

Fixes #1846

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480452194-11683-1-git-send-email-syuu@scylladb.com>
2016-12-06 10:48:28 +02:00
Takuya ASADA
b2c0059da3 dist/common/scripts/scylla_coredump_setup: use systemd-coredump on Ubuntu 16.04
Ubuntu 16.04 has systemd-coredump, so it is better to use it.

Fixes #1916

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480679267-30844-1-git-send-email-syuu@scylladb.com>
2016-12-05 17:09:38 +02:00
Takuya ASADA
2976799ef2 main: fix startup failing on Ubuntu 15.10/16.04
Ubuntu 15.10/16.04 still uses Upstart to manage the GUI session (though not as init). When we launch Scylla directly from a GUI terminal on Ubuntu (not via systemctl or initctl), raise(SIGSTOP) is mistakenly called, because the GUI session sets the "UPSTART_JOB" environment variable. This does not happen when Scylla runs as a systemd service.

To avoid this, we need to verify that UPSTART_JOB == "scylla-server".
In a GUI session UPSTART_JOB is e.g. "unity7", and we must not raise(SIGSTOP) in that case.

Fixes #1199

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480620421-28967-1-git-send-email-syuu@scylladb.com>
2016-12-05 16:28:25 +02:00
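The check described above can be sketched as a small predicate (hypothetical helper name, not the actual Scylla function): only stop for Upstart when we were really launched as the scylla-server job, not as part of a GUI session.

```cpp
#include <cassert>
#include <string_view>

// Returns true only when UPSTART_JOB is set and names the scylla-server
// job; a GUI session (e.g. UPSTART_JOB=unity7) or an unset variable
// must not trigger raise(SIGSTOP).
bool should_raise_sigstop(const char* upstart_job) {
    return upstart_job != nullptr
        && std::string_view(upstart_job) == "scylla-server";
}
```

In practice the argument would come from std::getenv("UPSTART_JOB").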
Tomasz Grabiec
527ff6aa40 db: Clear memtable after flush when cache is disabled
So that memory is released gradually (impacting latency less) and
sooner than when the memtable is destroyed. Active readers may keep the
memtable alive for an unbounded amount of time.

Refs #1879
2016-12-05 12:59:09 +01:00
Tomasz Grabiec
1bba51319e memtable: Maintain virtual dirty on clear()
When a memtable is flushing, it subtracts _flushed_memory from the group's
size to gradually allow more writes. Ideally _flushed_memory would be
equal to region's size when flush ends, so the group's size would
reach zero. When the memtable and its region are gone the group size
should remain the same as after the flush. This is ensured by adding
back _flushed_memory to group's size right before the region is
removed from the group.

Calling clear() before region is removed from the group breaks the
accounting because it will shrink the region, but will not affect the
amount of memory subtracted due to _flushed_memory. So group's size
would decrease more than we want (twice the region's size). The fix is
to change clear() so that it reverts _flushed_memory by the amount by
which the region size is reduced. This will keep the group's size
constant as long as _flushed_memory > 0.
2016-12-05 12:59:09 +01:00
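The invariant above can be sketched numerically (hypothetical names; not the actual memtable code): the group's size is the region size minus _flushed_memory, and clear() must revert _flushed_memory by however much it shrinks the region so the group's size stays constant.

```cpp
#include <cassert>

// Accounting sketch: group_size = region_size - flushed_memory.
struct accounting {
    long region_size = 0;
    long flushed_memory = 0;

    long group_size() const { return region_size - flushed_memory; }

    // clear() shrinks the region by `freed`; the fix reverts
    // flushed_memory by the same amount, keeping group_size constant
    // for as long as flushed_memory > 0.
    void clear(long freed) {
        region_size -= freed;
        if (flushed_memory > freed) {
            flushed_memory -= freed;
        } else {
            flushed_memory = 0;
        }
    }
};
```

Without the revert, clear() would reduce group_size twice: once through the region shrinking and once through the already-subtracted _flushed_memory.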
Tomasz Grabiec
1b5f338c17 memtable: Track flushed memory in memtable object 2016-12-05 12:59:09 +01:00
Tomasz Grabiec
c3768fe4de memtable: Pass dirty_memory_manager& to memtable constructor
The implementation assumes that memtable's region group is owned by
dirty_memory_manager, and tries to obtain a reference to it like this:

  boost::intrusive::get_parent_from_member(_region.group(), &dirty_memory_manager::_region_group));

This is undefined behavior when the region's group does not come from
dirty manager. It's safer to be explicit about this dependency by
taking a reference to dirty_memory_manager in the constructor.
2016-12-05 12:59:09 +01:00
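The safer shape can be sketched with stand-in types (hypothetical, heavily simplified): instead of recovering the owner of the region group via boost::intrusive::get_parent_from_member(), which is undefined behavior when the group is not actually a member of a dirty_memory_manager, the memtable takes an explicit reference.

```cpp
#include <cassert>

struct region_group {};

struct dirty_memory_manager {
    region_group _region_group;
};

struct memtable {
    dirty_memory_manager& _mgr; // explicit dependency, no pointer arithmetic
    explicit memtable(dirty_memory_manager& mgr) : _mgr(mgr) {}
    region_group& group() { return _mgr._region_group; }
};
```

The dependency is now visible in the constructor signature, and there is no assumption about where the group object lives.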
Asias He
00d7a35949 utils: Put crc32 under utils namespace
It conflicts with crc in zlib.
Message-Id: <1480918984-4117-2-git-send-email-asias@scylladb.com>
2016-12-05 11:48:29 +02:00
Takuya ASADA
54ea0055fc dist/common/scripts/node_exporter_install: use curl instead of wget
CentOS and Ubuntu include curl in a minimal installation but not wget,
and we already have a dependency on curl, so we should switch to curl.

Fixes #1902

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480929047-2347-1-git-send-email-syuu@scylladb.com>
2016-12-05 11:26:36 +02:00
Asias He
86c2620b7a gossip: Skip stopping if it is not started
If an exception is triggered early in boot while doing an I/O operation,
scylla will fail because the I/O checker calls the storage service to stop
the transport services, not all of which have been initialized yet.

Scylla was failing as follow:
scylla: ./seastar/core/sharded.hh:439: Service& seastar::sharded<Service>::local()
[with Service = gms::gossiper]: Assertion `local_is_initialized()' failed.
Aborting on shard 0.
Backtrace:
  0x000000000048a2ca
  0x000000000048a3d3
  0x00007fc279e739ff
  0x00007fc279ad6a27
  0x00007fc279ad8629
  0x00007fc279acf226
  0x00007fc279acf2d1
  0x0000000000c145f8
  0x000000000110d1bc
  0x000000000041bacd
  0x00000000005520f1
  0x00007fc279aeaf1f
Aborted (core dumped)

Refs #883.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <963f7b0f5a7a8a1405728b414a7d7a6dccd70581.1479172124.git.asias@scylladb.com>
2016-12-05 09:42:37 +02:00
Asias He
49229964d0 streaming: Add streaming plan name when session is failed
Before:
[shard 0] stream_session - [Stream #fc1b66e0-b75b-11e6-b295-000000000000]
Stream failed, peers={127.0.0.1, 127.0.0.2}

After:
[shard 0] stream_session - [Stream #fc1b66e0-b75b-11e6-b295-000000000000]
Stream failed for streaming plan repair-in-29, peers={127.0.0.1, 127.0.0.2}
2016-12-05 08:20:18 +08:00
Asias He
1c47e26913 streaming: Add streaming plan name when all sessions are completed
Before:
[shard 0] stream_session - [Stream #e050b710-b758-11e6-9321-000000000000]
All sessions completed, peers={127.0.0.2}

After:
[shard 0] stream_session - [Stream #e050b710-b758-11e6-9321-000000000000]
All sessions completed for streaming plan repair-in-32, peers={127.0.0.2}
2016-12-05 08:20:18 +08:00
Asias He
984f427cb5 streaming: Log streaming bandwidth
It looks like:

[Stream #f3907fd0-a557-11e6-a583-000000000000] Session with 127.0.0.1 is complete, state=COMPLETE
[Stream #f3907fd0-a557-11e6-a583-000000000000] Session with 127.0.0.2 is complete, state=COMPLETE
[Stream #f3907fd0-a557-11e6-a583-000000000000] Session with 127.0.0.3 is complete, state=COMPLETE
[Stream #f3907fd0-a557-11e6-a583-000000000000] bytes_sent = 393284364, bytes_received = 0, tx_bandwidth = 17.048 MiB/s, rx_bandwidth = 0.000 MiB/s
[Stream #f3907fd0-a557-11e6-a583-000000000000] All sessions completed, peers={127.0.0.1, 127.0.0.2, 127.0.0.3}

Fixes #1826
2016-12-05 08:20:18 +08:00
Asias He
4ae5781e40 repair: Limit the number of sub ranges
A range is divided into N sub ranges so that each sub range contains 100
partitions, so N depends on the number of partitions in that range. N
can grow unbounded, and with it the memory usage of the vector holding
these sub ranges.

Limit the maximum number of sub ranges a range can be divided into.

The downside is that capping the number of sub ranges makes us include
more partitions per checksum.

Fixes #1917
2016-12-05 08:12:48 +08:00
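The computation being described can be sketched as follows (the constants and the helper name are hypothetical; the actual cap lives in the repair code):

```cpp
#include <cassert>
#include <algorithm>

// One sub range per ~100 partitions, but capped so the vector holding
// the sub ranges cannot grow without bound.
long sub_ranges_for(long estimated_partitions,
                    long partitions_per_sub_range = 100,
                    long max_sub_ranges = 1024) {
    long n = std::max(1L, estimated_partitions / partitions_per_sub_range);
    // The cap trades memory for checksum granularity: past it, each sub
    // range simply covers more partitions.
    return std::min(n, max_sub_ranges);
}
```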
Asias He
d850b86145 repair: Use estimated_keys_for_range in repair_cf_range
Use the newly introduced interface to estimate the number of partitions in
the range.
2016-12-05 08:05:07 +08:00
Asias He
7b63cbbe0d repair: Extract the target_partitions into repair_info class
We can tune the number on a per repair basis.
2016-12-05 08:05:07 +08:00
Asias He
d9b689321e repair: Put request_transfer_ranges into repair_info class 2016-12-05 08:05:07 +08:00
Asias He
7741393059 repair: Introduce check_failed_ranges helper
To check whether there are any failed ranges and log them.
2016-12-05 08:05:07 +08:00
Asias He
f8d7aa597b repair: Introduce do_streaming helper
To execute the stream_plans to sync data between nodes.
2016-12-05 08:05:07 +08:00
Asias He
d0a6290d4f repair: Make the neighbors const reference
We do not modify it, so make it a const reference.
2016-12-05 08:05:07 +08:00
Asias He
6d0f6c1a99 repair: Introduce repair_info
To reduce the number of parameters we pass around; this simplifies the
code a little.
2016-12-05 08:05:06 +08:00
Asias He
9be5170c07 repair: Attach the repair id in the stream plan name
So that we know which repair id this stream plan belongs to.
2016-12-05 08:05:06 +08:00
Tomasz Grabiec
d496dfeced Update seastar submodule
* seastar 7790e68...0a74317 (2):
  > core/reactor: Move definitions out of #ifndef
  > Add systemtap-sdt-devel to fedora dependencies

Fixes #1915.
2016-12-02 10:49:17 +01:00
Vlad Zolotarov
e5e7ac1bd4 service::storage_proxy: rework the collectd counters registration
Use the new seastar's metrics_registration framework:
   - Change the registration syntax.
   - Add a long description for each counter.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-12-01 22:38:09 -05:00
Vlad Zolotarov
3bf12e4ffc service/storage_proxy: regroup collectd statistics
Instead of putting all statistics under the same "storage_proxy" category
separate them into 2 groups according to where the corresponding counters
are updated:
   - "storage_proxy_replica"
   - "storage_proxy_coordinator"

Fixes #1763

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-12-01 22:27:47 -05:00
Glauber Costa
99a5a77234 prevent commitlog replay position reordering during reserve refill
When requests hit the commitlog, each of them will be assigned a replay
position, which we expect to be ordered. If reorders happen, the request
will be discarded and re-applied. Although this is supposed to be rare,
it does increase our latencies, especially when big requests are
involved. Processing big requests is expensive and if we have to do it
twice that adds to the cost.

The commitlog is supposed to issue replay positions in order, and it
could be that the code that adds them to the memtables will reorder
them. However, there is one instance in which the commitlog will not
keep its side of the bargain.

That happens when the reserve is exhausted, and we are allocating a
segment directly at the same time the reserve is being replenished.  The
following sequence of events with its deferring points will illustrate
it:

on_timer:

    return this->allocate_segment(false). // defer here // then([this](sseg_ptr s) {

At this point, the segment id is already allocated.

new_segment():

    if (_reserve_segments.empty()) {
	[ ... ]
        return allocate_segment(true).then ...

At this point, we have a new segment that has an id that is higher than
the previous id allocated.

Then we resume the execution from the deferring point in on_timer():

    i = _reserve_segments.emplace(i, std::move(s));

The next time we need to allocate a segment, we'll pick it from the
reserve. But the segment in the reserve has an id that is lower than the
id that we have already used.

Reorders are bad, but this one is particularly bad: because the reorder
happens with the segment id side of the replay position, that means that
every request that falls into that segment will have to be reinserted.

This bug can be a bit tricky to reproduce. To make it more common, we
can artificially add a sleep() fiber after the allocate_segment(false)
in on_timer(). If we do that, we'll see a sea of reinsertions going on
in the logs (if dblog is set to debug).

Applying this patch (keeping the sleep) will make them all disappear.
We do this by rewriting the reserve logic, so that the segments always
come from the reserve. If we draw from a single pool all the time, there
is no chance of reordering happening. To make that more amenable, we'll
have the reserve filler always running in the background and take it out
of the timer code.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <49eb7edfcafaef7f1fdceb270639a9a8b50cfce7.1480531446.git.glauber@scylladb.com>
2016-12-01 13:20:46 +01:00
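The single-pool idea can be sketched like this (hypothetical shape, much simpler than the real segment manager): every segment id is handed out from one queue that a single background filler feeds, so ids are consumed in allocation order and a replay position can never be observed out of order.

```cpp
#include <cassert>
#include <cstdint>
#include <deque>

struct segment_pool {
    uint64_t next_id = 0;
    std::deque<uint64_t> reserve;

    // Run by the background filler fiber.
    void fill_one() { reserve.push_back(next_id++); }

    // Allocation path: never create a segment directly; always draw
    // from the reserve, so there is a single source of ordering.
    uint64_t new_segment() {
        if (reserve.empty()) {
            fill_one();
        }
        uint64_t id = reserve.front();
        reserve.pop_front();
        return id;
    }
};
```

The bug arose precisely because two creation paths (direct allocation and reserve refill) could interleave across deferring points; with one pool there is nothing to interleave.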
Tomasz Grabiec
570fc0008b scylla-gdb: Fix lookup of symbols in 'scylla ptr'
Message-Id: <1480529617-26564-1-git-send-email-tgrabiec@scylladb.com>
2016-12-01 12:33:29 +02:00
Raphael S. Carvalho
b30a2cb21a lcs: generate info that preserves token distribution in higher levels
The information (last compacted keys) is lost after the node is restarted
or the schema is updated, which causes the strategy to be rebuilt.
We need it for strategy to guarantee uniform distribution of token
range across sstables, or we could end up with 1 sstable of level L
overlapping with lots of sstables of level L+1, and that results in
a compaction of undesired length.
That information can be generated from scratch by getting last key
of newest sstable in each level > 0.

Fixes #1906.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <35ebd15977d5a8418239febb160c796cdc0e98fa.1480533805.git.raphaelsc@scylladb.com>
2016-12-01 11:19:58 +02:00
Raphael S. Carvalho
38743c1948 sstables: provide write time of data component
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <59686148149f2159990329775e0cd8780bc54254.1480533805.git.raphaelsc@scylladb.com>
2016-12-01 11:19:57 +02:00
Glauber Costa
d7256e7b21 database: do not call seal directly from the streaming timer
Streaming memtable have a delayed mode where many flushes are coalesced
together into one, with the actual flush happening later and propagated
to all the previous waiters.

However, the timer that triggers the actual flush was not using the
newly introduced flush infrastructure. This was a minor problem because
those flushes wouldn't try to take the semaphore, and so we could have
many flushes going on at the same time.

What was a potential performance issue became a correctness issue when
we moved the reversal of the dirty memory accounting out of
revert_potentially_cleaned_up_memory() into remove_from_flush_manager().

Since the latter is only called through the flush infrastructure, it
simply wasn't called. So the deferral of the reversal exposed this bug.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <0d5755375bc27524b8cfb9970c76d492b14d9eea.1480522742.git.glauber@scylladb.com>
2016-11-30 18:00:55 +01:00
Tomasz Grabiec
c35e18ba12 tests: Fix use-after-free on commitlog
Only shutdown() ensures all internal processes are complete. Call it before calling clear().

Message-Id: <1480495534-2253-1-git-send-email-tgrabiec@scylladb.com>
2016-11-30 11:03:26 +02:00
Avi Kivity
281b4c64ea Update ami submodule
* dist/ami/files/scylla-ami 25e101f...d5a4397 (1):
  > scylla_install_ami: allow specify different repository for Scylla installation and receive update
2016-11-29 19:26:49 +02:00
Takuya ASADA
17ef5e638e dist/ami: allow specifying different repositories for Scylla installation and for receiving updates
This fix splits build_ami.sh --repo into three different options:
 --repo-for-install is for Scylla package installation, only valid
 during AMI construction.

 --repo-for-update will be stored at /etc/yum.repos.d/scylla.repo, to
 receive update package on AMI.

 --repo is both, for installation and update.

Fixes #1872

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480438858-6007-1-git-send-email-syuu@scylladb.com>
2016-11-29 19:26:07 +02:00
Avi Kivity
5ea235e3e8 Merge "Prevent overloading memory with timed out writes" from Tomasz
"The goal of this series is to prevent unbounded memory use
in cases when requests are timing out. Write requests which timed
out may still occupy memory for a while because of local mutation
application. This memory is not accounted for and can build up.

First part of the fix changes local mutation application so that it times out
at about the same time as the request handler. Then the life
time of the request handler is extended to cover any background activity
of that request which hasn't timed out yet. This has two main effects:

  (1) by timing out local writes we prevent build up of background activity
      for timed out requests

  (2) we ensure that memory used by background activity is not left
      behind unaccounted for. This will prevent CQL server from admitting
      more requests than memory usage limit allows.

Fixes #1756."

* tag 'tgrabiec/prevent-oom-on-timeouts-v5' of github.com:cloudius-systems/seastar-dev:
  storage_proxy: Do not flood logs with timeout errors
  database: Add counter for timed out writes
  storage_proxy: Delay timeout response until background work ceases
  storage_proxy: Propagate timeout to local writes
  storage_proxy: Use shared ownership for abstract_write_response_handler
  storage_proxy: Add counter for all alive write handlers
  db: Allow writes to be timed out
  db: Introduce counters for failed reads and writes
  commitlog: Allow allocations to be timed out
  utils/logalloc: Add ability to timeout run_when_memory_available() task
  utils/flush_queue: Add ability to wait with a timeout
2016-11-29 18:55:52 +02:00
Avi Kivity
28a5ff51cb dist: add build dependency on systemtap-sdt
Needed by newer seastar.
2016-11-29 18:49:51 +02:00
Tomasz Grabiec
48bbd6733c storage_proxy: Do not flood logs with timeout errors
Timeout errors are flooding the log after local mutate can time
out. We don't log remote mutate timeouts, so for consistency we won't
log local ones as well.

There is a database counter for timed out writes which can be
consulted in order to check if they're occurring.

Perhaps this would be better solved by a generic log message
throttling/coalescing mechanism, but that's not ready yet.
2016-11-29 16:40:59 +01:00
Tomasz Grabiec
b5d5612f98 database: Add counter for timed out writes 2016-11-29 16:40:59 +01:00
Tomasz Grabiec
14cb31f69a storage_proxy: Delay timeout response until background work ceases
Write requests which timed out may still occupy memory for a while due
to local write. It should time out soon as well but there is a time
window in which it has not yet. If we don't delay timeout response,
the request would be seen as not consuming any memory too early. This
in turn would cause the CQL server to allow more requests than we
want. In some cases causing OOM or exceeding memory limits and causing
excessive cache eviction.

Fixes #1756.
2016-11-29 16:40:59 +01:00
Tomasz Grabiec
ba3779802f storage_proxy: Propagate timeout to local writes 2016-11-29 16:40:59 +01:00
Tomasz Grabiec
6d195a1538 storage_proxy: Use shared ownership for abstract_write_response_handler 2016-11-29 16:40:58 +01:00
Tomasz Grabiec
5805330d98 storage_proxy: Add counter for all alive write handlers
Currently the counter uses _response_handlers.size(), but after later
patches we may have an active (timed out) write with no response
handler, so count live instances instead.
2016-11-29 16:40:58 +01:00
Tomasz Grabiec
2c561ecaed db: Allow writes to be timed out 2016-11-29 16:40:58 +01:00
Tomasz Grabiec
b1ae6ad2ad db: Introduce counters for failed reads and writes 2016-11-29 16:40:58 +01:00
Tomasz Grabiec
31645e2c4a commitlog: Allow allocations to be timed out 2016-11-29 16:40:58 +01:00
Tomasz Grabiec
e14caaef60 utils/logalloc: Add ability to timeout run_when_memory_available() task 2016-11-29 16:40:58 +01:00
Tomasz Grabiec
61d81617e1 utils/flush_queue: Add ability to wait with a timeout 2016-11-29 16:40:58 +01:00
Raphael S. Carvalho
a16425833c size_tiered: do not recreate bucket when it goes beyond max threshold
Problem will cause size tiered to return small jobs when there are
more than max_threshold sstables of similar size. For example, if
max_threshold is 32, and there are 36 sstables of similar size,
strategy will only return 4 sstables to be compacted. That's because
we incorrectly create a new bucket when it meets the max threshold.
What we should do is to allow buckets to grow beyond max threshold
and trim them when selecting the most suitable one for compaction.

Important to mention that estimation for size tiered will now
work better when there are more than max_threshold sstables of
similar size.

Fixes #1901.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <080bad70d6cb86eaf52ac1bdd6765ac47aab5b03.1478316140.git.raphaelsc@scylladb.com>
2016-11-29 16:56:02 +02:00
Glauber Costa
353a4cd2d4 commitlog: sync segments before acquiring semaphore on shutdown.
Sync all segments before acquiring the semaphore, otherwise we may
have to wait for the timer to kick in and push them down.
Note that we can't guarantee that no other requests were executed in the
mean time, so we have to sync again.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <aea019fe49820acce5d2b55dd5ec31e975b3436c.1480388674.git.glauber@scylladb.com>
2016-11-29 11:07:28 +02:00
Tomasz Grabiec
96c7764458 Revert "prevent commitlog replay position reordering during reserve refill"
This reverts commit 0e9b75d406.

commitlog_test fails with this:

Running 14 test cases...
ERROR 2016-11-28 20:48:00,565 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:00,578 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:10,591 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:20,601 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
tests/commitlog_test.cc(203): fatal error in "test_commitlog_discard_completed_segments": critical check dn <= nn failed
ERROR 2016-11-28 20:48:20,645 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:20,837 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
WARN  2016-11-28 20:48:20,838 [shard 0] commitlog - Exception in segment reservation: std::system_error (error system:2, No such file or directory)
ERROR 2016-11-28 20:48:20,952 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:31,064 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:31,083 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:31,098 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:31,111 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
ERROR 2016-11-28 20:48:31,113 [shard 0] commitlog - Segment reserve is full! Ignoring and trying to continue, but shouldn't happen
WARN  2016-11-28 20:48:31,116 [shard 0] commitlog - Could not allocate 16388 k bytes output buffer (16388 k required)

*** 1 failure detected in test suite "tests/commitlog_test.cc"
WARN  2016-11-28 20:48:31,117 [shard 0] commitlog - Exception in segment reservation: std::system_error (error system:2, No such file or directory)
2016-11-28 20:52:13 +01:00
Raphael S. Carvalho
f141b0cdae database: atomically add new sstables to cf when refreshing
New sstables are loaded and added in parallel, meaning that scylla can
potentially return stale data if a new sstable containing a tombstone
wasn't loaded yet. Compaction should also not run until all new sstables
are added for similar reasons.

Fix is about separating blocking and non-blocking steps to allow
atomic add of multiple new sstables.

Fixes #1368.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <14283b8a4a69127071d1fabef320a93c91817ec2.1480356073.git.raphaelsc@scylladb.com>
2016-11-28 20:30:48 +02:00
Glauber Costa
0e9b75d406 prevent commitlog replay position reordering during reserve refill
When requests hit the commitlog, each of them will be assigned a replay
position, which we expect to be ordered. If reorders happen, the request
will be discarded and re-applied. Although this is supposed to be rare,
it does increase our latencies, especially when big requests are
involved. Processing big requests is expensive and if we have to do it
twice that adds to the cost.

The commitlog is supposed to issue replay positions in order, and it
could be that the code that adds them to the memtables will reorder
them. However, there is one instance in which the commitlog will not
keep its side of the bargain.

That happens when the reserve is exhausted, and we are allocating a
segment directly at the same time the reserve is being replenished.  The
following sequence of events with its deferring points will illustrate
it:

on_timer:

    return this->allocate_segment(false). // defer here // then([this](sseg_ptr s) {

At this point, the segment id is already allocated.

new_segment():

    if (_reserve_segments.empty()) {
	[ ... ]
        return allocate_segment(true).then ...

At this point, we have a new segment that has an id that is higher than
the previous id allocated.

Then we resume the execution from the deferring point in on_timer():

    i = _reserve_segments.emplace(i, std::move(s));

The next time we need to allocate a segment, we'll pick it from the
reserve. But the segment in the reserve has an id that is lower than the
id that we have already used.

Reorders are bad, but this one is particularly bad: because the reorder
happens with the segment id side of the replay position, that means that
every request that falls into that segment will have to be reinserted.

This bug can be a bit tricky to reproduce. To make it more common, we
can artificially add a sleep() fiber after the allocate_segment(false)
in on_timer(). If we do that, we'll see a sea of reinsertions going on
in the logs (if dblog is set to debug).

Applying this patch (keeping the sleep) will make them all disappear.
We do this by rewriting the reserve logic, so that the segments always
come from the reserve. If we draw from a single pool all the time, there
is no chance of reordering happening. To make that more amenable, we'll
have the reserve filler always running in the background and take it out
of the timer code.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <2606b97df39997bcf3af84a23adf17e094ffb0b8.1480107174.git.glauber@scylladb.com>
2016-11-28 19:26:26 +01:00
Takuya ASADA
1042e40188 dist/common/scripts/scylla_kernel_check: fix incorrect document URL
Fixes #1871

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1480327243-18177-1-git-send-email-syuu@scylladb.com>
2016-11-28 13:51:19 +02:00
Avi Kivity
18df2d9e9e partition_version: fix const correctness in rows_entry_compare
Using a non-const-correct comparator results in build failures with
boost 1.55.

Fixes #1892.
Message-Id: <20161128104335.28789-1-avi@scylladb.com>
2016-11-28 10:55:12 +00:00
Avi Kivity
5358984982 Merge seastar upstream
* seastar 93c3b12...7790e68 (7):
  > core/reactor: Introduce reactor-*/dervie-busy_ns metric
  > Collectd: Hold a reference to the metrics implementation in registration
  > future: Improve comments
  > fstream: actually use dynamically adjusted buffer
  > debug: add latency detector script
  > reactor: add static probes for latency detector
  > semaphore: Fix with_semaphore() in case wait() throws
2016-11-28 11:05:59 +02:00
Avi Kivity
28857e42e7 Merge " Virtualize size_estimates system table" from Duarte
"We currently write the size_estimates system table for every schema
on a periodic basis, currently set to 5 minutes, which can interfere
with an ongoing workload.

This patchset virtualizes it such that queries are intercepted and we
calculate the results on the fly, only for the ranges the caller is interested in.

Fixes #1616"

* 'virtual-estimates/v4' of github.com:duarten/scylla:
  size_estimates_virtual_reader: Add unit test
  db: Delete size_estimates_recorder
  size_estimates: Add virtual reader
  column_family: Add support for virtual readers
  storage_service: get_local_tokens() returns a future
  nonwrapping_range: Add slice() function
  range: Find a sequence's lower and upper bounds
  system_keyspace: Build mutations for size estimates
  size_estimates: Store the token range as bytes
  range_estimates: Add schema
  murmur3_partitioner: Convert maximum_token to sstring
2016-11-28 10:12:59 +02:00
Avi Kivity
176fca5775 logalloc: use correct header for unique_ptr
<bits/unique_ptr.hh> is a libstdc++ internal header.  Use <memory> instead.
2016-11-27 23:08:04 +02:00
Glauber Costa
c32803f2f0 database: move reversion of virtual dirty state closer to update_cache.
When we finish writing a memtable, we revert the dirty memory charges
immediately. When we do that, dirty memory will grow back to what it
was, and soon (we hope) will go down again when we release the requests
for real.

During that time, we may not accept new requests. Sealing can take a
long time, specially in the face of Linux issues like the ones we have
seen in the past. It also will take proportionally more time if the
SSTables end up being small, which is a possibility in some scenarios.

This patch changes the dirty_memory_manager so that the charges won't be
reverted right after we finish the flush. Rather, we will hold on to it,
and revert it right before we update the cache. We don't need to do it
for all classes of memtable writes, because after we finish flushing,
flush_one() will destroy the hashed element anyway.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <2d5a8f6ca57d5036f4850ac163557bca59b8063d.1480004384.git.glauber@scylladb.com>
2016-11-24 18:18:15 +01:00
Raphael S. Carvalho
4781b6eb71 sstables: use nonwrapping_range::make to avoid compilation issues
GCC 5.3.1 was unable to convert bound to optional<bound>.

sstables/sstables.cc:2494:123: error: no matching function for call to
‘nonwrapping_range<dht::ring_position>::nonwrapping_range(dht::ring_position,
dht::ring_position)’
(dtr.right.exclusive ? dht::ring_position::starting_at :
dht::ring_position::ending_at)(std::move(t2)));

In file included from ./dht/i_partitioner.hh:52:0,
                 from ./query-request.hh:28,
                 from ./clustering_key_filter.hh:27,
                 from sstables/sstables.hh:35,
                 from sstables/sstables.cc:38:
./range.hh:441:14: note: candidate: nonwrapping_range<T>::nonwrapping_range(
const wrapping_range<U>&) [with T = dht::ring_position]
     explicit nonwrapping_range(const wrapping_range<T>& r)

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <95bbf984cd73a61739c8da99cf6cd5e94f1d1457.1479954360.git.raphaelsc@scylladb.com>
2016-11-24 11:26:16 +02:00
Duarte Nunes
cc3f26c993 lz4: Conditionally use LZ4_compress_default()
Since not all distributions have a version of LZ4 with
LZ4_compress_default(), we use it conditionally.

This is especially important beginning with version 1.7.3 of LZ4,
which deprecates the LZ4_compress() function in favour of
LZ4_compress_default() and thus prevents Scylla from compiling
due to the resulting deprecation warning.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161124092339.23017-1-duarte@scylladb.com>
2016-11-24 11:25:03 +02:00
Avi Kivity
1be95b1227 Merge seastar upstream
* seastar d6f26d8...93c3b12 (3):
  > rpc: Conditionally use LZ4_compress_default()
  > queue: allow queue to change its maximum size
  > util/defer: add missing return to move assignment
2016-11-24 11:00:53 +02:00
Duarte Nunes
a527ba285f thrift: Don't apply cell limit across rows
In Thrift, SliceRange defines a count that limits the number of cells
to return from that row (in CQL3 terms, it limits the number of rows
in that partition). While this limit is honored in the engine, the
Thrift layer also applies the same limit, which, while redundant in
most cases, is used to support the get_paged_slice verb.

Currently, the limit is not being reset per Thrift row (CQL3
partition), so in practice, instead of limiting the cells in a row,
we're limiting the rows we return as well. This patch fixes that by
ensuring the limit applies only within a row/partition.
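The fix can be sketched in isolation (hypothetical helper, not the actual Thrift-layer code): the cell budget is reset for every row instead of one budget decaying across rows.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical sketch of the fix: `limit` is a per-row cell budget,
// reset for each row, rather than shared across all rows.
std::vector<std::vector<int>>
limit_cells_per_row(std::vector<std::vector<int>> rows, std::size_t limit) {
    for (auto& row : rows) {
        if (row.size() > limit) {
            row.resize(limit);  // the limit applies within this row only
        }
    }
    return rows;
}
```

With a shared budget, the second row above would have been truncated to zero cells; resetting per row keeps both rows intact up to the limit.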

Fixes #1882

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161123220001.15496-1-duarte@scylladb.com>
2016-11-24 10:38:31 +02:00
Takuya ASADA
ce80fb3a39 dist/ubuntu: increase number of open files on Ubuntu 14.04(upstart)
Follow the change of NOFILE for non-systemd environments.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479975050-14907-1-git-send-email-syuu@scylladb.com>
2016-11-24 10:13:41 +02:00
Avi Kivity
d58c8aaa32 db: remove unused belongs_to_{current,other}_shard(s) functions
Obsoleted by the new sharding mechanism, but they break the build for some.
2016-11-23 21:39:29 +02:00
Avi Kivity
b81a57e8eb config, dht: reduce default msb ignore bits to 4
With the default value of 12, a node's range is partitioned into
4096 * smp::count sub-ranges which are queried sequentially for a range
scan.  If the number of rows in the table is smaller than the required
result size, we will query all of them.  This can take so long that we
time out.

A better fix is to query multiple sub-ranges in parallel and merge them,
but for that we need to resurrect the non-sequential merger.
2016-11-23 21:25:37 +02:00
Pekka Enberg
c526a9f0be Update seastar submodule
* seastar 7473945...d6f26d8 (2):
  > semaphore_units: add missing return statement
  > metrics: Do not detroy the metrics layer if it is been used
2016-11-23 20:27:09 +02:00
Paweł Dziepak
919825a2c7 Merge "Improve sharding in large clusters" from Avi
"Clusters with a large number of nodes, or a low number of vnodes, and a
high number of shards, or a combination, suffer from an aliasing problem:
both vnodes and intra-node sharding consider the most significant bits
to select the owning node and owning shard respectively.  Since the same
bits are used for both, a low number of vnodes leads to some shards
being overcommitted relative to others.

This series fixes the problem by sharding on bits 0:47 of the token
(murmur3 partitioner only), leaving the most significant 12 bits for
vnodes.  Simulation shows that this value provides reasonable sharding
for 100-node, 30-shard clusters.

In order to prevent re-sharding sstables on each boot, the token ranges
for the sstable are stored in a new sub-component of the sstable Statistics
component. With the default 12 ignored bits we have 4096 token ranges
for non-Level-compacted SSTables, which takes some space but is still
reasonable.

Fixes #1277."
2016-11-23 11:25:53 +00:00
Glauber Costa
18b9fa3d43 dist: increase number of open files
This limit was found to be too low for production environments. It would
be hit at boot, when we're touching a lot of files from multiple shards
before deciding that we don't need them.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <87bbf43da1a67f5fa6174017205c6ef8bdb0dc3d.1479829232.git.glauber@scylladb.com>
2016-11-23 13:10:25 +02:00
Avi Kivity
07d5a20bae Wire up sharding ignore msb parameter to configuration
We might have used a fancy map<sstring, any> to pass the parameters, but
that's overkill for now.
2016-11-22 22:40:47 +02:00
Avi Kivity
8b1d689de8 partitioner: add ignore_msb parameters to byte ordered and random partitioners
Ignored; it doesn't make sense for the byte-ordered partitioner, and the
random partitioner is deprecated.
2016-11-22 21:56:42 +02:00
Avi Kivity
af16c0fac4 murmur3_partitioner: shard on the middle token bits, not most significant bits
Sharding on the most significant token bits aliases with the vnode mechanism,
which also uses the most significant bits; this requires a huge number of
vnodes to achieve good sharding.

This patch teaches the murmur3 partitioner to ignore the most significant
N bits when calculating a token's shard, so we use token bits which still have
some entropy.  In effect, this changes the token range layout from

   shard 0
   shard 1
   ...
   shard S-1

to

   shard 0
   shard 1
   ...
   shard S-1

   shard 0
   shard 1
   ...
   shard S-1

   ...

   shard 0
   shard 1
   ...
   shard S-1

Where the number of repetitions of the block is 2^(ignored msb bits).

For compatibility, the default is zero ignored bits, matching the pre-patch
state, until we wire things up.
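A minimal stand-alone sketch of the idea (illustrative, not Scylla's exact implementation): discard the ignored most-significant bits before mapping the token onto a shard, so tokens that differ only in the vnode-selecting bits land on the same shard.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative sketch: map a 64-bit token to a shard while ignoring the
// top `ignore_msb` bits, which are reserved for vnode selection.
inline unsigned shard_of(uint64_t token, unsigned shard_count, unsigned ignore_msb) {
    uint64_t t = token << ignore_msb;  // drop the vnode-selecting bits
    // Multiply-shift spreads the remaining bits uniformly over [0, shard_count).
    return static_cast<unsigned>(((unsigned __int128)t * shard_count) >> 64);
}
```

With ignore_msb = 12, two tokens that differ only in their top 12 bits map to the same shard, which is exactly the repeating block layout shown above.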
2016-11-22 21:56:42 +02:00
Avi Kivity
024c8ef8a1 db: adjust sstable load to use sstable self-reporting of shard ownership
Instead of calculating the owning shard from the sstable's partition
key range, delegate to the new sstable method for getting owning shard
information.  This insulates us from changes in the sharding algorithm.
2016-11-22 21:56:40 +02:00
Avi Kivity
98a4544e1c sstables: add method to get sstable owning shards from an unloaded sstable
When we load an sstable, we don't know beforehand which shards it belongs
to; we don't want to open it until we do.  Add a method that allows us
to read just the sharding data, without opening anything else.
2016-11-22 21:52:23 +02:00
Avi Kivity
bdd11648ac sstables: add intra-node sharding metadata
Add a metadata component that describes token ranges that are spanned by
this sstable.  With the current sharding algorithm, where each shard owns
a single token range, the first/last partition key is sufficient to
describe the sharding information, but for multi-range algorithms, this
is not sufficient.
2016-11-22 21:44:25 +02:00
Avi Kivity
316ef1d70a sstables: automate writing statistics components
Add a virtual function to metadata_base so we can loop over statistics
components when writing them.
2016-11-22 21:05:06 +02:00
Glauber Costa
13973e7f3b keep background work semaphore alive during sstable flush
We have a semaphore controlling the amount of background work generated
by the memtable flush process. However, because we are not moving it
inside the memtable post-flush continuation, the units are being
released when we start the flush and not when we finish it.

That's not the intended behavior and that can cause flushes to
accumulate.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <b7dc1866ed3473b9b1862c433d59c5ebd8575dbc.1479839600.git.glauber@scylladb.com>
2016-11-22 19:54:08 +01:00
Avi Kivity
d05b22e502 sstables: automatically calculate offsets in statistics
Instead of calculating the offset for each statistic component manually,
use a loop to iterate over all components, accumulating the offset as we
go along.
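The loop is essentially a prefix sum over component sizes; a simplified stand-alone version (hypothetical names) looks like:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Simplified sketch: compute each statistics component's on-disk offset
// by accumulating serialized sizes, instead of hand-computing each offset.
std::vector<std::size_t> component_offsets(const std::vector<std::size_t>& sizes) {
    std::vector<std::size_t> offsets;
    offsets.reserve(sizes.size());
    std::size_t offset = 0;
    for (auto size : sizes) {
        offsets.push_back(offset);  // offset where this component starts
        offset += size;             // the next component starts right after it
    }
    return offsets;
}
```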
2016-11-22 20:35:24 +02:00
Avi Kivity
7c5e6525ef sstables: switch statistics components to generic serialized_size() implementation 2016-11-22 20:20:38 +02:00
Avi Kivity
096ae59a5b sstables: introduce generic serialized_size()
Introduce a new function that reuses the file_writer code to compute
the serialized size of an sstable object, by serializing it into memory
and discarding the result.
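The technique can be sketched like this (illustrative names; the real code reuses the sstable file_writer): route the normal write() path through a sink that only tracks how many bytes would be emitted.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative sketch: a sink that discards bytes but counts them, so
// the regular serialization path doubles as a size calculator.
struct counting_writer {
    std::size_t size = 0;
    void write(const char*, std::size_t n) { size += n; }
};

// Stand-in for the generic write(file_writer&, const T&) overloads.
template <typename Writer>
void write(Writer& w, uint32_t v) {
    char buf[sizeof(v)];
    std::memcpy(buf, &v, sizeof(v));
    w.write(buf, sizeof(buf));
}

template <typename T>
std::size_t serialized_size(const T& v) {
    counting_writer w;
    write(w, v);  // serialize into the counter, discard the bytes
    return w.size;
}
```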
2016-11-22 20:06:23 +02:00
Avi Kivity
3c06ffac9d sstables: const correctness for the write(file_writer&, T&) functions
write() doesn't need to change its input; so change it to const.

The only snag is that describe_type() isn't and can't be made const-correct,
so cheat when it is called and const_cast the input.

This helps in writing a generic serialized_size() that is const correct,
in the next patch.
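The "cheat" can be illustrated with stand-in types (not the actual sstable code): describe_type() takes a non-const reference even though it never mutates its argument, so a const-correct write() casts constness away at the call site.

```cpp
#include <cassert>

// Stand-in illustration of the const_cast workaround described above.
struct type_describer {
    int visits = 0;
    template <typename T>
    void describe_type(T& v) { ++visits; (void)v; }  // non-const parameter, never mutates
};

template <typename T>
void write(type_describer& d, const T& v) {
    // Safe only because describe_type() does not actually modify v.
    d.describe_type(const_cast<T&>(v));
}

// Helper for exercising the sketch with a const value.
inline int describe_const_int(const int& v) {
    type_describer d;
    write(d, v);
    return d.visits;
}
```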
2016-11-22 20:04:27 +02:00
Tomasz Grabiec
eefc538225 Update seastar submodule
* seastar 7504026...7473945 (1):
  > Merge "Improve support for timeouts in primitives"
2016-11-22 17:51:29 +01:00
Glauber Costa
0b8b5abf16 commitlog: acquire semaphore earlier
Recently we have changed our shutdown strategy to wait for the
_request_controller semaphore to make sure no other allocations are
in-flight. That was done to fix an actual issue.

The problem is that this wasn't done early enough. We acquire the
semaphore after we have already marked ourselves as _shutdown and
released the timer.

That means that if there is an allocation in flight that needs to use a
new segment, it will never finish - and we'll therefore never acquire
the semaphore.

Fix it by acquiring it first. At this point the allocations will all be
done and gone, and then we can shutdown everything else.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <5c2a2f20e3832b6ea37d6541897519a9307294ed.1479765782.git.glauber@scylladb.com>
2016-11-21 22:19:32 +00:00
Avi Kivity
6bdb8ba31d storage_proxy: don't query concurrently needlessly during range queries
storage_proxy has an optimization where it tries to query multiple token
ranges concurrently to satisfy very large requests (an optimization which is
likely meaningless when paging is enabled, as it always should be).  However,
the rows-per-range code severely underestimates the number of rows per range,
resulting in a large number of "read-ahead" internal queries being performed,
the results of most of which are discarded.

Fix by disabling this code. We should likely remove it completely, but let's
start with a band-aid that can be backported.

Fixes #1863.

Message-Id: <20161120165741.2488-1-avi@scylladb.com>
2016-11-21 18:19:46 +02:00
Glauber Costa
0ca8c3f162 database: keep a pointer to the memtable list in a memtable
We currently pass a region group to the memtable, but after so many recent
changes, that is a bit too low level. This patch changes that so we pass
a memtable list instead.

Doing that also has a couple of advantages. Mainly, during flush we must
get from a memtable to a memtable_list. Currently we do that by going from
the memtable to a column family through the schema, and from there to
the memtable_list.

That, however, involves calling virtual functions in a derived class,
because a single column family could have both streaming and normal
memtables. If we pass a memtable_list to the memtable, we can keep a
pointer, and when needed get the memtable_list directly.

Not only does that get rid of the inheritance for aesthetic reasons, but
that inheritance is not even correct anymore. Since the introduction of
the big streaming memtables, we now have a plethora of lists per column
family and this traversal is totally wrong. We haven't noticed before
because we were flushing the memtables based on their individual sizes,
but it has been wrong all along for edge cases in which we would have to
resort to size-based flush. This could be the case, for instance, with
various plan_ids in flight at the same time.

At this point, there is no more reason to keep the derived classes for
the dirty_memory_manager. I'm only keeping them around to reduce
clutter, although they are useful for the specialized constructors and
to communicate to the reader exactly what they are. But those can be
removed in a follow up patch if we want.

The old memtable constructor signature is kept around for the benefit of
two tests in memtable_tests which have their own flush logic. In the
future we could do something like we do for the SSTable tests, and have
a proxy class that is friends with the memtable class. That too, is left
for the future.

Fixes #1870

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <811ec9e8e123dc5fc26eadbda82b0bae906657a9.1479743266.git.glauber@scylladb.com>
2016-11-21 18:18:27 +02:00
Duarte Nunes
def2bc72b0 size_estimates_virtual_reader: Add unit test
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:05 +00:00
Duarte Nunes
6a37d87c76 db: Delete size_estimates_recorder
Now that access to the size_estimates system is virtualized, we no
longer need the recorder.

Fixes #1616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:05 +00:00
Duarte Nunes
225648780d size_estimates: Add virtual reader
This patch adds a virtual mutation_reader so that queries
to the size_estimates system table are handled by the engine
without needing to perform any IO.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:05 +00:00
Duarte Nunes
cd7e2fd602 column_family: Add support for virtual readers
Virtual readers allow queries to selected tables, usually system
tables, to be answered by the engine. This is useful for tables which
aren't written by users and whose contents can be calculated on
demand.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:05 +00:00
Duarte Nunes
c0d450c57d storage_service: get_local_tokens() returns a future
This patch changes the get_local_tokens() function in storage_service
to return a future instead of requiring running under a seastar::thread.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:04 +00:00
Duarte Nunes
9b384d375f nonwrapping_range: Add slice() function
This patch adds the slice() function to nonwrapping_range, which uses
its bounds to slice an input sequence.
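A simplified stand-alone analogue (hypothetical signature; the real slice() lives on nonwrapping_range and takes a comparator): use the range's optional bounds to pick out the sub-span of a sorted sequence that the range covers.

```cpp
#include <algorithm>
#include <cassert>
#include <optional>
#include <vector>

// Hypothetical sketch: slice a sorted sequence by optional inclusive
// bounds; an empty bound means "from the start" / "to the end".
template <typename T>
std::vector<T> slice(const std::vector<T>& seq,
                     std::optional<T> start, std::optional<T> end) {
    auto first = start ? std::lower_bound(seq.begin(), seq.end(), *start)
                       : seq.begin();
    auto last  = end   ? std::upper_bound(seq.begin(), seq.end(), *end)
                       : seq.end();
    return {first, last};
}
```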

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:04 +00:00
Duarte Nunes
bdba8d99c3 range: Find a sequence's lower and upper bounds
This patch extracts a pair of functions from mutation_partition to
calculate the lower and upper bounds of a sequence from a
nonwrapping_range.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:04 +00:00
Duarte Nunes
636287fdf2 system_keyspace: Build mutations for size estimates
This patch adds a function to system_keyspace responsible for creating
a mutation to a partition of the size_estimates system table from a
set of range_estimates.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:15:04 +00:00
Duarte Nunes
18ddec245e size_estimates: Store the token range as bytes
This patch changes the range_estimates struct so that the tokens are
represented as utf8-encoded bytes. This will make future patches
require fewer conversions.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 11:14:21 +00:00
Duarte Nunes
e7a5162c1d range_estimates: Add schema
This will be used in future patches, when virtualizing the
size_estimates system table.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 10:56:32 +00:00
Duarte Nunes
01815ecd24 murmur3_partitioner: Convert maximum_token to sstring
This patch ensures we can convert the maximum_token to an sstring.
For Cassandra, the minimum and maximum tokens have the same
representation. So, we use the string representation of the
minimum_token for the maximum_token.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-11-21 10:56:32 +00:00
Takuya ASADA
eee63027e5 dist/ami/build_ami.sh: update base AMI to CentOS7-Base5
To drop unnecessary .ssh/authorized_keys, we need to update the base AMI.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479496938-29724-1-git-send-email-syuu@scylladb.com>
2016-11-21 10:12:47 +02:00
Avi Kivity
783729c540 Merge "Clean up T::memory_usage() function" from Paweł
"This series is just a cleanup whose intention is to deal with all the
confusion related to the way T::memory_usage() functions work.

* T::memory_usage() which returned external memory usage are renamed
  to T::external_memory_usage()
* T::memory_usage() is introduced where needed to avoid repeating
  sizeof(T) + T::external_memory_usage()"

Paweł Dziepak (6):
  rename memory_usage() to external_memory_usage() where applicable
  streamed_mutation: add memory_usage() to mutation fragment types
  keys: add memory_usage()
  partition_snapshot_accounter: use range_tombstone::memory_usage()
  mutation_rebuilder: use memory_usage()
  frozen_mutation: use memory_usage()
2016-11-21 10:11:39 +02:00
Avi Kivity
498887ca0d Merge seastar upstream
* seastar 31c5fd7...7504026 (2):
  > circular_buffer: add move assignment operator
  > scollectd: Fix serialization of GAUGE-typed values
2016-11-20 20:16:56 +02:00
Gleb Natapov
9222a47fed sstable test: add test for generated summary data
Message-Id: <20161117155051.GV6765@scylladb.com>
2016-11-20 19:50:45 +02:00
Glauber Costa
21c1e2b48c commitlog: wait for pending allocations to finish before closing gate.
Allocations may enter the gate, so it would be wise for us to wait for them.

Fixes #1860

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <53cd6996c1cbd8b38bab3b03604bd11e5c20beda.1479650012.git.glauber@scylladb.com>
2016-11-20 19:45:33 +02:00
Avi Kivity
a39b92a40a build: fix tests-with-symbols generation
Bad indentation caused the libs variable for tests-with-symbols to be
overwritten, resulting in link failure.
2016-11-20 17:23:26 +02:00
Glauber Costa
504b5ac30f database: don't check for waiters in the condition variable predicate.
In the last iterations of this patchset, we have moved explicit flushes
to acquire the semaphore directly and the coalescing inside the
memtable_list. As a result, we are no longer keeping any kind of action
for them inside the condition variable. Checking for them no longer
serves a purpose.

This is a cleanup patch that removes those checks.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <732676ccfe4ac93eb57aa799ec94b841499a01a6.1479500646.git.glauber@scylladb.com>
2016-11-18 21:34:48 +01:00
Glauber Costa
1933349654 database: fix direct flushes of non-durable column families.
If a Column Family is non-durable, then its flushes will never create a
memtable flush reader. Our current flush logic depends on that being
created and destroyed to release the semaphore permits on the flush.

We will remove the permits ourselves if there is an exception, but not
under normal circumstances. Given this issue, however, it would be more
adequate to always try to remove the permits after we flush. If the
permits were already removed by the flush reader, then this test will
just see that the permit is not in the map and return. But if it is
still there, then it is removed.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <049334c3b4bef620af2c7c045e6c84347dcf9013.1479498026.git.glauber@scylladb.com>
2016-11-18 21:32:29 +01:00
Avi Kivity
6eecbc80dc CONTRIBUTING.md: add sections for help and issues
Don't scare away users reporting an issue with the CLA.
2016-11-18 22:21:10 +02:00
Glauber Costa
60b7d35f15 commitlog: close file after read, and not at stop
There are other code paths that may interrupt the read in the middle
and bypass stop. It's safer this way.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <8c32ca2777ce2f44462d141fd582848ac7cf832d.1479477360.git.glauber@scylladb.com>
2016-11-18 14:09:33 +00:00
Paweł Dziepak
249e0ab087 frozen_mutation: use memory_usage()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Paweł Dziepak
948c062e64 mutation_rebuilder: use memory_usage()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Paweł Dziepak
e04664e851 partition_snapshot_accounter: use range_tombstone::memory_usage()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Paweł Dziepak
711bd19f16 keys: add memory_usage()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Paweł Dziepak
6b8bf030c0 streamed_mutation: add memory_usage() to mutation fragment types
This patch introduces memory_usage() to static_row, clustering_row and
range_tombstone so that we can avoid repeating sizeof(T) +
x.external_memory_usage().

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Paweł Dziepak
ef57b9a26f rename memory_usage() to external_memory_usage() where applicable
Renaming the function to external_memory_usage() makes it clear that
sizeof(T) is not included, something that was a source of confusion in
the past.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-11-18 11:25:36 +00:00
Avi Kivity
fec4ef3390 Merge "Make sure commitlog replay is able to make progress" from Glauber
"Fixes #1856

Commitlog replay reads are being issued without a priority. That means
they will lose to compaction every time."

* 'issue-1856-v2' of github.com:glommer/scylla:
  commitlog: use read ahead for replay requests
  commitlog: use commitlog priority for replay
  commitlog: close replay file
2016-11-18 12:04:18 +02:00
Takuya ASADA
55e5123313 dist/redhat: Support RHEL7
We supported installing the CentOS7 .rpm on RHEL7, but we haven't supported
building on RHEL7, since there is a small difference between CentOS
and RHEL7 that causes a build error.

This patch fixes the error; now we can produce an .rpm for RHEL7 without
using CentOS.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479431134-8032-1-git-send-email-syuu@scylladb.com>
2016-11-18 11:56:05 +02:00
Glauber Costa
461778918b fix shutdown and exception conditions for flush logic
This patch addresses post-merge follow up comments by Tomek.
Basically, what we do is:
- we don't need to signal() from remove_from_flush_manager(), because
  the explicit flushes no longer wait on the condition variable. So we
  don't.
- We now wait on the stop() flushes (regardless of their return status)
  so we can make sure that the _flush_queue will indeed be drained
- we acquire the semaphore before shutting down the dirty_memory_manager
  to make sure that there are no pending flushes
- the flush manager that holds the semaphore has to match in the exception
  handler

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <a23ab5098934546c660a08de64cd9294bb3a2008.1479400239.git.glauber@scylladb.com>
2016-11-17 21:16:44 +01:00
Glauber Costa
59a41cf7f1 commitlog: use read ahead for replay requests
Aside from putting the requests in the commitlog class, read ahead
will help us go through the file faster.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-17 14:09:54 -05:00
Glauber Costa
aa375cd33d commitlog: use commitlog priority for replay
Right now replay is being issued with the standard seastar priority.
The rationale for that at the time is that it is an early event that
doesn't really share the disk with anybody.

That is largely untrue now that we start compactions on boot.
Compactions may fight for bandwidth with the commitlog, and with such
low priority the commitlog is guaranteed to lose.

Fixes #1856

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-17 14:09:02 -05:00
Glauber Costa
4d3d774757 commitlog: close replay file
The replay file is opened, so it should be closed. We're not seeing any
problems arising from this, but they may happen. Enabling read ahead in
this stream makes them happen immediately. Fix it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-17 12:35:24 -05:00
Avi Kivity
eaf83ab59c Merge seastar upstream
* seastar 3001c08...31c5fd7 (2):
  > Safe use of collectd during shutdown
  > udp: abort reader and writer when udp channel close
2016-11-17 18:44:28 +02:00
Piotr Jastrzebski
9d33948487 mutation_rebuilder: fix fragment size calculation
It wasn't calculating the size of data correctly.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <c03dfff7bf1ca3199991e5864189f98bfa2942ea.1479397736.git.piotr@scylladb.com>
2016-11-17 16:23:42 +00:00
Raphael S. Carvalho
3dc9294023 db: do not leak deleted sstable when deletion triggers an exception
The leakage results in deleted sstables being opened until shutdown, and disk
space isn't released. That's because column_family::rebuild_sstable_list()
will not remove reference to deleted sstables if an exception was triggered in
sstables::delete_atomically(). An sstable only has its files closed when its
object is destructed.

The exception happens when a major compaction is issued in parallel to a
regular one, and one of them will be unable to delete a sstable already deleted
by the other. That results in remove_by_toc_name() triggering boost::filesystem
::filesystem_error because TOC and temporary TOC don't exist.

We wouldn't have seen this problem if major compaction were going through
compaction manager, but remove_by_toc_name() and rebuild_sstable_list() should
be made resilient.

Fixes #1840.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <d43b2e78f9658e2c3c5bbb7f813756f18874bf92.1479390842.git.raphaelsc@scylladb.com>
2016-11-17 17:46:36 +02:00
Gleb Natapov
c052a1bc4f sstable: use schema's min_index_interval config when generating missing summary
Message-Id: <20161116181937.GA25303@scylladb.com>
2016-11-17 15:24:03 +02:00
Avi Kivity
5d067eebf2 Merge "get rid of memtable size parameter and rework flush logic" from Glauber
"This patchset allows Scylla to determine the size of a memtable instead
of relying on the user-provided memtable_cleanup_threshold. It does that
by allowing the region_group to specify a soft limit which will trigger
the allocation as early as it is reached.

Given that, we'll keep the memtables in memory for as long as it takes
to reach that limit, regardless of the individual size of any single one
of them. That limit is set to 1/4 of dirty memory. That's the same as
last submission, except this time I have run some experiments to gauge
behavior of that versus 1/2 of dirty memory, which was a preferred
theoretical value.

After that is done, the flush logic is reworked to guarantee that
flushes are not initiated if we already have one memtable under flush.
That allow us to better take advantage of coalescing opportunities with
new requests and prevents the pending memtable explosion that is
ultimately responsible for Issue 1817.

I have run mainly two workloads with this. The first one a local RF=1
workload with large partitions, sized 128kB and 100 threads. The results
are:

Before:

op rate                   : 632 [WRITE:632]
partition rate            : 632 [WRITE:632]
row rate                  : 632 [WRITE:632]
latency mean              : 157.8 [WRITE:157.8]
latency median            : 115.5 [WRITE:115.5]
latency 95th percentile   : 486.7 [WRITE:486.7]
latency 99th percentile   : 534.8 [WRITE:534.8]
latency 99.9th percentile : 599.0 [WRITE:599.0]
latency max               : 722.6 [WRITE:722.6]
Total partitions          : 189667 [WRITE:189667]
Total errors              : 0 [WRITE:0]
total gc count            : 0
total gc mb               : 0
total gc time (s)         : 0
avg gc time(ms)           : NaN
stdev gc time(ms)         : 0
Total operation time      : 00:05:00
END

After:

op rate                   : 951 [WRITE:951]
partition rate            : 951 [WRITE:951]
row rate                  : 951 [WRITE:951]
latency mean              : 104.8 [WRITE:104.8]
latency median            : 102.5 [WRITE:102.5]
latency 95th percentile   : 155.8 [WRITE:155.8]
latency 99th percentile   : 177.8 [WRITE:177.8]
latency 99.9th percentile : 686.4 [WRITE:686.4]
latency max               : 1081.4 [WRITE:1081.4]
Total partitions          : 285324 [WRITE:285324]
Total errors              : 0 [WRITE:0]
total gc count            : 0
total gc mb               : 0
total gc time (s)         : 0
avg gc time(ms)           : NaN
stdev gc time(ms)         : 0
Total operation time      : 00:05:00
END

The other workload was the workload described in #1817. And the result
is that we now have a load that is very stable around 100k ops/s and
hardly any timeouts, instead of the 1.4 baseline of wild variations
around 100k ops/s and lots of timeouts, or the deep reduction of
1.5-rc1."

* 'issue-1817-v4' of github.com:glommer/scylla:
  database: rework memtable flush logic
  get rid of max_memtable_size
  pass a region to dirty_memory_manager accounting API
  memtable: add a method to expose the region_group
  logalloc: allow region group reclaimer to specify a soft limit
  database: remove outdated comment
  database: uphold virtual dirty for system tables.
2016-11-17 14:36:43 +02:00
Avi Kivity
18078bea9b storage_proxy: avoid calculating digest when only one replica is contacted
If we're talking to just one replica, the digest is not going to be used,
so better not to calculate it at all.  The optimization helps with
LOCAL_ONE queries where the result is large, but does not contain large
blobs (many small rows).

This patch adds a digest_algorithm parameter to the READ_DATA verb that
can take on two values: none and MD5 (default), and sets it to none when
we're reading from one replica.

In the future we may add other values for more hardware-friendly digest
algorithms.
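The decision reduces to a simple rule, sketched below with illustrative names (the actual verb wiring lives in storage_proxy):

```cpp
#include <cassert>

enum class digest_algorithm { none, MD5 };

// Illustrative sketch: compute a digest only when more than one replica
// is contacted, since a single reply has nothing to be reconciled against.
inline digest_algorithm choose_digest(unsigned replicas_contacted) {
    return replicas_contacted <= 1 ? digest_algorithm::none
                                   : digest_algorithm::MD5;
}
```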
Message-Id: <1479380600-19206-1-git-send-email-avi@scylladb.com>
2016-11-17 13:04:30 +02:00
Asias He
dc50ce0ce5 streaming: Make the mutation readers when streaming starts
Currently we make the mutation readers for streaming at different
time points, i.e.,

do_for_each(_ranges.begin(), _ranges.end(), [] (auto range) {
     make a mutation reader for this range
     read mutations from the reader and send
})

If there is a write workload in the background, we will stream extra
data, since the later a reader is made, the more data we need to send.

Fix it by making all the readers before starting to stream.
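A minimal sketch of the fix (hypothetical generic helper, not the streaming code itself): create every reader up front, so all of them observe the same point in time, then consume them one by one.

```cpp
#include <cassert>
#include <functional>
#include <vector>

// Hypothetical sketch: build one reader per range *before* streaming
// any of them, so later ranges don't pick up writes made while the
// earlier ranges were being streamed.
template <typename Range, typename Reader>
std::vector<Reader> make_all_readers(const std::vector<Range>& ranges,
                                     std::function<Reader(const Range&)> make_reader) {
    std::vector<Reader> readers;
    readers.reserve(ranges.size());
    for (const auto& r : ranges) {
        readers.push_back(make_reader(r));  // all created at the same time point
    }
    return readers;  // now stream from each reader in turn
}
```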

Fixes #1815
Message-Id: <1479341474-1364-2-git-send-email-asias@scylladb.com>
2016-11-17 12:41:53 +02:00
Gleb Natapov
ae0a2935b4 sstables: fix ad-hoc summary creation
If an sstable's Summary is not present, Scylla does not refuse to boot but
instead creates the summary information on the fly. There is a bug in this
code though. The Summary file is a map between keys and offsets into the Index
file, but the code creates a map between keys and Data file offsets
instead. Fix it by keeping the offset of an index entry in the index_entry
structure and using it during Summary file creation.

Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20161116165421.GA22296@scylladb.com>
2016-11-17 11:05:23 +02:00
Glauber Costa
f08162e181 database: rework memtable flush logic
The way we currently flush memtables, we seal the current one but wait
on a semaphore for the actual flush to proceed.

This is pointless, because if the flush is not proceeding we'll use up
memory for the new entries anyway, be them in a newly opened memtable or
not. As a matter of fact, by opening a new memtable we are foregoing
coalescing opportunities.

After recent changes to the flush paths, we are now in a position to do
differently. We move the semaphore earlier, and if we can't acquire it
we keep appending to the current memtable.

For explicit flushes, we'll queue and prioritize them over memory-based
flushes. This has the nice property of potentially coalescing various
flushes for the same CF into one.

Coalescing flushes for the same CF is particularly helpful for
commitlog-initiated flushes that can't complete within the flush period.
What we see currently is that under heavy load the commitlog will keep
sealing memtables, adding to the existing load.

Another interesting property of this approach is that we can keep the
disk utilization higher, by allowing a new flush to start before the
memtable is fully sealed. By design, every time a memtable is finished
flushing it will call revert_potentially_cleaned_up_memory() to revert
the virtual memory charges.  That is the perfect moment for us to act.
It indicates that all the data flushing part is done.

The way we'll do it is by keeping the semaphore_units alive for this
memtable. When the flush ends, we destroy that object. This will
effectively trigger the next flush if there is a next flush that can be
initiated.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:58 -05:00
Glauber Costa
895e838ac0 get rid of max_memtable_size
After recent changes to the memtable code, there is no reason for us to
uphold a maximum memtable size. Now that we only flush one memtable at a
time anyway, and also have soft limit notifications from the
region_group_reclaimer, we can just set the soft limit to the target
size and let all of that be handled by the dirty_memory_manager.

It does have the added property that we'll be flushing when we globally
reach the soft limit threshold. In conditions in which we have multiple
CF writes fighting for memory, that guarantees that we will start
flushing much earlier than the hard limit.

The threshold is set to 1/4 of dirty memory. While in theory we would
prefer the memtables to go as big as 1/2 of dirty memory, in my
experiments I have found 1/4 to be a better fit, at least for the
moment.

The reason for such behavior is that in situations where we have slow
disks, setting the soft limit to 1/2 of dirty will put us in a situation
in which we may not have finished writing out the memtable when we hit
the limit, and then throttle. When the threshold is set to 1/4 of dirty, we
don't throttle at all.

This behavior could potentially be fixed by not doing the full
memtable-based throttling after we do the commitlog throttling, but that
is not something realistic for the moment.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:24 -05:00
Glauber Costa
2ed3f342c1 pass a region to dirty_memory_manager accounting API
We would like to know which region a particular flush is coming
from, and account accordingly. The reasoning behind that is that soon
we'll be driving the flushes internally from the dirty_memory_manager
without explicitly triggering them.

We need to start a flush before the current one finishes, otherwise
we'll have a period without significant disk activity when the current
SSTable is being sealed, the caches are being updated, etc.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:24 -05:00
Glauber Costa
0b337dab14 memtable: add a method to expose the region_group
That is technically not needed, because a memtable inherits from group. So
whenever we have a memtable, we can use its group() method to obtain a
group for it, and then from there go to the region_group.

However, region() is a const method in the memtable, so we have to play
tricks with const_cast, or remove the constness from the region. An
alternative, which I prefer, is to expose a method for the
region_group directly from the memtable object that does the right thing
and bypasses all that.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:24 -05:00
Glauber Costa
f86c9e36f4 logalloc: allow region group reclaimer to specify a soft limit
The region_group_reclaimer will let us know every time we are over the
limit we have specified for memory usage.

However, for some applications, we would be interested in knowing about
memory build-up earlier, so we can start doing something about it before
we reach that condition.

This patch introduces soft limit notifications for the
region_group_reclaimer. After this patch is applied, start_reclaim() is
called earlier, and stop_reclaim() later, after the soft condition has
abated.

There are methods that allow one to easily test if the pressure
condition is a soft limit condition or a hard, threshold condition and
act accordingly. Whether to act on both conditions or just one of them
is up to the application.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:23 -05:00
Glauber Costa
da738a6cd1 database: remove outdated comment
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:23 -05:00
Glauber Costa
919de98aa5 database: uphold virtual dirty for system tables.
Currently the virtual dirty mechanism is not properly set for system
tables. We haven't divided the system table allowance by two, which
means it won't start throttling earlier as it was supposed to.

In practice, this has little effect because system table requests are
very well behaved, their sizes well known, and they tend to be
force-flushed. But we should be consistent.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-16 21:20:23 -05:00
Avi Kivity
f26c6569d2 Update scylla-ami submodule
* dist/ami/files/scylla-ami 61ff5c6...25e101f (1):
  > scylla_install_ami: delete unneeded authorized_keys from AMI image
2016-11-16 22:36:31 +02:00
Takuya ASADA
3802f289f8 dist: remove bc from dependency
Since we replaced the shell-script-based cpuset generator with a Python-based one,
we no longer depend on the bc command:
d571123afd

So drop it from the .rpm/.deb dependencies.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1479152876-11020-1-git-send-email-syuu@scylladb.com>
2016-11-16 15:02:55 +02:00
Amnon Heiman
a4be7afbb0 API: cache_capacity should use uint for summing
Using integer as the type for the map_reduce causes numeric overflow.

Fixes #1801

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1479299425-782-1-git-send-email-amnon@scylladb.com>
2016-11-16 13:55:46 +01:00
Avi Kivity
31d0e31de2 Merge seastar upstream
* seastar 47e1821...3001c08 (5):
  > core: Introduce weak_ptr<>
  > timer: Add missing include
  > tutorial: fix TeX template
  > Merge "Adding the metrics layer" from Amnon
  > core/memory: let malloc(0) return a valid pointer
2016-11-16 14:20:49 +02:00
Pekka Enberg
8a4bd6ecd5 README: Guidelines for contributing
Message-Id: <1479288359-14168-1-git-send-email-penberg@scylladb.com>
2016-11-16 12:50:02 +02:00
Paweł Dziepak
f877be50b0 Merge "Keep wide partition cache entry longer than others" from Piotr
"Cache entries for wide partitions are usually smaller than other
entries and the cost of recreating them is higher so it makes sense
to keep them longer than ordinary entries."
2016-11-15 20:44:52 +00:00
Paweł Dziepak
b8d737ff0a tests/row_cache_test: verify that eviction follows lru
Refs #1847.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1479231555-28191-1-git-send-email-pdziepak@scylladb.com>
2016-11-15 18:57:54 +01:00
Paweł Dziepak
999dafbe57 row_cache: touch entries read during range queries
Fixes #1847.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1479230809-27547-1-git-send-email-pdziepak@scylladb.com>
2016-11-15 18:54:11 +01:00
Tomasz Grabiec
11c5f4ab50 storage_proxy: Add counters for throttled writes 2016-11-15 17:18:25 +01:00
Piotr Jastrzebski
5ec668c9c6 Add separate LRU for wide partitions.
Evict wide partitions only every 1000 normal partition
evictions.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2016-11-15 16:19:13 +01:00
Piotr Jastrzebski
9a41bfbf69 Add collectd metric for wide partition evictions.
This will allow us to see how large the number of evictions
of cached information about wide partitions is.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2016-11-15 15:53:14 +01:00
Paweł Dziepak
055d78ee4c query_pagers: distinct queries do not have clustering keys
Query pager needs to handle results that contain partitions with
possibly multiple clustering rows quite differently than results with
just one row per partition (for example a page may end in a middle of
partition). However, the logic dealing with partitions with clustering
rows doesn't work correctly for SELECT DISTINCT queries, which are
much more similar to the ones for schemas without clustering key.

The solution is to set _has_clustering_keys to false in case of SELECT
DISTINCT queries, regardless of the schema, which will make the pager
correctly expect each partition to return at most one row.

Fixes #1822.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478612486-13421-1-git-send-email-pdziepak@scylladb.com>
2016-11-15 11:06:01 +01:00
Glauber Costa
93386bcec7 histograms: do not use latency_in_nano
Now that the histogram has its own unit expressed in its template
parameter, there is no reason to convert it to nano just so we may need
to convert it back if the histogram needs another unit.

This patch will keep everything as a duration until last moment, and
then we'll convert when needed.

This was suggested by Amnon.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <218efa83e1c4ddc6806c51913d4e5f82dc6d231e.1479139020.git.glauber@scylladb.com>
2016-11-14 18:01:43 +02:00
Nadav Har'El
c5254b6502 repair: fix undefined variable
If the "trace" parameter of the repair was not given, we will use the
"trace" variable without setting it. We need to set a default value.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1479136239-14204-1-git-send-email-nyh@scylladb.com>
2016-11-14 17:16:19 +02:00
Raphael S. Carvalho
e86de40b49 compaction_manager: inform about compaction cancelled by shutdown
After some changes in the compaction manager, the user is no longer
informed that compaction was cancelled in the event of shutdown. That's
because we only ignore the ready future when the compaction manager was
asked to stop.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <02ca29b5a93fe3a558896598f325b0dce069e82c.1478277317.git.raphaelsc@scylladb.com>
2016-11-14 16:37:33 +02:00
Piotr Jastrzebski
4fe989d58e Cleanup sstables::mutation_reader::impl
Pointer to sstable seems unnecessary.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <a45e8853af2b5f896ec44144fbc26d3325a5ec0c.1479123740.git.piotr@scylladb.com>
2016-11-14 11:52:52 +00:00
Avi Kivity
14c1b17105 storage_service: fix construct_range_to_endpoint_map with semi-infinite range
After the conversion to nonwrapping ranges, construct_range_to_endpoint_map()
may be called with semi-infinite token ranges, but it does not expect this,
calling nonwrapping_range::end()->value() unconditionally.

Fix by checking whether this is a semi-infinite range on the right, and
replacing ->value() with maximum_token() instead.

Fixes `nodetool describering` (once more).
Message-Id: <1478983010-29630-1-git-send-email-avi@scylladb.com>
2016-11-14 11:39:48 +01:00
Raphael S. Carvalho
9a9f0d3a0f main: fix exception handling when initializing data or commitlog dirs
Exception handling was broken because, after the io checker, the
storage_io_error exception is wrapped around system error exceptions.
Also, the message printed when handling the exception wasn't precise
enough for all cases, for example lack of permission to write to an
existing data directory.

Fixes #883.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b2dc75010a06f16ab1b676ce905ae12e930a700a.1478542388.git.raphaelsc@scylladb.com>
2016-11-14 12:34:10 +02:00
Takuya ASADA
d571123afd dist/common/scripts/scylla_sysconfig_setup: stop using 'bc' command to generate cpuset parameter, use python script instead
We get an error from the bc command when we run the script on >34 ncpus;
to prevent the issue, add a Python script to generate the cpuset parameter.

Fixes #1824

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1478887624-12737-1-git-send-email-syuu@scylladb.com>
2016-11-14 11:45:23 +02:00
Duarte Nunes
66f6a367a4 ring_position_range_sharder: Avoid copying eagerly
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20161104115632.15974-1-duarte@scylladb.com>
2016-11-13 11:42:23 +02:00
Avi Kivity
bf20aa722b Merge "Fixes for histogram and moving average calculations" from Glauber
"JMX metrics were found to be either not showing, or showing absurd
values.  Turns out there were multiple things wrong with them. The
patches were sent separately but conflict with one another. This series
is a collection of the patches needed to fix the issues we saw.

Fixes #1832, #1836, #1837"
2016-11-13 11:16:32 +02:00
Avi Kivity
2670e46f3e storage_service: deinline most methods
Most inline methods in storage_service are too large to be inlined, and
just increase compile time.  De-inline them.
2016-11-12 21:12:28 +02:00
Glauber Costa
608d825790 histogram: fix reporting units
We are tracking latencies in nanoseconds, but almost everywhere else
they are reported in microseconds. Instead of just converting, this
patch tries to be a bit more future-proof and embeds the unit into the
type - and we then default to microseconds.

I have verified that the JMX measures now report sane values for both
the storage proxy and the column family. nodetool cfhistograms still
works fine. That one is reported in nanoseconds, but through the
estimated_histogram, not ihistogram.

Fixes #1836

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-11 11:36:56 -05:00
Glauber Costa
1342d044eb moving averages: change metrics calculation
We have recently fixed a bug due to which the constructor parameters for
moving average were inverted, leading to the numbers being just plain
wrong. However, the calculation of alpha was already inverted, meaning
it was right by accident and now that's wrong.

With the wrong alpha, the values we see are still correct, but they move
very quickly. The intention of this code is obviously to smooth things
out.

This was found out by Nadav. I have tested and confirmed that the smoothing
factor now works as expected.

Fixes #1837

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-11-10 22:33:34 -05:00
Amnon Heiman
a977ea85e1 histogram: moving_average and total rate should be calculated in seconds
The moving average and the total average should be calculated in seconds
and not nanoseconds.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-11-10 22:32:53 -05:00
Glauber Costa
d3f11fbabf histogram: moving averages: fix inverted parameters
moving_averages constructor is defined like this:

    moving_average(latency_counter::duration interval, latency_counter::duration tick_interval)

But when it is time to initialize them, we do this:

	... {tick_interval(), std::chrono::minutes(1)} ...

As it can be seen, the interval and tick interval are inverted. This
leads to the metrics being assigned bogus values.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <d83f09eed20ea2ea007d120544a003b2e0099732.1478798595.git.glauber@scylladb.com>
2016-11-10 11:28:51 -08:00
Paweł Dziepak
f16d6f9c40 partition_version: make sure that snapshot is destroyed under LSA
Snapshot destructor may free some objects managed by the LSA. That's why
partition_snapshot_reader destructor explicitly destroys the snapshot it
uses. However, it was possible that an exception thrown by _read_section
prevented that from happening, making the snapshot destroyed implicitly
without the current allocator set to the LSA.

Refs #1831.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478778570-2795-1-git-send-email-pdziepak@scylladb.com>
2016-11-10 13:13:10 +01:00
Gleb Natapov
27e041606b fix LOCAL_ONE printout
Message-Id: <20161109125307.GH7766@scylladb.com>
2016-11-09 12:53:55 +00:00
Duarte Nunes
e680587b8a sstable_test: Be explicit about uncompressed tables
After 7c28ed, the schemas defined in the test became compressed by
default. This patch changes the test so that it is explicit about
which schemas shouldn't define a compressor.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1478646530-5558-1-git-send-email-duarte@scylladb.com>
2016-11-09 11:21:59 +02:00
Pekka Enberg
b3dea313dd Merge "API changes for Cassandra 3.x migration" from Calle
"Mostly small changes/additions to the API calls to match Cv3
 requirements/semantics, i.e. updated scylla-jmx can implement required
 nodetool etc calls in a working fashion."
2016-11-09 10:30:32 +02:00
Duarte Nunes
e33c02aa60 cql3: Disable compression on empty properties
The CQL 3.1 documentation specifies that for disabling compression,
users should use an empty string:

ALTER TABLE mytable WITH COMPRESSION = {'sstable_compression': ''};

However, Cassandra also accepts the absence of the sstable_compression
option to disable compression. The patch 7c28ed prevented this behavior
in Scylla, which this patch aims to fix.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1478639499-4183-1-git-send-email-duarte@scylladb.com>
2016-11-09 10:03:59 +02:00
Gleb Natapov
93f068bd44 storage_proxy: fix speculation target selection logic
Current speculation target selection logic has several bugs in a multi-dc
setup. It may select a non-local target for CL=LOCAL, and it may select
more than one target to speculate, one of which is non-local.

Examples:

1. Two datacenters: DC1 RF 2, DC2 RF 2, and a read with LOCAL_QUORUM.

In this scenario db::filter_for_query() will return both replicas from
the local DC, and the speculation target selection logic will pick one
which will be in a different DC.

2. Two datacenters: DC1 RF 2, DC2 RF 2, and a read with LOCAL_ONE + RRD.DC_LOCAL

In this scenario db::filter_for_query() will return all nodes in the local DC,
and there are already enough nodes to speculate, but the current logic will
add one node from a non-local DC as a speculation target.

The patch below fixes both of those scenarios.

Message-Id: <20161103154637.GS7766@scylladb.com>
2016-11-08 18:32:47 +01:00
Paweł Dziepak
a8308e2a8d row_cache: dummy entry does not count as partition
Since the introduction of the continuity flag, the row cache contains a single
dummy entry. cache_tracker knows nothing about it, so it doesn't appear in
any of the metrics. However, cache destructor calls
cache_tracker::on_erase() for every entry in the cache including the
dummy one. This is incorrect since the tracker wasn't informed when the
dummy entry was created.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1478608776-10363-1-git-send-email-pdziepak@scylladb.com>
2016-11-08 13:54:44 +01:00
Piotr Jastrzebski
50b41f7d1d Fix row_cache_test
partition_range passed to row_cache::make_reader
has to be kept alive as long as the resulting reader
is used.

Otherwise weird things start to happen.

This used to work just out of pure luck.
When I started changing the row_cache implementation
I ran into very weird behaviors in these tests.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <2c9e337dbbcf35f4e1394cad043eda10b8c2bd4a.1478602876.git.piotr@scylladb.com>
2016-11-08 13:28:53 +01:00
Calle Wilund
473326d49a api/column_family: Make mean row size return integral
As (at least) per C3, these metrics are integral in origin. Adapt.
(The other option would be to translate in jmx.)
(Other option would be to translate in jmx).
2016-11-08 12:22:04 +00:00
Calle Wilund
bd646a6755 repair (api): Add option handling (sort of) for nodetool default options 2016-11-08 12:22:04 +00:00
Calle Wilund
0181fc8159 api::cache_service: Add (dummy) calls for key&counter metrics 2016-11-08 12:22:04 +00:00
Calle Wilund
5eb54f9bc4 api::storage_service: c3 compat - make query keyspaces a trinary choice
all, user or non-local strategy ones.
2016-11-08 12:22:04 +00:00
Calle Wilund
3b7a7dd383 api::failure_detector: c3 compat - add endpoint phi value query 2016-11-08 12:22:04 +00:00
Calle Wilund
218df55349 failure_detector: add accessor and api shortcut for arrival samples 2016-11-08 12:22:04 +00:00
Calle Wilund
f9836cd23b api::endpoint_snitch: c3 compat - allow dc/rack query for broadcast 2016-11-08 12:22:04 +00:00
Calle Wilund
54ba06a8bf api::column_family: Add calls/parameters for c3 compatibility 2016-11-08 12:22:04 +00:00
Amnon Heiman
c8082ccadb API: fix a typo in storage_proxy
This patch fixes a typo in the URL definition, causing the metric in the
jmx not to find it.

Fixes #1821

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1478563869-20504-1-git-send-email-amnon@scylladb.com>
2016-11-08 11:09:21 +02:00
Amos Kong
95fe88c1d3 scripts/scylla_current_repo: use HTTP to access downloads.scylladb.com
HTTPS isn't available for downloads.scylladb.com; alternatively, we can
access it via https://s3.amazonaws.com/downloads.scylladb.com/...

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <d4b65e1724bbeb76c928790d5d3e95b91ee9db79.1478153034.git.amos@scylladb.com>
2016-11-08 11:03:50 +02:00
Avi Kivity
767cfb4fe9 storage_service: fix range wrapping in describe_ring even more
Commit 8fca1887c2 ("storage_service: fix range wrapping in
describe_ring") fixed incorrect range wrapping code for describe_ring,
but fails when the number of endpoints for a token is greater than one,
because the endpoints are stored in an unordered vector.

Fix by comparing the endpoints in a way that ignores their order.
Message-Id: <1478460826-15923-1-git-send-email-avi@scylladb.com>
2016-11-07 16:18:20 +01:00
Calle Wilund
11baf37ab5 commitlog: Prevent exceptions in stream::produce from being set twice
Fixes #1775
stream lacks a check "is_open", which is a bummer. We have to both
prevent exception propagation and add a flag of our own to make sure
exceptions in producer code reach the consumer, and do not simply
get lost in the reactor.
Message-Id: <1478508817-18854-1-git-send-email-calle@scylladb.com>
2016-11-07 11:41:33 +01:00
Tomasz Grabiec
e6cc0a2e10 Merge branch '1766/v1' from duarten/scylla.git
This patchset adds missing properties to the create_view_statement,
such as whether the view is compact or the order of its clustering
columns.

Fixes #1766
2016-11-07 10:44:24 +01:00
Takuya ASADA
0f1ba1a3bb dist/redhat: remove unused dependencies
Seems like we mistakenly added unneeded packages for BuildRequires when
we created the .spec file, so remove them.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1478504761-15067-1-git-send-email-syuu@scylladb.com>
2016-11-07 09:48:50 +02:00
Paweł Dziepak
985d2f6d4a Merge "Remove quadratic behavior from atomic sstable deletion" from Avi
"The atomic sstable deletion provides exception safety at the cost of
quadratic behavior in the number of sstables awaiting deletion.  This
causes high cpu utilization during startup.

Change the code to avoid quadratic complexity, and add some unit tests.

See #1812."
2016-11-04 15:48:04 +00:00
Avi Kivity
f75aceabc5 sstables: add unit tests for atomic deletion
We simulate shards deleting sstables, but this is all happening on a single
core, and no sstables are harmed during test execution.
2016-11-04 15:48:43 +02:00
Avi Kivity
f10b9906d8 sstables: move atomic deletion code to its own files
This will simplify unit testing.  We move generic code that
depends only on seastar, so compile time should not increase too much.
2016-11-04 15:47:35 +02:00
Avi Kivity
9e85653c33 sstables: make atomic_deletion_manager more abstract
Make the shard count and method of deleting sstables abstract, in order
not to require all that machinery for unit tests.
2016-11-04 15:44:09 +02:00
Avi Kivity
e527da1e3c sstables: wrap atomic deletion code in a class
This makes it easier to abstract and unit-test.
2016-11-04 15:44:07 +02:00
Avi Kivity
a05837936a sstables: remove quadratic behavior from atomic sstable deletions
In order to ensure exception safety, the atomic sstable deletion code
creates a copy of the list of sstables pending deletion, modifies that
copy, and then replaces the original data with the copy.  This guarantees
that any exception does not change the data, since the assignment does
not require allocation.

However, it does result in quadratic behavior.  During startup, all
sstables are loaded on each shard, and each shard deletes sstables that
are do not have any partitions served by that shard; this results in
almost all sstables being deleted from all shards, with all that work
going to shard 0; the list grows to O(nr sstables), and there are
O((nr sstables) * (nr shards)) operations to perform.

Fix by replacing the copy-modify-assign method with an in-place update,
but one that is designed to only commit changes after all allocations
have been made; in addition, instead of using a list, use a hash table,
removing another source of quadratic behavior.

Fixes #1812 (the quadratic behavior part).
2016-11-04 15:42:44 +02:00
Avi Kivity
8fca1887c2 storage_service: fix range wrapping in describe_ring
describe_ring() tries to re-wrap the ranges, but fails because the ranges
are not sorted.  Adjust the code not to rely on sorting.
Message-Id: <1478198630-27483-1-git-send-email-avi@scylladb.com>
2016-11-04 10:48:14 +00:00
Paweł Dziepak
8afd9e52c7 Merge "Process range queries sequentially on shards" from Avi
"Currently, partition range queries are processed in parallel on all
shards. This is inefficient because we are likely to drop the results
from all but one shard, assuming a well-populated column family.  We
are multiplying our work by a factor of smp::count.

While this is worthwhile in its own right, it is really an excuse to
sneak in the range/shard generator (patch 5), which is preliminary for
a new sharding algorithm, dividing tokens among shards based on the
middle-significant bits rather than the most-significant bits (which
alias with vnodes)

Fixes #1573."
2016-11-04 09:58:04 +00:00
Tomasz Grabiec
c1a7e2090e Revert "database: change find_column_families signature so it returns a lw_shared_ptr"
This reverts commit f3528ede65.
2016-11-04 10:48:21 +01:00
Tomasz Grabiec
3b5ccda70e Revert "database: refactor code so apply_in_memory() is called only once"
This reverts commit 3f825f593d.
2016-11-04 10:48:18 +01:00
Tomasz Grabiec
6366eb5cf8 Revert "correctly calculate latencies for writes"
This reverts commit a382f10fc4.
2016-11-04 10:48:02 +01:00
Tomasz Grabiec
a5ee87611a Revert "database: when querying, move latency counter instead of copying"
This reverts commit 8840a5a593.
2016-11-04 10:47:58 +01:00
Tomasz Grabiec
f3c1ff78e6 Merge branch 'cql_read_write_counters-v4' from seastar-dev.git
New CQL counters from Vlad.
2016-11-04 09:19:07 +01:00
Avi Kivity
b3299d5bc3 storage_proxy: simplify range queries
Instead of asking a shard for cmd->partition_limit and cmd->row_limit,
just ask it for the number of partitions and rows still needed to
satisfy the query.  This removes the need to trim the shard's result.
2016-11-03 19:10:20 +02:00
Avi Kivity
a668e575f6 storage_proxy: execute multi-partition query sequentially over shards
Since every shard might cause the row_limit quota to be satisfied, every
shard might be the last one we need.  Hence it is better to process shards
sequentially, stopping if the quota is reached or the range is exhausted.

The original code tried to yield to reduce latency, but this is now
unnecessary, as we're doing a lot less work per iteration (if it becomes
necessary, we should do it on the replica shard, not the coordinating shard).
2016-11-03 19:10:20 +02:00
Avi Kivity
1d77e3a03a partitioner: add unit tests for token_for_next_shard()
i_partitioner::token_for_next_shard() is an inverse for
i_partitioner::shard_of(), test that this is so.
2016-11-03 19:10:20 +02:00
Avi Kivity
7202b94183 dht: introduce a sharder for vectors of partition ranges
Building on the single-range sharder, add a sharder for vectors of
partition ranges.  This helps with wrapped ranges, which are translated
into a vector containing two shards.
2016-11-03 19:10:20 +02:00
Avi Kivity
43a2380899 dht: add a generator for shard/range pairs
Divides a ring_position range into a sequence of shard/range pairs.  This
allows sequential iteration over shards in ring order.

The current multi-partition query executes on all shards in parallel, but
this is very wasteful, as most of the data will be thrown away if it is not
included in the page.  With the generator, we can switch to sequential
execution.
2016-11-03 19:10:17 +02:00
Avi Kivity
1f88d103a8 partitioner: add i_partitioner::token_for_next_shard()
When performing a range query, we want to iterate over shards, running the
query on each shard in order until the query range is exhausted or we have
the right number of rows.

To be able to do this, introduce token_for_next_shard(), which allows us
to determine the boundary between shards.

It is a sort-of inverse to shard_of(), in that

  shard_of(token_for_next_shard(t)) == shard_of(t) + 1
2016-11-03 19:09:23 +02:00
Vlad Zolotarov
6c15dd967a cql3::query_processor: make the collectd metrics registration nicer
Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-11-03 11:49:20 -04:00
Vlad Zolotarov
36cc351ae1 cql3::query_processor: add a counter for BATCH CQL statements
- Add a "batches" member to cql_stats.
- Update it where appropriate.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-11-03 11:49:20 -04:00
Vlad Zolotarov
6e1d27bed1 cql3::query_processor: add a counter for a number of CQL modification requests ("writes")
- Add inserts, updates, and deletes members to cql_stats.
- Store cql_stats& in a modification_statement and increment the corresponding counter according to the value of the "type" field.
- Store cql_stats& in a batch_statement and increment the statistics for each BATCH member.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-11-03 11:49:15 -04:00
Vlad Zolotarov
fa4e1db0cb cql: add a counter for CQL read (SELECT) requests
- Add a "reads" counter to the cql3::cql_stats struct.
- Store a reference to query_processor::_cql_stats in the select_statement object.
- Increment the "reads" counter where needed.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-11-03 11:48:57 -04:00
Vlad Zolotarov
7606588267 cql3::query_processor: add cql_stats
- Add cql_stats member.
- Pass it to cql3::raw::parsed_statement::prepare() virtual method.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-11-03 11:48:57 -04:00
Glauber Costa
8840a5a593 database: when querying, move latency counter instead of copying
It is comprised of two time points. Let's move it instead of copying it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <c7c155c77780e188bfbe05881c81ce86456016d5.1478111467.git.glauber@scylladb.com>
2016-11-03 13:27:31 +01:00
Glauber Costa
a382f10fc4 correctly calculate latencies for writes
Right now we are calculating latencies only when we are about to add an
item to the memtable.

That's incorrect and misleading, for two reasons. First, it leaves the
commitlog latencies out. But second, it is done after the memtable wall
effect is applied, which means we are not counting throttle time neither
in the memtables or in the commitlog.

To do that, we'll start the latency_counter object as soon as possible
and move it all the way to apply_in_memory(). That should span the
entire write operation.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <4e424780d290fd5938046060df2b17e2b470b717.1478111467.git.glauber@scylladb.com>
2016-11-03 13:27:31 +01:00
Glauber Costa
3f825f593d database: refactor code so apply_in_memory() is called only once
There are two variants of apply_in_memory() being called in do_apply():
with and without the commitlog. The main differences are that when the
commitlog is involved, we need to wait for its future to complete before
moving to apply_in_memory. That can easily be factored out by providing
an always-ready future if we don't have the commitlog enabled, and
waiting on that.

The second is that the commitlog version can cause apply_in_memory to
generate an exception if there is replay position reordering. However,
there is no harm in appending the exception handler to both versions. In
one of them it's an impossible exception, but that's fine.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <8cee0cad9b1930a057a24e095f0a655069ae8be2.1478111467.git.glauber@scylladb.com>
2016-11-03 13:27:31 +01:00
Glauber Costa
f3528ede65 database: change find_column_families signature so it returns a lw_shared_ptr
There are places in which we need to use the column family object many
times, with deferring points in between. Because the column family may
have been destroyed in the deferring point, we need to go and find it
again.

If we use lw_shared_ptr, however, we'll be able to at least guarantee
that the object will be alive. Some users will still need to check, if
they want to guarantee that the column family wasn't removed. But others
that only need to make sure we don't access an invalid object will be
able to avoid the cost of re-finding it just fine.
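The keep-alive idea can be illustrated with std::shared_ptr standing in for seastar's lw_shared_ptr (all names below are illustrative stubs, not the actual database API):

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

// Stub standing in for the real column_family class.
struct column_family {
    std::string name;
};

using cf_ptr = std::shared_ptr<column_family>;

std::map<std::string, cf_ptr> g_column_families;

cf_ptr find_column_family(const std::string& name) {
    auto it = g_column_families.find(name);
    return it == g_column_families.end() ? nullptr : it->second;
}

// Holding the shared pointer across a deferring point keeps the object
// alive even if the table is dropped meanwhile; callers that must know
// whether the table still exists can re-check the map, but callers that
// only need a valid object avoid the cost of re-finding it.
std::string use_across_deferring_point(const std::string& name) {
    auto cf = find_column_family(name);
    if (!cf) return "missing";
    g_column_families.erase(name);   // simulate a drop during a yield
    return cf->name;                 // still safe: object kept alive by cf
}
```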

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <722bf49e158da77ff509372c2034e5707706e5bf.1478111467.git.glauber@scylladb.com>
2016-11-03 13:27:31 +01:00
Avi Kivity
6c45b0bae8 partitioner: make comparators public
The public comparison operators depend on global_partitioner(), and are
therefore less useful for tests.
2016-11-03 11:27:40 +02:00
Avi Kivity
6320181b97 partitioner: const correctness for comparators 2016-11-03 11:27:40 +02:00
Avi Kivity
470826d127 partitioner: change partitioners to have shard counts independent from smp::count
Useful for testing.
2016-11-03 11:27:40 +02:00
Avi Kivity
75706c0a26 size_estimates_recorder: sort token range before rewrapping it
Since size estimates are stored as wrapped ranges, we call compat::wrap()
to convert from the now-standard unwrapped ranges back to wrapped ranges.
However, compat::wrap() relies on the ranges being in sorted order,
but our input is not.  This leads to a crash as we find an unexpected
empty token in the middle of the vector.

Sort it so compat::wrap() works as expected.
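The precondition being fixed can be illustrated generically (this is not the actual compat::wrap() implementation; merge_adjacent and the integer tokens are stand-ins): merging or re-wrapping adjacent ranges only works on sorted input, so sort first.

```cpp
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

using token_range = std::pair<int, int>;   // [start, end], simplified tokens

// Coalesce touching/overlapping ranges; sorting first is the precondition
// that the commit above restores for compat::wrap().
std::vector<token_range> merge_adjacent(std::vector<token_range> ranges) {
    std::sort(ranges.begin(), ranges.end());   // required for the merge below
    std::vector<token_range> out;
    for (auto& r : ranges) {
        if (!out.empty() && out.back().second >= r.first) {
            out.back().second = std::max(out.back().second, r.second);
        } else {
            out.push_back(r);
        }
    }
    return out;
}
```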

Fixes #1804.
Message-Id: <1478161908-25051-1-git-send-email-avi@scylladb.com>
2016-11-03 09:43:41 +01:00
Avi Kivity
a35136533d Convert ring_position and token ranges to be nonwrapping
Wrapping ranges are a pain, so we are moving wrap handling to the edges.

Since cql can't generate wrapping ranges, this means thrift and the ring
maintenance code; also range->ring transformations need to merge the first
and last ranges.

Message-Id: <1478105905-31613-1-git-send-email-avi@scylladb.com>
2016-11-02 21:04:11 +02:00
Takuya ASADA
8c55c99353 dist/common/scripts/scylla_io_setup: pass --smp option to iotune command
We ignored the --smp option taken from io.conf since iotune didn't support
it, but now that it is supported we can pass it.
(We need to pass it because we need to measure io performance under the same
conditions as scylla)

Fixes #1768

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1478082591-27205-1-git-send-email-syuu@scylladb.com>
2016-11-02 12:49:50 +02:00
Raphael S. Carvalho
53b7b7def3 sstables: handle unrecognized sstable component
As in C*, unrecognized sstable components should be ignored when
loading a sstable. At the moment, Scylla fails to do so and will
not boot as a result. In addition, unknown components should be
remembered when moving a sstable or changing its generation.

Fixes #1780.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b7af0c28e5b574fd577a7a1d28fb006ac197aa0a.1478025930.git.raphaelsc@scylladb.com>
2016-11-02 12:44:53 +02:00
Avi Kivity
72c2982260 dist: require scylla-boost-static for EL RPM build 2016-11-01 18:55:55 +02:00
Pekka Enberg
e1e8ca2788 cql3: Fix selecting same column multiple times
Under the hood, the selectable::add_and_get_index() function
deliberately filters out duplicate columns. This causes
simple_selector::get_output_row() to return a row with all duplicate
columns filtered out, which triggers an assertion because of a row
mismatch with the metadata (which contains the duplicate columns).

The fix is rather simple: just make selection::from_selectors() use
selection_with_processing if the number of selectors and column
definitions doesn't match -- like Apache Cassandra does.

Fixes #1367
Message-Id: <1477989740-6485-1-git-send-email-penberg@scylladb.com>
2016-11-01 09:09:01 +00:00
Pekka Enberg
d46ed53e9e scripts: add update-version
This patch adds an `update-version` script for updating the Scylla
version number in `SCYLLA-VERSION-GEN` file and committing the change to
git.

Example use:

  $ ./scripts/update-version 1.4.0

which results in the following git commit:

  commit 4599c16d9292d8d9299b40a3e44ef7ee80e3c3cf
  Author: Pekka Enberg <penberg@scylladb.com>
  Date:   Fri Oct 28 10:24:52 2016 +0300

      release: prepare for 1.4.0

  diff --git a/SCYLLA-VERSION-GEN b/SCYLLA-VERSION-GEN
  index 753c982..eba2da4 100755
  --- a/SCYLLA-VERSION-GEN
  +++ b/SCYLLA-VERSION-GEN
  @@ -1,6 +1,6 @@
   #!/bin/sh

  -VERSION=666.development
  +VERSION=1.4.0

   if test -f version
   then

Message-Id: <1477639560-10896-1-git-send-email-penberg@scylladb.com>
2016-10-30 12:43:41 +02:00
Avi Kivity
feb8faf70b Merge "make refresh resilient to permission denied error" from Raphael
Fixes #1709.

* 'refresh-resilient-v3' of github.com:raphaelsc/scylla:
  db: make refresh resilient to permission denied error
  db: make it possible to use custom error handler with io checker
  sstables: remove duplicated declaration of remove_by_toc_name
2016-10-30 10:28:09 +02:00
Takuya ASADA
68d9f5212c dist/ubuntu/dep/thrift.diff: add missing build time dependency
We need libcrypto header to build thrift, so add it.

Fixes #1798

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1477676716-5726-1-git-send-email-syuu@scylladb.com>
2016-10-29 17:49:30 +03:00
Avi Kivity
71532d8cd5 Merge seastar upstream
* seastar 05f6c5c...47e1821 (1):
  > rpc: Avoid using zero-copy interface of output_stream (Fixes #1786)
2016-10-28 14:09:16 +03:00
Avi Kivity
e03ca06431 dist: fix rpm build
--static-boost is supposed to be an input to ./configure.py, not ninja.  Move
it there.
2016-10-28 08:42:26 +03:00
Pekka Enberg
b54870764f auth: Fix resource level handling
We use the `data_resource` class in the CQL parser, which lets users refer
to a table resource without specifying a keyspace. This asserts out in
get_level() for no good reason as we already know the intended level
based on the constructor. Therefore, change `data_resource` to track the
level like upstream Cassandra does and use that.

Fixes #1790

Message-Id: <1477599169-2945-1-git-send-email-penberg@scylladb.com>
2016-10-27 23:37:26 +03:00
Glauber Costa
ef3c7ab38e auth: always convert string to upper case before comparing
We store all auth perm strings in upper case, but the user might very
well pass them in lower case.

We could use a standard key comparator / hash here, but since the
strings tend to be small, the new sstring will likely be allocated on
the stack here and this approach yields significantly less code.
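The approach can be sketched as follows (to_upper and has_permission are illustrative names, not the real Scylla identifiers): normalize the input to upper case before the comparison rather than installing a case-insensitive comparator.

```cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <string>

// Uppercase a copy of the input; for short strings the copy is cheap
// (likely a small-string/stack allocation in sstring-like types).
std::string to_upper(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return std::toupper(c); });
    return s;
}

// Stored permission strings are upper case, so normalizing the user's
// input makes the comparison case-insensitive with very little code.
bool has_permission(const std::string& user_input) {
    static const std::string stored = "SELECT";   // stored upper-case
    return to_upper(user_input) == stored;
}
```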

Fixes #1791.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <51df92451e6e0a6325a005c19c95eaa55270da61.1477594199.git.glauber@scylladb.com>
2016-10-27 22:08:57 +03:00
Raphael S. Carvalho
d11e839520 db: make refresh resilient to permission denied error
Users may forget to set permissions on new sstables in the upload dir
before refreshing them, and that will result in a shutdown.
io_checker is now able to work with a custom handler, so all we
have to do is to whitelist EACCES.

Fixes #1709.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-10-27 16:50:40 -02:00
Raphael S. Carvalho
a3e065da9b db: make it possible to use custom error handler with io checker
By default, io checker will cause Scylla to shutdown if it finds
specific system errors. Right now, io checker isn't flexible
enough to allow a specialized handler. For example, we don't want
Scylla to shut down if there's a permission problem when
uploading new files from the upload dir. This desired flexibility is
made possible here by allowing a handler parameter to io check
functions and also changing existing code to take advantage of it.
That's a step towards fixing #1709.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-10-27 15:54:21 -02:00
Takuya ASADA
a1b7e76d43 dist/ubuntu: support 16.10
Add 16.10 to 'supported_release'

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1477585454-2115-1-git-send-email-syuu@scylladb.com>
2016-10-27 19:26:14 +03:00
Takuya ASADA
36e831a106 dist/common/scripts/scylla_bootparam_setup: support EC2 paravirtual instances
EC2 paravirtual instances use pv-grub, which refers to /boot/grub/menu.lst (the grub 0.9x config file) instead of the grub2 config file.
So add boot parameters to /boot/grub/menu.lst when the file exists and the instance is on EC2.

Fixes #1598

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1472056875-17512-1-git-send-email-syuu@scylladb.com>
2016-10-27 18:55:05 +03:00
Avi Kivity
402a3f1c9f Merge seastar upstream
* seastar 9bed76a...05f6c5c (5):
  > reactor: improve task quota timer resolution
  > Update dpdk submodule to local-patches-20161027 tag
  > tests: wire up json_formatter_test
  > json_formatter_test: Add rudimentary json formatter test
  > scripts/posix_net_conf.sh: detect IRQs of virtio-net and xen_netfront correctly
2016-10-27 18:19:40 +03:00
Avi Kivity
e995f5a3a7 dist: statically link with boost on RHEL
Reduces runtime dependencies on Scylla-provided third-party boost packages.

Message-Id: <1477552490-28961-1-git-send-email-avi@scylladb.com>
2016-10-27 12:35:12 +03:00
Avi Kivity
76628a7b0b dist: make wget quieter
wget is often used from scripts whose output is recorded to logs; as it
emits a log line every second, the logs become huge and unreadable.  Make it quieter.

Message-Id: <1477558534-32718-1-git-send-email-avi@scylladb.com>
2016-10-27 12:11:26 +03:00
Avi Kivity
72d78ffa7e Merge "Cache fixes" from Paweł
"5ff699e09fcbd62611e78b9de601f6c8636ab2f0 ("row_cache: rework cache to
use fast forwarding reader") brought some significant changes to the
row cache implementation. Unfortunately, "significant changes" often
translates to "more bugs" and this time was no different.

This series contains fixes for the problems introduced in that rework
and makes failing dtest
bootstrap_test.py:TestBootstrap.local_quorum_bootstrap_test
pass again."

* 'pdziepak/cache-fixes/v1' of github.com:cloudius-systems/seastar-dev:
  row_cache: avoid dereferencing invalid iterator
  row_cache: set _first_element flag correctly
  row_cache: fix clearing continuity flag at eviction
2016-10-27 11:44:15 +03:00
Takuya ASADA
5cb7dc5dc3 dist/ubuntu/dep: update thrift to 0.9.3
To make thrift compile with gcc-6.2, we need to upgrade to the latest
version of thrift.
This is required to support Ubuntu 16.10.

Fixes #1784

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1477517671-18067-1-git-send-email-syuu@scylladb.com>
2016-10-27 10:22:06 +03:00
Paweł Dziepak
a7224ae46e row_cache: avoid dereferencing invalid iterator
Conditions in row_cache::do_find_or_create_entry() make it possible that
std::prev(it) is going to be dereferenced even if it is a begin
iterator.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-26 15:24:23 +01:00
Paweł Dziepak
654f651e0c row_cache: set _first_element flag correctly
If the continuity flag was set for the first element, the _first_element
flag would not be cleared. This shouldn't cause any correctness problems,
but properly setting the flag allows us to avoid some unnecessary key
comparisons.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-26 15:07:24 +01:00
Paweł Dziepak
567ff96f2a row_cache: fix clearing continuity flag at eviction
In the original implementation the continuity flag indicated that the cache
has full information about the range between the current partition and
the one following it, hence when evicting an entry the one preceding it
had to have its continuity flag cleared.

This was changed, however, and now the continuity flag tells whether the
cache is continuous between the current element and the one before it.
This means that eviction code needs to clear the flag for the entry
directly following the evicted one.
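A toy model of the new semantics (not the real row_cache structures; a flat vector stands in for the cache's tree):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// entry.continuous means "the cache is complete between the previous
// entry and this one", so evicting entry i must clear the flag on the
// entry that directly follows it.
struct cache_entry {
    int key;
    bool continuous;   // continuous with the *preceding* entry
};

void evict(std::vector<cache_entry>& entries, std::size_t i) {
    entries.erase(entries.begin() + i);
    if (i < entries.size()) {
        // The element now at index i used to follow the evicted one; the
        // range before it is no longer known to be fully cached.
        entries[i].continuous = false;
    }
}
```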

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-26 14:58:20 +01:00
Raphael S. Carvalho
bc2d351c25 sstables: remove duplicated declaration of remove_by_toc_name
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-10-26 11:21:27 -02:00
Takuya ASADA
7617adadf4 dist/ami/files/.bash_profile: fix confusing message when running AMI on unsupported instance type
To describe which instance types are supported, show the documentation URL
instead of a confusing message.

Fixes #1646

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1477473336-25373-1-git-send-email-syuu@scylladb.com>
2016-10-26 12:48:51 +03:00
Avi Kivity
7faf2eed2f build: support for linking statically with boost
Remove assumptions in the build system about dynamically linked boost unit
tests.  Includes seastar update which would have otherwise broken the
build.
2016-10-26 08:51:21 +03:00
Piotr Jastrzebski
27726cecff Clean up position_in_partition.
Introduce position_in_partition_view and use it in
position() method in mutation_fragment, range_tombstone,
static_row and clustering_row.
Clean up comparators in position_in_partition.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <c65293c71a6aa23cf930ed317fb63df1fdc34fd1.1477399763.git.piotr@scylladb.com>
2016-10-25 15:13:20 +01:00
Tomasz Grabiec
cbaae2bf7f Merge seastar upstream
* seastar e18205b...3777135 (1):
  > rpc: Do not close client connection on error response for a timed out request

Fixes #1778
2016-10-25 13:59:41 +02:00
Raphael S. Carvalho
975ce62dbc sstables: do not swallow exception when reading TOC
That caused problems when refreshing a sstable with bad permissions.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <48e5322c53234209e55da05c64c99b8ec4e190a3.1477372974.git.raphaelsc@scylladb.com>
2016-10-25 12:21:32 +03:00
Avi Kivity
ddd4dbf928 Update scylla-ami submodule
* dist/ami/files/scylla-ami e1e3919...61ff5c6 (1):
  > scylla_ami_setup: run posix_net_conf.sh when NCPUS < 8
2016-10-25 11:18:58 +03:00
Avi Kivity
4b55a687b6 Merge seastar upstream
* seastar 98b5a2d...e18205b (1):
  > json::formatter: Add formatters for maps + rudimentary test
2016-10-25 11:17:29 +03:00
Avi Kivity
e8edaaf6a4 Merge seastar upstream
* seastar 69acec1...98b5a2d (9):
  > rpc: Silence warning about ignored failed future
  > future: prioritise continuations that can run immediately
  > iotune: relax aio restrictions
  > build: support for static linking with boost
  > rpc: Fix crash during connection teardown
  > rpc: Move _connected flag to protocol::connection
  > rpc test: fail test if exception is thrown during test execution
  > rpc: do not assume underling semaphore type
  > rpc: fix default resource limit
2016-10-25 11:09:40 +03:00
Avi Kivity
fc8210a875 tests: fix tests with boost 1.60
In boost 1.60, the executable's command-line arguments are expected to
be separated from the boost command-line arguments by '--'.  Detect
this requirement and comply with it.
Message-Id: <1477212424-3831-1-git-send-email-avi@scylladb.com>
2016-10-24 09:36:56 +02:00
Avi Kivity
37f112b610 dist: add python3-yaml to ubuntu dependencies for blocktune 2016-10-23 16:42:13 +03:00
Avi Kivity
7d50d6df9b blocktune: fix syntax error in exception handling 2016-10-23 16:40:00 +03:00
Avi Kivity
e261a380a9 dist: add PyYAML dependency to rpm (for blocktune) 2016-10-23 10:36:29 +03:00
Raphael S. Carvalho
fa308c079c database: fix collectd metrics for clustering key filter
Same instance name was used for exported metrics, which is
definitely wrong. Checked it works properly now via collectd
exporter.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <471a36706113af60aeba86fb56a365feb4dab31a.1477086706.git.raphaelsc@scylladb.com>
2016-10-22 09:51:18 +03:00
Glauber Costa
a13c410749 commitlog: cycle based on total size, not on mutation size
We calculate two sizes during the allocation: "size", which is the
in-segment size of this mutation, and "s", which is that plus the
overhead. cycle() must be called with the latter, not the former, as
doing otherwise may lead to buffer overflows.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <ccf346d8d0ebb44a1ba9fd069653bab0d7be0a61.1477063157.git.glauber@scylladb.com>
2016-10-21 18:57:41 +03:00
Glauber Costa
d9875784a1 commitlog: do not wait on pending operations for batch mode
This was explicitly mentioned in my set as gone in one of the versions.
Somehow it came back in the final version - sorry about that.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <2a0eba28cd74267d1a1fdcf1aef2901cc74ffc9f.1477059963.git.glauber@scylladb.com>
2016-10-21 17:27:16 +03:00
Vlad Zolotarov
f75a350a8f service::storage_proxy: use global_trace_state_ptr when using invoke_on
When trace_state may migrate to a different shard a global_trace_state_ptr
has to be used.

This patch completes the patch below:

commit 7e180c7bd3
Author: Vlad Zolotarov <vladz@cloudius-systems.com>
Date:   Tue Sep 20 19:09:27 2016 +0300

    tracing: introduce the tracing::global_trace_state_ptr class

Fixes #1770

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1476993537-27388-1-git-send-email-vladz@cloudius-systems.com>
2016-10-21 11:34:13 +03:00
Avi Kivity
e3ae54f0fe Merge "Rework commitlog to avoid timeouts" from Glauber
"This patchset reworks the commitlog logic to better handle conditions in
which we are getting requests faster than the disk can handle. It does
this by building a wall around the commitlog and only allowing
allocations to proceed when we are under the desired memory threshold.

The main advantage of that is that we can now easily set the commitlog
to work at disk speed, more or less allowing a "one byte in for each
byte out" approach instead of depending on the current cycle to finish.
As a result, max latencies are greatly reduced.

Testing Results
===============

To test this, I ran a workload that times out frequently. That
workload uses 10 threads to write 100 partitions (to isolate from the
effects of the memtable introduced latencies) in a loop and each
partition is 2MB in size.

After 10 minutes running this load, we are left with the following
percentiles:

latency mean              : 51.9 [WRITE:51.9]
latency median            : 9.8 [WRITE:9.8]
latency 95th percentile   : 125.6 [WRITE:125.6]
latency 99th percentile   : 1184.0 [WRITE:1184.0]
latency 99.9th percentile : 1991.2 [WRITE:1991.2]
latency max               : 2338.2 [WRITE:2338.2]

After this patch:

latency mean              : 54.9 [WRITE:54.9]
latency median            : 43.5 [WRITE:43.5]
latency 95th percentile   : 126.9 [WRITE:126.9]
latency 99th percentile   : 253.9 [WRITE:253.9]
latency 99.9th percentile : 364.6 [WRITE:364.6]
latency max               : 471.4 [WRITE:471.4]

I have run this with larger sizes as well, and it generally performs
much better than the baseline version. For sizes up to 5MB, I have seen
no timeouts in my setup. After that, I see some timeouts. Buffer
splitting is expected to make this better.

Aside from performance testing, this was also tested with batch and
periodic mode for various requests sizes."
2016-10-20 16:44:39 +03:00
Glauber Costa
d5618c6ace commitlog: add total_operations type for requests_blocked_memory
Current tracker for pending allocations is a queue_size GAUGE.  Add a
total_operations version so we have more insight on what's going on.

It will be called requests_blocked_memory for consistency with other
subsystems that track similar things.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-20 09:25:38 -04:00
Avi Kivity
db2f5e6be1 blocktune: wire up blocktune on startup
Message-Id: <1476357027-15014-3-git-send-email-avi@scylladb.com>
2016-10-20 13:24:05 +03:00
Avi Kivity
098d02ad1a scylla-blocktune: introduce
scylla-blocktune is a script that parses scylla.yaml and tunes the data file
and commitlog directories it references.

Tuning includes:
 - set the I/O scheduler to noop
 - disable merging
 - tune dependent devices (like RAID members)

Message-Id: <1476357027-15014-2-git-send-email-avi@scylladb.com>
2016-10-20 13:24:05 +03:00
Avi Kivity
fad34eef6c scylla_raid_setup: don't mess with read-ahead
It doesn't affect O_DIRECT reads, and it's not persistent.

Message-Id: <1476269082-2473-2-git-send-email-avi@scylladb.com>
2016-10-20 13:23:38 +03:00
Avi Kivity
a837da06ef scylla_raid_setup: increase chunk size
The current chunk size of 256 gives a 50% probability of a 128k read or
write getting split into two accesses.  This reduces efficiency and
increases latency.

Change the chunk size to 1MB, with a 12% probability of cross-member
access.
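The percentages above follow from simple arithmetic; a minimal sketch of the model (my approximation, not code from the patch):

```cpp
#include <cassert>

// A request starting at a uniformly random offset within the array
// straddles a chunk boundary with probability roughly
// request_size / chunk_size.
double split_probability(double request_kib, double chunk_kib) {
    return request_kib / chunk_kib;
}
```

With a 128 KiB access, a 256 KiB chunk gives the quoted 50%, and a 1 MiB chunk gives 12.5%, matching the ~12% figure.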

Message-Id: <1476269082-2473-1-git-send-email-avi@scylladb.com>
2016-10-20 13:23:38 +03:00
Takuya ASADA
80e3d8286c dist/ami: fix incorrect /etc/fstab entry on CentOS7 base image
There was an incorrect rootfs entry in /etc/fstab:
 /dev/sda1 / xfs defaults,noatime 1 1
This causes a boot error when updating to a new kernel.
(see:
https://github.com/scylladb/scylla/issues/1597#issuecomment-250243187)

So the entry was replaced with
 UUID=<uuid>  / xfs defaults,noatime 1 1
Also, all recent security updates were applied.

Fixes #1597
Fixes #1707

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475094957-9464-1-git-send-email-syuu@scylladb.com>
2016-10-20 11:48:24 +03:00
Takuya ASADA
5f602752a5 dist/ubuntu: backport g++-5 from Debian 9(stretch) to Debian 8(jessie)
Since Debian 8 (jessie) does not provide g++-5, we frequently got compile
errors because we were using an older compiler.
To fix the problem, backport g++-5 from Debian 9 (stretch).

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1476694318-10640-3-git-send-email-syuu@scylladb.com>
2016-10-20 11:41:02 +03:00
Takuya ASADA
7d67504b56 dist/ubuntu: use VERSION_ID from /etc/os-release instead of 'lsb_release -r'
On Debian, lsb_release -r returns a version number like '8.6'.
However, on this script we want to check major version only.
Therefore we can use VERSION_ID from /etc/os-release which only contains
major version number.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1476694318-10640-2-git-send-email-syuu@scylladb.com>
2016-10-20 11:41:02 +03:00
Avi Kivity
0da2f64cfb Merge seastar upstream
* seastar ccd8649...69acec1 (2):
  > app/iotune: add --smp option
  > rpc: Add missing adjustment of snd_buf::size

Fixes #1767.
Fixes #1768.
2016-10-20 11:16:40 +03:00
Paweł Dziepak
210a390892 tests: add missing sstable for partition skipping test
Commit 7dcd70124a "tests/sstables: add
test for fast forwarding reader" added a test for skipping parts of
sstable. Unfortunately, it did not include the sstables it was trying to
read.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 23:23:49 +01:00
Glauber Costa
1578d7363a commitlog: rework blocking logic
The current incarnation of commitlog establishes a maximum amount of
writes that can be in-flight, and blocks new requests after that limit
is reached.

That is obviously something we must do, but the current approach to it
is problematic for two main reasons:

1) It forces the requests that trigger a write to wait on the current
   write to finish. That is excessive; ideally we would wait for one
   particular write to finish, not necessarily the current one. That
   is made worse by the fact that when a write is followed by a flush
   (happens when we move to a new segment), then we must wait for
   *all* writes in that segment to finish.

2) It casts concurrency in terms of writes instead of memory, which
   makes the aforementioned problem a lot worse: if we have very big
   buffers in flight and we must wait for them to finish, that can
   take a long time, often in the order of seconds, causing timeouts.

The approach taken by this patch is to replace the _write_semaphore
with a request_controller. This data structure will account for the amount
of memory used by the buffers and set a limit on it. New allocations
will be held until we go below that limit, and will be released
as soon as this happens.

This guarantees that the latencies introduced by this mechanism are
spread out a lot better among requests and will keep higher percentile
latencies in check.
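The admission scheme can be modeled, very loosely, as a byte-counting semaphore (a single-threaded toy with illustrative names; the real request_controller is built on seastar primitives and futures):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <functional>
#include <utility>

// Admission is measured in bytes of buffer memory rather than number of
// in-flight writes, so releasing *any* buffer can admit waiters — the
// wait is spread across requests instead of piling up behind one cycle.
class request_controller {
    std::size_t _free;
    std::deque<std::pair<std::size_t, std::function<void()>>> _waiters;
public:
    explicit request_controller(std::size_t limit) : _free(limit) {}

    // Run `task` once `bytes` of budget are available (FIFO order).
    void admit(std::size_t bytes, std::function<void()> task) {
        if (_free >= bytes && _waiters.empty()) {
            _free -= bytes;
            task();
        } else {
            _waiters.emplace_back(bytes, std::move(task));
        }
    }

    // Return budget; wake waiters as soon as enough memory is back.
    void release(std::size_t bytes) {
        _free += bytes;
        while (!_waiters.empty() && _waiters.front().first <= _free) {
            auto [b, task] = std::move(_waiters.front());
            _waiters.pop_front();
            _free -= b;
            task();
        }
    }

    std::size_t free_bytes() const { return _free; }
};
```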

To test this, I ran a workload that times out frequently. That
workload uses 10 threads to write 100 partitions (to isolate from the
effects of the memtable introduced latencies) in a loop and each
partition is 2MB in size.

After 10 minutes running this load, we are left with the following
percentiles:

latency mean              : 51.9 [WRITE:51.9]
latency median            : 9.8 [WRITE:9.8]
latency 95th percentile   : 125.6 [WRITE:125.6]
latency 99th percentile   : 1184.0 [WRITE:1184.0]
latency 99.9th percentile : 1991.2 [WRITE:1991.2]
latency max               : 2338.2 [WRITE:2338.2]

After this patch:

latency mean              : 54.9 [WRITE:54.9]
latency median            : 43.5 [WRITE:43.5]
latency 95th percentile   : 126.9 [WRITE:126.9]
latency 99th percentile   : 253.9 [WRITE:253.9]
latency 99.9th percentile : 364.6 [WRITE:364.6]
latency max               : 471.4 [WRITE:471.4]

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:56:36 -04:00
Glauber Costa
aec724bbda commitlog: factor out code for checking mutation size
In a subsequent patch, I'll use this code in a different place. To
prepare for that, we move it out as a method. It also fits a lot better
inside the segment manager, so move it there.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:49:47 -04:00
Glauber Costa
a50996f376 commitlog: calculate segment-independent size of mutations
The goal is to calculate a size that is less than or equal to the
segment-dependent size.

This was originally written by Tomasz, and featured in his submission
"commitlog: Handle overload more gracefully"

Extracted here so it sits clearly in a different patch.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:49:47 -04:00
Glauber Costa
0b7c9fa17f commitlog: remove _needed_size
It is mostly an optimization, and while it makes sense in this context,
it soon won't be, as we'll stop waiting specifically for the current
cycle to finish.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:49:47 -04:00
Glauber Costa
6214bdeb66 commitlog: move segment_manager constructor outside the class definition
We'll do that so we can, in following patches, use static members from
the segment. Those are not defined at this point.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:49:47 -04:00
Glauber Costa
299877f432 commitlog: add a counter for pending allocations
We track the amount of pending allocations but we don't really export
it. It will be crucial when we stop tracking pending writes.

This patch exports it through a method instead of the totals structure,
so we can easily change it. Current code probing pending_allocations
(the api code) is also converted to use the public method instead of the
totals struct.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-19 13:49:47 -04:00
Avi Kivity
07c995ab3d Merge "Fast forward mutation readers" from Paweł
"This patchset enables mutation readers to be fast forwarded to a different
partition range. The main reason for introducing such feature are range
queries served from cache. If the cache is partially populated in the
requested range the reader will end up with multiple subranges that have
to be read from the sstables. Originally, each of these subranges would
require a new reader to be created, but with fast forwarding we can have
just one sstable reader. This is better since there is a chance that buffers
kept by the reader may be still useful after fast forwarding it.

In this series there are also patches that clean up cache readers in order
to make integration with fast forwarding easier. Namely, continuity flag is
changed to store information about range before the entry which significantly
simplifies the logic.

Fixes #1299."

* 'pdziepak/fast-forward-mutation-readers/v5' of github.com:cloudius-systems/seastar-dev: (24 commits)
  sstables: keep separate stream history for single and range reads
  sstables: drop sstable::{lower, upper}_bound()
  row_cache: rework cache to use fast forwarding reader
  row_cache: put cache entry flags in a struct
  row_cache: add do_find_or_create_entry() to reduce code duplication
  mutation_reader: forward fast_forward_to() calls
  tests/row_cache: add fast_forward_to() to throttled reader
  tests/row_cache: count mutations read from _underlying
  memtable: add support for fast_forward_to()
  drop key readers
  tests/mutation_reader: test fast forwarding combined reader
  database: enable fast forwarding of range_sstable_reader
  combined_mutation_reader: implement fast_forward_to()
  mutation_reader: make combined_reader public
  tests/sstables: add test for fast forwarding reader
  tests: add more helpers to mutation reader assertions
  sstables: enable fast forwarding for range readers
  mutation_reader: introduce fast_forward_to()
  sstables: implement mutation_reader::impl::fast_forward_to()
  sstables: introduce index_reader
  ...
2016-10-19 18:10:44 +03:00
Paweł Dziepak
ab0eeae82d sstables: keep separate stream history for single and range reads
Single partition and partition range reads are expected to behave
considerably differently, so it is worth having them use separate file
stream histories. This also makes reads use a different history for each
sstable, which is also a good thing.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
20bfa1fa52 sstables: drop sstable::{lower, upper}_bound()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
5ff699e09f row_cache: rework cache to use fast forwarding reader
This uncomfortably large patch overhauls cache range reader so that it
can take advantage of fast forwarding mutation readers.

A significant change in the cache itself is that the continuity flag now
is used to determine whether cache is contiguous between the previous
entry and the current one. This allows for a significant simplification
of the cache code and easier integration with reader fast forwarding.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
18acb0c0e6 row_cache: put cache entry flags in a struct
Flags are easier to manage if they are in a single structure.
In particular, default initialization and move constructors are simpler
and less error prone.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
f248e23db5 row_cache: add do_find_or_create_entry() to reduce code duplication
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
bcd374c05d mutation_reader: forward fast_forward_to() calls
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
0c24bbe639 tests/row_cache: add fast_forward_to() to throttled reader
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
69645455f3 tests/row_cache: count mutations read from _underlying
Originally, cache tests checked how many times a mutation reader was
created from the underlying mutation source to determine whether the
continuity flag is working correctly.

This is not going to work with fast forwarding mutation readers, so the
test is switched to counting the number of mutations (+ end of stream
markers) returned from the underlying mutation readers, which is much
less fragile.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
e14f8027d5 memtable: add support for fast_forward_to()
Fast forwarding of memtable readers is needed only for unit tests which
often use memtables as underlying data source for cache and the cache
readers.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
6755a679f6 drop key readers
key_readers weren't used since introduction of continuity flag to cache
entries.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
5ac9babe97 tests/mutation_reader: test fast forwarding combined reader
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
7bebfb851f database: enable fast forwarding of range_sstable_reader
When fast forwarding a reader that combines sstable readers we must also
remember that the set of sstables for the new range may be different
from that of the previous one. The reader introduced in this patch makes
sure that we read from the correct sstables.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
b7b7b2bd63 combined_mutation_reader: implement fast_forward_to()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
2c0cdd55fc mutation_reader: make combined_reader public
We want to be able to fast forward sstable readers. However, just
implementing fast_forward_to() for combined_reader is not enough as the
sstables we are reading from may need to change.

Following patches are going to introduce a combined sstable reader that
derives from combined_reader. To make that possible we first need to
make combined_reader public.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
7dcd70124a tests/sstables: add test for fast forwarding reader
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
5534dc2817 tests: add more helpers to mutation reader assertions
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
cf024975fe sstables: enable fast forwarding for range readers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
62c9492d33 mutation_reader: introduce fast_forward_to()
This patch introduces the interface for fast forwarding mutation
readers. The main user of this feature is going to be the cache, which, while
serving a range query, may need to read multiple small ranges from the
sstables to populate itself with the missing entries.

Fast forwarding is an alternative to recreating a reader with a different
range. Its main advantage is that it avoids dropping data that has
already been read.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
c63e88d556 sstables: implement mutation_reader::impl::fast_forward_to()
This patch allows sstable readers to be fast forwarded without making it
necessary to recreate the reader (and dropping all buffers in the
process). It is built on top of index_reader and the ability of
data_consume_context to be fast forwarded.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
a530762277 sstables: introduce index_reader
index_reader is a helper that implements index lookups. Its goal is to
avoid dropping read buffers if they may still be needed (for example to
get the end bound of the range or after fast forwarding the reader).

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
f49a9e0d64 sstables: drop unused read_range_rows() overload
That overload was used only by a unit test and violated the guarantee
that the partition range lives until the mutation reader is done.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
0bc873ace5 sstables: add fast_forward_to() to continuous_data_consumer
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
25b91c51e2 sstables: add data_consume_rows_context::reset()
reset() is going to be used to restore valid state after fast forwarding
the reader.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
2124d08b88 sstables: add skip() to compressed_file_data_source
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-10-19 15:29:08 +01:00
Paweł Dziepak
54069162f5 Merge "Add test for partition version list consistency after compaction" from Tomek 2016-10-18 11:03:25 +01:00
Tomasz Grabiec
308434f891 tests: memtable: Add test for partition version list consistency after compaction 2016-10-18 11:57:14 +02:00
Tomasz Grabiec
6548132423 lsa: Make logalloc::tracker::full_compaction() compact all reclaimable regions
is_compactible() will pass on very small regions. full_compaction() is
only used in tests to force objects to be moved due to compaction, so
we want all reclaimable regions to be compacted.
2016-10-18 11:16:08 +02:00
Tomasz Grabiec
ecf85cbffb mutation: Define + operation
It's more convenient to write m1 + m2 in tests than to do more
elaborate constructs with copy constructors and apply().
2016-10-18 11:16:08 +02:00
Tomasz Grabiec
fe387f8ba0 partition_version: Fix corruption of partition_version list
The move constructor of partition_version was not invoking move
constructor of anchorless_list_base_hook. As a result, when
partition_version objects were moved, e.g. during LSA compaction, they
were unlinked from their lists.

This can make readers return invalid data, because not all versions
will be reachable.

It also causes leaks of versions which are not directly attached
to the memtable entry. This will trigger an assertion failure in the LSA
region destructor. This assertion triggers with row cache disabled. With cache
enabled (default) all segments are merged into the cache region, which
currently is not destroyed on shutdown, so this problem would go
unnoticed. With cache disabled, memtable region is destroyed after
memtable is flushed and after all readers stop using that memtable.

Fixes #1753.
Message-Id: <1476778472-5711-1-git-send-email-tgrabiec@scylladb.com>
2016-10-18 09:25:38 +01:00
Duarte Nunes
1d45f19c78 create_view_statement: Use cf_properties
This patch uses cf_properties instead to add the missing attributes to
the create_view_statement class.

Fixes #1766

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-18 01:18:52 +00:00
Duarte Nunes
7c58b7e764 unimplemented: Add materialized views
This patch adds the VIEWS element to the cause enum so we can
mark failures due to incomplete support of materialized views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-18 01:18:52 +00:00
Duarte Nunes
7c28ed3dfc schema: Extract default compressor
This patch extracts the definition of the default compressor into the
compression_parameters class, so that the table and view creation
statements don't have to explicitly deal with it.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-18 01:18:52 +00:00
Duarte Nunes
dc470e6a36 cql3: Extract cf_properties
This patch extracts the cf_properties class, which contains common
attributes of tables and materialized views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-18 01:18:51 +00:00
Takuya ASADA
587d375e19 main: exit with 1 when verify_seastar_io_scheduler() failed
Since we exit the Scylla process in engine().at_exit() using
::_exit(0), scylla always exits with 0 even when
verify_seastar_io_scheduler() throws an exception.

Because of this, systemd mistakenly concludes that scylla-server.service
shut down successfully, so we need to pass the correct exit code to ::_exit() here.

Fixes #1674

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475065607-15486-1-git-send-email-syuu@scylladb.com>
2016-10-17 13:57:00 +03:00
Avi Kivity
163088c6af Merge seastar upstream
* seastar 207bf3d...ccd8649 (3):
  > Merge "Augment semaphore with non-blocking operations" from Glauber
  > Merge "More dynamic fstream patches" from Paweł
  > Merge "fstream: add dynamic adjustments based on stream history" from Paweł
2016-10-17 12:49:17 +03:00
Avi Kivity
65c27ccf21 bytes_ostream: make max_chunk_size() an inline function
Fixes debug build looking for a variable definition and not finding it.
2016-10-17 11:49:33 +03:00
Avi Kivity
c0a1ad0b77 bytes_ostream: use larger allocations
A 1MB response will require 2000 allocations with the current 512-byte
chunk size.  Increase it exponentially to reduce allocation count for
larger responses (still respecting the upper limit).
Message-Id: <1476369152-1245-1-git-send-email-avi@scylladb.com>
2016-10-16 10:05:48 +01:00
Tomasz Grabiec
d836e8f64b tests: memtable: Add tests for flushing reader
Message-Id: <1476454187-11462-1-git-send-email-tgrabiec@scylladb.com>
2016-10-14 15:11:06 +01:00
Tomasz Grabiec
63784fd921 db: Fix corruption of partition_entry
Memory accounting code was attaching partition_snapshot to
partition_entry in order to calculate the size of partition_version
object. However, it is only allowed if partition_entry doesn't have
any snapshot attached already. In this case it always has one, created
by the flushing reader.

Change the accounting code to reuse existing partition_snapshot reference.

Fixes #1746
Message-Id: <1476449160-9252-1-git-send-email-tgrabiec@scylladb.com>
2016-10-14 15:10:48 +01:00
Paweł Dziepak
d08cffd3c7 lsa: avoid exceptions during segment_zone creation
LSA tries to allocate zones as large as possible (while still leaving
enough free space for the standard allocator). It uses the amount of
free memory in order to guess how much it can get, but that obviously
doesn't account for fragmentation and the allocation attempt may fail.

This patch changes the LSA code so that it doesn't throw when a zone
couldn't be created but just returns a null pointer, which should be
more performant when LSA memory cannot grow any more.

Fixes #1394.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1476435031-5601-1-git-send-email-pdziepak@scylladb.com>
2016-10-14 11:08:24 +02:00
Amnon Heiman
7829da13b4 scylla_setup: Reorder questions and actions
The expected behaviour in the scylla_setup script is that a question
will be followed by the answer.

For example, after asking if the scylla should be run as a service the
relevant actions will be taken before the following question.

This patch address two such mis-orders:
1. the scylla-housekeeping depends on the scylla-server, but the
setup should first setup the scylla-server service and only then ask
(and install if needed) the scylla-housekeeping.
2. The node_exporter should be placed after the io_setup is done.

Fixes #1739

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1476370098-25617-1-git-send-email-amnon@scylladb.com>
2016-10-13 18:29:36 +03:00
Pekka Enberg
3b4e6cdc5e abstract_replication_strategy: Fix exception type if class not found
Change abstract_replication_strategy::create_replication_strategy() to
throw exceptions::configuration_error if the replication strategy class
lookup fails, to make sure the error is converted to the correct CQL response.

Fixes #1755

Message-Id: <1476361262-28723-1-git-send-email-penberg@scylladb.com>
2016-10-13 17:39:28 +03:00
Tomasz Grabiec
e617bcd8a7 logalloc: disable abort on allocation failure in places in which it is benign
Some places start big expecting allocation failure, then reduce the
requested size. Let's not abort in such cases.

Message-Id: <1476295120-32047-1-git-send-email-tgrabiec@scylladb.com>
2016-10-13 10:53:32 +03:00
Avi Kivity
13e9d4c8e3 Merge seastar upstream
* seastar f937fb0...207bf3d (11):
  > Merge "iotune: gracefully exit on predictable exceptions" (Fixes #1623)
  > core/semaphore: Add semaphore_units::release()
  > Merge "prometheus API with grafana uses labels" from Amnon
  > core/thread: Fix stack alloc-dealloc mismatch
  > core/thread: Make jmp_buf_link::yield_at use the same time point as thread_scheduling_group
  > file: support for XFS on older kernels
  > reactor: fix bug when handling EBADF in flush_pending_aio()
  > prometheus CPU should start in 0
  > Collectd: bytes ordering depends on the type
  > tests: Check that backtrace() doesn't corrupt signal mask
  > core/thread: Add stack guards to seastar thread stacks
2016-10-12 23:47:12 +03:00
Avi Kivity
63f053e9b7 storage_proxy: fix mutation reordering with wrapping ranges
If we have a range query involving a wrapping range (i.e., from thrift),
and mutations from both halves of the result are involved, then
we will return the results in the wrong order (and potentially the wrong
partitions) since we order by token, so the results from the second half
of the wrapping range end up before the first.

Fix by splitting the two queries, and merging the second half with lower
priority compared to the first half.

Note: this will be fixed in a better way once we have the sharding iterator,
as then we can query sequentially.

Fixes #1761.
Message-Id: <1476262693-30162-1-git-send-email-avi@scylladb.com>
2016-10-12 15:59:16 +02:00
Avi Kivity
1506b06617 Merge "node_exporter service on ubuntu 16" from Amnon
"This series addresses two issues that interfere with running the node_exporter as a service in ubuntu 16.
1. The service file should be packed in the deb file
2. When setting the node_exporter as a service it doesn't need to run as the scylla user"

* 'amnon/node_exporter_ubuntu_v2' of github.com:cloudius-systems/seastar-dev:
  node-exporter service: No need to run as scylla user
  debian package: Include the node_exporter service file
2016-10-12 12:11:18 +03:00
Amnon Heiman
1bd50789e0 node-exporter service: No need to run as scylla user
The node-exporter does not need to run as the scylla user. It can run
without scylla or without the scylla user being configured.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-10-11 12:44:27 +03:00
Amnon Heiman
d523bf56ed debian package: Include the node_exporter service file
This will include the node_exporter service script for ubuntu
distribution with systemd support.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-10-11 12:44:14 +03:00
Avi Kivity
f6998bb260 Merge "Implement describe_splits_ex based on Cassandra" from Duarte
"This patch-set re-implements the describe_splits_ex() verb to more closely
follow Cassandra's implementation, on which some clients rely.

Ref #1139
Ref #693"

* 'describe-splits/v2' of github.com:duarten/scylla:
  thrift: Implement describe_splits_ex based on Cassandra
  storage_service: Implement get_splits() function
  sstables: Add function to get key samples
  sstables/key: Add to_partition_key function
  size_estimates_recorder: Increase estimate accuracy
  sstables: Get estimates for a particular range
  sstables/key: Make key::kind public
2016-10-11 11:13:35 +03:00
Takuya ASADA
0007f2d838 dist/common/sbin: add scylla_cpuset_setup and scylla_dev_mode_setup to /usr/sbin
We haven't added symlinks to /usr/sbin for newly created scripts, so add them.

Fixes #1702

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1474879711-31793-1-git-send-email-syuu@scylladb.com>
2016-10-11 11:02:14 +03:00
Takuya ASADA
ccad720bb1 dist/common/script/scylla_io_setup: handle comma correctly when parsing cpuset
The script mistakenly split the value at "," when the cpuset list is
separated by commas. Instead of matching possible patterns of the argument,
let's accept all characters until we reach a space delimiter or end of line.

Fixes #1716

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1476171037-32373-1-git-send-email-syuu@scylladb.com>
2016-10-11 10:42:32 +03:00
Duarte Nunes
d8cfc56376 thrift: Implement describe_splits_ex based on Cassandra
This patch re-implements the describe_splits_ex() verb to more closely
follow Cassandra's implementation, on which some clients rely.

Ref #1139
Ref #693

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 22:32:10 +02:00
Duarte Nunes
01ab2081cd storage_service: Implement get_splits() function
This patch implements the get_splits() function in storage_service,
used to split a particular token range in slices of approximately the
specified size, using the sample keys and estimates of the CF's
sstables.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 22:32:08 +02:00
Duarte Nunes
c36dbaf0f1 sstables: Add function to get key samples
This patch implements the get_key_samples() function, on which a
future patch will base an implementation of the describe_splits()
thrift verb closer to Cassandra's.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 19:50:14 +02:00
Duarte Nunes
fc07b66678 sstables/key: Add to_partition_key function
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 19:50:11 +02:00
Duarte Nunes
c19c633299 size_estimates_recorder: Increase estimate accuracy
This patch uses the estimated_keys_for_range() function to get better
estimates.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 17:52:16 +02:00
Duarte Nunes
ceed09b23e sstables: Get estimates for a particular range
This patch adds the estimated_keys_for_range() function, which
estimates the number of keys present within the specified range.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 17:52:15 +02:00
Duarte Nunes
8c223b31c8 sstables/key: Make key::kind public
Needed to create synthetic keys without any value but with ordering
properties.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-10-10 17:47:24 +02:00
Avi Kivity
b305d92a65 Merge "housekeeping: check version during setup" from Amnon
"The version is taken from the installation rather than the API, a mode
command-line flag indicates that this is part of the setup, and the uuid is
used for the interaction with the checkversion server."

* 'amnon/check_version_on_startup_v3' of github.com:cloudius-systems/seastar-dev:
  scylla_setup: Check and report the scylla version
  scylla-housekeeping: check version during setup
2016-10-10 16:37:14 +03:00
Vlad Zolotarov
ab748e829d docs: tracing.md: initial commit
Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1475686745-20383-1-git-send-email-vladz@cloudius-systems.com>
2016-10-10 16:12:02 +03:00
Tomasz Grabiec
4357d0a6d9 db: Add counter for writes blocked on dirty memory
There is already queue_length-requests_blocked_memory, but it's a
gauge, so it does not reflect what happened between the sampling points.

total_operations-requests_blocked_memory will allow us to see if there
were any (and how many) requests blocked by dirty memory.

Message-Id: <1476098616-12682-1-git-send-email-tgrabiec@scylladb.com>
2016-10-10 14:25:22 +03:00
Pekka Enberg
3b75ff1496 docs/docker: Tag --listen-address as 1.4 feature
The Docker Hub documentation is the same for all image versions. Tag
`--listen-address` as 1.4 feature.

Message-Id: <1475819164-7865-1-git-send-email-penberg@scylladb.com>
2016-10-10 13:26:16 +03:00
Vlad Zolotarov
006999f46c api::storage_service::slow_query: don't use duration_cast in GET
The slow_query_record_ttl() and slow_query_threshold() already return a
duration of the appropriate type - no need for an additional cast.
In addition, there was a mistake in the cast of ttl.

Fixes #1734

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1475669400-5925-1-git-send-email-vladz@cloudius-systems.com>
2016-10-09 18:09:13 +03:00
Takuya ASADA
469e9af1f4 dist/common/scripts/scylla_setup: use 'swapon -s' instead of 'swapon --show'
Since Ubuntu 14.04 doesn't support the --show option, we need to avoid using it.
Fixes #1740

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475788340-22939-2-git-send-email-syuu@scylladb.com>
2016-10-09 18:05:14 +03:00
Takuya ASADA
8452045b85 dist/ubuntu: add realpath to dependency, requires for scylla_setup
We need a dependency on realpath, since scylla_setup uses it.

Fixes #1740.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475788340-22939-1-git-send-email-syuu@scylladb.com>
2016-10-09 18:05:14 +03:00
Tomasz Grabiec
41e66ebce2 gdb: Introduce 'scylla heapprof'
Presents current heap profile recording.

Works in text mode or dumps to collapsed stacks format from which
flame graph can be generated.

To generate a flamegraph:

  (gdb) scylla heapprof --flame
  Wrote heapprof.stacks

  $ flamegraph.pl --colors mem < heapprof.stacks > heapprof.svg

flamegraph.pl comes from:

  https://github.com/brendangregg/FlameGraph.git

Text mode example:

(gdb) scylla heapprof --min 100000000
All (274699676, #10213)
 \-- void* memory::cpu_pages::allocate_large_and_trim<memory::cpu_pages::allocate_large_aligned(unsigned int, unsigned int)::{lambda(unsigned int, unsigned int)#1}>(unsigned int, memory::cpu_pages::allocate_large_aligned(unsigned int, unsigned int)::{lambda(unsigned int, unsigned int)#1}) + 169  (268435456, #1)
     memory::allocate_large_aligned(unsigned long, unsigned long) + 87
     memory::allocate_aligned(unsigned long, unsigned long) + 48
     aligned_alloc + 9
     logalloc::segment_zone::segment_zone() + 304
     logalloc::segment_pool::allocate_segment() + 477
     logalloc::segment_pool::segment_pool() + 304
     __tls_init.part.801 + 72
     logalloc::region_group::release_requests() + 1333
     logalloc::region_group::add(logalloc::region_group*) + 514

The branches are formatted like this:

   -- <symbol> (<size>, #<count>)

Where <size> is total size of live objects and <count> is total
number of live objects, for all objects allocated from paths going
through this node.

Nodes which share the same <size> and <count> are stacked like this:

   -- <symbol_1> (<size>, #<count>)
      <symbol_2>
      <symbol_3>

Message-Id: <1475583334-19524-1-git-send-email-tgrabiec@scylladb.com>
2016-10-09 10:54:08 +03:00
Glauber Costa
33e9c2bbdd memtable: reduce sstable flush concurrency to one
Limiting the concurrency of memtable flushes to 4 was a temporary
workaround for the fact that we lacked good write behind support. Now
that write behind is properly merged we can reduce the concurrency to
what it should be, one.

This means that memtable flushes will now be serialized, and only when
one of them ends will the next one begin. Disk parallelism is obtained
through the write-behind mechanism.

Fixes #1373

Signed-off-by: Glauber Costa <glauber@scylladb.com>

Message-Id: <528f9ef928b5101bed952df600eb8555c275497a.1475881100.git.glauber@scylladb.com>
2016-10-09 10:48:57 +03:00
Tomasz Grabiec
2a5a90f391 db: Do not timeout streaming readers
There is a limit to concurrency of sstable readers on each shard. When
this limit is exhausted (currently 100 readers) readers queue. There
is a timeout after which queued readers are failed, equal to
read_request_timeout_in_ms (5s by default). The reason we have the
timeout here is primarily because the readers created for the purpose
of serving a CQL request no longer need to execute after waiting
longer than read_request_timeout_in_ms. The coordinator no longer
waits for the result so there is no point in proceeding with the read.

This timeout should not apply for readers created for streaming. The
streaming client currently times out after 10 minutes, so we could
wait at least that long. Timing out sooner makes streaming unreliable,
which under high load may prevent streaming from completing.

The change sets no timeout for streaming readers at replica level,
similarly as we do for system tables readers.

Fixes #1741.

Message-Id: <1475840678-25606-1-git-send-email-tgrabiec@scylladb.com>
2016-10-07 15:41:04 +03:00
Raphael S. Carvalho
9175977a9d cql3: fix build failure by defining out unused function
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <cba6207278ea945ee750d78b189320443843a288.1475793747.git.raphaelsc@scylladb.com>
2016-10-07 08:45:18 +03:00
Avi Kivity
9ac441d3b5 range: adjust split_after to allow split_point outside input range
Make split_after() more generic by allowing split_point to be anywhere,
not just within the input range.  If the split_point is before, the entire
range is returned; and if it is after, stdx::nullopt is returned.

"before" and "after" are not well defined for wrap-around ranges,
but we are phasing those out and soon there will be no
wrapping_range::split_after() users.

This is a prerequisite for converting partition_range and friends to
nonwrapping_range.
Message-Id: <1475765099-10657-1-git-send-email-avi@scylladb.com>
2016-10-06 17:54:44 +02:00
Raphael S. Carvalho
7ea4513595 database: trigger compaction after loading new sstables
Scylla wasn't trying to compact new sstables uploaded via 'nodetool
refresh'. Thus, all new sstables were left uncompacted until the user
issued 'nodetool flush' or a new sstable was written, which would
trigger compaction too.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <bbdf274c8bb49f4bedeefcb85da78a6fb61a1232.1475535203.git.raphaelsc@scylladb.com>
2016-10-06 18:26:49 +03:00
Raphael S. Carvalho
9c59ccc52a storage_service: improve log message for refresh
'No new SSTables were found for keyspace1.standard1' was printed
even if the user uploaded new sstables to the upload dir, which is
confusing. We should instead print that message only if new sstables
weren't found in either the cf or the cf/upload dirs.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <90386f6255407697434213227ae7ff0de7464f99.1475535203.git.raphaelsc@scylladb.com>
2016-10-06 18:26:32 +03:00
Raphael S. Carvalho
76862d0d9c main: start compaction procedure after commit log is replayed
Commit log replay is a synchronous operation in bootstrap, so services
will only be started after it's completed. Starting compaction before that
leaves less bandwidth for both, and consequently boot is slowed down. The
fix simply moves compaction, which is an asynchronous operation, to after
commitlog replay is over.

Fixes #1620.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <d2a173a4ee4d474317b970c6b39530e61067fea9.1475527955.git.raphaelsc@scylladb.com>
2016-10-06 18:25:24 +03:00
Nadav Har'El
ee7ec10b11 CQL parser: "CREATE MATERIALIZED VIEW" statement
This patch adds the parsing for the "CREATE MATERIALIZED VIEW" statement,
following Cassandra 3 syntax. For example:

   CREATE MATERIALIZED VIEW building_by_city
   AS SELECT * FROM buildings
   WHERE city IS NOT NULL
   PRIMARY KEY(city, name);

It also adds the "IS NOT NULL" operator needed for this purpose.
As in Cassandra, "IS NOT NULL" can only be used for materialized
view creation, and not in a normal SELECT. It can only be used with
the NULL operand (i.e., "IS NOT 3" will be a syntax error).

The current implementation of this statement just does some sanity
checks (such as verifying that "city" is a valid column name and that
the "buildings" base table exists), and then complains that materialized
views are not yet supported:

SyntaxException: <ErrorMessage code=2000 [Syntax error in CQL query] message="Failed parsing statement: [CREATE MATERIALIZED VIEW building_by_city AS
SELECT * FROM buildings
WHERE city IS NOT NULL
PRIMARY KEY(city, name);] reason: unsupported operation: Materialized views not yet supported">

As mentioned above, the "IS NOT NULL" restriction is not allowed in
ordinary selects that do not create a materialized view:

SELECT * FROM buildings WHERE city IS NOT NULL;
InvalidRequest: code=2200 [Invalid query] message="restriction 'city IS NOT null' is only supported in materialized view creation"

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1475742927-30695-1-git-send-email-nyh@scylladb.com>
2016-10-06 15:42:37 +03:00
Glauber Costa
7146776d7c fix sstable tests by not using the flush_reader if no region_group
The latest virtual dirty patches broke the SSTable tests. The reason for
this is that those tests will flush synthetic memtables that do not have
a region_group attached to them.

Normally in cases like this we would just give the flush_reader an empty
region group. However, the memtable class constructor takes a
region_group pointer and that can be null according to the interface.
So we must conditionally test it.

If there isn't a region_group involved, the virtual dirty accounting
should be disabled: after all, we won't even have the baseline memory
to begin with.

One of the approaches to fix this could be to just provide null
accounter classes to be used as a surrogate for the accounting classes
in this case. However, since this is mostly used for tests, a much
simpler way is to just revert back to the scanning reader in that case.

The scanning reader is similar enough to the flush_reader, except that
it can handle partial ranges, slices, and delegate accesses to an
sstable post-flush. We don't need any of that, but as argued above,
there is no need to remove it either.

Signed-off-by: Glauber Costa <glommer@scylladb.com>
Message-Id: <1475667271-60806-1-git-send-email-glommer@scylladb.com>
2016-10-05 12:44:21 +01:00
Avi Kivity
c94fb1bf12 build: reduce inclusions of messaging_service.hh
Remove inclusions from header files (primary offender is fb_utilities.hh)
and introduce new messaging_service_fwd.hh to reduce rebuilds when the
messaging service changes.

Message-Id: <1475584615-22836-1-git-send-email-avi@scylladb.com>
2016-10-05 11:46:49 +03:00
Avi Kivity
f8118d9fc2 Merge "Virtual dirty memory management" from Glauber
"Description:
============

Scylla currently suffers from a brick wall behavior of the request throttler.
Requests pile up until we reach the dirty memory limit, at which point we stop
serving them until we have freed enough memory to allow for more requests.

The problem is that freeing dirty memory means writing an SSTable to completion.
That can take a long time, even if we are blessed with great disks. Those long
waiting times can and will translate into timeouts. That is bad behavior.

What this patch does is introduce one form of virtual dirty memory accounting.
Instead of allowing 100 % of the dirty memory to be filled up until we stop
accepting requests, we will do that when we reach 50 % of memory. However,
instead of releasing requests only when an SSTable is fully written, we start
releasing them when some memory was written.

The practical effect of that, is that once we reach 50 % occupancy in our dirty
memory region, we will bring the system from CPU speed to disk speed, and will
start accepting requests only at the rate we are able to write memory back.

Results
=======

With this patchset running a load big enough to easily saturate the disk,
(commitlog disabled to highlight the effects of the memtable writer), I am able
to run scylla for many minutes, with timeouts occurring only when I run out of
disk space, whereas without this patch a swarm of timeouts would start merely 2
seconds after the load started - and would never get stable.

In V2, I have sent a set of graphs illustrating the performance of this solution.
This version does not have any significant differences in that front.

For details, please refer to
https://groups.google.com/d/msg/scylladb-dev/iCvD-3Z-QqY/EM8KUh_MAQAJ

Accuracy of the accounting:
---------------------------
It is important for us to be as accurate as possible when accounting freed
memory, since every byte we mark as freed may allow one or more requests to be
executed.  I have measured the accuracy of this approach (ignoring padding,
object size for the mutation fragments) to be 99.83 % of used memory in the
test workload I have run (large, 65k mutations). Memtables under this circumstance
tend to have a very high occupancy ratio because throttle breeds idle, and idle
breeds compact-on-idle.

Known Issues:
-------------

A lot of time can elapse between destroying the flush_reader and actually
releasing memory. The release of memory only happens when the SSTable is fully
sealed, and we have to flush the files, as well as finish writing all SSTable
components at this point. This happened in practice with a buggy kernel that
would result in flushes taking a long time.

After that is fixed, this is just a theoretical problem and in practice it
shouldn't matter given the time we expect those operations to take."

* 'virtual-dirty-v6' of github.com:glommer/scylla:
  database: allow virtual dirty memory management
  streamed_mutation: make _buffer private
  add accounting of memory read to partition_snapshot_reader
  move partition_snapshot_reader code to header file
  LSA: allow a group to query its own region group
  memtables: split scanning reader in two
  sstables: use special reader for writing a memtable
  LSA: export information about object memory footprint
  LSA: export information about size of the throttle queue
  database: export virtual dirty bytes region group
2016-10-04 20:57:52 +03:00
Avi Kivity
cc33c8b4ba Merge seastar upstream
* seastar 18f7bb8...f937fb0 (5):
  > Merge "Fix signal mask corruption" from Tomasz
  > core/memory: Avoid violating strict aliasing when accessing allocation sites
  > core/memory: Avoid indirection when storing allocation sites
  > core/memory: Add a way to disable abort on allocation failure in some scope
  > core/sharded: Allow mapper to take the service by non-const reference
2016-10-04 20:08:57 +03:00
Glauber Costa
f89a67c75c database: allow virtual dirty memory management
Scylla currently suffers from a brick wall behavior of the request throttler.
Requests pile up until we reach the dirty memory limit, at which point we stop
serving them until we have freed enough memory to allow for more requests.

The problem is that freeing dirty memory means writing an SSTable to completion.
That can take a long time, even if we are blessed with great disks. Those long
waiting times can and will translate into timeouts. That is bad behavior.

What this patch does is introduce one form of virtual dirty memory accounting.
Instead of allowing 100 % of the dirty memory to be filled up until we stop
accepting requests, we will do that when we reach 50 % of memory. However,
instead of releasing requests only when an SSTable is fully written, we start
releasing them when some memory was written.

The practical effect of that is that once we reach 50 % occupancy in our dirty
memory region, we will bring the system from CPU speed to disk speed, and will
start accepting requests only at the rate we are able to write memory back.
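The accounting scheme described above can be sketched as a small tracker class. This is a hedged illustration of the commit message, not Scylla's actual region_group API: the class name `dirty_tracker`, its methods, and the hard-coded 50 % soft limit are all assumptions made for the example.

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative sketch of "virtual dirty" accounting (hypothetical names,
// not the real Scylla API). New writes are admitted only while virtual
// dirty memory is below a soft limit of 50% of the hard limit; memory
// written out by a flush is subtracted immediately ("virtually freed")
// instead of waiting for the whole SSTable to be sealed.
class dirty_tracker {
    size_t _hard_limit;
    size_t _real_dirty = 0;     // bytes held by memtables
    size_t _virtual_freed = 0;  // bytes already written out by a flush
public:
    explicit dirty_tracker(size_t hard_limit) : _hard_limit(hard_limit) {}
    size_t virtual_dirty() const { return _real_dirty - _virtual_freed; }
    // Admit new requests only below the soft (50%) limit.
    bool can_admit() const { return virtual_dirty() < _hard_limit / 2; }
    void add_dirty(size_t n) { _real_dirty += n; }
    // Called incrementally as the flush writer makes progress,
    // long before the SSTable is sealed.
    void account_flushed(size_t n) { _virtual_freed += n; }
    // Called when the SSTable is sealed and the memtable is dropped.
    void release(size_t n) {
        _real_dirty -= n;
        _virtual_freed -= std::min(_virtual_freed, n);
    }
};
```

With this shape, requests start flowing again as soon as some memory has been written back, rather than only when a flush fully completes.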

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
7b6e8a2526 streamed_mutation: make _buffer private
It is currently protected, but now all users go through
push_mutation_fragment(), so we can safely make it private to guarantee
that it stays that way.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
1db245b52d add accounting of memory read to partition_snapshot_reader
By default, we don't do any accounting. By specializing this class and providing
an accounter class, we can account for how much memory we are reading as we read
through the elements.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
452eb95943 move partition_snapshot_reader code to header file
This is so we can template it without worrying about declaring the
specializations in the .cc file.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
86aa0b830d LSA: allow a group to query its own region group
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
eee15578fb memtables: split scanning reader in two
The code that is common will live in its own reader, the iterator_reader.  All
friendly private access to memtable attributes and methods happens through the
iterator_reader.

After this patch, we are now left with the scanning_reader - same as always,
but now implemented on top of the iterator_reader, and a flush_reader, which
will be used by SSTable flushes only.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
16886eeb96 sstables: use special reader for writing a memtable
Right now the special reader doesn't do much, but the idea is that we will
soon replace it with a reader that specializes in flush, and is in turn able
to provide read-side on-flush functionality like virtual dirty.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Glauber Costa
28e3f2f6ee LSA: export information about object memory footprint
We allocate objects of a certain size, but we use a bit more memory to hold
them.  To get a clearer picture of how much memory an object will cost us, we
need help from the allocator. This patch exports an interface that allows users
to query into a specific allocator to get that information.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-10-04 10:39:10 -04:00
Pekka Enberg
c3bebea1ef dist/docker: Add '--listen-address' to 'docker run'
Add a '--listen-address' command line parameter to the Docker image,
which can be used to set Scylla's listen address.

Refs #1723

Message-Id: <1475485165-6772-1-git-send-email-penberg@scylladb.com>
2016-10-04 13:57:55 +03:00
Marius
876775a52c dist/docker/ubuntu: refactored $IP/listen_address
In order to allow Scylla’s docker container to handle multiple network
interfaces, the start-scylla script was refactored:

- `$IP` is now called `$SCYLLA_LISTEN_ADDRESS`, so it is less likely to
   be confused or interfere with other environment variables.
- `$SCYLLA_LISTEN_ADDRESS` now checks its value and also tries to
   resolve a hostname if no IP address was set.
- `$SCYLLA_LISTEN_DEVICE` can now be set as environment variable and
   contain any available NIC device name (e.g. `eth0`). The script
   automatically retrieves the IP address from the device.

Usage:

1. With `$SCYLLA_LISTEN_ADDRESS` as IP:
`docker run -t -i --rm --name scylla -e SCYLLA_LISTEN_ADDRESS=192.168.1.100 scylladb/scylla`

2. With `$SCYLLA_LISTEN_ADDRESS` as hostname:
`docker run -t -i --rm --name scylla -e SCYLLA_LISTEN_ADDRESS=containername.network.lan scylladb/scylla`

3. With `$SCYLLA_LISTEN_DEVICE`:
`docker run -t -i --rm --name scylla -e SCYLLA_LISTEN_DEVICE=eth0 scylladb/scylla`

Message-Id: <20161003151230.67672-1-marius@twostairs.com>
2016-10-04 13:56:55 +03:00
Raphael S. Carvalho
747b42299c database: remove unused code
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <95e1ed590c9e45d15f19a84824a4dce05aefdab8.1475528611.git.raphaelsc@scylladb.com>
2016-10-04 09:26:43 +03:00
Paweł Dziepak
7599ef6fde query_pager: fix splitting range at the end bound
Currently, the code responsible for calculating ranges for the next
request could produce a wrap-around partition range. For example, if the
original range was (unimportant, A] and the last partition key was A, then
the output range would be (A, A].

This patch adds checks to make sure that in such cases the range is
removed.
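The check described above can be sketched as follows. This is an illustrative model only: the `range` struct, integer keys, and the `trim_after` helper are assumptions for the example, not the actual query_pager code.

```cpp
#include <optional>

// Illustrative sketch (hypothetical types, not the real query_pager code).
// After a page, the remaining range starts exclusively after the last
// partition key seen. If the original range was (start, A] and the last
// key was A, the naive result (A, A] would wrap around; it is really an
// empty range and must be dropped.
struct range {
    int start; bool start_inclusive;
    int end;   bool end_inclusive;
};

std::optional<range> trim_after(const range& r, int last_key) {
    range out{last_key, /*start_inclusive=*/false, r.end, r.end_inclusive};
    // (A, A] is empty, not "everything except A": drop it.
    if (out.start == out.end && !out.start_inclusive) {
        return std::nullopt;
    }
    return out;
}
```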

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1475497244-2790-1-git-send-email-pdziepak@scylladb.com>
2016-10-03 19:33:42 +02:00
Avi Kivity
8747054d10 exceptions: mark function called before construction static
cassandra_exception::prepare_message() is called from derived classes'
constructors before the base cassandra_exception object is constructed.
This is technically illegal but harmless.  Fix by marking the function
static.
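The pattern being fixed can be shown with a minimal example (simplified, hypothetical names; the real class is cassandra_exception). A derived constructor evaluates an argument for the base constructor before the base subobject exists, so calling a non-static member there is technically illegal; a static member needs no object and is safe.

```cpp
#include <string>

struct base_exception {
    std::string msg;
    explicit base_exception(std::string m) : msg(std::move(m)) {}
    // static: callable before any base_exception object exists.
    static std::string prepare_message(const char* what) {
        return std::string("error: ") + what;
    }
};

struct derived_exception : base_exception {
    // prepare_message() runs while building the argument for the base
    // constructor, i.e. before the base subobject is constructed. As a
    // static member it does not touch the not-yet-constructed object.
    derived_exception() : base_exception(prepare_message("derived")) {}
};
```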

Found by clang.
2016-10-03 16:29:02 +03:00
Calle Wilund
5b815b81b4 auth::password_authenticator: Ensure exceptions are processed in continuation
Fixes #1718 (even more)
Message-Id: <1475497389-27016-1-git-send-email-calle@scylladb.com>
2016-10-03 14:49:59 +02:00
Pekka Enberg
f3cd21c8f1 Merge seastar upstream
* seastar 0e60722...18f7bb8 (1):
  > core/memory: Fix compilation errors
2016-10-03 12:54:38 +03:00
Calle Wilund
d24d0f8f90 auth::password_authenticator: "authenticate" should not throw undeclared excpt
Fixes #1718

Message-Id: <1475487331-25927-1-git-send-email-calle@scylladb.com>
2016-10-03 12:53:30 +03:00
Avi Kivity
a51804eca8 Merge "token_restriction: Deal with minimum tokens" from Duarte
"This patch set ensures we can correctly handle queries
where the minimum token is specified."

* 'min-token/v3' of github.com:duarten/scylla:
  cql_query_test: Add test case for min/max token bounds
  token_restriction: Deal with minimum tokens
  partitioner: Parse token from bytes
2016-10-02 12:32:40 +03:00
Avi Kivity
5071f4c0bf Merge seastar upstream
* seastar 9e1d5db...0e60722 (9):
  > core/memory: Replace assert with bad_alloc in allocate_large()
  > chunked_fifo: avoid direct use of sized operator delete
  > memory: fix build without heap profiler
  > xen: initialize port::_sem
  > Merge "Make input streams skippable" from Paweł
  > semaphore: require explict setting for start value
  > prometheus: remove invalid chars from meric names
  > core/memory: Introduce heap profiler
  > util/backtrace: Mark noexcept if func() doesn't throw
2016-10-02 11:43:22 +03:00
Vlad Zolotarov
7e180c7bd3 tracing: introduce the tracing::global_trace_state_ptr class
This object, similarly to a global_schema_ptr, allows one to dynamically
create trace_state_ptr objects on different shards in the context
of the original tracing session.

This object would create a secondary tracing session object from the
original trace_state_ptr object when a trace_state_ptr object is needed
on a "remote" shard, similarly to what we do when we need it on a remote
Node.

Fixes #1678
Fixes #1647

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1474387767-21910-1-git-send-email-vladz@cloudius-systems.com>
2016-10-02 11:31:37 +03:00
Amnon Heiman
a83bd900be scylla_setup: Check and report the scylla version
This patch adds a call to the scylla-housekeeping check version during
setup, so a warning will be printed if a newer version is available.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-10-02 11:11:07 +03:00
Amnon Heiman
5e3ab32365 scylla-housekeeping: check version during setup
These changes are for running scylla-housekeeping during setup.

It contains the following changes:
1. Get the current version from the command line (as scylla does not
run at this stage).
2. Support a mode parameter in the command line to indicate that we are
running during the installation.
3. Accept an external uuid that will be used in all interactions
with the check_version server.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-10-02 11:11:07 +03:00
Takuya ASADA
15b156c9d4 dist/common/scripts/scylla_io_setup: describe how to set developer mode when validation tests failed
Describe how to set developer mode, so as not to confuse users.
Fixes #1701

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475167584-18092-1-git-send-email-syuu@scylladb.com>
2016-10-02 10:58:38 +03:00
Avi Kivity
58ddfea18f Merge "Fixes for leveled compaction strategy" from Raphael
* 'lcs_fixes' of github.com:raphaelsc/scylla:
  lcs: fix starvation at higher levels
  lcs: fix broken token range distribution at higher levels
2016-10-01 21:34:21 +03:00
Takuya ASADA
9639cc840e dist/redhat: add missing build time dependency for libunwind
There was a missing dependency on libunwind, so add it.
Fixes #1722

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475260099-25881-1-git-send-email-syuu@scylladb.com>
2016-09-30 21:33:39 +03:00
Takuya ASADA
c89d9599b1 dist/ubuntu: add missing build time dependency for libunwind
There was a missing dependency on libunwind, so add it.
Fixes #1721

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1475255706-26434-1-git-send-email-syuu@scylladb.com>
2016-09-30 21:33:21 +03:00
Raphael S. Carvalho
a8ab4b8f37 lcs: fix starvation at higher levels
When the max sstable size is increased, higher levels suffer from
starvation, because we decide to compact a given level only if the following
calculation results in a number greater than 1.001:
level_size(L) / max_size_for_level_l(L)
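The score check can be sketched as below. This is an illustrative model of the description above, not the actual compaction code: the 10x level growth factor and the top-down scan (so that higher levels are not perpetually shadowed by lower ones) are assumptions for the example.

```cpp
#include <cstdint>
#include <vector>

// Illustrative sketch of the level-score check (hypothetical helpers).
// A level is a compaction candidate when
//   level_size(L) / max_size_for_level(L) > 1.001.
// With a larger max sstable size, higher levels rarely cross that ratio,
// so they must be considered from the top down or they starve.
static uint64_t max_size_for_level(int level, uint64_t max_sstable_size) {
    uint64_t size = max_sstable_size;
    for (int i = 1; i < level; ++i) {
        size *= 10;  // assume each level is 10x larger than the previous
    }
    return size;
}

static int choose_level(const std::vector<uint64_t>& level_sizes,
                        uint64_t max_sstable_size) {
    for (int level = int(level_sizes.size()) - 1; level >= 1; --level) {
        double score = double(level_sizes[level]) /
                       double(max_size_for_level(level, max_sstable_size));
        if (score > 1.001) {
            return level;
        }
    }
    return -1;  // no level currently needs compaction
}
```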

Fixes #1720.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-09-30 14:09:49 -03:00
Raphael S. Carvalho
a3bf7558f2 lcs: fix broken token range distribution at higher levels
Uniform token range distribution across sstables in levels > 1 was broken,
because we were only choosing the sstable with the lowest first key when
compacting a level > 0. This resulted in a performance problem, because
L1->L2 compactions may build up a huge overlap over time, for example.
The last compacted key will now be stored for each level to ensure a sort of
"round robin" selection of sstables for compactions at levels >= 1.
That's also done by C*, and they were once affected by it as described in
https://issues.apache.org/jira/browse/CASSANDRA-6284.

Fixes #1719.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2016-09-30 14:09:16 -03:00
Paweł Dziepak
eb1fcf3ecc query_pagers: fix clustering key range calculation
Paging code assumes that the clustering row range [a, a] contains only one
row, which may not be true. Another problem is that it tries to use the
range<> interface for dealing with clustering key ranges, which doesn't
work because of the lack of a correct comparator.

Refs #1446.
Fixes #1684.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1475236805-16223-1-git-send-email-pdziepak@scylladb.com>
2016-09-30 17:32:59 +02:00
Tomasz Grabiec
7e25b958ac transport: Extend request memory footprint accounting to also cover execution
CQL server is supposed to throttle requests so that they don't
overflow memory. The problem is that it currently accounts for a
request's memory only around the reading of its frame from the connection,
and not during actual request execution. As a result, too many requests may
be allowed to execute and we may run out of memory.
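The extended accounting can be sketched with an RAII guard whose lifetime spans both frame parsing and execution. This is a hedged illustration of the idea, not the actual transport code: `memory_limiter` and `units` are hypothetical names, and a real server would make callers wait instead of merely checking the budget.

```cpp
#include <cstddef>

// Illustrative sketch: units are taken when a request's frame is read
// and, via RAII, are only returned when execution finishes, so requests
// in flight between parsing and execution still count against the budget.
class memory_limiter {
    size_t _available;
public:
    explicit memory_limiter(size_t budget) : _available(budget) {}
    size_t available() const { return _available; }
    bool try_reserve(size_t n) const { return n <= _available; }

    class units {
        memory_limiter* _ml;
        size_t _n;
    public:
        units(memory_limiter& ml, size_t n) : _ml(&ml), _n(n) {
            ml._available -= n;  // reserve for the request's lifetime
        }
        units(const units&) = delete;
        units& operator=(const units&) = delete;
        ~units() { _ml->_available += _n; }  // released after execution
    };
};
```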

Fixes #1708.
Message-Id: <1475149302-11517-1-git-send-email-tgrabiec@scylladb.com>
2016-09-30 14:23:14 +01:00
Duarte Nunes
72af476397 cql_query_test: Add test case for min/max token bounds
This patch adds a test case for specifying the minimum and maximum
tokens in a cql3 query.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-09-30 11:45:45 +00:00
Duarte Nunes
98b4814894 token_restriction: Deal with minimum tokens
This patch fixes a bug where queries such as the following are not
handled properly:

"SELECT * FROM ks.cf WHERE token(id) >
9207857967443869328 AND token(id) <= -9223372036854775808"

Here -9223372036854775808 represents the minimum token, which we were
just translating into a token with kind::key, thus returning incorrect
results.
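The fix's key idea can be sketched as a parsing step that special-cases the smallest 64-bit value. This is an illustrative model of the description above (the `token` struct and `token_from_value` are hypothetical simplifications of the i_partitioner interface):

```cpp
#include <cstdint>
#include <limits>

// Illustrative sketch: the smallest int64 value must map to the special
// "minimum token", which sorts before every key token, so a restriction
// like "token(id) <= minimum" matches nothing instead of being treated
// as an ordinary key token and returning incorrect results.
struct token {
    enum class kind { before_all_keys, key } k;
    int64_t value;
};

token token_from_value(int64_t v) {
    if (v == std::numeric_limits<int64_t>::min()) {
        return {token::kind::before_all_keys, 0};
    }
    return {token::kind::key, v};
}
```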

Ref #1139
Ref #693
Fixes #1717

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-09-30 11:17:08 +00:00
Duarte Nunes
862f51cddf partitioner: Parse token from bytes
This patch adds the from_bytes() function to the i_partitioner class,
whose purpose is to parse a particular token and explicitly handle the
case when the minimum token is specified.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2016-09-30 11:17:02 +00:00
Duarte Nunes
0c8f280af7 partition_key_view: Implement operator<<
The operator is declared, but it isn't implemented. This patch fixes
that.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1475225647-3800-1-git-send-email-duarte@scylladb.com>
2016-09-30 10:54:54 +02:00
Duarte Nunes
a36888f3cb storage_service: Convert token through partitioner
This patch ensures we use the partitioner to convert a token to
sstring instead of casting.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1475179683-28552-1-git-send-email-duarte@scylladb.com>
2016-09-30 10:54:26 +02:00
Glauber Costa
f5fd6bd714 LSA: export information about size of the throttle queue
Also add information about how long the oldest request has been sitting in the
queue. This is part of the backpressure work to allow us to throttle incoming
requests if we won't have memory to process them. Shortages can happen in all
sorts of places, and it is useful when designing and testing the solutions to
know where they are, and how bad they are.

This counter is named for consistency after similar counters from transport/.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-09-27 12:09:08 -04:00
Glauber Costa
aa6a96d09b database: export virtual dirty bytes region group
Currently, we export the region group where memtables are placed as dirty bytes.
Upcoming patches will optimistically mark some bytes in this region as free, a
scheme we know as "virtual dirty".

We are still interested in knowing the real state of the dirty region, so we
will keep track of the bytes virtually freed and split the counters in two.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-09-27 12:09:08 -04:00
392 changed files with 12618 additions and 6450 deletions

2
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

11
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,11 @@
# Asking questions or requesting help
Use the [ScyllaDB user mailing list](https://groups.google.com/forum/#!forum/scylladb-users) for general questions and help.
# Reporting an issue
Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to report issues. Fill in as much information as you can in the issue template, especially for performance problems.
# Contributing Code to Scylla
To contribute code to Scylla, you need to sign the [Contributor License Agreement](http://www.scylladb.com/opensource/cla/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.

View File

@@ -83,14 +83,6 @@ Run the image with:
docker run -p $(hostname -i):9042:9042 -i -t <image name>
```
## Contributing to Scylla
Do not send pull requests.
Send patches to the mailing list address scylladb-dev@googlegroups.com.
Be sure to subscribe.
In order for your patches to be merged, you must sign the Contributor's
License Agreement, protecting your rights and ours. See
http://www.scylladb.com/opensource/cla/.
[Guidelines for contributing](CONTRIBUTING.md)

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.6.6
if test -f version
then

View File

@@ -397,6 +397,36 @@
}
]
},
{
"path": "/cache_service/metrics/key/hits_moving_avrage",
"operations": [
{
"method": "GET",
"summary": "Get key hits moving avrage",
"type": "#/utils/rate_moving_average",
"nickname": "get_key_hits_moving_avrage",
"produces": [
"application/json"
],
"parameters": []
}
]
},
{
"path": "/cache_service/metrics/key/requests_moving_avrage",
"operations": [
{
"method": "GET",
"summary": "Get key requests moving avrage",
"type": "#/utils/rate_moving_average",
"nickname": "get_key_requests_moving_avrage",
"produces": [
"application/json"
],
"parameters": []
}
]
},
{
"path": "/cache_service/metrics/key/size",
"operations": [
@@ -607,6 +637,36 @@
}
]
},
{
"path": "/cache_service/metrics/counter/hits_moving_avrage",
"operations": [
{
"method": "GET",
"summary": "Get counter hits moving avrage",
"type": "#/utils/rate_moving_average",
"nickname": "get_counter_hits_moving_avrage",
"produces": [
"application/json"
],
"parameters": []
}
]
},
{
"path": "/cache_service/metrics/counter/requests_moving_avrage",
"operations": [
{
"method": "GET",
"summary": "Get counter requests moving avrage",
"type": "#/utils/rate_moving_average",
"nickname": "get_counter_requests_moving_avrage",
"produces": [
"application/json"
],
"parameters": []
}
]
},
{
"path": "/cache_service/metrics/counter/size",
"operations": [

View File

@@ -78,11 +78,19 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
},
{
"name":"split_output",
"description":"true if the output of the major compaction should be split in several sstables",
"required":false,
"allowMultiple":false,
"type":"bool",
"paramType":"query"
}
]
}
@@ -102,7 +110,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -129,7 +137,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -153,7 +161,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -180,7 +188,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -204,7 +212,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -244,7 +252,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -271,7 +279,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -298,7 +306,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -317,7 +325,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -349,7 +357,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -381,7 +389,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -405,7 +413,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -432,7 +440,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -459,7 +467,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -491,7 +499,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -518,7 +526,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -545,7 +553,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -569,7 +577,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -593,7 +601,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -633,7 +641,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -673,7 +681,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -713,7 +721,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -753,7 +761,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -793,7 +801,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -833,7 +841,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -873,7 +881,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -916,7 +924,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -943,7 +951,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -970,7 +978,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -994,7 +1002,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1034,7 +1042,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1058,7 +1066,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1101,7 +1109,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1144,7 +1152,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1203,7 +1211,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1243,7 +1251,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1267,7 +1275,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1310,7 +1318,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1353,7 +1361,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1412,7 +1420,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1452,7 +1460,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1492,7 +1500,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1532,7 +1540,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1572,7 +1580,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1612,7 +1620,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1652,7 +1660,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1692,7 +1700,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1732,7 +1740,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1772,7 +1780,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1812,7 +1820,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1852,7 +1860,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1892,7 +1900,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1932,7 +1940,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -1972,7 +1980,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2012,7 +2020,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2052,7 +2060,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2092,7 +2100,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2116,7 +2124,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2156,7 +2164,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2196,7 +2204,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2236,7 +2244,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2276,7 +2284,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2300,7 +2308,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2324,7 +2332,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2351,7 +2359,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2378,7 +2386,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2405,7 +2413,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2432,7 +2440,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2501,7 +2509,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2525,7 +2533,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2549,7 +2557,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2573,7 +2581,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2597,7 +2605,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2621,7 +2629,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2645,7 +2653,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2669,7 +2677,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2693,7 +2701,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2717,7 +2725,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2741,7 +2749,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
@@ -2765,7 +2773,7 @@
"parameters":[
{
"name":"name",
"description":"The column family name in keysspace:name format",
"description":"The column family name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",


@@ -21,8 +21,8 @@
"parameters":[
{
"name":"host",
"description":"The host name",
"required":true,
"description":"The host name. If absent, the local server broadcast/listen address is used",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
@@ -45,8 +45,8 @@
"parameters":[
{
"name":"host",
"description":"The host name",
"required":true,
"description":"The host name. If absent, the local server broadcast/listen address is used",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"


@@ -42,6 +42,25 @@
}
]
},
{
"path":"/failure_detector/endpoint_phi_values",
"operations":[
{
"method":"GET",
"summary":"Get end point phi values",
"type":"array",
"items":{
"type":"endpoint_phi_values"
},
"nickname":"get_endpoint_phi_values",
"produces":[
"application/json"
],
"parameters":[
]
}
]
},
{
"path":"/failure_detector/endpoints/",
"operations":[
@@ -202,6 +221,20 @@
"description": "The application state version"
}
}
},
"endpoint_phi_value": {
"id" : "endpoint_phi_value",
"description": "Holds phi value for a single end point",
"properties": {
"phi": {
"type": "double",
"description": "Phi value"
},
"endpoint": {
"type": "string",
"description": "end point address"
}
}
}
}
}


@@ -777,7 +777,7 @@
]
},
{
"path": "/storage_proxy/metrics/read/moving_avrage_histogram",
"path": "/storage_proxy/metrics/read/moving_average_histogram",
"operations": [
{
"method": "GET",
@@ -792,7 +792,7 @@
]
},
{
"path": "/storage_proxy/metrics/range/moving_avrage_histogram",
"path": "/storage_proxy/metrics/range/moving_average_histogram",
"operations": [
{
"method": "GET",
@@ -942,7 +942,7 @@
]
},
{
"path": "/storage_proxy/metrics/write/moving_avrage_histogram",
"path": "/storage_proxy/metrics/write/moving_average_histogram",
"operations": [
{
"method": "GET",


@@ -1201,11 +1201,12 @@
],
"parameters":[
{
"name":"non_system",
"description":"When set to true limit to non system",
"name":"type",
"description":"Which keyspaces to return",
"required":false,
"allowMultiple":false,
"type":"boolean",
"type":"string",
"enum": [ "all", "user", "non_local_strategy" ],
"paramType":"query"
}
]


@@ -166,33 +166,36 @@ inline int64_t max_int64(int64_t a, int64_t b) {
* It combines the total and the subset for the ratio; its
* to_json method returns the ratio sub/total
*/
struct ratio_holder : public json::jsonable {
double total = 0;
double sub = 0;
template<typename T>
struct basic_ratio_holder : public json::jsonable {
T total = 0;
T sub = 0;
virtual std::string to_json() const {
if (total == 0) {
return "0";
}
return std::to_string(sub/total);
}
ratio_holder() = default;
ratio_holder& add(double _total, double _sub) {
basic_ratio_holder() = default;
basic_ratio_holder& add(T _total, T _sub) {
total += _total;
sub += _sub;
return *this;
}
ratio_holder(double _total, double _sub) {
basic_ratio_holder(T _total, T _sub) {
total = _total;
sub = _sub;
}
ratio_holder& operator+=(const ratio_holder& a) {
basic_ratio_holder<T>& operator+=(const basic_ratio_holder<T>& a) {
return add(a.total, a.sub);
}
friend ratio_holder operator+(ratio_holder a, const ratio_holder& b) {
friend basic_ratio_holder<T> operator+(basic_ratio_holder a, const basic_ratio_holder<T>& b) {
return a += b;
}
};
typedef basic_ratio_holder<double> ratio_holder;
typedef basic_ratio_holder<int64_t> integral_ratio_holder;
class unimplemented_exception : public base_exception {
public:


@@ -177,6 +177,20 @@ void set_cache_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(0);
});
cs::get_key_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
// TBD
// FIXME
// See above
return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
});
cs::get_key_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
// TBD
// FIXME
// See above
return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
});
cs::get_key_size.set(r, [] (std::unique_ptr<request> req) {
// TBD
// FIXME
@@ -194,7 +208,7 @@ void set_cache_service(http_context& ctx, routes& r) {
});
cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
}, std::plus<uint64_t>());
});
@@ -280,6 +294,20 @@ void set_cache_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(0);
});
cs::get_counter_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
// TBD
// FIXME
// See above
return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
});
cs::get_counter_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
// TBD
// FIXME
// See above
return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
});
cs::get_counter_size.set(r, [] (std::unique_ptr<request> req) {
// TBD
// FIXME


@@ -191,8 +191,8 @@ static double update_ratio(double acc, double f, double total) {
return acc;
}
static ratio_holder mean_row_size(column_family& cf) {
ratio_holder res;
static integral_ratio_holder mean_row_size(column_family& cf) {
integral_ratio_holder res;
for (auto i: *cf.get_sstables() ) {
auto c = i->get_stats_metadata().estimated_row_size.count();
res.sub += i->get_stats_metadata().estimated_row_size.mean() * c;
@@ -562,11 +562,13 @@ void set_column_family(http_context& ctx, routes& r) {
});
cf::get_mean_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], ratio_holder(), mean_row_size, std::plus<ratio_holder>());
// Cassandra 3.x mean values are truncated as integrals.
return map_reduce_cf(ctx, req->param["name"], integral_ratio_holder(), mean_row_size, std::plus<integral_ratio_holder>());
});
cf::get_all_mean_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, ratio_holder(), mean_row_size, std::plus<ratio_holder>());
// Cassandra 3.x mean values are truncated as integrals.
return map_reduce_cf(ctx, integral_ratio_holder(), mean_row_size, std::plus<integral_ratio_holder>());
});
cf::get_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {


@@ -22,16 +22,22 @@
#include "locator/snitch_base.hh"
#include "endpoint_snitch.hh"
#include "api/api-doc/endpoint_snitch_info.json.hh"
#include "utils/fb_utilities.hh"
namespace api {
void set_endpoint_snitch(http_context& ctx, routes& r) {
httpd::endpoint_snitch_info_json::get_datacenter.set(r, [] (const_req req) {
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(req.get_query_param("host"));
static auto host_or_broadcast = [](const_req req) {
auto host = req.get_query_param("host");
return host.empty() ? gms::inet_address(utils::fb_utilities::get_broadcast_address()) : gms::inet_address(host);
};
httpd::endpoint_snitch_info_json::get_datacenter.set(r, [](const_req req) {
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(host_or_broadcast(req));
});
httpd::endpoint_snitch_info_json::get_rack.set(r, [] (const_req req) {
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_rack(req.get_query_param("host"));
httpd::endpoint_snitch_info_json::get_rack.set(r, [](const_req req) {
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_rack(host_or_broadcast(req));
});
httpd::endpoint_snitch_info_json::get_snitch_name.set(r, [] (const_req req) {


@@ -88,6 +88,20 @@ void set_failure_detector(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(state);
});
});
fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
return gms::get_arrival_samples().then([](std::map<gms::inet_address, gms::arrival_window> map) {
std::vector<fd::endpoint_phi_value> res;
auto now = gms::arrival_window::clk::now();
for (auto& p : map) {
fd::endpoint_phi_value val;
val.endpoint = p.first.to_sstring();
val.phi = p.second.phi(now);
res.emplace_back(std::move(val));
}
return make_ready_future<json::json_return_type>(res);
});
});
}
}


@@ -22,6 +22,8 @@
#include "storage_service.hh"
#include "api/api-doc/storage_service.json.hh"
#include "db/config.hh"
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/filtered.hpp>
#include <service/storage_service.hh>
#include <db/commitlog/commitlog.hh>
#include <gms/gossiper.hh>
@@ -457,8 +459,15 @@ void set_storage_service(http_context& ctx, routes& r) {
});
ss::get_keyspaces.set(r, [&ctx](const_req req) {
auto non_system = req.get_query_param("non_system");
return map_keys(ctx.db.local().keyspaces());
auto type = req.get_query_param("type");
if (type == "user") {
return ctx.db.local().get_non_system_keyspaces();
} else if (type == "non_local_strategy") {
return map_keys(ctx.db.local().get_keyspaces() | boost::adaptors::filtered([](const auto& p) {
return p.second.get_replication_strategy().get_type() != locator::replication_strategy_type::local;
}));
}
return map_keys(ctx.db.local().get_keyspaces());
});
ss::update_snitch.set(r, [](std::unique_ptr<request> req) {
@@ -684,8 +693,8 @@ void set_storage_service(http_context& ctx, routes& r) {
ss::get_slow_query_info.set(r, [](const_req req) {
ss::slow_query_info res;
res.enable = tracing::tracing::get_local_tracing_instance().slow_query_tracing_enabled();
res.ttl = std::chrono::duration_cast<std::chrono::microseconds>(tracing::tracing::get_local_tracing_instance().slow_query_record_ttl()).count() ;
res.threshold = std::chrono::duration_cast<std::chrono::microseconds>(tracing::tracing::get_local_tracing_instance().slow_query_threshold()).count();
res.ttl = tracing::tracing::get_local_tracing_instance().slow_query_record_ttl().count() ;
res.threshold = tracing::tracing::get_local_tracing_instance().slow_query_threshold().count();
return res;
});


@@ -63,8 +63,8 @@ public:
::feed_hash(as_collection_mutation(), h, def.type);
}
}
size_t memory_usage() const {
return _data.memory_usage();
size_t external_memory_usage() const {
return _data.external_memory_usage();
}
friend std::ostream& operator<<(std::ostream&, const atomic_cell_or_collection&);
};


@@ -243,7 +243,8 @@ future<> auth::auth::setup() {
std::map<sstring, sstring> opts;
opts["replication_factor"] = "1";
auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
// We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
}
return f.then([] {
@@ -353,7 +354,7 @@ future<> auth::auth::setup_table(const sstring& name, const sstring& cql) {
parsed->prepare_keyspace(AUTH_KS);
::shared_ptr<cql3::statements::create_table_statement> statement =
static_pointer_cast<cql3::statements::create_table_statement>(
parsed->prepare(db)->statement);
parsed->prepare(db, qp.get_cql_stats())->statement);
auto schema = statement->get_cf_meta_data();
auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());


@@ -47,11 +47,8 @@
const sstring auth::data_resource::ROOT_NAME("data");
auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
: _ks(ks), _cf(cf)
: _level(l), _ks(ks), _cf(cf)
{
if (l != get_level()) {
throw std::invalid_argument("level/keyspace/column mismatch");
}
}
auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
{}
auth::data_resource::level auth::data_resource::get_level() const {
if (!_cf.empty()) {
assert(!_ks.empty());
return level::COLUMN_FAMILY;
}
if (!_ks.empty()) {
return level::KEYSPACE;
}
return level::ROOT;
return _level;
}
auth::data_resource auth::data_resource::from_name(


@@ -56,6 +56,7 @@ private:
static const sstring ROOT_NAME;
level _level;
sstring _ks;
sstring _cf;


@@ -218,12 +218,12 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
// obsolete prepared statements pretty quickly.
// Rely on query processing caching statements instead, and let's assume
// that a map lookup string->statement is not gonna kill us much.
auto& qp = cql3::get_local_query_processor();
return qp.process(
sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
auth::AUTH_KS, CREDENTIALS_CF, USER_NAME),
consistency_for_user(username), { username }, true).then_wrapped(
[=](future<::shared_ptr<cql3::untyped_result_set>> f) {
return futurize_apply([this, username, password] {
auto& qp = cql3::get_local_query_processor();
return qp.process(sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
auth::AUTH_KS, CREDENTIALS_CF, USER_NAME),
consistency_for_user(username), {username}, true);
}).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
try {
auto res = f.get0();
if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
@@ -234,6 +234,8 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
std::throw_with_nested(exceptions::authentication_exception("Could not verify password"));
} catch (exceptions::request_execution_exception& e) {
std::throw_with_nested(exceptions::authentication_exception(e.what()));
} catch (...) {
std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
}
});
}


@@ -40,6 +40,7 @@
*/
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include "permission.hh"
const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
}
auth::permission auth::permissions::from_string(const sstring& s) {
return permission_names.at(s);
sstring upper(s);
boost::to_upper(upper);
return permission_names.at(upper);
}
std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {


@@ -38,7 +38,7 @@ class bytes_ostream {
public:
using size_type = bytes::size_type;
using value_type = bytes::value_type;
static constexpr size_type max_chunk_size = 16 * 1024;
static constexpr size_type max_chunk_size() { return 16 * 1024; }
private:
static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
struct chunk {
@@ -59,7 +59,6 @@ private:
};
// FIXME: consider increasing chunk size as the buffer grows
static constexpr size_type chunk_size{512};
static constexpr size_type usable_chunk_size{chunk_size - sizeof(chunk)};
private:
std::unique_ptr<chunk> _begin;
chunk* _current;
@@ -100,6 +99,19 @@ private:
}
return _current->size - _current->offset;
}
// Figure out next chunk size.
// - must be enough for data_size
// - must be at least chunk_size
// - try to double each time to prevent too many allocations
// - do not exceed max_chunk_size
size_type next_alloc_size(size_t data_size) const {
auto next_size = _current
? _current->size * 2
: chunk_size;
next_size = std::min(next_size, max_chunk_size());
// FIXME: check for overflow?
return std::max<size_type>(next_size, data_size + sizeof(chunk));
}
// Makes room for a contiguous region of given size.
// The region is accounted for as already written.
// size must not be zero.
@@ -110,7 +122,7 @@ private:
_size += size;
return ret;
} else {
auto alloc_size = size <= usable_chunk_size ? chunk_size : (size + sizeof(chunk));
auto alloc_size = next_alloc_size(size);
auto space = malloc(alloc_size);
if (!space) {
throw std::bad_alloc();
@@ -205,7 +217,7 @@ public:
}
while (!v.empty()) {
auto this_size = std::min(v.size(), size_t(max_chunk_size));
auto this_size = std::min(v.size(), size_t(max_chunk_size()));
std::copy_n(v.begin(), this_size, alloc(this_size));
v.remove_prefix(this_size);
}
@@ -329,7 +341,7 @@ public:
// if its size is below max_chunk_size. We probably could also gain
// some read performance by doing "real" reduction, i.e. merging
// all chunks until all but the last one is max_chunk_size.
if (size() < max_chunk_size) {
if (size() < max_chunk_size()) {
linearize();
}
}


@@ -44,7 +44,7 @@ canonical_mutation::canonical_mutation(const mutation& m)
mutation_partition_serializer part_ser(*m.schema(), m.partition());
bytes_ostream out;
ser::writer_of_canonical_mutation wr(out);
ser::writer_of_canonical_mutation<bytes_ostream> wr(out);
std::move(wr).write_table_id(m.schema()->id())
.write_schema_version(m.schema()->version())
.write_key(m.key())


@@ -27,125 +27,125 @@
class checked_file_impl : public file_impl {
public:
checked_file_impl(disk_error_signal_type& s, file f)
: _signal(s) , _file(f) {
checked_file_impl(const io_error_handler& error_handler, file f)
: _error_handler(error_handler), _file(f) {
_memory_dma_alignment = f.memory_dma_alignment();
_disk_read_dma_alignment = f.disk_read_dma_alignment();
_disk_write_dma_alignment = f.disk_write_dma_alignment();
}
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->write_dma(pos, buffer, len, pc);
});
}
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->write_dma(pos, iov, pc);
});
}
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->read_dma(pos, buffer, len, pc);
});
}
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->read_dma(pos, iov, pc);
});
}
virtual future<> flush(void) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->flush();
});
}
virtual future<struct stat> stat(void) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->stat();
});
}
virtual future<> truncate(uint64_t length) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->truncate(length);
});
}
virtual future<> discard(uint64_t offset, uint64_t length) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->discard(offset, length);
});
}
virtual future<> allocate(uint64_t position, uint64_t length) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->allocate(position, length);
});
}
virtual future<uint64_t> size(void) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->size();
});
}
virtual future<> close() override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->close();
});
}
virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
return do_io_check(_signal, [&] {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->list_directory(next);
});
}
private:
disk_error_signal_type &_signal;
const io_error_handler& _error_handler;
file _file;
};
inline file make_checked_file(disk_error_signal_type& signal, file& f)
inline file make_checked_file(const io_error_handler& error_handler, file& f)
{
return file(::make_shared<checked_file_impl>(signal, f));
return file(::make_shared<checked_file_impl>(error_handler, f));
}
future<file>
inline open_checked_file_dma(disk_error_signal_type& signal,
inline open_checked_file_dma(const io_error_handler& error_handler,
sstring name, open_flags flags,
file_open_options options)
{
return do_io_check(signal, [&] {
return do_io_check(error_handler, [&] {
return open_file_dma(name, flags, options).then([&] (file f) {
return make_ready_future<file>(make_checked_file(signal, f));
return make_ready_future<file>(make_checked_file(error_handler, f));
});
});
}
future<file>
inline open_checked_file_dma(disk_error_signal_type& signal,
inline open_checked_file_dma(const io_error_handler& error_handler,
sstring name, open_flags flags)
{
return do_io_check(signal, [&] {
return do_io_check(error_handler, [&] {
return open_file_dma(name, flags).then([&] (file f) {
return make_ready_future<file>(make_checked_file(signal, f));
return make_ready_future<file>(make_checked_file(error_handler, f));
});
});
}
future<file>
inline open_checked_directory(disk_error_signal_type& signal,
inline open_checked_directory(const io_error_handler& error_handler,
sstring name)
{
return do_io_check(signal, [&] {
return do_io_check(error_handler, [&] {
return engine().open_directory(name).then([&] (file f) {
return make_ready_future<file>(make_checked_file(signal, f));
return make_ready_future<file>(make_checked_file(error_handler, f));
});
});
}


@@ -54,6 +54,10 @@ public:
// Return a list of sstables to be compacted after applying the strategy.
compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<lw_shared_ptr<sstable>> candidates);
// Some strategies may look at the compacted and resulting sstables to
// get some useful information for subsequent compactions.
void notify_completion(const std::vector<lw_shared_ptr<sstable>>& removed, const std::vector<lw_shared_ptr<sstable>>& added);
// Return if parallel compaction is allowed by strategy.
bool parallel_compaction() const;


@@ -39,6 +39,9 @@ public:
compatible_ring_position(const schema& s, dht::ring_position&& rp)
: _schema(&s), _rp(std::move(rp)) {
}
const dht::token& token() const {
return _rp->token();
}
friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
return x._rp->tri_compare(*x._schema, *y._rp);
}


@@ -39,17 +39,17 @@ public:
static constexpr auto CHUNK_LENGTH_KB = "chunk_length_kb";
static constexpr auto CRC_CHECK_CHANCE = "crc_check_chance";
private:
compressor _compressor = compressor::none;
compressor _compressor;
std::experimental::optional<int> _chunk_length;
std::experimental::optional<double> _crc_check_chance;
public:
compression_parameters() = default;
compression_parameters(compressor c) : _compressor(c) { }
compression_parameters(compressor c = compressor::lz4) : _compressor(c) { }
compression_parameters(const std::map<sstring, sstring>& options) {
validate_options(options);
auto it = options.find(SSTABLE_COMPRESSION);
if (it == options.end() || it->second.empty()) {
_compressor = compressor::none;
return;
}
const auto& compressor_class = it->second;


@@ -409,29 +409,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
# the smaller of 1/4 of heap or 512MB.
# file_cache_size_in_mb: 512
# Total permitted memory to use for memtables. Scylla will stop
# accepting writes when the limit is exceeded until a flush completes,
# and will trigger a flush based on memtable_cleanup_threshold
# If omitted, Scylla will set both to 1/4 the size of the heap.
# memtable_heap_space_in_mb: 2048
# memtable_offheap_space_in_mb: 2048
# Ratio of occupied non-flushing memtable size to total permitted size
# that will trigger a flush of the largest memtable. Lager mct will
# mean larger flushes and hence less compaction, but also less concurrent
# flush activity which can make it difficult to keep your disks fed
# under heavy write load.
#
# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
# memtable_cleanup_threshold: 0.11
# Specify the way Scylla allocates and manages memtable memory.
# Options are:
# heap_buffers: on heap nio buffers
# offheap_buffers: off heap (direct) nio buffers
# offheap_objects: native memory, eliminating nio buffer heap overhead
# memtable_allocation_type: heap_buffers
# Total space to use for commitlogs.
#
# If space gets above this value (it will round up to the next nearest
@@ -443,17 +420,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
# available for Scylla.
commitlog_total_space_in_mb: -1
# This sets the amount of memtable flush writer threads. These will
# be blocked by disk io, and each one will hold a memtable in memory
# while blocked.
#
# memtable_flush_writers defaults to the smaller of (number of disks,
# number of cores), with a minimum of 2 and a maximum of 8.
#
# If your data directories are backed by SSD, you should increase this
# to the number of cores.
#memtable_flush_writers: 8
# A fixed memory pool size in MB for for SSTable index summaries. If left
# empty, this will default to 5% of the heap size. If the memory usage of
# all index summaries exceeds this limit, SSTables with low read rates will


@@ -108,6 +108,11 @@ def debug_flag(compiler):
print('Note: debug information disabled; upgrade your compiler')
return ''
def maybe_static(flag, libs):
if flag and not args.static:
libs = '-Wl,-Bstatic {} -Wl,-Bdynamic'.format(libs)
return libs
class Thrift(object):
def __init__(self, source, service):
self.source = source
@@ -184,7 +189,6 @@ scylla_tests = [
'tests/storage_proxy_test',
'tests/schema_change_test',
'tests/mutation_reader_test',
'tests/key_reader_test',
'tests/mutation_query_test',
'tests/row_cache_test',
'tests/test-serialization',
@@ -222,6 +226,8 @@ scylla_tests = [
'tests/database_test',
'tests/nonwrapping_range_test',
'tests/input_stream_test',
'tests/sstable_atomic_deletion_test',
'tests/virtual_reader_test',
]
apps = [
@@ -263,7 +269,9 @@ arg_parser.add_argument('--debuginfo', action = 'store', dest = 'debuginfo', typ
arg_parser.add_argument('--static-stdc++', dest = 'staticcxx', action = 'store_true',
help = 'Link libgcc and libstdc++ statically')
arg_parser.add_argument('--static-thrift', dest = 'staticthrift', action = 'store_true',
help = 'Link libthrift statically')
arg_parser.add_argument('--static-boost', dest = 'staticboost', action = 'store_true',
help = 'Link boost statically')
arg_parser.add_argument('--tests-debuginfo', action = 'store', dest = 'tests_debuginfo', type = int, default = 0,
help = 'Enable(1)/disable(0) compiler debug information generation for tests')
arg_parser.add_argument('--python', action = 'store', dest = 'python', default = 'python3',
@@ -299,7 +307,6 @@ scylla_core = (['database.cc',
'mutation_partition_serializer.cc',
'mutation_reader.cc',
'mutation_query.cc',
'key_reader.cc',
'keys.cc',
'sstables/sstables.cc',
'sstables/compress.cc',
@@ -309,6 +316,7 @@ scylla_core = (['database.cc',
'sstables/compaction.cc',
'sstables/compaction_strategy.cc',
'sstables/compaction_manager.cc',
'sstables/atomic_deletion.cc',
'transport/event.cc',
'transport/event_notifier.cc',
'transport/server.cc',
@@ -328,6 +336,7 @@ scylla_core = (['database.cc',
'cql3/statements/authentication_statement.cc',
'cql3/statements/create_keyspace_statement.cc',
'cql3/statements/create_table_statement.cc',
'cql3/statements/create_view_statement.cc',
'cql3/statements/create_type_statement.cc',
'cql3/statements/create_user_statement.cc',
'cql3/statements/drop_keyspace_statement.cc',
@@ -485,7 +494,7 @@ scylla_core = (['database.cc',
'tracing/trace_state.cc',
'range_tombstone.cc',
'range_tombstone_list.cc',
'db/size_estimates_recorder.cc',
'disk-error-handler.cc'
]
+ [Antlr3Grammar('cql3/Cql.g')]
+ [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -564,43 +573,49 @@ deps = {
'scylla': idls + ['main.cc'] + scylla_core + api,
}
tests_not_using_seastar_test_framework = set([
'tests/keys_test',
pure_boost_tests = set([
'tests/partitioner_test',
'tests/map_difference_test',
'tests/keys_test',
'tests/compound_test',
'tests/range_tombstone_list_test',
'tests/anchorless_list_test',
'tests/nonwrapping_range_test',
'tests/test-serialization',
'tests/range_test',
'tests/crc_test',
'tests/managed_vector_test',
'tests/dynamic_bitset_test',
'tests/idl_test',
'tests/cartesian_product_test',
])
tests_not_using_seastar_test_framework = set([
'tests/perf/perf_mutation',
'tests/lsa_async_eviction_test',
'tests/lsa_sync_eviction_test',
'tests/row_cache_alloc_stress',
'tests/perf_row_cache_update',
'tests/cartesian_product_test',
'tests/perf/perf_hash',
'tests/perf/perf_cql_parser',
'tests/message',
'tests/perf/perf_simple_query',
'tests/memory_footprint',
'tests/test-serialization',
'tests/gossip',
'tests/compound_test',
'tests/range_test',
'tests/crc_test',
'tests/perf/perf_sstable',
'tests/managed_vector_test',
'tests/dynamic_bitset_test',
'tests/idl_test',
'tests/range_tombstone_list_test',
'tests/anchorless_list_test',
'tests/nonwrapping_range_test',
])
]) | pure_boost_tests
for t in tests_not_using_seastar_test_framework:
if not t in scylla_tests:
raise Exception("Test %s not found in scylla_tests" % (t))
for t in scylla_tests:
deps[t] = scylla_tests_dependencies + [t + '.cc']
deps[t] = [t + '.cc']
if t not in tests_not_using_seastar_test_framework:
deps[t] += scylla_tests_dependencies
deps[t] += scylla_tests_seastar_deps
else:
deps[t] += scylla_core + api + idls + ['tests/cql_test_env.cc']
deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc']
@@ -704,6 +719,8 @@ elif args.dpdk_target:
seastar_flags += ['--dpdk-target', args.dpdk_target]
if args.staticcxx:
seastar_flags += ['--static-stdc++']
if args.staticboost:
seastar_flags += ['--static-boost']
seastar_cflags = args.user_cflags + " -march=nehalem"
seastar_flags += ['--compiler', args.cxx, '--cflags=%s' % (seastar_cflags)]
@@ -737,7 +754,14 @@ for mode in build_modes:
seastar_deps = 'practically_anything_can_change_so_lets_run_it_every_time_and_restat.'
args.user_cflags += " " + pkg_config("--cflags", "jsoncpp")
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt' + ' -lboost_date_time'
libs = ' '.join(['-lyaml-cpp', '-llz4', '-lz', '-lsnappy', pkg_config("--libs", "jsoncpp"),
maybe_static(args.staticboost, '-lboost_filesystem'), ' -lcrypt',
maybe_static(args.staticboost, '-lboost_date_time'),
])
if not args.staticboost:
args.user_cflags += ' -DBOOST_TEST_DYN_LINK'
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config('--cflags', pkg)
libs += ' ' + pkg_config('--libs', pkg)
@@ -851,6 +875,11 @@ with open(buildfile, 'w') as f:
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
else:
if binary.startswith('tests/'):
local_libs = '$libs'
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
if has_thrift:
local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
# Our code's debugging information is huge, and multiplied
# by many tests yields ridiculous amounts of disk space.
# So we strip the tests by default; The user can very
@@ -858,15 +887,15 @@ with open(buildfile, 'w') as f:
# to the test name, e.g., "ninja build/release/testname_g"
f.write('build $builddir/{}/{}: {}.{} {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs),
'seastar/build/{}/libseastar.a'.format(mode)))
if has_thrift:
f.write(' libs = {} -lboost_system $libs\n'.format(thrift_libs))
f.write(' libs = {}\n'.format(local_libs))
f.write('build $builddir/{}/{}_g: link.{} {} {}\n'.format(mode, binary, mode, str.join(' ', objs),
'seastar/build/{}/libseastar.a'.format(mode)))
f.write(' libs = {}\n'.format(local_libs))
else:
f.write('build $builddir/{}/{}: link.{} {} {}\n'.format(mode, binary, mode, str.join(' ', objs),
'seastar/build/{}/libseastar.a'.format(mode)))
if has_thrift:
f.write(' libs = {} -lboost_system $libs\n'.format(thrift_libs))
if has_thrift:
f.write(' libs = {} {} $libs\n'.format(thrift_libs, maybe_static(args.staticboost, '-lboost_system')))
for src in srcs:
if src.endswith('.cc'):
obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')


@@ -40,6 +40,7 @@ options {
#include "cql3/statements/drop_keyspace_statement.hh"
#include "cql3/statements/create_index_statement.hh"
#include "cql3/statements/create_table_statement.hh"
#include "cql3/statements/create_view_statement.hh"
#include "cql3/statements/create_type_statement.hh"
#include "cql3/statements/drop_type_statement.hh"
#include "cql3/statements/alter_type_statement.hh"
@@ -340,6 +341,7 @@ cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
| st30=createAggregateStatement { $stmt = st30; }
| st31=dropAggregateStatement { $stmt = st31; }
#endif
| st32=createViewStatement { $stmt = st32; }
;
/*
@@ -716,7 +718,7 @@ createTableStatement returns [shared_ptr<cql3::statements::create_table_statemen
cfamDefinition[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
: '(' cfamColumns[expr] ( ',' cfamColumns[expr]? )* ')'
( K_WITH cfamProperty[expr] ( K_AND cfamProperty[expr] )*)?
( K_WITH cfamProperty[$expr->properties()] ( K_AND cfamProperty[$expr->properties()] )*)?
;
cfamColumns[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
@@ -732,15 +734,15 @@ pkDef[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
| '(' k1=ident { l.push_back(k1); } ( ',' kn=ident { l.push_back(kn); } )* ')' { $expr->add_key_aliases(l); }
;
cfamProperty[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
: property[expr->properties]
| K_COMPACT K_STORAGE { $expr->set_compact_storage(); }
cfamProperty[cql3::statements::cf_properties& expr]
: property[$expr.properties()]
| K_COMPACT K_STORAGE { $expr.set_compact_storage(); }
| K_CLUSTERING K_ORDER K_BY '(' cfamOrdering[expr] (',' cfamOrdering[expr])* ')'
;
cfamOrdering[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
cfamOrdering[cql3::statements::cf_properties& expr]
@init{ bool reversed=false; }
: k=ident (K_ASC | K_DESC { reversed=true;} ) { $expr->set_ordering(k, reversed); }
: k=ident (K_ASC | K_DESC { reversed=true;} ) { $expr.set_ordering(k, reversed); }
;
@@ -787,6 +789,39 @@ indexIdent returns [::shared_ptr<index_target::raw> id]
| K_FULL '(' c=cident ')' { $id = index_target::raw::full_collection(c); }
;
/**
* CREATE MATERIALIZED VIEW <viewName> AS
* SELECT <columns>
* FROM <CF>
* WHERE <pkColumns> IS NOT NULL
* PRIMARY KEY (<pkColumns>)
* WITH <property> = <value> AND ...;
*/
createViewStatement returns [::shared_ptr<create_view_statement> expr]
@init {
bool if_not_exists = false;
std::vector<::shared_ptr<cql3::column_identifier::raw>> partition_keys;
std::vector<::shared_ptr<cql3::column_identifier::raw>> composite_keys;
}
: K_CREATE K_MATERIALIZED K_VIEW (K_IF K_NOT K_EXISTS { if_not_exists = true; })? cf=columnFamilyName K_AS
K_SELECT sclause=selectClause K_FROM basecf=columnFamilyName
(K_WHERE wclause=whereClause)?
K_PRIMARY K_KEY (
'(' '(' k1=cident { partition_keys.push_back(k1); } ( ',' kn=cident { partition_keys.push_back(kn); } )* ')' ( ',' c1=cident { composite_keys.push_back(c1); } )* ')'
| '(' k1=cident { partition_keys.push_back(k1); } ( ',' cn=cident { composite_keys.push_back(cn); } )* ')'
)
{
$expr = ::make_shared<create_view_statement>(
std::move(cf),
std::move(basecf),
std::move(sclause),
std::move(wclause),
std::move(partition_keys),
std::move(composite_keys),
if_not_exists);
}
( K_WITH cfamProperty[{ $expr->properties() }] ( K_AND cfamProperty[{ $expr->properties() }] )*)?
;
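A concrete statement accepted by the rule above (keyspace, table, view, and column names are illustrative):

```sql
CREATE MATERIALIZED VIEW ks.users_by_email AS
    SELECT * FROM ks.users
    WHERE email IS NOT NULL AND id IS NOT NULL
    PRIMARY KEY (email, id);
```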
#if 0
/**
@@ -1304,7 +1339,8 @@ relation[std::vector<cql3::relation_ptr>& clauses]
| K_TOKEN l=tupleOfIdentifiers type=relationType t=term
{ $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), *type, std::move(t))); }
| name=cident K_IS K_NOT K_NULL {
$clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IS_NOT, cql3::constants::NULL_LITERAL)); }
| name=cident K_IN marker=inMarker
{ $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IN, std::move(marker))); }
| name=cident K_IN in_values=singleColumnInValues
@@ -1528,6 +1564,8 @@ K_KEYSPACE: ( K E Y S P A C E
K_KEYSPACES: K E Y S P A C E S;
K_COLUMNFAMILY:( C O L U M N F A M I L Y
| T A B L E );
K_MATERIALIZED:M A T E R I A L I Z E D;
K_VIEW: V I E W;
K_INDEX: I N D E X;
K_CUSTOM: C U S T O M;
K_ON: O N;
@@ -1551,6 +1589,7 @@ K_DESC: D E S C;
K_ALLOW: A L L O W;
K_FILTERING: F I L T E R I N G;
K_IF: I F;
K_IS: I S;
K_CONTAINS: C O N T A I N S;
K_GRANT: G R A N T;


@@ -52,5 +52,6 @@ const operator_type operator_type::IN(7, operator_type::IN, "IN");
const operator_type operator_type::CONTAINS(5, operator_type::CONTAINS, "CONTAINS");
const operator_type operator_type::CONTAINS_KEY(6, operator_type::CONTAINS_KEY, "CONTAINS_KEY");
const operator_type operator_type::NEQ(8, operator_type::NEQ, "!=");
const operator_type operator_type::IS_NOT(9, operator_type::IS_NOT, "IS NOT");
}


@@ -58,6 +58,7 @@ public:
static const operator_type CONTAINS;
static const operator_type CONTAINS_KEY;
static const operator_type NEQ;
static const operator_type IS_NOT;
private:
int32_t _b;
const operator_type& _reverse;


@@ -92,13 +92,33 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
: _migration_subscriber{std::make_unique<migration_subscriber>(this)}
, _proxy(proxy)
, _db(db)
, _collectd_regs{
scollectd::add_polled_metric(scollectd::type_instance_id("query_processor"
, scollectd::per_cpu_plugin_instance
, "total_operations", "statements_prepared")
, scollectd::make_typed(scollectd::data_type::DERIVE, _stats.prepare_invocations)),
scollectd::add_polled_metric(scollectd::type_instance_id("cql"
, scollectd::per_cpu_plugin_instance
, "total_operations", "reads")
, scollectd::make_typed(scollectd::data_type::DERIVE, _cql_stats.reads)),
scollectd::add_polled_metric(scollectd::type_instance_id("cql"
, scollectd::per_cpu_plugin_instance
, "total_operations", "inserts")
, scollectd::make_typed(scollectd::data_type::DERIVE, _cql_stats.inserts)),
scollectd::add_polled_metric(scollectd::type_instance_id("cql"
, scollectd::per_cpu_plugin_instance
, "total_operations", "updates")
, scollectd::make_typed(scollectd::data_type::DERIVE, _cql_stats.updates)),
scollectd::add_polled_metric(scollectd::type_instance_id("cql"
, scollectd::per_cpu_plugin_instance
, "total_operations", "deletes")
, scollectd::make_typed(scollectd::data_type::DERIVE, _cql_stats.deletes)),
scollectd::add_polled_metric(scollectd::type_instance_id("cql"
, scollectd::per_cpu_plugin_instance
, "total_operations", "batches")
, scollectd::make_typed(scollectd::data_type::DERIVE, _cql_stats.batches))}
, _internal_state(new internal_state())
{
_collectd_regs.push_back(
scollectd::add_polled_metric(scollectd::type_instance_id("query_processor"
, scollectd::per_cpu_plugin_instance
, "total_operations", "statements_prepared")
, scollectd::make_typed(scollectd::data_type::DERIVE, _stats.prepare_invocations)));
service::get_local_migration_manager().register_listener(_migration_subscriber.get());
}
@@ -285,7 +305,7 @@ query_processor::get_statement(const sstring_view& query, const service::client_
Tracing.trace("Preparing statement");
#endif
++_stats.prepare_invocations;
return statement->prepare(_db.local());
return statement->prepare(_db.local(), _cql_stats);
}
::shared_ptr<raw::parsed_statement>
@@ -346,7 +366,7 @@ query_options query_processor::make_internal_options(::shared_ptr<statements::pr
{
auto& p = _internal_statements[query_string];
if (p == nullptr) {
auto np = parse_statement(query_string)->prepare(_db.local());
auto np = parse_statement(query_string)->prepare(_db.local(), _cql_stats);
np->statement->validate(_proxy, *_internal_state);
p = std::move(np); // inserts it into map
}
@@ -382,7 +402,7 @@ query_processor::process(const sstring& query_string,
const std::initializer_list<data_value>& values,
bool cache)
{
auto p = cache ? prepare_internal(query_string) : parse_statement(query_string)->prepare(_db.local());
auto p = cache ? prepare_internal(query_string) : parse_statement(query_string)->prepare(_db.local(), _cql_stats);
if (!cache) {
p->statement->validate(_proxy, *_internal_state);
}


@@ -75,7 +75,9 @@ private:
uint64_t prepare_invocations = 0;
} _stats;
std::vector<scollectd::registration> _collectd_regs;
cql_stats _cql_stats;
scollectd::registrations _collectd_regs;
class internal_state;
std::unique_ptr<internal_state> _internal_state;
@@ -92,6 +94,11 @@ public:
distributed<service::storage_proxy>& proxy() {
return _proxy;
}
cql_stats& get_cql_stats() {
return _cql_stats;
}
#if 0
public static final QueryProcessor instance = new QueryProcessor();
#endif


@@ -156,6 +156,10 @@ public:
return new_contains_restriction(db, schema, bound_names, false);
} else if (_relation_type == operator_type::CONTAINS_KEY) {
return new_contains_restriction(db, schema, bound_names, true);
} else if (_relation_type == operator_type::IS_NOT) {
// This case is not supposed to happen: statement_restrictions
// constructor does not call this function for views' IS_NOT.
throw exceptions::invalid_request_exception(sprint("Unsupported \"IS NOT\" relation: %s", to_string()));
} else {
throw exceptions::invalid_request_exception(sprint("Unsupported \"!=\" relation: %s", to_string()));
}


@@ -28,6 +28,9 @@
#include "single_column_primary_key_restrictions.hh"
#include "token_restriction.hh"
#include "cql3/single_column_relation.hh"
#include "cql3/constants.hh"
namespace cql3 {
namespace restrictions {
@@ -131,13 +134,21 @@ statement_restrictions::statement_restrictions(schema_ptr schema)
, _clustering_columns_restrictions(get_initial_key_restrictions<clustering_key_prefix>())
, _nonprimary_key_restrictions(::make_shared<single_column_restrictions>(schema))
{ }
#if 0
static const column_definition*
to_column_definition(const schema_ptr& schema, const ::shared_ptr<column_identifier::raw>& entity) {
return get_column_definition(schema,
*entity->prepare_column_identifier(schema));
}
#endif
statement_restrictions::statement_restrictions(database& db,
schema_ptr schema,
const std::vector<::shared_ptr<relation>>& where_clause,
::shared_ptr<variable_specifications> bound_names,
bool selects_only_static_columns,
bool select_a_collection)
bool select_a_collection,
bool for_view)
: statement_restrictions(schema)
{
/*
@@ -149,7 +160,31 @@ statement_restrictions::statement_restrictions(database& db,
*/
if (!where_clause.empty()) {
for (auto&& relation : where_clause) {
add_restriction(relation->to_restriction(db, schema, bound_names));
if (relation->get_operator() == cql3::operator_type::IS_NOT) {
single_column_relation* r =
dynamic_cast<single_column_relation*>(relation.get());
// The "IS NOT NULL" restriction is only supported (and
// mandatory) for materialized view creation:
if (!r) {
throw exceptions::invalid_request_exception("IS NOT only supports single column");
}
// currently, the grammar only allows the NULL argument to be
// "IS NOT", so this assertion should not be able to fail
assert(r->get_value() == cql3::constants::NULL_LITERAL);
auto col_id = r->get_entity()->prepare_column_identifier(schema);
const auto *cd = get_column_definition(schema, *col_id);
if (!cd) {
throw exceptions::invalid_request_exception(sprint("restriction '%s' unknown column %s", relation->to_string(), r->get_entity()->to_string()));
}
_not_null_columns.insert(cd);
if (!for_view) {
throw exceptions::invalid_request_exception(sprint("restriction '%s' is only supported in materialized view creation", relation->to_string()));
}
} else {
add_restriction(relation->to_restriction(db, schema, bound_names));
}
}
}


@@ -83,6 +83,8 @@ private:
*/
::shared_ptr<single_column_restrictions> _nonprimary_key_restrictions;
std::unordered_set<const column_definition*> _not_null_columns;
/**
* The restrictions used to build the index expressions
*/
@@ -112,7 +114,8 @@ public:
const std::vector<::shared_ptr<relation>>& where_clause,
::shared_ptr<variable_specifications> bound_names,
bool selects_only_static_columns,
bool select_a_collection);
bool select_a_collection,
bool for_view = false);
private:
void add_restriction(::shared_ptr<restriction> restriction);
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction);


@@ -97,7 +97,14 @@ public:
if (!buf) {
throw exceptions::invalid_request_exception("Invalid null token value");
}
return dht::token(dht::token::kind::key, *buf);
auto tk = dht::global_partitioner().from_bytes(*buf);
if (tk.is_minimum() && !is_start(b)) {
// The token was parsed as a minimum marker (token::kind::before_all_keys), but
// as it appears in the end bound position, it is actually the maximum marker
// (token::kind::after_all_keys).
return dht::maximum_token();
}
return tk;
};
const auto start_token = get_token_bound(statements::bound::START);


@@ -232,7 +232,7 @@ uint32_t selection::add_column_for_ordering(const column_definition& c) {
raw_selector::to_selectables(raw_selectors, schema), db, schema, defs);
auto metadata = collect_metadata(schema, raw_selectors, *factories);
if (processes_selection(raw_selectors)) {
if (processes_selection(raw_selectors) || raw_selectors.size() != defs.size()) {
return ::make_shared<selection_with_processing>(schema, std::move(defs), std::move(metadata), std::move(factories));
} else {
return ::make_shared<simple_selection>(schema, std::move(defs), std::move(metadata), false);


@@ -110,6 +110,11 @@ public:
::shared_ptr<term::raw> get_map_key() {
return _map_key;
}
::shared_ptr<term::raw> get_value() {
return _value;
}
protected:
virtual ::shared_ptr<term> to_term(const std::vector<::shared_ptr<column_specification>>& receivers,
::shared_ptr<term::raw> raw, database& db, const sstring& keyspace,


@@ -103,7 +103,7 @@ shared_ptr<transport::event::schema_change> cql3::statements::alter_keyspace_sta
}
shared_ptr<cql3::statements::prepared_statement>
cql3::statements::alter_keyspace_statement::prepare(database& db) {
cql3::statements::alter_keyspace_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_keyspace_statement>(*this));
}


@@ -63,7 +63,7 @@ public:
void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -274,7 +274,7 @@ shared_ptr<transport::event::schema_change> alter_table_statement::change_event(
}
shared_ptr<cql3::statements::prepared_statement>
cql3::statements::alter_table_statement::prepare(database& db) {
cql3::statements::alter_table_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_table_statement>(*this));
}


@@ -80,7 +80,7 @@ public:
virtual void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -43,6 +43,7 @@
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "boost/range/adaptor/map.hpp"
#include "stdx.hh"
namespace cql3 {
@@ -86,14 +87,14 @@ const sstring& alter_type_statement::keyspace() const
return _name.get_keyspace();
}
static int32_t get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
static stdx::optional<uint32_t> get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
{
for (uint32_t i = 0; i < type->field_names().size(); ++i) {
if (field->name() == type->field_names()[i]) {
return i;
return {i};
}
}
return -1;
return {};
}
void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only)
@@ -164,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
if (get_idx_of_field(to_update, _field_name) >= 0) {
if (get_idx_of_field(to_update, _field_name)) {
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
}
@@ -181,19 +182,19 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type to_update) const
{
uint32_t idx = get_idx_of_field(to_update, _field_name);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
}
auto previous = to_update->field_types()[idx];
auto previous = to_update->field_types()[*idx];
auto new_type = _field_type->prepare(db, keyspace())->get_type();
if (!new_type->is_compatible_with(*previous)) {
throw exceptions::invalid_request_exception(sprint("Type %s is incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
}
std::vector<data_type> new_types(to_update->field_types());
new_types[idx] = new_type;
new_types[*idx] = new_type;
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, to_update->field_names(), std::move(new_types));
}
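The switch to `stdx::optional` above also closes a latent bug visible in the old `do_alter`: the index was stored in a `uint32_t`, so the `idx < 0` sentinel check could never fire. In sketch form, with Python's `None` playing the role of an empty optional:

```python
from typing import Optional, Sequence

def idx_of_field(field_names: Sequence[str], field: str) -> Optional[int]:
    # Return the field's index, or None rather than a -1 sentinel
    # that an unsigned comparison would silently swallow.
    for i, name in enumerate(field_names):
        if name == field:
            return i
    return None
```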
@@ -217,11 +218,11 @@ user_type alter_type_statement::renames::make_updated_type(database& db, user_ty
std::vector<bytes> new_names(to_update->field_names());
for (auto&& rename : _renames) {
auto&& from = rename.first;
int32_t idx = get_idx_of_field(to_update, from);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, from);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", from->to_string(), _name.to_string()));
}
new_names[idx] = rename.second->name();
new_names[*idx] = rename.second->name();
}
auto&& updated = user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), to_update->field_types());
create_type_statement::check_for_duplicate_names(updated);
@@ -229,12 +230,12 @@ user_type alter_type_statement::renames::make_updated_type(database& db, user_ty
}
shared_ptr<cql3::statements::prepared_statement>
alter_type_statement::add_or_alter::prepare(database& db) {
alter_type_statement::add_or_alter::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_type_statement::add_or_alter>(*this));
}
shared_ptr<cql3::statements::prepared_statement>
alter_type_statement::renames::prepare(database& db) {
alter_type_statement::renames::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_type_statement::renames>(*this));
}


@@ -84,7 +84,7 @@ public:
const shared_ptr<column_identifier> field_name,
const shared_ptr<cql3_type::raw> field_type);
virtual user_type make_updated_type(database& db, user_type to_update) const override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
private:
user_type do_add(database& db, user_type to_update) const;
user_type do_alter(database& db, user_type to_update) const;
@@ -101,7 +101,7 @@ public:
void add_rename(shared_ptr<column_identifier> previous_name, shared_ptr<column_identifier> new_name);
virtual user_type make_updated_type(database& db, user_type to_update) const override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -47,7 +47,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() {
}
::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authentication_statement::prepare(
database& db) {
database& db, cql_stats& stats) {
return ::make_shared<prepared>(this->shared_from_this());
}


@@ -54,7 +54,7 @@ class authentication_statement : public raw::parsed_statement, public cql_statem
public:
uint32_t get_bound_terms() override;
::shared_ptr<prepared> prepare(database& db) override;
::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;


@@ -47,7 +47,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() {
}
::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authorization_statement::prepare(
database& db) {
database& db, cql_stats& stats) {
return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
}


@@ -58,7 +58,7 @@ class authorization_statement : public raw::parsed_statement, public cql_stateme
public:
uint32_t get_bound_terms() override;
::shared_ptr<prepared> prepare(database& db) override;
::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;


@@ -102,18 +102,18 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
namespace raw {
shared_ptr<prepared_statement>
batch_statement::prepare(database& db) {
batch_statement::prepare(database& db, cql_stats& stats) {
auto&& bound_names = get_bound_variables();
std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
for (auto&& parsed : _parsed_statements) {
statements.push_back(parsed->prepare(db, bound_names));
statements.push_back(parsed->prepare(db, bound_names, stats));
}
auto&& prep_attrs = _attrs->prepare(db, "[batch]", "[batch]");
prep_attrs->collect_marker_specification(bound_names);
cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs));
cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs), stats);
batch_statement_.validate();
return ::make_shared<prepared>(make_shared(std::move(batch_statement_)),


@@ -74,6 +74,7 @@ private:
std::vector<shared_ptr<modification_statement>> _statements;
std::unique_ptr<attributes> _attrs;
bool _has_conditions;
cql_stats& _stats;
public:
/**
* Creates a new BatchStatement from a list of statements and a
@@ -85,10 +86,12 @@ public:
*/
batch_statement(int bound_terms, type type_,
std::vector<shared_ptr<modification_statement>> statements,
std::unique_ptr<attributes> attrs)
std::unique_ptr<attributes> attrs,
cql_stats& stats)
: _bound_terms(bound_terms), _type(type_), _statements(std::move(statements))
, _attrs(std::move(attrs))
, _has_conditions(boost::algorithm::any_of(_statements, std::mem_fn(&modification_statement::has_conditions))) {
, _has_conditions(boost::algorithm::any_of(_statements, std::mem_fn(&modification_statement::has_conditions)))
, _stats(stats) {
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
@@ -175,6 +178,7 @@ private:
boost::make_counting_iterator<size_t>(_statements.size()),
[this, &storage, &options, now, local, &result, trace_state] (size_t i) {
auto&& statement = _statements[i];
statement->inc_cql_stats();
auto&& statement_options = options.for_statement(i);
auto timestamp = _attrs->get_timestamp(now, statement_options);
return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
@@ -195,6 +199,7 @@ public:
virtual future<shared_ptr<transport::messages::result_message>> execute(
distributed<service::storage_proxy>& storage, service::query_state& state, const query_options& options) override {
++_stats.batches;
return execute(storage, state, options, false, options.get_timestamp(state));
}
private:


@@ -100,12 +100,12 @@ void cf_prop_defs::validate() {
}
auto compression_options = get_compression_options();
if (!compression_options.empty()) {
auto sstable_compression_class = compression_options.find(sstring(compression_parameters::SSTABLE_COMPRESSION));
if (sstable_compression_class == compression_options.end()) {
if (compression_options && !compression_options->empty()) {
auto sstable_compression_class = compression_options->find(sstring(compression_parameters::SSTABLE_COMPRESSION));
if (sstable_compression_class == compression_options->end()) {
throw exceptions::configuration_exception(sstring("Missing sub-option '") + compression_parameters::SSTABLE_COMPRESSION + "' for the '" + KW_COMPRESSION + "' option.");
}
compression_parameters cp(compression_options);
compression_parameters cp(*compression_options);
cp.validate();
}
@@ -131,12 +131,12 @@ std::map<sstring, sstring> cf_prop_defs::get_compaction_options() const {
return std::map<sstring, sstring>{};
}
std::map<sstring, sstring> cf_prop_defs::get_compression_options() const {
stdx::optional<std::map<sstring, sstring>> cf_prop_defs::get_compression_options() const {
auto compression_options = get_map(KW_COMPRESSION);
if (compression_options) {
return compression_options.value();
return { compression_options.value() };
}
return std::map<sstring, sstring>{};
return { };
}
int32_t cf_prop_defs::get_default_time_to_live() const
@@ -206,8 +206,9 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder) {
     }
     builder.set_bloom_filter_fp_chance(get_double(KW_BF_FP_CHANCE, builder.get_bloom_filter_fp_chance()));
-    if (!get_compression_options().empty()) {
-        builder.set_compressor_params(compression_parameters(get_compression_options()));
+    auto compression_options = get_compression_options();
+    if (compression_options) {
+        builder.set_compressor_params(compression_parameters(*compression_options));
     }
 #if 0
     CachingOptions cachingOptions = getCachingOptions();
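The optional-returning refactor above distinguishes "compression never specified" from "compression specified with an empty map", which the old empty-map sentinel conflated. A minimal standalone sketch of the same pattern, using `std::optional` and hypothetical names in place of Scylla's `stdx::optional` and `cf_prop_defs`:

```cpp
#include <map>
#include <optional>
#include <string>

using options_map = std::map<std::string, std::string>;

// Hypothetical stand-in for cf_prop_defs: holds the raw WITH-clause options.
struct prop_defs {
    std::optional<options_map> compression; // nullopt == option never given

    // Mirrors the new get_compression_options(): absence is representable.
    std::optional<options_map> get_compression_options() const {
        return compression;
    }
};

// Mirrors the apply_to_builder() change: only touch the compressor when the
// user actually supplied a 'compression' option, even an empty one.
bool should_set_compressor(const prop_defs& p) {
    return static_cast<bool>(p.get_compression_options());
}
```

With the old signature, `WITH compression = {}` and the absence of any `compression` clause both came back as an empty map, so the builder could not tell whether to override the default compressor.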


@@ -82,7 +82,7 @@ private:
 public:
     void validate();
     std::map<sstring, sstring> get_compaction_options() const;
-    std::map<sstring, sstring> get_compression_options() const;
+    stdx::optional<std::map<sstring, sstring>> get_compression_options() const;
 #if 0
     public CachingOptions getCachingOptions() throws SyntaxException, ConfigurationException
     {


@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once

#include "cql3/statements/cf_prop_defs.hh"

namespace cql3 {

namespace statements {

/**
 * Class for common statement properties.
 */
class cf_properties final {
    const ::shared_ptr<cf_prop_defs> _properties = ::make_shared<cf_prop_defs>();
    bool _use_compact_storage = false;
    std::vector<std::pair<::shared_ptr<column_identifier>, bool>> _defined_ordering; // Insertion ordering is important
public:
    auto& properties() const {
        return _properties;
    }

    bool use_compact_storage() const {
        return _use_compact_storage;
    }

    void set_compact_storage() {
        _use_compact_storage = true;
    }

    auto& defined_ordering() const {
        return _defined_ordering;
    }

    data_type get_reversable_type(::shared_ptr<column_identifier> t, data_type type) const {
        auto is_reversed = find_ordering_info(t);
        return is_reversed && *is_reversed ? reversed_type_impl::get_instance(type) : type;
    }

    std::experimental::optional<bool> find_ordering_info(::shared_ptr<column_identifier> type) const {
        for (auto& t: _defined_ordering) {
            if (*(t.first) == *type) {
                return t.second;
            }
        }
        return {};
    }

    void set_ordering(::shared_ptr<column_identifier> alias, bool reversed) {
        _defined_ordering.emplace_back(alias, reversed);
    }

    void validate() {
        _properties->validate();
    }
};

}

}
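The new `cf_properties` helper centralizes the COMPACT STORAGE flag and the CLUSTERING ORDER bookkeeping that were previously duplicated inside `create_table_statement::raw_statement`. A small standalone sketch of the `find_ordering_info()` contract, with hypothetical plain-string names standing in for `column_identifier`:

```cpp
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Insertion order is preserved: CLUSTERING ORDER entries must appear in the
// same order as the clustering key, so a vector is used instead of a map.
struct ordering_props {
    std::vector<std::pair<std::string, bool>> defined; // (column, reversed?)

    void set_ordering(std::string col, bool reversed) {
        defined.emplace_back(std::move(col), reversed);
    }

    // Three-way answer: not mentioned (nullopt), ASC (false), DESC (true).
    std::optional<bool> find_ordering_info(const std::string& col) const {
        for (const auto& [name, reversed] : defined) {
            if (name == col) {
                return reversed;
            }
        }
        return std::nullopt;
    }
};
```

`get_reversable_type()` builds on this three-way answer: only a column that is both mentioned and marked DESC gets wrapped in the reversed type.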


@@ -207,7 +207,7 @@ cql3::statements::create_index_statement::announce_migration(distributed<service
 }
 shared_ptr<cql3::statements::prepared_statement>
-cql3::statements::create_index_statement::prepare(database& db) {
+cql3::statements::create_index_statement::prepare(database& db, cql_stats& stats) {
     return make_shared<prepared_statement>(make_shared<create_index_statement>(*this));
 }


@@ -87,7 +87,7 @@ public:
             transport::event::schema_change::target_type::TABLE, keyspace(),
             column_family());
     }
-    virtual shared_ptr<prepared> prepare(database& db) override;
+    virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
 };
 }


@@ -124,7 +124,7 @@ shared_ptr<transport::event::schema_change> create_keyspace_statement::change_ev
 }
 shared_ptr<cql3::statements::prepared_statement>
-cql3::statements::create_keyspace_statement::prepare(database& db) {
+cql3::statements::create_keyspace_statement::prepare(database& db, cql_stats& stats) {
     return make_shared<prepared_statement>(make_shared<create_keyspace_statement>(*this));
 }


@@ -84,7 +84,7 @@ public:
     virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
     virtual shared_ptr<transport::event::schema_change> change_event() override;
-    virtual shared_ptr<prepared> prepare(database& db) override;
+    virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
 };
 }


@@ -64,12 +64,6 @@ create_table_statement::create_table_statement(::shared_ptr<cf_name> name,
     , _properties{properties}
     , _if_not_exists{if_not_exists}
 {
-    if (!properties->has_property(cf_prop_defs::KW_COMPRESSION) && schema::DEFAULT_COMPRESSOR) {
-        std::map<sstring, sstring> compression = {
-            { sstring(compression_parameters::SSTABLE_COMPRESSION), schema::DEFAULT_COMPRESSOR.value() },
-        };
-        properties->add_property(cf_prop_defs::KW_COMPRESSION, compression);
-    }
 }
 future<> create_table_statement::check_access(const service::client_state& state) {
@@ -157,7 +151,7 @@ void create_table_statement::add_column_metadata_from_aliases(schema_builder& bu
 }
 shared_ptr<prepared_statement>
-create_table_statement::prepare(database& db) {
+create_table_statement::prepare(database& db, cql_stats& stats) {
     // Cannot happen; create_table_statement is never instantiated as a raw statement
     // (instead we instantiate create_table_statement::raw_statement)
     abort();
@@ -169,7 +163,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     , _if_not_exists{if_not_exists}
 { }
-::shared_ptr<prepared_statement> create_table_statement::raw_statement::prepare(database& db) {
+::shared_ptr<prepared_statement> create_table_statement::raw_statement::prepare(database& db, cql_stats& stats) {
     // Column family name
     const sstring& cf_name = _cf_name->get_column_family();
     std::regex name_regex("\\w+");
@@ -188,9 +182,9 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
         throw exceptions::invalid_request_exception(sprint("Multiple definition of identifier %s", (*i)->text()));
     }
-    properties->validate();
+    _properties.validate();
-    auto stmt = ::make_shared<create_table_statement>(_cf_name, properties, _if_not_exists, _static_columns);
+    auto stmt = ::make_shared<create_table_statement>(_cf_name, _properties.properties(), _if_not_exists, _static_columns);
     std::experimental::optional<std::map<bytes, data_type>> defined_multi_cell_collections;
     for (auto&& entry : _definitions) {
@@ -214,7 +208,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
         throw exceptions::invalid_request_exception("Multiple PRIMARY KEYs specifed (exactly one required)");
     }
-    stmt->_use_compact_storage = _use_compact_storage;
+    stmt->_use_compact_storage = _properties.use_compact_storage();
     auto& key_aliases = _key_aliases[0];
     std::vector<data_type> key_types;
@@ -233,7 +227,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     // Handle column aliases
     if (_column_aliases.empty()) {
-        if (_use_compact_storage) {
+        if (_properties.use_compact_storage()) {
             // There should remain some column definition since it is a non-composite "static" CF
             if (stmt->_columns.empty()) {
                 throw exceptions::invalid_request_exception("No definition found that is not part of the PRIMARY KEY");
@@ -246,7 +240,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     } else {
         // If we use compact storage and have only one alias, it is a
         // standard "dynamic" CF, otherwise it's a composite
-        if (_use_compact_storage && _column_aliases.size() == 1) {
+        if (_properties.use_compact_storage() && _column_aliases.size() == 1) {
             if (defined_multi_cell_collections) {
                 throw exceptions::invalid_request_exception("Collection types are not supported with COMPACT STORAGE");
             }
@@ -274,7 +268,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
         types.emplace_back(type);
     }
-    if (_use_compact_storage) {
+    if (_properties.use_compact_storage()) {
         if (defined_multi_cell_collections) {
             throw exceptions::invalid_request_exception("Collection types are not supported with COMPACT STORAGE");
         }
@@ -287,7 +281,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     if (!_static_columns.empty()) {
         // Only CQL3 tables can have static columns
-        if (_use_compact_storage) {
+        if (_properties.use_compact_storage()) {
             throw exceptions::invalid_request_exception("Static columns are not supported in COMPACT STORAGE tables");
         }
         // Static columns only make sense if we have at least one clustering column. Otherwise everything is static anyway
@@ -296,7 +290,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
         }
     }
-    if (_use_compact_storage && !stmt->_column_aliases.empty()) {
+    if (_properties.use_compact_storage() && !stmt->_column_aliases.empty()) {
         if (stmt->_columns.empty()) {
 #if 0
             // The only value we'll insert will be the empty one, so the default validator don't matter
@@ -322,7 +316,7 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     } else {
         // For compact, we are in the "static" case, so we need at least one column defined. For non-compact however, having
         // just the PK is fine since we have CQL3 row marker.
-        if (_use_compact_storage && stmt->_columns.empty()) {
+        if (_properties.use_compact_storage() && stmt->_columns.empty()) {
             throw exceptions::invalid_request_exception("COMPACT STORAGE with non-composite PRIMARY KEY require one column not part of the PRIMARY KEY, none given");
         }
 #if 0
@@ -335,18 +329,18 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
     }
     // If we give a clustering order, we must explicitly do so for all aliases and in the order of the PK
-    if (!_defined_ordering.empty()) {
-        if (_defined_ordering.size() > _column_aliases.size()) {
+    if (!_properties.defined_ordering().empty()) {
+        if (_properties.defined_ordering().size() > _column_aliases.size()) {
             throw exceptions::invalid_request_exception("Only clustering key columns can be defined in CLUSTERING ORDER directive");
         }
         int i = 0;
-        for (auto& pair: _defined_ordering){
+        for (auto& pair: _properties.defined_ordering()){
             auto& id = pair.first;
             auto& c = _column_aliases.at(i);
             if (!(*id == *c)) {
-                if (find_ordering_info(c)) {
+                if (_properties.find_ordering_info(c)) {
                     throw exceptions::invalid_request_exception(sprint("The order of columns in the CLUSTERING ORDER directive must be the one of the clustering key (%s must appear before %s)", c, id));
                 } else {
                     throw exceptions::invalid_request_exception(sprint("Missing CLUSTERING ORDER for column %s", c));
@@ -371,12 +365,7 @@ data_type create_table_statement::raw_statement::get_type_and_remove(column_map_
     }
     columns.erase(t);
-    auto is_reversed = find_ordering_info(t);
-    if (!is_reversed) {
-        return type;
-    } else {
-        return *is_reversed ? reversed_type_impl::get_instance(type) : type;
-    }
+    return _properties.get_reversable_type(t, type);
 }
 void create_table_statement::raw_statement::add_definition(::shared_ptr<column_identifier> def, ::shared_ptr<cql3_type::raw> type, bool is_static) {
@@ -395,14 +384,6 @@ void create_table_statement::raw_statement::add_column_alias(::shared_ptr<column
     _column_aliases.emplace_back(alias);
 }
-void create_table_statement::raw_statement::set_ordering(::shared_ptr<column_identifier> alias, bool reversed) {
-    _defined_ordering.emplace_back(alias, reversed);
-}
-void create_table_statement::raw_statement::set_compact_storage() {
-    _use_compact_storage = true;
-}
 }
 }


@@ -43,6 +43,7 @@
 #include "cql3/statements/schema_altering_statement.hh"
 #include "cql3/statements/cf_prop_defs.hh"
+#include "cql3/statements/cf_properties.hh"
 #include "cql3/statements/raw/cf_statement.hh"
 #include "cql3/cql3_type.hh"
@@ -103,7 +104,7 @@ public:
     virtual shared_ptr<transport::event::schema_change> change_event() override;
-    virtual shared_ptr<prepared> prepare(database& db) override;
+    virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
     schema_ptr get_cf_meta_data();
@@ -125,30 +126,22 @@ private:
             shared_ptr_value_hash<column_identifier>,
             shared_ptr_equal_by_value<column_identifier>>;
     defs_type _definitions;
-public:
-    const ::shared_ptr<cf_prop_defs> properties = ::make_shared<cf_prop_defs>();
-private:
     std::vector<std::vector<::shared_ptr<column_identifier>>> _key_aliases;
     std::vector<::shared_ptr<column_identifier>> _column_aliases;
-    std::vector<std::pair<::shared_ptr<column_identifier>, bool>> _defined_ordering; // Insertion ordering is important
-    std::experimental::optional<bool> find_ordering_info(::shared_ptr<column_identifier> type) {
-        for (auto& t: _defined_ordering) {
-            if (*(t.first) == *type) {
-                return t.second;
-            }
-        }
-        return {};
-    }
     create_table_statement::column_set_type _static_columns;
-    bool _use_compact_storage = false;
     std::multiset<::shared_ptr<column_identifier>,
             indirect_less<::shared_ptr<column_identifier>, column_identifier::text_comparator>> _defined_names;
     bool _if_not_exists;
+    cf_properties _properties;
 public:
     raw_statement(::shared_ptr<cf_name> name, bool if_not_exists);
-    virtual ::shared_ptr<prepared> prepare(database& db) override;
+    virtual ::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
+    cf_properties& properties() {
+        return _properties;
+    }
     data_type get_type_and_remove(column_map_type& columns, ::shared_ptr<column_identifier> t);
@@ -157,10 +150,6 @@ public:
     void add_key_aliases(const std::vector<::shared_ptr<column_identifier>> aliases);
     void add_column_alias(::shared_ptr<column_identifier> alias);
-    void set_ordering(::shared_ptr<column_identifier> alias, bool reversed);
-    void set_compact_storage();
 };
 }


@@ -157,7 +157,7 @@ future<bool> create_type_statement::announce_migration(distributed<service::stor
 }
 shared_ptr<cql3::statements::prepared_statement>
-create_type_statement::prepare(database& db) {
+create_type_statement::prepare(database& db, cql_stats& stats) {
     return make_shared<prepared_statement>(make_shared<create_type_statement>(*this));
 }


@@ -69,7 +69,7 @@ public:
     virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
-    virtual shared_ptr<prepared> prepare(database& db) override;
+    virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
     static void check_for_duplicate_names(user_type type);
 private:


@@ -0,0 +1,344 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright (C) 2016 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <inttypes.h>
#include <regex>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/adjacent_find.hpp>
#include "cql3/statements/create_view_statement.hh"
#include "cql3/statements/prepared_statement.hh"
#include "schema_builder.hh"
#include "service/storage_proxy.hh"
namespace cql3 {
namespace statements {
create_view_statement::create_view_statement(
::shared_ptr<cf_name> view_name,
::shared_ptr<cf_name> base_name,
std::vector<::shared_ptr<selection::raw_selector>> select_clause,
std::vector<::shared_ptr<relation>> where_clause,
std::vector<::shared_ptr<cql3::column_identifier::raw>> partition_keys,
std::vector<::shared_ptr<cql3::column_identifier::raw>> clustering_keys,
bool if_not_exists)
: schema_altering_statement{view_name}
, _base_name{base_name}
, _select_clause{select_clause}
, _where_clause{where_clause}
, _partition_keys{partition_keys}
, _clustering_keys{clustering_keys}
, _if_not_exists{if_not_exists}
{
// TODO: probably need to create a "statement_restrictions" like select does
// based on the select_clause, base_name and where_clause; However need to
// pass for_view=true.
fail(unimplemented::cause::VIEWS);
}
// FIXME: I copied the following from create_table_statement. I don't know
// what they do or whether they need to change for create view.
future<> create_view_statement::check_access(const service::client_state& state) {
return state.has_keyspace_access(keyspace(), auth::permission::CREATE);
}
void create_view_statement::validate(distributed<service::storage_proxy>&, const service::client_state& state) {
// validated in announceMigration()
}
future<bool> create_view_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) {
// FIXME: this code from create_table_view is probably wrong, the Java CreateViewStatement.announceMigration is much more elaborate
#if 0
****** Our implementation in create_table_statement (simpler code, but it was simpler in Java too)
return make_ready_future<>().then([this, is_local_only] {
return service::get_local_migration_manager().announce_new_column_family(get_cf_meta_data(), is_local_only);
}).then_wrapped([this] (auto&& f) {
try {
f.get();
return true;
} catch (const exceptions::already_exists_exception& e) {
if (_if_not_exists) {
return false;
}
throw e;
}
});
#endif
#if 0
***** This if 0 code is from Cassandra CreateViewStatement
// We need to make sure that:
// - primary key includes all columns in base table's primary key
// - make sure that the select statement does not have anything other than columns
// and their names match the base table's names
// - make sure that primary key does not include any collections
// - make sure there is no where clause in the select statement
// - make sure there is not currently a table or view
// - make sure baseTable gcGraceSeconds > 0
properties.validate();
if (properties.useCompactStorage)
throw new InvalidRequestException("Cannot use 'COMPACT STORAGE' when defining a materialized view");
#endif
// View and base tables must be in the same keyspace, to ensure that RF
// is the same (because we assign a view replica to each base replica).
// If a keyspace was not specified for the base table name, it is assumed
// it is in the same keyspace as the view table being created (which
// itself might be the current USEd keyspace, or explicitly specified).
if (_base_name->get_keyspace().empty()) {
_base_name->set_keyspace(keyspace(), true);
}
if (_base_name->get_keyspace() != keyspace()) {
throw exceptions::invalid_request_exception(sprint(
"Cannot create a materialized view on a table in a separate keyspace ('%s' != '%s')",
_base_name->get_keyspace(), keyspace()));
}
// Validate that the keyspace and the base table exist, and is not a
// special table on which we cannot create views:
// CONTINUE HERE: something like the code below taken from migration_manager instead of the validateColumFamily below
auto& db = service::get_local_storage_proxy().get_db().local();
if (!db.has_keyspace(_base_name->get_keyspace())) {
throw exceptions::invalid_request_exception(sprint(
"Keyspace '%s' does not exist", _base_name->get_keyspace()));
}
// auto& ks = db.find_keyspace(_base_name->get_keyspace());
if (!db.has_schema(_base_name->get_keyspace(), _base_name->get_column_family())) {
throw exceptions::invalid_request_exception(sprint(
"Base table '%s' does not exist",
_base_name->get_column_family()));
}
return make_ready_future<bool>(true);
// if (db.has_schema(_base_name->get_keyspace(), _base_name->get_column_family())) {
// throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
// }
#if 0
CFMetaData cfm = ThriftValidation.validateColumnFamily(baseName.getKeyspace(), baseName.getColumnFamily());
if (cfm.isCounter())
throw new InvalidRequestException("Materialized views are not supported on counter tables");
if (cfm.isView())
throw new InvalidRequestException("Materialized views cannot be created against other materialized views");
if (cfm.params.gcGraceSeconds == 0)
{
throw new InvalidRequestException(String.format("Cannot create materialized view '%s' for base table " +
"'%s' with gc_grace_seconds of 0, since this value is " +
"used to TTL undelivered updates. Setting gc_grace_seconds" +
" too low might cause undelivered updates to expire " +
"before being replayed.", cfName.getColumnFamily(),
baseName.getColumnFamily()));
}
Set<ColumnIdentifier> included = new HashSet<>();
for (RawSelector selector : selectClause)
{
Selectable.Raw selectable = selector.selectable;
if (selectable instanceof Selectable.WithFieldSelection.Raw)
throw new InvalidRequestException("Cannot select out a part of type when defining a materialized view");
if (selectable instanceof Selectable.WithFunction.Raw)
throw new InvalidRequestException("Cannot use function when defining a materialized view");
if (selectable instanceof Selectable.WritetimeOrTTL.Raw)
throw new InvalidRequestException("Cannot use function when defining a materialized view");
ColumnIdentifier identifier = (ColumnIdentifier) selectable.prepare(cfm);
if (selector.alias != null)
throw new InvalidRequestException(String.format("Cannot alias column '%s' as '%s' when defining a materialized view", identifier.toString(), selector.alias.toString()));
ColumnDefinition cdef = cfm.getColumnDefinition(identifier);
if (cdef == null)
throw new InvalidRequestException("Unknown column name detected in CREATE MATERIALIZED VIEW statement : "+identifier);
included.add(identifier);
}
Set<ColumnIdentifier.Raw> targetPrimaryKeys = new HashSet<>();
for (ColumnIdentifier.Raw identifier : Iterables.concat(partitionKeys, clusteringKeys))
{
if (!targetPrimaryKeys.add(identifier))
throw new InvalidRequestException("Duplicate entry found in PRIMARY KEY: "+identifier);
ColumnDefinition cdef = cfm.getColumnDefinition(identifier.prepare(cfm));
if (cdef == null)
throw new InvalidRequestException("Unknown column name detected in CREATE MATERIALIZED VIEW statement : "+identifier);
if (cfm.getColumnDefinition(identifier.prepare(cfm)).type.isMultiCell())
throw new InvalidRequestException(String.format("Cannot use MultiCell column '%s' in PRIMARY KEY of materialized view", identifier));
if (cdef.isStatic())
throw new InvalidRequestException(String.format("Cannot use Static column '%s' in PRIMARY KEY of materialized view", identifier));
}
// build the select statement
Map<ColumnIdentifier.Raw, Boolean> orderings = Collections.emptyMap();
SelectStatement.Parameters parameters = new SelectStatement.Parameters(orderings, false, true, false);
SelectStatement.RawStatement rawSelect = new SelectStatement.RawStatement(baseName, parameters, selectClause, whereClause, null, null);
ClientState state = ClientState.forInternalCalls();
state.setKeyspace(keyspace());
rawSelect.prepareKeyspace(state);
rawSelect.setBoundVariables(getBoundVariables());
ParsedStatement.Prepared prepared = rawSelect.prepare(true);
SelectStatement select = (SelectStatement) prepared.statement;
StatementRestrictions restrictions = select.getRestrictions();
if (!prepared.boundNames.isEmpty())
throw new InvalidRequestException("Cannot use query parameters in CREATE MATERIALIZED VIEW statements");
if (!restrictions.nonPKRestrictedColumns(false).isEmpty())
{
throw new InvalidRequestException(String.format(
"Non-primary key columns cannot be restricted in the SELECT statement used for materialized view " +
"creation (got restrictions on: %s)",
restrictions.nonPKRestrictedColumns(false).stream().map(def -> def.name.toString()).collect(Collectors.joining(", "))));
}
String whereClauseText = View.relationsToWhereClause(whereClause.relations);
Set<ColumnIdentifier> basePrimaryKeyCols = new HashSet<>();
for (ColumnDefinition definition : Iterables.concat(cfm.partitionKeyColumns(), cfm.clusteringColumns()))
basePrimaryKeyCols.add(definition.name);
List<ColumnIdentifier> targetClusteringColumns = new ArrayList<>();
List<ColumnIdentifier> targetPartitionKeys = new ArrayList<>();
// This is only used as an intermediate state; this is to catch whether multiple non-PK columns are used
boolean hasNonPKColumn = false;
for (ColumnIdentifier.Raw raw : partitionKeys)
hasNonPKColumn |= getColumnIdentifier(cfm, basePrimaryKeyCols, hasNonPKColumn, raw, targetPartitionKeys, restrictions);
for (ColumnIdentifier.Raw raw : clusteringKeys)
hasNonPKColumn |= getColumnIdentifier(cfm, basePrimaryKeyCols, hasNonPKColumn, raw, targetClusteringColumns, restrictions);
// We need to include all of the primary key columns from the base table in order to make sure that we do not
// overwrite values in the view. We cannot support "collapsing" the base table into a smaller number of rows in
// the view because if we need to generate a tombstone, we have no way of knowing which value is currently being
// used in the view and whether or not to generate a tombstone. In order to not surprise our users, we require
// that they include all of the columns. We provide them with a list of all of the columns left to include.
boolean missingClusteringColumns = false;
StringBuilder columnNames = new StringBuilder();
List<ColumnIdentifier> includedColumns = new ArrayList<>();
for (ColumnDefinition def : cfm.allColumns())
{
ColumnIdentifier identifier = def.name;
boolean includeDef = included.isEmpty() || included.contains(identifier);
if (includeDef && def.isStatic())
{
throw new InvalidRequestException(String.format("Unable to include static column '%s' which would be included by Materialized View SELECT * statement", identifier));
}
if (includeDef && !targetClusteringColumns.contains(identifier) && !targetPartitionKeys.contains(identifier))
{
includedColumns.add(identifier);
}
if (!def.isPrimaryKeyColumn()) continue;
if (!targetClusteringColumns.contains(identifier) && !targetPartitionKeys.contains(identifier))
{
if (missingClusteringColumns)
columnNames.append(',');
else
missingClusteringColumns = true;
columnNames.append(identifier);
}
}
if (missingClusteringColumns)
throw new InvalidRequestException(String.format("Cannot create Materialized View %s without primary key columns from base %s (%s)",
columnFamily(), baseName.getColumnFamily(), columnNames.toString()));
if (targetPartitionKeys.isEmpty())
throw new InvalidRequestException("Must select at least a column for a Materialized View");
if (targetClusteringColumns.isEmpty())
throw new InvalidRequestException("No columns are defined for Materialized View other than primary key");
CFMetaData.Builder cfmBuilder = CFMetaData.Builder.createView(keyspace(), columnFamily());
add(cfm, targetPartitionKeys, cfmBuilder::addPartitionKey);
add(cfm, targetClusteringColumns, cfmBuilder::addClusteringColumn);
add(cfm, includedColumns, cfmBuilder::addRegularColumn);
cfmBuilder.withId(properties.properties.getId());
TableParams params = properties.properties.asNewTableParams();
CFMetaData viewCfm = cfmBuilder.build().params(params);
ViewDefinition definition = new ViewDefinition(keyspace(),
columnFamily(),
Schema.instance.getId(keyspace(), baseName.getColumnFamily()),
baseName.getColumnFamily(),
included.isEmpty(),
rawSelect,
whereClauseText,
viewCfm);
try
{
MigrationManager.announceNewView(definition, isLocalOnly);
return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
}
catch (AlreadyExistsException e)
{
if (ifNotExists)
return null;
throw e;
}
#endif
}
shared_ptr<transport::event::schema_change> create_view_statement::change_event() {
// FIXME: this is probably wrong, I just copied it from create_table_statement
return make_shared<transport::event::schema_change>(transport::event::schema_change::change_type::CREATED, transport::event::schema_change::target_type::TABLE, keyspace(), column_family());
}
shared_ptr<cql3::statements::prepared_statement>
create_view_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<create_view_statement>(*this));
}
}
}
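The implemented part of `announce_migration()` above enforces that a view lives in the same keyspace as its base table, with an unqualified base name defaulting to the view's keyspace. A standalone sketch of just that rule, under hypothetical simplified names (plain strings and a standard exception instead of `cf_name` and `invalid_request_exception`):

```cpp
#include <stdexcept>
#include <string>

// Sketch of the keyspace rule: view replicas are paired with base replicas,
// so base and view must share a keyspace (and therefore a replication factor).
std::string resolve_base_keyspace(std::string base_ks, const std::string& view_ks) {
    if (base_ks.empty()) {
        base_ks = view_ks; // same default as _base_name->set_keyspace(keyspace(), true)
    }
    if (base_ks != view_ks) {
        throw std::invalid_argument(
            "Cannot create a materialized view on a table in a separate keyspace");
    }
    return base_ks;
}
```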


@@ -0,0 +1,79 @@
/*
* This file is part of Scylla.
* Copyright (C) 2016 ScyllaDB
*
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "cql3/statements/schema_altering_statement.hh"
#include "cql3/statements/cf_prop_defs.hh"
#include "cql3/statements/cf_properties.hh"
#include "cql3/cql3_type.hh"
#include "cql3/selection/raw_selector.hh"
#include "cql3/relation.hh"
#include "cql3/cf_name.hh"
#include "service/migration_manager.hh"
#include "schema.hh"
#include "core/shared_ptr.hh"
#include <utility>
#include <vector>
#include <experimental/optional>
namespace cql3 {
namespace statements {
/** A <code>CREATE MATERIALIZED VIEW</code> parsed from a CQL query statement. */
class create_view_statement : public schema_altering_statement {
private:
::shared_ptr<cf_name> _base_name;
std::vector<::shared_ptr<selection::raw_selector>> _select_clause;
std::vector<::shared_ptr<relation>> _where_clause;
std::vector<::shared_ptr<cql3::column_identifier::raw>> _partition_keys;
std::vector<::shared_ptr<cql3::column_identifier::raw>> _clustering_keys;
cf_properties _properties;
bool _if_not_exists;
public:
create_view_statement(
::shared_ptr<cf_name> view_name,
::shared_ptr<cf_name> base_name,
std::vector<::shared_ptr<selection::raw_selector>> select_clause,
std::vector<::shared_ptr<relation>> where_clause,
std::vector<::shared_ptr<cql3::column_identifier::raw>> partition_keys,
std::vector<::shared_ptr<cql3::column_identifier::raw>> clustering_keys,
bool if_not_exists);
auto& properties() {
return _properties;
}
// Functions we need to override to subclass schema_altering_statement
virtual future<> check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
// FIXME: continue here. See create_table_statement.hh and CreateViewStatement.java
};
}
}

View File

@@ -46,8 +46,8 @@ namespace cql3 {
namespace statements {
delete_statement::delete_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs)
: modification_statement{type, bound_terms, std::move(s), std::move(attrs)}
delete_statement::delete_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, cql_stats& stats)
: modification_statement{type, bound_terms, std::move(s), std::move(attrs), &stats.deletes}
{ }
bool delete_statement::require_full_clustering_key() const {
@@ -80,10 +80,10 @@ void delete_statement::add_update_for_key(mutation& m, const exploded_clustering
namespace raw {
::shared_ptr<cql3::statements::modification_statement>
delete_statement::prepare_internal(database& db, schema_ptr schema, ::shared_ptr<variable_specifications> bound_names,
std::unique_ptr<attributes> attrs) {
delete_statement::prepare_internal(database& db, schema_ptr schema, shared_ptr<variable_specifications> bound_names,
std::unique_ptr<attributes> attrs, cql_stats& stats) {
using statement_type = cql3::statements::modification_statement::statement_type;
auto stmt = ::make_shared<cql3::statements::delete_statement>(statement_type::DELETE, bound_names->size(), schema, std::move(attrs));
auto stmt = ::make_shared<cql3::statements::delete_statement>(statement_type::DELETE, bound_names->size(), schema, std::move(attrs), stats);
for (auto&& deletion : _deletions) {
auto&& id = deletion->affected_column()->prepare_column_identifier(schema);

View File

@@ -56,7 +56,7 @@ namespace statements {
*/
class delete_statement : public modification_statement {
public:
delete_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs);
delete_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, cql_stats& stats);
virtual bool require_full_clustering_key() const override;

View File

@@ -99,7 +99,7 @@ shared_ptr<transport::event::schema_change> drop_keyspace_statement::change_even
}
shared_ptr<cql3::statements::prepared_statement>
drop_keyspace_statement::prepare(database& db) {
drop_keyspace_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<drop_keyspace_statement>(*this));
}

View File

@@ -63,7 +63,7 @@ public:
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}

View File

@@ -100,7 +100,7 @@ shared_ptr<transport::event::schema_change> drop_table_statement::change_event()
}
shared_ptr<cql3::statements::prepared_statement>
drop_table_statement::prepare(database& db) {
drop_table_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<drop_table_statement>(*this));
}

View File

@@ -62,7 +62,7 @@ public:
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}

View File

@@ -118,7 +118,7 @@ future<bool> drop_type_statement::announce_migration(distributed<service::storag
}
shared_ptr<cql3::statements::prepared_statement>
drop_type_statement::prepare(database& db) {
drop_type_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<drop_type_statement>(*this));
}

View File

@@ -65,7 +65,7 @@ public:
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}

View File

@@ -73,12 +73,13 @@ operator<<(std::ostream& out, modification_statement::statement_type t) {
return out;
}
modification_statement::modification_statement(statement_type type_, uint32_t bound_terms, schema_ptr schema_, std::unique_ptr<attributes> attrs_)
modification_statement::modification_statement(statement_type type_, uint32_t bound_terms, schema_ptr schema_, std::unique_ptr<attributes> attrs_, uint64_t* cql_stats_counter_ptr)
: type{type_}
, _bound_terms{bound_terms}
, s{schema_}
, attrs{std::move(attrs_)}
, _column_operations{}
, _cql_modification_counter_ptr(cql_stats_counter_ptr)
{ }
bool modification_statement::uses_function(const sstring& ks_name, const sstring& function_name) const {
@@ -453,6 +454,8 @@ modification_statement::execute(distributed<service::storage_proxy>& proxy, serv
return execute_with_condition(proxy, qs, options);
}
inc_cql_stats();
return execute_without_condition(proxy, qs, options).then([] {
return make_ready_future<::shared_ptr<transport::messages::result_message>>(
::shared_ptr<transport::messages::result_message>{});
@@ -513,6 +516,8 @@ modification_statement::execute_internal(distributed<service::storage_proxy>& pr
tracing::add_table_name(qs.get_trace_state(), keyspace(), column_family());
inc_cql_stats();
return get_mutations(proxy, options, true, options.get_timestamp(qs), qs.get_trace_state()).then(
[&proxy] (auto mutations) {
return proxy.local().mutate_locally(std::move(mutations));
@@ -573,20 +578,20 @@ modification_statement::process_where_clause(database& db, std::vector<relation_
namespace raw {
::shared_ptr<prepared_statement>
modification_statement::modification_statement::prepare(database& db) {
modification_statement::modification_statement::prepare(database& db, cql_stats& stats) {
auto bound_names = get_bound_variables();
auto statement = prepare(db, bound_names);
auto statement = prepare(db, bound_names, stats);
return ::make_shared<prepared>(std::move(statement), *bound_names);
}
::shared_ptr<cql3::statements::modification_statement>
modification_statement::prepare(database& db, ::shared_ptr<variable_specifications> bound_names) {
modification_statement::prepare(database& db, ::shared_ptr<variable_specifications> bound_names, cql_stats& stats) {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
auto prepared_attributes = _attrs->prepare(db, keyspace(), column_family());
prepared_attributes->collect_marker_specification(bound_names);
::shared_ptr<cql3::statements::modification_statement> stmt = prepare_internal(db, schema, bound_names, std::move(prepared_attributes));
::shared_ptr<cql3::statements::modification_statement> stmt = prepare_internal(db, schema, bound_names, std::move(prepared_attributes), stats);
if (_if_not_exists || _if_exists || !_conditions.empty()) {
if (stmt->is_counter()) {

View File

@@ -109,8 +109,10 @@ private:
return cond->column;
};
uint64_t* _cql_modification_counter_ptr = nullptr;
public:
modification_statement(statement_type type_, uint32_t bound_terms, schema_ptr schema_, std::unique_ptr<attributes> attrs_);
modification_statement(statement_type type_, uint32_t bound_terms, schema_ptr schema_, std::unique_ptr<attributes> attrs_, uint64_t* cql_stats_counter_ptr);
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
@@ -152,6 +154,11 @@ public:
staticConditions == null ? Collections.<ColumnDefinition>emptyList() : Iterables.transform(staticConditions, getColumnForCondition));
}
#endif
void inc_cql_stats() {
++(*_cql_modification_counter_ptr);
}
public:
void add_condition(::shared_ptr<column_condition> cond);

View File

@@ -84,7 +84,7 @@ public:
}
}
virtual shared_ptr<prepared> prepare(database& db) override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}

View File

@@ -66,7 +66,7 @@ public:
bool if_exists);
protected:
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs);
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats);
};
}

View File

@@ -77,7 +77,7 @@ public:
bool if_not_exists);
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) override;
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
};

View File

@@ -83,11 +83,11 @@ protected:
modification_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, conditions_vector conditions, bool if_not_exists, bool if_exists);
public:
virtual ::shared_ptr<prepared> prepare(database& db) override;
::shared_ptr<cql3::statements::modification_statement> prepare(database& db, ::shared_ptr<variable_specifications> bound_names);;
virtual ::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
::shared_ptr<cql3::statements::modification_statement> prepare(database& db, ::shared_ptr<variable_specifications> bound_names, cql_stats& stats);
protected:
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs) = 0;
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) = 0;
};
}

View File

@@ -44,6 +44,7 @@
#include "cql3/variable_specifications.hh"
#include "cql3/column_specification.hh"
#include "cql3/column_identifier.hh"
#include "cql3/stats.hh"
#include <seastar/core/shared_ptr.hh>
@@ -70,7 +71,7 @@ public:
void set_bound_variables(const std::vector<::shared_ptr<column_identifier>>& bound_names);
virtual ::shared_ptr<prepared> prepare(database& db) = 0;
virtual ::shared_ptr<prepared> prepare(database& db, cql_stats& stats) = 0;
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const;
};

View File

@@ -100,7 +100,7 @@ public:
std::vector<::shared_ptr<relation>> where_clause,
::shared_ptr<term::raw> limit);
virtual ::shared_ptr<prepared> prepare(database& db) override;
virtual ::shared_ptr<prepared> prepare(database& db,cql_stats& stats) override;
private:
::shared_ptr<restrictions::statement_restrictions> prepare_restrictions(
database& db,

View File

@@ -81,7 +81,7 @@ public:
conditions_vector conditions);
protected:
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs);
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats);
};
}

View File

@@ -58,7 +58,7 @@ private:
public:
use_statement(sstring keyspace);
virtual ::shared_ptr<prepared> prepare(database& db) override;
virtual ::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}

View File

@@ -87,7 +87,8 @@ select_statement::select_statement(schema_ptr schema,
::shared_ptr<restrictions::statement_restrictions> restrictions,
bool is_reversed,
ordering_comparator_type ordering_comparator,
::shared_ptr<term> limit)
::shared_ptr<term> limit,
cql_stats& stats)
: _schema(schema)
, _bound_terms(bound_terms)
, _parameters(std::move(parameters))
@@ -96,6 +97,7 @@ select_statement::select_statement(schema_ptr schema,
, _is_reversed(is_reversed)
, _limit(std::move(limit))
, _ordering_comparator(std::move(ordering_comparator))
, _stats(stats)
{
_opts = _selection->get_query_options();
}
@@ -107,7 +109,7 @@ bool select_statement::uses_function(const sstring& ks_name, const sstring& func
}
::shared_ptr<select_statement>
select_statement::for_selection(schema_ptr schema, ::shared_ptr<selection::selection> selection) {
select_statement::for_selection(schema_ptr schema, ::shared_ptr<selection::selection> selection, cql_stats& stats) {
return ::make_shared<select_statement>(schema,
0,
_default_parameters,
@@ -115,7 +117,8 @@ select_statement::for_selection(schema_ptr schema, ::shared_ptr<selection::selec
::make_shared<restrictions::statement_restrictions>(schema),
false,
ordering_comparator_type{},
::shared_ptr<term>{});
::shared_ptr<term>{},
stats);
}
::shared_ptr<const cql3::metadata> select_statement::get_result_metadata() const {
@@ -227,6 +230,8 @@ select_statement::execute(distributed<service::storage_proxy>& proxy,
int32_t limit = get_limit(options);
auto now = db_clock::now();
++_stats.reads;
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
make_partition_slice(options), limit, to_gc_clock(now), tracing::make_trace_info(state.get_trace_state()), query::max_partitions, options.get_timestamp(state));
@@ -249,6 +254,7 @@ select_statement::execute(distributed<service::storage_proxy>& proxy,
now);
}
command->slice.options.set<query::partition_slice::option::allow_short_read>();
auto p = service::pager::query_pagers::pager(_schema, _selection,
state, options, command, std::move(key_ranges));
@@ -301,7 +307,8 @@ select_statement::execute(distributed<service::storage_proxy>& proxy,
// doing post-query ordering.
if (needs_post_query_ordering() && _limit) {
return do_with(std::forward<std::vector<query::partition_range>>(partition_ranges), [this, &proxy, &state, &options, cmd](auto prs) {
query::result_merger merger;
assert(cmd->partition_limit == query::max_partitions);
query::result_merger merger(cmd->row_limit * prs.size(), query::max_partitions);
return map_reduce(prs.begin(), prs.end(), [this, &proxy, &state, &options, cmd] (auto pr) {
std::vector<query::partition_range> prange { pr };
auto command = ::make_lw_shared<query::read_command>(*cmd);
@@ -331,9 +338,12 @@ select_statement::execute_internal(distributed<service::storage_proxy>& proxy,
tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());
++_stats.reads;
if (needs_post_query_ordering() && _limit) {
return do_with(std::move(partition_ranges), [this, &proxy, &state, command] (auto prs) {
query::result_merger merger;
assert(command->partition_limit == query::max_partitions);
query::result_merger merger(command->row_limit * prs.size(), query::max_partitions);
return map_reduce(prs.begin(), prs.end(), [this, &proxy, &state, command] (auto pr) {
std::vector<query::partition_range> prange { pr };
auto cmd = ::make_lw_shared<query::read_command>(*command);
@@ -367,8 +377,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
if (_is_reversed) {
rs->reverse();
}
rs->trim(cmd->row_limit);
}
rs->trim(cmd->row_limit);
return ::make_shared<transport::messages::result_message::rows>(std::move(rs));
}
@@ -386,7 +396,7 @@ select_statement::select_statement(::shared_ptr<cf_name> cf_name,
, _limit(std::move(limit))
{ }
::shared_ptr<prepared_statement> select_statement::prepare(database& db) {
::shared_ptr<prepared_statement> select_statement::prepare(database& db, cql_stats& stats) {
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
auto bound_names = get_bound_variables();
@@ -418,7 +428,8 @@ select_statement::select_statement(::shared_ptr<cf_name> cf_name,
std::move(restrictions),
is_reversed_,
std::move(ordering_comparator),
prepare_limit(db, bound_names));
prepare_limit(db, bound_names),
stats);
return ::make_shared<prepared>(std::move(stmt), std::move(*bound_names));
}

View File

@@ -89,6 +89,7 @@ private:
ordering_comparator_type _ordering_comparator;
query::partition_slice::option_set _opts;
cql_stats& _stats;
public:
select_statement(schema_ptr schema,
uint32_t bound_terms,
@@ -97,7 +98,8 @@ public:
::shared_ptr<restrictions::statement_restrictions> restrictions,
bool is_reversed,
ordering_comparator_type ordering_comparator,
::shared_ptr<term> limit);
::shared_ptr<term> limit,
cql_stats& stats);
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
@@ -105,7 +107,7 @@ public:
// Note that the results select statement should not be used for actual queries, but only for processing already
// queried data through processColumnFamily.
static ::shared_ptr<select_statement> for_selection(
schema_ptr schema, ::shared_ptr<selection::selection> selection);
schema_ptr schema, ::shared_ptr<selection::selection> selection, cql_stats& stats);
virtual ::shared_ptr<const cql3::metadata> get_result_metadata() const override;
virtual uint32_t get_bound_terms() override;

View File

@@ -59,7 +59,7 @@ uint32_t truncate_statement::get_bound_terms()
return 0;
}
::shared_ptr<prepared_statement> truncate_statement::prepare(database& db)
::shared_ptr<prepared_statement> truncate_statement::prepare(database& db,cql_stats& stats)
{
return ::make_shared<prepared>(this->shared_from_this());
}

View File

@@ -56,7 +56,7 @@ public:
virtual uint32_t get_bound_terms() override;
virtual ::shared_ptr<prepared> prepare(database& db) override;
virtual ::shared_ptr<prepared> prepare(database& db,cql_stats& stats) override;
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;

View File

@@ -50,8 +50,8 @@ namespace cql3 {
namespace statements {
update_statement::update_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs)
: modification_statement{type, bound_terms, std::move(s), std::move(attrs)}
update_statement::update_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr)
: modification_statement{type, bound_terms, std::move(s), std::move(attrs), cql_stats_counter_ptr}
{ }
bool update_statement::require_full_clustering_key() const {
@@ -124,10 +124,10 @@ insert_statement::insert_statement( ::shared_ptr<cf_name> name,
::shared_ptr<cql3::statements::modification_statement>
insert_statement::prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs)
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats)
{
using statement_type = cql3::statements::modification_statement::statement_type;
auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::INSERT, bound_names->size(), schema, std::move(attrs));
auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::INSERT, bound_names->size(), schema, std::move(attrs), &stats.inserts);
// Created from an INSERT
if (stmt->is_counter()) {
@@ -181,10 +181,10 @@ update_statement::update_statement( ::shared_ptr<cf_name> name,
::shared_ptr<cql3::statements::modification_statement>
update_statement::prepare_internal(database& db, schema_ptr schema,
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs)
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats)
{
using statement_type = cql3::statements::modification_statement::statement_type;
auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::UPDATE, bound_names->size(), schema, std::move(attrs));
auto stmt = ::make_shared<cql3::statements::update_statement>(statement_type::UPDATE, bound_names->size(), schema, std::move(attrs), &stats.updates);
for (auto&& entry : _updates) {
auto id = entry.first->prepare_column_identifier(schema);

View File

@@ -65,7 +65,7 @@ public:
private static final Constants.Value EMPTY = new Constants.Value(ByteBufferUtil.EMPTY_BYTE_BUFFER);
#endif
update_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs);
update_statement(statement_type type, uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr);
private:
virtual bool require_full_clustering_key() const override;

View File

@@ -65,7 +65,7 @@ use_statement::use_statement(sstring keyspace)
{
}
::shared_ptr<prepared_statement> use_statement::prepare(database& db)
::shared_ptr<prepared_statement> use_statement::prepare(database& db, cql_stats& stats)
{
return ::make_shared<prepared>(make_shared<cql3::statements::use_statement>(_keyspace));
}

cql3/stats.hh (new file, 36 lines)

View File

@@ -0,0 +1,36 @@
/*
* Copyright (C) 2015 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
namespace cql3 {
struct cql_stats {
uint64_t reads = 0;
uint64_t inserts = 0;
uint64_t updates = 0;
uint64_t deletes = 0;
uint64_t batches = 0;
};
}

File diff suppressed because it is too large

View File

@@ -70,10 +70,10 @@
#include "utils/estimated_histogram.hh"
#include "sstables/compaction.hh"
#include "sstables/sstable_set.hh"
#include "key_reader.hh"
#include <seastar/core/rwlock.hh>
#include <seastar/core/shared_future.hh>
#include "tracing/trace_state.hh"
#include <boost/intrusive/parent_from_member.hpp>
class frozen_mutation;
class reconcilable_result;
@@ -118,44 +118,93 @@ class dirty_memory_manager: public logalloc::region_group_reclaimer {
// memtable is totally gone. That means that if we have throttled requests, they will stay
// throttled for a long time. Even when we have virtual dirty, that only provides a rough
// estimate, and we can't release requests that early.
//
// Ideally, we'd allow one memtable flush per shard (or per database object), and write-behind
// would take care of the rest. But that still has issues, so we'll limit parallelism to some
// number (4), that we will hopefully reduce to 1 when write behind works.
//
// When streaming is going on, we'll separate half of that for the streaming code, which
// effectively increases the total to 6. That is a bit ugly and a bit redundant with the I/O
// Scheduler, but it's the easiest way not to hurt the common case (no streaming) and will have
// to do for the moment. Hopefully we can set both to 1 soon (with write behind)
//
// FIXME: enable write behind and set both to 1. Right now we will take advantage of the fact
// that memtables and streaming will use different specialized classes here and set them as
// default values here.
size_t _concurrency;
semaphore _flush_serializer;
// We will accept a new flush before another one ends, once it is done with the data write.
// That is so we can keep the disk always busy. But there is still some background work that is
// left to be done. Mostly, update the caches and seal the auxiliary components of the SSTable.
// This semaphore will cap the amount of background work that we have. Note that we're not
// overly concerned about memtable memory, because dirty memory will put a limit to that. This
// is mostly about dangling continuations. So that doesn't have to be a small number.
static constexpr unsigned _max_background_work = 20;
semaphore _background_work_flush_serializer = { _max_background_work };
condition_variable _should_flush;
int64_t _dirty_bytes_released_pre_accounted = 0;
seastar::gate _waiting_flush_gate;
std::vector<shared_memtable> _pending_flushes;
void maybe_do_active_flush();
protected:
virtual memtable_list& get_memtable_list(column_family& cf) = 0;
virtual void start_reclaiming() override;
future<> flush_when_needed();
struct flush_permit {
semaphore_units<> permit;
flush_permit(semaphore_units<>&& permit) : permit(std::move(permit)) {}
};
// We need to start a flush before the current one finishes, otherwise
// we'll have a period without significant disk activity when the current
// SSTable is being sealed, the caches are being updated, etc. To do that
// we need to keep track of which region we are flushing this memory from.
std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;
future<> _waiting_flush;
virtual void start_reclaiming() noexcept override;
bool has_pressure() const {
return over_soft_limit();
}
std::vector<scollectd::registration> _collectd;
public:
void setup_collectd(sstring namestr);
future<> shutdown();
dirty_memory_manager(database* db, size_t threshold, size_t concurrency)
: logalloc::region_group_reclaimer(threshold)
, _db(db)
, _region_group(*this)
, _concurrency(concurrency)
, _flush_serializer(concurrency) {}
// Limits and pressure conditions:
// ===============================
//
// Virtual Dirty
// -------------
// We can't free memory until the whole memtable is flushed because we need to keep it in memory
// until the end, but we can fake freeing memory. When we are done with an element of the
// memtable, we will update the region group pretending memory just went down by that amount.
//
// Because the amount of memory that we pretend to free should be close enough to the actual
// memory used by the memtables, that effectively creates two sub-regions inside the dirty
// region group, of equal size. In the worst case, we will have <memtable_total_space> dirty
// bytes used, and half of that already virtually freed.
//
// Hard Limit
// ----------
// The total space that can be used by memtables in each group is defined by the threshold, but
// we will only allow the region_group to grow to half of that. This is because of virtual_dirty
// as explained above. Because virtual dirty is implemented by reducing the usage in the
// region_group directly as each partition is written, we want to throttle once half of the
// memory, as seen by the region_group, is in use. To achieve that we need to set the hard
// limit (first parameter of the region_group_reclaimer) to 1/2 of the user-supplied threshold.
//
// Soft Limit
// ----------
// When the soft limit is hit, no throttle happens. The soft limit exists because we don't want
// to start flushing only when the limit is hit, but a bit earlier instead. If we were to start
// flushing only when the hard limit is hit, workloads in which the disk is fast enough to cope
// would see latency added to some requests unnecessarily.
//
// We then set the soft limit to 80% of the virtual dirty hard limit, which is equal to 40% of
// the user-supplied threshold.
dirty_memory_manager(database& db, size_t threshold)
: logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
, _db(&db)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(flush_when_needed()) {}
dirty_memory_manager() : logalloc::region_group_reclaimer()
, _db(nullptr)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(make_ready_future<>()) {}
static dirty_memory_manager& from_region_group(logalloc::region_group *rg) {
return *(boost::intrusive::get_parent_from_member(rg, &dirty_memory_manager::_region_group));
}
dirty_memory_manager(database* db, dirty_memory_manager *parent, size_t threshold, size_t concurrency)
: logalloc::region_group_reclaimer(threshold)
, _db(db)
, _region_group(&parent->_region_group, *this)
, _concurrency(concurrency)
, _flush_serializer(concurrency) {}
logalloc::region_group& region_group() {
return _region_group;
}
@@ -164,33 +213,52 @@ public:
return _region_group;
}
template <typename Func>
future<> serialize_flush(Func&& func) {
return seastar::with_gate(_waiting_flush_gate, [this, func] () mutable {
return with_semaphore(_flush_serializer, 1, func).finally([this] {
maybe_do_active_flush();
});
});
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update(delta);
_dirty_bytes_released_pre_accounted -= delta;
}
void account_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update(-delta);
_dirty_bytes_released_pre_accounted += delta;
}
// This can be called multiple times during the lifetime of the region, and should always
// ultimately be called after the flush ends. However, some flushers may decide to call it
// earlier. For instance, the normal memtables sealing function will call this before updating
// the cache.
//
// Also, for sealing methods that may retry after a failed write (like the normal memtable
// sealing method), calling this method after the attempt completes, whether with success or failure, is
// mandatory. That's because the new attempt will create a new flush reader for the same
// SSTable, so we need to make sure that we revert the old charges.
void remove_from_flush_manager(const logalloc::region *region) {
auto it = _flush_manager.find(region);
if (it != _flush_manager.end()) {
_flush_manager.erase(it);
}
}
void add_to_flush_manager(const logalloc::region *region, flush_permit&& permit) {
_flush_manager.emplace(region, std::move(permit));
}
size_t real_dirty_memory() const {
return _region_group.memory_used() + _dirty_bytes_released_pre_accounted;
}
size_t virtual_dirty_memory() const {
return _region_group.memory_used();
}
future<> flush_one(memtable_list& cf, semaphore_units<> permit);
future<semaphore_units<>> get_flush_permit() {
return get_units(_flush_serializer, 1);
}
};
class streaming_dirty_memory_manager: public dirty_memory_manager {
virtual memtable_list& get_memtable_list(column_family& cf) override;
public:
streaming_dirty_memory_manager(database& db, dirty_memory_manager *parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold, 2) {}
};
class memtable_dirty_memory_manager: public dirty_memory_manager {
virtual memtable_list& get_memtable_list(column_family& cf) override;
public:
memtable_dirty_memory_manager(database& db, dirty_memory_manager* parent, size_t threshold) : dirty_memory_manager(&db, parent, threshold, 4) {}
// This constructor will be called for the system tables (no parent). Its flushes are usually driven by us
// and not the user, and tend to be small in size. So we'll allow only two slots.
memtable_dirty_memory_manager(database& db, size_t threshold) : dirty_memory_manager(&db, threshold, 2) {}
memtable_dirty_memory_manager() : dirty_memory_manager(nullptr, std::numeric_limits<size_t>::max(), 4) {}
};
extern thread_local memtable_dirty_memory_manager default_dirty_memory_manager;
extern thread_local dirty_memory_manager default_dirty_memory_manager;
// We could just add all memtables, regardless of types, to a single list, and
// then filter them out when we read them. Here's why I have chosen not to do
@@ -217,18 +285,29 @@ private:
std::vector<shared_memtable> _memtables;
std::function<future<> (flush_behavior)> _seal_fn;
std::function<schema_ptr()> _current_schema;
size_t _max_memtable_size;
dirty_memory_manager* _dirty_memory_manager;
std::experimental::optional<shared_promise<>> _flush_coalescing;
public:
memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, dirty_memory_manager* dirty_memory_manager)
memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn(seal_fn)
, _current_schema(cs)
, _max_memtable_size(max_memtable_size)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn()
, _current_schema(cs)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
bool may_flush() const {
return bool(_seal_fn);
}
shared_memtable back() {
return _memtables.back();
}
@@ -273,21 +352,16 @@ public:
_memtables.emplace_back(new_memtable());
}
bool should_flush() {
return active_memtable().occupancy().total_space() >= _max_memtable_size;
}
void seal_on_overflow() {
if (should_flush()) {
// FIXME: if sparse, do some in-memory compaction first
// FIXME: maybe merge with other in-memory memtables
seal_active_memtable(flush_behavior::immediate);
}
logalloc::region_group& region_group() {
return _dirty_memory_manager->region_group();
}
// This is used for explicit flushes. Will queue the memtable for flushing and proceed when the
// dirty_memory_manager allows us to. We will not seal at this time since the flush itself
// wouldn't happen anyway. Keeping the memtable in memory will potentially increase the time it
// spends in memory allowing for more coalescing opportunities.
future<> request_flush();
private:
lw_shared_ptr<memtable> new_memtable() {
return make_lw_shared<memtable>(_current_schema(), &(_dirty_memory_manager->region_group()));
}
lw_shared_ptr<memtable> new_memtable();
};
using sstable_list = sstables::sstable_list;
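The `request_flush()` path above coalesces concurrent flush requests through `_flush_coalescing` (a shared promise): the first caller registers a pending flush, later callers attach to the same one, and the memtable stays in memory longer to gain coalescing opportunities. A minimal single-threaded sketch of that coalescing idea, with a hypothetical `do_flush` callback standing in for the real flush machinery:

```cpp
#include <cassert>
#include <functional>
#include <memory>

// Coalesces overlapping flush requests: while a flush is pending, further
// requests attach to the same pending operation instead of starting a new one.
class flush_coalescer {
    std::function<void()> _do_flush;   // the actual flush work (hypothetical)
    std::shared_ptr<int> _pending;     // stands in for the shared promise
    int _flushes_started = 0;
public:
    explicit flush_coalescer(std::function<void()> f) : _do_flush(std::move(f)) {}

    // Request a flush. Every caller coalesced into the same flush gets
    // the same token back.
    std::shared_ptr<int> request_flush() {
        if (!_pending) {
            _pending = std::make_shared<int>(++_flushes_started);
        }
        return _pending;
    }

    // Called when the manager decides the pending flush may proceed.
    void run_pending() {
        if (_pending) {
            _do_flush();
            _pending.reset();          // the next request starts a new flush
        }
    }

    int flushes_started() const { return _flushes_started; }
};
```

Usage: three `request_flush()` calls before `run_pending()` result in a single flush; a request made afterwards starts a second one.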
@@ -320,11 +394,10 @@ public:
bool enable_cache = true;
bool enable_commitlog = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
uint64_t max_cached_partition_size_in_bytes;
};
@@ -379,9 +452,6 @@ private:
lw_shared_ptr<memtable_list> _streaming_memtables;
utils::phased_barrier _streaming_flush_phaser;
friend class memtable_dirty_memory_manager;
friend class streaming_dirty_memory_manager;
// If mutations are fragmented during streaming the sstables cannot be made
// visible immediately after memtable flush, because that could cause
// readers to see only a part of a partition thus violating isolation
@@ -444,7 +514,10 @@ private:
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable);
void add_sstable(sstables::sstable&& sstable);
void add_sstable(lw_shared_ptr<sstables::sstable> sstable);
future<> load_sstable(sstables::sstable&& sstab, bool reset_level = false);
// returns an empty pointer if sstable doesn't belong to current shard.
future<lw_shared_ptr<sstables::sstable>> open_sstable(sstring dir, int64_t generation,
sstables::sstable::version_types v, sstables::sstable::format_types f);
void load_sstable(lw_shared_ptr<sstables::sstable>& sstable, bool reset_level = false);
lw_shared_ptr<memtable> new_memtable();
lw_shared_ptr<memtable> new_streaming_memtable();
future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
@@ -471,6 +544,8 @@ private:
const std::vector<sstables::shared_sstable>& sstables_to_remove);
void rebuild_statistics();
private:
using virtual_reader_type = std::function<mutation_reader(schema_ptr, const query::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr)>;
virtual_reader_type _virtual_reader;
// Creates a mutation reader which covers sstables.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// The 'range' parameter must be live as long as the reader is used.
@@ -482,7 +557,6 @@ private:
tracing::trace_state_ptr trace_state) const;
mutation_source sstables_as_mutation_source();
key_source sstables_as_key_source() const;
partition_presence_checker make_partition_presence_checker(sstables::shared_sstable exclude_sstable);
std::chrono::steady_clock::time_point _sstable_writes_disabled_at;
void do_trigger_compaction();
@@ -528,8 +602,16 @@ public:
mutation_reader make_streaming_reader(schema_ptr schema,
const query::partition_range& range = query::full_partition_range) const;
// Requires ranges to be sorted and disjoint.
mutation_reader make_streaming_reader(schema_ptr schema,
const std::vector<query::partition_range>& ranges) const;
mutation_source as_mutation_source(tracing::trace_state_ptr trace_state) const;
void set_virtual_reader(virtual_reader_type virtual_reader) {
_virtual_reader = std::move(virtual_reader);
}
// Queries can be satisfied from multiple data sources, so they are returned
// as temporaries.
//
@@ -571,7 +653,9 @@ public:
future<lw_shared_ptr<query::result>> query(schema_ptr,
const query::read_command& cmd, query::result_request request,
const std::vector<query::partition_range>& ranges,
tracing::trace_state_ptr trace_state);
tracing::trace_state_ptr trace_state,
query::result_memory_limiter& memory_limiter,
uint64_t max_result_size);
future<> populate(sstring datadir);
@@ -669,8 +753,10 @@ public:
_config.enable_incremental_backups = val;
}
const sstables::sstable_set& get_sstable_set() const;
lw_shared_ptr<sstable_list> get_sstables() const;
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted() const;
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
std::vector<sstables::shared_sstable> select_sstables(const query::partition_range& range) const;
size_t sstables_count() const;
std::vector<uint64_t> sstable_count_per_level() const;
@@ -743,7 +829,7 @@ private:
// repair can now choose whatever strategy - small or big ranges - it wants, resting assured
// that the incoming memtables will be coalesced together.
shared_promise<> _waiting_streaming_flushes;
timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable_immediate(); }};
timer<> _delayed_streaming_flush{[this] { _streaming_memtables->request_flush(); }};
future<> seal_active_streaming_memtable_delayed();
future<> seal_active_streaming_memtable_immediate();
future<> seal_active_streaming_memtable(memtable_list::flush_behavior behavior) {
@@ -874,11 +960,10 @@ public:
bool enable_disk_writes = true;
bool enable_cache = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
};
private:
@@ -957,23 +1042,32 @@ public:
// use shard_of() for data
class database {
public:
using timeout_clock = std::chrono::steady_clock;
private:
::cf_stats _cf_stats;
static constexpr size_t max_concurrent_reads() { return 100; }
static constexpr size_t max_system_concurrent_reads() { return 10; }
struct db_stats {
uint64_t total_writes = 0;
uint64_t total_writes_failed = 0;
uint64_t total_writes_timedout = 0;
uint64_t total_reads = 0;
uint64_t total_reads_failed = 0;
uint64_t sstable_read_queue_overloaded = 0;
uint64_t short_data_queries = 0;
uint64_t short_mutation_queries = 0;
};
lw_shared_ptr<db_stats> _stats;
std::unique_ptr<db::config> _cfg;
size_t _memtable_total_space = 500 << 20;
size_t _streaming_memtable_total_space = 500 << 20;
memtable_dirty_memory_manager _system_dirty_memory_manager;
memtable_dirty_memory_manager _dirty_memory_manager;
streaming_dirty_memory_manager _streaming_dirty_memory_manager;
dirty_memory_manager _system_dirty_memory_manager;
dirty_memory_manager _dirty_memory_manager;
dirty_memory_manager _streaming_dirty_memory_manager;
semaphore _read_concurrency_sem{max_concurrent_reads()};
restricted_mutation_reader_config _read_concurrency_config;
semaphore _system_read_concurrency_sem{max_system_concurrent_reads()};
@@ -990,7 +1084,7 @@ class database {
bool _enable_incremental_backups = false;
future<> init_commitlog();
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position);
future<> apply_in_memory(const frozen_mutation& m, schema_ptr m_schema, db::replay_position, timeout_clock::time_point timeout);
future<> populate(sstring datadir);
future<> populate_keyspace(sstring datadir, sstring ks_name);
@@ -1002,10 +1096,16 @@ private:
friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
void setup_collectd();
future<> do_apply(schema_ptr, const frozen_mutation&);
future<> do_apply(schema_ptr, const frozen_mutation&, timeout_clock::time_point timeout);
query::result_memory_limiter _result_memory_limiter;
public:
static utils::UUID empty_version;
query::result_memory_limiter& get_result_memory_limiter() {
return _result_memory_limiter;
}
void set_enable_incremental_backups(bool val) { _enable_incremental_backups = val; }
future<> parse_system_tables(distributed<service::storage_proxy>&);
@@ -1067,9 +1167,13 @@ public:
unsigned shard_of(const dht::token& t);
unsigned shard_of(const mutation& m);
unsigned shard_of(const frozen_mutation& m);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges, tracing::trace_state_ptr trace_state);
future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range, tracing::trace_state_ptr trace_state);
future<> apply(schema_ptr, const frozen_mutation&);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges,
tracing::trace_state_ptr trace_state, uint64_t max_result_size);
future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range,
query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state);
// Apply the mutation atomically.
// Throws timed_out_error when timeout is reached.
future<> apply(schema_ptr, const frozen_mutation&, timeout_clock::time_point timeout = timeout_clock::time_point::max());
future<> apply_streaming_mutation(schema_ptr, utils::UUID plan_id, const frozen_mutation&, bool fragmented);
keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
const sstring& get_snitch_name() const;


@@ -23,6 +23,7 @@
// database.hh
class database;
class memtable_list;
// mutation.hh
class mutation;


@@ -66,6 +66,7 @@
#include "serialization_visitors.hh"
#include "idl/uuid.dist.impl.hh"
#include "idl/frozen_schema.dist.impl.hh"
#include "message/messaging_service.hh"
static logging::logger logger("batchlog_manager");


@@ -46,6 +46,7 @@
#include <boost/range/adaptor/map.hpp>
#include <unordered_map>
#include <unordered_set>
#include <exception>
#include <core/align.hh>
#include <core/reactor.hh>
@@ -56,6 +57,9 @@
#include <core/gate.hh>
#include <core/fstream.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/chunked_fifo.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/sleep.hh>
#include <net/byteorder.hh>
#include "commitlog.hh"
@@ -76,8 +80,10 @@
static logging::logger logger("commitlog");
using namespace std::chrono_literals;
class crc32_nbo {
crc32 _c;
utils::crc32 _c;
public:
template <typename T>
void process(T t) {
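The `crc32_nbo` wrapper above checksums multi-byte values in network byte order, so the on-disk commitlog checksum does not depend on the host's endianness. A minimal sketch of the idea using a bitwise CRC-32 (IEEE reflected polynomial; an illustration, not the actual utils::crc32 implementation):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Bitwise CRC-32 (IEEE, reflected polynomial 0xEDB88320).
struct crc32 {
    uint32_t state = 0xFFFFFFFFu;
    void process_bytes(const uint8_t* p, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            state ^= p[i];
            for (int k = 0; k < 8; ++k) {
                state = (state >> 1) ^ (0xEDB88320u & (0u - (state & 1u)));
            }
        }
    }
    uint32_t checksum() const { return ~state; }
};

// Feeds integral values most-significant byte first (network byte order),
// so the resulting checksum is identical on little- and big-endian hosts.
struct crc32_nbo {
    crc32 _c;
    template <typename T>
    void process(T t) {
        uint8_t buf[sizeof(T)];
        for (size_t i = 0; i < sizeof(T); ++i) {
            buf[i] = uint8_t(uint64_t(t) >> (8 * (sizeof(T) - 1 - i)));
        }
        _c.process_bytes(buf, sizeof(T));
    }
    uint32_t checksum() const { return _c.checksum(); }
};
```

Feeding `uint32_t(0x31323334)` through `crc32_nbo` yields the same checksum as feeding the bytes `0x31 0x32 0x33 0x34` directly, regardless of host byte order.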
@@ -155,6 +161,7 @@ const std::string db::commitlog::descriptor::FILENAME_EXTENSION(".log");
class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_manager> {
public:
config cfg;
std::vector<sstring> _segments_to_replay;
const uint64_t max_size;
const uint64_t max_mutation_size;
// Divide the size-on-disk threshold by #cpus used, since we assume
@@ -162,10 +169,12 @@ public:
const uint64_t max_disk_size; // per-shard
bool _shutdown = false;
std::experimental::optional<shared_promise<>> _shutdown_promise = {};
semaphore _new_segment_semaphore {1};
semaphore _write_semaphore;
semaphore _flush_semaphore;
// Allocation must throw timed_out_error by contract.
using timeout_exception_factory = default_timeout_exception_factory;
basic_semaphore<timeout_exception_factory> _flush_semaphore;
scollectd::registrations _regs;
@@ -174,6 +183,23 @@ public:
using time_point = clock_type::time_point;
using sseg_ptr = lw_shared_ptr<segment>;
using request_controller_type = basic_semaphore<timeout_exception_factory>;
using request_controller_units = semaphore_units<timeout_exception_factory>;
request_controller_type _request_controller;
stdx::optional<shared_future<with_clock<commitlog::timeout_clock>>> _segment_allocating;
void account_memory_usage(size_t size) {
_request_controller.consume(size);
}
void notify_memory_written(size_t size) {
_request_controller.signal(size);
}
future<db::replay_position>
allocate_when_possible(const cf_id_type& id, shared_ptr<entry_writer> writer, commitlog::timeout_clock::time_point timeout);
struct stats {
uint64_t cycle_count = 0;
uint64_t flush_count = 0;
@@ -182,29 +208,18 @@ public:
uint64_t bytes_slack = 0;
uint64_t segments_created = 0;
uint64_t segments_destroyed = 0;
uint64_t pending_writes = 0;
uint64_t pending_flushes = 0;
uint64_t pending_allocations = 0;
uint64_t write_limit_exceeded = 0;
uint64_t flush_limit_exceeded = 0;
uint64_t total_size = 0;
uint64_t buffer_list_bytes = 0;
uint64_t total_size_on_disk = 0;
uint64_t requests_blocked_memory = 0;
};
stats totals;
future<> begin_write() {
++totals.pending_writes; // redundant, given semaphore. but easier to read
if (totals.pending_writes >= cfg.max_active_writes) {
++totals.write_limit_exceeded;
logger.trace("Write ops overflow: {}. Will block.", totals.pending_writes);
}
return _write_semaphore.wait();
}
void end_write() {
_write_semaphore.signal();
--totals.pending_writes;
size_t pending_allocations() const {
return _request_controller.waiters();
}
future<> begin_flush() {
@@ -219,46 +234,7 @@ public:
_flush_semaphore.signal();
--totals.pending_flushes;
}
bool should_wait_for_write() const {
return cfg.mode == sync_mode::BATCH || _write_semaphore.waiters() > 0 || _flush_semaphore.waiters() > 0;
}
segment_manager(config c)
: cfg([&c] {
config cfg(c);
if (cfg.commit_log_location.empty()) {
cfg.commit_log_location = "/var/lib/scylla/commitlog";
}
if (cfg.max_active_writes == 0) {
cfg.max_active_writes = // TODO: call someone to get an idea...
25 * smp::count;
}
cfg.max_active_writes = std::max(uint64_t(1), cfg.max_active_writes / smp::count);
if (cfg.max_active_flushes == 0) {
cfg.max_active_flushes = // TODO: call someone to get an idea...
5 * smp::count;
}
cfg.max_active_flushes = std::max(uint64_t(1), cfg.max_active_flushes / smp::count);
return cfg;
}())
, max_size(std::min<size_t>(std::numeric_limits<position_type>::max(), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1) * 1024 * 1024))
, max_mutation_size(max_size >> 1)
, max_disk_size(size_t(std::ceil(cfg.commitlog_total_space_in_mb / double(smp::count))) * 1024 * 1024)
, _write_semaphore(cfg.max_active_writes)
, _flush_semaphore(cfg.max_active_flushes)
{
assert(max_size > 0);
logger.trace("Commitlog {} maximum disk size: {} MB / cpu ({} cpus)",
cfg.commit_log_location, max_disk_size / (1024 * 1024),
smp::count);
_regs = create_counters();
}
segment_manager(config c);
~segment_manager() {
logger.trace("Commitlog {} disposed", cfg.commit_log_location);
}
@@ -267,9 +243,19 @@ public:
return ++_ids;
}
std::exception_ptr sanity_check_size(size_t size) {
if (size > max_mutation_size) {
return make_exception_ptr(std::invalid_argument(
"Mutation of " + std::to_string(size)
+ " bytes is too large for the maximum size of "
+ std::to_string(max_mutation_size)));
}
return nullptr;
}
future<> init();
future<sseg_ptr> new_segment();
future<sseg_ptr> active_segment();
future<sseg_ptr> active_segment(commitlog::timeout_clock::time_point timeout);
future<sseg_ptr> allocate_segment(bool active);
future<> clear();
@@ -278,7 +264,7 @@ public:
scollectd::registrations create_counters();
void orphan_all();
future<> orphan_all();
void discard_unused_segments();
void discard_completed_segments(const cf_id_type& id,
@@ -314,21 +300,19 @@ public:
void flush_segments(bool = false);
private:
future<> clear_reserve_segments();
size_t max_request_controller_units() const;
segment_id_type _ids = 0;
std::vector<sseg_ptr> _segments;
std::deque<sseg_ptr> _reserve_segments;
queue<sseg_ptr> _reserve_segments;
std::vector<buffer_type> _temp_buffers;
std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
flush_handler_id _flush_ids = 0;
replay_position _flush_position;
timer<clock_type> _timer;
size_t _reserve_allocating = 0;
// # segments to try to keep available in reserve
// i.e. the number of segments we expect to consume in between timer
// callbacks.
// The idea is that since the files are 0 len at start, and thus cost little,
// it is easier to adapt this value compared to timer freq.
size_t _num_reserve_segments = 0;
future<> replenish_reserve();
future<> _reserve_replenisher;
seastar::gate _gate;
uint64_t _new_counter = 0;
};
@@ -388,8 +372,6 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
uint64_t _buf_pos = 0;
bool _closed = false;
size_t _needed_size = 0;
using buffer_type = segment_manager::buffer_type;
using sseg_ptr = segment_manager::sseg_ptr;
using clock_type = segment_manager::clock_type;
@@ -420,17 +402,6 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
_segment_manager->end_flush();
}
future<> begin_write() {
// This maintains the semantics of only using the write-lock
// as a gate for flushing, i.e. once we've begun a flush for position X
// we are ok with writes to positions > X
return _segment_manager->begin_write();
}
void end_write() {
_segment_manager->end_write();
}
public:
struct cf_mark {
const segment& s;
@@ -500,11 +471,10 @@ public:
/**
* Finalize this segment and get a new one
*/
future<sseg_ptr> finish_and_get_new() {
future<sseg_ptr> finish_and_get_new(commitlog::timeout_clock::time_point timeout) {
_closed = true;
return maybe_wait_for_write(sync()).then([](sseg_ptr s) {
return s->_segment_manager->active_segment();
});
sync();
return _segment_manager->active_segment(timeout);
}
void reset_sync_time() {
_sync_time = clock_type::now();
@@ -589,9 +559,6 @@ public:
void new_buffer(size_t s) {
assert(_buffer.empty());
s += _needed_size;
_needed_size = 0;
auto overhead = segment_overhead_size;
if (_file_pos == 0) {
overhead += descriptor_header_size;
@@ -684,8 +651,6 @@ public:
// The write will be allowed to start now, but flush (below) must wait for not only this,
// but all previous write/flush pairs.
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
// This could "block" if we have too many pending writes.
return begin_write().then([this, size, off, buf = std::move(buf)]() mutable {
auto written = make_lw_shared<size_t>(0);
auto p = buf.get();
return repeat([this, size, off, written, p]() mutable {
@@ -712,63 +677,17 @@ public:
throw;
}
});
}).finally([this, buf = std::move(buf)]() mutable {
}).finally([this, buf = std::move(buf), size]() mutable {
_segment_manager->release_buffer(std::move(buf));
_segment_manager->notify_memory_written(size);
});
}).finally([this]() {
end_write(); // release
});
}, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
assert(me->_pending_ops.has_operation(rp));
return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
});
}
future<sseg_ptr> maybe_wait_for_write(future<sseg_ptr> f) {
if (_segment_manager->should_wait_for_write()) {
++_write_waiters;
logger.trace("Too many pending writes. Must wait.");
return f.finally([this] {
--_write_waiters;
});
}
return make_ready_future<sseg_ptr>(shared_from_this());
}
/**
* If an allocation causes a write, and the write causes a block,
* any allocations after that point need to wait for it to finish;
* otherwise we would just keep building up the write queue
* (and lose more ordering)
*
* Some caution here, since maybe_wait_for_write actually
* releases _all_ queued up ops when finishing, we could get
* "bursts" of alloc->write, causing build-ups anyway.
* This should be measured properly. For now I am hoping this
* will work out as these should "block as a group". However,
* buffer memory usage might grow...
*/
bool must_wait_for_alloc() {
// Note: write_waiters is decremented _after_ both semaphores and
// flush queue might be cleared. So we should not look only at it.
// But we still don't want to look at "should_wait_for_write" directly,
// since that is "global" and includes other segments, and we want to
// know if _this_ segment has blocking write ops pending.
// So we also check that the flush queue is non-empty.
return _write_waiters > 0 && !_pending_ops.empty();
}
future<sseg_ptr> wait_for_alloc() {
auto me = shared_from_this();
++_segment_manager->totals.pending_allocations;
logger.trace("Previous allocation is blocking. Must wait.");
return _pending_ops.wait_for_pending().then_wrapped([me](auto f) { // TODO: do we need a finally?
--me->_segment_manager->totals.pending_allocations;
return f.failed() ? me->_segment_manager->active_segment() : make_ready_future<sseg_ptr>(me);
});
}
future<sseg_ptr> batch_cycle() {
future<sseg_ptr> batch_cycle(timeout_clock::time_point timeout) {
/**
* For batch mode we force a write "immediately".
* However, we first wait for all previous writes/flushes
@@ -779,7 +698,7 @@ public:
*/
auto me = shared_from_this();
auto fp = _file_pos;
return _pending_ops.wait_for_pending().then([me = std::move(me), fp] {
return _pending_ops.wait_for_pending(timeout).then([me = std::move(me), fp, timeout] {
if (fp != me->_file_pos) {
// some other request already wrote this buffer.
// If so, wait for the operation at our intended file offset
@@ -787,12 +706,14 @@ public:
// are in accord.
// (Note: wait_for_pending(pos) waits for operation _at_ pos (and before),
replay_position rp(me->_desc.id, position_type(fp));
return me->_pending_ops.wait_for_pending(rp).then([me, fp] {
return me->_pending_ops.wait_for_pending(rp, timeout).then([me, fp] {
assert(me->_flush_pos > fp);
return make_ready_future<sseg_ptr>(me);
});
}
return me->sync();
// It is ok to leave the sync behind on timeout because there will be at most one
// such sync; all later allocations will block on _pending_ops until it is done.
return with_timeout(timeout, me->sync());
}).handle_exception([me, fp](auto p) {
// If we get an IO exception (which we assume this is)
// we should close the segment.
@@ -802,51 +723,52 @@ public:
return make_exception_future<sseg_ptr>(p);
});
}
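`batch_cycle` above bounds its waits with `with_timeout`, failing the caller with a timeout error if the pending operations do not complete by the deadline while letting the underlying sync run to completion. A minimal sketch of that pattern over `std::future` (an assumed helper for illustration, not seastar's `with_timeout`):

```cpp
#include <cassert>
#include <chrono>
#include <future>
#include <stdexcept>

struct timed_out_error : std::runtime_error {
    timed_out_error() : std::runtime_error("timed out") {}
};

// Waits for f up to the given deadline; throws timed_out_error on expiry.
// The underlying work keeps running - only the waiter gives up.
template <typename T>
T get_with_timeout(std::future<T>& f, std::chrono::steady_clock::time_point deadline) {
    if (f.wait_until(deadline) == std::future_status::timeout) {
        throw timed_out_error();
    }
    return f.get();
}
```

A future that is already resolved returns its value; one that never resolves raises `timed_out_error` once the deadline passes.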
/**
* Add a "mutation" to the segment.
*/
future<replay_position> allocate(const cf_id_type& id, shared_ptr<entry_writer> writer) {
const auto size = writer->size(*this);
const auto s = size + entry_overhead_size; // total size
if (s > _segment_manager->max_mutation_size) {
return make_exception_future<replay_position>(
std::invalid_argument(
"Mutation of " + std::to_string(s)
+ " bytes is too large for the maximum size of "
+ std::to_string(_segment_manager->max_mutation_size)));
future<replay_position> allocate(const cf_id_type& id, shared_ptr<entry_writer> writer, segment_manager::request_controller_units permit, commitlog::timeout_clock::time_point timeout) {
if (must_sync()) {
return with_timeout(timeout, sync()).then([this, id, writer = std::move(writer), permit = std::move(permit), timeout] (auto s) mutable {
return s->allocate(id, std::move(writer), std::move(permit), timeout);
});
}
std::experimental::optional<future<sseg_ptr>> op;
const auto size = writer->size(*this);
const auto s = size + entry_overhead_size; // total size
auto ep = _segment_manager->sanity_check_size(s);
if (ep) {
return make_exception_future<replay_position>(std::move(ep));
}
if (must_sync()) {
op = sync();
} else if (must_wait_for_alloc()) {
op = wait_for_alloc();
} else if (!is_still_allocating() || position() + s > _segment_manager->max_size) { // would we make the file too big?
// do this in next segment instead.
op = finish_and_get_new();
} else if (_buffer.empty()) {
new_buffer(s);
} else if (s > (_buffer.size() - _buf_pos)) { // enough data?
_needed_size += s; // hint to next new_buffer, in case we are not first.
if (!is_still_allocating() || position() + s > _segment_manager->max_size) { // would we make the file too big?
return finish_and_get_new(timeout).then([id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
});
} else if (!_buffer.empty() && (s > (_buffer.size() - _buf_pos))) { // enough data?
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
// TODO: this could cause starvation if we're really unlucky.
// If we run batch mode and find ourselves not fit in a non-empty
// buffer, we must force a cycle and wait for it (to keep flush order)
// This will most likely cause parallel writes, and consecutive flushes.
op = cycle(true);
return with_timeout(timeout, cycle(true)).then([this, id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
});
} else {
op = maybe_wait_for_write(cycle());
cycle();
}
}
if (op) {
return op->then([id, writer = std::move(writer)] (sseg_ptr new_seg) mutable {
return new_seg->allocate(id, std::move(writer));
});
size_t buf_memory = s;
if (_buffer.empty()) {
new_buffer(s);
buf_memory += _buf_pos;
}
_gate.enter(); // this might throw. I guess we accept this?
buf_memory -= permit.release();
_segment_manager->account_memory_usage(buf_memory);
replay_position rp(_desc.id, position());
auto pos = _buf_pos;
@@ -877,12 +799,18 @@ public:
_gate.leave();
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
return batch_cycle().then([rp](auto s) {
return batch_cycle(timeout).then([rp](auto s) {
return make_ready_future<replay_position>(rp);
});
} else {
// If this buffer alone is too big, potentially bigger than the maximum allowed size,
// then no other request will be allowed in to force the cycle()ing of this buffer. We
// have to do it ourselves.
if ((_buf_pos >= (db::commitlog::segment::default_size))) {
cycle();
}
return make_ready_future<replay_position>(rp);
}
return make_ready_future<replay_position>(rp);
}
position_type position() const {
@@ -900,6 +828,7 @@ public:
std::fill(_buffer.get_write() + _buf_pos, _buffer.get_write() + size,
0);
_segment_manager->totals.bytes_slack += (size - _buf_pos);
_segment_manager->account_memory_usage(size - _buf_pos);
return size;
}
void mark_clean(const cf_id_type& id, position_type pos) {
@@ -941,8 +870,98 @@ public:
}
};
future<db::replay_position>
db::commitlog::segment_manager::allocate_when_possible(const cf_id_type& id, shared_ptr<entry_writer> writer, commitlog::timeout_clock::time_point timeout) {
auto size = writer->size();
// If this is already too big, we should throw early. It is also a correctness issue: an
// oversized request would never be admitted, so it would never reach allocate() to throw
// there.
auto ep = sanity_check_size(size);
if (ep) {
return make_exception_future<replay_position>(std::move(ep));
}
auto fut = get_units(_request_controller, size, timeout);
if (_request_controller.waiters()) {
totals.requests_blocked_memory++;
}
return fut.then([this, id, writer = std::move(writer), timeout] (auto permit) mutable {
return this->active_segment(timeout).then([this, timeout, id, writer = std::move(writer), permit = std::move(permit)] (auto s) mutable {
return s->allocate(id, std::move(writer), std::move(permit), timeout);
});
});
}
const size_t db::commitlog::segment::default_size;
db::commitlog::segment_manager::segment_manager(config c)
: cfg([&c] {
config cfg(c);
if (cfg.commit_log_location.empty()) {
cfg.commit_log_location = "/var/lib/scylla/commitlog";
}
if (cfg.max_active_writes == 0) {
cfg.max_active_writes = // TODO: call someone to get an idea...
25 * smp::count;
}
cfg.max_active_writes = std::max(uint64_t(1), cfg.max_active_writes / smp::count);
if (cfg.max_active_flushes == 0) {
cfg.max_active_flushes = // TODO: call someone to get an idea...
5 * smp::count;
}
cfg.max_active_flushes = std::max(uint64_t(1), cfg.max_active_flushes / smp::count);
return cfg;
}())
, max_size(std::min<size_t>(std::numeric_limits<position_type>::max(), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1) * 1024 * 1024))
, max_mutation_size(max_size >> 1)
, max_disk_size(size_t(std::ceil(cfg.commitlog_total_space_in_mb / double(smp::count))) * 1024 * 1024)
, _flush_semaphore(cfg.max_active_flushes)
// That is enough concurrency to allow for our largest mutation (max_mutation_size), plus
// an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
// than default_size at the end of the allocation, that allows for every valid mutation to
// always be admitted for processing.
, _request_controller(max_request_controller_units())
, _reserve_segments(1)
, _reserve_replenisher(make_ready_future<>())
{
assert(max_size > 0);
logger.trace("Commitlog {} maximum disk size: {} MB / cpu ({} cpus)",
cfg.commit_log_location, max_disk_size / (1024 * 1024),
smp::count);
_regs = create_counters();
}
size_t db::commitlog::segment_manager::max_request_controller_units() const {
return max_mutation_size + db::commitlog::segment::default_size;
}
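`max_request_controller_units()` sizes the admission semaphore as the largest legal mutation plus one default-size in-flight buffer; since any buffer that grows past `default_size` is force-cycled at the end of allocation, even the largest valid mutation can always eventually be admitted. A single-threaded sketch of units-based admission (a simplified synchronous model; in the real code a failed acquisition queues the waiter with a timeout rather than returning false):

```cpp
#include <cassert>
#include <cstddef>

// Byte-granularity admission control: writers acquire units up front and
// return them once the corresponding bytes have been written out.
class request_controller {
    size_t _available;
public:
    explicit request_controller(size_t units) : _available(units) {}

    // Try to admit a request of n bytes.
    bool try_get_units(size_t n) {
        if (n > _available) {
            return false;
        }
        _available -= n;
        return true;
    }

    void signal(size_t n) { _available += n; }  // memory written back out
    size_t available() const { return _available; }
};
```

Sized as `max_mutation_size + default_size`, the controller can admit the largest mutation even while one default-size buffer is still in flight; further requests then block until memory is returned.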
future<> db::commitlog::segment_manager::replenish_reserve() {
return do_until([this] { return _shutdown; }, [this] {
return _reserve_segments.not_full().then([this] {
if (_shutdown) {
return make_ready_future<>();
}
return with_gate(_gate, [this] {
return this->allocate_segment(false).then([this](sseg_ptr s) {
auto ret = _reserve_segments.push(std::move(s));
if (!ret) {
logger.error("Segment reserve is full! Ignoring and trying to continue, but this shouldn't happen");
}
return make_ready_future<>();
});
}).handle_exception([](std::exception_ptr ep) {
logger.warn("Exception in segment reservation: {}", ep);
return sleep(100ms);
});
});
});
}
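`replenish_reserve()` runs in the background, topping up a bounded queue of pre-allocated segments so the hot allocation path can usually pop a ready segment instead of creating a file. A single-threaded sketch of the refill loop over a bounded reserve (hypothetical `make` factory; the real code runs this as an asynchronous fiber gated on `not_full()`):

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <functional>

// Bounded reserve of pre-made resources; refill() tops it up to capacity.
template <typename T>
class reserve {
    std::deque<T> _q;
    size_t _max;
    std::function<T()> _make;
public:
    reserve(size_t max, std::function<T()> make)
        : _max(max), _make(std::move(make)) {}

    // The replenisher loop body: pre-allocate until the reserve is full.
    void refill() {
        while (_q.size() < _max) {
            _q.push_back(_make());
        }
    }

    // The hot path: pop a ready resource without paying allocation cost.
    T take() {
        T v = _q.front();
        _q.pop_front();
        return v;
    }

    size_t size() const { return _q.size(); }
    void grow(size_t new_max) { _max = new_max; }  // cf. adaptive reserve sizing
};
```

`grow()` mirrors the adaptive sizing in `active_segment()`, where the reserve capacity is bumped whenever the reserve is found empty, up to `cfg.max_reserve_segments`.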
future<std::vector<db::commitlog::descriptor>>
db::commitlog::segment_manager::list_descriptors(sstring dirname) {
struct helper {
@@ -992,7 +1011,7 @@ db::commitlog::segment_manager::list_descriptors(sstring dirname) {
}
};
return open_checked_directory(commit_error, dirname).then([this, dirname](file dir) {
return open_checked_directory(commit_error_handler, dirname).then([this, dirname](file dir) {
auto h = make_lw_shared<helper>(std::move(dirname), std::move(dir));
return h->done().then([h]() {
return make_ready_future<std::vector<db::commitlog::descriptor>>(std::move(h->_result));
@@ -1002,9 +1021,11 @@ db::commitlog::segment_manager::list_descriptors(sstring dirname) {
future<> db::commitlog::segment_manager::init() {
return list_descriptors(cfg.commit_log_location).then([this](std::vector<descriptor> descs) {
assert(_reserve_segments.empty()); // _segments_to_replay must not pick them up
segment_id_type id = std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count() + 1;
for (auto& d : descs) {
id = std::max(id, replay_position(d.id).base_id());
_segments_to_replay.push_back(cfg.commit_log_location + "/" + d.filename());
}
// base id counter is [ <shard> | <base> ]
@@ -1013,6 +1034,9 @@ future<> db::commitlog::segment_manager::init() {
_timer.set_callback(std::bind(&segment_manager::on_timer, this));
auto delay = engine().cpu_id() * std::ceil(double(cfg.commitlog_sync_period_in_ms) / smp::count);
logger.trace("Delaying timer loop {} ms", delay);
// We need to wait until we have scanned all other segments before we actually start
// serving new segments. We are ready now.
this->_reserve_replenisher = replenish_reserve();
this->arm(delay);
});
}
@@ -1070,19 +1094,21 @@ scollectd::registrations db::commitlog::segment_manager::create_counters() {
, make_typed(data_type::DERIVE, totals.bytes_slack)
),
add_polled_metric(type_instance_id("commitlog"
, per_cpu_plugin_instance, "queue_length", "pending_writes")
, make_typed(data_type::GAUGE, totals.pending_writes)
),
add_polled_metric(type_instance_id("commitlog"
, per_cpu_plugin_instance, "queue_length", "pending_flushes")
, make_typed(data_type::GAUGE, totals.pending_flushes)
),
add_polled_metric(type_instance_id("commitlog"
, per_cpu_plugin_instance, "total_operations", "write_limit_exceeded")
, make_typed(data_type::DERIVE, totals.write_limit_exceeded)
, per_cpu_plugin_instance, "queue_length", "pending_allocations")
, make_typed(data_type::GAUGE, [this] { return pending_allocations(); })
),
add_polled_metric(type_instance_id("commitlog"
, per_cpu_plugin_instance, "total_operations", "requests_blocked_memory")
, make_typed(data_type::DERIVE, totals.requests_blocked_memory)
),
add_polled_metric(type_instance_id("commitlog"
, per_cpu_plugin_instance, "total_operations", "flush_limit_exceeded")
, make_typed(data_type::DERIVE, totals.flush_limit_exceeded)
@@ -1142,7 +1168,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
descriptor d(next_id());
file_open_options opt;
opt.extent_allocation_size_hint = max_size;
return open_checked_file_dma(commit_error, cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
return open_checked_file_dma(commit_error_handler, cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f] () mutable {
auto s = make_lw_shared<segment>(this->shared_from_this(), d, std::move(f), active);
@@ -1158,36 +1184,42 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
++_new_counter;
if (_reserve_segments.empty()) {
if (_num_reserve_segments < cfg.max_reserve_segments) {
++_num_reserve_segments;
logger.trace("Increased segment reserve count to {}", _num_reserve_segments);
}
return allocate_segment(true).then([this](sseg_ptr s) {
_segments.push_back(s);
return make_ready_future<sseg_ptr>(s);
});
if (_reserve_segments.empty() && (_reserve_segments.max_size() < cfg.max_reserve_segments)) {
_reserve_segments.set_max_size(_reserve_segments.max_size() + 1);
logger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
}
_segments.push_back(_reserve_segments.front());
_reserve_segments.pop_front();
_segments.back()->reset_sync_time();
logger.trace("Acquired segment {} from reserve", _segments.back());
return make_ready_future<sseg_ptr>(_segments.back());
return _reserve_segments.pop_eventually().then([this] (auto s) {
_segments.push_back(std::move(s));
_segments.back()->reset_sync_time();
return make_ready_future<sseg_ptr>(_segments.back());
});
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment() {
if (_segments.empty() || !_segments.back()->is_still_allocating()) {
return _new_segment_semaphore.wait().then([this]() {
if (_segments.empty() || !_segments.back()->is_still_allocating()) {
return new_segment();
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment(commitlog::timeout_clock::time_point timeout) {
// If there is no active segment, try to allocate one using new_segment(). If we time out,
// make sure later invocations can still pick that segment up once it's ready.
return repeat_until_value([this, timeout] () -> future<stdx::optional<sseg_ptr>> {
if (!_segments.empty() && _segments.back()->is_still_allocating()) {
return make_ready_future<stdx::optional<sseg_ptr>>(_segments.back());
}
return [this, timeout] {
if (!_segment_allocating) {
promise<> p;
_segment_allocating.emplace(p.get_future());
auto f = _segment_allocating->get_future(timeout);
with_gate(_gate, [this] {
return new_segment().discard_result().finally([this]() {
_segment_allocating = stdx::nullopt;
});
}).forward_to(std::move(p));
return f;
} else {
return _segment_allocating->get_future(timeout);
}
return make_ready_future<sseg_ptr>(_segments.back());
}).finally([this]() {
_new_segment_semaphore.signal();
}().then([] () -> stdx::optional<sseg_ptr> {
return stdx::nullopt;
});
}
return make_ready_future<sseg_ptr>(_segments.back());
});
}
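The reworked `active_segment()` above funnels every concurrent caller onto a single in-flight `new_segment()` call via a shared promise, and a caller that times out simply stops waiting while the allocation keeps running for the next caller. A minimal sketch of that pattern in plain C++ (using `std::shared_future` in place of Seastar's shared promise; `shared_allocator` and the `int` result are illustrative stand-ins, not Scylla names):

```cpp
#include <cassert>
#include <chrono>
#include <future>
#include <mutex>
#include <optional>
#include <stdexcept>
#include <thread>

// First caller kicks off the (slow) allocation; later callers attach to the
// same shared_future. A waiter that times out throws, but the allocation
// itself is not cancelled, so a subsequent call can still pick it up.
class shared_allocator {
    std::mutex _m;
    std::optional<std::shared_future<int>> _inflight; // int stands in for sseg_ptr
public:
    int get(std::chrono::milliseconds timeout) {
        std::shared_future<int> f;
        {
            std::lock_guard<std::mutex> g(_m);
            if (!_inflight) {
                _inflight = std::async(std::launch::async, [] {
                    // simulate a slow segment allocation (file open + truncate)
                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
                    return 42; // the newly "allocated" segment
                }).share();
            }
            f = *_inflight;
        }
        if (f.wait_for(timeout) == std::future_status::timeout) {
            throw std::runtime_error("timed_out"); // maps to timed_out_error
        }
        return f.get();
    }
};
```

The real code additionally clears the in-flight slot in a `finally` once the segment is ready; the sketch keeps it set to stay short.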
/**
@@ -1241,6 +1273,15 @@ void db::commitlog::segment_manager::discard_unused_segments() {
}
}
// FIXME: pop() will call unlink -> sleeping in reactor thread.
// Not urgent since mostly called during shutdown, but have to fix.
future<> db::commitlog::segment_manager::clear_reserve_segments() {
while (!_reserve_segments.empty()) {
_reserve_segments.pop();
}
return make_ready_future<>();
}
future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
logger.debug("Issuing sync for all segments");
return parallel_for_each(_segments, [this, shutdown](sseg_ptr s) {
@@ -1251,19 +1292,40 @@ future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
}
future<> db::commitlog::segment_manager::shutdown() {
if (!_shutdown) {
_shutdown = true; // no re-arm, no create new segments.
_timer.cancel(); // no more timer calls
// Now first wait for periodic task to finish, then sync and close all
// segments, flushing out any remaining data.
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
if (!_shutdown_promise) {
_shutdown_promise = shared_promise<>();
// Wait for all pending requests to finish. Need to sync first because segments that are
// alive may be holding semaphore permits.
auto block_new_requests = get_units(_request_controller, max_request_controller_units());
return sync_all_segments(false).then([this, block_new_requests = std::move(block_new_requests)] () mutable {
return std::move(block_new_requests).then([this] (auto permits) {
_timer.cancel(); // no more timer calls
_shutdown = true; // no re-arm, no create new segments.
// Now first wait for periodic task to finish, then sync and close all
// segments, flushing out any remaining data.
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true));
});
}).finally([this] {
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
// Could be cleaner with proper seastar support
if (f.failed()) {
_shutdown_promise->set_exception(f.get_exception());
} else {
_shutdown_promise->set_value();
}
});
});
});
}
return make_ready_future<>();
return _shutdown_promise->get_shared_future();
}
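The shutdown path above relies on a request controller: each in-flight write holds one unit, and shutdown "blocks new requests" by acquiring *all* units, which can only succeed once every pending write has returned its unit. A minimal single-threaded sketch of that idea (a hand-rolled counter, not Seastar's `semaphore`; names are illustrative):

```cpp
#include <cassert>
#include <condition_variable>
#include <mutex>

// Each write acquires one unit and releases it when done. Shutdown acquires
// max_units() in one go, so it blocks until no write is still in flight,
// and any later acquire(1) blocks because no units remain.
class request_controller {
    std::mutex _m;
    std::condition_variable _cv;
    long _units;
    const long _max;
public:
    explicit request_controller(long max) : _units(max), _max(max) {}
    void acquire(long n) { // waits until n units are free, then takes them
        std::unique_lock<std::mutex> lk(_m);
        _cv.wait(lk, [&] { return _units >= n; });
        _units -= n;
    }
    void release(long n) {
        { std::lock_guard<std::mutex> g(_m); _units += n; }
        _cv.notify_all();
    }
    long max_units() const { return _max; }
};
```

Note the ordering in the diff: sync first, *then* grab all units, because live segments may themselves be holding permits.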
void db::commitlog::segment_manager::orphan_all() {
future<> db::commitlog::segment_manager::orphan_all() {
_segments.clear();
_reserve_segments.clear();
return clear_reserve_segments();
}
/*
@@ -1278,7 +1340,7 @@ future<> db::commitlog::segment_manager::clear() {
for (auto& s : _segments) {
s->mark_clean();
}
orphan_all();
return orphan_all();
});
}
/**
@@ -1309,37 +1371,7 @@ void db::commitlog::segment_manager::on_timer() {
flush_segments();
}
}
// take outstanding allocations into regard. This is paranoid,
// but if for some reason the file::open takes longer than timer period,
// we could flood the reserve list with new segments
//
// #482 - _reserve_allocating is decremented in the finally clause below.
// This is needed because if either allocate_segment _or_ emplacing into
// _reserve_segments should throw, we still need the counter reset
// However, because of this, it might be that emplace was done, but not decrement,
// when we get here again. So occasionally we might get a sum of the two that is
// not consistent. It should however always just potentially be _too much_, i.e.
// just an indicator that we don't need to do anything. So let's do that.
auto n = std::min(_reserve_segments.size() + _reserve_allocating, _num_reserve_segments);
return parallel_for_each(boost::irange(n, _num_reserve_segments), [this, n](auto i) {
++_reserve_allocating;
return this->allocate_segment(false).then([this](sseg_ptr s) {
if (!_shutdown) {
// insertion sort.
auto i = std::upper_bound(_reserve_segments.begin(), _reserve_segments.end(), s, [](sseg_ptr s1, sseg_ptr s2) {
const descriptor& d1 = s1->_desc;
const descriptor& d2 = s2->_desc;
return d1.id < d2.id;
});
i = _reserve_segments.emplace(i, std::move(s));
logger.trace("Added reserve segment {}", *i);
}
}).finally([this] {
--_reserve_allocating;
});
});
}).handle_exception([](std::exception_ptr ep) {
logger.warn("Exception in segment reservation: {}", ep);
return make_ready_future<>();
});
arm();
}
@@ -1410,7 +1442,7 @@ void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
* Add mutation.
*/
future<db::replay_position> db::commitlog::add(const cf_id_type& id,
size_t size, serializer_func func) {
size_t size, commitlog::timeout_clock::time_point timeout, serializer_func func) {
class serializer_func_entry_writer final : public entry_writer {
serializer_func _func;
size_t _size;
@@ -1419,17 +1451,16 @@ future<db::replay_position> db::commitlog::add(const cf_id_type& id,
: _func(std::move(func)), _size(sz)
{ }
virtual size_t size(segment&) override { return _size; }
virtual size_t size() override { return _size; }
virtual void write(segment&, output& out) override {
_func(out);
}
};
auto writer = ::make_shared<serializer_func_entry_writer>(size, std::move(func));
return _segment_manager->active_segment().then([id, writer] (auto s) {
return s->allocate(id, writer);
});
return _segment_manager->allocate_when_possible(id, writer, timeout);
}
future<db::replay_position> db::commitlog::add_entry(const cf_id_type& id, const commitlog_entry_writer& cew)
future<db::replay_position> db::commitlog::add_entry(const cf_id_type& id, const commitlog_entry_writer& cew, timeout_clock::time_point timeout)
{
class cl_entry_writer final : public entry_writer {
commitlog_entry_writer _writer;
@@ -1439,6 +1470,9 @@ future<db::replay_position> db::commitlog::add_entry(const cf_id_type& id, const
_writer.set_with_schema(!seg.is_schema_version_known(_writer.schema()));
return _writer.size();
}
virtual size_t size() override {
return _writer.mutation_size();
}
virtual void write(segment& seg, output& out) override {
if (_writer.with_schema()) {
seg.add_schema_version(_writer.schema());
@@ -1447,9 +1481,7 @@ future<db::replay_position> db::commitlog::add_entry(const cf_id_type& id, const
}
};
auto writer = ::make_shared<cl_entry_writer>(cew);
return _segment_manager->active_segment().then([id, writer] (auto s) {
return s->allocate(id, writer);
});
return _segment_manager->allocate_when_possible(id, writer, timeout);
}
db::commitlog::commitlog(config cfg)
@@ -1546,7 +1578,7 @@ const db::commitlog::config& db::commitlog::active_config() const {
// on error at startup if required
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off) {
return open_checked_file_dma(commit_error, filename, open_flags::ro).then([next = std::move(next), off](file f) {
return open_checked_file_dma(commit_error_handler, filename, open_flags::ro).then([next = std::move(next), off](file f) {
return std::make_unique<subscription<temporary_buffer<char>, replay_position>>(
read_log_file(std::move(f), std::move(next), off));
});
@@ -1557,6 +1589,15 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
subscription<temporary_buffer<char>, db::replay_position>
db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type off) {
struct work {
private:
file_input_stream_options make_file_input_stream_options() {
file_input_stream_options fo;
fo.buffer_size = db::commitlog::segment::default_size;
fo.read_ahead = 10;
fo.io_priority_class = service::get_local_commitlog_priority();
return fo;
}
public:
file f;
stream<temporary_buffer<char>, replay_position> s;
input_stream<char> fin;
@@ -1570,9 +1611,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
size_t corrupt_size = 0;
bool eof = false;
bool header = true;
bool failed = false;
work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f)), start_off(o) {
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
}
work(work&&) = default;
@@ -1603,6 +1645,10 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
eof = true;
return make_ready_future<>();
}
future<> fail() {
failed = true;
return stop();
}
future<> read_header() {
return fin.read_exactly(segment::descriptor_header_size).then([this](temporary_buffer<char> buf) {
if (!advance(buf)) {
@@ -1739,7 +1785,9 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
return make_ready_future<>();
}
return s.produce(buf.share(0, data_size), rp);
return s.produce(buf.share(0, data_size), rp).handle_exception([this](auto ep) {
return this->fail();
});
});
});
}
@@ -1755,6 +1803,8 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
throw segment_data_corruption_error("Data corruption", corrupt_size);
}
});
}).finally([this] {
return fin.close();
});
}
};
@@ -1763,7 +1813,9 @@ db::commitlog::read_log_file(file f, commit_load_reader_func next, position_type
auto ret = w->s.listen(std::move(next));
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
w->s.close();
if (!w->failed) {
w->s.close();
}
}).handle_exception([w](auto ep) {
w->s.set_exception(ep);
});
@@ -1788,12 +1840,7 @@ uint64_t db::commitlog::get_flush_count() const {
}
uint64_t db::commitlog::get_pending_tasks() const {
return _segment_manager->totals.pending_writes
+ _segment_manager->totals.pending_flushes;
}
uint64_t db::commitlog::get_pending_writes() const {
return _segment_manager->totals.pending_writes;
return _segment_manager->totals.pending_flushes;
}
uint64_t db::commitlog::get_pending_flushes() const {
@@ -1801,11 +1848,7 @@ uint64_t db::commitlog::get_pending_flushes() const {
}
uint64_t db::commitlog::get_pending_allocations() const {
return _segment_manager->totals.pending_allocations;
}
uint64_t db::commitlog::get_write_limit_exceeded_count() const {
return _segment_manager->totals.write_limit_exceeded;
return _segment_manager->pending_allocations();
}
uint64_t db::commitlog::get_flush_limit_exceeded_count() const {
@@ -1850,3 +1893,6 @@ future<std::vector<sstring>> db::commitlog::list_existing_segments(const sstring
});
}
std::vector<sstring> db::commitlog::get_segments_to_replay() {
return std::move(_segment_manager->_segments_to_replay);
}


@@ -94,6 +94,8 @@ using cf_id_type = utils::UUID;
*/
class commitlog {
public:
using timeout_clock = std::chrono::steady_clock;
class segment_manager;
class segment;
@@ -171,9 +173,23 @@ public:
/**
* Add a "Mutation" to the commit log.
*
* Resolves with timed_out_error when timeout is reached.
*
* @param mutation_func a function that writes 'size' bytes to the log, representing the mutation.
*/
future<replay_position> add(const cf_id_type& id, size_t size, serializer_func mutation_func);
future<replay_position> add(const cf_id_type& id, size_t size, timeout_clock::time_point timeout, serializer_func mutation_func);
/**
* Template version of add.
* Resolves with timed_out_error when timeout is reached.
* @param mu an invokable op that generates the serialized data. (Of size bytes)
*/
template<typename _MutationOp>
future<replay_position> add_mutation(const cf_id_type& id, size_t size, timeout_clock::time_point timeout, _MutationOp&& mu) {
return add(id, size, timeout, [mu = std::forward<_MutationOp>(mu)](output& out) {
mu(out);
});
}
/**
* Template version of add.
@@ -181,17 +197,15 @@ public:
*/
template<typename _MutationOp>
future<replay_position> add_mutation(const cf_id_type& id, size_t size, _MutationOp&& mu) {
return add(id, size, [mu = std::forward<_MutationOp>(mu)](output& out) {
mu(out);
});
return add_mutation(id, size, timeout_clock::time_point::max(), std::forward<_MutationOp>(mu));
}
/**
* Add an entry to the commit log.
*
* Resolves with timed_out_error when timeout is reached.
* @param entry_writer a writer responsible for writing the entry
*/
future<replay_position> add_entry(const cf_id_type& id, const commitlog_entry_writer& entry_writer);
future<replay_position> add_entry(const cf_id_type& id, const commitlog_entry_writer& entry_writer, timeout_clock::time_point timeout);
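Note that the new overloads take an absolute deadline (a `timeout_clock::time_point`), not a duration, so one write budget is shared across every internal wait (segment allocation, dispatch, etc.). A hedged usage sketch of that convention in plain C++ (`make_deadline`/`expired` are illustrative helpers, not part of this API):

```cpp
#include <cassert>
#include <chrono>

// Same clock as the commitlog API above.
using timeout_clock = std::chrono::steady_clock;

// Compute the deadline once at the top of a write; every retry or internal
// wait then compares against this same time_point instead of restarting a
// fresh duration.
timeout_clock::time_point make_deadline(std::chrono::milliseconds budget) {
    return timeout_clock::now() + budget;
}

bool expired(timeout_clock::time_point deadline) {
    return timeout_clock::now() >= deadline;
}
```

Passing `timeout_clock::time_point::max()` (as the duration-less `add_mutation` overload does) effectively disables the timeout.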
/**
* Modifies the per-CF dirty cursors of any commit log segments for the column family according to the position
@@ -241,14 +255,20 @@ public:
*/
std::vector<sstring> get_active_segment_names() const;
/**
* Returns a vector of segment paths which were
* preexisting when this instance of commitlog was created.
*
* The list will be empty when called for the second time.
*/
std::vector<sstring> get_segments_to_replay();
uint64_t get_total_size() const;
uint64_t get_completed_tasks() const;
uint64_t get_flush_count() const;
uint64_t get_pending_tasks() const;
uint64_t get_pending_writes() const;
uint64_t get_pending_flushes() const;
uint64_t get_pending_allocations() const;
uint64_t get_write_limit_exceeded_count() const;
uint64_t get_flush_limit_exceeded_count() const;
uint64_t get_num_segments_created() const;
uint64_t get_num_segments_destroyed() const;
@@ -321,6 +341,8 @@ private:
struct entry_writer {
virtual size_t size(segment&) = 0;
// Returns the segment-independent size of the entry. Must be <= the segment-dependent size.
virtual size_t size() = 0;
virtual void write(segment&, output&) = 0;
};
};


@@ -33,48 +33,26 @@
#include "idl/mutation.dist.impl.hh"
#include "idl/commitlog.dist.impl.hh"
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping))
, _mutation_storage(std::move(mutation))
, _mutation(*_mutation_storage)
{ }
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation)
: _mapping(std::move(mapping))
, _mutation(mutation)
{ }
commitlog_entry::commitlog_entry(commitlog_entry&& ce)
: _mapping(std::move(ce._mapping))
, _mutation_storage(std::move(ce._mutation_storage))
, _mutation(_mutation_storage ? *_mutation_storage : ce._mutation)
{
}
commitlog_entry& commitlog_entry::operator=(commitlog_entry&& ce)
{
if (this != &ce) {
this->~commitlog_entry();
new (this) commitlog_entry(std::move(ce));
}
return *this;
}
commitlog_entry commitlog_entry_writer::get_entry() const {
if (_with_schema) {
return commitlog_entry(_schema->get_column_mapping(), _mutation);
} else {
return commitlog_entry({}, _mutation);
}
template<typename Output>
void commitlog_entry_writer::serialize(Output& out) const {
[this, wr = ser::writer_of_commitlog_entry<Output>(out)] () mutable {
if (_with_schema) {
return std::move(wr).write_mapping(_schema->get_column_mapping());
} else {
return std::move(wr).skip_mapping();
}
}().write_mutation(_mutation).end_commitlog_entry();
}
void commitlog_entry_writer::compute_size() {
_size = ser::get_sizeof(get_entry());
seastar::measuring_output_stream ms;
serialize(ms);
_size = ms.size();
}
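The rewritten `compute_size()` above illustrates two-pass serialization: the same templated `serialize()` runs first against a measuring stream that only counts bytes (no allocation, no copying), then against a real stream of exactly that size. A minimal sketch of the idea in standard C++ (these `measuring_stream`/`simple_stream` types mimic but are not Seastar's `measuring_output_stream`/`simple_output_stream`):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Pass 1: a sink that discards data and only accumulates the byte count.
struct measuring_stream {
    size_t size = 0;
    void write(const char*, size_t n) { size += n; }
};

// Pass 2: a real sink; in the commitlog this would be the reserved slice
// of the segment buffer.
struct simple_stream {
    std::vector<char> buf;
    void write(const char* p, size_t n) { buf.insert(buf.end(), p, p + n); }
};

// One serializer, templated on the output type, used for both passes --
// a toy framing (length prefix + payload) stands in for the IDL writer.
template <typename Output>
void serialize_entry(Output& out, const std::vector<char>& payload) {
    size_t len = payload.size();
    out.write(reinterpret_cast<const char*>(&len), sizeof(len));
    out.write(payload.data(), payload.size());
}
```

Because both passes share one serializer, the measured size is guaranteed to match the bytes written, which is what lets `write()` reserve exactly `size()` bytes up front.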
void commitlog_entry_writer::write(data_output& out) const {
seastar::simple_output_stream str(out.reserve(size()), size());
ser::serialize(str, get_entry());
serialize(str);
}
commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)


@@ -31,15 +31,10 @@ namespace stdx = std::experimental;
class commitlog_entry {
stdx::optional<column_mapping> _mapping;
stdx::optional<frozen_mutation> _mutation_storage;
const frozen_mutation& _mutation;
frozen_mutation _mutation;
public:
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation);
commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation);
commitlog_entry(commitlog_entry&&);
commitlog_entry(const commitlog_entry&) = delete;
commitlog_entry& operator=(commitlog_entry&&);
commitlog_entry& operator=(const commitlog_entry&) = delete;
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
const frozen_mutation& mutation() const { return _mutation; }
};
@@ -50,8 +45,9 @@ class commitlog_entry_writer {
bool _with_schema = true;
size_t _size;
private:
template<typename Output>
void serialize(Output&) const;
void compute_size();
commitlog_entry get_entry() const;
public:
commitlog_entry_writer(schema_ptr s, const frozen_mutation& fm)
: _schema(std::move(s)), _mutation(fm)
@@ -74,6 +70,10 @@ public:
return _size;
}
size_t mutation_size() const {
return _mutation.representation().size();
}
void write(data_output& out) const;
};
@@ -84,4 +84,4 @@ public:
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
const frozen_mutation& mutation() const { return _ce.mutation(); }
};
};
