mutation_query_test: add test for result size calculation

Check that digest only and digest+data query calculate result size to be the same. Message-Id: <20180906153800.GK2326@scylladb.com> (cherry picked from commit 9e438933a2)
mutation_partition: accurately account for result size in digest only queries
2018-09-08 18:55:23 +03:00 · 2018-09-08 18:55:23 +03:00 · 2018-09-06 16:51:31 +03:00 · 2018-08-26 15:52:18 +03:00 · 2018-08-21 17:37:36 +01:00 · 2018-08-21 18:24:06 +03:00
890 changed files with 89449 additions and 28760 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,12 @@ dist/ami/files/*.rpm
 dist/ami/variables.json
 dist/ami/scylla_deploy.sh
 *.pyc
+Cql.tokens
+.kdev4
+*.kdev4
+CMakeLists.txt.user
+.cache
+.tox
+*.egg-info
+__pycache__
+.gdbinit
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,140 @@
+##
+## For best results, first compile the project using the Ninja build-system.
+##
+
+cmake_minimum_required(VERSION 3.7)
+project(scylla)
+
+if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
+    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
+endif()
+
+# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
+set(SEASTAR_INCLUDE_DIRS "seastar")
+
+# These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
+# Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
+set(SEASTAR_DPDK_INCLUDE_DIRS
+        seastar/dpdk/lib/librte_eal/common/include
+        seastar/dpdk/lib/librte_eal/common/include/generic
+        seastar/dpdk/lib/librte_eal/common/include/x86
+        seastar/dpdk/lib/librte_ether)
+
+find_package(PkgConfig REQUIRED)
+
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
+pkg_check_modules(SEASTAR seastar)
+
+find_package(Boost COMPONENTS filesystem program_options system thread)
+
+##
+## Populate the names of all source and header files in the indicated paths in a designated variable.
+##
+## When RECURSIVE is specified, directories are traversed recursively.
+##
+## Use: scan_scylla_source_directories(VAR my_result_var [RECURSIVE] PATHS [path1 path2 ...])
+##
+function (scan_scylla_source_directories)
+    set(options RECURSIVE)
+    set(oneValueArgs VAR)
+    set(multiValueArgs PATHS)
+    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
+
+    set(globs "")
+
+    foreach (dir ${args_PATHS})
+        list(APPEND globs "${dir}/*.cc" "${dir}/*.hh")
+    endforeach()
+
+    if (args_RECURSIVE)
+        set(glob_kind GLOB_RECURSE)
+    else()
+        set(glob_kind GLOB)
+    endif()
+
+    file(${glob_kind} var
+            ${globs})
+
+    set(${args_VAR} ${var} PARENT_SCOPE)
+endfunction()
+
+## Although Seastar is an external project, it is common enough to explore the sources while doing
+## Scylla development that we'll treat the Seastar sources as part of this project for easier navigation.
+scan_scylla_source_directories(
+        VAR SEASTAR_SOURCE_FILES
+        RECURSIVE
+
+        PATHS
+          seastar/core
+          seastar/http
+          seastar/json
+          seastar/net
+          seastar/rpc
+          seastar/tests
+          seastar/util)
+
+scan_scylla_source_directories(
+        VAR SCYLLA_ROOT_SOURCE_FILES
+        PATHS .)
+
+scan_scylla_source_directories(
+        VAR SCYLLA_SUB_SOURCE_FILES
+        RECURSIVE
+
+        PATHS
+          api
+          auth
+          cql3
+          db
+          dht
+          exceptions
+          gms
+          index
+          io
+          locator
+          message
+          repair
+          service
+          sstables
+          streaming
+          tests
+          thrift
+          tracing
+          transport
+          utils)
+
+scan_scylla_source_directories(
+        VAR SCYLLA_GEN_SOURCE_FILES
+        RECURSIVE
+        PATHS build/release/gen)
+
+set(SCYLLA_SOURCE_FILES
+        ${SCYLLA_ROOT_SOURCE_FILES}
+        ${SCYLLA_GEN_SOURCE_FILES}
+        ${SCYLLA_SUB_SOURCE_FILES})
+
+add_executable(scylla
+        ${SEASTAR_SOURCE_FILES}
+        ${SCYLLA_SOURCE_FILES})
+
+# Note that since CLion does not understand GCC6 concepts, we always disable them (even if users configure otherwise).
+# CLion seems to have trouble with `-U` (macro undefinition), so we do it this way instead.
+list(REMOVE_ITEM SEASTAR_CFLAGS "-DHAVE_GCC6_CONCEPTS")
+
+# If the Seastar pkg-config information is available, append to the default flags.
+#
+# For ease of browsing the source code, we always pretend that DPDK is enabled.
+target_compile_options(scylla PUBLIC
+        -std=gnu++14
+        -DHAVE_DPDK
+        -DHAVE_HWLOC
+        "${SEASTAR_CFLAGS}")
+
+# The order matters here: prefer the "static" DPDK directories to any dynamic paths from pkg-config. Some files are only
+# available dynamically, though.
+target_include_directories(scylla PUBLIC
+        .
+        ${SEASTAR_DPDK_INCLUDE_DIRS}
+        ${SEASTAR_INCLUDE_DIRS}
+        ${Boost_INCLUDE_DIRS}
+        build/release/gen)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,11 @@
+# Asking questions or requesting help
+
+Use the [ScyllaDB user mailing list](https://groups.google.com/forum/#!forum/scylladb-users) for general questions and help.
+
+# Reporting an issue
+
+Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to report issues.  Fill in as much information as you can in the issue template, especially for performance problems.
+
+# Contributing Code to Scylla
+
+To contribute code to Scylla, you need to sign the [Contributor License Agreement](http://www.scylladb.com/opensource/cla/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
--- a/HACKING.md
+++ b/HACKING.md
@@ -0,0 +1,233 @@
+# Guidelines for developing Scylla
+
+This document is intended to help developers and contributors to Scylla get started. The first part consists of general guidelines that make no assumptions about a development environment or tooling. The second part describes a particular environment and work-flow for exemplary purposes.
+
+## Overview
+
+This section covers some high-level information about the Scylla source code and work-flow.
+
+### Getting the source code
+
+Scylla uses [Git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) to manage its dependency on Seastar and other tools. Be sure that all submodules are correctly initialized when cloning the project:
+
+```bash
+$ git clone https://github.com/scylladb/scylla
+$ cd scylla
+$ git submodule update --init --recursive
+```
+
+### Dependencies
+
+Scylla depends on the system package manager for its development dependencies.
+
+Running `./install-dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
+
+### Build system
+
+**Note**: Compiling Scylla requires, conservatively, 2 GB of memory per native thread, and up to 3 GB per native thread while linking.
+
+Scylla is built with [Ninja](https://ninja-build.org/), a low-level rule-based system. A Python script, `configure.py`, generates a Ninja file (`build.ninja`) based on configuration options.
+
+To build for the first time:
+
+```bash
+$ ./configure.py
+$ ninja-build
+```
+
+Afterwards, it is sufficient to just execute Ninja.
+
+The full suite of options for project configuration is available via
+
+```bash
+$ ./configure.py --help
+```
+
+The most important options are:
+
+- `--mode={release,debug,all}`: Debug mode enables [AddressSanitizer](https://github.com/google/sanitizers/wiki/AddressSanitizer) and allows for debugging with tools like GDB. Debugging builds are generally slower and generate much larger object files than release builds.
+
+- `--{enable,disable}-dpdk`: [DPDK](http://dpdk.org/) is a set of libraries and drivers for fast packet processing. During development, it's not necessary to enable support even if it is supported by your platform.
+
+Source files and build targets are tracked manually in `configure.py`, so the script needs to be updated when new files or targets are added or removed.
+
+To save time -- for instance, to avoid compiling all unit tests -- you can also specify specific targets to Ninja. For example,
+
+```bash
+$ ninja-build build/release/tests/schema_change_test
+```
+
+### Unit testing
+
+Unit tests live in the `/tests` directory. Like with application source files, test sources and executables are specified manually in `configure.py` and need to be updated when changes are made.
+
+A test target can be any executable. A non-zero return code indicates test failure.
+
+Most tests in the Scylla repository are built using the [Boost.Test](http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/index.html) library. Utilities for writing tests with Seastar futures are also included.
+
+Run all tests through the test execution wrapper with
+
+```bash
+$ ./test.py --mode={debug,release}
+```
+
+The `--name` argument can be specified to run a particular test.
+
+Alternatively, you can execute the test executable directly. For example,
+
+```bash
+$ build/release/tests/row_cache_test -- -c1 -m1G
+```
+
+The `-c1 -m1G` arguments limit this Seastar-based test to a single system thread and 1 GB of memory.
+
+### Preparing patches
+
+All changes to Scylla are submitted as patches to the public mailing list. Once a patch is approved by one of the maintainers of the project, it is committed to the maintainers' copy of the repository at https://github.com/scylladb/scylla.
+
+Detailed instructions for formatting patches for the mailing list and advice on preparing good patches are available at the [ScyllaDB website](http://docs.scylladb.com/contribute/).
+
+### Running Scylla
+
+Once Scylla has been compiled, executing the (`debug` or `release`) target will start a running instance in the foreground:
+
+```bash
+$ build/release/scylla
+```
+
+The `scylla` executable requires a configuration file, `scylla.yaml`. By default, this is read from `$SCYLLA_HOME/conf/scylla.yaml`. A good starting point for development is located in the repository at `/conf/scylla.yaml`.
+
+For development, a directory at `$HOME/scylla` can be used for all Scylla-related files:
+
+```bash
+$ mkdir -p $HOME/scylla $HOME/scylla/conf
+$ cp conf/scylla.yaml $HOME/scylla/conf/scylla.yaml
+$ # Edit configuration options as appropriate
+$ SCYLLA_HOME=$HOME/scylla build/release/scylla
+```
+
+The `scylla.yaml` file in the repository by default writes all database data to `/var/lib/scylla`, which likely requires root access. Change the `data_file_directories` and `commitlog_directory` fields as appropriate.
+
+Scylla has a number of requirements for the file-system and operating system to operate ideally and at peak performance. However, during development, these requirements can be relaxed with the `--developer-mode` flag.
+
+Additionally, when running on under-powered platforms like portable laptops, the `--overprovisioned` flag is useful.
+
+On a development machine, one might run Scylla as
+
+```bash
+$ SCYLLA_HOME=$HOME/scylla build/release/scylla --overprovisioned --developer-mode=yes
+```
+
+### Branches and tags
+
+Multiple release branches are maintained on the Git repository at https://github.com/scylladb/scylla. Release 1.5, for instance, is tracked on the `branch-1.5` branch.
+
+Similarly, tags are used to pin-point precise release versions, including hot-fix versions like 1.5.4. These are named `scylla-1.5.4`, for example.
+
+Most development happens on the `master` branch. Release branches are cut from `master` based on time and/or features. When a patch against `master` fixes a serious issue like a node crash or data loss, it is backported to a particular release branch with `git cherry-pick` by the project maintainers.
+
+## Example: development on Fedora 25
+
+This section describes one possible work-flow for developing Scylla on a Fedora 25 system. It is presented as an example to help you to develop a work-flow and tools that you are comfortable with.
+
+### Preface
+
+This guide will be written from the perspective of a fictitious developer, Taylor Smith.
+
+### Git work-flow
+
+Having two Git remotes is useful:
+
+- A public clone of Scylla (`"public"`)
+- A private clone of Scylla (`"private"`) for in-progress work or work that is not yet ready to share
+
+The first step to contributing a change to Scylla is to create a local branch dedicated to it. For example, a feature that fixes a bug in the CQL statement for creating tables could be called `ts/cql_create_table_error/v1`. The branch name is prefaced by the developer's initials and has a suffix indicating that this is the first version. The version suffix is useful when branches are shared publicly and changes are requested on the mailing list. Having a branch for each version of the patch (or patch set) shared publicly makes it easier to reference and compare the history of a change.
+
+Setting the upstream branch of your development branch to `master` is a useful way to track your changes. You can do this with
+
+```bash
+$ git branch -u master ts/cql_create_table_error/v1
+```
+
+As a patch set is developed, you can periodically push the branch to the private remote to back-up work.
+
+Once the patch set is ready to be reviewed, push the branch to the public remote and prepare an email to the `scylladb-dev` mailing list. Including a link to the branch on your public remote allows for reviewers to quickly test and explore your changes.
+
+### Development environment and source code navigation
+
+Scylla includes a [CMake](https://cmake.org/) file, `CMakeLists.txt`, for use only with development environments (not for building) so that they can properly analyze the source code.
+
+[CLion](https://www.jetbrains.com/clion/) is a commercial IDE that offers reasonably good source code navigation and advice for code hygiene, though its C++ parser sometimes makes errors and flags false issues.
+
+Other good options that directly parse CMake files are [KDevelop](https://www.kdevelop.org/) and [QtCreator](https://wiki.qt.io/Qt_Creator).
+
+To use the `CMakeLists.txt` file with these programs, define the `FOR_IDE` CMake variable or shell environmental variable.
+
+[Eclipse](https://eclipse.org/cdt/) is another open-source option. It doesn't natively work with CMake projects, and its C++ parser has many of the same issues as CLion's.
+
+### Distributed compilation: `distcc` and `ccache`
+
+Scylla's compilation times can be long. Two tools help somewhat:
+
+- [ccache](https://ccache.samba.org/) caches compiled object files on disk and re-uses them when possible
+- [distcc](https://github.com/distcc/distcc) distributes compilation jobs to remote machines
+
+A reasonably-powered laptop acts as the coordinator for compilation. A second, more powerful, machine acts as a passive compilation server.
+
+Having a direct wired connection between the machines ensures that object files can be transmitted quickly and limits the overhead of remote compilation.
+The coordinator has been assigned the static IP address `10.0.0.1` and the passive compilation machine has been assigned `10.0.0.2`.
+
+On Fedora, installing the `ccache` package places symbolic links for `gcc` and `g++` in the `PATH`. This allows normal compilation to transparently invoke `ccache` for compilation and cache object files on the local file-system.
+
+Next, set `CCACHE_PREFIX` so that `ccache` is responsible for invoking `distcc` as necessary:
+
+```bash
+export CCACHE_PREFIX="distcc"
+```
+
+On each host, edit `/etc/sysconfig/distccd` to include the allowed coordinators and the total number of jobs that the machine should accept.
+This example is for the laptop, which has 2 physical cores (4 logical cores with hyper-threading):
+
+```
+OPTIONS="--allow 10.0.0.2 --allow 127.0.0.1 --jobs 4"
+```
+
+`10.0.0.2` has 8 physical cores (16 logical cores) and 64 GB of memory.
+
+As a rule of thumb, the number of jobs that a machine is configured to accept should equal the number of its native threads.
+
+Restart the `distccd` service on all machines.
+
+On the coordinator machine, edit `$HOME/.distcc/hosts` with the available hosts for compilation. Order of the hosts indicates preference.
+
+```
+10.0.0.2/16 localhost/2
+```
+
+In this example, `10.0.0.2` will be sent up to 16 jobs and the local machine will be sent up to 2. Allowing for two extra threads on the host machine for coordination, we run compilation with `16 + 2 + 2 = 20` jobs in total: `ninja-build -j20`.
+
+When a compilation is in progress, the status of jobs on all remote machines can be visualized in the terminal with `distccmon-text` or graphically as a GTK application with `distccmon-gnome`.
+
+One thing to keep in mind is that linking object files happens on the coordinating machine, which can be a bottleneck. See the next section for a way to speed up this process.
+
+### Using the `gold` linker
+
+Linking Scylla can be slow. The gold linker can replace GNU ld and often speeds the linking process. On Fedora, you can switch the system linker using
+
+```bash
+$ sudo alternatives --config ld
+```
+
+### Testing changes in Seastar with Scylla
+
+Sometimes Scylla development is closely tied with a feature being developed in Seastar. It can be useful to compile Scylla with a particular check-out of Seastar.
+
+One way to do this is to create a local remote for the Seastar submodule in the Scylla repository:
+
+```bash
+$ cd $HOME/src/scylla
+$ cd seastar
+$ git remote add local /home/tsmith/src/seastar
+$ git remote update
+$ git checkout -t local/my_local_seastar_branch
+```
--- a/README.md
+++ b/README.md
@@ -1,29 +1,19 @@
 # Scylla

-## Building Scylla
+## Quick-start

-In addition to required packages by Seastar, the following packages are required by Scylla.
-
-### Submodules
-Scylla uses submodules, so make sure you pull the submodules first by doing:
-```
-git submodule init
-git submodule update --init --recursive
+```bash
+$ git submodule update --init --recursive
+$ sudo ./install-dependencies.sh
+$ ./configure.py --mode=release
+$ ninja-build -j4 # Assuming 4 system threads.
+$ ./build/release/scylla
+$ # Rejoice!
 ```

-### Building and Running Scylla on Fedora
-* Installing required packages:
+Please see [HACKING.md](HACKING.md) for detailed information on building and developing Scylla.

-```
-sudo dnf install yaml-cpp-devel lz4-devel zlib-devel snappy-devel jsoncpp-devel thrift-devel antlr3-tool antlr3-C++-devel libasan libubsan gcc-c++ gnutls-devel ninja-build ragel libaio-devel cryptopp-devel xfsprogs-devel numactl-devel hwloc-devel libpciaccess-devel libxml2-devel python3-pyparsing lksctp-tools-devel protobuf-devel protobuf-compiler systemd-devel libunwind-devel
-```
-
-* Build Scylla
-```
-./configure.py --mode=release --with=scylla --disable-xen
-ninja-build build/release/scylla -j2 # you can use more cpus if you have tons of RAM
-
-```
+## Running Scylla

 * Run Scylla
 ```
@@ -83,14 +73,6 @@ Run the image with:
 docker run -p $(hostname -i):9042:9042 -i -t <image name>
 ```

-
 ## Contributing to Scylla

-Do not send pull requests.
-
-Send patches to the mailing list address scylladb-dev@googlegroups.com.
-Be sure to subscribe.
-
-In order for your patches to be merged, you must sign the Contributor's
-License Agreement, protecting your rights and ours.  See
-http://www.scylladb.com/opensource/cla/.
+[Guidelines for contributing](CONTRIBUTING.md)
--- a/9
+++ b/9
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=666.development
+VERSION=2.1.6

 if test -f version
 then
@@ -10,7 +10,12 @@ else
 	DATE=$(date +%Y%m%d)
 	GIT_COMMIT=$(git log --pretty=format:'%h' -n 1)
 	SCYLLA_VERSION=$VERSION
-	SCYLLA_RELEASE=$DATE.$GIT_COMMIT
+	# For custom package builds, replace "0" with "counter.your_name",
+	# where counter starts at 1 and increments for successive versions.
+	# This ensures that the package manager will select your custom
+	# package over the standard release.
+	SCYLLA_BUILD=0
+	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
--- a/api/api-doc/cache_service.json
+++ b/api/api-doc/cache_service.json
@@ -397,6 +397,36 @@
        }
      ]
    },
+    {
+      "path": "/cache_service/metrics/key/hits_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get key hits moving average",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_key_hits_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/cache_service/metrics/key/requests_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get key requests moving avrage",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_key_requests_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/cache_service/metrics/key/size",
      "operations": [
@@ -607,6 +637,36 @@
        }
      ]
    },
+    {
+      "path": "/cache_service/metrics/counter/hits_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get counter hits moving average",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_counter_hits_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
+    {
+      "path": "/cache_service/metrics/counter/requests_moving_avrage",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get counter requests moving avrage",
+          "type": "#/utils/rate_moving_average",
+          "nickname": "get_counter_requests_moving_avrage",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/cache_service/metrics/counter/size",
      "operations": [
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -78,11 +78,19 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"path"
+                  },
+                  {
+                     "name":"split_output",
+                     "description":"true if the output of the major compaction should be split in several sstables",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"bool",
+                     "paramType":"query"
                  }
               ]
            }
@@ -102,7 +110,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -129,7 +137,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -153,7 +161,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -180,7 +188,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -204,7 +212,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -244,7 +252,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -271,7 +279,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -298,7 +306,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -317,7 +325,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -349,7 +357,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -381,7 +389,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -405,7 +413,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -432,7 +440,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -459,7 +467,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -491,7 +499,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -518,7 +526,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -545,7 +553,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -569,7 +577,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -593,7 +601,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -633,7 +641,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -673,7 +681,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -713,7 +721,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -753,7 +761,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -793,7 +801,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -833,7 +841,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -873,7 +881,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -916,7 +924,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -943,7 +951,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -970,7 +978,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -994,7 +1002,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1034,7 +1042,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1058,7 +1066,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1101,7 +1109,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1144,7 +1152,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1203,7 +1211,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1243,7 +1251,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1267,7 +1275,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1310,7 +1318,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1353,7 +1361,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1412,7 +1420,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1452,7 +1460,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1492,7 +1500,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1532,7 +1540,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1572,7 +1580,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1612,7 +1620,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1652,7 +1660,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1692,7 +1700,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1732,7 +1740,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1772,7 +1780,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1812,7 +1820,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1852,7 +1860,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1892,7 +1900,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1932,7 +1940,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -1972,7 +1980,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2012,7 +2020,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2052,7 +2060,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2092,7 +2100,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2116,7 +2124,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2156,7 +2164,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2196,7 +2204,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2236,7 +2244,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2276,7 +2284,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2300,7 +2308,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2324,7 +2332,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2351,7 +2359,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2378,7 +2386,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2405,7 +2413,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2432,7 +2440,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2501,7 +2509,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2525,7 +2533,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2549,7 +2557,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2573,7 +2581,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2597,7 +2605,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2621,7 +2629,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2645,7 +2653,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2669,7 +2677,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2693,7 +2701,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2717,7 +2725,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2741,7 +2749,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -2765,7 +2773,7 @@
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keysspace:name format",
+                     "description":"The column family name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api-doc/endpoint_snitch_info.json
+++ b/api/api-doc/endpoint_snitch_info.json
@@ -21,8 +21,8 @@
               "parameters":[
                  {
                     "name":"host",
-                     "description":"The host name",
-                     "required":true,
+                     "description":"The host name. If absent, the local server broadcast/listen address is used",
+                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
@@ -45,8 +45,8 @@
               "parameters":[
                  {
                     "name":"host",
-                     "description":"The host name",
-                     "required":true,
+                     "description":"The host name. If absent, the local server broadcast/listen address is used",
+                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
                     "paramType":"query"
--- a/api/api-doc/failure_detector.json
+++ b/api/api-doc/failure_detector.json
@@ -42,6 +42,25 @@
            }
         ]
      },
+      {
+         "path":"/failure_detector/endpoint_phi_values",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get end point phi values",
+               "type":"array",
+               "items":{
+                  "type":"endpoint_phi_value"
+               },
+               "nickname":"get_endpoint_phi_values",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
      {
         "path":"/failure_detector/endpoints/",
         "operations":[
@@ -202,6 +221,20 @@
                    "description": "The application state version"
                }
            }
+        },
+        "endpoint_phi_value": {
+            "id" : "endpoint_phi_value",
+            "description": "Holds phi value for a single end point",
+            "properties": {
+                "phi": {
+                    "type": "double",
+                    "description": "Phi value"
+                },
+                "endpoint": {
+                    "type": "string",
+                    "description": "end point address"
+                }
+            }
        }
    }
 }
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -777,7 +777,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/read/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/read/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
@@ -792,7 +792,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/range/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/range/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
@@ -942,7 +942,7 @@
      ]
    },
    {
-      "path": "/storage_proxy/metrics/write/moving_avrage_histogram",
+      "path": "/storage_proxy/metrics/write/moving_average_histogram",
      "operations": [
        {
          "method": "GET",
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -952,6 +952,22 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/force_terminate_repair",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Force terminate all repair sessions",
+               "type":"void",
+               "nickname":"force_terminate_all_repair_sessions_new",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/decommission",
         "operations":[
@@ -1201,11 +1217,12 @@
               ],
               "parameters":[
                  {
-                     "name":"non_system",
-                     "description":"When set to true limit to non system",
+                     "name":"type",
+                     "description":"Which keyspaces to return",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"boolean",
+                     "type":"string",
+                     "enum": [ "all", "user", "non_local_strategy" ],
                     "paramType":"query"
                  }
               ]
--- a/api/api.cc
+++ b/api/api.cc
@@ -49,7 +49,7 @@ static std::unique_ptr<reply> exception_reply(std::exception_ptr eptr) {
        throw bad_param_exception(ex.what());
    }
    // We never going to get here
-    return std::make_unique<reply>();
+    throw std::runtime_error("exception_reply");
 }

 future<> set_server_init(http_context& ctx) {
--- a/api/api.hh
+++ b/api/api.hh
@@ -29,6 +29,7 @@
 #include "utils/histogram.hh"
 #include "http/exception.hh"
 #include "api_init.hh"
+#include "seastarx.hh"

 namespace api {

@@ -166,33 +167,36 @@ inline int64_t max_int64(int64_t a, int64_t b) {
 * It combine total and the sub set for the ratio and its
 * to_json method return the ration sub/total
 */
-struct ratio_holder : public json::jsonable {
-    double total = 0;
-    double sub = 0;
+template<typename T>
+struct basic_ratio_holder : public json::jsonable {
+    T total = 0;
+    T sub = 0;
    virtual std::string to_json() const {
        if (total == 0) {
            return "0";
        }
        return std::to_string(sub/total);
    }
-    ratio_holder() = default;
-    ratio_holder& add(double _total, double _sub) {
+    basic_ratio_holder() = default;
+    basic_ratio_holder& add(T _total, T _sub) {
        total += _total;
        sub += _sub;
        return *this;
    }
-    ratio_holder(double _total, double _sub) {
+    basic_ratio_holder(T _total, T _sub) {
        total = _total;
        sub = _sub;
    }
-    ratio_holder& operator+=(const ratio_holder& a) {
+    basic_ratio_holder<T>& operator+=(const basic_ratio_holder<T>& a) {
        return add(a.total, a.sub);
    }
-    friend ratio_holder operator+(ratio_holder a, const ratio_holder& b) {
+    friend basic_ratio_holder<T> operator+(basic_ratio_holder a, const basic_ratio_holder<T>& b) {
        return a += b;
    }
 };

+typedef basic_ratio_holder<double>  ratio_holder;
+typedef basic_ratio_holder<int64_t> integral_ratio_holder;

 class unimplemented_exception : public base_exception {
 public:
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -177,6 +177,20 @@ void set_cache_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(0);
    });

+    cs::get_key_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        // TBD
+        // FIXME
+        // See above
+        return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
+    });
+
+    cs::get_key_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        // TBD
+        // FIXME
+        // See above
+        return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
+    });
+
    cs::get_key_size.set(r, [] (std::unique_ptr<request> req) {
        // TBD
        // FIXME
@@ -194,7 +208,7 @@ void set_cache_service(http_context& ctx, routes& r) {
    });

    cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
            return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
        }, std::plus<uint64_t>());
    });
@@ -238,13 +252,13 @@ void set_cache_service(http_context& ctx, routes& r) {
        // In origin row size is the weighted size.
        // We currently do not support weights, so we use num entries instead
        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().num_entries();
+            return cf.get_row_cache().partitions();
        }, std::plus<uint64_t>());
    });

    cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().num_entries();
+            return cf.get_row_cache().partitions();
        }, std::plus<uint64_t>());
    });

@@ -280,6 +294,20 @@ void set_cache_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(0);
    });

+    cs::get_counter_hits_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        // TBD
+        // FIXME
+        // See above
+        return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
+    });
+
+    cs::get_counter_requests_moving_avrage.set(r, [&ctx] (std::unique_ptr<request> req) {
+        // TBD
+        // FIXME
+        // See above
+        return make_ready_future<json::json_return_type>(meter_to_json(utils::rate_moving_average()));
+    });
+
    cs::get_counter_size.set(r, [] (std::unique_ptr<request> req) {
        // TBD
        // FIXME
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -40,13 +40,13 @@ static auto transformer(const std::vector<collectd_value>& values) {
    for (auto v: values) {
        switch (v._type) {
        case scollectd::data_type::GAUGE:
-            collected_value.values.push(v.u._d);
+            collected_value.values.push(v.d());
            break;
        case scollectd::data_type::DERIVE:
-            collected_value.values.push(v.u._i);
+            collected_value.values.push(v.i());
            break;
        default:
-            collected_value.values.push(v.u._ui);
+            collected_value.values.push(v.ui());
            break;
        }
    }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -182,17 +182,8 @@ static int64_t max_row_size(column_family& cf) {
    return res;
 }

-static double update_ratio(double acc, double f, double total) {
-    if (f && !total) {
-        throw bad_param_exception("total should include all elements");
-    } else if (total) {
-        acc += f / total;
-    }
-    return acc;
-}
-
-static ratio_holder mean_row_size(column_family& cf) {
-    ratio_holder res;
+static integral_ratio_holder mean_row_size(column_family& cf) {
+    integral_ratio_holder res;
    for (auto i: *cf.get_sstables() ) {
        auto c = i->get_stats_metadata().estimated_row_size.count();
        res.sub += i->get_stats_metadata().estimated_row_size.mean() * c;
@@ -283,6 +274,16 @@ static std::vector<uint64_t> concat_sstable_count_per_level(std::vector<uint64_t
    return a;
 }

+ratio_holder filter_false_positive_as_ratio_holder(const sstables::shared_sstable& sst) {
+    double f = sst->filter_get_false_positive();
+    return ratio_holder(f + sst->filter_get_true_positive(), f);
+}
+
+ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared_sstable& sst) {
+    double f = sst->filter_get_recent_false_positive();
+    return ratio_holder(f + sst->filter_get_recent_true_positive(), f);
+}
+
 void set_column_family(http_context& ctx, routes& r) {
    cf::get_column_family_name.set(r, [&ctx] (const_req req){
        vector<sstring> res;
@@ -562,11 +563,13 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_mean_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], ratio_holder(), mean_row_size, std::plus<ratio_holder>());
+        // Cassandra 3.x mean values are truncated to integral values.
+        return map_reduce_cf(ctx, req->param["name"], integral_ratio_holder(), mean_row_size, std::plus<integral_ratio_holder>());
    });

    cf::get_all_mean_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, ratio_holder(), mean_row_size, std::plus<ratio_holder>());
+        // Cassandra 3.x mean values are truncated to integral values.
+        return map_reduce_cf(ctx, integral_ratio_holder(), mean_row_size, std::plus<integral_ratio_holder>());
    });

    cf::get_bloom_filter_false_positives.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -602,39 +605,27 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
-            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst->filter_get_false_positive();
-                return update_ratio(s, f, f + sst->filter_get_true_positive());
-            });
-        }, std::plus<double>());
+        return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (column_family& cf) {
+            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
+        }, std::plus<>());
    });

    cf::get_all_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
-            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst->filter_get_false_positive();
-                return update_ratio(s, f, f + sst->filter_get_true_positive());
-            });
-        }, std::plus<double>());
+        return map_reduce_cf(ctx, ratio_holder(), [] (column_family& cf) {
+            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
+        }, std::plus<>());
    });

    cf::get_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
-            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst->filter_get_recent_false_positive();
-                return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
-            });
-        }, std::plus<double>());
+        return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (column_family& cf) {
+            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
+        }, std::plus<>());
    });

    cf::get_all_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
-            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
-                double f = sst->filter_get_recent_false_positive();
-                return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
-            });
-        }, std::plus<double>());
+        return map_reduce_cf(ctx, ratio_holder(), [] (column_family& cf) {
+            return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
+        }, std::plus<>());
    });

    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -20,13 +20,13 @@
 */

 #include "compaction_manager.hh"
+#include "sstables/compaction_manager.hh"
 #include "api/api-doc/compaction_manager.json.hh"
 #include "db/system_keyspace.hh"
 #include "column_family.hh"

 namespace api {

-using namespace scollectd;
 namespace cm = httpd::compaction_manager_json;
 using namespace json;

--- a/api/endpoint_snitch.cc
+++ b/api/endpoint_snitch.cc
@@ -22,16 +22,22 @@
 #include "locator/snitch_base.hh"
 #include "endpoint_snitch.hh"
 #include "api/api-doc/endpoint_snitch_info.json.hh"
+#include "utils/fb_utilities.hh"

 namespace api {

 void set_endpoint_snitch(http_context& ctx, routes& r) {
-    httpd::endpoint_snitch_info_json::get_datacenter.set(r, [] (const_req req) {
-        return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(req.get_query_param("host"));
+    static auto host_or_broadcast = [](const_req req) {
+        auto host = req.get_query_param("host");
+        return host.empty() ? gms::inet_address(utils::fb_utilities::get_broadcast_address()) : gms::inet_address(host);
+    };
+
+    httpd::endpoint_snitch_info_json::get_datacenter.set(r, [](const_req req) {
+        return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(host_or_broadcast(req));
    });

-    httpd::endpoint_snitch_info_json::get_rack.set(r, [] (const_req req) {
-        return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_rack(req.get_query_param("host"));
+    httpd::endpoint_snitch_info_json::get_rack.set(r, [](const_req req) {
+        return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_rack(host_or_broadcast(req));
    });

    httpd::endpoint_snitch_info_json::get_snitch_name.set(r, [] (const_req req) {
--- a/api/failure_detector.cc
+++ b/api/failure_detector.cc
@@ -88,6 +88,20 @@ void set_failure_detector(http_context& ctx, routes& r) {
            return make_ready_future<json::json_return_type>(state);
        });
    });
+
+    fd::get_endpoint_phi_values.set(r, [](std::unique_ptr<request> req) {
+        return gms::get_arrival_samples().then([](std::map<gms::inet_address, gms::arrival_window> map) {
+            std::vector<fd::endpoint_phi_value> res;
+            auto now = gms::arrival_window::clk::now();
+            for (auto& p : map) {
+                fd::endpoint_phi_value val;
+                val.endpoint = p.first.to_sstring();
+                val.phi = p.second.phi(now);
+                res.emplace_back(std::move(val));
+            }
+            return make_ready_future<json::json_return_type>(res);
+        });
+    });
 }

 }
--- a/api/hinted_handoff.cc
+++ b/api/hinted_handoff.cc
@@ -24,7 +24,6 @@

 namespace api {

-using namespace scollectd;
 using namespace json;
 namespace hh = httpd::hinted_handoff_json;

--- a/api/lsa.cc
+++ b/api/lsa.cc
@@ -29,11 +29,11 @@

 namespace api {

-static logging::logger logger("lsa-api");
+static logging::logger alogger("lsa-api");

 void set_lsa(http_context& ctx, routes& r) {
    httpd::lsa_json::lsa_compact.set(r, [&ctx](std::unique_ptr<request> req) {
-        logger.info("Triggering compaction");
+        alogger.info("Triggering compaction");
        return ctx.db.invoke_on_all([] (database&) {
            logalloc::shard_tracker().reclaim(std::numeric_limits<size_t>::max());
        }).then([] {
--- a/api/messaging_service.cc
+++ b/api/messaging_service.cc
@@ -27,7 +27,7 @@
 #include <sstream>

 using namespace httpd::messaging_service_json;
-using namespace net;
+using namespace netw;

 namespace api {

@@ -120,13 +120,13 @@ void set_messaging_service(http_context& ctx, routes& r) {
    }));

    get_version.set(r, [](const_req req) {
-        return net::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
+        return netw::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
    });

    get_dropped_messages_by_ver.set(r, [](std::unique_ptr<request> req) {
        shared_ptr<std::vector<uint64_t>> map = make_shared<std::vector<uint64_t>>(num_verb);

-        return net::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
+        return netw::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
            for (auto i = 0; i < num_verb; i++) {
                (*map)[i]+= local_map[i];
            }
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -397,7 +397,7 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats(ctx.sp, &proxy::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -22,6 +22,8 @@
 #include "storage_service.hh"
 #include "api/api-doc/storage_service.json.hh"
 #include "db/config.hh"
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/filtered.hpp>
 #include <service/storage_service.hh>
 #include <db/commitlog/commitlog.hh>
 #include <gms/gossiper.hh>
@@ -32,6 +34,7 @@
 #include "column_family.hh"
 #include "log.hh"
 #include "release.hh"
+#include "sstables/compaction_manager.hh"

 namespace api {

@@ -359,16 +362,22 @@ void set_storage_service(http_context& ctx, routes& r) {
            try {
                res = fut.get0();
            } catch(std::runtime_error& e) {
-                return make_ready_future<json::json_return_type>(json_exception(httpd::bad_param_exception(e.what())));
+                throw httpd::bad_param_exception(e.what());
            }
            return make_ready_future<json::json_return_type>(json::json_return_type(res));
        });
    });

    ss::force_terminate_all_repair_sessions.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(json_void());
+        return repair_abort_all(service::get_local_storage_service().db()).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::force_terminate_all_repair_sessions_new.set(r, [](std::unique_ptr<request> req) {
+        return repair_abort_all(service::get_local_storage_service().db()).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::decommission.set(r, [](std::unique_ptr<request> req) {
@@ -457,8 +466,15 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_keyspaces.set(r, [&ctx](const_req req) {
-        auto non_system = req.get_query_param("non_system");
-        return map_keys(ctx.db.local().keyspaces());
+        auto type = req.get_query_param("type");
+        if (type == "user") {
+            return ctx.db.local().get_non_system_keyspaces();
+        } else if (type == "non_local_strategy") {
+            return map_keys(ctx.db.local().get_keyspaces() | boost::adaptors::filtered([](const auto& p) {
+                return p.second.get_replication_strategy().get_type() != locator::replication_strategy_type::local;
+            }));
+        }
+        return map_keys(ctx.db.local().get_keyspaces());
    });

    ss::update_snitch.set(r, [](std::unique_ptr<request> req) {
@@ -542,9 +558,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::is_joined.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().is_joined().then([] (bool is_joined) {
-            return make_ready_future<json::json_return_type>(is_joined);
-        });
+        return make_ready_future<json::json_return_type>(service::get_local_storage_service().is_joined());
    });

    ss::set_stream_throughput_mb_per_sec.set(r, [](std::unique_ptr<request> req) {
@@ -664,17 +678,23 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
-        try {
+        return futurize<json::json_return_type>::apply([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
                local_tracing.set_trace_probability(real_prob);
            }).then([] {
                return make_ready_future<json::json_return_type>(json_void());
            });
-        } catch (...) {
-            throw httpd::bad_param_exception(sprint("Bad format of a probability value: \"%s\"", probability.c_str()));
-        }
-
+        }).then_wrapped([probability] (auto&& f) {
+            try {
+                f.get();
+                return make_ready_future<json::json_return_type>(json_void());
+            } catch (std::out_of_range& e) {
+                throw httpd::bad_param_exception(e.what());
+            } catch (std::invalid_argument&){
+                throw httpd::bad_param_exception(sprint("Bad format in a probability value: \"%s\"", probability.c_str()));
+            }
+        });
    });

    ss::get_trace_probability.set(r, [](std::unique_ptr<request> req) {
@@ -684,8 +704,8 @@ void set_storage_service(http_context& ctx, routes& r) {
    ss::get_slow_query_info.set(r, [](const_req req) {
        ss::slow_query_info res;
        res.enable = tracing::tracing::get_local_tracing_instance().slow_query_tracing_enabled();
-        res.ttl = std::chrono::duration_cast<std::chrono::microseconds>(tracing::tracing::get_local_tracing_instance().slow_query_record_ttl()).count() ;
-        res.threshold = std::chrono::duration_cast<std::chrono::microseconds>(tracing::tracing::get_local_tracing_instance().slow_query_threshold()).count();
+        res.ttl = tracing::tracing::get_local_tracing_instance().slow_query_record_ttl().count() ;
+        res.threshold = tracing::tracing::get_local_tracing_instance().slow_query_threshold().count();
        return res;
    });

@@ -789,10 +809,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(json_void());
    });

-    ss::get_metrics_load.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    ss::get_metrics_load.set(r, [&ctx](std::unique_ptr<request> req) {
+        return get_cf_stats(ctx, &column_family::stats::live_disk_space_used);
    });

    ss::get_exceptions.set(r, [](const_req req) {
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -28,11 +28,12 @@
 #include "utils/managed_bytes.hh"
 #include "net/byteorder.hh"
 #include <cstdint>
-#include <iostream>
+#include <iosfwd>
+#include <seastar/util/gcc6-concepts.hh>

-template<typename T>
+template<typename T, typename Input>
 static inline
-void set_field(managed_bytes& v, unsigned offset, T val) {
+void set_field(Input& v, unsigned offset, T val) {
    reinterpret_cast<net::packed<T>*>(v.begin() + offset)->raw = net::hton(val);
 }

@@ -57,6 +58,8 @@ private:
    static constexpr int8_t LIVE_FLAG = 0x01;
    static constexpr int8_t EXPIRY_FLAG = 0x02; // When present, expiry field is present. Set only for live cells
    static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
+    static constexpr int8_t COUNTER_UPDATE_FLAG = 0x08; // Cell is a counter update.
+    static constexpr int8_t COUNTER_IN_PLACE_REVERT = 0x10;
    static constexpr unsigned flags_size = 1;
    static constexpr unsigned timestamp_offset = flags_size;
    static constexpr unsigned timestamp_size = 8;
@@ -66,14 +69,25 @@ private:
    static constexpr unsigned deletion_time_size = 4;
    static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
    static constexpr unsigned ttl_size = 4;
+    friend class counter_cell_builder;
 private:
+    static bool is_counter_update(bytes_view cell) {
+        return cell[0] & COUNTER_UPDATE_FLAG;
+    }
    static bool is_revert_set(bytes_view cell) {
        return cell[0] & REVERT_FLAG;
    }
+    static bool is_counter_in_place_revert_set(bytes_view cell) {
+        return cell[0] & COUNTER_IN_PLACE_REVERT;
+    }
    template<typename BytesContainer>
    static void set_revert(BytesContainer& cell, bool revert) {
        cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
    }
+    template<typename BytesContainer>
+    static void set_counter_in_place_revert(BytesContainer& cell, bool flag) {
+        cell[0] = (cell[0] & ~COUNTER_IN_PLACE_REVERT) | (flag * COUNTER_IN_PLACE_REVERT);
+    }
    static bool is_live(const bytes_view& cell) {
        return cell[0] & LIVE_FLAG;
    }
@@ -87,13 +101,30 @@ private:
    static api::timestamp_type timestamp(const bytes_view& cell) {
        return get_field<api::timestamp_type>(cell, timestamp_offset);
    }
+    template<typename BytesContainer>
+    static void set_timestamp(BytesContainer& cell, api::timestamp_type ts) {
+        set_field(cell, timestamp_offset, ts);
+    }
    // Can be called on live cells only
-    static bytes_view value(bytes_view cell) {
+private:
+    template<typename BytesView>
+    static BytesView do_get_value(BytesView cell) {
        auto expiry_field_size = bool(cell[0] & EXPIRY_FLAG) * (expiry_size + ttl_size);
        auto value_offset = flags_size + timestamp_size + expiry_field_size;
        cell.remove_prefix(value_offset);
        return cell;
    }
+public:
+    static bytes_view value(bytes_view cell) {
+        return do_get_value(cell);
+    }
+    static bytes_mutable_view value(bytes_mutable_view cell) {
+        return do_get_value(cell);
+    }
+    // Can be called on live counter update cells only
+    static int64_t counter_update_value(bytes_view cell) {
+        return get_field<int64_t>(cell, flags_size + timestamp_size);
+    }
    // Can be called only when is_dead() is true.
    static gc_clock::time_point deletion_time(const bytes_view& cell) {
        assert(is_dead(cell));
@@ -126,6 +157,14 @@ private:
        std::copy_n(value.begin(), value.size(), b.begin() + value_offset);
        return b;
    }
+    static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
+        auto value_offset = flags_size + timestamp_size;
+        managed_bytes b(managed_bytes::initialized_later(), value_offset + sizeof(value));
+        b[0] = LIVE_FLAG | COUNTER_UPDATE_FLAG;
+        set_field(b, timestamp_offset, timestamp);
+        set_field(b, value_offset, value);
+        return b;
+    }
    static managed_bytes make_live(api::timestamp_type timestamp, bytes_view value, gc_clock::time_point expiry, gc_clock::duration ttl) {
        auto value_offset = flags_size + timestamp_size + expiry_size + ttl_size;
        managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size());
@@ -136,6 +175,31 @@ private:
        std::copy_n(value.begin(), value.size(), b.begin() + value_offset);
        return b;
    }
+    // make_live_from_serializer() is intended for users that need to serialise
+    // some object or objects to the format used in atomic_cell::value().
+    // With just make_live() the pattern would look as follows:
+    // 1. allocate a buffer and write to it serialised objects
+    // 2. pass that buffer to make_live()
+    // 3. make_live() needs to prepend some metadata to the cell value so it
+    //    allocates a new buffer and copies the content of the original one
+    //
+    // The allocation and copy of a buffer can be avoided.
+    // make_live_from_serializer() allows the user code to specify the timestamp
+    // and size of the cell value as well as provide the serialiser function
+    // object, which would write the serialised value of the cell to the buffer
+    // given to it by make_live_from_serializer().
+    template<typename Serializer>
+    GCC6_CONCEPT(requires requires(Serializer serializer, bytes::iterator it) {
+        serializer(it);
+    })
+    static managed_bytes make_live_from_serializer(api::timestamp_type timestamp, size_t size, Serializer&& serializer) {
+        auto value_offset = flags_size + timestamp_size;
+        managed_bytes b(managed_bytes::initialized_later(), value_offset + size);
+        b[0] = LIVE_FLAG;
+        set_field(b, timestamp_offset, timestamp);
+        serializer(b.begin() + value_offset);
+        return b;
+    }
    template<typename ByteContainer>
    friend class atomic_cell_base;
    friend class atomic_cell;
@@ -149,17 +213,23 @@ protected:
    atomic_cell_base(ByteContainer&& data) : _data(std::forward<ByteContainer>(data)) { }
    friend class atomic_cell_or_collection;
 public:
+    bool is_counter_update() const {
+        return atomic_cell_type::is_counter_update(_data);
+    }
    bool is_revert_set() const {
        return atomic_cell_type::is_revert_set(_data);
    }
+    bool is_counter_in_place_revert_set() const {
+        return atomic_cell_type::is_counter_in_place_revert_set(_data);
+    }
    bool is_live() const {
        return atomic_cell_type::is_live(_data);
    }
-    bool is_live(tombstone t) const {
-        return is_live() && !is_covered_by(t);
+    bool is_live(tombstone t, bool is_counter) const {
+        return is_live() && !is_covered_by(t, is_counter);
    }
-    bool is_live(tombstone t, gc_clock::time_point now) const {
-        return is_live() && !is_covered_by(t) && !has_expired(now);
+    bool is_live(tombstone t, gc_clock::time_point now, bool is_counter) const {
+        return is_live() && !is_covered_by(t, is_counter) && !has_expired(now);
    }
    bool is_live_and_has_ttl() const {
        return atomic_cell_type::is_live_and_has_ttl(_data);
@@ -167,17 +237,24 @@ public:
    bool is_dead(gc_clock::time_point now) const {
        return atomic_cell_type::is_dead(_data) || has_expired(now);
    }
-    bool is_covered_by(tombstone t) const {
-        return timestamp() <= t.timestamp;
+    bool is_covered_by(tombstone t, bool is_counter) const {
+        return timestamp() <= t.timestamp || (is_counter && t.timestamp != api::missing_timestamp);
    }
    // Can be called on live and dead cells
    api::timestamp_type timestamp() const {
        return atomic_cell_type::timestamp(_data);
    }
+    void set_timestamp(api::timestamp_type ts) {
+        atomic_cell_type::set_timestamp(_data, ts);
+    }
    // Can be called on live cells only
-    bytes_view value() const {
+    auto value() const {
        return atomic_cell_type::value(_data);
    }
+    // Can be called on live counter update cells only
+    int64_t counter_update_value() const {
+        return atomic_cell_type::counter_update_value(_data);
+    }
    // Can be called only when is_dead(gc_clock::time_point)
    gc_clock::time_point deletion_time() const {
        return !is_live() ? atomic_cell_type::deletion_time(_data) : expiry() - ttl();
@@ -192,7 +269,7 @@ public:
    }
    // Can be called on live and dead cells
    bool has_expired(gc_clock::time_point now) const {
-        return is_live_and_has_ttl() && expiry() < now;
+        return is_live_and_has_ttl() && expiry() <= now;
    }
    bytes_view serialize() const {
        return _data;
@@ -200,6 +277,9 @@ public:
    void set_revert(bool revert) {
        atomic_cell_type::set_revert(_data, revert);
    }
+    void set_counter_in_place_revert(bool flag) {
+        atomic_cell_type::set_counter_in_place_revert(_data, flag);
+    }
 };

 class atomic_cell_view final : public atomic_cell_base<bytes_view> {
@@ -211,6 +291,14 @@ public:
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
 };

+class atomic_cell_mutable_view final : public atomic_cell_base<bytes_mutable_view> {
+    atomic_cell_mutable_view(bytes_mutable_view data) : atomic_cell_base(std::move(data)) {}
+public:
+    static atomic_cell_mutable_view from_bytes(bytes_mutable_view data) { return atomic_cell_mutable_view(data); }
+
+    friend class atomic_cell;
+};
+
 class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
 public:
    atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
@@ -239,6 +327,9 @@ public:
    static atomic_cell make_live(api::timestamp_type timestamp, const bytes& value) {
        return make_live(timestamp, bytes_view(value));
    }
+    static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
+        return atomic_cell_type::make_live_counter_update(timestamp, value);
+    }
    static atomic_cell make_live(api::timestamp_type timestamp, bytes_view value,
        gc_clock::time_point expiry, gc_clock::duration ttl)
    {
@@ -256,6 +347,10 @@ public:
            return atomic_cell_type::make_live(timestamp, value, gc_clock::now() + *ttl, *ttl);
        }
    }
+    template<typename Serializer>
+    static atomic_cell make_live_from_serializer(api::timestamp_type timestamp, size_t size, Serializer&& serializer) {
+        return atomic_cell_type::make_live_from_serializer(timestamp, size, std::forward<Serializer>(serializer));
+    }
    friend class atomic_cell_or_collection;
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell& ac);
 };
@@ -293,11 +388,6 @@ collection_mutation::operator collection_mutation_view() const {
    return { data };
 }

-namespace db {
-template<typename T>
-class serializer;
-}
-
 class column_definition;

 int compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right);
--- a/atomic_cell_hash.hh
+++ b/atomic_cell_hash.hh
@@ -26,16 +26,17 @@
 #include "types.hh"
 #include "atomic_cell.hh"
 #include "hashing.hh"
+#include "counters.hh"

 template<>
 struct appending_hash<collection_mutation_view> {
    template<typename Hasher>
-    void operator()(Hasher& h, collection_mutation_view cell) const {
+    void operator()(Hasher& h, collection_mutation_view cell, const column_definition& cdef) const {
        auto m_view = collection_type_impl::deserialize_mutation_form(cell);
        ::feed_hash(h, m_view.tomb);
        for (auto&& key_and_value : m_view.cells) {
            ::feed_hash(h, key_and_value.first);
-            ::feed_hash(h, key_and_value.second);
+            ::feed_hash(h, key_and_value.second, cdef);
        }
    }
 };
@@ -43,10 +44,14 @@ struct appending_hash<collection_mutation_view> {
 template<>
 struct appending_hash<atomic_cell_view> {
    template<typename Hasher>
-    void operator()(Hasher& h, atomic_cell_view cell) const {
+    void operator()(Hasher& h, atomic_cell_view cell, const column_definition& cdef) const {
        feed_hash(h, cell.is_live());
        feed_hash(h, cell.timestamp());
        if (cell.is_live()) {
+            if (cdef.is_counter()) {
+                ::feed_hash(h, counter_cell_view(cell));
+                return;
+            }
            if (cell.is_live_and_has_ttl()) {
                feed_hash(h, cell.expiry());
                feed_hash(h, cell.ttl());
@@ -61,15 +66,15 @@ struct appending_hash<atomic_cell_view> {
 template<>
 struct appending_hash<atomic_cell> {
    template<typename Hasher>
-    void operator()(Hasher& h, const atomic_cell& cell) const {
-        feed_hash(h, static_cast<atomic_cell_view>(cell));
+    void operator()(Hasher& h, const atomic_cell& cell, const column_definition& cdef) const {
+        feed_hash(h, static_cast<atomic_cell_view>(cell), cdef);
    }
 };

 template<>
 struct appending_hash<collection_mutation> {
    template<typename Hasher>
-    void operator()(Hasher& h, const collection_mutation& cm) const {
-        feed_hash(h, static_cast<collection_mutation_view>(cm));
+    void operator()(Hasher& h, const collection_mutation& cm, const column_definition& cdef) const {
+        feed_hash(h, static_cast<collection_mutation_view>(cm), cdef);
    }
 };
--- a/atomic_cell_or_collection.hh
+++ b/atomic_cell_or_collection.hh
@@ -39,10 +39,14 @@ public:
    static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
    atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
    atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
+    atomic_cell_mutable_view as_mutable_atomic_cell() { return atomic_cell_mutable_view::from_bytes(_data); }
    atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
    explicit operator bool() const {
        return !_data.empty();
    }
+    bool can_use_mutable_view() const {
+        return !_data.is_fragmented();
+    }
    static atomic_cell_or_collection from_collection_mutation(collection_mutation data) {
        return std::move(data.data);
    }
@@ -58,13 +62,13 @@ public:
    template<typename Hasher>
    void feed_hash(Hasher& h, const column_definition& def) const {
        if (def.is_atomic()) {
-            ::feed_hash(h, as_atomic_cell());
+            ::feed_hash(h, as_atomic_cell(), def);
        } else {
-            ::feed_hash(as_collection_mutation(), h, def.type);
+            ::feed_hash(h, as_collection_mutation(), def);
        }
    }
-    size_t memory_usage() const {
-        return _data.memory_usage();
+    size_t external_memory_usage() const {
+        return _data.external_memory_usage();
    }
    friend std::ostream& operator<<(std::ostream&, const atomic_cell_or_collection&);
 };
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/allow_all_authenticator.hh"
+
+#include "service/migration_manager.hh"
+#include "utils/class_registrator.hh"
+
+namespace auth {
+
+const sstring& allow_all_authenticator_name() {
+    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthenticator";
+    return name;
+}
+
+// To ensure correct initialization order, the registered name is unfortunately a string literal rather than allow_all_authenticator_name().
+static const class_registrator<
+        authenticator,
+        allow_all_authenticator,
+        cql3::query_processor&,
+        ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+
+}
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <stdexcept>
+
+#include "auth/authenticator.hh"
+#include "auth/authenticated_user.hh"
+#include "auth/common.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace service {
+class migration_manager;
+}
+
+namespace auth {
+
+const sstring& allow_all_authenticator_name();
+
+class allow_all_authenticator final : public authenticator {
+public:
+    allow_all_authenticator(cql3::query_processor&, ::service::migration_manager&) {
+    }
+
+    future<> start() override {
+        return make_ready_future<>();
+    }
+
+    future<> stop() override {
+        return make_ready_future<>();
+    }
+
+    const sstring& qualified_java_name() const override {
+        return allow_all_authenticator_name();
+    }
+
+    bool require_authentication() const override {
+        return false;
+    }
+
+    option_set supported_options() const override {
+        return option_set();
+    }
+
+    option_set alterable_options() const override {
+        return option_set();
+    }
+
+    future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const override {
+        return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
+    }
+
+    future<> create(sstring username, const option_map& options) override {
+        return make_ready_future();
+    }
+
+    future<> alter(sstring username, const option_map& options) override {
+        return make_ready_future();
+    }
+
+    future<> drop(sstring username) override {
+        return make_ready_future();
+    }
+
+    const resource_ids& protected_resources() const override {
+        static const resource_ids ids;
+        return ids;
+    }
+
+    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override {
+        throw std::runtime_error("Should not reach");
+    }
+};
+
+}
--- a/auth/allow_all_authorizer.cc
+++ b/auth/allow_all_authorizer.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/allow_all_authorizer.hh"
+
+#include "auth/common.hh"
+#include "utils/class_registrator.hh"
+
+namespace auth {
+
+const sstring& allow_all_authorizer_name() {
+    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthorizer";
+    return name;
+}
+
+// To ensure correct initialization order, the registered name is unfortunately a string literal rather than allow_all_authorizer_name().
+static const class_registrator<
+    authorizer,
+    allow_all_authorizer,
+    cql3::query_processor&,
+    ::service::migration_manager&> registration("org.apache.cassandra.auth.AllowAllAuthorizer");
+
+}
--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "authorizer.hh"
+#include "exceptions/exceptions.hh"
+#include "stdx.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace service {
+class migration_manager;
+}
+
+namespace auth {
+
+class service;
+
+const sstring& allow_all_authorizer_name();
+
+class allow_all_authorizer final  : public authorizer {
+public:
+    allow_all_authorizer(cql3::query_processor&, ::service::migration_manager&) {
+    }
+
+    future<> start() override {
+        return make_ready_future<>();
+    }
+
+    future<> stop() override {
+        return make_ready_future<>();
+    }
+
+    const sstring& qualified_java_name() const override {
+        return allow_all_authorizer_name();
+    }
+
+    future<permission_set> authorize(service&, ::shared_ptr<authenticated_user>, data_resource) const override {
+        return make_ready_future<permission_set>(permissions::ALL);
+    }
+
+    // GRANT, REVOKE and LIST PERMISSIONS are not supported by this authorizer. Each call fails with
+    // invalid_request_exception, delivered through the returned future instead of a synchronous throw.
+    future<> grant(::shared_ptr<authenticated_user>, permission_set, data_resource, sstring) override {
+        return make_exception_future<>(exceptions::invalid_request_exception("GRANT operation is not supported by AllowAllAuthorizer"));
+    }
+
+    future<> revoke(::shared_ptr<authenticated_user>, permission_set, data_resource, sstring) override {
+        return make_exception_future<>(exceptions::invalid_request_exception("REVOKE operation is not supported by AllowAllAuthorizer"));
+    }
+
+    future<std::vector<permission_details>> list(
+            service&, ::shared_ptr<authenticated_user>, permission_set,
+            stdx::optional<data_resource>, stdx::optional<sstring>) const override {
+        return make_exception_future<std::vector<permission_details>>(exceptions::invalid_request_exception(
+                "LIST PERMISSIONS operation is not supported by AllowAllAuthorizer"));
+    }
+
+    future<> revoke_all(sstring dropped_user) override {
+        return make_ready_future();
+    }
+
+    future<> revoke_all(data_resource) override {
+        return make_ready_future();
+    }
+
+    const resource_ids& protected_resources() override {
+        static const resource_ids ids;
+        return ids;
+    }
+
+    future<> validate_configuration() const override {
+        return make_ready_future();
+    }
+};
+
+}
--- a/auth/auth.cc
+++ b/auth/auth.cc
@@ -1,383 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (C) 2016 ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-#include <seastar/core/sleep.hh>
-
-#include <seastar/core/distributed.hh>
-
-#include "auth.hh"
-#include "authenticator.hh"
-#include "authorizer.hh"
-#include "database.hh"
-#include "cql3/query_processor.hh"
-#include "cql3/statements/raw/cf_statement.hh"
-#include "cql3/statements/create_table_statement.hh"
-#include "db/config.hh"
-#include "service/migration_manager.hh"
-#include "utils/loading_cache.hh"
-#include "utils/hash.hh"
-
-const sstring auth::auth::DEFAULT_SUPERUSER_NAME("cassandra");
-const sstring auth::auth::AUTH_KS("system_auth");
-const sstring auth::auth::USERS_CF("users");
-
-static const sstring USER_NAME("name");
-static const sstring SUPER("super");
-
-static logging::logger logger("auth");
-
-// TODO: configurable
-using namespace std::chrono_literals;
-const std::chrono::milliseconds auth::auth::SUPERUSER_SETUP_DELAY = 10000ms;
-
-class auth_migration_listener : public service::migration_listener {
-    void on_create_keyspace(const sstring& ks_name) override {}
-    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
-    void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
-
-    void on_update_keyspace(const sstring& ks_name) override {}
-    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
-    void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
-
-    void on_drop_keyspace(const sstring& ks_name) override {
-        auth::authorizer::get().revoke_all(auth::data_resource(ks_name));
-    }
-    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
-        auth::authorizer::get().revoke_all(auth::data_resource(ks_name, cf_name));
-    }
-    void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
-    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
-    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
-};
-
-static auth_migration_listener auth_migration;
-
-namespace std {
-template <>
-struct hash<auth::data_resource> {
-    size_t operator()(const auth::data_resource & v) const {
-        return v.hash_value();
-    }
-};
-
-template <>
-struct hash<auth::authenticated_user> {
-    size_t operator()(const auth::authenticated_user & v) const {
-        return utils::tuple_hash()(v.name(), v.is_anonymous());
-    }
-};
-}
-
-class auth::auth::permissions_cache {
-public:
-    typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::tuple_hash> cache_type;
-    typedef typename cache_type::key_type key_type;
-
-    permissions_cache()
-                    : permissions_cache(
-                                    cql3::get_local_query_processor().db().local().get_config()) {
-    }
-
-    permissions_cache(const db::config& cfg)
-                    : _cache(cfg.permissions_cache_max_entries(), expiry(cfg),
-                                    std::chrono::milliseconds(
-                                                    cfg.permissions_validity_in_ms()),
-                                    [](const key_type& k) {
-                                        logger.debug("Refreshing permissions for {}", k.first.name());
-                                        return authorizer::get().authorize(::make_shared<authenticated_user>(k.first), k.second);
-                                    }) {
-    }
-
-    static std::chrono::milliseconds expiry(const db::config& cfg) {
-        auto exp = cfg.permissions_update_interval_in_ms();
-        if (exp == 0 || exp == std::numeric_limits<uint32_t>::max()) {
-            exp = cfg.permissions_validity_in_ms();
-        }
-        return std::chrono::milliseconds(exp);
-    }
-
-    future<> stop() {
-        return make_ready_future<>();
-    }
-
-    future<permission_set> get(::shared_ptr<authenticated_user> user, data_resource resource) {
-        return _cache.get(key_type(*user, std::move(resource)));
-    }
-
-private:
-    cache_type _cache;
-};
-
-static distributed<auth::auth::permissions_cache> perm_cache;
-
-/**
- * Poor mans job schedule. For maximum 2 jobs. Sic.
- * Still does nothing more clever than waiting 10 seconds
- * like origin, then runs the submitted tasks.
- *
- * Only difference compared to sleep (from which this
- * borrows _heavily_) is that if tasks have not run by the time
- * we exit (and do static clean up) we delete the promise + cont
- *
- * Should be abstracted to some sort of global server function
- * probably.
- */
-struct waiter {
-    promise<> done;
-    timer<> tmr;
-    waiter() : tmr([this] {done.set_value();})
-    {
-        tmr.arm(auth::auth::SUPERUSER_SETUP_DELAY);
-    }
-    ~waiter() {
-        if (tmr.armed()) {
-            tmr.cancel();
-            done.set_exception(std::runtime_error("shutting down"));
-        }
-        logger.trace("Deleting scheduled task");
-    }
-    void kill() {
-    }
-};
-
-typedef std::unique_ptr<waiter> waiter_ptr;
-
-static std::vector<waiter_ptr> & thread_waiters() {
-    static thread_local std::vector<waiter_ptr> the_waiters;
-    return the_waiters;
-}
-
-void auth::auth::schedule_when_up(scheduled_func f) {
-    logger.trace("Adding scheduled task");
-
-    auto & waiters = thread_waiters();
-
-    waiters.emplace_back(std::make_unique<waiter>());
-    auto* w = waiters.back().get();
-
-    w->done.get_future().finally([w] {
-        auto & waiters = thread_waiters();
-        auto i = std::find_if(waiters.begin(), waiters.end(), [w](const waiter_ptr& p) {
-                            return p.get() == w;
-                        });
-        if (i != waiters.end()) {
-            waiters.erase(i);
-        }
-    }).then([f = std::move(f)] {
-        logger.trace("Running scheduled task");
-        return f();
-    }).handle_exception([](auto ep) {
-        return make_ready_future();
-    });
-}
-
-bool auth::auth::is_class_type(const sstring& type, const sstring& classname) {
-    if (type == classname) {
-        return true;
-    }
-    auto i = classname.find_last_of('.');
-    return classname.compare(i + 1, sstring::npos, type) == 0;
-}
-
-future<> auth::auth::setup() {
-    auto& db = cql3::get_local_query_processor().db().local();
-    auto& cfg = db.get_config();
-
-    future<> f = perm_cache.start();
-
-    if (is_class_type(cfg.authenticator(),
-                    authenticator::ALLOW_ALL_AUTHENTICATOR_NAME)
-                    && is_class_type(cfg.authorizer(),
-                                    authorizer::ALLOW_ALL_AUTHORIZER_NAME)
-                                    ) {
-        // just create the objects
-        return f.then([&cfg] {
-            return authenticator::setup(cfg.authenticator());
-        }).then([&cfg] {
-            return authorizer::setup(cfg.authorizer());
-        });
-    }
-
-    if (!db.has_keyspace(AUTH_KS)) {
-        std::map<sstring, sstring> opts;
-        opts["replication_factor"] = "1";
-        auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
-        f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
-    }
-
-    return f.then([] {
-        return setup_table(USERS_CF, sprint("CREATE TABLE %s.%s (%s text, %s boolean, PRIMARY KEY(%s)) WITH gc_grace_seconds=%d",
-                                        AUTH_KS, USERS_CF, USER_NAME, SUPER, USER_NAME,
-                                        90 * 24 * 60 * 60)); // 3 months.
-    }).then([&cfg] {
-        return authenticator::setup(cfg.authenticator());
-    }).then([&cfg] {
-        return authorizer::setup(cfg.authorizer());
-    }).then([] {
-        service::get_local_migration_manager().register_listener(&auth_migration); // again, only one shard...
-        // instead of once-timer, just schedule this later
-        schedule_when_up([] {
-            // setup default super user
-            return has_existing_users(USERS_CF, DEFAULT_SUPERUSER_NAME, USER_NAME).then([](bool exists) {
-                if (!exists) {
-                    auto query = sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?) USING TIMESTAMP 0",
-                                    AUTH_KS, USERS_CF, USER_NAME, SUPER);
-                    cql3::get_local_query_processor().process(query, db::consistency_level::ONE, {DEFAULT_SUPERUSER_NAME, true}).then([](auto) {
-                        logger.info("Created default superuser '{}'", DEFAULT_SUPERUSER_NAME);
-                    }).handle_exception([](auto ep) {
-                        try {
-                            std::rethrow_exception(ep);
-                        } catch (exceptions::request_execution_exception&) {
-                            logger.warn("Skipped default superuser setup: some nodes were not ready");
-                        }
-                    });
-                }
-            });
-        });
-    });
-}
-
-future<> auth::auth::shutdown() {
-    // just make sure we don't have pending tasks.
-    // this is mostly relevant for test cases where
-    // db-env-shutdown != process shutdown
-    return smp::invoke_on_all([] {
-        thread_waiters().clear();
-    }).then([] {
-        return perm_cache.stop();
-    });
-}
-
-future<auth::permission_set> auth::auth::get_permissions(::shared_ptr<authenticated_user> user, data_resource resource) {
-    return perm_cache.local().get(std::move(user), std::move(resource));
-}
-
-static db::consistency_level consistency_for_user(const sstring& username) {
-    if (username == auth::auth::DEFAULT_SUPERUSER_NAME) {
-        return db::consistency_level::QUORUM;
-    }
-    return db::consistency_level::LOCAL_ONE;
-}
-
-static future<::shared_ptr<cql3::untyped_result_set>> select_user(const sstring& username) {
-    // Here was a thread local, explicit cache of prepared statement. In normal execution this is
-    // fine, but since we in testing set up and tear down system over and over, we'd start using
-    // obsolete prepared statements pretty quickly.
-    // Rely on query processing caching statements instead, and lets assume
-    // that a map lookup string->statement is not gonna kill us much.
-    return cql3::get_local_query_processor().process(
-                    sprint("SELECT * FROM %s.%s WHERE %s = ?",
-                                    auth::auth::AUTH_KS, auth::auth::USERS_CF,
-                                    USER_NAME), consistency_for_user(username),
-                    { username }, true);
-}
-
-future<bool> auth::auth::is_existing_user(const sstring& username) {
-    return select_user(username).then(
-                    [](::shared_ptr<cql3::untyped_result_set> res) {
-                        return make_ready_future<bool>(!res->empty());
-                    });
-}
-
-future<bool> auth::auth::is_super_user(const sstring& username) {
-    return select_user(username).then(
-                    [](::shared_ptr<cql3::untyped_result_set> res) {
-                        return make_ready_future<bool>(!res->empty() && res->one().get_as<bool>(SUPER));
-                    });
-}
-
-future<> auth::auth::insert_user(const sstring& username, bool is_super)
-                throw (exceptions::request_execution_exception) {
-    return cql3::get_local_query_processor().process(sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?)",
-                    AUTH_KS, USERS_CF, USER_NAME, SUPER),
-                    consistency_for_user(username), { username, is_super }).discard_result();
-}
-
-future<> auth::auth::delete_user(const sstring& username) throw(exceptions::request_execution_exception) {
-    return cql3::get_local_query_processor().process(sprint("DELETE FROM %s.%s WHERE %s = ?",
-                    AUTH_KS, USERS_CF, USER_NAME),
-                    consistency_for_user(username), { username }).discard_result();
-}
-
-future<> auth::auth::setup_table(const sstring& name, const sstring& cql) {
-    auto& qp = cql3::get_local_query_processor();
-    auto& db = qp.db().local();
-
-    if (db.has_schema(AUTH_KS, name)) {
-        return make_ready_future();
-    }
-
-    ::shared_ptr<cql3::statements::raw::cf_statement> parsed = static_pointer_cast<
-                    cql3::statements::raw::cf_statement>(cql3::query_processor::parse_statement(cql));
-    parsed->prepare_keyspace(AUTH_KS);
-    ::shared_ptr<cql3::statements::create_table_statement> statement =
-                    static_pointer_cast<cql3::statements::create_table_statement>(
-                                    parsed->prepare(db)->statement);
-    auto schema = statement->get_cf_meta_data();
-    auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
-
-    schema_builder b(schema);
-    b.set_uuid(uuid);
-    return service::get_local_migration_manager().announce_new_column_family(b.build(), false);
-}
-
-future<bool> auth::auth::has_existing_users(const sstring& cfname, const sstring& def_user_name, const sstring& name_column) {
-    auto default_user_query = sprint("SELECT * FROM %s.%s WHERE %s = ?", AUTH_KS, cfname, name_column);
-    auto all_users_query = sprint("SELECT * FROM %s.%s LIMIT 1", AUTH_KS, cfname);
-
-    return cql3::get_local_query_processor().process(default_user_query, db::consistency_level::ONE, { def_user_name }).then([=](::shared_ptr<cql3::untyped_result_set> res) {
-        if (!res->empty()) {
-            return make_ready_future<bool>(true);
-        }
-        return cql3::get_local_query_processor().process(default_user_query, db::consistency_level::QUORUM, { def_user_name }).then([all_users_query](::shared_ptr<cql3::untyped_result_set> res) {
-            if (!res->empty()) {
-                return make_ready_future<bool>(true);
-            }
-            return cql3::get_local_query_processor().process(all_users_query, db::consistency_level::QUORUM).then([](::shared_ptr<cql3::untyped_result_set> res) {
-                return make_ready_future<bool>(!res->empty());
-            });
-        });
-    });
-}
-
--- a/auth/auth.hh
+++ b/auth/auth.hh
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (C) 2016 ScyllaDB
- *
- * Modified by ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <chrono>
-#include <seastar/core/sstring.hh>
-#include <seastar/core/future.hh>
-#include <seastar/core/shared_ptr.hh>
-
-
-#include "exceptions/exceptions.hh"
-#include "permission.hh"
-#include "data_resource.hh"
-
-namespace auth {
-
-class authenticated_user;
-
-class auth {
-public:
-    class permissions_cache;
-
-    static const sstring DEFAULT_SUPERUSER_NAME;
-    static const sstring AUTH_KS;
-    static const sstring USERS_CF;
-    static const std::chrono::milliseconds SUPERUSER_SETUP_DELAY;
-
-    static bool is_class_type(const sstring& type, const sstring& classname);
-
-    static future<permission_set> get_permissions(::shared_ptr<authenticated_user>, data_resource);
-
-    /**
-     * Checks if the username is stored in AUTH_KS.USERS_CF.
-     *
-     * @param username Username to query.
-     * @return whether or not Cassandra knows about the user.
-     */
-    static future<bool> is_existing_user(const sstring& username);
-
-    /**
-     * Checks if the user is a known superuser.
-     *
-     * @param username Username to query.
-     * @return true is the user is a superuser, false if they aren't or don't exist at all.
-     */
-    static future<bool> is_super_user(const sstring& username);
-
-    /**
-     * Inserts the user into AUTH_KS.USERS_CF (or overwrites their superuser status as a result of an ALTER USER query).
-     *
-     * @param username Username to insert.
-     * @param isSuper User's new status.
-     * @throws RequestExecutionException
-     */
-    static future<> insert_user(const sstring& username, bool is_super) throw(exceptions::request_execution_exception);
-
-    /**
-     * Deletes the user from AUTH_KS.USERS_CF.
-     *
-     * @param username Username to delete.
-     * @throws RequestExecutionException
-     */
-    static future<> delete_user(const sstring& username) throw(exceptions::request_execution_exception);
-
-    /**
-     * Sets up Authenticator and Authorizer.
-     */
-    static future<> setup();
-    static future<> shutdown();
-
-    /**
-     * Set up table from given CREATE TABLE statement under system_auth keyspace, if not already done so.
-     *
-     * @param name name of the table
-     * @param cql CREATE TABLE statement
-     */
-    static future<> setup_table(const sstring& name, const sstring& cql);
-
-    static future<bool> has_existing_users(const sstring& cfname, const sstring& def_user_name, const sstring& name_column_name);
-
-    // For internal use. Run function "when system is up".
-    typedef std::function<future<>()> scheduled_func;
-    static void schedule_when_up(scheduled_func);
-};
-}
--- a/auth/authenticated_user.cc
+++ b/auth/authenticated_user.cc
@@ -41,7 +41,6 @@


 #include "authenticated_user.hh"
-#include "auth.hh"

 const sstring auth::authenticated_user::ANONYMOUS_USERNAME("anonymous");

@@ -60,13 +59,6 @@ const sstring& auth::authenticated_user::name() const {
    return _anon ? ANONYMOUS_USERNAME : _name;
 }

-future<bool> auth::authenticated_user::is_super() const {
-    if (is_anonymous()) {
-        return make_ready_future<bool>(false);
-    }
-    return auth::auth::is_super_user(_name);
-}
-
 bool auth::authenticated_user::operator==(const authenticated_user& v) const {
    return _anon ? v._anon : _name == v._name;
 }
--- a/auth/authenticated_user.hh
+++ b/auth/authenticated_user.hh
@@ -43,6 +43,7 @@

 #include <seastar/core/sstring.hh>
 #include <seastar/core/future.hh>
+#include "seastarx.hh"

 namespace auth {

@@ -57,14 +58,6 @@ public:

    const sstring& name() const;

-    /**
-     * Checks the user's superuser status.
-     * Only a superuser is allowed to perform CREATE USER and DROP USER queries.
-     * Im most cased, though not necessarily, a superuser will have Permission.ALL on every resource
-     * (depends on IAuthorizer implementation).
-     */
-    future<bool> is_super() const;
-
    /**
     * If IAuthenticator doesn't require authentication, this method may return true.
     */
--- a/auth/authenticator.cc
+++ b/auth/authenticator.cc
@@ -41,13 +41,14 @@

 #include "authenticator.hh"
 #include "authenticated_user.hh"
+#include "common.hh"
 #include "password_authenticator.hh"
-#include "auth.hh"
+#include "cql3/query_processor.hh"
 #include "db/config.hh"
+#include "utils/class_registrator.hh"

 const sstring auth::authenticator::USERNAME_KEY("username");
 const sstring auth::authenticator::PASSWORD_KEY("password");
-const sstring auth::authenticator::ALLOW_ALL_AUTHENTICATOR_NAME("org.apache.cassandra.auth.AllowAllAuthenticator");

 auth::authenticator::option auth::authenticator::string_to_option(const sstring& name) {
    if (strcasecmp(name.c_str(), "password") == 0) {
@@ -64,64 +65,3 @@ sstring auth::authenticator::option_to_string(option opt) {
        throw std::invalid_argument(sprint("Unknown option {}", opt));
    }
 }
-
-/**
- * Authenticator is assumed to be a fully state-less immutable object (note all the const).
- * We thus store a single instance globally, since it should be safe/ok.
- */
-static std::unique_ptr<auth::authenticator> global_authenticator;
-
-future<>
-auth::authenticator::setup(const sstring& type) throw (exceptions::configuration_exception) {
-    if (auth::auth::is_class_type(type, ALLOW_ALL_AUTHENTICATOR_NAME)) {
-        class allow_all_authenticator : public authenticator {
-        public:
-            const sstring& class_name() const override {
-                return ALLOW_ALL_AUTHENTICATOR_NAME;
-            }
-            bool require_authentication() const override {
-                return false;
-            }
-            option_set supported_options() const override {
-                return option_set();
-            }
-            option_set alterable_options() const override {
-                return option_set();
-            }
-            future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) override {
-                return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
-            }
-            future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
-                return make_ready_future();
-            }
-            future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
-                return make_ready_future();
-            }
-            future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
-                return make_ready_future();
-            }
-            const resource_ids& protected_resources() const override {
-                static const resource_ids ids;
-                return ids;
-            }
-            ::shared_ptr<sasl_challenge> new_sasl_challenge() const override {
-                throw std::runtime_error("Should not reach");
-            }
-        };
-        global_authenticator = std::make_unique<allow_all_authenticator>();
-    } else if (auth::auth::is_class_type(type, password_authenticator::PASSWORD_AUTHENTICATOR_NAME)) {
-        auto pwa = std::make_unique<password_authenticator>();
-        auto f = pwa->init();
-        return f.then([pwa = std::move(pwa)]() mutable {
-            global_authenticator = std::move(pwa);
-        });
-    } else {
-        throw exceptions::configuration_exception("Invalid authenticator type: " + type);
-    }
-    return make_ready_future();
-}
-
-auth::authenticator& auth::authenticator::get() {
-    assert(global_authenticator);
-    return *global_authenticator;
-}
--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -69,7 +69,6 @@ class authenticator {
 public:
    static const sstring USERNAME_KEY;
    static const sstring PASSWORD_KEY;
-    static const sstring ALLOW_ALL_AUTHENTICATOR_NAME;

    /**
     * Supported CREATE USER/ALTER USER options.
@@ -86,23 +85,14 @@ public:
    using option_map = std::unordered_map<option, boost::any, enum_hash<option>>;
    using credentials_map = std::unordered_map<sstring, sstring>;

-    /**
-     * Setup is called once upon system startup to initialize the IAuthenticator.
-     *
-     * For example, use this method to create any required keyspaces/column families.
-     * Note: Only call from main thread.
-     */
-    static future<> setup(const sstring& type) throw(exceptions::configuration_exception);
-
-    /**
-     * Returns the system authenticator. Must have called setup before calling this.
-     */
-    static authenticator& get();
-
    virtual ~authenticator()
    {}

-    virtual const sstring& class_name() const = 0;
+    virtual future<> start() = 0;
+
+    virtual future<> stop() = 0;
+
+    virtual const sstring& qualified_java_name() const = 0;

    /**
     * Whether or not the authenticator requires explicit login.
@@ -129,7 +119,7 @@ public:
     *
     * @throws authentication_exception if credentials don't match any known user.
     */
-    virtual future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) = 0;
+    virtual future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const = 0;

    /**
     * Called during execution of CREATE USER query (also may be called on startup, see seedSuperuserOptions method).
@@ -141,7 +131,7 @@ public:
     * @throws exceptions::request_validation_exception
     * @throws exceptions::request_execution_exception
     */
-    virtual future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
+    virtual future<> create(sstring username, const option_map& options) = 0;

    /**
     * Called during execution of ALTER USER query.
@@ -154,7 +144,7 @@ public:
     * @throws exceptions::request_validation_exception
     * @throws exceptions::request_execution_exception
     */
-    virtual future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
+    virtual future<> alter(sstring username, const option_map& options) = 0;


    /**
@@ -164,7 +154,7 @@ public:
     * @throws exceptions::request_validation_exception
     * @throws exceptions::request_execution_exception
     */
-    virtual future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
+    virtual future<> drop(sstring username) = 0;

     /**
     * Set of resources that should be made inaccessible to users and only accessible internally.
@@ -177,9 +167,9 @@ public:
    class sasl_challenge {
    public:
        virtual ~sasl_challenge() {}
-        virtual bytes evaluate_response(bytes_view client_response) throw(exceptions::authentication_exception) = 0;
+        virtual bytes evaluate_response(bytes_view client_response) = 0;
        virtual bool is_complete() const = 0;
-        virtual future<::shared_ptr<authenticated_user>> get_authenticated_user() const throw(exceptions::authentication_exception) = 0;
+        virtual future<::shared_ptr<authenticated_user>> get_authenticated_user() const = 0;
    };

    /**
--- a/auth/authorizer.cc
+++ b/auth/authorizer.cc
@@ -41,23 +41,39 @@

 #include "authorizer.hh"
 #include "authenticated_user.hh"
+#include "common.hh"
 #include "default_authorizer.hh"
 #include "auth.hh"
+#include "cql3/query_processor.hh"
 #include "db/config.hh"
+#include "utils/class_registrator.hh"

-const sstring auth::authorizer::ALLOW_ALL_AUTHORIZER_NAME("org.apache.cassandra.auth.AllowAllAuthorizer");
+const sstring& auth::allow_all_authorizer_name() { // Returns the qualified Java-style class name that identifies the allow-all authorizer.
+    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthorizer"; // Function-local static: built on first call from the shared package prefix.
+    return name;
+}

 /**
 * Authenticator is assumed to be a fully state-less immutable object (note all the const).
 * We thus store a single instance globally, since it should be safe/ok.
 */
 static std::unique_ptr<auth::authorizer> global_authorizer;
+using authorizer_registry = class_registry<auth::authorizer, cql3::query_processor&>;

 future<>
 auth::authorizer::setup(const sstring& type) {
-    if (auth::auth::is_class_type(type, ALLOW_ALL_AUTHORIZER_NAME)) {
+    if (type == allow_all_authorizer_name()) {
        class allow_all_authorizer : public authorizer {
        public:
+            future<> start() override { // The allow-all authorizer has no startup work: returns an already-resolved future.
+                return make_ready_future<>();
+            }
+            future<> stop() override { // The allow-all authorizer has no shutdown work: returns an already-resolved future.
+                return make_ready_future<>();
+            }
+            const sstring& qualified_java_name() const override { // Same string as auth::allow_all_authorizer_name().
+                return allow_all_authorizer_name();
+            }
            future<permission_set> authorize(::shared_ptr<authenticated_user>, data_resource) const override {
                return make_ready_future<permission_set>(permissions::ALL);
            }
@@ -86,16 +102,14 @@ auth::authorizer::setup(const sstring& type) {
        };

        global_authorizer = std::make_unique<allow_all_authorizer>();
-    } else if (auth::auth::is_class_type(type, default_authorizer::DEFAULT_AUTHORIZER_NAME)) {
-        auto da = std::make_unique<default_authorizer>();
-        auto f = da->init();
-        return f.then([da = std::move(da)]() mutable {
-            global_authorizer = std::move(da);
-        });
+        return make_ready_future();
    } else {
-        throw exceptions::configuration_exception("Invalid authorizer type: " + type);
+        auto a = authorizer_registry::create(type, cql3::get_local_query_processor());
+        auto f = a->start();
+        return f.then([a = std::move(a)]() mutable {
+            global_authorizer = std::move(a);
+        });
    }
-    return make_ready_future();
 }

 auth::authorizer& auth::authorizer::get() {
--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -51,8 +51,12 @@
 #include "permission.hh"
 #include "data_resource.hh"

+#include "seastarx.hh"
+
 namespace auth {

+class service;
+
 class authenticated_user;

 struct permission_details {
@@ -69,10 +73,14 @@ using std::experimental::optional;

 class authorizer {
 public:
-    static const sstring ALLOW_ALL_AUTHORIZER_NAME;
-
    virtual ~authorizer() {}

+    virtual future<> start() = 0;
+
+    virtual future<> stop() = 0;
+
+    virtual const sstring& qualified_java_name() const = 0;
+
    /**
     * The primary Authorizer method. Returns a set of permissions of a user on a resource.
     *
@@ -80,7 +88,7 @@ public:
     * @param resource Resource for which the authorization is being requested. @see DataResource.
     * @return Set of permissions of the user on the resource. Should never return empty. Use permission.NONE instead.
     */
-    virtual future<permission_set> authorize(::shared_ptr<authenticated_user>, data_resource) const = 0;
+    virtual future<permission_set> authorize(service&, ::shared_ptr<authenticated_user>, data_resource) const = 0;

    /**
     * Grants a set of permissions on a resource to a user.
@@ -124,7 +132,7 @@ public:
     * @throws RequestValidationException
     * @throws RequestExecutionException
     */
-    virtual future<std::vector<permission_details>> list(::shared_ptr<authenticated_user> performer, permission_set, optional<data_resource>, optional<sstring>) const = 0;
+    virtual future<std::vector<permission_details>> list(service&, ::shared_ptr<authenticated_user> performer, permission_set, optional<data_resource>, optional<sstring>) const = 0;

    /**
     * This method is called before deleting a user with DROP USER query so that a new user with the same
@@ -154,18 +162,6 @@ public:
     * @throws ConfigurationException when there is a configuration error.
     */
    virtual future<> validate_configuration() const = 0;
-
-    /**
-     * Setup is called once upon system startup to initialize the IAuthorizer.
-     *
-     * For example, use this method to create any required keyspaces/column families.
-     */
-    static future<> setup(const sstring& type);
-
-    /**
-     * Returns the system authorizer. Must have called setup before calling this.
-     */
-    static authorizer& get();
 };

 }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/common.hh"
+
+#include <seastar/core/shared_ptr.hh>
+
+#include "cql3/query_processor.hh"
+#include "cql3/statements/create_table_statement.hh"
+#include "schema_builder.hh"
+#include "service/migration_manager.hh"
+
+namespace auth {
+
+namespace meta { // Shared string constants for the auth code; declared extern in auth/common.hh.
+
+const sstring DEFAULT_SUPERUSER_NAME("cassandra"); // Name of the default superuser.
+const sstring AUTH_KS("system_auth"); // Keyspace that create_metadata_table_if_missing() checks and prepares statements against.
+const sstring USERS_CF("users"); // Presumably the users table inside AUTH_KS - confirm against its users.
+const sstring AUTH_PACKAGE_NAME("org.apache.cassandra.auth."); // Prefix (note the trailing dot) to which a class name is appended to form a qualified Java name.
+
+} // namespace meta
+
+future<> create_metadata_table_if_missing( // Announces the table described by `cql` unless meta::AUTH_KS already has a schema named `table_name`.
+        const sstring& table_name, // Name looked up in meta::AUTH_KS to decide whether anything needs to be done.
+        cql3::query_processor& qp, // Supplies the local database and the CQL stats used while preparing the statement.
+        const sstring& cql, // Statement text; presumably must be a CREATE TABLE, since the casts below are unchecked - confirm with callers.
+        ::service::migration_manager& mm) { // Used to announce the new column family.
+    auto& db = qp.db().local();
+
+    if (db.has_schema(meta::AUTH_KS, table_name)) { // Schema already present: nothing to create.
+        return make_ready_future<>();
+    }
+
+    auto parsed_statement = static_pointer_cast<cql3::statements::raw::cf_statement>( // Unchecked downcast of the parsed statement.
+            cql3::query_processor::parse_statement(cql));
+
+    parsed_statement->prepare_keyspace(meta::AUTH_KS); // NOTE(review): presumably supplies the keyspace when the statement does not name one - confirm.
+
+    auto statement = static_pointer_cast<cql3::statements::create_table_statement>( // Unchecked downcast of the prepared statement.
+            parsed_statement->prepare(db, qp.get_cql_stats())->statement);
+
+    const auto schema = statement->get_cf_meta_data();
+    const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name()); // Table id computed from the keyspace and table names.
+
+    schema_builder b(schema); // Rebuild the schema so the id computed above can be set on it.
+    b.set_uuid(uuid);
+
+    return mm.announce_new_column_family(b.build(), false); // NOTE(review): the meaning of the `false` flag is not visible here - check migration_manager.
+}
+
+}
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <chrono>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/resource.hh>
+#include <seastar/core/sstring.hh>
+
+#include "delayed_tasks.hh"
+#include "seastarx.hh"
+
+namespace service {
+class migration_manager;
+}
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace auth {
+
+namespace meta {
+
+extern const sstring DEFAULT_SUPERUSER_NAME;
+extern const sstring AUTH_KS;
+extern const sstring USERS_CF;
+extern const sstring AUTH_PACKAGE_NAME;
+
+}
+
+template <class Task> // Task: a callable taking no arguments.
+future<> once_among_shards(Task&& f) { // Calls f only where engine().cpu_id() is 0; everywhere else f is not called.
+    if (engine().cpu_id() == 0u) {
+        return f(); // The result of f() must be convertible to future<>.
+    }
+
+    return make_ready_future<>(); // Not cpu 0: report immediate completion.
+}
+
+template <class Task, class Clock>
+void delay_until_system_ready(delayed_tasks<Clock>& ts, Task&& f) { // Hands f to ts.schedule_after() with a fixed 10000 ms duration.
+    static const std::chrono::milliseconds delay_duration(10000); // std::chrono::milliseconds is not a dependent name, so no `typename` is needed.
+    ts.schedule_after(delay_duration, std::forward<Task>(f)); // f is perfectly forwarded to the scheduler.
+}
+
+future<> create_metadata_table_if_missing(
+        const sstring& table_name,
+        cql3::query_processor&,
+        const sstring& cql,
+        ::service::migration_manager&);
+
+}
--- a/auth/data_resource.cc
+++ b/auth/data_resource.cc
@@ -47,11 +47,8 @@
 const sstring auth::data_resource::ROOT_NAME("data");

 auth::data_resource::data_resource(level l, const sstring& ks, const sstring& cf)
-    : _ks(ks), _cf(cf)
+    : _level(l), _ks(ks), _cf(cf)
 {
-    if (l != get_level()) {
-        throw std::invalid_argument("level/keyspace/column mismatch");
-    }
 }

 auth::data_resource::data_resource()
@@ -67,14 +64,7 @@ auth::data_resource::data_resource(const sstring& ks, const sstring& cf)
 {}

 auth::data_resource::level auth::data_resource::get_level() const {
-    if (!_cf.empty()) {
-        assert(!_ks.empty());
-        return level::COLUMN_FAMILY;
-    }
-    if (!_ks.empty()) {
-        return level::KEYSPACE;
-    }
-    return level::ROOT;
+    return _level;
 }

 auth::data_resource auth::data_resource::from_name(
@@ -125,16 +115,14 @@ auth::data_resource auth::data_resource::get_parent() const {
    }
 }

-const sstring& auth::data_resource::keyspace() const
-                throw (std::invalid_argument) {
+const sstring& auth::data_resource::keyspace() const {
    if (is_root_level()) {
        throw std::invalid_argument("ROOT data resource has no keyspace");
    }
    return _ks;
 }

-const sstring& auth::data_resource::column_family() const
-                throw (std::invalid_argument) {
+const sstring& auth::data_resource::column_family() const {
    if (!is_column_family_level()) {
        throw std::invalid_argument(sprint("%s data resource has no column family", name()));
    }
--- a/auth/data_resource.hh
+++ b/auth/data_resource.hh
@@ -45,6 +45,7 @@
 #include <iosfwd>
 #include <set>
 #include <seastar/core/sstring.hh>
+#include "seastarx.hh"

 namespace auth {

@@ -56,6 +57,7 @@ private:

    static const sstring ROOT_NAME;

+    level _level;
    sstring _ks;
    sstring _cf;

@@ -116,13 +118,13 @@ public:
     * @return keyspace of the resource.
     * @throws std::invalid_argument if it's the root-level resource.
     */
-    const sstring& keyspace() const throw(std::invalid_argument);
+    const sstring& keyspace() const;

    /**
     * @return column family of the resource.
     * @throws std::invalid_argument if it's not a cf-level resource.
     */
-    const sstring& column_family() const throw(std::invalid_argument);
+    const sstring& column_family() const;

    /**
     * @return Whether or not the resource has a parent in the hierarchy.
--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -46,46 +46,68 @@

 #include <seastar/core/reactor.hh>

-#include "auth.hh"
+#include "common.hh"
 #include "default_authorizer.hh"
 #include "authenticated_user.hh"
 #include "permission.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"

-const sstring auth::default_authorizer::DEFAULT_AUTHORIZER_NAME(
-                "org.apache.cassandra.auth.CassandraAuthorizer");
+const sstring& auth::default_authorizer_name() { // Returns the qualified Java-style class name that identifies the default authorizer.
+    static const sstring name = meta::AUTH_PACKAGE_NAME + "CassandraAuthorizer"; // Function-local static: built on first call from the shared package prefix.
+    return name;
+}

 static const sstring USER_NAME = "username";
 static const sstring RESOURCE_NAME = "resource";
 static const sstring PERMISSIONS_NAME = "permissions";
 static const sstring PERMISSIONS_CF = "permissions";

-static logging::logger logger("default_authorizer");
+static logging::logger alogger("default_authorizer");

-auth::default_authorizer::default_authorizer() {
+// Registers default_authorizer under its qualified Java name. To ensure correct initialization order, we unfortunately need to use a string literal.
+static const class_registrator<
+        auth::authorizer,
+        auth::default_authorizer,
+        cql3::query_processor&,
+        ::service::migration_manager&> default_authorizer_reg("org.apache.cassandra.auth.CassandraAuthorizer");
+
+auth::default_authorizer::default_authorizer(cql3::query_processor& qp, ::service::migration_manager& mm) // Keeps references to qp and mm, so both must outlive this object.
+        : _qp(qp)
+        , _migration_manager(mm) {
+}
+
 auth::default_authorizer::~default_authorizer() {
 }

-future<> auth::default_authorizer::init() {
-    sstring create_table = sprint("CREATE TABLE %s.%s ("
+future<> auth::default_authorizer::start() {
+    static const sstring create_table = sprint("CREATE TABLE %s.%s ("
                    "%s text,"
                    "%s text,"
                    "%s set<text>,"
                    "PRIMARY KEY(%s, %s)"
-                    ") WITH gc_grace_seconds=%d", auth::auth::AUTH_KS,
+                    ") WITH gc_grace_seconds=%d", meta::AUTH_KS,
                    PERMISSIONS_CF, USER_NAME, RESOURCE_NAME, PERMISSIONS_NAME,
                    USER_NAME, RESOURCE_NAME, 90 * 24 * 60 * 60); // 3 months.

-    return auth::setup_table(PERMISSIONS_CF, create_table);
+    return auth::once_among_shards([this] {
+        return auth::create_metadata_table_if_missing(
+                PERMISSIONS_CF,
+                _qp,
+                create_table,
+                _migration_manager);
+    });
 }

+future<> auth::default_authorizer::stop() { // Does no work: returns an already-resolved future.
+    return make_ready_future<>();
+}

 future<auth::permission_set> auth::default_authorizer::authorize(
-                ::shared_ptr<authenticated_user> user, data_resource resource) const {
-    return user->is_super().then([this, user, resource = std::move(resource)](bool is_super) {
+                service& ser, ::shared_ptr<authenticated_user> user, data_resource resource) const {
+    return auth::is_super_user(ser, *user).then([this, user, resource = std::move(resource)](bool is_super) {
        if (is_super) {
            return make_ready_future<permission_set>(permissions::ALL);
        }
@@ -94,10 +116,9 @@ future<auth::permission_set> auth::default_authorizer::authorize(
         * TOOD: could create actual data type for permission (translating string<->perm),
         * but this seems overkill right now. We still must store strings so...
         */
-        auto& qp = cql3::get_local_query_processor();
        auto query = sprint("SELECT %s FROM %s.%s WHERE %s = ? AND %s = ?"
-                        , PERMISSIONS_NAME, auth::AUTH_KS, PERMISSIONS_CF, USER_NAME, RESOURCE_NAME);
-        return qp.process(query, db::consistency_level::LOCAL_ONE, {user->name(), resource.name() })
+                        , PERMISSIONS_NAME, meta::AUTH_KS, PERMISSIONS_CF, USER_NAME, RESOURCE_NAME);
+        return _qp.process(query, db::consistency_level::LOCAL_ONE, {user->name(), resource.name() })
                        .then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
            try {
                auto res = f.get0();
@@ -107,7 +128,7 @@ future<auth::permission_set> auth::default_authorizer::authorize(
                }
                return make_ready_future<permission_set>(permissions::from_strings(res->one().get_set<sstring>(PERMISSIONS_NAME)));
            } catch (exceptions::request_execution_exception& e) {
-                logger.warn("CassandraAuthorizer failed to authorize {} for {}", user->name(), resource);
+                alogger.warn("CassandraAuthorizer failed to authorize {} for {}", user->name(), resource);
                return make_ready_future<permission_set>(permissions::NONE);
            }
        });
@@ -120,11 +141,10 @@ future<> auth::default_authorizer::modify(
                ::shared_ptr<authenticated_user> performer, permission_set set,
                data_resource resource, sstring user, sstring op) {
    // TODO: why does this not check super user?
-    auto& qp = cql3::get_local_query_processor();
    auto query = sprint("UPDATE %s.%s SET %s = %s %s ? WHERE %s = ? AND %s = ?",
-                    auth::AUTH_KS, PERMISSIONS_CF, PERMISSIONS_NAME,
+                    meta::AUTH_KS, PERMISSIONS_CF, PERMISSIONS_NAME,
                    PERMISSIONS_NAME, op, USER_NAME, RESOURCE_NAME);
-    return qp.process(query, db::consistency_level::ONE, {
+    return _qp.process(query, db::consistency_level::ONE, {
                    permissions::to_strings(set), user, resource.name() }).discard_result();
 }

@@ -142,15 +162,14 @@ future<> auth::default_authorizer::revoke(
 }

 future<std::vector<auth::permission_details>> auth::default_authorizer::list(
-                ::shared_ptr<authenticated_user> performer, permission_set set,
+                service& ser, ::shared_ptr<authenticated_user> performer, permission_set set,
                optional<data_resource> resource, optional<sstring> user) const {
-    return performer->is_super().then([this, performer, set = std::move(set), resource = std::move(resource), user = std::move(user)](bool is_super) {
+    return auth::is_super_user(ser, *performer).then([this, performer, set = std::move(set), resource = std::move(resource), user = std::move(user)](bool is_super) {
        if (!is_super && (!user || performer->name() != *user)) {
            throw exceptions::unauthorized_exception(sprint("You are not authorized to view %s's permissions", user ? *user : "everyone"));
        }

-        auto query = sprint("SELECT %s, %s, %s FROM %s.%s", USER_NAME, RESOURCE_NAME, PERMISSIONS_NAME, auth::AUTH_KS, PERMISSIONS_CF);
-        auto& qp = cql3::get_local_query_processor();
+        auto query = sprint("SELECT %s, %s, %s FROM %s.%s", USER_NAME, RESOURCE_NAME, PERMISSIONS_NAME, meta::AUTH_KS, PERMISSIONS_CF);

        // Oh, look, it is a case where it does not pay off to have
        // parameters to process in an initializer list.
@@ -158,15 +177,15 @@ future<std::vector<auth::permission_details>> auth::default_authorizer::list(

        if (resource && user) {
            query += sprint(" WHERE %s = ? AND %s = ?", USER_NAME, RESOURCE_NAME);
-            f = qp.process(query, db::consistency_level::ONE, {*user, resource->name()});
+            f = _qp.process(query, db::consistency_level::ONE, {*user, resource->name()});
        } else if (resource) {
            query += sprint(" WHERE %s = ? ALLOW FILTERING", RESOURCE_NAME);
-            f = qp.process(query, db::consistency_level::ONE, {resource->name()});
+            f = _qp.process(query, db::consistency_level::ONE, {resource->name()});
        } else if (user) {
            query += sprint(" WHERE %s = ?", USER_NAME);
-            f = qp.process(query, db::consistency_level::ONE, {*user});
+            f = _qp.process(query, db::consistency_level::ONE, {*user});
        } else {
-            f = qp.process(query, db::consistency_level::ONE, {});
+            f = _qp.process(query, db::consistency_level::ONE, {});
        }

        return f.then([set](::shared_ptr<cql3::untyped_result_set> res) {
@@ -188,42 +207,40 @@ future<std::vector<auth::permission_details>> auth::default_authorizer::list(
 }

 future<> auth::default_authorizer::revoke_all(sstring dropped_user) {
-    auto& qp = cql3::get_local_query_processor();
-    auto query = sprint("DELETE FROM %s.%s WHERE %s = ?", auth::AUTH_KS,
+    auto query = sprint("DELETE FROM %s.%s WHERE %s = ?", meta::AUTH_KS,
                    PERMISSIONS_CF, USER_NAME);
-    return qp.process(query, db::consistency_level::ONE, { dropped_user }).discard_result().handle_exception(
+    return _qp.process(query, db::consistency_level::ONE, { dropped_user }).discard_result().handle_exception(
                    [dropped_user](auto ep) {
                        try {
                            std::rethrow_exception(ep);
                        } catch (exceptions::request_execution_exception& e) {
-                            logger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", dropped_user, e);
+                            alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", dropped_user, e);
                        }
                    });
 }

 future<> auth::default_authorizer::revoke_all(data_resource resource) {
-    auto& qp = cql3::get_local_query_processor();
    auto query = sprint("SELECT %s FROM %s.%s WHERE %s = ? ALLOW FILTERING",
-                    USER_NAME, auth::AUTH_KS, PERMISSIONS_CF, RESOURCE_NAME);
-    return qp.process(query, db::consistency_level::LOCAL_ONE, { resource.name() })
-                    .then_wrapped([resource, &qp](future<::shared_ptr<cql3::untyped_result_set>> f) {
+                    USER_NAME, meta::AUTH_KS, PERMISSIONS_CF, RESOURCE_NAME);
+    return _qp.process(query, db::consistency_level::LOCAL_ONE, { resource.name() })
+                    .then_wrapped([this, resource](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
-            return parallel_for_each(res->begin(), res->end(), [&qp, res, resource](const cql3::untyped_result_set::row& r) {
+            return parallel_for_each(res->begin(), res->end(), [this, res, resource](const cql3::untyped_result_set::row& r) {
                auto query = sprint("DELETE FROM %s.%s WHERE %s = ? AND %s = ?"
-                                , auth::AUTH_KS, PERMISSIONS_CF, USER_NAME, RESOURCE_NAME);
-                return qp.process(query, db::consistency_level::LOCAL_ONE, { r.get_as<sstring>(USER_NAME), resource.name() })
+                                , meta::AUTH_KS, PERMISSIONS_CF, USER_NAME, RESOURCE_NAME);
+                return _qp.process(query, db::consistency_level::LOCAL_ONE, { r.get_as<sstring>(USER_NAME), resource.name() })
                                .discard_result().handle_exception([resource](auto ep) {
                    try {
                        std::rethrow_exception(ep);
                    } catch (exceptions::request_execution_exception& e) {
-                        logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
+                        alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
                    }

                });
            });
        } catch (exceptions::request_execution_exception& e) {
-            logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
+            alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
            return make_ready_future();
        }
    });
@@ -231,7 +248,7 @@ future<> auth::default_authorizer::revoke_all(data_resource resource) {


 const auth::resource_ids& auth::default_authorizer::protected_resources() {
-    static const resource_ids ids({ data_resource(auth::AUTH_KS, PERMISSIONS_CF) });
+    static const resource_ids ids({ data_resource(meta::AUTH_KS, PERMISSIONS_CF) });
    return ids;
 }

--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -41,26 +41,40 @@

 #pragma once

+#include <functional>
+
 #include "authorizer.hh"
+#include "cql3/query_processor.hh"
+#include "service/migration_manager.hh"

 namespace auth {

-class default_authorizer : public authorizer {
-public:
-    static const sstring DEFAULT_AUTHORIZER_NAME;
+const sstring& default_authorizer_name();

-    default_authorizer();
+class default_authorizer : public authorizer {
+    cql3::query_processor& _qp;
+
+    ::service::migration_manager& _migration_manager;
+
+public:
+    default_authorizer(cql3::query_processor&, ::service::migration_manager&);
    ~default_authorizer();

-    future<> init();
+    future<> start() override;

-    future<permission_set> authorize(::shared_ptr<authenticated_user>, data_resource) const override;
+    future<> stop() override;
+
+    const sstring& qualified_java_name() const override {
+        return default_authorizer_name();
+    }
+
+    future<permission_set> authorize(service&, ::shared_ptr<authenticated_user>, data_resource) const override;

    future<> grant(::shared_ptr<authenticated_user>, permission_set, data_resource, sstring) override;

    future<> revoke(::shared_ptr<authenticated_user>, permission_set, data_resource, sstring) override;

-    future<std::vector<permission_details>> list(::shared_ptr<authenticated_user>, permission_set, optional<data_resource>, optional<sstring>) const override;
+    future<std::vector<permission_details>> list(service&, ::shared_ptr<authenticated_user>, permission_set, optional<data_resource>, optional<sstring>) const override;

    future<> revoke_all(sstring) override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -46,28 +46,42 @@

 #include <seastar/core/reactor.hh>

-#include "auth.hh"
+#include "common.hh"
 #include "password_authenticator.hh"
 #include "authenticated_user.hh"
-#include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "log.hh"
+#include "service/migration_manager.hh"
+#include "utils/class_registrator.hh"

-const sstring auth::password_authenticator::PASSWORD_AUTHENTICATOR_NAME("org.apache.cassandra.auth.PasswordAuthenticator");
+const sstring& auth::password_authenticator_name() {
+    static const sstring name = meta::AUTH_PACKAGE_NAME + "PasswordAuthenticator"; // package prefix + class name, built on first call
+    return name;
+}

 // name of the hash column.
 static const sstring SALTED_HASH = "salted_hash";
 static const sstring USER_NAME = "username";
-static const sstring DEFAULT_USER_NAME = auth::auth::DEFAULT_SUPERUSER_NAME;
-static const sstring DEFAULT_USER_PASSWORD = auth::auth::DEFAULT_SUPERUSER_NAME;
+static const sstring DEFAULT_USER_NAME = auth::meta::DEFAULT_SUPERUSER_NAME;
+static const sstring DEFAULT_USER_PASSWORD = auth::meta::DEFAULT_SUPERUSER_NAME;
 static const sstring CREDENTIALS_CF = "credentials";

-static logging::logger logger("password_authenticator");
+static logging::logger plogger("password_authenticator");
+
+// To ensure correct initialization order, we unfortunately need to use a string literal.
+static const class_registrator<
+        auth::authenticator,
+        auth::password_authenticator,
+        cql3::query_processor&,
+        ::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 auth::password_authenticator::~password_authenticator()
 {}

-auth::password_authenticator::password_authenticator()
-{}
+auth::password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::migration_manager& mm)
+    : _qp(qp) // query processor used for the CQL statements issued by this authenticator
+    , _migration_manager(mm) { // forwarded to create_metadata_table_if_missing() in start()
+}

 // TODO: blowfish
 // Origin uses Java bcrypt library, i.e. blowfish salt
@@ -88,12 +102,10 @@ auth::password_authenticator::password_authenticator()
 // and some old-fashioned random salt generation.

 static constexpr size_t rand_bytes = 16;
+static thread_local crypt_data tlcrypt = { 0, };

 static sstring hashpw(const sstring& pass, const sstring& salt) {
-    // crypt_data is huge. should this be a thread_local static?
-    auto tmp = std::make_unique<crypt_data>();
-    tmp->initialized = 0;
-    auto res = crypt_r(pass.c_str(), salt.c_str(), tmp.get());
+    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
    if (res == nullptr) {
        throw std::system_error(errno, std::system_category());
    }
@@ -122,17 +134,16 @@ static sstring gensalt() {
    sstring salt;

    if (!prefix.empty()) {
-        return prefix + salt;
+        return prefix + input;
    }

-    auto tmp = std::make_unique<crypt_data>();
-    tmp->initialized = 0;
-
    // Try in order:
    // blowfish 2011 fix, blowfish, sha512, sha256, md5
    for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
        salt = pfx + input;
-        if (crypt_r("fisk", salt.c_str(), tmp.get())) {
+        const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
+
+        if (e && (e[0] != '*')) {
            prefix = pfx;
            return salt;
        }
@@ -144,39 +155,52 @@ static sstring hashpw(const sstring& pass) {
    return hashpw(pass, gensalt());
 }

-future<> auth::password_authenticator::init() {
-    gensalt(); // do this once to determine usable hashing
+future<> auth::password_authenticator::start() {
+    return auth::once_among_shards([this] {
+        gensalt(); // do this once to determine usable hashing

-    sstring create_table = sprint(
-                    "CREATE TABLE %s.%s ("
-                                    "%s text,"
-                                    "%s text," // salt + hash + number of rounds
-                                    "options map<text,text>,"// for future extensions
-                                    "PRIMARY KEY(%s)"
-                                    ") WITH gc_grace_seconds=%d",
-                    auth::auth::AUTH_KS,
-                    CREDENTIALS_CF, USER_NAME, SALTED_HASH, USER_NAME,
-                    90 * 24 * 60 * 60); // 3 months.
+        static const sstring create_table = sprint(
+                "CREATE TABLE %s.%s ("
+                "%s text,"
+                "%s text," // salt + hash + number of rounds
+                "options map<text,text>,"// for future extensions
+                "PRIMARY KEY(%s)"
+                ") WITH gc_grace_seconds=%d",
+                meta::AUTH_KS,
+                CREDENTIALS_CF, USER_NAME, SALTED_HASH, USER_NAME,
+                90 * 24 * 60 * 60); // 3 months.

-    return auth::setup_table(CREDENTIALS_CF, create_table).then([this] {
-        // instead of once-timer, just schedule this later
-        auth::schedule_when_up([] {
-            return auth::has_existing_users(CREDENTIALS_CF, DEFAULT_USER_NAME, USER_NAME).then([](bool exists) {
-                if (!exists) {
-                    cql3::get_local_query_processor().process(sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?) USING TIMESTAMP 0",
-                                                    auth::AUTH_KS,
-                                                    CREDENTIALS_CF,
-                                                    USER_NAME, SALTED_HASH
-                                    ),
-                                    db::consistency_level::ONE, {DEFAULT_USER_NAME, hashpw(DEFAULT_USER_PASSWORD)}).then([](auto) {
-                                        logger.info("Created default user '{}'", DEFAULT_USER_NAME);
-                                    });
-                }
+        return auth::create_metadata_table_if_missing(
+                CREDENTIALS_CF,
+                _qp,
+                create_table,
+                _migration_manager).then([this] {
+            auth::delay_until_system_ready(_delayed, [this] {
+                return has_existing_users().then([this](bool existing) {
+                    if (!existing) {
+                        return _qp.process(
+                                sprint(
+                                        "INSERT INTO %s.%s (%s, %s) VALUES (?, ?) USING TIMESTAMP 0",
+                                        meta::AUTH_KS,
+                                        CREDENTIALS_CF,
+                                        USER_NAME, SALTED_HASH),
+                                db::consistency_level::ONE,
+                                { DEFAULT_USER_NAME, hashpw(DEFAULT_USER_PASSWORD) }).then([](auto) {
+                            plogger.info("Created default user '{}'", DEFAULT_USER_NAME);
+                        });
+                    }
+
+                    return make_ready_future<>();
+                });
            });
        });
    });
 }

+future<> auth::password_authenticator::stop() {
+    return make_ready_future<>(); // resolves immediately; no asynchronous shutdown work is performed here
+}
+
 db::consistency_level auth::password_authenticator::consistency_for_user(const sstring& username) {
    if (username == DEFAULT_USER_NAME) {
        return db::consistency_level::QUORUM;
@@ -184,8 +208,8 @@ db::consistency_level auth::password_authenticator::consistency_for_user(const s
    return db::consistency_level::LOCAL_ONE;
 }

-const sstring& auth::password_authenticator::class_name() const {
-    return PASSWORD_AUTHENTICATOR_NAME;
+const sstring& auth::password_authenticator::qualified_java_name() const {
+    return password_authenticator_name();
 }

 bool auth::password_authenticator::require_authentication() const {
@@ -201,8 +225,7 @@ auth::authenticator::option_set auth::password_authenticator::alterable_options(
 }

 future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::authenticate(
-                const credentials_map& credentials) const
-                                throw (exceptions::authentication_exception) {
+                const credentials_map& credentials) const {
    if (!credentials.count(USERNAME_KEY)) {
        throw exceptions::authentication_exception(sprint("Required key '%s' is missing", USERNAME_KEY));
    }
@@ -218,12 +241,11 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
    // obsolete prepared statements pretty quickly.
    // Rely on query processing caching statements instead, and lets assume
    // that a map lookup string->statement is not gonna kill us much.
-    auto& qp = cql3::get_local_query_processor();
-    return qp.process(
-                    sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
-                                    auth::AUTH_KS, CREDENTIALS_CF, USER_NAME),
-                    consistency_for_user(username), { username }, true).then_wrapped(
-                    [=](future<::shared_ptr<cql3::untyped_result_set>> f) {
+    return futurize_apply([this, username, password] {
+        return _qp.process(sprint("SELECT %s FROM %s.%s WHERE %s = ?", SALTED_HASH,
+                                        meta::AUTH_KS, CREDENTIALS_CF, USER_NAME),
+                        consistency_for_user(username), {username}, true);
+    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
            if (res->empty() || !checkpw(password, res->one().get_as<sstring>(SALTED_HASH))) {
@@ -234,63 +256,57 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
            std::throw_with_nested(exceptions::authentication_exception("Could not verify password"));
        } catch (exceptions::request_execution_exception& e) {
            std::throw_with_nested(exceptions::authentication_exception(e.what()));
+        } catch (...) {
+            std::throw_with_nested(exceptions::authentication_exception("authentication failed"));
        }
    });
 }

 future<> auth::password_authenticator::create(sstring username,
-                const option_map& options)
-                                throw (exceptions::request_validation_exception,
-                                exceptions::request_execution_exception) {
+                const option_map& options) {
    try {
        auto password = boost::any_cast<sstring>(options.at(option::PASSWORD));
        auto query = sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?)",
-                        auth::AUTH_KS, CREDENTIALS_CF, USER_NAME, SALTED_HASH);
-        auto& qp = cql3::get_local_query_processor();
-        return qp.process(query, consistency_for_user(username), { username, hashpw(password) }).discard_result();
+                        meta::AUTH_KS, CREDENTIALS_CF, USER_NAME, SALTED_HASH);
+        return _qp.process(query, consistency_for_user(username), { username, hashpw(password) }).discard_result();
    } catch (std::out_of_range&) {
        throw exceptions::invalid_request_exception("PasswordAuthenticator requires PASSWORD option");
    }
 }

 future<> auth::password_authenticator::alter(sstring username,
-                const option_map& options)
-                                throw (exceptions::request_validation_exception,
-                                exceptions::request_execution_exception) {
+                const option_map& options) {
    try {
        auto password = boost::any_cast<sstring>(options.at(option::PASSWORD));
        auto query = sprint("UPDATE %s.%s SET %s = ? WHERE %s = ?",
-                        auth::AUTH_KS, CREDENTIALS_CF, SALTED_HASH, USER_NAME);
-        auto& qp = cql3::get_local_query_processor();
-        return qp.process(query, consistency_for_user(username), { hashpw(password), username }).discard_result();
+                        meta::AUTH_KS, CREDENTIALS_CF, SALTED_HASH, USER_NAME);
+        return _qp.process(query, consistency_for_user(username), { hashpw(password), username }).discard_result();
    } catch (std::out_of_range&) {
        throw exceptions::invalid_request_exception("PasswordAuthenticator requires PASSWORD option");
    }
 }

-future<> auth::password_authenticator::drop(sstring username)
-                throw (exceptions::request_validation_exception,
-                exceptions::request_execution_exception) {
+future<> auth::password_authenticator::drop(sstring username) {
    try {
        auto query = sprint("DELETE FROM %s.%s WHERE %s = ?",
-                        auth::AUTH_KS, CREDENTIALS_CF, USER_NAME);
-        auto& qp = cql3::get_local_query_processor();
-        return qp.process(query, consistency_for_user(username), { username }).discard_result();
+                        meta::AUTH_KS, CREDENTIALS_CF, USER_NAME);
+        return _qp.process(query, consistency_for_user(username), { username }).discard_result();
    } catch (std::out_of_range&) {
        throw exceptions::invalid_request_exception("PasswordAuthenticator requires PASSWORD option");
    }
 }

 const auth::resource_ids& auth::password_authenticator::protected_resources() const {
-    static const resource_ids ids({ data_resource(auth::AUTH_KS, CREDENTIALS_CF) });
+    static const resource_ids ids({ data_resource(meta::AUTH_KS, CREDENTIALS_CF) });
    return ids;
 }

 ::shared_ptr<auth::authenticator::sasl_challenge> auth::password_authenticator::new_sasl_challenge() const {
    class plain_text_password_challenge: public sasl_challenge {
+        const password_authenticator& _self;
+
    public:
-        plain_text_password_challenge(const password_authenticator& a)
-                        : _authenticator(a)
+        plain_text_password_challenge(const password_authenticator& self) : _self(self)
        {}

        /**
@@ -306,9 +322,8 @@ const auth::resource_ids& auth::password_authenticator::protected_resources() co
         * would expect
         * @throws javax.security.sasl.SaslException
         */
-        bytes evaluate_response(bytes_view client_response)
-                        throw (exceptions::authentication_exception) override {
-            logger.debug("Decoding credentials from client token");
+        bytes evaluate_response(bytes_view client_response) override {
+            plogger.debug("Decoding credentials from client token");

            sstring username, password;

@@ -345,14 +360,59 @@ const auth::resource_ids& auth::password_authenticator::protected_resources() co
        bool is_complete() const override {
            return _complete;
        }
-        future<::shared_ptr<authenticated_user>> get_authenticated_user() const
-                        throw (exceptions::authentication_exception) override {
-            return _authenticator.authenticate(_credentials);
+        future<::shared_ptr<authenticated_user>> get_authenticated_user() const override {
+            return _self.authenticate(_credentials);
        }
    private:
-        const password_authenticator& _authenticator;
        credentials_map _credentials;
        bool _complete = false;
    };
    return ::make_shared<plain_text_password_challenge>(*this);
 }
+
+
+// Resolves to true when the credentials table holds the default superuser or, failing that, any row at all.
+// Similar in structure to `auth::service::has_existing_users()`, but trying to generalize the pattern breaks all kinds
+// of module boundaries and leaks implementation details.
+//
+future<bool> auth::password_authenticator::has_existing_users() const {
+    static const sstring default_user_query = sprint(
+            "SELECT * FROM %s.%s WHERE %s = ?",
+            meta::AUTH_KS,
+            CREDENTIALS_CF,
+            USER_NAME);
+
+    static const sstring all_users_query = sprint(
+            "SELECT * FROM %s.%s LIMIT 1",
+            meta::AUTH_KS,
+            CREDENTIALS_CF);
+
+    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
+    // can potentially avoid doing a range query with a high consistency level.
+    // Attempts, in order: default user at ONE, default user at QUORUM, then any single row at QUORUM.
+    return _qp.process(
+            default_user_query,
+            db::consistency_level::ONE,
+            { meta::DEFAULT_SUPERUSER_NAME },
+            true).then([this](auto results) {
+        if (!results->empty()) {
+            return make_ready_future<bool>(true);
+        }
+
+        return _qp.process(
+                default_user_query,
+                db::consistency_level::QUORUM,
+                { meta::DEFAULT_SUPERUSER_NAME },
+                true).then([this](auto results) {
+            if (!results->empty()) {
+                return make_ready_future<bool>(true);
+            }
+
+            return _qp.process(
+                    all_users_query,
+                    db::consistency_level::QUORUM).then([](auto results) {
+                return make_ready_future<bool>(!results->empty());
+            });
+        });
+    });
+}
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -42,31 +42,48 @@
 #pragma once

 #include "authenticator.hh"
+#include "cql3/query_processor.hh"
+#include "delayed_tasks.hh"
+
+namespace service {
+class migration_manager;
+}

 namespace auth {

-class password_authenticator : public authenticator {
-public:
-    static const sstring PASSWORD_AUTHENTICATOR_NAME;
+const sstring& password_authenticator_name();

-    password_authenticator();
+class password_authenticator : public authenticator {
+    cql3::query_processor& _qp;
+
+    ::service::migration_manager& _migration_manager;
+
+    delayed_tasks<> _delayed{};
+
+public:
+    password_authenticator(cql3::query_processor&, ::service::migration_manager&);
    ~password_authenticator();

-    future<> init();
+    future<> start() override;

-    const sstring& class_name() const override;
+    future<> stop() override;
+
+    const sstring& qualified_java_name() const override;
    bool require_authentication() const override;
    option_set supported_options() const override;
    option_set alterable_options() const override;
-    future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) override;
-    future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
-    future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
-    future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
+    future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const override;
+    future<> create(sstring username, const option_map& options) override;
+    future<> alter(sstring username, const option_map& options) override;
+    future<> drop(sstring username) override;
    const resource_ids& protected_resources() const override;
    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override;


    static db::consistency_level consistency_for_user(const sstring& username);
+
+private:
+    future<bool> has_existing_users() const;
 };

 }
--- a/auth/permission.cc
+++ b/auth/permission.cc
@@ -40,6 +40,7 @@
 */

 #include <unordered_map>
+#include <boost/algorithm/string.hpp>
 #include "permission.hh"

 const auth::permission_set auth::permissions::ALL_DATA =
@@ -75,7 +76,9 @@ const sstring& auth::permissions::to_string(permission p) {
 }

 auth::permission auth::permissions::from_string(const sstring& s) {
-    return permission_names.at(s);
+    sstring upper(s);
+    boost::to_upper(upper);
+    return permission_names.at(upper);
 }

 std::unordered_set<sstring> auth::permissions::to_strings(const permission_set& set) {
--- a/auth/permission.hh
+++ b/auth/permission.hh
@@ -44,6 +44,7 @@
 #include <unordered_set>
 #include <seastar/core/sstring.hh>

+#include "seastarx.hh"
 #include "enum_set.hh"

 namespace auth {
--- a/auth/permissions_cache.cc
+++ b/auth/permissions_cache.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/permissions_cache.hh"
+
+#include "auth/authorizer.hh"
+#include "auth/common.hh"
+#include "auth/service.hh"
+#include "db/config.hh"
+
+namespace auth {
+// Builds the cache settings from the node configuration; both periods are wrapped as std::chrono::milliseconds.
+permissions_cache_config permissions_cache_config::from_db_config(const db::config& dc) {
+    permissions_cache_config c;
+    c.max_entries = dc.permissions_cache_max_entries();
+    c.validity_period = std::chrono::milliseconds(dc.permissions_validity_in_ms());
+    c.update_period = std::chrono::milliseconds(dc.permissions_update_interval_in_ms());
+
+    return c;
+}
+// Sets up the loading cache; its loader asks the service's underlying authorizer for the permissions of a key.
+permissions_cache::permissions_cache(const permissions_cache_config& c, service& ser, logging::logger& log)
+        : _cache(c.max_entries, c.validity_period, c.update_period, log, [&ser, &log](const key_type& k) { // `ser` and `log` are captured by reference
+              log.debug("Refreshing permissions for {}", k.first.name());
+              return ser.underlying_authorizer().authorize(ser, ::make_shared<authenticated_user>(k.first), k.second);
+          }) {
+}
+// Looks up the (user, resource) pair in the cache. `user` is dereferenced unconditionally, so it must not be null.
+future<permission_set> permissions_cache::get(::shared_ptr<authenticated_user> user, data_resource r) {
+    return _cache.get(key_type(*user, r));
+}
+
+}
--- a/auth/permissions_cache.hh
+++ b/auth/permissions_cache.hh
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <utility>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "auth/authenticated_user.hh"
+#include "auth/data_resource.hh"
+#include "auth/permission.hh"
+#include "log.hh"
+#include "utils/loading_cache.hh"
+
+namespace std {
+// Hashes a data_resource by delegating to its own hash_value().
+template <>
+struct hash<auth::data_resource> final {
+    size_t operator()(const auth::data_resource & v) const {
+        return v.hash_value();
+    }
+};
+// Hashes an authenticated_user from its name and its is_anonymous() flag.
+template <>
+struct hash<auth::authenticated_user> final {
+    size_t operator()(const auth::authenticated_user & v) const {
+        return utils::tuple_hash()(v.name(), v.is_anonymous());
+    }
+};
+// Prints a (user, resource) pair as "{user: <name>, data_resource: <resource>}".
+inline std::ostream& operator<<(std::ostream& os, const std::pair<auth::authenticated_user, auth::data_resource>& p) {
+    os << "{user: " << p.first.name() << ", data_resource: " << p.second << "}";
+    return os;
+}
+
+}
+
+namespace db {
+class config;
+}
+
+namespace auth {
+
+class service;
+// Settings for permissions_cache; its constructor forwards all three fields to the underlying loading cache.
+struct permissions_cache_config final {
+    static permissions_cache_config from_db_config(const db::config&);
+
+    std::size_t max_entries;
+    std::chrono::milliseconds validity_period;
+    std::chrono::milliseconds update_period;
+};
+// Maps (authenticated_user, data_resource) pairs to a permission_set through a utils::loading_cache with reload enabled.
+class permissions_cache final {
+    using cache_type = utils::loading_cache<
+            std::pair<authenticated_user, data_resource>,
+            permission_set,
+            utils::loading_cache_reload_enabled::yes,
+            utils::simple_entry_size<permission_set>,
+            utils::tuple_hash>;
+
+    using key_type = typename cache_type::key_type;
+
+    cache_type _cache;
+
+public:
+    explicit permissions_cache(const permissions_cache_config&, service&, logging::logger&);
+    // Stops the underlying loading cache.
+    future <> stop() {
+        return _cache.stop();
+    }
+
+    future<permission_set> get(::shared_ptr<authenticated_user>, data_resource);
+};
+
+}
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "auth/service.hh"
+
+#include <map>
+
+#include <seastar/core/future-util.hh>
+#include <seastar/core/shared_ptr.hh>
+
+#include "auth/allow_all_authenticator.hh"
+#include "auth/allow_all_authorizer.hh"
+#include "auth/common.hh"
+#include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
+#include "db/config.hh"
+#include "db/consistency_level.hh"
+#include "exceptions/exceptions.hh"
+#include "log.hh"
+#include "service/migration_listener.hh"
+#include "utils/class_registrator.hh"
+
+namespace auth {
+
+namespace meta {
+// Column names of the users table (see the CREATE TABLE statement in create_metadata_if_missing()).
+static const sstring user_name_col_name("name");
+static const sstring superuser_col_name("super");
+
+}
+
+static logging::logger log("auth_service");
+// Migration listener that revokes permissions on a dropped keyspace or table; every other schema event is ignored.
+class auth_migration_listener final : public ::service::migration_listener {
+    authorizer& _authorizer;
+
+public:
+    explicit auth_migration_listener(authorizer& a) : _authorizer(a) {
+    }
+
+private:
+    void on_create_keyspace(const sstring& ks_name) override {}
+    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {}
+    void on_create_user_type(const sstring& ks_name, const sstring& type_name) override {}
+    void on_create_function(const sstring& ks_name, const sstring& function_name) override {}
+    void on_create_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
+    void on_create_view(const sstring& ks_name, const sstring& view_name) override {}
+
+    void on_update_keyspace(const sstring& ks_name) override {}
+    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool) override {}
+    void on_update_user_type(const sstring& ks_name, const sstring& type_name) override {}
+    void on_update_function(const sstring& ks_name, const sstring& function_name) override {}
+    void on_update_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
+    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}
+    // NOTE(review): both drop handlers discard whatever revoke_all() returns - confirm that failures need no handling.
+    void on_drop_keyspace(const sstring& ks_name) override {
+        _authorizer.revoke_all(auth::data_resource(ks_name));
+    }
+
+    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
+        _authorizer.revoke_all(auth::data_resource(ks_name, cf_name));
+    }
+
+    void on_drop_user_type(const sstring& ks_name, const sstring& type_name) override {}
+    void on_drop_function(const sstring& ks_name, const sstring& function_name) override {}
+    void on_drop_aggregate(const sstring& ks_name, const sstring& aggregate_name) override {}
+    void on_drop_view(const sstring& ks_name, const sstring& view_name) override {}
+};
+// Consistency level for user lookups: QUORUM for the default superuser, LOCAL_ONE for every other name.
+static db::consistency_level consistency_for_user(const sstring& name) {
+    if (name == meta::DEFAULT_SUPERUSER_NAME) {
+        return db::consistency_level::QUORUM;
+    } else {
+        return db::consistency_level::LOCAL_ONE;
+    }
+}
+// Selects the users-table row for `name`, at the consistency level chosen by consistency_for_user().
+static future<::shared_ptr<cql3::untyped_result_set>> select_user(cql3::query_processor& qp, const sstring& name) {
+    // There used to be an explicit, thread-local cache of the prepared statement here. In normal execution that is
+    // fine, but since in testing we set up and tear down the system over and over, we would start using
+    // obsolete prepared statements pretty quickly.
+    // Rely on the query processor caching statements instead, and let's assume
+    // that a string->statement map lookup is not going to cost us much.
+    return qp.process(
+            sprint(
+                    "SELECT * FROM %s.%s WHERE %s = ?",
+                    meta::AUTH_KS,
+                    meta::USERS_CF,
+                    meta::user_name_col_name),
+            consistency_for_user(name),
+            { name },
+            true);
+}
+// Builds a service_config whose class names are qualified_name(meta::AUTH_PACKAGE_NAME, <name from the configuration>).
+service_config service_config::from_db_config(const db::config& dc) {
+    const qualified_name qualified_authorizer_name(meta::AUTH_PACKAGE_NAME, dc.authorizer());
+    const qualified_name qualified_authenticator_name(meta::AUTH_PACKAGE_NAME, dc.authenticator());
+
+    service_config c;
+    c.authorizer_java_name = qualified_authorizer_name;
+    c.authenticator_java_name = qualified_authenticator_name;
+
+    return c;
+}
+// Takes ownership of the authorizer and authenticator; the permissions cache starts out as nullptr.
+service::service(
+        permissions_cache_config c,
+        cql3::query_processor& qp,
+        ::service::migration_manager& mm,
+        std::unique_ptr<authorizer> a,
+        std::unique_ptr<authenticator> b)
+            : _permissions_cache_config(std::move(c))
+            , _permissions_cache(nullptr)
+            , _qp(qp)
+            , _migration_manager(mm)
+            , _authorizer(std::move(a))
+            , _authenticator(std::move(b))
+            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer)) { // listener refers to the authorizer owned above
+}
+// Delegating constructor: instantiates the authorizer and authenticator named in `sc` via create_object<>().
+service::service(
+        permissions_cache_config cache_config,
+        cql3::query_processor& qp,
+        ::service::migration_manager& mm,
+        const service_config& sc)
+            : service(
+                      std::move(cache_config),
+                      qp,
+                      mm,
+                      create_object<authorizer>(sc.authorizer_java_name, qp, mm),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, mm)) {
+}
+// True unless both the authorizer and the authenticator report the allow-all implementation's name.
+bool service::should_create_metadata() const {
+    const bool null_authorizer = _authorizer->qualified_java_name() == allow_all_authorizer_name();
+    const bool null_authenticator = _authenticator->qualified_java_name() == allow_all_authenticator_name();
+    return !null_authorizer || !null_authenticator;
+}
+// Announces the auth keyspace if the local database lacks it, creates the users table, then queues default-superuser setup.
+future<> service::create_metadata_if_missing() {
+    auto& db = _qp.db().local();
+
+    auto f = make_ready_future<>();
+
+    if (!db.has_keyspace(meta::AUTH_KS)) {
+        std::map<sstring, sstring> opts{{"replication_factor", "1"}};
+
+        auto ksm = keyspace_metadata::new_keyspace(
+                meta::AUTH_KS,
+                "org.apache.cassandra.locator.SimpleStrategy",
+                opts,
+                true);
+
+        // We use min_timestamp so that the default keyspace metadata will lose to any manual adjustments.
+        // See issue #2129.
+        f = _migration_manager.announce_new_keyspace(ksm, api::min_timestamp, false);
+    }
+
+    return f.then([this] {
+        // 3 months.
+        static const auto gc_grace_seconds = 90 * 24 * 60 * 60;
+
+        static const sstring users_table_query = sprint(
+                "CREATE TABLE %s.%s (%s text, %s boolean, PRIMARY KEY (%s)) WITH gc_grace_seconds=%s",
+                meta::AUTH_KS,
+                meta::USERS_CF,
+                meta::user_name_col_name,
+                meta::superuser_col_name,
+                meta::user_name_col_name,
+                gc_grace_seconds);
+
+        return create_metadata_table_if_missing(
+                meta::USERS_CF,
+                _qp,
+                users_table_query,
+                _migration_manager);
+    }).then([this] {
+        delay_until_system_ready(_delayed, [this] {
+            return has_existing_users().then([this](bool existing) {
+                if (!existing) {
+                    //
+                    // Create default superuser.
+                    //
+
+                    static const sstring query = sprint(
+                            "INSERT INTO %s.%s (%s, %s) VALUES (?, ?) USING TIMESTAMP 0",
+                            meta::AUTH_KS,
+                            meta::USERS_CF,
+                            meta::user_name_col_name,
+                            meta::superuser_col_name);
+
+                    return _qp.process(
+                            query,
+                            db::consistency_level::ONE,
+                            { meta::DEFAULT_SUPERUSER_NAME, true }).then([](auto&&) {
+                        log.info("Created default superuser '{}'", meta::DEFAULT_SUPERUSER_NAME);
+                    }).handle_exception([](auto exn) {
+                        try {
+                            std::rethrow_exception(exn);
+                        } catch (const exceptions::request_execution_exception&) { // any other exception type leaves this handler uncaught
+                            log.warn("Skipped default superuser setup: some nodes were not ready");
+                        }
+                    }).discard_result();
+                }
+
+                return make_ready_future<>();
+            });
+        });
+
+        return make_ready_future<>(); // the task handed to delay_until_system_ready() is not awaited here
+    });
+}
+
+future<> service::start() { // Start-up sequence: metadata, authorizer/authenticator, permissions cache, migration listener.
+    return once_among_shards([this] { // NOTE(review): presumably runs the callback on a single shard only - confirm.
+        if (should_create_metadata()) {
+            return create_metadata_if_missing();
+        }
+
+        return make_ready_future<>(); // Both implementations are allow-all: no metadata is needed.
+    }).then([this] {
+        return when_all_succeed(_authorizer->start(), _authenticator->start());
+    }).then([this] {
+        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log); // Created only once both have started.
+    }).then([this] {
+        return once_among_shards([this] {
+            _migration_manager.register_listener(_migration_listener.get()); // Raw pointer: this object keeps ownership of the listener.
+            return make_ready_future<>();
+        });
+    });
+}
+
+future<> service::stop() { // Shutdown sequence: cancel delayed tasks, stop the permissions cache, then authorizer and authenticator.
+    return once_among_shards([this] {
+        _delayed.cancel_all(); // Cancels tasks queued on _delayed (e.g. the default-superuser setup).
+        return make_ready_future<>();
+    }).then([this] {
+        return _permissions_cache->stop(); // NOTE(review): the cache is only created in start() - confirm stop() never runs without a successful start().
+    }).then([this] {
+        return when_all_succeed(_authorizer->stop(), _authenticator->stop());
+    });
+}
+
+future<bool> service::has_existing_users() const { // Resolves to true if the users table holds the default superuser or any other row.
+    static const sstring default_user_query = sprint( // Point lookup by user name.
+            "SELECT * FROM %s.%s WHERE %s = ?",
+            meta::AUTH_KS,
+            meta::USERS_CF,
+            meta::user_name_col_name);
+
+    static const sstring all_users_query = sprint( // Unrestricted query limited to one row.
+            "SELECT * FROM %s.%s LIMIT 1",
+            meta::AUTH_KS,
+            meta::USERS_CF);
+
+    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
+    // can potentially avoid doing a range query with a high consistency level.
+
+    return _qp.process( // Step 1: look for the default superuser at consistency level ONE.
+            default_user_query,
+            db::consistency_level::ONE,
+            { meta::DEFAULT_SUPERUSER_NAME },
+            true).then([this](auto results) {
+        if (!results->empty()) {
+            return make_ready_future<bool>(true);
+        }
+
+        return _qp.process( // Step 2: repeat the same lookup at QUORUM.
+                default_user_query,
+                db::consistency_level::QUORUM,
+                { meta::DEFAULT_SUPERUSER_NAME },
+                true).then([this](auto results) {
+            if (!results->empty()) {
+                return make_ready_future<bool>(true);
+            }
+
+            return _qp.process( // Step 3: fall back to asking for any row at all, at QUORUM.
+                    all_users_query,
+                    db::consistency_level::QUORUM).then([](auto results) {
+                return make_ready_future<bool>(!results->empty());
+            });
+        });
+    });
+}
+
+future<bool> service::is_existing_user(const sstring& name) const { // Resolves to true when select_user() returns at least one row for `name`.
+    return select_user(_qp, name).then([](auto results) {
+        return !results->empty();
+    });
+}
+
+future<bool> service::is_super_user(const sstring& name) const { // True iff a row is found for `name` and its superuser column is true.
+    return select_user(_qp, name).then([](auto results) {
+        return !results->empty() && results->one().template get_as<bool>(meta::superuser_col_name); // Empty result short-circuits to false.
+    });
+}
+
+future<> service::insert_user(const sstring& name, bool is_superuser) { // Writes a (name, is_superuser) row to the users table.
+    return _qp.process(
+            sprint( // Formatted on every call (not cached in a static like the queries above).
+                    "INSERT INTO %s.%s (%s, %s) VALUES (?, ?)",
+                    meta::AUTH_KS,
+                    meta::USERS_CF,
+                    meta::user_name_col_name,
+                    meta::superuser_col_name),
+            consistency_for_user(name), // Consistency level is chosen per user name by a helper defined elsewhere.
+            { name, is_superuser }).discard_result();
+}
+
+future<> service::delete_user(const sstring& name) { // Deletes the users-table row keyed by `name`.
+    return _qp.process(
+            sprint( // Formatted on every call.
+                    "DELETE FROM %s.%s WHERE %s = ?",
+                    meta::AUTH_KS,
+                    meta::USERS_CF,
+                    meta::user_name_col_name),
+            consistency_for_user(name), // Consistency level is chosen per user name by a helper defined elsewhere.
+            { name }).discard_result();
+}
+
+future<permission_set> service::get_permissions(::shared_ptr<authenticated_user> u, data_resource r) const { // Forwards the lookup to the permissions cache.
+    return _permissions_cache->get(std::move(u), std::move(r)); // Requires the cache created by start().
+}
+
+//
+// Free functions.
+//
+
+future<bool> is_super_user(const service& ser, const authenticated_user& u) { // Superuser check for an authenticated_user object.
+    if (u.is_anonymous()) { // Anonymous users are never superusers; no query is issued.
+        return make_ready_future<bool>(false);
+    }
+
+    return ser.is_super_user(u.name()); // Named users are checked through the service.
+}
+
+}
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sstring.hh>
+
+#include "auth/authenticator.hh"
+#include "auth/authorizer.hh"
+#include "auth/authenticated_user.hh"
+#include "auth/permission.hh"
+#include "auth/permissions_cache.hh"
+#include "delayed_tasks.hh"
+#include "seastarx.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace db {
+class config;
+}
+
+namespace service {
+class migration_manager;
+class migration_listener;
+}
+
+namespace auth {
+
+class authenticator;
+class authorizer;
+
+struct service_config final { // Names of the authorizer and authenticator implementations a service should use.
+    static service_config from_db_config(const db::config&); // Factory taking the database configuration (defined elsewhere).
+
+    sstring authorizer_java_name; // Passed to create_object<authorizer>() by the service constructor.
+    sstring authenticator_java_name; // Passed to create_object<authenticator>() by the service constructor.
+};
+
+class service final { // Owns the authorizer, authenticator and permissions cache, and manages the users table.
+    permissions_cache_config _permissions_cache_config; // Kept until start() builds the cache from it.
+    std::unique_ptr<permissions_cache> _permissions_cache; // Null until start() creates it.
+
+    cql3::query_processor& _qp; // Not owned.
+
+    ::service::migration_manager& _migration_manager; // Not owned.
+
+    std::unique_ptr<authorizer> _authorizer;
+
+    std::unique_ptr<authenticator> _authenticator;
+
+    // Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
+    std::unique_ptr<::service::migration_listener> _migration_listener;
+
+    delayed_tasks<> _delayed{}; // Holds the deferred default-superuser setup; cancelled in stop().
+
+public:
+    service( // Takes ownership of ready-made authorizer and authenticator objects.
+            permissions_cache_config,
+            cql3::query_processor&,
+            ::service::migration_manager&,
+            std::unique_ptr<authorizer>,
+            std::unique_ptr<authenticator>);
+
+    service( // Creates the authorizer and authenticator from the names in service_config.
+            permissions_cache_config,
+            cql3::query_processor&,
+            ::service::migration_manager&,
+            const service_config&);
+
+    future<> start(); // Creates metadata if needed, starts both implementations, builds the cache, registers the listener.
+
+    future<> stop(); // Cancels delayed tasks, then stops the cache, the authorizer and the authenticator.
+
+    future<bool> is_existing_user(const sstring& name) const; // True when a users-table row is found for `name`.
+
+    future<bool> is_super_user(const sstring& name) const; // True when such a row is found and its superuser column is true.
+
+    future<> insert_user(const sstring& name, bool is_superuser); // Inserts a row into the users table.
+
+    future<> delete_user(const sstring& name); // Deletes the row keyed by `name` from the users table.
+
+    future<permission_set> get_permissions(::shared_ptr<authenticated_user>, data_resource) const; // Served by the permissions cache.
+
+    authenticator& underlying_authenticator() { // Direct access to the owned authenticator.
+        return *_authenticator;
+    }
+
+    const authenticator& underlying_authenticator() const {
+        return *_authenticator;
+    }
+
+    authorizer& underlying_authorizer() { // Direct access to the owned authorizer.
+        return *_authorizer;
+    }
+
+    const authorizer& underlying_authorizer() const {
+        return *_authorizer;
+    }
+
+private:
+    future<bool> has_existing_users() const; // True if the default superuser or any other user row exists.
+
+    bool should_create_metadata() const; // False only when both implementations are the allow-all ones.
+
+    future<> create_metadata_if_missing(); // Creates the auth keyspace and users table, schedules the default superuser.
+};
+
+future<bool> is_super_user(const service&, const authenticated_user&);
+
+}
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (C) 2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "authenticator.hh"
+#include "authenticated_user.hh"
+#include "authenticator.hh"
+#include "authorizer.hh"
+#include "password_authenticator.hh"
+#include "default_authorizer.hh"
+#include "permission.hh"
+#include "db/config.hh"
+#include "utils/class_registrator.hh"
+
+namespace auth {
+
+class service;
+
+static const sstring PACKAGE_NAME("com.scylladb.auth."); // Prefix of the qualified Java-style names built below.
+
+static const sstring& transitional_authenticator_name() { // Returns "com.scylladb.auth.TransitionalAuthenticator".
+    static const sstring name = PACKAGE_NAME + "TransitionalAuthenticator"; // Built on first call, then reused.
+    return name;
+}
+
+static const sstring& transitional_authorizer_name() { // Returns "com.scylladb.auth.TransitionalAuthorizer".
+    static const sstring name = PACKAGE_NAME + "TransitionalAuthorizer"; // Built on first call, then reused.
+    return name;
+}
+
+class transitional_authenticator : public authenticator { // Wraps another authenticator; authentication failures yield the anonymous user.
+    std::unique_ptr<authenticator> _authenticator; // The wrapped authenticator that does the real work.
+public:
+    static const sstring PASSWORD_AUTHENTICATOR_NAME;
+
+    transitional_authenticator(cql3::query_processor& qp, ::service::migration_manager& mm) // Default: wraps a password_authenticator.
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, mm))
+    {}
+    transitional_authenticator(std::unique_ptr<authenticator> a) // Wraps an arbitrary authenticator and takes ownership of it.
+        : _authenticator(std::move(a))
+    {}
+    future<> start() override {
+        return _authenticator->start();
+    }
+    future<> stop() override {
+        return _authenticator->stop();
+    }
+    const sstring& qualified_java_name() const override {
+        return transitional_authenticator_name();
+    }
+    bool require_authentication() const override { // Always true, even though failed logins fall back to anonymous.
+        return true;
+    }
+    option_set supported_options() const override {
+        return _authenticator->supported_options();
+    }
+    option_set alterable_options() const override {
+        return _authenticator->alterable_options();
+    }
+    future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const override {
+        auto i = credentials.find(authenticator::USERNAME_KEY);
+        if ((i == credentials.end() || i->second.empty()) && (!credentials.count(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+            // No usable user name and no usable password were supplied: return the anonymous user.
+            return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
+        }
+        return make_ready_future().then([this, &credentials] { // `credentials` is captured by reference and must outlive the returned future.
+            return _authenticator->authenticate(credentials);
+        }).handle_exception([](auto ep) {
+            try {
+                std::rethrow_exception(ep);
+            } catch (const exceptions::authentication_exception&) { // Any other exception type propagates out of the rethrow.
+                // Rejected credentials: fall back to the anonymous user.
+                return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
+            }
+        });
+    }
+    future<> create(sstring username, const option_map& options) override {
+        return _authenticator->create(username, options);
+    }
+    future<> alter(sstring username, const option_map& options) override {
+        return _authenticator->alter(username, options);
+    }
+    future<> drop(sstring username) override {
+        return _authenticator->drop(username);
+    }
+    const resource_ids& protected_resources() const override {
+        return _authenticator->protected_resources();
+    }
+    ::shared_ptr<sasl_challenge> new_sasl_challenge() const override { // Wraps the inner challenge so that failures become anonymous logins.
+        class sasl_wrapper : public sasl_challenge {
+        public:
+            sasl_wrapper(::shared_ptr<sasl_challenge> sasl)
+                : _sasl(std::move(sasl))
+            {}
+            bytes evaluate_response(bytes_view client_response) override {
+                try {
+                    return _sasl->evaluate_response(client_response);
+                } catch (const exceptions::authentication_exception&) {
+                    _complete = true; // Treat the exchange as finished and reply with an empty challenge.
+                    return {};
+                }
+            }
+            bool is_complete() const override { // Complete once the inner challenge is, or after a swallowed failure.
+                return _complete || _sasl->is_complete();
+            }
+            future<::shared_ptr<authenticated_user>> get_authenticated_user() const override {
+                return futurize_apply([this] {
+                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
+                        try {
+                            std::rethrow_exception(ep);
+                        } catch (const exceptions::authentication_exception&) {
+                            // Authentication failed: report the anonymous user instead.
+                            return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
+                        }
+                    });
+                });
+            }
+        private:
+            ::shared_ptr<sasl_challenge> _sasl; // The wrapped challenge.
+            bool _complete = false; // Set when evaluate_response() swallowed an authentication failure.
+        };
+        return ::make_shared<sasl_wrapper>(_authenticator->new_sasl_challenge());
+    }
+};
+
+class transitional_authorizer : public authorizer { // Wraps another authorizer but answers authorize() with a fixed permission set.
+    std::unique_ptr<authorizer> _authorizer; // The wrapped authorizer; receives everything except authorize().
+public:
+    transitional_authorizer(cql3::query_processor& qp, ::service::migration_manager& mm) // Default: wraps a default_authorizer.
+        : transitional_authorizer(std::make_unique<default_authorizer>(qp, mm))
+    {}
+    transitional_authorizer(std::unique_ptr<authorizer> a) // Wraps an arbitrary authorizer and takes ownership of it.
+        : _authorizer(std::move(a))
+    {}
+    ~transitional_authorizer()
+    {}
+    future<> start() override {
+        return _authorizer->start();
+    }
+    future<> stop() override {
+        return _authorizer->stop();
+    }
+    const sstring& qualified_java_name() const override {
+        return transitional_authorizer_name();
+    }
+    future<permission_set> authorize(service& ser, ::shared_ptr<authenticated_user> user, data_resource resource) const override { // `resource` is not consulted.
+        return is_super_user(ser, *user).then([](bool s) {
+            static const permission_set transitional_permissions = // Granted to every non-superuser, whatever the resource.
+                            permission_set::of<permission::CREATE,
+                                            permission::ALTER, permission::DROP,
+                                            permission::SELECT, permission::MODIFY>();
+
+            return make_ready_future<permission_set>(s ? permissions::ALL : transitional_permissions); // Superusers get everything.
+        });
+    }
+    future<> grant(::shared_ptr<authenticated_user> user, permission_set ps, data_resource r, sstring s) override { // Forwarded unchanged.
+        return _authorizer->grant(std::move(user), std::move(ps), std::move(r), std::move(s));
+    }
+    future<> revoke(::shared_ptr<authenticated_user> user, permission_set ps, data_resource r, sstring s) override { // Forwarded unchanged.
+        return _authorizer->revoke(std::move(user), std::move(ps), std::move(r), std::move(s));
+    }
+    future<std::vector<permission_details>> list(service& ser, ::shared_ptr<authenticated_user> user, permission_set ps, optional<data_resource> r, optional<sstring> s) const override {
+        return _authorizer->list(ser, std::move(user), std::move(ps), std::move(r), std::move(s)); // Forwarded unchanged.
+    }
+    future<> revoke_all(sstring s) override {
+        return _authorizer->revoke_all(std::move(s));
+    }
+    future<> revoke_all(data_resource r) override {
+        return _authorizer->revoke_all(std::move(r));
+    }
+    const resource_ids& protected_resources() override {
+        return _authorizer->protected_resources();
+    }
+    future<> validate_configuration() const override {
+        return _authorizer->validate_configuration();
+    }
+};
+
+}
+
+//
+// To ensure correct initialization order, we unfortunately need to use string literals.
+//
+
+static const class_registrator< // Registers the authenticator; the literal must stay equal to transitional_authenticator_name().
+        auth::authenticator,
+        auth::transitional_authenticator,
+        cql3::query_processor&,
+        ::service::migration_manager&> transitional_authenticator_reg("com.scylladb.auth.TransitionalAuthenticator");
+
+static const class_registrator< // Registers the authorizer; the literal must stay equal to transitional_authorizer_name().
+        auth::authorizer,
+        auth::transitional_authorizer,
+        cql3::query_processor&,
+        ::service::migration_manager&> transitional_authorizer_reg("com.scylladb.auth.TransitionalAuthorizer");
--- a/bytes.hh
+++ b/bytes.hh
@@ -21,14 +21,17 @@

 #pragma once

+#include "seastarx.hh"
 #include "core/sstring.hh"
 #include "hashing.hh"
 #include <experimental/optional>
 #include <iosfwd>
 #include <functional>
+#include "utils/mutable_view.hh"

 using bytes = basic_sstring<int8_t, uint32_t, 31>;
 using bytes_view = std::experimental::basic_string_view<int8_t>;
+using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::experimental::optional<bytes>;
 using sstring_view = std::experimental::string_view;

--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,7 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
-    static constexpr size_type max_chunk_size = 16 * 1024;
+    static constexpr size_type max_chunk_size() { return 16 * 1024; } // 16 KiB; caps the chunk-size doubling in next_alloc_size().
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -59,7 +59,6 @@ private:
    };
    // FIXME: consider increasing chunk size as the buffer grows
    static constexpr size_type chunk_size{512};
-    static constexpr size_type usable_chunk_size{chunk_size - sizeof(chunk)};
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
@@ -100,6 +99,19 @@ private:
        }
        return _current->size - _current->offset;
    }
+    // Figure out the allocation size (chunk header included) for the next chunk.
+    //   - must be enough for data_size
+    //   - must be at least chunk_size
+    //   - try to double each time to prevent too many allocations
+    //   - do not exceed max_chunk_size, unless data_size itself needs more
+    size_type next_alloc_size(size_t data_size) const {
+        auto next_size = _current
+                ? _current->size * 2 // Double the size recorded in the current chunk.
+                : chunk_size; // No chunk yet: start from the base size.
+        next_size = std::min(next_size, max_chunk_size());
+        // FIXME: check for overflow? (the size_t sum below is converted to size_type)
+        return std::max<size_type>(next_size, data_size + sizeof(chunk));
+    }
    // Makes room for a contiguous region of given size.
    // The region is accounted for as already written.
    // size must not be zero.
@@ -110,7 +122,7 @@ private:
            _size += size;
            return ret;
        } else {
-            auto alloc_size = size <= usable_chunk_size ? chunk_size : (size + sizeof(chunk));
+            auto alloc_size = next_alloc_size(size);
            auto space = malloc(alloc_size);
            if (!space) {
                throw std::bad_alloc();
@@ -205,7 +217,7 @@ public:
        }

        while (!v.empty()) {
-            auto this_size = std::min(v.size(), size_t(max_chunk_size));
+            auto this_size = std::min(v.size(), size_t(max_chunk_size()));
            std::copy_n(v.begin(), this_size, alloc(this_size));
            v.remove_prefix(this_size);
        }
@@ -329,7 +341,7 @@ public:
        // if its size is below max_chunk_size. We probably could also gain
        // some read performance by doing "real" reduction, i.e. merging
        // all chunks until all but the last one is max_chunk_size.
-        if (size() < max_chunk_size) {
+        if (size() < max_chunk_size()) {
            linearize();
        }
    }
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -0,0 +1,661 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include "row_cache.hh"
+#include "mutation_reader.hh"
+#include "streamed_mutation.hh"
+#include "partition_version.hh"
+#include "utils/logalloc.hh"
+#include "query-request.hh"
+#include "partition_snapshot_reader.hh"
+#include "partition_snapshot_row_cursor.hh"
+#include "read_context.hh"
+#include "flat_mutation_reader.hh"
+
+namespace cache {
+
+extern logging::logger clogger;
+
+class cache_flat_mutation_reader final : public flat_mutation_reader::impl { // Reads one partition from a cache snapshot, using the underlying reader where needed.
+    enum class state {
+        before_static_row, // Initial state, left by the first fill_buffer() call.
+
+        // Invariants:
+        //  - position_range(_lower_bound, _upper_bound) covers all not yet emitted positions from current range
+        //  - if _next_row has valid iterators:
+        //    - _next_row points to the nearest row in cache >= _lower_bound
+        //    - _next_row_in_range = _next_row.position() < _upper_bound
+        //  - if _next_row doesn't have valid iterators, it has no meaning.
+        reading_from_cache,
+
+        // Starts reading from underlying reader.
+        // The range to read is position_range(_lower_bound, min(_next_row.position(), _upper_bound)).
+        // Invariants:
+        //  - _next_row_in_range = _next_row.position() < _upper_bound
+        move_to_underlying,
+
+        // Invariants:
+        // - Upper bound of the read is min(_next_row.position(), _upper_bound)
+        // - _next_row_in_range = _next_row.position() < _upper_bound
+        // - _last_row points at a direct predecessor of the next row which is going to be read.
+        //   Used for populating continuity.
+        // - _population_range_starts_before_all_rows is set accordingly
+        reading_from_underlying,
+
+        end_of_stream // Set by finish_reader().
+    };
+    lw_shared_ptr<partition_snapshot> _snp; // Snapshot of the partition being read.
+    position_in_partition::tri_compare _position_cmp;
+
+    query::clustering_key_filter_ranges _ck_ranges; // Clustering ranges requested by the query.
+    query::clustering_row_ranges::const_iterator _ck_ranges_curr; // Range currently being processed.
+    query::clustering_row_ranges::const_iterator _ck_ranges_end;
+
+    lsa_manager _lsa_manager; // Used to run cache accesses inside a read section.
+
+    partition_snapshot_row_weakref _last_row;
+
+    // We need to be prepared that we may get overlapping and out of order
+    // range tombstones. We must emit fragments with strictly monotonic positions,
+    // so we can't just trim such tombstones to the position of the last fragment.
+    // To solve that, range tombstones are accumulated first in a range_tombstone_stream
+    // and emitted once we have a fragment with a larger position.
+    range_tombstone_stream _tombstones;
+
+    // Holds the lower bound of a position range which hasn't been processed yet.
+    // Only fragments with positions < _lower_bound have been emitted.
+    //
+    // It is assumed that !_lower_bound.is_clustering_row(). We depend on this when
+    // calling range_tombstone::trim_front() and when inserting dummy entries. Dummy
+    // entries are assumed to be only at !is_clustering_row() positions.
+    position_in_partition _lower_bound;
+    position_in_partition_view _upper_bound;
+
+    state _state = state::before_static_row;
+    lw_shared_ptr<read_context> _read_context; // Gives access to the cache statistics and the underlying reader.
+    partition_snapshot_row_cursor _next_row; // Cursor over the rows of _snp.
+    bool _next_row_in_range = false;
+
+    // True iff current population interval, since the previous clustering row, starts before all clustered rows.
+    // We cannot just look at _lower_bound, because emission of range tombstones changes _lower_bound and
+    // because we mark clustering intervals as continuous when consuming a clustering_row, it would prevent
+    // us from marking the interval as continuous.
+    // Valid when _state == reading_from_underlying.
+    bool _population_range_starts_before_all_rows;
+
+    future<> do_fill_buffer(); // One step of fill_buffer(), dispatched on _state.
+    void copy_from_cache_to_buffer();
+    future<> process_static_row(); // Emits the static row, from cache or from the underlying reader.
+    void move_to_end();
+    void move_to_next_range();
+    void move_to_range(query::clustering_row_ranges::const_iterator);
+    void move_to_next_entry();
+    // Emits all delayed range tombstones with positions smaller than upper_bound.
+    void drain_tombstones(position_in_partition_view upper_bound);
+    // Emits all delayed range tombstones.
+    void drain_tombstones();
+    void add_to_buffer(const partition_snapshot_row_cursor&);
+    void add_clustering_row_to_buffer(mutation_fragment&&);
+    void add_to_buffer(range_tombstone&&);
+    void add_to_buffer(mutation_fragment&&);
+    future<> read_from_underlying();
+    void start_reading_from_underlying();
+    bool after_current_range(position_in_partition_view position);
+    bool can_populate() const;
+    void maybe_update_continuity();
+    void maybe_add_to_cache(const mutation_fragment& mf);
+    void maybe_add_to_cache(const clustering_row& cr);
+    void maybe_add_to_cache(const range_tombstone& rt);
+    void maybe_add_to_cache(const static_row& sr);
+    void maybe_set_static_row_continuous();
+    void finish_reader() { // Emits partition_end and marks both the stream and the state machine as finished.
+        push_mutation_fragment(partition_end());
+        _end_of_stream = true;
+        _state = state::end_of_stream;
+    }
+public:
+    cache_flat_mutation_reader(schema_ptr s, // Sets up the reader and immediately buffers the partition_start fragment.
+                               dht::decorated_key dk,
+                               query::clustering_key_filter_ranges&& crr,
+                               lw_shared_ptr<read_context> ctx,
+                               lw_shared_ptr<partition_snapshot> snp,
+                               row_cache& cache)
+        : flat_mutation_reader::impl(std::move(s))
+        , _snp(std::move(snp))
+        , _position_cmp(*_schema)
+        , _ck_ranges(std::move(crr))
+        , _ck_ranges_curr(_ck_ranges.begin())
+        , _ck_ranges_end(_ck_ranges.end())
+        , _lsa_manager(cache)
+        , _tombstones(*_schema)
+        , _lower_bound(position_in_partition::before_all_clustered_rows())
+        , _upper_bound(position_in_partition_view::before_all_clustered_rows())
+        , _read_context(std::move(ctx))
+        , _next_row(*_schema, *_snp)
+    {
+        clogger.trace("csm {}: table={}.{}", this, _schema->ks_name(), _schema->cf_name());
+        push_mutation_fragment(partition_start(std::move(dk), _snp->partition_tombstone()));
+    }
+    cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
+    cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
+    virtual future<> fill_buffer() override;
+    virtual ~cache_flat_mutation_reader() {
+        maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
+    }
+    virtual void next_partition() override { // This reader serves a single partition.
+        clear_buffer_to_next_partition();
+        if (is_buffer_empty()) {
+            _end_of_stream = true;
+        }
+    }
+    virtual future<> fast_forward_to(const dht::partition_range&) override { // Drops buffered fragments and ends the stream.
+        clear_buffer();
+        _end_of_stream = true;
+        return make_ready_future<>();
+    }
+    virtual future<> fast_forward_to(position_range pr) override { // Not supported by this reader: always throws.
+        throw std::bad_function_call();
+    }
+};
+
+inline // Emits the static row: from the snapshot when it is marked continuous, otherwise from the read context.
+future<> cache_flat_mutation_reader::process_static_row() {
+    if (_snp->version()->partition().static_row_continuous()) {
+        _read_context->cache().on_row_hit(); // Counted as a cache hit.
+        row sr = _lsa_manager.run_in_read_section([this] {
+            return _snp->static_row();
+        });
+        if (!sr.empty()) { // An empty static row produces no fragment.
+            push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
+        }
+        return make_ready_future<>();
+    } else {
+        _read_context->cache().on_row_miss(); // Counted as a cache miss.
+        return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
+            if (sr) {
+                assert(sr->is_static_row());
+                maybe_add_to_cache(sr->as_static_row()); // Offered to the cache before the fragment is moved from.
+                push_mutation_fragment(std::move(*sr));
+            }
+            maybe_set_static_row_continuous(); // Called whether or not a static row was returned.
+        });
+    }
+}
+
+inline // Handles the static row on the first call, then runs do_fill_buffer() until the buffer is full or the stream ends.
+future<> cache_flat_mutation_reader::fill_buffer() {
+    if (_state == state::before_static_row) {
+        auto after_static_row = [this] {
+            if (_ck_ranges_curr == _ck_ranges_end) { // No clustering ranges were requested: the partition is finished.
+                finish_reader();
+                return make_ready_future<>();
+            }
+            _state = state::reading_from_cache;
+            _lsa_manager.run_in_read_section([this] {
+                move_to_range(_ck_ranges_curr);
+            });
+            return fill_buffer(); // Re-enters with the new state, so the branch above is skipped.
+        };
+        if (_schema->has_static_columns()) {
+            return process_static_row().then(std::move(after_static_row));
+        } else {
+            return after_static_row(); // No static columns in the schema: skip the static row step.
+        }
+    }
+    clogger.trace("csm {}: fill_buffer(), range={}, lb={}", this, *_ck_ranges_curr, _lower_bound);
+    return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
+        return do_fill_buffer();
+    });
+}
+
+inline // One step of the state machine: switch to the underlying reader, keep reading from it, or copy rows from cache.
+future<> cache_flat_mutation_reader::do_fill_buffer() {
+    if (_state == state::move_to_underlying) {
+        _state = state::reading_from_underlying;
+        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
+        auto end = _next_row_in_range ? position_in_partition(_next_row.position()) // Stop at the next cached row...
+                                      : position_in_partition(_upper_bound); // ...or at the end of the current range.
+        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}).then([this] {
+            return read_from_underlying();
+        });
+    }
+    if (_state == state::reading_from_underlying) {
+        return read_from_underlying();
+    }
+    // assert(_state == state::reading_from_cache)
+    return _lsa_manager.run_in_read_section([this] {
+        auto next_valid = _next_row.iterators_valid();
+        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}", this, _lower_bound,
+            _upper_bound, _next_row.position(), next_valid);
+        // We assume that if there was eviction, and thus the range may
+        // no longer be continuous, the cursor was invalidated.
+        if (!next_valid) {
+            auto adjacent = _next_row.advance_to(_lower_bound); // Reposition the cursor at _lower_bound.
+            _next_row_in_range = !after_current_range(_next_row.position());
+            if (!adjacent && !_next_row.continuous()) {
+                _last_row = nullptr; // We could insert a dummy here, but this path is unlikely.
+                start_reading_from_underlying();
+                return make_ready_future<>();
+            }
+        }
+        _next_row.maybe_refresh();
+        clogger.trace("csm {}: next={}, cont={}", this, _next_row.position(), _next_row.continuous());
+        while (!is_buffer_full() && _state == state::reading_from_cache) {
+            copy_from_cache_to_buffer();
+            if (need_preempt()) { // Leave the loop; the do_until() loop in fill_buffer() calls this function again.
+                break;
+            }
+        }
+        return make_ready_future<>();
+    });
+}
+
+inline // Moves fragments from the underlying reader to the buffer, populating the cache when can_populate() allows.
+future<> cache_flat_mutation_reader::read_from_underlying() {
+    return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+        [this] { return _state != state::reading_from_underlying || is_buffer_full(); }, // stop condition
+        [this] (mutation_fragment mf) { // per-fragment consumer
+            _read_context->cache().on_row_miss();
+            maybe_add_to_cache(mf);
+            add_to_buffer(std::move(mf));
+        },
+        [this] { // NOTE(review): presumably called when the underlying reader has no more fragments in the forwarded range - confirm
+            _state = state::reading_from_cache;
+            _lsa_manager.run_in_update_section([this] {
+                auto same_pos = _next_row.maybe_refresh();
+                if (!same_pos) { // cursor no longer at the same position: do not mark anything continuous
+                    _read_context->cache().on_mispopulate(); // FIXME: Insert dummy entry at _upper_bound.
+                    _next_row_in_range = !after_current_range(_next_row.position());
+                    if (!_next_row.continuous()) {
+                        start_reading_from_underlying();
+                    }
+                    return;
+                }
+                if (_next_row_in_range) { // the cached row at the cursor is inside the range: emit it and advance
+                    maybe_update_continuity();
+                    _last_row = _next_row;
+                    add_to_buffer(_next_row);
+                    try {
+                        move_to_next_entry();
+                    } catch (const std::bad_alloc&) {
+                        // We cannot reenter the section, since we may have moved to the new range, and
+                        // because add_to_buffer() should not be repeated.
+                        _snp->region().allocator().invalidate_references(); // Invalidates _next_row
+                    }
+                } else { // the cursor is at or past _upper_bound: finish this range
+                    if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
+                        this->maybe_update_continuity();
+                    } else if (can_populate()) {
+                        rows_entry::compare less(*_schema);
+                        auto& rows = _snp->version()->partition().clustered_rows();
+                        if (query::is_single_row(*_schema, *_ck_ranges_curr)) { // single-key range: insert an entry for that key
+                            with_allocator(_snp->region().allocator(), [&] {
+                                auto e = alloc_strategy_unique_ptr<rows_entry>(
+                                    current_allocator().construct<rows_entry>(_ck_ranges_curr->start()->value()));
+                                // Use _next_row iterator only as a hint, because there could be insertions after _upper_bound.
+                                auto insert_result = rows.insert_check(_next_row.get_iterator_in_latest_version(), *e, less);
+                                auto inserted = insert_result.second;
+                                auto it = insert_result.first;
+                                if (inserted) {
+                                    e.release(); // entry is now linked into `rows`; the unique_ptr must not free it
+                                    auto next = std::next(it);
+                                    it->set_continuous(next->continuous()); // inherit the continuity flag of the following entry
+                                    clogger.trace("csm {}: inserted dummy at {}, cont={}", this, it->position(), it->continuous());
+                                }
+                            });
+                        } else if (!_ck_ranges_curr->start() || _last_row.refresh(*_snp)) {
+                            with_allocator(_snp->region().allocator(), [&] {
+                                auto e = alloc_strategy_unique_ptr<rows_entry>(
+                                    current_allocator().construct<rows_entry>(*_schema, _upper_bound, is_dummy::yes, is_continuous::yes));
+                                // Use _next_row iterator only as a hint, because there could be insertions after _upper_bound.
+                                auto insert_result = rows.insert_check(_next_row.get_iterator_in_latest_version(), *e, less);
+                                auto inserted = insert_result.second;
+                                if (inserted) {
+                                    clogger.trace("csm {}: inserted dummy at {}", this, _upper_bound);
+                                    e.release(); // entry is now linked into `rows`
+                                } else { // an entry already exists at _upper_bound: just flag it continuous
+                                    clogger.trace("csm {}: mark {} as continuous", this, insert_result.first->position());
+                                    insert_result.first->set_continuous(true);
+                                }
+                            });
+                        }
+                    } else {
+                        _read_context->cache().on_mispopulate();
+                    }
+                    try {
+                        move_to_next_range();
+                    } catch (const std::bad_alloc&) {
+                        // We cannot reenter the section, since we may have moved to the new range
+                        _snp->region().allocator().invalidate_references(); // Invalidates _next_row
+                    }
+                }
+            });
+            return make_ready_future<>();
+        });
+}
+
+inline // Marks the entry at _next_row continuous when population is allowed; otherwise counts a mispopulate.
+void cache_flat_mutation_reader::maybe_update_continuity() {
+    if (can_populate() && (_population_range_starts_before_all_rows || _last_row.refresh(*_snp))) {
+            if (_next_row.is_in_latest_version()) { // the entry lives in the latest version: flag it directly
+                clogger.trace("csm {}: mark {} continuous", this, _next_row.get_iterator_in_latest_version()->position());
+                _next_row.get_iterator_in_latest_version()->set_continuous(true);
+            } else {
+                // Cover entry from older version
+                with_allocator(_snp->region().allocator(), [&] {
+                    auto& rows = _snp->version()->partition().clustered_rows();
+                    rows_entry::compare less(*_schema);
+                    auto e = alloc_strategy_unique_ptr<rows_entry>(
+                        current_allocator().construct<rows_entry>(*_schema, _next_row.position(), is_dummy(_next_row.dummy()), is_continuous::yes));
+                    auto insert_result = rows.insert_check(_next_row.get_iterator_in_latest_version(), *e, less);
+                    auto inserted = insert_result.second;
+                    if (inserted) { // if not inserted, `e` is freed when the unique_ptr goes out of scope
+                        clogger.trace("csm {}: inserted dummy at {}", this, e->position());
+                        e.release(); // entry is now linked into `rows`
+                    }
+                });
+            }
+    } else {
+        _read_context->cache().on_mispopulate();
+    }
+}
+
+inline // Routes a fragment to the matching maybe_add_to_cache() overload (range tombstone or clustering row).
+void cache_flat_mutation_reader::maybe_add_to_cache(const mutation_fragment& mf) {
+    if (mf.is_range_tombstone()) {
+        maybe_add_to_cache(mf.as_range_tombstone());
+        return;
+    }
+    assert(mf.is_clustering_row()); // no other fragment kind is handled here
+    const clustering_row& clustered = mf.as_clustering_row();
+    maybe_add_to_cache(clustered);
+}
+
+inline // Inserts a copy of `cr` into the latest partition version, or counts a mispopulate if that is not allowed.
+void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
+    if (!can_populate()) { // drop population state so later continuity checks do not rely on it
+        _last_row = nullptr;
+        _population_range_starts_before_all_rows = false;
+        _read_context->cache().on_mispopulate();
+        return;
+    }
+    clogger.trace("csm {}: populate({})", this, cr);
+    _lsa_manager.run_in_update_section_with_allocator([this, &cr] {
+        mutation_partition& mp = _snp->version()->partition();
+        rows_entry::compare less(*_schema);
+
+        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
+            current_allocator().construct<rows_entry>(cr.key(), cr.tomb(), cr.marker(), cr.cells()));
+        new_entry->set_continuous(false); // start non-continuous; the flag is decided below
+        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version() // cursor position as insertion hint
+                                              : mp.clustered_rows().lower_bound(cr.key(), less); // otherwise look the hint up
+        auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
+        if (insert_result.second) { // inserted: the tree now references the entry
+            _read_context->cache().on_row_insert();
+            new_entry.release();
+        }
+        it = insert_result.first; // the inserted entry, or the one already present for this key
+
+        rows_entry& e = *it;
+        if (!_ck_ranges_curr->start() || _last_row.refresh(*_snp)) { // range has no start bound, or the previous row is still reachable
+            clogger.trace("csm {}: set_continuous({})", this, e.position());
+            e.set_continuous(true);
+        } else {
+            _read_context->cache().on_mispopulate();
+        }
+        with_allocator(standard_allocator(), [&] { // the weak reference is created under the standard allocator
+            _last_row = partition_snapshot_row_weakref(*_snp, it);
+        });
+        _population_range_starts_before_all_rows = false;
+    });
+}
+
+inline // True when `p` does not compare below _upper_bound, i.e. it is at or past the end of the current range.
+bool cache_flat_mutation_reader::after_current_range(position_in_partition_view p) {
+    return _position_cmp(p, _upper_bound) >= 0;
+}
+
+inline // Requests a switch to the underlying reader; do_fill_buffer() acts on state::move_to_underlying.
+void cache_flat_mutation_reader::start_reading_from_underlying() {
+    clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", this, _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound);
+    _state = state::move_to_underlying; // only the state changes here; no I/O is started
+}
+
+inline // Emits range tombstones up to the cursor (or range end), then the row at the cursor, and advances.
+void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
+    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", this, _next_row.position(), _next_row_in_range);
+    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
+    for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
+        add_to_buffer(std::move(rts));
+        if (is_buffer_full()) { // stop early; the cursor is not advanced on this path
+            return;
+        }
+    }
+    if (_next_row_in_range) {
+        _last_row = _next_row;
+        add_to_buffer(_next_row); // dummies are skipped inside add_to_buffer()
+        move_to_next_entry();
+    } else {
+        move_to_next_range(); // nothing more cached for this range
+    }
+}
+
+inline // Flushes all pending range tombstones and finishes the reader.
+void cache_flat_mutation_reader::move_to_end() {
+    drain_tombstones(); // unbounded overload: emits everything still queued in _tombstones
+    finish_reader();
+    clogger.trace("csm {}: eos", this);
+}
+
+inline // Advances to the next clustering range, or ends the stream when the current one is the last.
+void cache_flat_mutation_reader::move_to_next_range() {
+    auto following = std::next(_ck_ranges_curr);
+    if (following != _ck_ranges_end) {
+        move_to_range(following);
+        return;
+    }
+    move_to_end();
+    _ck_ranges_curr = following; // updated only after move_to_end() has returned
+}
+
+inline // Makes `next_it` the current range: resets bounds, re-positions the cursor, and detects a leading gap.
+void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::const_iterator next_it) {
+    auto lb = position_in_partition::for_range_start(*next_it);
+    auto ub = position_in_partition_view::for_range_end(*next_it);
+    _last_row = nullptr; // the previous row belongs to the old range
+    _lower_bound = std::move(lb);
+    _upper_bound = std::move(ub);
+    _ck_ranges_curr = next_it;
+    auto adjacent = _next_row.advance_to(_lower_bound);
+    _next_row_in_range = !after_current_range(_next_row.position());
+    clogger.trace("csm {}: move_to_range(), range={}, lb={}, ub={}, next={}", this, *_ck_ranges_curr, _lower_bound, _upper_bound, _next_row.position());
+    if (!adjacent && !_next_row.continuous()) { // gap between _lower_bound and the cursor: read it from underlying
+        // FIXME: We don't insert a dummy for singular range to avoid allocating 3 entries
+        // for a hit (before, at and after). If we supported the concept of an incomplete row,
+        // we could insert such a row for the lower bound if it's full instead, for both singular and
+        // non-singular ranges.
+        if (_ck_ranges_curr->start() && !query::is_single_row(*_schema, *_ck_ranges_curr)) {
+            // Insert dummy for lower bound
+            if (can_populate()) {
+                // FIXME: _lower_bound could be adjacent to the previous row, in which case we could skip this
+                clogger.trace("csm {}: insert dummy at {}", this, _lower_bound);
+                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
+                    auto& rows = _snp->version()->partition().clustered_rows();
+                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
+                    return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry); // NOTE(review): raw pointer, unlike other call sites - assumes insert_before() cannot throw
+                });
+                _last_row = partition_snapshot_row_weakref(*_snp, it); // the dummy becomes the "previous row" for continuity
+            } else {
+                _read_context->cache().on_mispopulate();
+            }
+        }
+        start_reading_from_underlying();
+    }
+}
+
+// Advances the cursor past the current row. _next_row must be inside the range.
+inline
+void cache_flat_mutation_reader::move_to_next_entry() {
+    clogger.trace("csm {}: move_to_next_entry(), curr={}", this, _next_row.position());
+    if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) { // nothing can follow within this range
+        move_to_next_range();
+    } else {
+        if (!_next_row.next()) { // cursor reports no further entry: end the stream
+            move_to_end();
+            return;
+        }
+        _next_row_in_range = !after_current_range(_next_row.position());
+        clogger.trace("csm {}: next={}, cont={}, in_range={}", this, _next_row.position(), _next_row.continuous(), _next_row_in_range);
+        if (!_next_row.continuous()) { // gap before the new cursor position
+            start_reading_from_underlying();
+        }
+    }
+}
+
+inline // Pushes every fragment that _tombstones.get_next(pos) yields until it returns an empty optional.
+void cache_flat_mutation_reader::drain_tombstones(position_in_partition_view pos) {
+    for (;;) {
+        reserve_one(); // called before every fetch, including the last (empty) one
+        auto pending = _tombstones.get_next(pos);
+        if (!pending) {
+            return;
+        }
+        push_mutation_fragment(std::move(*pending));
+    }
+}
+
+inline // Pushes every fragment that _tombstones.get_next() yields until it returns an empty optional.
+void cache_flat_mutation_reader::drain_tombstones() {
+    for (;;) {
+        reserve_one(); // called before every fetch, including the last (empty) one
+        auto pending = _tombstones.get_next();
+        if (!pending) {
+            return;
+        }
+        push_mutation_fragment(std::move(*pending));
+    }
+}
+
+inline // Routes a fragment to the clustering-row or range-tombstone buffering path.
+void cache_flat_mutation_reader::add_to_buffer(mutation_fragment&& mf) {
+    clogger.trace("csm {}: add_to_buffer({})", this, mf);
+    if (!mf.is_clustering_row()) {
+        assert(mf.is_range_tombstone()); // no other fragment kind is handled here
+        add_to_buffer(std::move(mf).as_range_tombstone());
+        return;
+    }
+    add_clustering_row_to_buffer(std::move(mf));
+}
+
+inline // Emits the row under the cursor as a cache hit; dummy entries produce nothing.
+void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_cursor& row) {
+    if (!row.dummy()) {
+        _read_context->cache().on_row_hit();
+        add_clustering_row_to_buffer(row.row());
+    }
+}
+
+// Pushes a clustering row, first draining tombstones up to its position. Maintains these invariants, also on exception:
+//   (1) no fragment with position >= _lower_bound was pushed yet
+//   (2) If _lower_bound > mf.position(), mf was emitted
+inline
+void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&& mf) {
+    clogger.trace("csm {}: add_clustering_row_to_buffer({})", this, mf);
+    auto& row = mf.as_clustering_row();
+    auto key = row.key(); // copied up front because `mf` is moved from below
+    try {
+        drain_tombstones(row.position());
+        push_mutation_fragment(std::move(mf));
+        _lower_bound = position_in_partition::after_key(std::move(key)); // row emitted: continue after it
+    } catch (...) {
+        // We may have emitted some of the range tombstones which start after the old _lower_bound
+        _lower_bound = position_in_partition::for_key(std::move(key)); // row not emitted: resume at the row itself
+        throw;
+    }
+}
+
+inline // Trims `rt` to start at _lower_bound, queues it in _tombstones, and emits what is due.
+void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
+    clogger.trace("csm {}: add_to_buffer({})", this, rt);
+    // This guarantees that rt starts after any emitted clustering_row
+    if (!rt.trim_front(*_schema, _lower_bound)) { // trim_front() returned false: drop the tombstone
+        return;
+    }
+    _lower_bound = position_in_partition(rt.position()); // advance the lower bound to the (trimmed) tombstone start
+    _tombstones.apply(std::move(rt));
+    drain_tombstones(_lower_bound);
+}
+
+inline // Applies `rt` to the snapshot's row tombstones when population is allowed; otherwise counts a mispopulate.
+void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
+    if (!can_populate()) {
+        _read_context->cache().on_mispopulate();
+        return;
+    }
+    clogger.trace("csm {}: maybe_add_to_cache({})", this, rt);
+    _lsa_manager.run_in_update_section_with_allocator([&] {
+        _snp->version()->partition().row_tombstones().apply_monotonically(*_schema, rt);
+    });
+}
+
+inline // Merges the cells of `sr` into the snapshot's static row when population is allowed; otherwise counts a mispopulate.
+void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
+    if (!can_populate()) {
+        _read_context->cache().on_mispopulate();
+        return;
+    }
+    clogger.trace("csm {}: populate({})", this, sr);
+    _read_context->cache().on_row_insert(); // counted before the update section runs
+    _lsa_manager.run_in_update_section_with_allocator([&] {
+        _snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
+    });
+}
+
+inline // Flags the partition's static row as continuous when population is allowed; otherwise counts a mispopulate.
+void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
+    if (!can_populate()) {
+        _read_context->cache().on_mispopulate();
+        return;
+    }
+    clogger.trace("csm {}: set static row continuous", this);
+    _snp->version()->partition().set_static_row_continuous(true);
+}
+
+inline // Population requires the snapshot to be at the latest version and the key's cache phase to match the read's phase.
+bool cache_flat_mutation_reader::can_populate() const {
+    return _snp->at_latest_version() && _read_context->cache().phase_of(_read_context->key()) == _read_context->phase();
+}
+
+} // namespace cache
+
+inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s, // factory wrapping cache::cache_flat_mutation_reader
+                                                            dht::decorated_key dk,
+                                                            query::clustering_key_filter_ranges crr,
+                                                            row_cache& cache,
+                                                            lw_shared_ptr<cache::read_context> ctx,
+                                                            lw_shared_ptr<partition_snapshot> snp)
+{
+    return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
+        std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache); // note: `cache` is forwarded last, unlike the parameter order
+}
--- a/caching_options.hh
+++ b/caching_options.hh
@@ -24,6 +24,7 @@
 #include <boost/lexical_cast.hpp>
 #include "exceptions/exceptions.hh"
 #include "json.hh"
+#include "seastarx.hh"

 class schema;

@@ -58,30 +59,34 @@ class caching_options {
    caching_options() : _key_cache(default_key), _row_cache(default_row) {}
 public:

-    sstring to_sstring() const {
-        return json::to_json(std::map<sstring, sstring>({{ "keys", _key_cache }, { "rows_per_partition", _row_cache }}));
+    std::map<sstring, sstring> to_map() const {
+        return {{ "keys", _key_cache }, { "rows_per_partition", _row_cache }};
    }

-    static caching_options from_sstring(const sstring& str) {
-        auto map = json::to_map(str);
-        if (map.size() > 2) {
-            throw exceptions::configuration_exception("Invalid map: " + str); 
-        }
-        sstring k;
-        sstring r;
-        if (map.count("keys")) {
-            k = map.at("keys");
-        } else {
-            k = default_key;
-        }
+    sstring to_sstring() const {
+        return json::to_json(to_map());
+    }

-        if (map.count("rows_per_partition")) {
-            r = map.at("rows_per_partition");
-        } else {
-            r = default_row;
+    template<typename Map>
+    static caching_options from_map(const Map & map) {
+        sstring k = default_key;
+        sstring r = default_row;
+
+        for (auto& p : map) {
+            if (p.first == "keys") {
+                k = p.second;
+            } else if (p.first == "rows_per_partition") {
+                r = p.second;
+            } else {
+                throw exceptions::configuration_exception("Invalid caching option: " + p.first);
+            }
        }
        return caching_options(k, r);
    }
+    static caching_options from_sstring(const sstring& str) {
+        return from_map(json::to_map(str));
+    }
+
    bool operator==(const caching_options& other) const {
        return _key_cache == other._key_cache && _row_cache == other._row_cache;
    }
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -22,6 +22,7 @@
 #include "canonical_mutation.hh"
 #include "mutation.hh"
 #include "mutation_partition_serializer.hh"
+#include "counters.hh"
 #include "converting_mutation_partition_applier.hh"
 #include "hashing_partition_visitor.hh"
 #include "utils/UUID.hh"
@@ -44,7 +45,7 @@ canonical_mutation::canonical_mutation(const mutation& m)
    mutation_partition_serializer part_ser(*m.schema(), m.partition());

    bytes_ostream out;
-    ser::writer_of_canonical_mutation wr(out);
+    ser::writer_of_canonical_mutation<bytes_ostream> wr(out);
    std::move(wr).write_table_id(m.schema()->id())
                 .write_schema_version(m.schema()->version())
                 .write_key(m.key())
--- a/cell_locking.hh
+++ b/cell_locking.hh
@@ -0,0 +1,566 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <boost/intrusive/unordered_set.hpp>
+
+#if __has_include(<boost/container/small_vector.hpp>)
+
+#include <boost/container/small_vector.hpp>
+
+template <typename T, size_t N>
+using small_vector = boost::container::small_vector<T, N>;
+
+#else
+
+#include <vector>
+template <typename T, size_t N>
+using small_vector = std::vector<T>;
+
+#endif
+
+#include "fnv1a_hasher.hh"
+#include "streamed_mutation.hh"
+#include "mutation_partition.hh"
+
+class cells_range { // A position in a partition together with the ids of the columns present in one row at that position.
+    using ids_vector_type = small_vector<column_id, 5>;
+
+    position_in_partition_view _position;
+    ids_vector_type _ids;
+public:
+    using iterator = ids_vector_type::iterator;
+    using const_iterator = ids_vector_type::const_iterator;
+
+    cells_range() // default: static-row position with no column ids
+        : _position(position_in_partition_view(position_in_partition_view::static_row_tag_t())) { }
+
+    explicit cells_range(position_in_partition_view pos, const row& cells) // collects the id of every cell in `cells`
+        : _position(pos)
+    {
+        _ids.reserve(cells.size());
+        cells.for_each_cell([this] (auto id, auto&&) { // the cell value is ignored; only the id is kept
+            _ids.emplace_back(id);
+        });
+    }
+
+    position_in_partition_view position() const { return _position; }
+    bool empty() const { return _ids.empty(); }
+
+    auto begin() const { return _ids.begin(); }
+    auto end() const { return _ids.end(); }
+};
+
+class partition_cells_range { // Iterable view of a mutation_partition as cells_range items: static row first, then each clustered row.
+    const mutation_partition& _mp;
+public:
+    class iterator {
+        const mutation_partition& _mp;
+        stdx::optional<mutation_partition::rows_type::const_iterator> _position; // disengaged while pointing at the static row
+        cells_range _current;
+    public:
+        explicit iterator(const mutation_partition& mp) // begin iterator: starts at the static row
+            : _mp(mp)
+            , _current(position_in_partition_view(position_in_partition_view::static_row_tag_t()), mp.static_row())
+        { }
+
+        iterator(const mutation_partition& mp, mutation_partition::rows_type::const_iterator it) // used by end(); _current stays default-constructed
+            : _mp(mp)
+            , _position(it)
+        { }
+
+        iterator& operator++() {
+            if (!_position) { // leaving the static row: move to the first clustered row
+                _position = _mp.clustered_rows().begin();
+            } else {
+                ++(*_position);
+            }
+            if (_position != _mp.clustered_rows().end()) { // at end() _current keeps its previous value
+                auto it = *_position;
+                _current = cells_range(position_in_partition_view(position_in_partition_view::clustering_row_tag_t(), it->key()),
+                        it->row().cells());
+            }
+            return *this;
+        }
+
+        iterator operator++(int) {
+            iterator it(*this);
+            operator++();
+            return it;
+        }
+
+        cells_range& operator*() {
+            return _current;
+        }
+
+        cells_range* operator->() {
+            return &_current;
+        }
+
+        bool operator==(const iterator& other) const { // compares positions only, not _mp or _current
+            return _position == other._position;
+        }
+        bool operator!=(const iterator& other) const {
+            return !(*this == other);
+        }
+    };
+public:
+    explicit partition_cells_range(const mutation_partition& mp) : _mp(mp) { } // keeps a reference; `mp` must outlive this object
+
+    iterator begin() const {
+        return iterator(_mp);
+    }
+    iterator end() const {
+        return iterator(_mp, _mp.clustered_rows().end());
+    }
+};
+
+class locked_cell;
+
+struct cell_locker_stats { // Counters for cell_locker; NOTE(review): who updates them is not visible in this chunk - confirm
+    uint64_t lock_acquisitions = 0; // presumably the total number of locks taken - verify against the updater
+    uint64_t operations_waiting_for_lock = 0; // presumably the number of operations currently blocked - verify
+};
+
+class cell_locker {
+public:
+    using timeout_clock = lowres_clock;
+private:
+    using semaphore_type = basic_semaphore<default_timeout_exception_factory, timeout_clock>;
+
+    class partition_entry;
+
+    struct cell_address { // Identifies one cell within a partition: row position plus column id.
+        position_in_partition position;
+        column_id id;
+    };
+
+    class cell_entry : public bi::unordered_set_base_hook<bi::link_mode<bi::auto_unlink>>, // per-cell lock record stored in its partition_entry's set
+                       public enable_lw_shared_from_this<cell_entry> {
+        partition_entry& _parent;
+        cell_address _address;
+        semaphore_type _semaphore { 0 }; // starts with zero units, so the first lock() waits until unlock() signals
+
+        friend class cell_locker;
+    public:
+        cell_entry(partition_entry& parent, position_in_partition position, column_id id)
+            : _parent(parent)
+            , _address { std::move(position), id }
+        { }
+
+        // Upgrades cell_entry to another schema.
+        // Changes the value of cell_address, so cell_entry has to be
+        // temporarily removed from its parent partition_entry.
+        // Returns true if the cell_entry still exists in the new schema and
+        // should be reinserted.
+        bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
+            auto& old_column_mapping = from.get_column_mapping();
+            auto& column = old_column_mapping.column_at(kind, _address.id);
+            auto cdef = to.get_column_definition(column.name()); // look the column up by name in the new schema
+            if (!cdef) { // column is absent from the new schema
+                return false;
+            }
+            _address.id = cdef->id;
+            return true;
+        }
+
+        const position_in_partition& position() const {
+            return _address.position;
+        }
+
+        future<> lock(timeout_clock::time_point _timeout) { // waits on the semaphore until `_timeout`
+            return _semaphore.wait(_timeout);
+        }
+        void unlock() {
+            _semaphore.signal();
+        }
+
+        ~cell_entry() {
+            if (!is_linked()) { // already removed from the parent's set: nothing to account for
+                return;
+            }
+            unlink();
+            if (!--_parent._cell_count) { // last cell gone: the parent partition_entry is deleted here
+                delete &_parent;
+            }
+        }
+
+        class hasher {
+            const schema* _schema; // pointer instead of reference for default assignment
+        public:
+            explicit hasher(const schema& s) : _schema(&s) { }
+
+            size_t operator()(const cell_address& ca) const { // hashes position and column id together
+                fnv1a_hasher hasher;
+                ca.position.feed_hash(hasher, *_schema);
+                ::feed_hash(hasher, ca.id);
+                return hasher.finalize();
+            }
+            size_t operator()(const cell_entry& ce) const {
+                return operator()(ce._address);
+            }
+        };
+
+        class equal_compare {
+            position_in_partition::equal_compare _cmp;
+        private:
+            bool do_compare(const cell_address& a, const cell_address& b) const { // column id is compared before the position
+                return a.id == b.id && _cmp(a.position, b.position);
+            }
+        public:
+            explicit equal_compare(const schema& s) : _cmp(s) { }
+            bool operator()(const cell_address& ca, const cell_entry& ce) const { // mixed overloads allow comparing an address with an entry
+                return do_compare(ca, ce._address);
+            }
+            bool operator()(const cell_entry& ce, const cell_address& ca) const {
+                return do_compare(ca, ce._address);
+            }
+            bool operator()(const cell_entry& a, const cell_entry& b) const {
+                return do_compare(a._address, b._address);
+            }
+        };
+    };
+
+    class partition_entry : public bi::unordered_set_base_hook<bi::link_mode<bi::auto_unlink>> {
+        using cells_type = bi::unordered_set<cell_entry,
+                                             bi::equal<cell_entry::equal_compare>,
+                                             bi::hash<cell_entry::hasher>,
+                                             bi::constant_time_size<false>>;
+
+        static constexpr size_t initial_bucket_count = 16;
+        using max_load_factor = std::ratio<3, 4>;
+        dht::decorated_key _key;
+        cell_locker& _parent;
+        size_t _rehash_at_size = compute_rehash_at_size(initial_bucket_count);
+        std::unique_ptr<cells_type::bucket_type[]> _buckets; // TODO: start with internal storage?
+        size_t _cell_count = 0; // cells_type::empty() is not O(1) if the hook is auto-unlink
+        cells_type::bucket_type _internal_buckets[initial_bucket_count];
+        cells_type _cells;
+        schema_ptr _schema;
+
+        friend class cell_entry;
+    private:
+        static constexpr size_t compute_rehash_at_size(size_t bucket_count) {
+            return bucket_count * max_load_factor::num / max_load_factor::den;
+        }
+        void maybe_rehash() {
+            if (_cell_count >= _rehash_at_size) {
+                auto new_bucket_count = std::min(_cells.bucket_count() * 2, _cells.bucket_count() + 1024);
+                auto buckets = std::make_unique<cells_type::bucket_type[]>(new_bucket_count);
+
+                _cells.rehash(cells_type::bucket_traits(buckets.get(), new_bucket_count));
+                _buckets = std::move(buckets);
+
+                _rehash_at_size = compute_rehash_at_size(new_bucket_count);
+            }
+        }
+    public:
+        partition_entry(schema_ptr s, cell_locker& parent, const dht::decorated_key& dk)
+            : _key(dk)
+            , _parent(parent)
+            , _cells(cells_type::bucket_traits(_internal_buckets, initial_bucket_count),
+                     cell_entry::hasher(*s), cell_entry::equal_compare(*s))
+            , _schema(s)
+        { }
+
+        ~partition_entry() {
+            if (is_linked()) {
+                _parent._partition_count--;
+            }
+        }
+
+        // Upgrades partition entry to new schema. Returns false if all
+        // cell_entries has been removed during the upgrade.
+        bool upgrade(schema_ptr new_schema);
+
+        void insert(lw_shared_ptr<cell_entry> cell) { // links *cell into this partition's cell set
+            _cells.insert(*cell);
+            _cell_count++; // counted by hand, see the note on _cell_count
+            maybe_rehash(); // may grow the bucket array now that the count went up
+        }
+
+        cells_type& cells() { // mutable access to the cell hash set
+            return _cells;
+        }
+
+        struct hasher { // hashes an entry by its partition key; a bare key hashes to the same value
+            size_t operator()(const dht::decorated_key& dk) const {
+                return std::hash<dht::decorated_key>()(dk);
+            }
+            size_t operator()(const partition_entry& pe) const { // forwards to the key overload above
+                return operator()(pe._key);
+            }
+        };
+
+        class equal_compare { // equality on partition keys, accepting entries and bare keys in any combination
+            dht::decorated_key_equals_comparator _cmp;
+        public:
+            explicit equal_compare(const schema& s) : _cmp(s) { }
+            bool operator()(const dht::decorated_key& dk, const partition_entry& pe) { // key vs entry
+                return _cmp(dk, pe._key);
+            }
+            bool operator()(const partition_entry& pe, const dht::decorated_key& dk) { // entry vs key, same argument order to _cmp
+                return _cmp(dk, pe._key);
+            }
+            bool operator()(const partition_entry& a, const partition_entry& b) { // entry vs entry
+                return _cmp(a._key, b._key);
+            }
+        };
+    };
+
+    using partitions_type = bi::unordered_set<partition_entry,
+                                              bi::equal<partition_entry::equal_compare>,
+                                              bi::hash<partition_entry::hasher>,
+                                              bi::constant_time_size<false>>;
+
+    static constexpr size_t initial_bucket_count = 4 * 1024;
+    using max_load_factor = std::ratio<3, 4>;
+
+    std::unique_ptr<partitions_type::bucket_type[]> _buckets;
+    partitions_type _partitions;
+    size_t _partition_count = 0;
+    size_t _rehash_at_size = compute_rehash_at_size(initial_bucket_count);
+    schema_ptr _schema;
+
+    // partitions_type uses equality comparator which keeps a reference to the
+    // original schema, we must ensure that it doesn't die.
+    schema_ptr _original_schema;
+    cell_locker_stats& _stats;
+
+    friend class locked_cell;
+private:
+    struct locker;
+
+    static constexpr size_t compute_rehash_at_size(size_t bucket_count) { // partition count at which maybe_rehash() grows the table
+        return bucket_count * max_load_factor::num / max_load_factor::den; // integer math: multiply first, then divide (rounds down)
+    }
+    void maybe_rehash() { // grows the bucket array once _partition_count reaches _rehash_at_size
+        if (_partition_count < _rehash_at_size) {
+            return;
+        }
+        const auto current = _partitions.bucket_count();
+        const auto grown = std::min(current * 2, current + 64 * 1024); // double, but add at most 64Ki buckets per step
+        auto storage = std::make_unique<partitions_type::bucket_type[]>(grown);
+        _partitions.rehash(partitions_type::bucket_traits(storage.get(), grown));
+        _buckets = std::move(storage); // the previous array is freed only after the rehash above
+        _rehash_at_size = compute_rehash_at_size(grown);
+    }
+public:
+    explicit cell_locker(schema_ptr s, cell_locker_stats& stats) // empty locker for schema s, reporting into stats
+        : _buckets(std::make_unique<partitions_type::bucket_type[]>(initial_bucket_count))
+        , _partitions(partitions_type::bucket_traits(_buckets.get(), initial_bucket_count),
+                      partition_entry::hasher(), partition_entry::equal_compare(*s)) // comparator is built from *s
+        , _schema(s) // current schema; replaced by set_schema()
+        , _original_schema(std::move(s)) // never reassigned here: keeps the comparator's schema alive
+        , _stats(stats)
+    { }
+
+    ~cell_locker() { // the locker must be empty (no partition entries left) when it is destroyed
+        assert(_partitions.empty());
+    }
+
+    void set_schema(schema_ptr s) { // switches the current schema; entries are upgraded lazily in lock_cells()
+        _schema = s;
+    }
+    schema_ptr schema() const { // the schema that ranges passed to lock_cells() must use
+        return _schema;
+    }
+
+    // partition_cells_range is required to be in cell_locker::schema()
+    future<std::vector<locked_cell>> lock_cells(const dht::decorated_key& dk, partition_cells_range&& range,
+                                                timeout_clock::time_point timeout);
+};
+
+
+class locked_cell { // move-only handle that unlocks its cell entry when destroyed
+    lw_shared_ptr<cell_locker::cell_entry> _entry;
+public:
+    explicit locked_cell(lw_shared_ptr<cell_locker::cell_entry> entry)
+        : _entry(std::move(entry)) { }
+
+    locked_cell(const locked_cell&) = delete; // copying would lead to a second unlock()
+    locked_cell(locked_cell&&) = default;
+
+    ~locked_cell() {
+        if (_entry) { // skipped when _entry is null (presumably after being moved from — confirm)
+            _entry->unlock();
+        }
+    }
+};
+
+struct cell_locker::locker { // state of one lock_cells() request that may have to wait for cells
+    cell_entry::hasher _hasher;
+    cell_entry::equal_compare _eq_cmp;
+    partition_entry& _partition_entry; // partition whose cells are being locked
+
+    partition_cells_range _range; // the requested cells, grouped by clustering key
+    partition_cells_range::iterator _current_ck; // position in _range
+    cells_range::const_iterator _current_cell; // position within *_current_ck
+
+    timeout_clock::time_point _timeout; // passed to every cell_entry::lock() call
+    std::vector<locked_cell> _locks; // locks obtained so far
+    cell_locker_stats& _stats;
+private:
+    void update_ck() { // repositions _current_cell at the start of the current clustering key
+        if (!is_done()) {
+            _current_cell = _current_ck->begin();
+        }
+    }
+
+    future<> lock_next();
+
+    bool is_done() const { return _current_ck == _range.end(); }
+public:
+    explicit locker(const ::schema& s, cell_locker_stats& st, partition_entry& pe, partition_cells_range&& range, timeout_clock::time_point timeout)
+        : _hasher(s)
+        , _eq_cmp(s)
+        , _partition_entry(pe)
+        , _range(std::move(range))
+        , _current_ck(_range.begin()) // iterates the member _range, not the argument
+        , _timeout(timeout)
+        , _stats(st)
+    {
+        update_ck();
+    }
+
+    locker(const locker&) = delete; // _current_ck iterates the member _range and continuations capture `this`,
+    locker(locker&&) = delete; // so the object must stay at a fixed address
+
+    future<> lock_all() { // resolves once every cell in _range is locked
+        // Cannot defer before first call to lock_next().
+        return lock_next().then([this] {
+            return do_until([this] { return is_done(); }, [this] {
+                return lock_next();
+            });
+        });
+    }
+
+    std::vector<locked_cell> get() && { return std::move(_locks); } // rvalue-only: hands the collected locks over
+};
+
+inline
+future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range, timeout_clock::time_point timeout) {
+    partition_entry::hasher pe_hash;
+    partition_entry::equal_compare pe_eq(*_schema);
+
+    auto it = _partitions.find(dk, pe_hash, pe_eq);
+    std::unique_ptr<partition_entry> partition; // set only when this partition currently has no cell entries
+    if (it == _partitions.end()) {
+        partition = std::make_unique<partition_entry>(_schema, *this, dk);
+    } else if (!it->upgrade(_schema)) { // the upgrade removed every cell: take the entry back out and reuse it
+        partition = std::unique_ptr<partition_entry>(&*it); // re-adopts the pointer released on insertion below
+        _partition_count--;
+        _partitions.erase(it);
+    }
+
+    if (partition) { // no existing cell entries, so every requested cell gets a fresh entry without waiting
+        std::vector<locked_cell> locks;
+        for (auto&& r : range) {
+            if (r.empty()) {
+                continue;
+            }
+            for (auto&& c : r) {
+                auto cell = make_lw_shared<cell_entry>(*partition, position_in_partition(r.position()), c);
+                _stats.lock_acquisitions++;
+                partition->insert(cell);
+                locks.emplace_back(std::move(cell));
+            }
+        }
+
+        if (!locks.empty()) { // an entry without cells is not linked; `partition` destroys it on return
+            _partitions.insert(*partition.release()); // ownership passes to the intrusive container
+            _partition_count++;
+            maybe_rehash();
+        }
+        return make_ready_future<std::vector<locked_cell>>(std::move(locks));
+    }
+
+    auto l = std::make_unique<locker>(*_schema, _stats, *it, std::move(range), timeout); // some cells may need waiting
+    auto f = l->lock_all();
+    return f.then([l = std::move(l)] { // the continuation owns the locker while lock_all() is pending
+        return std::move(*l).get();
+    });
+}
+
+inline
+future<> cell_locker::locker::lock_next() { // locks cells until one must be waited for or the range is exhausted
+    while (!is_done()) {
+        if (_current_cell == _current_ck->end()) { // this clustering key is finished, move to the next one
+            ++_current_ck;
+            update_ck();
+            continue;
+        }
+
+        auto cid = *_current_cell++;
+        cell_address ca { position_in_partition(_current_ck->position()), cid };
+        auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
+        if (it != _partition_entry.cells().end()) { // the cell already has an entry: queue on its lock
+            _stats.operations_waiting_for_lock++;
+            // finally() drops the waiter count even when lock() fails (e.g. times out); then() is skipped in that case.
+            return it->lock(_timeout).finally([this] { _stats.operations_waiting_for_lock--; })
+                                     .then([this, ce = it->shared_from_this()] () mutable {
+                _stats.lock_acquisitions++;
+                _locks.emplace_back(std::move(ce));
+            });
+        }
+
+        auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
+        _stats.lock_acquisitions++;
+        _partition_entry.insert(cell);
+        _locks.emplace_back(std::move(cell));
+    }
+    return make_ready_future<>();
+}
+
+inline
+bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
+    if (_schema == new_schema) { // already on this schema, nothing to do
+        return true;
+    }
+
+    auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count()); // same bucket count as before
+    auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
+                            cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));
+
+    _cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept { // unlink each cell, re-link the ones that survive
+        auto& cell = *cell_ptr;
+        auto kind = cell.position().is_static_row() ? column_kind::static_column
+                                                    : column_kind::regular_column;
+        auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
+        if (reinsert) {
+            cells.insert(cell);
+        } else {
+            _cell_count--; // the cell is left out of the new set
+        }
+    });
+
+    // bi::unordered_set move assignment is actually a swap.
+    // Original _buckets cannot be destroyed before the container using them is
+    // so we need to explicitly make sure that the original _cells is no more.
+    _cells = std::move(cells);
+    auto destroy = [] (auto) { }; // by-value parameter: the moved-in set dies when the call returns
+    destroy(std::move(cells));
+
+    _buckets = std::move(buckets); // only now is it safe to release the old bucket array
+    _schema = new_schema;
+    return _cell_count; // converts to bool: false when no cell is left
+}
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -27,125 +27,125 @@
 class checked_file_impl : public file_impl {
 public:

-    checked_file_impl(disk_error_signal_type& s, file f)
-            : _signal(s) , _file(f) {
+    checked_file_impl(const io_error_handler& error_handler, file f)
+            : _error_handler(error_handler), _file(f) {
        _memory_dma_alignment = f.memory_dma_alignment();
        _disk_read_dma_alignment = f.disk_read_dma_alignment();
        _disk_write_dma_alignment = f.disk_write_dma_alignment();
    }

    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->write_dma(pos, buffer, len, pc);
        });
    }

    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->write_dma(pos, iov, pc);
        });
    }

    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->read_dma(pos, buffer, len, pc);
        });
    }

    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->read_dma(pos, iov, pc);
        });
    }

    virtual future<> flush(void) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->flush();
        });
    }

    virtual future<struct stat> stat(void) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->stat();
        });
    }

    virtual future<> truncate(uint64_t length) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->truncate(length);
        });
    }

    virtual future<> discard(uint64_t offset, uint64_t length) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->discard(offset, length);
        });
    }

    virtual future<> allocate(uint64_t position, uint64_t length) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->allocate(position, length);
        });
    }

    virtual future<uint64_t> size(void) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->size();
        });
    }

    virtual future<> close() override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->close();
        });
    }

+    // Returns a handle to the underlying plain file, so make_checked_file() should be
+    // called on the file obtained from that handle.
+    virtual std::unique_ptr<seastar::file_handle_impl> dup() override {
+        return get_file_impl(_file)->dup();
+    }
+
    virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
-        return do_io_check(_signal, [&] {
+        return do_io_check(_error_handler, [&] {
            return get_file_impl(_file)->list_directory(next);
        });
    }

+    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
+        return do_io_check(_error_handler, [&] {
+            return get_file_impl(_file)->dma_read_bulk(offset, range_size, pc);
+        });
+    }
 private:
-    disk_error_signal_type &_signal;
+    const io_error_handler& _error_handler;
    file _file;
 };

-inline file make_checked_file(disk_error_signal_type& signal, file& f)
+inline file make_checked_file(const io_error_handler& error_handler, file f)
 {
-    return file(::make_shared<checked_file_impl>(signal, f));
+    return file(::make_shared<checked_file_impl>(error_handler, f));
 }

 future<file>
-inline open_checked_file_dma(disk_error_signal_type& signal,
+inline open_checked_file_dma(const io_error_handler& error_handler,
                             sstring name, open_flags flags,
-                             file_open_options options)
+                             file_open_options options = {})
 {
-    return do_io_check(signal, [&] {
+    return do_io_check(error_handler, [&] {
        return open_file_dma(name, flags, options).then([&] (file f) {
-            return make_ready_future<file>(make_checked_file(signal, f));
+            return make_ready_future<file>(make_checked_file(error_handler, f));
        });
    });
 }

 future<file>
-inline open_checked_file_dma(disk_error_signal_type& signal,
-                             sstring name, open_flags flags)
-{
-    return do_io_check(signal, [&] {
-        return open_file_dma(name, flags).then([&] (file f) {
-            return make_ready_future<file>(make_checked_file(signal, f));
-        });
-    });
-}
-
-future<file>
-inline open_checked_directory(disk_error_signal_type& signal,
+inline open_checked_directory(const io_error_handler& error_handler,
                              sstring name)
 {
-    return do_io_check(signal, [&] {
+    return do_io_check(error_handler, [&] {
        return engine().open_directory(name).then([&] (file f) {
-            return make_ready_future<file>(make_checked_file(signal, f));
+            return make_ready_future<file>(make_checked_file(error_handler, f));
        });
    });
 }
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -19,6 +19,6 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "gc_clock.hh"
+#include "clocks-impl.hh"

 std::atomic<int64_t> clocks_offset;
--- a/clocks-impl.hh
+++ b/clocks-impl.hh
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+
+extern std::atomic<int64_t> clocks_offset;
+
+// Advances the shared clock offset by delta, converted to whole seconds.
+template<typename Duration>
+static inline void forward_jump_clocks(Duration delta) {
+    const auto whole_seconds = std::chrono::duration_cast<std::chrono::seconds>(delta).count();
+    clocks_offset.fetch_add(whole_seconds, std::memory_order_relaxed);
+}
+
+// Reads the shared clock offset and returns it as a duration in seconds.
+static inline std::chrono::seconds get_clocks_offset() {
+    const auto raw_offset = clocks_offset.load(std::memory_order_relaxed);
+    return std::chrono::seconds(raw_offset);
+}
+
+// Returns the time point that is earlier than t by d, or the minimum time point if that cannot be represented.
+template<typename Clock, typename Duration, typename Rep, typename Period>
+inline auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
+    const auto lowest = decltype(t)::min() + d; // smallest t for which t - d is still representable
+    return std::max(t, lowest) - d;
+}
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -42,47 +42,63 @@ std::ostream& operator<<(std::ostream& out, const bound_kind k);
 bound_kind invert_kind(bound_kind k);
 int32_t weight(bound_kind k);

-static inline bound_kind flip_bound_kind(bound_kind bk)
-{
-    switch (bk) {
-    case bound_kind::excl_end: return bound_kind::excl_start;
-    case bound_kind::incl_end: return bound_kind::incl_start;
-    case bound_kind::excl_start: return bound_kind::excl_end;
-    case bound_kind::incl_start: return bound_kind::incl_end;
-    }
-    abort();
-}
-
 class bound_view {
-    const static thread_local clustering_key empty_prefix;
 public:
+    const static thread_local clustering_key empty_prefix;
    const clustering_key_prefix& prefix;
    bound_kind kind;
    bound_view(const clustering_key_prefix& prefix, bound_kind kind)
        : prefix(prefix)
        , kind(kind)
    { }
-    struct compare {
+    bound_view(const bound_view& other) noexcept = default;
+    bound_view& operator=(const bound_view& other) noexcept { // `prefix` is a reference member and cannot be reseated by assignment
+        if (this != &other) { // self-assignment would otherwise copy from a destroyed object
+            this->~bound_view();
+            new (this) bound_view(other); // rebuild in place so that `prefix` refers to other's key
+        }
+        return *this;
+    }
+    struct tri_compare {
        // To make it assignable and to avoid taking a schema_ptr, we
        // wrap the schema reference.
        std::reference_wrapper<const schema> _s;
-        compare(const schema& s) : _s(s)
+        tri_compare(const schema& s) : _s(s)
        { }
-        bool operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
+        int operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
            auto type = _s.get().clustering_key_prefix_type();
            auto res = prefix_equality_tri_compare(type->types().begin(),
                type->begin(p1), type->end(p1),
                type->begin(p2), type->end(p2),
-                tri_compare);
+                ::tri_compare);
            if (res) {
-                return res < 0;
+                return res;
            }
            auto d1 = p1.size(_s);
            auto d2 = p2.size(_s);
            if (d1 == d2) {
-                return w1 < w2;
+                return w1 - w2;
            }
-            return d1 < d2 ? w1 <= 0 : w2 > 0;
+            return d1 < d2 ? w1 - (w1 <= 0) : -(w2 - (w2 <= 0));
+        }
+        int operator()(const bound_view b, const clustering_key_prefix& p) const {
+            return operator()(b.prefix, weight(b.kind), p, 0);
+        }
+        int operator()(const clustering_key_prefix& p, const bound_view b) const {
+            return operator()(p, 0, b.prefix, weight(b.kind));
+        }
+        int operator()(const bound_view b1, const bound_view b2) const {
+            return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
+        }
+    };
+    struct compare {
+        // Boolean "less" view of tri_compare, which wraps the schema
+        // reference so that the comparator stays assignable.
+        tri_compare _cmp;
+        compare(const schema& s) : _cmp(s)
+        { }
+        bool operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
+            return _cmp(p1, w1, p2, w2) < 0;
        }
        bool operator()(const bound_view b, const clustering_key_prefix& p) const {
            return operator()(b.prefix, weight(b.kind), p, 0);
@@ -106,20 +122,33 @@ public:
    static bound_view top() {
        return {empty_prefix, bound_kind::incl_end};
    }
-    /*
-    template<template<typename> typename T, typename U>
-    concept bool Range() {
-        return requires (T<U> range) {
-            { range.start() } -> stdx::optional<U>;
-            { range.end() } -> stdx::optional<U>;
-        };
-    };*/
-    template<template<typename> typename Range>
-    static std::pair<bound_view, bound_view> from_range(const Range<clustering_key_prefix>& range) {
-        return {
-            range.start() ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start) : bottom(),
-            range.end() ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end) : top(),
-        };
+    template<template<typename> typename R>
+    GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
+    static bound_view from_range_start(const R<clustering_key_prefix>& range) { // lower bound of range as a bound_view
+        return range.start()
+               ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start)
+               : bottom(); // no start bound: the range begins before every key
+    }
+    template<template<typename> typename R>
+    GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
+    static bound_view from_range_end(const R<clustering_key_prefix>& range) { // upper bound of range as a bound_view
+        return range.end()
+               ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end)
+               : top(); // no end bound: fall back to top()
+    }
+    template<template<typename> typename R>
+    GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
+    static std::pair<bound_view, bound_view> from_range(const R<clustering_key_prefix>& range) { // {start, end} views of range
+        return {from_range_start(range), from_range_end(range)};
+    }
+    template<template<typename> typename R>
+    GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
+    static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) { // inverse of from_range_*()
+        if (&bv.prefix == &empty_prefix) { // identity check: bottom() and top() refer to this exact object
+            return {}; // an unbounded side has no range bound
+        }
+        bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
+        return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
     }
    friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
        return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -54,6 +54,7 @@ public:
    auto end() const { return _ref.end(); }
    bool empty() const { return _ref.empty(); }
    size_t size() const { return _ref.size(); }
+    const clustering_row_ranges& ranges() const { return _ref; } // the same ranges that begin()/end() iterate, by reference

    static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
        const query::clustering_row_ranges& ranges = slice.row_ranges(schema, key);
--- a/clustering_ranges_walker.hh
+++ b/clustering_ranges_walker.hh
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ *
+ * Modified by ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "schema.hh"
+#include "query-request.hh"
+#include "streamed_mutation.hh"
+
+// Utility for in-order checking of overlap with position ranges. Holds references to the schema and ranges it is given.
+class clustering_ranges_walker {
+    const schema& _schema;
+    const query::clustering_row_ranges& _ranges;
+    query::clustering_row_ranges::const_iterator _current; // range currently being considered
+    query::clustering_row_ranges::const_iterator _end;
+    bool _in_current; // next position is known to be >= _current_start
+    bool _with_static_row;
+    position_in_partition_view _current_start; // bounds of the current range; the static row comes first when enabled
+    position_in_partition_view _current_end;
+    stdx::optional<position_in_partition> _trim; // last position passed to trim_front() that cut into a range
+    size_t _change_counter = 1; // bumped whenever _current_start changes
+private:
+    bool advance_to_next_range() { // moves to the next range; returns false when there is none
+        _in_current = false;
+        if (!_current_start.is_static_row()) { // leaving the static row must not skip the first clustering range
+            if (_current == _end) {
+                return false;
+            }
+            ++_current;
+        }
+        ++_change_counter;
+        if (_current == _end) {
+            _current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
+            return false;
+        }
+        _current_start = position_in_partition_view::for_range_start(*_current);
+        _current_end = position_in_partition_view::for_range_end(*_current);
+        return true;
+    }
+public:
+    clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
+        : _schema(s)
+        , _ranges(ranges)
+        , _current(ranges.begin())
+        , _end(ranges.end())
+        , _in_current(with_static_row)
+        , _with_static_row(with_static_row)
+        , _current_start(position_in_partition_view::for_static_row()) // default: start on the static row
+        , _current_end(position_in_partition_view::before_all_clustered_rows())
+    {
+        if (!with_static_row) { // no static row: start directly on the first clustering range, if any
+            if (_current == _end) {
+                _current_start = position_in_partition_view::before_all_clustered_rows();
+            } else {
+                _current_start = position_in_partition_view::for_range_start(*_current);
+                _current_end = position_in_partition_view::for_range_end(*_current);
+            }
+        }
+    }
+    clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
+        : _schema(o._schema)
+        , _ranges(o._ranges)
+        , _current(o._current)
+        , _end(o._end)
+        , _in_current(o._in_current)
+        , _with_static_row(o._with_static_row)
+        , _current_start(o._current_start) // NOTE(review): may view *o._trim (see trim_front()) — confirm it survives the move below
+        , _current_end(o._current_end)
+        , _trim(std::move(o._trim))
+        , _change_counter(o._change_counter)
+    { }
+    clustering_ranges_walker& operator=(clustering_ranges_walker&& o) noexcept { // reference members rule out memberwise assignment
+        if (this != &o) {
+            this->~clustering_ranges_walker();
+            new (this) clustering_ranges_walker(std::move(o)); // the move constructor is noexcept, so this cannot throw
+        }
+        return *this;
+    }
+
+    // Excludes positions smaller than pos from the ranges.
+    // pos should be monotonic.
+    // No constraints between pos and positions passed to advance_to().
+    //
+    // After the invocation, when !out_of_range(), lower_bound() returns the smallest position still contained.
+    void trim_front(position_in_partition pos) {
+        position_in_partition::less_compare less(_schema);
+
+        do {
+            if (!less(_current_start, pos)) { // the current range already starts at or after pos
+                break;
+            }
+            if (less(pos, _current_end)) { // pos falls inside the current range: cut its front
+                _trim = std::move(pos);
+                _current_start = *_trim; // the view now refers to the position stored in _trim
+                _in_current = false;
+                ++_change_counter;
+                break;
+            }
+        } while (advance_to_next_range()); // the whole range lies before pos, drop it
+    }
+
+    // Returns true if given position is contained.
+    // Must be called with monotonic positions.
+    // Idempotent.
+    bool advance_to(position_in_partition_view pos) {
+        position_in_partition::less_compare less(_schema);
+
+        do {
+            if (!_in_current && less(pos, _current_start)) { // pos lies in the gap before the current range
+                break;
+            }
+            // All subsequent clustering keys are larger than the start of this
+            // range so there is no need to check that again.
+            _in_current = true;
+
+            if (less(pos, _current_end)) {
+                return true;
+            }
+        } while (advance_to_next_range());
+
+        return false;
+    }
+
+    // Returns true if the range expressed by start and end (as in position_range) overlaps
+    // with clustering ranges.
+    // Must be called with monotonic start position. That position must also be greater than
+    // the last position passed to the other advance_to() overload.
+    // Idempotent.
+    bool advance_to(position_in_partition_view start, position_in_partition_view end) {
+        position_in_partition::less_compare less(_schema);
+
+        do {
+            if (!less(_current_start, end)) { // [start, end) ends before the current range begins
+                break;
+            }
+            if (less(start, _current_end)) { // and it starts before the current range ends: overlap
+                return true;
+            }
+        } while (advance_to_next_range());
+
+        return false;
+    }
+
+    // Returns true if the range tombstone expressed by start and end (as in position_range) overlaps
+    // with clustering ranges.
+    // No monotonicity restrictions on argument values across calls.
+    // Does not affect lower_bound().
+    // Idempotent.
+    bool contains_tombstone(position_in_partition_view start, position_in_partition_view end) const {
+        position_in_partition::less_compare less(_schema);
+
+        if (_trim && !less(*_trim, end)) { // the tombstone ends at or before the trimmed-off front
+            return false;
+        }
+
+        auto i = _current; // scan from the current range onwards without moving the walker
+        while (i != _end) {
+            auto range_start = position_in_partition_view::for_range_start(*i);
+            if (!less(range_start, end)) {
+                return false;
+            }
+            auto range_end = position_in_partition_view::for_range_end(*i);
+            if (less(start, range_end)) {
+                return true;
+            }
+            ++i;
+        }
+
+        return false;
+    }
+
+    // Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
+    bool out_of_range() const {
+        return !_in_current && _current == _end;
+    }
+
+    // Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
+    // Any range trimmings still hold after this.
+    void reset() {
+        auto trim = std::move(_trim); // taken out before *this is rebuilt, re-applied below
+        auto ctr = _change_counter;
+        *this = clustering_ranges_walker(_schema, _ranges, _with_static_row);
+        _change_counter = ctr + 1; // keep the counter increasing across the reset
+        if (trim) {
+            trim_front(std::move(*trim));
+        }
+    }
+
+    // Can be called only when !out_of_range()
+    position_in_partition_view lower_bound() const {
+        return _current_start;
+    }
+
+    // Changes whenever lower_bound() changes.
+    // Always > 0.
+    size_t lower_bound_change_counter() const {
+        return _change_counter;
+    }
+};
--- a/coding-style.md
+++ b/coding-style.md
@@ -0,0 +1,3 @@
+# Scylla Coding Style
+
+Please see the [Seastar style document](https://github.com/scylladb/seastar/blob/master/coding-style.md).
--- a/compaction_strategy.hh
+++ b/compaction_strategy.hh
@@ -21,6 +21,9 @@

 #pragma once

+#include "sstables/shared_sstable.hh"
+#include "exceptions/exceptions.hh"
+
 class column_family;
 class schema;
 using schema_ptr = lw_shared_ptr<const schema>;
@@ -33,12 +36,14 @@ enum class compaction_strategy_type {
    size_tiered,
    leveled,
    date_tiered,
+    time_window,
 };

 class compaction_strategy_impl;
 class sstable;
 class sstable_set;
 struct compaction_descriptor;
+struct resharding_descriptor;

 class compaction_strategy {
    ::shared_ptr<compaction_strategy_impl> _compaction_strategy_impl;
@@ -52,7 +57,13 @@ public:
    compaction_strategy& operator=(compaction_strategy&&);

    // Return a list of sstables to be compacted after applying the strategy.
-    compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<lw_shared_ptr<sstable>> candidates);
+    compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<shared_sstable> candidates);
+
+    std::vector<resharding_descriptor> get_resharding_jobs(column_family& cf, std::vector<shared_sstable> candidates);
+
+    // Some strategies may look at the compacted and resulting sstables to
+    // get some useful information for subsequent compactions.
+    void notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added);

    // Return if parallel compaction is allowed by strategy.
    bool parallel_compaction() const;
@@ -75,6 +86,8 @@ public:
            return "LeveledCompactionStrategy";
        case compaction_strategy_type::date_tiered:
            return "DateTieredCompactionStrategy";
+        case compaction_strategy_type::time_window:
+            return "TimeWindowCompactionStrategy";
        default:
            throw std::runtime_error("Invalid Compaction Strategy");
        }
@@ -93,6 +106,8 @@ public:
            return compaction_strategy_type::leveled;
        } else if (short_name == "DateTieredCompactionStrategy") {
            return compaction_strategy_type::date_tiered;
+        } else if (short_name == "TimeWindowCompactionStrategy") {
+            return compaction_strategy_type::time_window;
        } else {
            throw exceptions::configuration_exception(sprint("Unable to find compaction strategy class '%s'", name));
        }
--- a/compatible_ring_position.hh
+++ b/compatible_ring_position.hh
@@ -39,6 +39,9 @@ public:
    compatible_ring_position(const schema& s, dht::ring_position&& rp)
            : _schema(&s), _rp(std::move(rp)) {
    }
+    const dht::token& token() const {
+        return _rp->token();
+    }
    friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
        return x._rp->tri_compare(*x._schema, *y._rp);
    }
--- a/compound.hh
+++ b/compound.hh
@@ -22,7 +22,7 @@
 #pragma once

 #include "types.hh"
-#include <iostream>
+#include <iosfwd>
 #include <algorithm>
 #include <vector>
 #include <boost/range/iterator_range.hpp>
@@ -130,10 +130,10 @@ public:
    bytes decompose_value(const value_type& values) {
        return serialize_value(values);
    }
-    class iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
+    class iterator : public std::iterator<std::input_iterator_tag, const bytes_view> {
    private:
        bytes_view _v;
-        value_type _current;
+        bytes_view _current;
    private:
        void read_current() {
            size_type len;
@@ -220,6 +220,9 @@ public:
        assert(AllowPrefixes == allow_prefixes::yes);
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
+    bool is_empty(bytes_view v) const {
+        return begin(v) == end(v);
+    }
    void validate(bytes_view v) {
        // FIXME: implement
        warn(unimplemented::cause::VALIDATION);
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -184,6 +184,8 @@ bytes to_legacy(CompoundType& type, bytes_view packed) {
    return legacy_form;
 }

+class composite_view;
+
 // Represents a value serialized according to Origin's CompositeType.
 // If is_compound is true, then the value is one or more components encoded as:
 //
@@ -202,7 +204,7 @@ public:
            , _is_compound(is_compound)
    { }

-    composite(bytes&& b)
+    explicit composite(bytes&& b)
            : _bytes(std::move(b))
            , _is_compound(true)
    { }
@@ -239,7 +241,7 @@ public:
    using component_view = std::pair<bytes_view, eoc>;
 private:
    template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
-    static size_t size(Value& val) {
+    static size_t size(const Value& val) {
        return val.size();
    }
    static size_t size(const data_value& val) {
@@ -304,23 +306,36 @@ public:
        return f(const_cast<bytes&>(_bytes));
    }

+    // marker is ignored if !is_compound
    template<typename RangeOfSerializedComponents>
-    static bytes serialize_value(RangeOfSerializedComponents&& values, bool is_compound = true) {
+    static composite serialize_value(RangeOfSerializedComponents&& values, bool is_compound = true, eoc marker = eoc::none) {
        auto size = serialized_size(values, is_compound);
        bytes b(bytes::initialized_later(), size);
        auto i = b.begin();
        serialize_value(std::forward<decltype(values)>(values), i, is_compound);
-        return b;
+        if (is_compound && !b.empty()) {
+            b.back() = eoc_type(marker);
+        }
+        return composite(std::move(b), is_compound);
+    }
+
+    template<typename RangeOfSerializedComponents>
+    static composite serialize_static(const schema& s, RangeOfSerializedComponents&& values) {
+        // FIXME: Optimize
+        auto b = bytes(size_t(2), bytes::value_type(0xff));
+        std::vector<bytes_view> sv(s.clustering_key_size());
+        b += composite::serialize_value(boost::range::join(sv, std::forward<RangeOfSerializedComponents>(values)), true).release_bytes();
+        return composite(std::move(b));
+    }
+
+    static eoc to_eoc(int8_t eoc_byte) {
+        return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
    }

    class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
        bytes_view _v;
        component_view _current;
    private:
-        eoc to_eoc(int8_t eoc_byte) {
-            return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
-        }
-
        void read_current() {
            size_type len;
            {
@@ -406,6 +421,10 @@ public:
        return _bytes;
    }

+    bytes release_bytes() && {
+        return std::move(_bytes);
+    }
+
    size_t size() const {
        return _bytes.size();
    }
@@ -426,26 +445,20 @@ public:
        return _is_compound;
    }

-    // The following factory functions assume this composite is a compound value.
    template <typename ClusteringElement>
    static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
-        return serialize_value(ce.components(s));
+        return serialize_value(ce.components(s), s.is_compound());
    }

-    static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
+    static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
        if (v.size() == 0) {
-            return bytes(size_t(1), bytes::value_type(marker));
+            return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
        }
-        auto b = serialize_value(v);
-        b.back() = eoc_type(marker);
-        return composite(std::move(b));
+        return serialize_value(v, is_compound, marker);
    }

    static composite static_prefix(const schema& s) {
-        static bytes static_marker(size_t(2), bytes::value_type(0xff));
-
-        std::vector<bytes_view> sv(s.clustering_key_size());
-        return static_marker + serialize_value(sv);
+        return serialize_static(s, std::vector<bytes_view>());
    }

    explicit operator bytes_view() const {
@@ -456,6 +469,15 @@ public:
    friend inline std::ostream& operator<<(std::ostream& os, const std::pair<Component, eoc>& c) {
        return os << "{value=" << c.first << "; eoc=" << sprint("0x%02x", eoc_type(c.second) & 0xff) << "}";
    }
+
+    friend std::ostream& operator<<(std::ostream& os, const composite& v);
+
+    struct tri_compare {
+        const std::vector<data_type>& _types;
+        tri_compare(const std::vector<data_type>& types) : _types(types) {}
+        int operator()(const composite&, const composite&) const;
+        int operator()(composite_view, composite_view) const;
+    };
 };

 class composite_view final {
@@ -476,14 +498,15 @@ public:
            , _is_compound(true)
    { }

-    std::vector<bytes> explode() const {
+    std::vector<bytes_view> explode() const {
        if (!_is_compound) {
-            return { to_bytes(_bytes) };
+            return { _bytes };
        }

-        std::vector<bytes> ret;
+        std::vector<bytes_view> ret;
+        ret.reserve(8);
        for (auto it = begin(), e = end(); it != e; ) {
-            ret.push_back(to_bytes(it->first));
+            ret.push_back(it->first);
            auto marker = it->second;
            ++it;
            if (it != e && marker != composite::eoc::none) {
@@ -505,6 +528,15 @@ public:
        return { begin(), end() };
    }

+    composite::eoc last_eoc() const {
+        if (!_is_compound || _bytes.empty()) {
+            return composite::eoc::none;
+        }
+        bytes_view v(_bytes);
+        v.remove_prefix(v.size() - 1);
+        return composite::to_eoc(read_simple<composite::eoc_type>(v));
+    }
+
    auto values() const {
        return components() | boost::adaptors::transformed([](auto&& c) { return c.first; });
    }
@@ -527,4 +559,46 @@ public:

    bool operator==(const composite_view& k) const { return k._bytes == _bytes && k._is_compound == _is_compound; }
    bool operator!=(const composite_view& k) const { return !(k == *this); }
+
+    friend inline std::ostream& operator<<(std::ostream& os, composite_view v) {
+        return os << "{" << ::join(", ", v.components()) << ", compound=" << v._is_compound << ", static=" << v.is_static() << "}";
+    }
 };
+
+inline
+std::ostream& operator<<(std::ostream& os, const composite& v) {
+    return os << composite_view(v);
+}
+
+inline
+int composite::tri_compare::operator()(const composite& v1, const composite& v2) const {
+    return (*this)(composite_view(v1), composite_view(v2));
+}
+
+inline
+int composite::tri_compare::operator()(composite_view v1, composite_view v2) const {
+    // See org.apache.cassandra.db.composites.AbstractCType#compare
+    if (v1.empty()) {
+        return v2.empty() ? 0 : -1;
+    }
+    if (v2.empty()) {
+        return 1;
+    }
+    if (v1.is_static() != v2.is_static()) {
+        return v1.is_static() ? -1 : 1;
+    }
+    auto a_values = v1.components();
+    auto b_values = v2.components();
+    auto cmp = [&](const data_type& t, component_view c1, component_view c2) {
+        // First by value, then by EOC
+        auto r = t->compare(c1.first, c2.first);
+        if (r) {
+            return r;
+        }
+        return static_cast<int>(c1.second) - static_cast<int>(c2.second);
+    };
+    return lexicographical_tri_compare(_types.begin(), _types.end(),
+        a_values.begin(), a_values.end(),
+        b_values.begin(), b_values.end(),
+        cmp);
+}
--- a/compress.hh
+++ b/compress.hh
@@ -39,17 +39,17 @@ public:
    static constexpr auto CHUNK_LENGTH_KB = "chunk_length_kb";
    static constexpr auto CRC_CHECK_CHANCE = "crc_check_chance";
 private:
-    compressor _compressor = compressor::none;
+    compressor _compressor;
    std::experimental::optional<int> _chunk_length;
    std::experimental::optional<double> _crc_check_chance;
 public:
-    compression_parameters() = default;
-    compression_parameters(compressor c) : _compressor(c) { }
+    compression_parameters(compressor c = compressor::lz4) : _compressor(c) { }
    compression_parameters(const std::map<sstring, sstring>& options) {
        validate_options(options);

        auto it = options.find(SSTABLE_COMPRESSION);
        if (it == options.end() || it->second.empty()) {
+            _compressor = compressor::none;
            return;
        }
        const auto& compressor_class = it->second;
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -12,7 +12,9 @@

 # The name of the cluster. This is mainly used to prevent machines in
 # one logical cluster from joining another.
-cluster_name: 'Test Cluster'
+# It is recommended to change the default value when creating a new cluster.
+# You can NOT modify this value for an existing cluster.
+#cluster_name: 'Test Cluster'

 # This defines the number of tokens randomly assigned to this node on the ring
 # The more tokens, relative to other nodes, the larger the proportion of data
@@ -85,10 +87,26 @@ listen_address: localhost
 # Leaving this blank will set it to the same value as listen_address
 # broadcast_address: 1.2.3.4

+
+# When using multiple physical network interfaces, set this to true to listen on broadcast_address
+# in addition to the listen_address, allowing nodes to communicate on both interfaces.
+# Ignore this property if the network configuration automatically routes between the public and private networks, as on EC2.
+#
+# listen_on_broadcast_address: false
+
 # port for the CQL native transport to listen for clients on
 # For security reasons, you should not expose this port to the internet.  Firewall it if needed.
 native_transport_port: 9042

+# Enabling native transport encryption in client_encryption_options allows you to either use
+# encryption for the standard port or to use a dedicated, additional port along with the unencrypted
+# standard native_transport_port.
+# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption
+# for native_transport_port. Setting native_transport_port_ssl to a different value
+# from native_transport_port will use encryption for native_transport_port_ssl while
+# keeping native_transport_port unencrypted.
+#native_transport_port_ssl: 9142
+
 # Throttles all outbound streaming file transfers on this node to the
 # given total throughput in Mbps. This is necessary because Scylla does
 # mostly sequential IO when streaming data during bootstrap or repair, which
@@ -192,6 +210,9 @@ api_address: 127.0.0.1
 # Caution should be taken on increasing the size of this threshold as it can lead to node instability.
 batch_size_warn_threshold_in_kb: 5

+# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default.
+batch_size_fail_threshold_in_kb: 50
+
 # Authentication backend, identifying users
 # Out of the box, Scylla provides org.apache.cassandra.auth.{AllowAllAuthenticator,
 # PasswordAuthenticator}.
@@ -217,6 +238,15 @@ batch_size_warn_threshold_in_kb: 5
 # that do not have vnodes enabled.
 # initial_token:

+# RPC address to broadcast to drivers and other Scylla nodes. This cannot
+# be set to 0.0.0.0. If left blank, this will be set to the value of
+# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
+# be set.
+# broadcast_rpc_address: 1.2.3.4
+
+# Uncomment to enable experimental features
+# experimental: true
+
 ###################################################
 ## Not currently supported, reserved for future use
 ###################################################
@@ -249,17 +279,17 @@ batch_size_warn_threshold_in_kb: 5

 # Validity period for permissions cache (fetching permissions can be an
 # expensive operation depending on the authorizer, CassandraAuthorizer is
-# one example). Defaults to 2000, set to 0 to disable.
+# one example). Defaults to 10000, set to 0 to disable.
 # Will be disabled automatically for AllowAllAuthorizer.
-# permissions_validity_in_ms: 2000
+# permissions_validity_in_ms: 10000

 # Refresh interval for permissions cache (if enabled).
 # After this interval, cache entries become eligible for refresh. Upon next
 # access, an async reload is scheduled and the old value returned until it
-# completes. If permissions_validity_in_ms is non-zero, then this must be
-# also.
-# Defaults to the same value as permissions_validity_in_ms.
-# permissions_update_interval_in_ms: 1000
+# completes. If permissions_validity_in_ms is non-zero, then this also must have
+# a non-zero value. Defaults to 2000. It's recommended to set this value to
+# be at least 3 times smaller than the permissions_validity_in_ms.
+# permissions_update_interval_in_ms: 2000

 # The partitioner is responsible for distributing groups of rows (by
 # partition key) across nodes in the cluster.  You should leave this
@@ -273,28 +303,6 @@ batch_size_warn_threshold_in_kb: 5
 #
 partitioner: org.apache.cassandra.dht.Murmur3Partitioner

-
-# policy for data disk failures:
-# die: shut down gossip and Thrift and kill the JVM for any fs errors or
-#      single-sstable errors, so the node can be replaced.
-# stop_paranoid: shut down gossip and Thrift even for single-sstable errors.
-# stop: shut down gossip and Thrift, leaving the node effectively dead, but
-#       can still be inspected via JMX.
-# best_effort: stop using the failed disk and respond to requests based on
-#              remaining available sstables.  This means you WILL see obsolete
-#              data at CL.ONE!
-# ignore: ignore fatal errors and let requests fail, as in pre-1.2 Scylla
-# disk_failure_policy: stop
-
-# policy for commit disk failures:
-# die: shut down gossip and Thrift and kill the JVM, so the node can be replaced.
-# stop: shut down gossip and Thrift, leaving the node effectively dead, but
-#       can still be inspected via JMX.
-# stop_commit: shutdown the commit log, letting writes collect but
-#              continuing to service reads, as in pre-2.0.5 Scylla
-# ignore: ignore fatal errors and let the batches fail
-# commit_failure_policy: stop
-
 # Maximum size of the key cache in memory.
 #
 # Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the
@@ -409,29 +417,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 # the smaller of 1/4 of heap or 512MB.
 # file_cache_size_in_mb: 512

-# Total permitted memory to use for memtables. Scylla will stop 
-# accepting writes when the limit is exceeded until a flush completes,
-# and will trigger a flush based on memtable_cleanup_threshold
-# If omitted, Scylla will set both to 1/4 the size of the heap.
-# memtable_heap_space_in_mb: 2048
-# memtable_offheap_space_in_mb: 2048
-
-# Ratio of occupied non-flushing memtable size to total permitted size
-# that will trigger a flush of the largest memtable.  Lager mct will
-# mean larger flushes and hence less compaction, but also less concurrent
-# flush activity which can make it difficult to keep your disks fed
-# under heavy write load.
-#
-# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
-# memtable_cleanup_threshold: 0.11
-
-# Specify the way Scylla allocates and manages memtable memory.
-# Options are:
-#   heap_buffers:    on heap nio buffers
-#   offheap_buffers: off heap (direct) nio buffers
-#   offheap_objects: native memory, eliminating nio buffer heap overhead
-# memtable_allocation_type: heap_buffers
-
 # Total space to use for commitlogs.
 #
 # If space gets above this value (it will round up to the next nearest
@@ -443,17 +428,6 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner
 # available for Scylla.
 commitlog_total_space_in_mb: -1

-# This sets the amount of memtable flush writer threads.  These will
-# be blocked by disk io, and each one will hold a memtable in memory
-# while blocked. 
-#
-# memtable_flush_writers defaults to the smaller of (number of disks,
-# number of cores), with a minimum of 2 and a maximum of 8.
-# 
-# If your data directories are backed by SSD, you should increase this
-# to the number of cores.
-#memtable_flush_writers: 8
-
 # A fixed memory pool size in MB for for SSTable index summaries. If left
 # empty, this will default to 5% of the heap size. If the memory usage of
 # all index summaries exceeds this limit, SSTables with low read rates will
@@ -518,13 +492,6 @@ commitlog_total_space_in_mb: -1
 # Whether to start the thrift rpc server.
 # start_rpc: true

-
-# RPC address to broadcast to drivers and other Scylla nodes. This cannot
-# be set to 0.0.0.0. If left blank, this will be set to the value of
-# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
-# be set.
-# broadcast_rpc_address: 1.2.3.4
-
 # enable or disable keepalive on rpc/native connections
 # rpc_keepalive: true

@@ -762,22 +729,17 @@ commitlog_total_space_in_mb: -1
 #    certificate: conf/scylla.crt
 #    keyfile: conf/scylla.key
 #    truststore: <none, use system trust>
+#    require_client_auth: False
+#    priority_string: <none, use default>

 # enable or disable client/server encryption.
 # client_encryption_options:
 #    enabled: false
 #    certificate: conf/scylla.crt
 #    keyfile: conf/scylla.key
-
-    # require_client_auth: false
-    # Set trustore and truststore_password if require_client_auth is true
-    # truststore: conf/.truststore
-    # truststore_password: cassandra
-    # More advanced defaults below:
-    # protocol: TLS
-    # algorithm: SunX509
-    # store_type: JKS
-    # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
+#    truststore: <none, use system trust>
+#    require_client_auth: False
+#    priority_string: <none, use default>

 # internode_compression controls whether traffic between nodes is
 # compressed.
@@ -823,3 +785,23 @@ commitlog_total_space_in_mb: -1
 # By default, Scylla binds all interfaces to the prometheus API
 # It is possible to restrict the listening address to a specific one
 # prometheus_address: 0.0.0.0
+
+# Distribution of data among cores (shards) within a node
+#
+# Scylla distributes data within a node among shards, using a round-robin
+# strategy:
+#  [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
+#
+# Scylla versions 1.6 and below used just one repetition of the pattern;
+# this interfered with data placement among nodes (vnodes).
+#
+# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
+# provides for better data distribution.
+#
+# The value below is the log (base 2) of the number of repetitions.
+#
+# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
+# below.
+#
+# Keep at 12 for new clusters.
+murmur3_partitioner_ignore_msb_bits: 12
--- a/configure.py
+++ b/configure.py
@@ -86,14 +86,14 @@ def try_compile(compiler, source = '', flags = []):
    with tempfile.NamedTemporaryFile() as sfile:
        sfile.file.write(bytes(source, 'utf-8'))
        sfile.file.flush()
-        return subprocess.call([compiler, '-x', 'c++', '-o', '/dev/null', '-c', sfile.name] + flags,
+        return subprocess.call([compiler, '-x', 'c++', '-o', '/dev/null', '-c', sfile.name] + args.user_cflags.split() + flags,
                               stdout = subprocess.DEVNULL,
                               stderr = subprocess.DEVNULL) == 0

 def warning_supported(warning, compiler):
    # gcc ignores -Wno-x even if it is not supported
    adjusted = re.sub('^-Wno-', '-W', warning)
-    return try_compile(flags = [adjusted], compiler = compiler)
+    return try_compile(flags = ['-Werror', adjusted], compiler = compiler)

 def debug_flag(compiler):
    src_with_auto = textwrap.dedent('''\
@@ -108,6 +108,11 @@ def debug_flag(compiler):
        print('Note: debug information disabled; upgrade your compiler')
        return ''

+def maybe_static(flag, libs):
+    if flag and not args.static:
+        libs = '-Wl,-Bstatic {} -Wl,-Bdynamic'.format(libs)
+    return libs
+
 class Thrift(object):
    def __init__(self, source, service):
        self.source = source
@@ -162,7 +167,9 @@ modes = {

 scylla_tests = [
    'tests/mutation_test',
+    'tests/mvcc_test',
    'tests/streamed_mutation_test',
+    'tests/flat_mutation_reader_test',
    'tests/schema_registry_test',
    'tests/canonical_mutation_test',
    'tests/range_test',
@@ -170,6 +177,8 @@ scylla_tests = [
    'tests/keys_test',
    'tests/partitioner_test',
    'tests/frozen_mutation_test',
+    'tests/serialized_action_test',
+    'tests/clustering_ranges_walker_test',
    'tests/perf/perf_mutation',
    'tests/lsa_async_eviction_test',
    'tests/lsa_sync_eviction_test',
@@ -178,18 +187,22 @@ scylla_tests = [
    'tests/perf/perf_hash',
    'tests/perf/perf_cql_parser',
    'tests/perf/perf_simple_query',
+    'tests/perf/perf_fast_forward',
+    'tests/perf/perf_cache_eviction',
+    'tests/cache_flat_mutation_reader_test',
+    'tests/row_cache_stress_test',
    'tests/memory_footprint',
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',
    'tests/mutation_reader_test',
-    'tests/key_reader_test',
    'tests/mutation_query_test',
    'tests/row_cache_test',
    'tests/test-serialization',
    'tests/sstable_test',
    'tests/sstable_mutation_test',
+    'tests/sstable_resharding_test',
    'tests/memtable_test',
    'tests/commitlog_test',
    'tests/cartesian_product_test',
@@ -211,6 +224,7 @@ scylla_tests = [
    'tests/murmur_hash_test',
    'tests/allocation_strategy_test',
    'tests/logalloc_test',
+    'tests/log_heap_test',
    'tests/managed_vector_test',
    'tests/crc_test',
    'tests/flush_queue_test',
@@ -222,6 +236,20 @@ scylla_tests = [
    'tests/database_test',
    'tests/nonwrapping_range_test',
    'tests/input_stream_test',
+    'tests/sstable_atomic_deletion_test',
+    'tests/virtual_reader_test',
+    'tests/view_schema_test',
+    'tests/counter_test',
+    'tests/cell_locker_test',
+    'tests/streaming_histogram_test',
+    'tests/duration_test',
+    'tests/vint_serialization_test',
+    'tests/compress_test',
+    'tests/chunked_vector_test',
+    'tests/loading_cache_test',
+    'tests/castas_fcts_test',
+    'tests/big_decimal_test',
+    'tests/aggregate_fcts_test',
 ]

 apps = [
@@ -252,6 +280,8 @@ arg_parser.add_argument('--ldflags', action = 'store', dest = 'user_ldflags', de
                        help = 'Extra flags for the linker')
 arg_parser.add_argument('--compiler', action = 'store', dest = 'cxx', default = 'g++',
                        help = 'C++ compiler path')
+arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='gcc',
+                        help='C compiler path')
 arg_parser.add_argument('--with-osv', action = 'store', dest = 'with_osv', default = '',
                        help = 'Shortcut for compile for OSv')
 arg_parser.add_argument('--enable-dpdk', action = 'store_true', dest = 'dpdk', default = False,
@@ -263,13 +293,19 @@ arg_parser.add_argument('--debuginfo', action = 'store', dest = 'debuginfo', typ
 arg_parser.add_argument('--static-stdc++', dest = 'staticcxx', action = 'store_true',
 			help = 'Link libgcc and libstdc++ statically')
 arg_parser.add_argument('--static-thrift', dest = 'staticthrift', action = 'store_true',
-			help = 'Link libthrift statically')
+            help = 'Link libthrift statically')
+arg_parser.add_argument('--static-boost', dest = 'staticboost', action = 'store_true',
+            help = 'Link boost statically')
 arg_parser.add_argument('--tests-debuginfo', action = 'store', dest = 'tests_debuginfo', type = int, default = 0,
                        help = 'Enable(1)/disable(0)compiler debug information generation for tests')
 arg_parser.add_argument('--python', action = 'store', dest = 'python', default = 'python3',
                        help = 'Python3 path')
 add_tristate(arg_parser, name = 'hwloc', dest = 'hwloc', help = 'hwloc support')
 add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
+arg_parser.add_argument('--enable-gcc6-concepts', dest='gcc6_concepts', action='store_true', default=False,
+                        help='enable experimental support for C++ Concepts as implemented in GCC 6')
+arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_injector', action='store_true', default=False,
+                        help='enable allocation failure injection')
 args = arg_parser.parse_args()

 defines = []
@@ -292,23 +328,26 @@ scylla_core = (['database.cc',
                 'memtable.cc',
                 'schema_mutations.cc',
                 'release.cc',
+                 'supervisor.cc',
                 'utils/logalloc.cc',
                 'utils/large_bitset.cc',
                 'mutation_partition.cc',
                 'mutation_partition_view.cc',
                 'mutation_partition_serializer.cc',
                 'mutation_reader.cc',
+                 'flat_mutation_reader.cc',
                 'mutation_query.cc',
-                 'key_reader.cc',
                 'keys.cc',
+                 'counters.cc',
                 'sstables/sstables.cc',
                 'sstables/compress.cc',
                 'sstables/row.cc',
                 'sstables/partition.cc',
-                 'sstables/filter.cc',
                 'sstables/compaction.cc',
                 'sstables/compaction_strategy.cc',
                 'sstables/compaction_manager.cc',
+                 'sstables/atomic_deletion.cc',
+                 'sstables/integrity_checked_file_impl.cc',
                 'transport/event.cc',
                 'transport/event_notifier.cc',
                 'transport/server.cc',
@@ -323,15 +362,19 @@ scylla_core = (['database.cc',
                 'cql3/sets.cc',
                 'cql3/maps.cc',
                 'cql3/functions/functions.cc',
+                 'cql3/functions/castas_fcts.cc',
                 'cql3/statements/cf_prop_defs.cc',
                 'cql3/statements/cf_statement.cc',
                 'cql3/statements/authentication_statement.cc',
                 'cql3/statements/create_keyspace_statement.cc',
                 'cql3/statements/create_table_statement.cc',
+                 'cql3/statements/create_view_statement.cc',
                 'cql3/statements/create_type_statement.cc',
                 'cql3/statements/create_user_statement.cc',
+                 'cql3/statements/drop_index_statement.cc',
                 'cql3/statements/drop_keyspace_statement.cc',
                 'cql3/statements/drop_table_statement.cc',
+                 'cql3/statements/drop_view_statement.cc',
                 'cql3/statements/drop_type_statement.cc',
                 'cql3/statements/schema_altering_statement.cc',
                 'cql3/statements/ks_prop_defs.cc',
@@ -348,6 +391,7 @@ scylla_core = (['database.cc',
                 'cql3/statements/create_index_statement.cc',
                 'cql3/statements/truncate_statement.cc',
                 'cql3/statements/alter_table_statement.cc',
+                 'cql3/statements/alter_view_statement.cc',
                 'cql3/statements/alter_user_statement.cc',
                 'cql3/statements/drop_user_statement.cc',
                 'cql3/statements/list_users_statement.cc',
@@ -393,16 +437,22 @@ scylla_core = (['database.cc',
                 'cql3/selection/selector.cc',
                 'cql3/restrictions/statement_restrictions.cc',
                 'cql3/result_set.cc',
+                 'cql3/variable_specifications.cc',
                 'db/consistency_level.cc',
                 'db/system_keyspace.cc',
                 'db/schema_tables.cc',
+                 'db/cql_type_parser.cc',
+                 'db/legacy_schema_migrator.cc',
                 'db/commitlog/commitlog.cc',
                 'db/commitlog/commitlog_replayer.cc',
                 'db/commitlog/commitlog_entry.cc',
                 'db/config.cc',
+                 'db/heat_load_balance.cc',
                 'db/index/secondary_index.cc',
                 'db/marshal/type_parser.cc',
                 'db/batchlog_manager.cc',
+                 'db/view/view.cc',
+                 'index/secondary_index_manager.cc',
                 'io/io.cc',
                 'utils/utils.cc',
                 'utils/UUID_gen.cc',
@@ -414,6 +464,7 @@ scylla_core = (['database.cc',
                 'utils/dynamic_bitset.cc',
                 'utils/managed_bytes.cc',
                 'utils/exceptions.cc',
+                 'utils/config_file.cc',
                 'gms/version_generator.cc',
                 'gms/versioned_value.cc',
                 'gms/gossiper.cc',
@@ -423,6 +474,7 @@ scylla_core = (['database.cc',
                 'gms/gossip_digest_ack2.cc',
                 'gms/endpoint_state.cc',
                 'gms/application_state.cc',
+                 'gms/inet_address.cc',
                 'dht/i_partitioner.cc',
                 'dht/murmur3_partitioner.cc',
                 'dht/byte_ordered_partitioner.cc',
@@ -450,7 +502,7 @@ scylla_core = (['database.cc',
                 'service/client_state.cc',
                 'service/migration_task.cc',
                 'service/storage_service.cc',
-                 'service/load_broadcaster.cc',
+                 'service/misc_services.cc',
                 'service/pager/paging_state.cc',
                 'service/pager/query_pagers.cc',
                 'streaming/stream_task.cc',
@@ -466,26 +518,33 @@ scylla_core = (['database.cc',
                 'streaming/stream_manager.cc',
                 'streaming/stream_result_future.cc',
                 'streaming/stream_session_state.cc',
-                 'gc_clock.cc',
+                 'clocks-impl.cc',
                 'partition_slice_builder.cc',
                 'init.cc',
+                 'lister.cc',
                 'repair/repair.cc',
                 'exceptions/exceptions.cc',
-                 'dns.cc',
-                 'auth/auth.cc',
+                 'auth/allow_all_authenticator.cc',
+                 'auth/allow_all_authorizer.cc',
                 'auth/authenticated_user.cc',
                 'auth/authenticator.cc',
-                 'auth/authorizer.cc',
+                 'auth/common.cc',
                 'auth/default_authorizer.cc',
                 'auth/data_resource.cc',
                 'auth/password_authenticator.cc',
                 'auth/permission.cc',
+                 'auth/permissions_cache.cc',
+                 'auth/service.cc',
+                 'auth/transitional.cc',
                 'tracing/tracing.cc',
                 'tracing/trace_keyspace_helper.cc',
                 'tracing/trace_state.cc',
+                 'table_helper.cc',
                 'range_tombstone.cc',
                 'range_tombstone_list.cc',
-                 'db/size_estimates_recorder.cc'
+                 'disk-error-handler.cc',
+                 'duration.cc',
+                 'vint-serialization.cc',
                 ]
                + [Antlr3Grammar('cql3/Cql.g')]
                + [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -546,6 +605,8 @@ idls = ['idl/gossip_digest.idl.hh',
        'idl/idl_test.idl.hh',
        'idl/commitlog.idl.hh',
        'idl/tracing.idl.hh',
+        'idl/consistency_level.idl.hh',
+        'idl/cache_temperature.idl.hh',
        ]

 scylla_tests_dependencies = scylla_core + api + idls + [
@@ -564,63 +625,92 @@ deps = {
    'scylla': idls + ['main.cc'] + scylla_core + api,
 }

-tests_not_using_seastar_test_framework = set([
-    'tests/keys_test',
+pure_boost_tests = set([
    'tests/partitioner_test',
    'tests/map_difference_test',
+    'tests/keys_test',
+    'tests/compound_test',
+    'tests/range_tombstone_list_test',
+    'tests/anchorless_list_test',
+    'tests/nonwrapping_range_test',
+    'tests/test-serialization',
+    'tests/range_test',
+    'tests/crc_test',
+    'tests/managed_vector_test',
+    'tests/dynamic_bitset_test',
+    'tests/idl_test',
+    'tests/cartesian_product_test',
+    'tests/streaming_histogram_test',
+    'tests/duration_test',
+    'tests/vint_serialization_test',
+    'tests/compress_test',
+    'tests/chunked_vector_test',
+    'tests/big_decimal_test',
+])
+
+tests_not_using_seastar_test_framework = set([
    'tests/perf/perf_mutation',
    'tests/lsa_async_eviction_test',
    'tests/lsa_sync_eviction_test',
    'tests/row_cache_alloc_stress',
    'tests/perf_row_cache_update',
-    'tests/cartesian_product_test',
    'tests/perf/perf_hash',
    'tests/perf/perf_cql_parser',
    'tests/message',
    'tests/perf/perf_simple_query',
+    'tests/perf/perf_fast_forward',
+    'tests/perf/perf_cache_eviction',
+    'tests/row_cache_stress_test',
    'tests/memory_footprint',
-    'tests/test-serialization',
    'tests/gossip',
-    'tests/compound_test',
-    'tests/range_test',
-    'tests/crc_test',
    'tests/perf/perf_sstable',
-    'tests/managed_vector_test',
-    'tests/dynamic_bitset_test',
-    'tests/idl_test',
-    'tests/range_tombstone_list_test',
-    'tests/anchorless_list_test',
-    'tests/nonwrapping_range_test',
-])
+]) | pure_boost_tests

 for t in tests_not_using_seastar_test_framework:
    if not t in scylla_tests:
        raise Exception("Test %s not found in scylla_tests" % (t))

 for t in scylla_tests:
-    deps[t] = scylla_tests_dependencies + [t + '.cc']
+    deps[t] = [t + '.cc']
    if t not in tests_not_using_seastar_test_framework:
+        deps[t] += scylla_tests_dependencies 
        deps[t] += scylla_tests_seastar_deps
+    else:
+        deps[t] += scylla_core + api + idls + ['tests/cql_test_env.cc']

-deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc']
+deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc', 'tests/sstable_utils.cc']
+deps['tests/mutation_reader_test'] += ['tests/sstable_utils.cc']

-deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc']
+deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['tests/input_stream_test'] = ['tests/input_stream_test.cc']
-deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc']
+deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc', 'utils/uuid.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['tests/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'tests/murmur_hash_test.cc']
 deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['tests/log_heap_test'] = ['tests/log_heap_test.cc']
 deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']

 warnings = [
    '-Wno-mismatched-tags',  # clang-only
    '-Wno-maybe-uninitialized', # false positives on gcc 5
+    '-Wno-tautological-compare',
+    '-Wno-parentheses-equality',
+    '-Wno-c++11-narrowing',
+    '-Wno-c++1z-extensions',
+    '-Wno-sometimes-uninitialized',
+    '-Wno-return-stack-address',
+    '-Wno-missing-braces',
+    '-Wno-unused-lambda-capture',
+    '-Wno-misleading-indentation',
+    '-Wno-overflow',
+    '-Wno-noexcept-type',
+    '-Wno-nonnull-compare'
    ]

 warnings = [w
            for w in warnings
            if warning_supported(warning = w, compiler = args.cxx)]

-warnings = ' '.join(warnings)
+warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])

 dbgflag = debug_flag(args.cxx) if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
@@ -674,6 +764,9 @@ if not try_compile(compiler=args.cxx, source='''\
    print('Installed boost version too old.  Please update {}.'.format(pkgname("boost-devel")))
    sys.exit(1)

+
+has_sanitize_address_use_after_scope = try_compile(compiler=args.cxx, flags=['-fsanitize-address-use-after-scope'], source='int f() {}')
+
 defines = ' '.join(['-D' + d for d in defines])

 globals().update(vars(args))
@@ -696,7 +789,7 @@ scylla_release = file.read().strip()

 extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""

-seastar_flags = ['--disable-xen']
+seastar_flags = []
 if args.dpdk:
    # fake dependencies on dpdk, so that it is built before anything else
    seastar_flags += ['--enable-dpdk']
@@ -704,9 +797,16 @@ elif args.dpdk_target:
    seastar_flags += ['--dpdk-target', args.dpdk_target]
 if args.staticcxx:
    seastar_flags += ['--static-stdc++']
+if args.staticboost:
+    seastar_flags += ['--static-boost']
+if args.gcc6_concepts:
+    seastar_flags += ['--enable-gcc6-concepts']
+if args.alloc_failure_injector:
+    seastar_flags += ['--enable-alloc-failure-injector']

 seastar_cflags = args.user_cflags + " -march=nehalem"
-seastar_flags += ['--compiler', args.cxx, '--cflags=%s' % (seastar_cflags)]
+seastar_ldflags = args.user_ldflags
+seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' %(seastar_ldflags)]

 status = subprocess.call([python, './configure.py'] + seastar_flags, cwd = 'seastar')

@@ -737,7 +837,14 @@ for mode in build_modes:
 seastar_deps = 'practically_anything_can_change_so_lets_run_it_every_time_and_restat.'

 args.user_cflags += " " + pkg_config("--cflags", "jsoncpp")
-libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt' + ' -lboost_date_time'
+libs = ' '.join(['-lyaml-cpp', '-llz4', '-lz', '-lsnappy', pkg_config("--libs", "jsoncpp"),
+                 maybe_static(args.staticboost, '-lboost_filesystem'), ' -lcrypt',
+                 maybe_static(args.staticboost, '-lboost_date_time'),
+                ])
+
+if not args.staticboost:
+    args.user_cflags += ' -DBOOST_TEST_DYN_LINK'
+
 for pkg in pkgs:
    args.user_cflags += ' ' + pkg_config('--cflags', pkg)
    libs += ' ' + pkg_config('--libs', pkg)
@@ -763,10 +870,12 @@ with open(buildfile, 'w') as f:
        builddir = {outdir}
        cxx = {cxx}
        cxxflags = {user_cflags} {warnings} {defines}
-        ldflags = {user_ldflags}
+        ldflags = -fuse-ld=gold {user_ldflags}
        libs = {libs}
        pool link_pool
            depth = {link_pool_depth}
+        pool seastar_pool
+            depth = 1
        rule ragel
            command = ragel -G2 -o $out $in
            description = RAGEL $out
@@ -792,7 +901,7 @@ with open(buildfile, 'w') as f:
        f.write(textwrap.dedent('''\
            cxxflags_{mode} = -I. -I $builddir/{mode}/gen -I seastar -I seastar/build/{mode}/gen
            rule cxx.{mode}
-              command = $cxx -MMD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} -c -o $out $in
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
              description = CXX $out
              depfile = $out.d
            rule link.{mode}
@@ -810,7 +919,17 @@ with open(buildfile, 'w') as f:
                command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
                description = THRIFT $in
            rule antlr3.{mode}
-                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in && antlr3 $builddir/{mode}/gen/$in && sed -i 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' build/{mode}/gen/${{stem}}Parser.cpp
+                # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
+                # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
+                # name, we also add a global typedef to avoid compilation errors. 
+                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
+                     && antlr3 $builddir/{mode}/gen/$in $
+                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
+                        -e '1i using ExceptionBaseType = int;' $
+                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
+                            s/ExceptionBaseType\* ex = new/ex = new/; $
+                            s/exceptions::syntax_exception e/exceptions::syntax_exception\& e/' $
+                        build/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            ''').format(mode = mode, **modeval))
        f.write('build {mode}: phony {artifacts}\n'.format(mode = mode,
@@ -835,22 +954,15 @@ with open(buildfile, 'w') as f:
                    objs += dep.objects('$builddir/' + mode + '/gen')
                if isinstance(dep, Antlr3Grammar):
                    objs += dep.objects('$builddir/' + mode + '/gen')
-            if binary.endswith('.pc'):
-                vars = modeval.copy()
-                vars.update(globals())
-                pc = textwrap.dedent('''\
-                        Name: Seastar
-                        URL: http://seastar-project.org/
-                        Description: Advanced C++ framework for high-performance server applications on modern hardware.
-                        Version: 1.0
-                        Libs: -L{srcdir}/{builddir} -Wl,--whole-archive -lseastar -Wl,--no-whole-archive {dbgflag} -Wl,--no-as-needed {static} {pie} -fvisibility=hidden -pthread {user_ldflags} {libs} {sanitize_libs}
-                        Cflags: -std=gnu++1y {dbgflag} {fpie} -Wall -Werror -fvisibility=hidden -pthread -I{srcdir} -I{srcdir}/{builddir}/gen {user_cflags} {warnings} {defines} {sanitize} {opt}
-                        ''').format(builddir = 'build/' + mode, srcdir = os.getcwd(), **vars)
-                f.write('build $builddir/{}/{}: gen\n  text = {}\n'.format(mode, binary, repr(pc)))
-            elif binary.endswith('.a'):
+            if binary.endswith('.a'):
                f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
            else:
                if binary.startswith('tests/'):
+                    local_libs = '$libs'
+                    if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
+                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework') 
+                    if has_thrift:
+                        local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
                    # Our code's debugging information is huge, and multiplied
                    # by many tests yields ridiculous amounts of disk space.
                    # So we strip the tests by default; The user can very
@@ -858,15 +970,15 @@ with open(buildfile, 'w') as f:
                    # to the test name, e.g., "ninja build/release/testname_g"
                    f.write('build $builddir/{}/{}: {}.{} {} {}\n'.format(mode, binary, tests_link_rule, mode, str.join(' ', objs),
                                                                                     'seastar/build/{}/libseastar.a'.format(mode)))
-                    if has_thrift:
-                        f.write('   libs =  {} -lboost_system $libs\n'.format(thrift_libs))
+                    f.write('   libs = {}\n'.format(local_libs))
                    f.write('build $builddir/{}/{}_g: link.{} {} {}\n'.format(mode, binary, mode, str.join(' ', objs),
                                                                              'seastar/build/{}/libseastar.a'.format(mode)))
+                    f.write('   libs = {}\n'.format(local_libs))
                else:
                    f.write('build $builddir/{}/{}: link.{} {} {}\n'.format(mode, binary, mode, str.join(' ', objs),
                                                                            'seastar/build/{}/libseastar.a'.format(mode)))
-                if has_thrift:
-                    f.write('   libs =  {} -lboost_system $libs\n'.format(thrift_libs))
+                    if has_thrift:
+                        f.write('   libs =  {} {} $libs\n'.format(thrift_libs, maybe_static(args.staticboost, '-lboost_system')))
            for src in srcs:
                if src.endswith('.cc'):
                    obj = '$builddir/' + mode + '/' + src.replace('.cc', '.o')
@@ -905,7 +1017,7 @@ with open(buildfile, 'w') as f:
            f.write('build {}: ragel {}\n'.format(hh, src))
        for hh in swaggers:
            src = swaggers[hh]
-            f.write('build {}: swagger {}\n'.format(hh,src))
+            f.write('build {}: swagger {} | seastar/json/json2code.py\n'.format(hh,src))
        for hh in serializers:
            src = serializers[hh]
            f.write('build {}: serializer {} | idl-compiler.py\n'.format(hh,src))
@@ -922,8 +1034,12 @@ with open(buildfile, 'w') as f:
            for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
                obj = cc.replace('.cpp', '.o')
                f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
+                if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
+                    # Parsers end up using huge amounts of stack space and overflowing their stack 
+                    f.write('  obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
        f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
                .format(**locals()))
+        f.write('  pool = seastar_pool\n')
        f.write('  subdir = seastar\n')
        f.write('  target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune build/{mode}/gen/http/request_parser.hh build/{mode}/gen/http/http_response_parser.hh\n'.format(**locals()))
        f.write(textwrap.dedent('''\
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include "mutation_partition_view.hh"
+#include "mutation_partition.hh"
 #include "schema.hh"

 // Mutation partition visitor which applies visited data into
@@ -37,12 +38,12 @@ private:
    static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
    }
-    void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
        if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {
            dst.apply(new_def, atomic_cell_or_collection(cell));
        }
    }
-    void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
        if (!is_compatible(new_def, old_type, kind)) {
            return;
        }
@@ -94,8 +95,8 @@ public:
        _p.apply_row_tombstone(_p_schema, rt);
    }

-    virtual void accept_row(clustering_key_view key, tombstone deleted_at, const row_marker& rm) override {
-        deletable_row& r = _p.clustered_row(_p_schema, key);
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
+        deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous);
        r.apply(rm);
        r.apply(deleted_at);
        _current_row = &r;
@@ -116,4 +117,14 @@ public:
            accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), collection);
        }
    }
+    // Appends the cell to dst, upgrading it to the new schema: dispatches to the atomic or
+    // collection overload of accept_cell() depending on new_def.is_atomic().
+    // Cells must have monotonic names.
+    static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, const atomic_cell_or_collection& cell) {
+        if (new_def.is_atomic()) {
+            accept_cell(dst, kind, new_def, old_type, cell.as_atomic_cell());
+        } else {
+            accept_cell(dst, kind, new_def, old_type, cell.as_collection_mutation());
+        }
+    }
 };
--- a/counters.cc
+++ b/counters.cc
@@ -0,0 +1,332 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "service/storage_service.hh"
+#include "counters.hh"
+#include "mutation.hh"
+#include "combine.hh"
+// Returns the counter id built from get_local_id() of the local storage service.
+counter_id counter_id::local()
+{
+    return counter_id(service::get_local_storage_service().get_local_id());
+}
+// Legacy ordering (named after 1.7.4): compares the raw signed 64-bit halves, most significant half first.
+bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
+{
+    // The least significant half only breaks ties on the most significant one.
+    const bool same_msb = a._most_significant == b._most_significant;
+    return same_msb
+        ? a._least_significant < b._least_significant
+        : a._most_significant < b._most_significant;
+}
+// Prints a counter id as its UUID.
+std::ostream& operator<<(std::ostream& os, const counter_id& id) {
+    return os << id.to_uuid();
+}
+// Prints a shard as "{global_shard id: <id> value: <value> clock: <logical clock>}".
+std::ostream& operator<<(std::ostream& os, counter_shard_view csv) {
+    return os << "{global_shard id: " << csv.id() << " value: " << csv.value()
+              << " clock: " << csv.logical_clock() << "}";
+}
+// Prints a counter cell: its timestamp followed by its shards joined with ", ".
+std::ostream& operator<<(std::ostream& os, counter_cell_view ccv) {
+    return os << "{counter_cell timestamp: " << ccv.timestamp() << " shards: {" << ::join(", ", ccv.shards()) << "}}";
+}
+// Sorts _shards by id, then folds shards sharing an id into one via counter_shard::apply(), and marks the builder sorted.
+void counter_cell_builder::do_sort_and_remove_duplicates()
+{
+    const auto by_id = [] (auto& lhs, auto& rhs) { return lhs.id() < rhs.id(); };
+    boost::range::sort(_shards, by_id);
+    std::vector<counter_shard> merged;
+    merged.reserve(_shards.size());
+    for (auto& shard : _shards) {
+        if (merged.empty() || merged.back().id() != shard.id()) {
+            merged.emplace_back(shard);
+        } else {
+            merged.back().apply(shard);
+        }
+    }
+    _shards = std::move(merged);
+    _sorted = true;
+}
+// Returns a copy of this cell's shards sorted with counter_id::less_compare_1_7_4 instead of the regular id order.
+std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
+{
+    counter_id::less_compare_1_7_4 legacy_less;
+    // The shards are copied out first, so the view itself is never reordered.
+    auto by_legacy_order = [&legacy_less] (auto& lhs, auto& rhs) { return legacy_less(lhs.id(), rhs.id()); };
+    auto result = boost::copy_range<std::vector<counter_shard>>(shards());
+    boost::range::sort(result, by_legacy_order);
+    return result;
+}
+// Fast path of apply_reversibly(): merges src's shards into dst in place. Returns false, after undoing its swaps, when some src shard id has no match in dst.
+static bool apply_in_place(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
+{
+    auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());
+    auto src_ccmv = counter_cell_mutable_view(src.as_mutable_atomic_cell());
+    auto dst_shards = dst_ccmv.shards();
+    auto src_shards = src_ccmv.shards();
+    // Both ranges are walked forward together. NOTE(review): this presumes shards are ordered by id -- confirm.
+    auto dst_it = dst_shards.begin();
+    auto src_it = src_shards.begin();
+    // For every src shard, locate the dst shard with the same id; dst ends up with the higher logical clock.
+    while (src_it != src_shards.end()) {
+        while (dst_it != dst_shards.end() && dst_it->id() < src_it->id()) {
+            ++dst_it;
+        }
+        if (dst_it == dst_shards.end() || dst_it->id() != src_it->id()) {
+            // Fast-path failed. Revert and fall back to the slow path.
+            if (dst_it == dst_shards.end()) {
+                --dst_it;
+            }
+            while (src_it != src_shards.begin()) {
+                --src_it;
+                while (dst_it->id() != src_it->id()) {
+                    --dst_it;
+                }
+                src_it->swap_value_and_clock(*dst_it);
+            }
+            return false;
+        }
+        if (dst_it->logical_clock() < src_it->logical_clock()) {
+            dst_it->swap_value_and_clock(*src_it);
+        } else {
+            src_it->set_value_and_clock(*dst_it);
+        }
+        ++src_it;
+    }
+    // dst takes the newer timestamp, src keeps dst's old one, and src is flagged; revert_apply() checks that flag.
+    auto dst_ts = dst_ccmv.timestamp();
+    auto src_ts = src_ccmv.timestamp();
+    dst_ccmv.set_timestamp(std::max(dst_ts, src_ts));
+    src_ccmv.set_timestamp(dst_ts);
+    src.as_mutable_atomic_cell().set_counter_in_place_revert(true);
+    return true;
+}
+// Undoes apply_in_place(): swaps value and clock of each src shard with the dst shard of the same id, swaps the timestamps and clears src's revert flag.
+static void revert_in_place_apply(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
+{
+    assert(dst.can_use_mutable_view() && src.can_use_mutable_view());
+    auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());
+    auto src_ccmv = counter_cell_mutable_view(src.as_mutable_atomic_cell());
+    auto dst_shards = dst_ccmv.shards();
+    auto src_shards = src_ccmv.shards();
+    // Same forward walk as apply_in_place(); here a missing dst shard is a programming error (asserted below).
+    auto dst_it = dst_shards.begin();
+    auto src_it = src_shards.begin();
+    // Every src shard must have a dst shard with the same id.
+    while (src_it != src_shards.end()) {
+        while (dst_it != dst_shards.end() && dst_it->id() < src_it->id()) {
+            ++dst_it;
+        }
+        assert(dst_it != dst_shards.end() && dst_it->id() == src_it->id());
+        dst_it->swap_value_and_clock(*src_it);
+        ++src_it;
+    }
+    // Exchange the two timestamps and drop the in-place marker from src.
+    auto dst_ts = dst_ccmv.timestamp();
+    auto src_ts = src_ccmv.timestamp();
+    dst_ccmv.set_timestamp(src_ts);
+    src_ccmv.set_timestamp(dst_ts);
+    src.as_mutable_atomic_cell().set_counter_in_place_revert(false);
+}
+// Merges counter cell src into dst in a way revert_apply() can undo. Returns false only in the dead-cell case where dst is left unchanged.
+bool counter_cell_view::apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
+{
+    auto dst_ac = dst.as_atomic_cell();
+    auto src_ac = src.as_atomic_cell();
+    // At least one cell is dead: swap src into dst if dst is live, or if both are dead and compare_atomic_cell_for_merge(dst, src) < 0.
+    if (!dst_ac.is_live() || !src_ac.is_live()) {
+        if (dst_ac.is_live() || (!src_ac.is_live() && compare_atomic_cell_for_merge(dst_ac, src_ac) < 0)) {
+            std::swap(dst, src);
+            return true;
+        }
+        return false;
+    }
+    // Two counter updates: dst becomes an update carrying the summed values and the newer timestamp; src is not modified.
+    if (dst_ac.is_counter_update() && src_ac.is_counter_update()) {
+        auto src_v = src_ac.counter_update_value();
+        auto dst_v = dst_ac.counter_update_value();
+        dst = atomic_cell::make_live_counter_update(std::max(dst_ac.timestamp(), src_ac.timestamp()),
+                                                    src_v + dst_v);
+        return true;
+    }
+    // Mixing a counter update with a non-update cell is not supported past this point.
+    assert(!dst_ac.is_counter_update());
+    assert(!src_ac.is_counter_update());
+    // Try the in-place fast path when dst has at least as many shards as src and both cells allow mutable views.
+    if (counter_cell_view(dst_ac).shard_count() >= counter_cell_view(src_ac).shard_count()
+        && dst.can_use_mutable_view() && src.can_use_mutable_view()) {
+        if (apply_in_place(dst, src)) {
+            return true;
+        }
+    }
+    // Slow path: clear the in-place flag so that revert_apply() takes its swap branch for this pair.
+    src.as_mutable_atomic_cell().set_counter_in_place_revert(false);
+    auto dst_shards = counter_cell_view(dst_ac).shards();
+    auto src_shards = counter_cell_view(src_ac).shards();
+    // Combine both shard ranges by id; the lambda picks the shard with the higher logical clock (x when equal).
+    counter_cell_builder result;
+    combine(dst_shards.begin(), dst_shards.end(), src_shards.begin(), src_shards.end(),
+            result.inserter(), counter_shard_view::less_compare_by_id(), [] (auto& x, auto& y) {
+                return x.logical_clock() < y.logical_clock() ? y : x;
+            });
+    // dst becomes the merged cell (newer timestamp); src receives dst's previous contents.
+    auto cell = result.build(std::max(dst_ac.timestamp(), src_ac.timestamp()));
+    src = std::exchange(dst, atomic_cell_or_collection(cell));
+    return true;
+}
+// Undoes a successful apply_reversibly(dst, src): subtracts src's delta for counter updates, reverts in place if src is flagged, else swaps back.
+void counter_cell_view::revert_apply(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
+{
+    if (dst.as_atomic_cell().is_counter_update()) {
+        auto src_v = src.as_atomic_cell().counter_update_value();
+        auto dst_v = dst.as_atomic_cell().counter_update_value();
+        // Rebuild dst as a counter update, as apply_reversibly() did; a plain live cell would no longer be is_counter_update().
+        dst = atomic_cell::make_live_counter_update(dst.as_atomic_cell().timestamp(), dst_v - src_v);
+    } else if (src.as_atomic_cell().is_counter_in_place_revert_set()) {
+        revert_in_place_apply(dst, src);
+    } else {
+        std::swap(dst, src);
+    }
+}
+// Returns what cell a has beyond cell b (see the cases below), or an empty optional when a adds nothing; neither cell may be a counter update.
+stdx::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, atomic_cell_view b)
+{
+    assert(!a.is_counter_update());
+    assert(!b.is_counter_update());
+    // A dead cell is involved: a is returned whole if b is live, or if both are dead and compare_atomic_cell_for_merge(b, a) < 0.
+    if (!b.is_live() || !a.is_live()) {
+        if (b.is_live() || (!a.is_live() && compare_atomic_cell_for_merge(b, a) < 0)) {
+            return atomic_cell(a);
+        }
+        return { };
+    }
+    // Both cells are live: compare them shard by shard.
+    auto a_shards = counter_cell_view(a).shards();
+    auto b_shards = counter_cell_view(b).shards();
+    // NOTE(review): the single forward pass below presumes both shard ranges are ordered by id -- confirm.
+    auto a_it = a_shards.begin();
+    auto a_end = a_shards.end();
+    auto b_it = b_shards.begin();
+    auto b_end = b_shards.end();
+    // Keep a's shard unless b has a shard with the same id and an equal or higher logical clock.
+    counter_cell_builder result;
+    while (a_it != a_end) {
+        while (b_it != b_end && (*b_it).id() < (*a_it).id()) {
+            ++b_it;
+        }
+        if (b_it == b_end || (*a_it).id() != (*b_it).id() || (*a_it).logical_clock() > (*b_it).logical_clock()) {
+            result.add_shard(counter_shard(*a_it));
+        }
+        ++a_it;
+    }
+    // With no shard kept, a still yields an empty live cell when only its timestamp is newer than b's.
+    stdx::optional<atomic_cell> diff;
+    if (!result.empty()) {
+        diff = result.build(std::max(a.timestamp(), b.timestamp()));
+    } else if (a.timestamp() > b.timestamp()) {
+        diff = atomic_cell::make_live(a.timestamp(), bytes_view());
+    }
+    return diff;
+}
+
+// Rewrites every live counter-update cell of m into a cell with a single local shard, continuing from the local shard found in current_state if any.
+void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
+    // FIXME: allow current_state to be frozen_mutation
+    // Row without prior state: each live cell becomes a new local shard with the update's value and logical clock clock_offset + 1.
+    auto transform_new_row_to_shards = [clock_offset] (auto& cells) {
+        cells.for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
+            auto acv = ac_o_c.as_atomic_cell();
+            if (!acv.is_live()) {
+                return; // continue -- we are in lambda
+            }
+            auto delta = acv.counter_update_value();
+            auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+            ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
+        });
+    };
+    // Without any current state, the static row and every clustered row are treated as new.
+    if (!current_state) {
+        transform_new_row_to_shards(m.partition().static_row());
+        for (auto& cr : m.partition().clustered_rows()) {
+            transform_new_row_to_shards(cr.row().cells());
+        }
+        return;
+    }
+
+    clustering_key::less_compare cmp(*m.schema());
+    // Row with prior state: gather the local shard of each live state cell, then rewrite transformee's live cells column by column.
+    auto transform_row_to_shards = [clock_offset] (auto& transformee, auto& state) {
+        std::deque<std::pair<column_id, counter_shard>> shards;
+        state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
+            auto acv = ac_o_c.as_atomic_cell();
+            if (!acv.is_live()) {
+                return; // continue -- we are in lambda
+            }
+            counter_cell_view ccv(acv);
+            auto cs = ccv.local_shard();
+            if (!cs) {
+                return; // continue
+            }
+            shards.emplace_back(std::make_pair(id, counter_shard(*cs)));
+        });
+        // NOTE(review): matching through shards.front() presumes for_each_cell() visits columns in ascending id order -- confirm.
+        transformee.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
+            auto acv = ac_o_c.as_atomic_cell();
+            if (!acv.is_live()) {
+                return; // continue -- we are in lambda
+            }
+            while (!shards.empty() && shards.front().first < id) {
+                shards.pop_front();
+            }
+
+            auto delta = acv.counter_update_value();
+            // No collected shard for this column: start a new one; otherwise update() the collected shard with the delta.
+            if (shards.empty() || shards.front().first > id) {
+                auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+                ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
+            } else {
+                auto& cs = shards.front().second;
+                cs.update(delta, clock_offset + 1);
+                ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
+                shards.pop_front();
+            }
+        });
+    };
+
+    transform_row_to_shards(m.partition().static_row(), current_state->partition().static_row());
+    // Pair each clustered row of m with the current_state row of equal key (per cmp); rows without a match are treated as new.
+    auto& cstate = current_state->partition();
+    auto it = cstate.clustered_rows().begin();
+    auto end = cstate.clustered_rows().end();
+    for (auto& cr : m.partition().clustered_rows()) {
+        while (it != end && cmp(it->key(), cr.key())) {
+            ++it;
+        }
+        if (it == end || cmp(cr.key(), it->key())) {
+            transform_new_row_to_shards(cr.row().cells());
+            continue;
+        }
+
+        transform_row_to_shards(cr.row().cells(), it->row().cells());
+    }
+}
--- a/counters.hh
+++ b/counters.hh
@@ -0,0 +1,435 @@
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <boost/range/algorithm/find_if.hpp>
+
+#include "atomic_cell_or_collection.hh"
+#include "types.hh"
+
+#include "stdx.hh"
+
+class mutation;
+
+class mutation;
+
+// Identifier used for counter shards (see counter_shard below).
+// Stored as the two signed 64-bit halves of a utils::UUID.
+class counter_id {
+    int64_t _least_significant;
+    int64_t _most_significant;
+public:
+    static_assert(std::is_same<decltype(std::declval<utils::UUID>().get_least_significant_bits()), int64_t>::value
+            &&  std::is_same<decltype(std::declval<utils::UUID>().get_most_significant_bits()), int64_t>::value,
+        "utils::UUID is expected to work with two signed 64-bit integers");
+
+    // Defaulted (keeps the type a POD): default-initialization leaves both halves
+    // indeterminate.
+    counter_id() = default;
+
+    // Captures both halves of 'uuid'.
+    explicit counter_id(utils::UUID uuid) noexcept
+        : _least_significant(uuid.get_least_significant_bits())
+        , _most_significant(uuid.get_most_significant_bits()) { }
+
+    // Rebuilds the UUID from the stored halves (most significant half first).
+    utils::UUID to_uuid() const {
+        return utils::UUID(_most_significant, _least_significant);
+    }
+
+    // Ordering and equality are delegated to the utils::UUID operators.
+    bool operator<(const counter_id& other) const { return to_uuid() < other.to_uuid(); }
+    bool operator>(const counter_id& other) const { return other < *this; }
+    bool operator==(const counter_id& other) const { return to_uuid() == other.to_uuid(); }
+    bool operator!=(const counter_id& other) const { return !operator==(other); }
+
+    // (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
+    struct less_compare_1_7_4 {
+        bool operator()(const counter_id& a, const counter_id& b) const;
+    };
+
+    // Declared only; the definition is not part of this header.
+    static counter_id local();
+
+    // Test helper: wraps a freshly generated random UUID.
+    static counter_id generate_random() {
+        return counter_id(utils::make_random_uuid());
+    }
+};
+static_assert(std::is_pod<counter_id>::value, "counter_id should be a POD type");
+
+std::ostream& operator<<(std::ostream& os, const counter_id& id);
+
+template<typename View> // View supplies ::pointer and ::value_type; bytes_view is used for the counter_shard_view alias below
+class basic_counter_shard_view { // non-owning view of one serialized shard: <counter_id><value><logical_clock>
+    enum class offset : unsigned { // byte offset of each field from the start of the shard
+        id = 0u,
+        value = unsigned(id) + sizeof(counter_id),
+        logical_clock = unsigned(value) + sizeof(int64_t),
+        total_size = unsigned(logical_clock) + sizeof(int64_t), // one past the last field, i.e. the size of a shard
+    };
+private:
+    typename View::pointer _base; // first byte of the shard; not set by the defaulted constructor
+private:
+    template<typename T>
+    T read(offset off) const { // copies sizeof(T) bytes starting at 'off' into a fresh T
+        T value;
+        std::copy_n(_base + static_cast<unsigned>(off), sizeof(T), reinterpret_cast<signed char*>(&value));
+        return value;
+    }
+public:
+    static constexpr auto size = size_t(offset::total_size); // serialized size of one shard, in bytes
+public:
+    basic_counter_shard_view() = default;
+    explicit basic_counter_shard_view(typename View::pointer ptr) noexcept
+        : _base(ptr) { }
+
+    counter_id id() const { return read<counter_id>(offset::id); }
+    int64_t value() const { return read<int64_t>(offset::value); }
+    int64_t logical_clock() const { return read<int64_t>(offset::logical_clock); }
+
+    void swap_value_and_clock(basic_counter_shard_view& other) noexcept { // exchanges the value+logical_clock bytes of both shards; ids are untouched
+        static constexpr size_t off = size_t(offset::value);
+        static constexpr size_t size = size_t(offset::total_size) - off; // bytes covered by value + logical_clock (shadows the class-level 'size')
+
+        typename View::value_type tmp[size];
+        std::copy_n(_base + off, size, tmp);
+        std::copy_n(other._base + off, size, _base + off); // writes through _base -- presumably only usable when View::pointer is non-const; confirm
+        std::copy_n(tmp, size, other._base + off);
+    }
+
+    void set_value_and_clock(const basic_counter_shard_view& other) noexcept { // overwrites this shard's value+logical_clock bytes with other's
+        static constexpr size_t off = size_t(offset::value);
+        static constexpr size_t size = size_t(offset::total_size) - off;
+        std::copy_n(other._base + off, size, _base + off);
+    }
+
+    bool operator==(const basic_counter_shard_view& other) const { // field-wise: id, value and logical clock must all match
+        return id() == other.id() && value() == other.value()
+               && logical_clock() == other.logical_clock();
+    }
+    bool operator!=(const basic_counter_shard_view& other) const {
+        return !(*this == other);
+    }
+
+    struct less_compare_by_id { // orders shard views by id() only
+        bool operator()(const basic_counter_shard_view& x, const basic_counter_shard_view& y) const {
+            return x.id() < y.id();
+        }
+    };
+};
+
+using counter_shard_view = basic_counter_shard_view<bytes_view>;
+
+std::ostream& operator<<(std::ostream& os, counter_shard_view csv);
+
+class counter_shard { // owning copy of one shard: id, value and logical clock
+    counter_id _id;
+    int64_t _value;
+    int64_t _logical_clock;
+private:
+    template<typename T>
+    static void write(const T& value, bytes::iterator& out) { // copies the raw bytes of 'value' to 'out' and advances 'out' past them
+        out = std::copy_n(reinterpret_cast<const signed char*>(&value), sizeof(T), out);
+    }
+private:
+    // Shared logic for applying counter_shards and counter_shard_views.
+    // T is either counter_shard or basic_counter_shard_view<U>.
+    template<typename T>
+    GCC6_CONCEPT(requires requires(T shard) {
+        { shard.value() } -> int64_t;
+        { shard.logical_clock() } -> int64_t;
+    })
+    counter_shard& do_apply(T&& other) noexcept { // adopts other's (value, clock) only when other's clock is strictly greater
+        auto other_clock = other.logical_clock();
+        if (_logical_clock < other_clock) {
+            _logical_clock = other_clock;
+            _value = other.value();
+        }
+        return *this; // the id is never changed and other's id is not inspected
+    }
+public:
+    counter_shard(counter_id id, int64_t value, int64_t logical_clock) noexcept
+        : _id(id)
+        , _value(value)
+        , _logical_clock(logical_clock)
+    { }
+
+    explicit counter_shard(counter_shard_view csv) noexcept // copies all three fields out of the serialized view
+        : _id(csv.id())
+        , _value(csv.value())
+        , _logical_clock(csv.logical_clock())
+    { }
+
+    counter_id id() const { return _id; }
+    int64_t value() const { return _value; }
+    int64_t logical_clock() const { return _logical_clock; }
+
+    counter_shard& update(int64_t value_delta, int64_t clock_increment) noexcept { // adds both deltas in place and returns *this
+        _value += value_delta;
+        _logical_clock += clock_increment;
+        return *this;
+    }
+
+    counter_shard& apply(counter_shard_view other) noexcept { // see do_apply()
+        return do_apply(other);
+    }
+
+    counter_shard& apply(const counter_shard& other) noexcept { // see do_apply()
+        return do_apply(other);
+    }
+
+    static size_t serialized_size() { // same as the size of one serialized shard view
+        return counter_shard_view::size;
+    }
+    void serialize(bytes::iterator& out) const { // writes id, value, logical clock in that order, advancing 'out'
+        write(_id, out);
+        write(_value, out);
+        write(_logical_clock, out);
+    }
+};
+
+class counter_cell_builder { // collects counter_shards and serializes them as the value of a live atomic_cell
+    std::vector<counter_shard> _shards;
+    bool _sorted = true; // cleared by add_maybe_unsorted_shard() when a new id is not greater than its predecessor's
+private:
+    void do_sort_and_remove_duplicates(); // declared only; defined outside this header
+public:
+    counter_cell_builder() = default;
+    counter_cell_builder(size_t shard_count) { // reserves room for shard_count shards
+        _shards.reserve(shard_count);
+    }
+
+    void add_shard(const counter_shard& cs) { // appends without looking at ordering; _sorted is left as it was
+        _shards.emplace_back(cs);
+    }
+
+    void add_maybe_unsorted_shard(const counter_shard& cs) { // appends and tracks whether ids are still strictly increasing
+        add_shard(cs);
+        if (_sorted && _shards.size() > 1) {
+            auto current = _shards.rbegin();
+            auto previous = std::next(current); // reverse iterator: this is the element added just before 'cs'
+            _sorted = current->id() > previous->id();
+        }
+    }
+
+    void sort_and_remove_duplicates() { // does nothing while _sorted is still true
+        if (!_sorted) {
+            do_sort_and_remove_duplicates();
+        }
+    }
+
+    size_t serialized_size() const { // total bytes serialize() will write
+        return _shards.size() * counter_shard::serialized_size();
+    }
+    void serialize(bytes::iterator& out) const { // writes every shard in stored order, advancing 'out'
+        for (auto&& cs : _shards) {
+            cs.serialize(out);
+        }
+    }
+
+    bool empty() const {
+        return _shards.empty();
+    }
+
+    atomic_cell build(api::timestamp_type timestamp) const { // live cell whose value is all stored shards, serialized back to back
+        return atomic_cell::make_live_from_serializer(timestamp, serialized_size(), [this] (bytes::iterator out) {
+            serialize(out);
+        });
+    }
+
+    static atomic_cell from_single_shard(api::timestamp_type timestamp, const counter_shard& cs) { // live cell holding exactly one shard; no builder instance needed
+        return atomic_cell::make_live_from_serializer(timestamp, counter_shard::serialized_size(), [&cs] (bytes::iterator out) {
+            cs.serialize(out);
+        });
+    }
+
+    class inserter_iterator : public std::iterator<std::output_iterator_tag, counter_shard> { // output iterator: assigning a shard appends it to the builder
+        counter_cell_builder* _builder;
+    public:
+        explicit inserter_iterator(counter_cell_builder& b) : _builder(&b) { }
+        inserter_iterator& operator=(const counter_shard& cs) {
+            _builder->add_shard(cs); // uses add_shard(), so the builder's _sorted flag is not updated here
+            return *this;
+        }
+        inserter_iterator& operator=(const counter_shard_view& csv) {
+            return operator=(counter_shard(csv));
+        }
+        inserter_iterator& operator++() { return *this; } // increment and dereference are no-ops returning the iterator itself
+        inserter_iterator& operator++(int) { return *this; }
+        inserter_iterator& operator*() { return *this; };
+    };
+
+    inserter_iterator inserter() {
+        return inserter_iterator(*this);
+    }
+};
+
+// <counter_id>   := <int64_t><int64_t>
+// <shard>        := <counter_id><int64_t:value><int64_t:logical_clock>
+// <counter_cell> := <shard>*
+template<typename View>
+class basic_counter_cell_view { // view of a live counter cell whose value is a run of serialized shards
+protected:
+    atomic_cell_base<View> _cell;
+private:
+    class shard_iterator : public std::iterator<std::input_iterator_tag, basic_counter_shard_view<View>> { // walks the value one shard at a time
+        typename View::pointer _current;
+        basic_counter_shard_view<View> _current_view; // view at _current; this is what * and -> hand out
+    public:
+        shard_iterator() = default;
+        shard_iterator(typename View::pointer ptr) noexcept
+            : _current(ptr), _current_view(ptr) { }
+
+        basic_counter_shard_view<View>& operator*() noexcept {
+            return _current_view;
+        }
+        basic_counter_shard_view<View>* operator->() noexcept {
+            return &_current_view;
+        }
+        shard_iterator& operator++() noexcept { // advances by one serialized shard
+            _current += counter_shard_view::size;
+            _current_view = basic_counter_shard_view<View>(_current);
+            return *this;
+        }
+        shard_iterator operator++(int) noexcept {
+            auto it = *this;
+            operator++();
+            return it;
+        }
+        shard_iterator& operator--() noexcept { // steps back by one serialized shard
+            _current -= counter_shard_view::size;
+            _current_view = basic_counter_shard_view<View>(_current);
+            return *this;
+        }
+        shard_iterator operator--(int) noexcept {
+            auto it = *this;
+            operator--();
+            return it;
+        }
+        bool operator==(const shard_iterator& other) const noexcept { // compares positions only
+            return _current == other._current;
+        }
+        bool operator!=(const shard_iterator& other) const noexcept {
+            return !(*this == other);
+        }
+    };
+public:
+    boost::iterator_range<shard_iterator> shards() const { // range spanning the whole cell value
+        auto bv = _cell.value();
+        auto begin = shard_iterator(bv.data());
+        auto end = shard_iterator(bv.data() + bv.size());
+        return boost::make_iterator_range(begin, end);
+    }
+
+    size_t shard_count() const { // value size divided by the size of one shard (integer division)
+        return _cell.value().size() / counter_shard_view::size;
+    }
+public:
+    // ac must be a live counter cell
+    explicit basic_counter_cell_view(atomic_cell_base<View> ac) noexcept : _cell(ac) {
+        assert(_cell.is_live());
+        assert(!_cell.is_counter_update()); // counter updates are rejected here as well
+    }
+
+    api::timestamp_type timestamp() const { return _cell.timestamp(); }
+
+    static data_type total_value_type() { return long_type; }
+
+    int64_t total_value() const { // sum of value() over all shards
+        return boost::accumulate(shards(), int64_t(0), [] (int64_t v, counter_shard_view cs) { // NOTE(review): parameter is the bytes_view flavour; for other View types this needs a conversion -- confirm
+            return v + cs.value();
+        });
+    }
+
+    stdx::optional<counter_shard_view> get_shard(const counter_id& id) const { // first shard whose id equals 'id', or an empty optional
+        auto it = boost::range::find_if(shards(), [&id] (counter_shard_view csv) {
+            return csv.id() == id;
+        });
+        if (it == shards().end()) { // shards() is evaluated a second time; iterators compare by position
+            return { };
+        }
+        return *it;
+    }
+
+    stdx::optional<counter_shard_view> local_shard() const { // get_shard() for counter_id::local()
+        // TODO: consider caching local shard position
+        return get_shard(counter_id::local());
+    }
+
+    bool operator==(const basic_counter_cell_view& other) const { // equal timestamps and element-wise equal shard sequences
+        return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
+    }
+};
+
+struct counter_cell_view : basic_counter_cell_view<bytes_view> { // read-only flavour; adds operations declared here and defined elsewhere
+    using basic_counter_cell_view::basic_counter_cell_view;
+
+    // Returns counter shards in an order that is compatible with Scylla 1.7.4.
+    std::vector<counter_shard> shards_compatible_with_1_7_4() const;
+
+    // Reversibly applies two counter cells, at least one of them must be live.
+    // Returns true iff dst was modified.
+    static bool apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);
+
+    // Reverts apply performed by apply_reversibly().
+    static void revert_apply(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);
+
+    // Computes a counter cell containing minimal amount of data which, when
+    // applied to 'b' returns the same cell as 'a' and 'b' applied together.
+    static stdx::optional<atomic_cell> difference(atomic_cell_view a, atomic_cell_view b);
+
+    friend std::ostream& operator<<(std::ostream& os, counter_cell_view ccv);
+};
+
+// Counter cell view over mutable bytes; additionally lets the caller rewrite the timestamp.
+struct counter_cell_mutable_view : basic_counter_cell_view<bytes_mutable_view> {
+    using basic_counter_cell_view::basic_counter_cell_view;
+    void set_timestamp(api::timestamp_type new_timestamp) { this->_cell.set_timestamp(new_timestamp); }
+};
+
+// Transforms mutation dst, in place, from counter updates to counter shards using state
+// stored in current_state. If current_state is present it has to be in the same schema as dst.
+// NOTE(review): the definition appears to use clock_offset + 1 as the logical clock of a newly created local shard and as the clock increment of an existing one -- confirm in counters.cc.
+void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset);
+
+// Hashes one shard: its id (as a UUID), then its value, then its logical clock.
+template<> struct appending_hash<counter_shard_view> {
+    template<typename Hasher>
+    void operator()(Hasher& hasher, const counter_shard_view& shard) const {
+        ::feed_hash(hasher, shard.id().to_uuid());
+        ::feed_hash(hasher, shard.value());
+        ::feed_hash(hasher, shard.logical_clock());
+    }
+};
+
+// Hashes a live counter cell: a constant liveness flag, the timestamp, then every
+// shard in iteration order.
+template<> struct appending_hash<counter_cell_view> {
+    template<typename Hasher>
+    void operator()(Hasher& hasher, const counter_cell_view& cell) const {
+        ::feed_hash(hasher, true); // is_live
+        ::feed_hash(hasher, cell.timestamp());
+        auto shard_range = cell.shards();
+        for (auto it = shard_range.begin(); it != shard_range.end(); ++it) { ::feed_hash(hasher, *it); }
+    }
+};
--- a/cpu_controller.hh
+++ b/cpu_controller.hh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <seastar/core/thread.hh>
+#include <seastar/core/timer.hh>
+#include <chrono>
+
+// Simple proportional controller to adjust shares of memtable/streaming flushes.
+//
+// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
+// requests, and at the same time minimize user-visible fluctuations in the flush quota.
+//
+// What that translates to is we'll try to keep virtual dirty's first derivative at 0 (IOW, we keep
+// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
+// flushed bytes.
+//
+// The exact point at which the controller stops determines the desired flush CPU usage. As we
+// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
+// thresholds, and increase the constant as we cross them.
+//
+//  1) the soft limit line
+//  2) halfway between soft limit and dirty limit
+//
+// The constants q1 and q2 are used to determine the proportional factor at each stage.
+//
+// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
+// complete flushing before a new memtable is ready. The quota is dirty * q1, and q1 is set to a
+// low number.
+//
+// The first half of the virtual dirty region is where we expect to be usually, so we have a low
+// slope corresponding to a sluggish response between q1 * soft_limit and q2.
+//
+// In the second half, we're getting close to the hard dirty limit so we increase the slope and
+// become more responsive, up to a maximum quota of qmax.
+//
+// For now we'll just set them in the structure not to complicate the constructor. But q1, q2 and
+// qmax can easily become parameters if we find another user.
+class flush_cpu_controller { // proportional controller for flush CPU shares; the policy is described in the comment above
+    static constexpr float hard_dirty_limit = 0.50;
+    static constexpr float q1 = 0.01;
+    static constexpr float q2 = 0.2;
+    static constexpr float qmax = 1;
+
+    float _current_quota = 0.0f;
+    float _goal = 0.0f; // zero by default: the `disabled` constructor never sets it, yet the defaulted move constructor copies it
+    std::function<float()> _current_dirty; // callback passed to the enabled constructor; empty in disabled mode
+    std::chrono::milliseconds _interval{}; // zero by default for the same reason as _goal
+    timer<> _update_timer;
+
+    seastar::thread_scheduling_group _scheduling_group; // group owned by this controller
+    seastar::thread_scheduling_group *_current_scheduling_group = nullptr; // what scheduling_group() hands out; d.backup in disabled mode
+
+    void adjust(); // declared only; defined outside this header
+public:
+    seastar::thread_scheduling_group* scheduling_group() { // may be null unless a constructor set it
+        return _current_scheduling_group;
+    }
+    float current_quota() const {
+        return _current_quota;
+    }
+
+    struct disabled { // selects the disabled mode; 'backup' becomes the group returned by scheduling_group()
+        seastar::thread_scheduling_group *backup;
+    };
+    flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
+    flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
+    flush_cpu_controller(flush_cpu_controller&&) = default; // NOTE(review): memberwise move copies _current_scheduling_group as is; if the enabled constructor points it at _scheduling_group it would refer to the moved-from object -- confirm
+};
+
+
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -36,15 +36,19 @@ options {
 #include "cql3/statements/raw/select_statement.hh"
 #include "cql3/statements/alter_keyspace_statement.hh"
 #include "cql3/statements/alter_table_statement.hh"
+#include "cql3/statements/alter_view_statement.hh"
 #include "cql3/statements/create_keyspace_statement.hh"
 #include "cql3/statements/drop_keyspace_statement.hh"
 #include "cql3/statements/create_index_statement.hh"
 #include "cql3/statements/create_table_statement.hh"
+#include "cql3/statements/create_view_statement.hh"
 #include "cql3/statements/create_type_statement.hh"
 #include "cql3/statements/drop_type_statement.hh"
 #include "cql3/statements/alter_type_statement.hh"
 #include "cql3/statements/property_definitions.hh"
+#include "cql3/statements/drop_index_statement.hh"
 #include "cql3/statements/drop_table_statement.hh"
+#include "cql3/statements/drop_view_statement.hh"
 #include "cql3/statements/truncate_statement.hh"
 #include "cql3/statements/raw/update_statement.hh"
 #include "cql3/statements/raw/insert_statement.hh"
@@ -315,9 +319,7 @@ cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
    | st10=createIndexStatement        { $stmt = st10; }
    | st11=dropKeyspaceStatement       { $stmt = st11; }
    | st12=dropTableStatement          { $stmt = st12; }
-#if 0
    | st13=dropIndexStatement          { $stmt = st13; }
-#endif
    | st14=alterTableStatement         { $stmt = st14; }
    | st15=alterKeyspaceStatement      { $stmt = st15; }
    | st16=grantStatement              { $stmt = st16; }
@@ -340,6 +342,9 @@ cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
    | st30=createAggregateStatement    { $stmt = st30; }
    | st31=dropAggregateStatement      { $stmt = st31; }
 #endif
+    | st32=createViewStatement         { $stmt = st32; }
+    | st33=alterViewStatement          { $stmt = st33; }
+    | st34=dropViewStatement           { $stmt = st34; }
    ;

 /*
@@ -394,6 +399,7 @@ unaliasedSelector returns [shared_ptr<selectable::raw> s]
       | K_WRITETIME '(' c=cident ')'              { tmp = make_shared<selectable::writetime_or_ttl::raw>(c, true); }
       | K_TTL       '(' c=cident ')'              { tmp = make_shared<selectable::writetime_or_ttl::raw>(c, false); }
       | f=functionName args=selectionFunctionArgs { tmp = ::make_shared<selectable::with_function::raw>(std::move(f), std::move(args)); }
+       | K_CAST      '(' arg=unaliasedSelector K_AS t=native_type ')'  { tmp = ::make_shared<selectable::with_cast::raw>(std::move(arg), std::move(t)); }
       )
       ( '.' fi=cident { tmp = make_shared<selectable::with_field_selection::raw>(std::move(tmp), std::move(fi)); } )*
    { $s = tmp; }
@@ -716,7 +722,7 @@ createTableStatement returns [shared_ptr<cql3::statements::create_table_statemen

 cfamDefinition[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
    : '(' cfamColumns[expr] ( ',' cfamColumns[expr]? )* ')'
-      ( K_WITH cfamProperty[expr] ( K_AND cfamProperty[expr] )*)?
+      ( K_WITH cfamProperty[$expr->properties()] ( K_AND cfamProperty[$expr->properties()] )*)?
    ;

 cfamColumns[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
@@ -732,15 +738,15 @@ pkDef[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
    | '(' k1=ident { l.push_back(k1); } ( ',' kn=ident { l.push_back(kn); } )* ')' { $expr->add_key_aliases(l); }
    ;

-cfamProperty[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
-    : property[expr->properties]
-    | K_COMPACT K_STORAGE { $expr->set_compact_storage(); }
+cfamProperty[cql3::statements::cf_properties& expr]
+    : property[$expr.properties()]
+    | K_COMPACT K_STORAGE { $expr.set_compact_storage(); }
    | K_CLUSTERING K_ORDER K_BY '(' cfamOrdering[expr] (',' cfamOrdering[expr])* ')'
    ;

-cfamOrdering[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
+cfamOrdering[cql3::statements::cf_properties& expr]
    @init{ bool reversed=false; }
-    : k=ident (K_ASC | K_DESC { reversed=true;} ) { $expr->set_ordering(k, reversed); }
+    : k=ident (K_ASC | K_DESC { reversed=true;} ) { $expr.set_ordering(k, reversed); }
    ;


@@ -772,12 +778,13 @@ createIndexStatement returns [::shared_ptr<create_index_statement> expr]
        auto props = make_shared<index_prop_defs>();
        bool if_not_exists = false;
        auto name = ::make_shared<cql3::index_name>();
+        std::vector<::shared_ptr<index_target::raw>> targets;
    }
    : K_CREATE (K_CUSTOM { props->is_custom = true; })? K_INDEX (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
-        (idxName[name])? K_ON cf=columnFamilyName '(' id=indexIdent ')'
+        (idxName[name])? K_ON cf=columnFamilyName '(' (target1=indexIdent { targets.emplace_back(target1); } (',' target2=indexIdent { targets.emplace_back(target2); } )*)? ')'
        (K_USING cls=STRING_LITERAL { props->custom_class = sstring{$cls.text}; })?
        (K_WITH properties[props])?
-      { $expr = ::make_shared<create_index_statement>(cf, name, id, props, if_not_exists); }
+      { $expr = ::make_shared<create_index_statement>(cf, name, targets, props, if_not_exists); }
    ;

 indexIdent returns [::shared_ptr<index_target::raw> id]
@@ -787,6 +794,39 @@ indexIdent returns [::shared_ptr<index_target::raw> id]
    | K_FULL '(' c=cident ')'    { $id = index_target::raw::full_collection(c); }
    ;

+/**
+ * CREATE MATERIALIZED VIEW [IF NOT EXISTS] <viewName> AS
+ *  SELECT <columns>
+ *  FROM <CF>
+ *  [WHERE <pkColumns> IS NOT NULL]
+ *  PRIMARY KEY (<pkColumns>)
+ *  [WITH <property> = <value> AND ...];
+ */
+createViewStatement returns [::shared_ptr<create_view_statement> expr]
+    @init {
+        bool if_not_exists = false;
+        std::vector<::shared_ptr<cql3::column_identifier::raw>> partition_keys;
+        std::vector<::shared_ptr<cql3::column_identifier::raw>> composite_keys;
+    }
+    : K_CREATE K_MATERIALIZED K_VIEW (K_IF K_NOT K_EXISTS { if_not_exists = true; })? cf=columnFamilyName K_AS
+        K_SELECT sclause=selectClause K_FROM basecf=columnFamilyName
+        (K_WHERE wclause=whereClause)?
+        K_PRIMARY K_KEY (
+        '(' '(' k1=cident { partition_keys.push_back(k1); } ( ',' kn=cident { partition_keys.push_back(kn); } )* ')' ( ',' c1=cident { composite_keys.push_back(c1); } )* ')'
+    |   '(' k1=cident { partition_keys.push_back(k1); } ( ',' cn=cident { composite_keys.push_back(cn); } )* ')'
+        )
+        {
+             $expr = ::make_shared<create_view_statement>(
+                std::move(cf),
+                std::move(basecf),
+                std::move(sclause),
+                std::move(wclause),
+                std::move(partition_keys),
+                std::move(composite_keys),
+                if_not_exists);
+        }
+        ( K_WITH cfamProperty[{ $expr->properties() }] ( K_AND cfamProperty[{ $expr->properties() }] )*)?
+    ;

 #if 0
 /**
@@ -833,7 +873,7 @@ alterKeyspaceStatement returns [shared_ptr<cql3::statements::alter_keyspace_stat
 alterTableStatement returns [shared_ptr<alter_table_statement> expr]
    @init {
        alter_table_statement::type type;
-        auto props = make_shared<cql3::statements::cf_prop_defs>();;
+        auto props = make_shared<cql3::statements::cf_prop_defs>();
        std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>, shared_ptr<cql3::column_identifier::raw>>> renames;
        bool is_static = false;
    }
@@ -867,6 +907,18 @@ alterTypeStatement returns [::shared_ptr<alter_type_statement> expr]
          )
    ;

+/**
+ * ALTER MATERIALIZED VIEW <CF> WITH <property> = <value>;  (the WITH clause is mandatory in this rule)
+ */
+alterViewStatement returns [::shared_ptr<alter_view_statement> expr]
+    @init {
+        auto props = make_shared<cql3::statements::cf_prop_defs>(); // passed to properties[props] below, then moved into the statement
+    }
+    : K_ALTER K_MATERIALIZED K_VIEW cf=columnFamilyName K_WITH properties[props]
+    {
+        $expr = ::make_shared<alter_view_statement>(std::move(cf), std::move(props));
+    }
+    ;

 renames[::shared_ptr<alter_type_statement::renames> expr]
    : fromId=ident K_TO toId=ident { $expr->add_rename(fromId, toId); }
@@ -897,16 +949,23 @@ dropTypeStatement returns [::shared_ptr<drop_type_statement> stmt]
    : K_DROP K_TYPE (K_IF K_EXISTS { if_exists = true; } )? name=userTypeName { $stmt = ::make_shared<drop_type_statement>(name, if_exists); }
    ;

-#if 0
+/**
+ * DROP MATERIALIZED VIEW [IF EXISTS] <view_name>  (<view_name> is parsed by columnFamilyName)
+ */
+dropViewStatement returns [::shared_ptr<drop_view_statement> stmt]
+    @init { bool if_exists = false; } // set to true only when the optional IF EXISTS is present
+    : K_DROP K_MATERIALIZED K_VIEW (K_IF K_EXISTS { if_exists = true; } )? cf=columnFamilyName
+      { $stmt = ::make_shared<drop_view_statement>(cf, if_exists); }
+    ;
+
 /**
 * DROP INDEX [IF EXISTS] <INDEX_NAME>
 */
-dropIndexStatement returns [DropIndexStatement expr]
-    @init { boolean ifExists = false; }
-    : K_DROP K_INDEX (K_IF K_EXISTS { ifExists = true; } )? index=indexName
-      { $expr = new DropIndexStatement(index, ifExists); }
+dropIndexStatement returns [::shared_ptr<drop_index_statement> expr]
+    @init { bool if_exists = false; }
+    : K_DROP K_INDEX (K_IF K_EXISTS { if_exists = true; } )? index=indexName
+      { $expr = ::make_shared<drop_index_statement>(index, if_exists); }
    ;
-#endif

 /**
  * TRUNCATE <CF>;
@@ -1109,6 +1168,7 @@ constant returns [shared_ptr<cql3::constants::literal> constant]
    | t=INTEGER        { $constant = cql3::constants::literal::integer(sstring{$t.text}); }
    | t=FLOAT          { $constant = cql3::constants::literal::floating_point(sstring{$t.text}); }
    | t=BOOLEAN        { $constant = cql3::constants::literal::bool_(sstring{$t.text}); }
+    | t=DURATION       { $constant = cql3::constants::literal::duration(sstring{$t.text}); }
    | t=UUID           { $constant = cql3::constants::literal::uuid(sstring{$t.text}); }
    | t=HEXNUMBER      { $constant = cql3::constants::literal::hex(sstring{$t.text}); }
    | { sign=""; } ('-' {sign = "-"; } )? t=(K_NAN | K_INFINITY) { $constant = cql3::constants::literal::floating_point(sstring{sign + $t.text}); }
@@ -1243,6 +1303,10 @@ normalColumnOperation[operations_type& operations, ::shared_ptr<cql3::column_ide
          }
          add_raw_update(operations, key, make_shared<cql3::operation::addition>(cql3::constants::literal::integer($i.text)));
      }
+    | K_SCYLLA_COUNTER_SHARD_LIST '(' t=term ')'
+      {
+          add_raw_update(operations, key, ::make_shared<cql3::operation::set_counter_value_from_tuple_list>(t));      
+      }
    ;

 specializedColumnOperation[std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>,
@@ -1304,7 +1368,8 @@ relation[std::vector<cql3::relation_ptr>& clauses]

    | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
        { $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), *type, std::move(t))); }
-
+    | name=cident K_IS K_NOT K_NULL {
+          $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IS_NOT, cql3::constants::NULL_LITERAL)); }
    | name=cident K_IN marker=inMarker
        { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IN, std::move(marker))); }
    | name=cident K_IN in_values=singleColumnInValues
@@ -1401,15 +1466,20 @@ native_type returns [shared_ptr<cql3_type> t]
    | K_COUNTER   { $t = cql3_type::counter; }
    | K_DECIMAL   { $t = cql3_type::decimal; }
    | K_DOUBLE    { $t = cql3_type::double_; }
+    | K_DURATION  { $t = cql3_type::duration; }
    | K_FLOAT     { $t = cql3_type::float_; }
    | K_INET      { $t = cql3_type::inet; }
    | K_INT       { $t = cql3_type::int_; }
+    | K_SMALLINT  { $t = cql3_type::smallint; }
    | K_TEXT      { $t = cql3_type::text; }
    | K_TIMESTAMP { $t = cql3_type::timestamp; }
+    | K_TINYINT   { $t = cql3_type::tinyint; }
    | K_UUID      { $t = cql3_type::uuid; }
    | K_VARCHAR   { $t = cql3_type::varchar; }
    | K_VARINT    { $t = cql3_type::varint; }
    | K_TIMEUUID  { $t = cql3_type::timeuuid; }
+    | K_DATE      { $t = cql3_type::date; }
+    | K_TIME      { $t = cql3_type::time; }
    ;

 collection_type returns [shared_ptr<cql3::cql3_type::raw> pt]
@@ -1483,6 +1553,8 @@ basic_unreserved_keyword returns [sstring str]
        | K_DISTINCT
        | K_CONTAINS
        | K_STATIC
+        | K_FROZEN
+        | K_TUPLE
        | K_FUNCTION
        | K_AGGREGATE
        | K_SFUNC
@@ -1500,6 +1572,7 @@ basic_unreserved_keyword returns [sstring str]
 K_SELECT:      S E L E C T;
 K_FROM:        F R O M;
 K_AS:          A S;
+K_CAST:        C A S T;
 K_WHERE:       W H E R E;
 K_AND:         A N D;
 K_KEY:         K E Y;
@@ -1528,6 +1601,8 @@ K_KEYSPACE:    ( K E Y S P A C E
 K_KEYSPACES:   K E Y S P A C E S;
 K_COLUMNFAMILY:( C O L U M N F A M I L Y
                 | T A B L E );
+K_MATERIALIZED:M A T E R I A L I Z E D;
+K_VIEW:        V I E W;
 K_INDEX:       I N D E X;
 K_CUSTOM:      C U S T O M;
 K_ON:          O N;
@@ -1551,6 +1626,7 @@ K_DESC:        D E S C;
 K_ALLOW:       A L L O W;
 K_FILTERING:   F I L T E R I N G;
 K_IF:          I F;
+K_IS:          I S;
 K_CONTAINS:    C O N T A I N S;

 K_GRANT:       G R A N T;
@@ -1577,9 +1653,12 @@ K_BOOLEAN:     B O O L E A N;
 K_COUNTER:     C O U N T E R;
 K_DECIMAL:     D E C I M A L;
 K_DOUBLE:      D O U B L E;
+K_DURATION:    D U R A T I O N;
 K_FLOAT:       F L O A T;
 K_INET:        I N E T;
 K_INT:         I N T;
+K_SMALLINT:    S M A L L I N T;
+K_TINYINT:     T I N Y I N T;
 K_TEXT:        T E X T;
 K_UUID:        U U I D;
 K_VARCHAR:     V A R C H A R;
@@ -1587,6 +1666,8 @@ K_VARINT:      V A R I N T;
 K_TIMEUUID:    T I M E U U I D;
 K_TOKEN:       T O K E N;
 K_WRITETIME:   W R I T E T I M E;
+K_DATE:        D A T E;
+K_TIME:        T I M E;

 K_NULL:        N U L L;
 K_NOT:         N O T;
@@ -1616,6 +1697,7 @@ K_REPLACE:     R E P L A C E;
 K_DETERMINISTIC: D E T E R M I N I S T I C;

 K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X;
+K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T; 

 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
@@ -1701,6 +1783,20 @@ fragment EXPONENT
    : E ('+' | '-')? DIGIT+
    ;

+fragment DURATION_UNIT
+    : Y
+    | M O
+    | W
+    | D
+    | H
+    | M
+    | S
+    | M S
+    | U S
+    | '\u00B5' S
+    | N S
+    ;
+
 INTEGER
    : '-'? DIGIT+
    ;
@@ -1725,6 +1821,13 @@ BOOLEAN
    : T R U E | F A L S E
    ;

+DURATION
+    : '-'? DIGIT+ DURATION_UNIT (DIGIT+ DURATION_UNIT)*
+    | '-'? 'P' (DIGIT+ 'Y')? (DIGIT+ 'M')? (DIGIT+ 'D')? ('T' (DIGIT+ 'H')? (DIGIT+ 'M')? (DIGIT+ 'S')?)? // ISO 8601 "format with designators"
+    | '-'? 'P' DIGIT+ 'W'
+    | '-'? 'P' DIGIT DIGIT DIGIT DIGIT '-' DIGIT DIGIT '-' DIGIT DIGIT 'T' DIGIT DIGIT ':' DIGIT DIGIT ':' DIGIT DIGIT // ISO 8601 "alternative format"
+    ;
+
 IDENT
    : LETTER (LETTER | DIGIT | '_')*
    ;
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -79,6 +79,7 @@ abstract_marker::raw::raw(int32_t bind_index)
        return ::make_shared<maps::marker>(_bind_index, receiver);
    }
    assert(0);
+    return shared_ptr<term>();
 }

 assignment_testable::test_result abstract_marker::raw::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) {
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -71,13 +71,15 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    }

    auto tval = _timestamp->bind_and_get(options);
-    if (!tval) {
+    if (tval.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value of timestamp");
    }
-
+    if (tval.is_unset_value()) {
+        return now;
+    }
    try {
        data_type_for<int64_t>()->validate(*tval);
-    } catch (marshal_exception e) {
+    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
@@ -88,14 +90,16 @@ int32_t attributes::get_time_to_live(const query_options& options) {
        return 0;

    auto tval = _time_to_live->bind_and_get(options);
-    if (!tval) {
+    if (tval.is_null()) {
        throw exceptions::invalid_request_exception("Invalid null value of TTL");
    }
-
+    if (tval.is_unset_value()) {
+        return 0;
+    }
    try {
        data_type_for<int32_t>()->validate(*tval);
    }
-    catch (marshal_exception e) {
+    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }

--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -40,11 +40,29 @@
 */

 #include "cql3/column_condition.hh"
+#include "statements/request_validations.hh"
 #include "unimplemented.hh"
 #include "lists.hh"
 #include "maps.hh"
 #include <boost/range/algorithm_ext/push_back.hpp>

+namespace {
+
+// Rejects slice operators on types that reference a duration. check_false (defined elsewhere) is presumably what
+// raises the wrapper-specific messages when its condition is true -- TODO confirm; a bare duration throws directly.
+void validate_operation_on_durations(const abstract_type& type, const cql3::operator_type& op) {
+    using cql3::statements::request_validations::check_false;
+    if (op.is_slice() && type.references_duration()) {
+        check_false(type.is_collection(), "Slice conditions are not supported on collections containing durations");
+        check_false(type.is_tuple(), "Slice conditions are not supported on tuples containing durations");
+        check_false(type.is_user_type(), "Slice conditions are not supported on UDTs containing durations");
+        // We're a duration. The message has no format arguments, so it is passed as-is rather than through sprint().
+        throw exceptions::invalid_request_exception("Slice conditions are not supported on durations");
+    }
+}
+
+}
+
 namespace cql3 {

 bool
@@ -95,6 +113,7 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
            }
            return column_condition::in_condition(receiver, std::move(terms));
        } else {
+            validate_operation_on_durations(*receiver.type, _op);
            return column_condition::condition(receiver, _value->prepare(db, keyspace, receiver.column_specification), _op);
        }
    }
@@ -129,6 +148,8 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
                                | boost::adaptors::transformed(std::bind(&term::raw::prepare, std::placeholders::_1, std::ref(db), std::ref(keyspace), value_spec)));
        return column_condition::in_condition(receiver, _collection_element->prepare(db, keyspace, element_spec), terms);
    } else {
+        validate_operation_on_durations(*receiver.type, _op);
+
        return column_condition::condition(receiver,
                _collection_element->prepare(db, keyspace, element_spec),
                _value->prepare(db, keyspace, value_spec),
--- a/cql3/column_identifier.cc
+++ b/cql3/column_identifier.cc
@@ -23,6 +23,8 @@
 #include "exceptions/exceptions.hh"
 #include "cql3/selection/simple_selector.hh"

+#include <regex>
+
 namespace cql3 {

 column_identifier::column_identifier(sstring raw_text, bool keep_case) {
@@ -59,6 +61,17 @@ sstring column_identifier::to_string() const {
    return _text;
 }

+// Returns the identifier as CQL text: unchanged when it is a plain lowercase identifier, otherwise double-quoted with embedded quotes doubled.
+sstring column_identifier::to_cql_string() const {
+    static const std::regex unquoted_identifier_re("[a-z][a-z0-9_]*");
+    if (std::regex_match(_text.begin(), _text.end(), unquoted_identifier_re)) {
+        return _text;
+    }
+    static const std::regex double_quote_re("\"");
+    std::string raw = _text;
+    return '"' + std::regex_replace(raw, double_quote_re, "\"\"") + '"'; // regex_replace returns the escaped copy; it never edits its input
+}
+
 column_identifier::raw::raw(sstring raw_text, bool keep_case)
    : _raw_text{raw_text}
    , _text{raw_text}
--- a/cql3/column_identifier.hh
+++ b/cql3/column_identifier.hh
@@ -47,7 +47,7 @@

 #include <algorithm>
 #include <functional>
-#include <iostream>
+#include <iosfwd>

 namespace cql3 {

@@ -80,6 +80,8 @@ public:

    sstring to_string() const;

+    sstring to_cql_string() const;
+
    friend std::ostream& operator<<(std::ostream& out, const column_identifier& i) {
        return out << i._text;
    }
--- a/cql3/constants.cc
+++ b/cql3/constants.cc
@@ -44,6 +44,7 @@

 namespace cql3 {

+thread_local const ::shared_ptr<constants::value> constants::UNSET_VALUE = ::make_shared<constants::value>(cql3::raw_value::make_unset_value());
 thread_local const ::shared_ptr<term::raw> constants::NULL_LITERAL = ::make_shared<constants::null_literal>();
 thread_local const ::shared_ptr<terminal> constants::null_literal::NULL_VALUE = ::make_shared<constants::null_literal::null_value>();

@@ -51,14 +52,15 @@ std::ostream&
 operator<<(std::ostream&out, constants::type t)
 {
    switch (t) {
-        case constants::type::STRING:  return out << "STRING";
-        case constants::type::INTEGER: return out << "INTEGER";
-        case constants::type::UUID:    return out << "UUID";
-        case constants::type::FLOAT:   return out << "FLOAT";
-        case constants::type::BOOLEAN: return out << "BOOLEAN";
-        case constants::type::HEX:     return out << "HEX";
-    };
-    assert(0);
+        case constants::type::STRING:   return out << "STRING";
+        case constants::type::INTEGER:  return out << "INTEGER";
+        case constants::type::UUID:     return out << "UUID";
+        case constants::type::FLOAT:    return out << "FLOAT";
+        case constants::type::BOOLEAN:  return out << "BOOLEAN";
+        case constants::type::HEX:      return out << "HEX";
+        case constants::type::DURATION: return out << "DURATION";
+    }
+    abort();
 }

 bytes
@@ -97,7 +99,9 @@ constants::literal::test_assignment(database& db, const sstring& keyspace, ::sha
                    cql3_type::kind::TEXT,
                    cql3_type::kind::INET,
                    cql3_type::kind::VARCHAR,
-                    cql3_type::kind::TIMESTAMP>::contains(kind)) {
+                    cql3_type::kind::TIMESTAMP,
+                    cql3_type::kind::DATE,
+                    cql3_type::kind::TIME>::contains(kind)) {
                return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
            }
            break;
@@ -109,7 +113,10 @@ constants::literal::test_assignment(database& db, const sstring& keyspace, ::sha
                    cql3_type::kind::DOUBLE,
                    cql3_type::kind::FLOAT,
                    cql3_type::kind::INT,
+                    cql3_type::kind::SMALLINT,
                    cql3_type::kind::TIMESTAMP,
+                    cql3_type::kind::DATE,
+                    cql3_type::kind::TINYINT,
                    cql3_type::kind::VARINT>::contains(kind)) {
                return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
            }
@@ -139,6 +146,11 @@ constants::literal::test_assignment(database& db, const sstring& keyspace, ::sha
                return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
            }
            break;
+        case type::DURATION:
+            if (kind == cql3_type::kind_enum_set::prepare<cql3_type::kind::DURATION>()) {
+                return assignment_testable::test_result::EXACT_MATCH;
+            }
+            break;
    }
    return assignment_testable::test_result::NOT_ASSIGNABLE;
 }
@@ -150,10 +162,10 @@ constants::literal::prepare(database& db, const sstring& keyspace, ::shared_ptr<
        throw exceptions::invalid_request_exception(sprint("Invalid %s constant (%s) for \"%s\" of type %s",
            _type, _text, *receiver->name, receiver->type->as_cql3_type()->to_string()));
    }
-    return ::make_shared<value>(std::experimental::make_optional(parsed_value(receiver->type)));
+    return ::make_shared<value>(cql3::raw_value::make_value(parsed_value(receiver->type)));
 }

-void constants::deleter::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
+void constants::deleter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
    if (column.type->is_multi_cell()) {
        collection_type_impl::mutation coll_m;
        coll_m.tomb = params.make_tombstone();
--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -44,6 +44,7 @@
 #include "cql3/abstract_marker.hh"
 #include "cql3/update_parameters.hh"
 #include "cql3/operation.hh"
+#include "cql3/values.hh"
 #include "cql3/term.hh"
 #include "core/shared_ptr.hh"

@@ -59,7 +60,7 @@ public:
 #endif
 public:
    enum class type {
-        STRING, INTEGER, UUID, FLOAT, BOOLEAN, HEX
+        STRING, INTEGER, UUID, FLOAT, BOOLEAN, HEX, DURATION
    };

    /**
@@ -67,18 +68,20 @@ public:
    */
    class value : public terminal {
    public:
-        bytes_opt _bytes;
-        value(bytes_opt bytes_) : _bytes(std::move(bytes_)) {}
-        virtual bytes_opt get(const query_options& options) override { return _bytes; }
-        virtual bytes_view_opt bind_and_get(const query_options& options) override { return as_bytes_view_opt(_bytes); }
+        cql3::raw_value _bytes;
+        value(cql3::raw_value bytes_) : _bytes(std::move(bytes_)) {}
+        virtual cql3::raw_value get(const query_options& options) override { return _bytes; }
+        virtual cql3::raw_value_view bind_and_get(const query_options& options) override { return _bytes.to_view(); }
        virtual sstring to_string() const override { return to_hex(*_bytes); }
    };

+    static thread_local const ::shared_ptr<value> UNSET_VALUE;
+
    class null_literal final : public term::raw {
    private:
        class null_value final : public value {
        public:
-            null_value() : value({}) {}
+            null_value() : value(cql3::raw_value::make_null()) {}
            virtual ::shared_ptr<terminal> bind(const query_options& options) override { return {}; }
            virtual sstring to_string() const override { return "null"; }
        };
@@ -146,6 +149,10 @@ public:
            return ::make_shared<literal>(type::HEX, text);
        }

+        static ::shared_ptr<literal> duration(sstring text) {
+            return ::make_shared<literal>(type::DURATION, text);
+        }
+
        virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver);
    private:
        bytes parsed_value(data_type validator);
@@ -169,14 +176,13 @@ public:
            assert(!_receiver->type->is_collection());
        }

-        virtual bytes_view_opt bind_and_get(const query_options& options) override {
+        virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
            try {
                auto value = options.get_value_at(_bind_index);
                if (value) {
                    _receiver->type->validate(*value);
-                    return *value;
                }
-                return std::experimental::nullopt;
+                return value;
            } catch (const marshal_exception& e) {
                throw exceptions::invalid_request_exception(e.what());
            }
@@ -187,7 +193,7 @@ public:
            if (!bytes) {
                return ::shared_ptr<terminal>{};
            }
-            return ::make_shared<constants::value>(std::move(to_bytes_opt(*bytes)));
+            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };

@@ -195,54 +201,48 @@ public:
    public:
        using operation::operation;

-        virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override {
+        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
            auto value = _t->bind_and_get(params._options);
-            auto cell = value ? make_cell(*value, params) : make_dead_cell(params);
-            m.set_cell(prefix, column, std::move(cell));
+            if (value.is_null()) {
+                m.set_cell(prefix, column, std::move(make_dead_cell(params)));
+            } else if (value.is_value()) {
+                m.set_cell(prefix, column, std::move(make_cell(*value, params)));
+            }
        }
    };

-#if 0
-    public static class Adder extends Operation
-    {
-        public Adder(ColumnDefinition column, Term t)
-        {
-            super(column, t);
+    struct adder final : operation {
+        using operation::operation;
+
+        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
+            auto value = _t->bind_and_get(params._options);
+            if (value.is_null()) {
+                throw exceptions::invalid_request_exception("Invalid null value for counter increment");
+            } else if (value.is_unset_value()) {
+                return;
+            }
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            m.set_cell(prefix, column, make_counter_update_cell(increment, params));
        }
+    };

-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
-        {
-            ByteBuffer bytes = t.bindAndGet(params.options);
-            if (bytes == null)
-                throw new InvalidRequestException("Invalid null value for counter increment");
-            long increment = ByteBufferUtil.toLong(bytes);
-            CellName cname = cf.getComparator().create(prefix, column);
-            cf.addColumn(params.makeCounter(cname, increment));
+    struct subtracter final : operation {
+        using operation::operation;
+
+        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
+            auto value = _t->bind_and_get(params._options);
+            if (value.is_null()) {
+                throw exceptions::invalid_request_exception("Invalid null value for counter increment");
+            } else if (value.is_unset_value()) {
+                return;
+            }
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
+            if (increment == std::numeric_limits<int64_t>::min()) {
+                throw exceptions::invalid_request_exception(sprint("The negation of %d overflows supported counter precision (signed 8 bytes integer)", increment));
+            }
+            m.set_cell(prefix, column, make_counter_update_cell(-increment, params));
        }
-    }
-
-    public static class Substracter extends Operation
-    {
-        public Substracter(ColumnDefinition column, Term t)
-        {
-            super(column, t);
-        }
-
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
-        {
-            ByteBuffer bytes = t.bindAndGet(params.options);
-            if (bytes == null)
-                throw new InvalidRequestException("Invalid null value for counter increment");
-
-            long increment = ByteBufferUtil.toLong(bytes);
-            if (increment == Long.MIN_VALUE)
-                throw new InvalidRequestException("The negation of " + increment + " overflows supported counter precision (signed 8 bytes integer)");
-
-            CellName cname = cf.getComparator().create(prefix, column);
-            cf.addColumn(params.makeCounter(cname, -increment));
-        }
-    }
-#endif
+    };

    class deleter : public operation {
    public:
@@ -250,7 +250,7 @@ public:
            : operation(column, {})
        { }

-        virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
+        virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
    };
 };

--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -19,11 +19,43 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <iostream>
+#include <iterator>
+#include <regex>
+
 #include "cql3_type.hh"
+#include "cql3/util.hh"
 #include "ut_name.hh"

 namespace cql3 {

+// CQL rendering of this type's name: user types (name quoted when needed) and tuples are wrapped in frozen<...>.
+sstring cql3_type::to_string() const {
+    if (_type->is_user_type()) {
+        return "frozen<" + util::maybe_quote(_name) + ">";
+    }
+    // Tested only after the user-type case, keeping the original order of the checks.
+    const bool is_plain_tuple = _type->is_tuple();
+    return is_plain_tuple ? "frozen<" + _name + ">" : _name;
+}
+
+// Resolves this raw type against the user types of `keyspace`; no_such_keyspace is rethrown as invalid_request_exception.
+shared_ptr<cql3_type> cql3_type::raw::prepare(database& db, const sstring& keyspace) {
+    try {
+        return prepare_internal(keyspace, db.find_keyspace(keyspace).metadata()->user_types());
+    } catch (no_such_keyspace&) {
+        throw exceptions::invalid_request_exception("Unknown keyspace " + keyspace);
+    }
+}
+// Default for raw types: not a duration unless a subclass overrides this.
+bool cql3_type::raw::is_duration() const {
+    return false;
+}
+// Default for raw types: references no user type unless a subclass overrides this.
+bool cql3_type::raw::references_user_type(const sstring& name) const {
+    return false;
+}
+
 class cql3_type::raw_type : public raw {
 private:
    shared_ptr<cql3_type> _type;
@@ -35,6 +67,9 @@ public:
    virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) {
        return _type;
    }
+    shared_ptr<cql3_type> prepare_internal(const sstring&, lw_shared_ptr<user_types_metadata>) override {
+        return _type;
+    }

    virtual bool supports_freezing() const {
        return false;
@@ -47,6 +82,10 @@ public:
    virtual sstring to_string() const {
        return _type->to_string();
    }
+
+    virtual bool is_duration() const override {
+        return _type->get_type()->equals(duration_type);
+    }
 };

 class cql3_type::raw_collection : public raw {
@@ -76,7 +115,7 @@ public:
        return true;
    }

-    virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
+    virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
        assert(_values); // "Got null values type for a collection";

        if (!_frozen && _values->supports_freezing() && !_values->_frozen) {
@@ -93,16 +132,30 @@ public:
        }

        if (_kind == &collection_type_impl::kind::list) {
-            return make_shared(cql3_type(to_string(), list_type_impl::get_instance(_values->prepare(db, keyspace)->get_type(), !_frozen), false));
+            return make_shared(cql3_type(to_string(), list_type_impl::get_instance(_values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
        } else if (_kind == &collection_type_impl::kind::set) {
-            return make_shared(cql3_type(to_string(), set_type_impl::get_instance(_values->prepare(db, keyspace)->get_type(), !_frozen), false));
+            if (_values->is_duration()) {
+                throw exceptions::invalid_request_exception(sprint("Durations are not allowed inside sets: %s", *this));
+            }
+            return make_shared(cql3_type(to_string(), set_type_impl::get_instance(_values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
        } else if (_kind == &collection_type_impl::kind::map) {
            assert(_keys); // "Got null keys type for a collection";
-            return make_shared(cql3_type(to_string(), map_type_impl::get_instance(_keys->prepare(db, keyspace)->get_type(), _values->prepare(db, keyspace)->get_type(), !_frozen), false));
+            if (_keys->is_duration()) {
+                throw exceptions::invalid_request_exception(sprint("Durations are not allowed as map keys: %s", *this));
+            }
+            return make_shared(cql3_type(to_string(), map_type_impl::get_instance(_keys->prepare_internal(keyspace, user_types)->get_type(), _values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
        }
        abort();
    }

+    bool references_user_type(const sstring& name) const override {
+        return (_keys && _keys->references_user_type(name)) || _values->references_user_type(name);
+    }
+
+    bool is_duration() const override {
+        return false;
+    }
+
    virtual sstring to_string() const override {
        sstring start = _frozen ? "frozen<" : "";
        sstring end = _frozen ? ">" : "";
@@ -132,7 +185,7 @@ public:
        _frozen = true;
    }

-    virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
+    virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
        if (_name.has_keyspace()) {
            // The provided keyspace is the one of the current statement this is part of. If it's different from the keyspace of
            // the UTName, we reject since we want to limit user types to their own keyspace (see #6643)
@@ -144,23 +197,23 @@ public:
        } else {
            _name.set_keyspace(keyspace);
        }
-
+        if (!user_types) {
+            // bootstrap mode.
+            throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
+        }
        try {
-            auto&& ks = db.find_keyspace(_name.get_keyspace());
-            try {
-                auto&& type = ks.metadata()->user_types()->get_type(_name.get_user_type_name());
-                if (!_frozen) {
-                    throw exceptions::invalid_request_exception("Non-frozen User-Defined types are not supported, please use frozen<>");
-                }
-                return make_shared<cql3_type>(_name.to_string(), std::move(type));
-            } catch (std::out_of_range& e) {
-                throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
+            auto&& type = user_types->get_type(_name.get_user_type_name());
+            if (!_frozen) {
+                throw exceptions::invalid_request_exception("Non-frozen User-Defined types are not supported, please use frozen<>");
            }
-        } catch (no_such_keyspace& nsk) {
-            throw exceptions::invalid_request_exception("Unknown keyspace " + _name.get_keyspace());
+            return make_shared<cql3_type>(_name.to_string(), std::move(type));
+        } catch (std::out_of_range& e) {
+            throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
        }
    }
-
+    bool references_user_type(const sstring& name) const override {
+        return _name.get_string_type_name() == name;
+    }
    virtual bool supports_freezing() const override {
        return true;
    }
@@ -191,7 +244,7 @@ public:
        }
        _frozen = true;
    }
-    virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
+    virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
        if (!_frozen) {
            freeze();
        }
@@ -200,10 +253,17 @@ public:
            if (t->is_counter()) {
                throw exceptions::invalid_request_exception("Counters are not allowed inside tuples");
            }
-            ts.push_back(t->prepare(db, keyspace)->get_type());
+            ts.push_back(t->prepare_internal(keyspace, user_types)->get_type());
        }
        return make_cql3_tuple_type(tuple_type_impl::get_instance(std::move(ts)));
    }
+
+    // True when any element type of this tuple references the user type `name`; elements are taken by reference to avoid a shared_ptr copy each.
+    bool references_user_type(const sstring& name) const override {
+        return std::any_of(_types.begin(), _types.end(),
+                [&name](const auto& t) { return t->references_user_type(name); });
+    }
+
    virtual sstring to_string() const override {
        return sprint("tuple<%s>", join(", ", _types));
    }
@@ -271,17 +331,23 @@ thread_local shared_ptr<cql3_type> cql3_type::bigint = make("bigint", long_type,
 thread_local shared_ptr<cql3_type> cql3_type::blob = make("blob", bytes_type, cql3_type::kind::BLOB);
 thread_local shared_ptr<cql3_type> cql3_type::boolean = make("boolean", boolean_type, cql3_type::kind::BOOLEAN);
 thread_local shared_ptr<cql3_type> cql3_type::double_ = make("double", double_type, cql3_type::kind::DOUBLE);
+thread_local shared_ptr<cql3_type> cql3_type::empty = make("empty", empty_type, cql3_type::kind::EMPTY);
 thread_local shared_ptr<cql3_type> cql3_type::float_ = make("float", float_type, cql3_type::kind::FLOAT);
 thread_local shared_ptr<cql3_type> cql3_type::int_ = make("int", int32_type, cql3_type::kind::INT);
+thread_local shared_ptr<cql3_type> cql3_type::smallint = make("smallint", short_type, cql3_type::kind::SMALLINT);
 thread_local shared_ptr<cql3_type> cql3_type::text = make("text", utf8_type, cql3_type::kind::TEXT);
 thread_local shared_ptr<cql3_type> cql3_type::timestamp = make("timestamp", timestamp_type, cql3_type::kind::TIMESTAMP);
+thread_local shared_ptr<cql3_type> cql3_type::tinyint = make("tinyint", byte_type, cql3_type::kind::TINYINT);
 thread_local shared_ptr<cql3_type> cql3_type::uuid = make("uuid", uuid_type, cql3_type::kind::UUID);
 thread_local shared_ptr<cql3_type> cql3_type::varchar = make("varchar", utf8_type, cql3_type::kind::TEXT);
 thread_local shared_ptr<cql3_type> cql3_type::timeuuid = make("timeuuid", timeuuid_type, cql3_type::kind::TIMEUUID);
+thread_local shared_ptr<cql3_type> cql3_type::date = make("date", simple_date_type, cql3_type::kind::DATE);
+thread_local shared_ptr<cql3_type> cql3_type::time = make("time", time_type, cql3_type::kind::TIME);
 thread_local shared_ptr<cql3_type> cql3_type::inet = make("inet", inet_addr_type, cql3_type::kind::INET);
 thread_local shared_ptr<cql3_type> cql3_type::varint = make("varint", varint_type, cql3_type::kind::VARINT);
 thread_local shared_ptr<cql3_type> cql3_type::decimal = make("decimal", decimal_type, cql3_type::kind::DECIMAL);
 thread_local shared_ptr<cql3_type> cql3_type::counter = make("counter", counter_type, cql3_type::kind::COUNTER);
+thread_local shared_ptr<cql3_type> cql3_type::duration = make("duration", duration_type, cql3_type::kind::DURATION);

 const std::vector<shared_ptr<cql3_type>>&
 cql3_type::values() {
@@ -293,15 +359,21 @@ cql3_type::values() {
        cql3_type::counter,
        cql3_type::decimal,
        cql3_type::double_,
+        cql3_type::empty,
        cql3_type::float_,
-        cql3_type:inet,
+        cql3_type::inet,
        cql3_type::int_,
+        cql3_type::smallint,
        cql3_type::text,
        cql3_type::timestamp,
+        cql3_type::tinyint,
        cql3_type::uuid,
        cql3_type::varchar,
        cql3_type::varint,
        cql3_type::timeuuid,
+        cql3_type::date,
+        cql3_type::time,
+        cql3_type::duration,
    };
    return v;
 }
@@ -321,5 +393,23 @@ operator<<(std::ostream& os, const cql3_type::raw& r) {
    return os << r.to_string();
 }

+namespace util {
+
+// Returns `s` unchanged when it is a plain lowercase identifier, otherwise double-quoted with embedded quotes doubled.
+sstring maybe_quote(const sstring& s) {
+    static const std::regex unquoted("[a-z][a-z0-9_]*"); // same rule as column_identifier::to_cql_string; "\\w*" also let "", uppercase and leading digits/underscores through
+    static const std::regex double_quote("\"");
+    if (std::regex_match(s.begin(), s.end(), unquoted)) {
+        return s;
+    }
+    std::ostringstream ss;
+    ss << "\"";
+    std::regex_replace(std::ostreambuf_iterator<char>(ss), s.begin(), s.end(), double_quote, "\"\"");
+    ss << "\"";
+    return ss.str();
+}
+
+}
+
 }

--- a/cql3/cql3_type.hh
+++ b/cql3/cql3_type.hh
@@ -47,6 +47,7 @@
 #include "enum_set.hh"

 class database;
+class user_types_metadata;

 namespace cql3 {

@@ -63,19 +64,23 @@ public:
    bool is_counter() const { return _type->is_counter(); }
    bool is_native() const { return _native; }
    data_type get_type() const { return _type; }
-    sstring to_string() const { return _name; }
+    sstring to_string() const;

    // For UserTypes, we need to know the current keyspace to resolve the
    // actual type used, so Raw is a "not yet prepared" CQL3Type.
    class raw {
    public:
+        virtual ~raw() {}
        bool _frozen = false;
        virtual bool supports_freezing() const = 0;
        virtual bool is_collection() const;
        virtual bool is_counter() const;
+        virtual bool is_duration() const;
+        virtual bool references_user_type(const sstring&) const;
        virtual std::experimental::optional<sstring> keyspace() const;
        virtual void freeze();
-        virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) = 0;
+        virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata>) = 0;
+        virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace);
        static shared_ptr<raw> from(shared_ptr<cql3_type> type);
        static shared_ptr<raw> user_type(ut_name name);
        static shared_ptr<raw> map(shared_ptr<raw> t1, shared_ptr<raw> t2);
@@ -98,7 +103,7 @@ private:

 public:
    enum class kind : int8_t {
-        ASCII, BIGINT, BLOB, BOOLEAN, COUNTER, DECIMAL, DOUBLE, FLOAT, INT, INET, TEXT, TIMESTAMP, UUID, VARCHAR, VARINT, TIMEUUID
+        ASCII, BIGINT, BLOB, BOOLEAN, COUNTER, DECIMAL, DOUBLE, EMPTY, FLOAT, INT, SMALLINT, TINYINT, INET, TEXT, TIMESTAMP, UUID, VARCHAR, VARINT, TIMEUUID, DATE, TIME, DURATION
    };
    using kind_enum = super_enum<kind,
        kind::ASCII,
@@ -108,15 +113,21 @@ public:
        kind::COUNTER,
        kind::DECIMAL,
        kind::DOUBLE,
+        kind::EMPTY,
        kind::FLOAT,
        kind::INET,
        kind::INT,
+        kind::SMALLINT,
+        kind::TINYINT,
        kind::TEXT,
        kind::TIMESTAMP,
        kind::UUID,
        kind::VARCHAR,
        kind::VARINT,
-        kind::TIMEUUID>;
+        kind::TIMEUUID,
+        kind::DATE,
+        kind::TIME,
+        kind::DURATION>;
    using kind_enum_set = enum_set<kind_enum>;
 private:
    std::experimental::optional<kind_enum_set::prepared> _kind;
@@ -129,17 +140,23 @@ public:
    static thread_local shared_ptr<cql3_type> blob;
    static thread_local shared_ptr<cql3_type> boolean;
    static thread_local shared_ptr<cql3_type> double_;
+    static thread_local shared_ptr<cql3_type> empty;
    static thread_local shared_ptr<cql3_type> float_;
    static thread_local shared_ptr<cql3_type> int_;
+    static thread_local shared_ptr<cql3_type> smallint;
    static thread_local shared_ptr<cql3_type> text;
    static thread_local shared_ptr<cql3_type> timestamp;
+    static thread_local shared_ptr<cql3_type> tinyint;
    static thread_local shared_ptr<cql3_type> uuid;
    static thread_local shared_ptr<cql3_type> varchar;
    static thread_local shared_ptr<cql3_type> timeuuid;
+    static thread_local shared_ptr<cql3_type> date;
+    static thread_local shared_ptr<cql3_type> time;
    static thread_local shared_ptr<cql3_type> inet;
    static thread_local shared_ptr<cql3_type> varint;
    static thread_local shared_ptr<cql3_type> decimal;
    static thread_local shared_ptr<cql3_type> counter;
+    static thread_local shared_ptr<cql3_type> duration;

    static const std::vector<shared_ptr<cql3_type>>& values();
 public:
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -46,7 +46,7 @@
 #include "service/storage_proxy.hh"
 #include "cql3/query_options.hh"

-namespace transport {
+namespace cql_transport {

 namespace messages {

@@ -89,7 +89,7 @@ public:
     * @param state the current query state
     * @param options options for this query (consistency, variables, pageSize, ...)
     */
-    virtual future<::shared_ptr<transport::messages::result_message>>
+    virtual future<::shared_ptr<cql_transport::messages::result_message>>
        execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) = 0;

    /**
@@ -97,7 +97,7 @@ public:
     *
     * @param state the current query state
     */
-    virtual future<::shared_ptr<transport::messages::result_message>>
+    virtual future<::shared_ptr<cql_transport::messages::result_message>>
        execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) = 0;

    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;
--- a/cql3/error_collector.hh
+++ b/cql3/error_collector.hh
@@ -67,10 +67,6 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
     */
    const sstring_view _query;

-    /**
-     * The error messages.
-     */
-    std::vector<sstring> _error_msgs;
 public:

    /**
@@ -81,7 +77,10 @@ public:
     */
    error_collector(const sstring_view& query) : _query(query) {}

-    virtual void syntax_error(RecognizerType& recognizer, ANTLR_UINT8** token_names, ExceptionBaseType* ex) override {
+    /**
+     * Format and throw a new \c exceptions::syntax_exception.
+     */
+    [[noreturn]] virtual void syntax_error(RecognizerType& recognizer, ANTLR_UINT8** token_names, ExceptionBaseType* ex) override {
        auto hdr = get_error_header(ex);
        auto msg = get_error_message(recognizer, ex, token_names);
        std::stringstream result;
@@ -90,22 +89,15 @@ public:
        if (recognizer instanceof Parser)
            appendQuerySnippet((Parser) recognizer, builder);
 #endif
-        _error_msgs.emplace_back(result.str());
-    }

-    virtual void syntax_error(RecognizerType& recognizer, const sstring& msg) override {
-        _error_msgs.emplace_back(msg);
+        throw exceptions::syntax_exception(result.str());
    }

    /**
-     * Throws the first syntax error found by the lexer or the parser if it exists.
-     *
-     * @throws SyntaxException the syntax error.
+     * Throw a new \c exceptions::syntax_exception.
     */
-    void throw_first_syntax_error() {
-        if (!_error_msgs.empty()) {
-            throw exceptions::syntax_exception(_error_msgs[0]);
-        }
+    [[noreturn]] virtual void syntax_error(RecognizerType&, const sstring& msg) override {
+        throw exceptions::syntax_exception(msg);
    }

 private:
--- a/cql3/error_listener.hh
+++ b/cql3/error_listener.hh
@@ -41,6 +41,7 @@

 #pragma once

+#include "seastarx.hh"
 #include <seastar/core/sstring.hh>
 #include <antlr3.hpp>

@@ -52,6 +53,7 @@ namespace cql3 {
 template<typename RecognizerType, typename ExceptionBaseType>
 class error_listener {
 public:
+    virtual ~error_listener() = default;

    /**
     * Invoked when a syntax error occurs.
--- a/cql3/functions/abstract_function.hh
+++ b/cql3/functions/abstract_function.hh
@@ -43,7 +43,7 @@

 #include "types.hh"
 #include <vector>
-#include <iostream>
+#include <iosfwd>
 #include <boost/functional/hash.hpp>

 namespace cql3 {
--- a/cql3/functions/aggregate_fcts.hh
+++ b/cql3/functions/aggregate_fcts.hh
@@ -41,6 +41,7 @@

 #pragma once

+#include "utils/big_decimal.hh"
 #include "aggregate_function.hh"
 #include "native_aggregate_function.hh"

@@ -111,9 +112,70 @@ make_sum_function() {
    return make_shared<sum_function_for<Type>>();
 }

+template <typename Type>  // Type: value type that avg() returns
+class impl_div_for_avg {
+public:
+    // Acc is deduced, so the sum is divided at accumulator width and only the quotient is narrowed.
+    template <typename Acc>
+    static Type div(const Acc& x, const int64_t y) { return static_cast<Type>(x / y); }
+};
+
+template <>
+class impl_div_for_avg<big_decimal> {  // specialization: big_decimal is divided through its own div() member
+public:
+    static big_decimal div(const big_decimal& x, const int64_t y) {
+        return x.div(y, big_decimal::rounding_mode::HALF_EVEN);  // rounding mode requested: HALF_EVEN
+    }
+};
+
+// We need a wider accumulator for average, since summing the inputs can overflow
+// the input type
+template <typename T>
+struct accumulator_for;  // only declared: a T without a specialization below has no ::type and fails to compile
+
+template <>
+struct accumulator_for<int8_t> {
+    using type = __int128;  // every fixed-width integer input (8 to 64 bits) sums into __int128
+};
+
+template <>
+struct accumulator_for<int16_t> {
+    using type = __int128;
+};
+
+template <>
+struct accumulator_for<int32_t> {
+    using type = __int128;
+};
+
+template <>
+struct accumulator_for<int64_t> {
+    using type = __int128;
+};
+
+template <>
+struct accumulator_for<float> {
+    using type = float;  // same type as the input, so no extra headroom for float sums
+};
+
+template <>
+struct accumulator_for<double> {
+    using type = double;  // same type as the input
+};
+
+template <>
+struct accumulator_for<boost::multiprecision::cpp_int> {
+    using type = boost::multiprecision::cpp_int;  // accumulates in the input type itself
+};
+
+template <>
+struct accumulator_for<big_decimal> {
+    using type = big_decimal;  // accumulates in the input type itself
+};
+
 template <typename Type>
 class impl_avg_function_for final : public aggregate_function::aggregate {
-   Type _sum{};
+   typename accumulator_for<Type>::type _sum{};
   int64_t _count = 0;
 public:
    virtual void reset() override {
@@ -121,9 +183,9 @@ public:
        _count = 0;
    }
    virtual opt_bytes compute(cql_serialization_format sf) override {
-        Type ret = 0;
+        Type ret{};
        if (_count) {
-            ret = _sum / _count;
+            ret = impl_div_for_avg<Type>::div(_sum, _count);
        }
        return data_type_for<Type>()->decompose(ret);
    }
--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "castas_fcts.hh"
+#include "cql3/functions/native_scalar_function.hh"
+
+namespace cql3 {
+namespace functions {
+
+namespace {
+
+using bytes_opt = std::experimental::optional<bytes>;
+
+class castas_function_for : public cql3::functions::native_scalar_function {  // scalar function wrapping a single CAST conversion
+    castas_fctn _func;  // conversion applied to the deserialized argument in execute()
+public:
+    castas_function_for(data_type to_type,
+                        data_type from_type,
+                        castas_fctn func)
+            : native_scalar_function("castas" + to_type->as_cql3_type()->to_string(), to_type, {from_type})  // "castas" + target CQL type name, result type, single argument type
+            , _func(func) {
+    }
+    virtual bool is_pure() override {
+        return true;
+    }
+    virtual void print(std::ostream& os) const override {
+        os << "cast(" << _arg_types[0]->name() << " as " << _return_type->name() << ")";
+    }
+    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {  // sf is not used in this body
+        auto from_type = arg_types()[0];
+        auto to_type = return_type();
+
+        auto&& val = parameters[0];  // only the first parameter is read
+        if (!val) {
+            return val;  // a disengaged argument is returned as-is, without calling _func
+        }
+        auto val_from = from_type->deserialize(*val);
+        auto val_to = _func(val_from);
+        return to_type->decompose(val_to);  // converted value is serialized with the return type
+    }
+};
+
+shared_ptr<function> make_castas_function(data_type to_type, data_type from_type, castas_fctn func) {  // wraps func in a castas_function_for, returned through the generic function pointer type
+    return ::make_shared<castas_function_for>(std::move(to_type), std::move(from_type), std::move(func));
+}
+
+} /* Anonymous Namespace */
+
+shared_ptr<function> castas_functions::get(data_type to_type, const std::vector<shared_ptr<cql3::selection::selector>>& provided_args, schema_ptr s) {  // s is not used in this body
+    if (provided_args.size() != 1) {  // exactly one argument is accepted
+        throw exceptions::invalid_request_exception("Invalid CAST expression");
+    }
+    auto from_type = provided_args[0]->get_type();
+    auto from_type_key = from_type;  // type used only to look up the conversion routine
+    if (from_type_key->is_reversed()) {
+        from_type_key = dynamic_cast<const reversed_type_impl&>(*from_type).underlying_type();  // a reversed type is looked up by its underlying type
+    }
+
+    auto f = get_castas_fctn(to_type, from_type_key);
+    return make_castas_function(to_type, from_type, f);  // the function keeps the original (possibly reversed) argument type
+}
+
+}
+}
--- a/cql3/functions/castas_fcts.hh
+++ b/cql3/functions/castas_fcts.hh
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Modified by ScyllaDB
+ *
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <unordered_map>
+
+#include "cql3/functions/function.hh"
+#include "cql3/functions/abstract_function.hh"
+#include "exceptions/exceptions.hh"
+#include "core/print.hh"
+#include "cql3/cql3_type.hh"
+#include "cql3/selection/selector.hh"
+
+namespace cql3 {
+namespace functions {
+
+class castas_functions {  // static lookup entry point for CAST functions
+public:
+    static shared_ptr<function> get(data_type to_type, const std::vector<shared_ptr<cql3::selection::selector>>& provided_args, schema_ptr s);  // returns the function casting the single selector in provided_args to to_type
+};
+
+}
+}
--- a/cql3/functions/function_call.hh
+++ b/cql3/functions/function_call.hh
@@ -59,13 +59,13 @@ public:
    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
    virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names) override;
    virtual shared_ptr<terminal> bind(const query_options& options) override;
-    virtual bytes_view_opt bind_and_get(const query_options& options) override;
+    virtual cql3::raw_value_view bind_and_get(const query_options& options) override;
 private:
    static bytes_opt execute_internal(cql_serialization_format sf, scalar_function& fun, std::vector<bytes_opt> params);
 public:
    virtual bool contains_bind_marker() const override;
 private:
-    static shared_ptr<terminal> make_terminal(shared_ptr<function> fun, bytes_opt result, cql_serialization_format sf);
+    static shared_ptr<terminal> make_terminal(shared_ptr<function> fun, cql3::raw_value result, cql_serialization_format sf);
 public:
    class raw : public term::raw {
        function_name _name;
--- a/cql3/functions/function_name.hh
+++ b/cql3/functions/function_name.hh
@@ -43,7 +43,7 @@

 #include "core/sstring.hh"
 #include "db/system_keyspace.hh"
-#include <iostream>
+#include <iosfwd>
 #include <functional>

 namespace cql3 {
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -59,6 +59,14 @@ functions::init() {
        declare(make_to_blob_function(type->get_type()));
        declare(make_from_blob_function(type->get_type()));
    }
+    declare(aggregate_fcts::make_count_function<int8_t>());
+    declare(aggregate_fcts::make_max_function<int8_t>());
+    declare(aggregate_fcts::make_min_function<int8_t>());
+
+    declare(aggregate_fcts::make_count_function<int16_t>());
+    declare(aggregate_fcts::make_max_function<int16_t>());
+    declare(aggregate_fcts::make_min_function<int16_t>());
+
    declare(aggregate_fcts::make_count_function<int32_t>());
    declare(aggregate_fcts::make_max_function<int32_t>());
    declare(aggregate_fcts::make_min_function<int32_t>());
@@ -67,6 +75,26 @@ functions::init() {
    declare(aggregate_fcts::make_max_function<int64_t>());
    declare(aggregate_fcts::make_min_function<int64_t>());

+    declare(aggregate_fcts::make_count_function<boost::multiprecision::cpp_int>());
+    declare(aggregate_fcts::make_max_function<boost::multiprecision::cpp_int>());
+    declare(aggregate_fcts::make_min_function<boost::multiprecision::cpp_int>());
+
+    declare(aggregate_fcts::make_count_function<big_decimal>());
+    declare(aggregate_fcts::make_max_function<big_decimal>());
+    declare(aggregate_fcts::make_min_function<big_decimal>());
+
+    declare(aggregate_fcts::make_count_function<float>());
+    declare(aggregate_fcts::make_max_function<float>());
+    declare(aggregate_fcts::make_min_function<float>());
+
+    declare(aggregate_fcts::make_count_function<double>());
+    declare(aggregate_fcts::make_max_function<double>());
+    declare(aggregate_fcts::make_min_function<double>());
+
+    declare(aggregate_fcts::make_count_function<sstring>());
+    declare(aggregate_fcts::make_max_function<sstring>());
+    declare(aggregate_fcts::make_min_function<sstring>());
+
    //FIXME:
    //declare(aggregate_fcts::make_count_function<bytes>());
    //declare(aggregate_fcts::make_max_function<bytes>());
@@ -76,20 +104,22 @@ functions::init() {

    declare(make_varchar_as_blob_fct());
    declare(make_blob_as_varchar_fct());
+    declare(aggregate_fcts::make_sum_function<int8_t>());
+    declare(aggregate_fcts::make_sum_function<int16_t>());
    declare(aggregate_fcts::make_sum_function<int32_t>());
    declare(aggregate_fcts::make_sum_function<int64_t>());
+    declare(aggregate_fcts::make_sum_function<float>());
+    declare(aggregate_fcts::make_sum_function<double>());
+    declare(aggregate_fcts::make_sum_function<boost::multiprecision::cpp_int>());
+    declare(aggregate_fcts::make_sum_function<big_decimal>());
+    declare(aggregate_fcts::make_avg_function<int8_t>());
+    declare(aggregate_fcts::make_avg_function<int16_t>());
    declare(aggregate_fcts::make_avg_function<int32_t>());
    declare(aggregate_fcts::make_avg_function<int64_t>());
-#if 0
-    declare(AggregateFcts.sumFunctionForFloat);
-    declare(AggregateFcts.sumFunctionForDouble);
-    declare(AggregateFcts.sumFunctionForDecimal);
-    declare(AggregateFcts.sumFunctionForVarint);
-    declare(AggregateFcts.avgFunctionForFloat);
-    declare(AggregateFcts.avgFunctionForDouble);
-    declare(AggregateFcts.avgFunctionForVarint);
-    declare(AggregateFcts.avgFunctionForDecimal);
-#endif
+    declare(aggregate_fcts::make_avg_function<float>());
+    declare(aggregate_fcts::make_avg_function<double>());
+    declare(aggregate_fcts::make_avg_function<boost::multiprecision::cpp_int>());
+    declare(aggregate_fcts::make_avg_function<big_decimal>());

    // also needed for smp:
 #if 0
@@ -299,10 +329,10 @@ function_call::collect_marker_specification(shared_ptr<variable_specifications>

 shared_ptr<terminal>
 function_call::bind(const query_options& options) {
-    return make_terminal(_fun, to_bytes_opt(bind_and_get(options)), options.get_cql_serialization_format());
+    return make_terminal(_fun, cql3::raw_value::make_value(bind_and_get(options)), options.get_cql_serialization_format());
 }

-bytes_view_opt
+cql3::raw_value_view
 function_call::bind_and_get(const query_options& options) {
    std::vector<bytes_opt> buffers;
    buffers.reserve(_terms.size());
@@ -316,7 +346,7 @@ function_call::bind_and_get(const query_options& options) {
        buffers.push_back(std::move(to_bytes_opt(val)));
    }
    auto result = execute_internal(options.get_cql_serialization_format(), *_fun, std::move(buffers));
-    return options.make_temporary(result);
+    return options.make_temporary(cql3::raw_value::make_value(result));
 }

 bytes_opt
@@ -328,7 +358,7 @@ function_call::execute_internal(cql_serialization_format sf, scalar_function& fu
            fun.return_type()->validate(*result);
        }
        return result;
-    } catch (marshal_exception e) {
+    } catch (marshal_exception& e) {
        throw runtime_exception(sprint("Return of function %s (%s) is not a valid value for its declared return type %s",
                                       fun, to_hex(result),
                                       *fun.return_type()->as_cql3_type()
@@ -347,7 +377,7 @@ function_call::contains_bind_marker() const {
 }

 shared_ptr<terminal>
-function_call::make_terminal(shared_ptr<function> fun, bytes_opt result, cql_serialization_format sf)  {
+function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, cql_serialization_format sf)  {
    if (!dynamic_pointer_cast<const collection_type_impl>(fun->return_type())) {
        return ::make_shared<constants::value>(std::move(result));
    }
@@ -413,7 +443,7 @@ function_call::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<
    // If all parameters are terminal and the function is pure, we can
    // evaluate it now, otherwise we'd have to wait execution time
    if (all_terminal && scalar_fun->is_pure()) {
-        return make_terminal(scalar_fun, execute(*scalar_fun, parameters), query_options::DEFAULT.get_cql_serialization_format());
+        return make_terminal(scalar_fun, cql3::raw_value::make_value(execute(*scalar_fun, parameters)), query_options::DEFAULT.get_cql_serialization_format());
    } else {
        return ::make_shared<function_call>(scalar_fun, parameters);
    }
@@ -426,7 +456,7 @@ function_call::raw::execute(scalar_function& fun, std::vector<shared_ptr<term>>
    for (auto&& t : parameters) {
        assert(dynamic_cast<terminal*>(t.get()));
        auto&& param = static_cast<terminal*>(t.get())->get(query_options::DEFAULT);
-        buffers.push_back(std::move(param));
+        buffers.push_back(std::move(to_bytes_opt(param)));
    }

    return execute_internal(cql_serialization_format::internal(), fun, buffers);
--- a/Show More
+++ b/Show More