schema.cc/describe: fix invalid compaction options in schema

The schema.cql file written with a snapshot is missing a comma after the
compaction strategy class, so restoring the schema from that file fails:

    AND compaction = {'class': 'SizeTieredCompactionStrategy''max_compaction_threshold': '32'}

The map_as_cql_param() function has a `first` parameter that lets it add the
separating comma automatically; the compaction_strategy_options are never the
first entry.

Fixes #7741
Signed-off-by: Amos Kong <amos@scylladb.com>
Closes #7734
(cherry picked from commit 6b1659ee80)
sstable: writer: ka/la: Write row marker cell after row tombstone
2021-03-24 12:58:11 +02:00 · 2021-03-24 10:42:11 +02:00 · 2021-03-21 10:51:36 +02:00 · 2021-03-18 19:20:10 +02:00 · 2021-03-18 14:29:38 +02:00 · 2021-03-11 08:24:56 +02:00
4679 changed files with 72243 additions and 29330 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,4 @@
 .git
 build
 seastar/build
+testlog
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,17 +1,20 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
 	url = ../scylla-swagger-ui
 	ignore = dirty
-[submodule "xxHash"]
-	path = xxHash
-	url = ../xxHash
 [submodule "libdeflate"]
 	path = libdeflate
 	url = ../libdeflate
-[submodule "zstd"]
-	path = zstd
-	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
+[submodule "scylla-jmx"]
+	path = scylla-jmx
+	url = ../scylla-jmx
+[submodule "scylla-tools"]
+	path = scylla-tools
+	url = ../scylla-tools-java
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,13 +5,25 @@
 cmake_minimum_required(VERSION 3.7)
 project(scylla)

+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "Dev" "Sanitize")
+endif()
+
+if(CMAKE_BUILD_TYPE)
+    string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE)
+else()
+    set(BUILD_TYPE "release")
+endif()
+
 if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
 endif()

-# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-set(SEASTAR_INCLUDE_DIRS "seastar")
-
 # These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
 # Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
 set(SEASTAR_DPDK_INCLUDE_DIRS
@@ -22,9 +34,14 @@ set(SEASTAR_DPDK_INCLUDE_DIRS

 find_package(PkgConfig REQUIRED)

-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
 pkg_check_modules(SEASTAR seastar)

+if(NOT SEASTAR_INCLUDE_DIRS)
+    # Default value, used only when `pkg-config` (above) did not find `seastar.pc` and so provided no include directories.
+    set(SEASTAR_INCLUDE_DIRS "seastar/include")
+endif()
+
 find_package(Boost COMPONENTS filesystem program_options system thread)

 ##
@@ -70,7 +87,7 @@ scan_scylla_source_directories(
          seastar/json
          seastar/net
          seastar/rpc
-          seastar/tests
+          seastar/testing
          seastar/util)

 scan_scylla_source_directories(
@@ -106,7 +123,7 @@ scan_scylla_source_directories(
 scan_scylla_source_directories(
        VAR SCYLLA_GEN_SOURCE_FILES
        RECURSIVE
-        PATHS build/release/gen)
+        PATHS build/${BUILD_TYPE}/gen)

 set(SCYLLA_SOURCE_FILES
        ${SCYLLA_ROOT_SOURCE_FILES}
@@ -117,15 +134,11 @@ add_executable(scylla
        ${SEASTAR_SOURCE_FILES}
        ${SCYLLA_SOURCE_FILES})

-# Note that since CLion does not undestand GCC6 concepts, we always disable them (even if users configure otherwise).
-# CLion seems to have trouble with `-U` (macro undefinition), so we do it this way instead.
-list(REMOVE_ITEM SEASTAR_CFLAGS "-DHAVE_GCC6_CONCEPTS")
-
 # If the Seastar pkg-config information is available, append to the default flags.
 #
 # For ease of browsing the source code, we always pretend that DPDK is enabled.
 target_compile_options(scylla PUBLIC
-        -std=gnu++1z
+        -std=gnu++20
        -DHAVE_DPDK
        -DHAVE_HWLOC
        "${SEASTAR_CFLAGS}")
@@ -139,4 +152,4 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
-        build/release/gen)
+        build/${BUILD_TYPE}/gen)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,4 +8,4 @@ Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to re

 # Contributing Code to Scylla

-To contribute code to Scylla, you need to sign the [Contributor License Agreement](http://www.scylladb.com/opensource/cla/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
+To contribute code to Scylla, you need to sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
--- a/HACKING.md
+++ b/HACKING.md
@@ -18,23 +18,35 @@ $ git submodule update --init --recursive

 ### Dependencies

-Scylla depends on the system package manager for its development dependencies.
+Scylla is fairly fussy about its build environment, requiring a very recent
+version of the C++20 compiler and numerous tools and libraries to build.

-Running `./install-dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
+Run `./install-dependencies.sh` (as root) to use your Linux distribution's
+package manager to install the appropriate packages on your build machine.
+However, this will only work on very recent distributions. For example,
+currently Fedora users must upgrade to Fedora 32 otherwise the C++ compiler
+will be too old, and not support the new C++20 standard that Scylla uses.

-On Ubuntu and Debian based Linux distributions, some packages
-required to build Scylla are missing in the official upstream:
+Alternatively, to avoid having to upgrade your build machine or install
+various packages on it, we provide another option - the **frozen toolchain**.
+This is a script, `./tools/toolchain/dbuild`, that can execute build or run
+commands inside a Docker image that contains exactly the right build tools and
+libraries. The `dbuild` technique is useful for beginners, but is also the way
+in which ScyllaDB produces official releases, so it is highly recommended.

- libthrift-dev and libthrift
- antlr3-c++-dev
+To use `dbuild`, you simply prefix any build or run command with it. Building
+and running Scylla becomes as easy as:

-Try running ```sudo ./scripts/scylla_current_repo``` to add Scylla upstream,
-and get the missing packages from it.
+```bash
+$ ./tools/toolchain/dbuild ./configure.py
+$ ./tools/toolchain/dbuild ninja build/release/scylla
+$ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
+```

 ### Build system

 **Note**: Compiling Scylla requires, conservatively, 2 GB of memory per native
-thread, and up to 3 GB per native thread while linking. GCC >= 8.1.1. is
+thread, and up to 3 GB per native thread while linking. GCC >= 10 is
 required.

 Scylla is built with [Ninja](https://ninja-build.org/), a low-level rule-based system. A Python script, `configure.py`, generates a Ninja file (`build.ninja`) based on configuration options.
@@ -141,7 +153,7 @@ In v3:
 "Tests: unit ({mode}), dtest ({smp})"
 ```

-The usual is "Tests: unit (release)", although running debug tests is encouraged.
+The usual is "Tests: unit (dev)", although running debug tests is encouraged.

 5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.

--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,5 +1,7 @@
 This project includes code developed by the Apache Software Foundation (http://www.apache.org/),
 especially Apache Cassandra.

-It also includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
+It includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
 These files are located in utils/arch/powerpc/crc32-vpmsum. Their license may be found in licenses/LICENSE-crc32-vpmsum.TXT.
+
+It includes modified code from https://gitbox.apache.org/repos/asf?p=cassandra-dtest.git (owned by The Apache Software Foundation)
--- a/README.md
+++ b/README.md
@@ -2,22 +2,24 @@

 ## Quick-start

-To get the build going quickly, Scylla offers a [frozen toolchain](tools/toolchain/README.md)
-which would build and run Scylla using a pre-configured Docker image.
-Using the frozen toolchain will also isolate all of the installed
-dependencies in a Docker container.
-Assuming you have met the toolchain prerequisites, which is running
-Docker in user mode, building and running is as easy as:
+Scylla is fairly fussy about its build environment, requiring very recent
+versions of the C++20 compiler and of many libraries to build. The document
+[HACKING.md](HACKING.md) includes detailed information on building and
+developing Scylla, but to get Scylla building quickly on (almost) any build
+machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md).
+This is a pre-configured Docker image which includes recent versions of all
+the required compilers, libraries and build tools. Using the frozen toolchain
+allows you to avoid changing anything in your build machine to meet Scylla's
+requirements - you just need to meet the frozen toolchain's prerequisites
+(mostly, Docker or Podman being available).
+
+Building and running Scylla with the frozen toolchain is as easy as:

 ```bash
 $ ./tools/toolchain/dbuild ./configure.py
 $ ./tools/toolchain/dbuild ninja build/release/scylla
 $ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
- ```
-
-Please see [HACKING.md](HACKING.md) for detailed information on building and developing Scylla.
-
-**Note**: GCC >= 8.1.1 is required to compile Scylla.
+```

 ## Running Scylla

@@ -38,6 +40,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev
 ./build/release/scylla --help
 ```

+## Testing
+
+See [test.py manual](docs/testing.md).
+
 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
 Thrift. There is also experimental support for the API of Amazon DynamoDB,
@@ -56,41 +62,27 @@ both.
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

-## Building Fedora RPM
+## Training 

-As a pre-requisite, you need to install [Mock](https://fedoraproject.org/wiki/Mock) on your machine:
+Training material and online courses can be found at [Scylla University](https://university.scylladb.com/). 
+The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling, 
+administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
+multi-datacenters and how Scylla integrates with third-party applications.

-```
-# Install mock:
-sudo yum install mock
-
-# Add user to the "mock" group:
-usermod -a -G mock $USER && newgrp mock
-```
-
-Then, to build an RPM, run:
-
-```
-./dist/redhat/build_rpm.sh
-```
-
-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
-For example, on Fedora 21 mock reports the following:
-
-```
-INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
-```
-
-## Building Fedora-based Docker image
+## Building a CentOS-based Docker image

 Build a Docker image with:

 ```
-cd dist/docker
+cd dist/docker/redhat
 docker build -t <image-name> .
 ```

+This build is based on executables downloaded from downloads.scylladb.com,
+**not** on the executables built in this source directory. See further
+instructions in dist/docker/redhat/README.md to build a docker image from
+your own executables.
+
 Run the image with:

 ```
--- a/10
+++ b/10
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.2.4

 if test -f version
 then
@@ -19,6 +19,14 @@ else
 	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

+if [ -f build/SCYLLA-RELEASE-FILE ]; then
+	RELEASE_FILE=$(cat build/SCYLLA-RELEASE-FILE)
+	GIT_COMMIT_FILE=$(cat build/SCYLLA-RELEASE-FILE |cut -d . -f 3)
+	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
+		exit 0
+	fi
+fi
+
 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p build
 echo "$SCYLLA_VERSION" > build/SCYLLA-VERSION-FILE
--- a/1
+++ b/1
--- a/absl-flat_hash_map.cc
+++ b/absl-flat_hash_map.cc
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "absl-flat_hash_map.hh"
+
+size_t sstring_hash::operator()(std::string_view v) const noexcept {
+    return absl::Hash<std::string_view>{}(v);
+}
--- a/absl-flat_hash_map.hh
+++ b/absl-flat_hash_map.hh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+#include <seastar/core/sstring.hh>
+
+using namespace seastar;
+
+struct sstring_hash {
+    using is_transparent = void;
+    size_t operator()(std::string_view v) const noexcept;
+};
+
+struct sstring_eq {
+    using is_transparent = void;
+    bool operator()(std::string_view a, std::string_view b) const noexcept {
+        return a == b;
+    }
+};
+
+template <typename K, typename V, typename... Ts>
+struct flat_hash_map : public absl::flat_hash_map<K, V, Ts...> {
+};
+
+template <typename V>
+struct flat_hash_map<sstring, V>
+    : public absl::flat_hash_map<sstring, V, sstring_hash, sstring_eq> {};
--- a/alternator-test/test_condition_expression.py
+++ b/alternator-test/test_condition_expression.py
--- a/alternator-test/test_item.py
+++ b/alternator-test/test_item.py
@@ -1,402 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the CRUD item operations: PutItem, GetItem, UpdateItem, DeleteItem
-
-import pytest
-from botocore.exceptions import ClientError
-from decimal import Decimal
-from util import random_string, random_bytes
-
-# Basic test for creating a new item with a random name, and reading it back
-# with strong consistency.
-# Only the string type is used for keys and attributes. None of the various
-# optional PutItem features (Expected, ReturnValues, ReturnConsumedCapacity,
-# ReturnItemCollectionMetrics, ConditionalOperator, ConditionExpression,
-# ExpressionAttributeNames, ExpressionAttributeValues) are used, and
-# for GetItem strong consistency is requested as well as all attributes,
-# but no other optional features (AttributesToGet, ReturnConsumedCapacity,
-# ProjectionExpression, ExpressionAttributeNames)
-def test_basic_string_put_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': val, 'another': val2})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Similar to test_basic_string_put_and_get, just uses UpdateItem instead of
-# PutItem. Because the item does not yet exist, it should work the same.
-def test_basic_string_update_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'PUT'}, 'another': {'Value': val2, 'Action': 'PUT'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Test put_item and get_item of various types for the *attributes*,
-# including both scalars as well as nested documents, lists and sets.
-# The full list of types tested here:
-#    number, boolean, bytes, null, list, map, string set, number set,
-#    binary set.
-# The keys are still strings.
-# Note that only top-level attributes are written and read in this test -
-# this test does not attempt to modify *nested* attributes.
-# See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/dynamodb.html
-# on how to pass these various types to Boto3's put_item().
-def test_put_and_get_attribute_types(test_table):
-    key = {'p': random_string(), 'c': random_string()}
-    test_items = [
-        Decimal("12.345"),
-        42,
-        True,
-        False,
-        b'xyz',
-        None,
-        ['hello', 'world', 42],
-        {'hello': 'world', 'life': 42},
-        {'hello': {'test': 'hi', 'hello': True, 'list': [1, 2, 'hi']}},
-        set(['hello', 'world', 'hi']),
-        set([1, 42, Decimal("3.14")]),
-        set([b'xyz', b'hi']),
-    ]
-    item = { str(i) : test_items[i] for i in range(len(test_items)) }
-    item.update(key)
-    test_table.put_item(Item=item)
-    got_item = test_table.get_item(Key=key, ConsistentRead=True)['Item']
-    assert item == got_item
-
-# The test_empty_* tests below verify support for empty items, with no
-# attributes except the key. This is a difficult case for Scylla, because
-# for an empty row to exist, Scylla needs to add a "CQL row marker".
-# There are several ways to create empty items - via PutItem, UpdateItem
-# and deleting attributes from non-empty items, and we need to check them
-# all, in several test_empty_* tests:
-def test_empty_put(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_put_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'hello': 'world'})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Value': 'world', 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-
-# Test error handling of UpdateItem passed a bad "Action" field.
-def test_update_bad_action(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'NONEXISTENT'}})
-
-# A more elaborate UpdateItem test, updating different attributes at different
-# times. Includes PUT and DELETE operations.
-def test_basic_string_more_update(test_table):
-    p = random_string()
-    c = random_string()
-    val1 = random_string()
-    val2 = random_string()
-    val3 = random_string()
-    val4 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val3, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['a1'] == val3
-    assert item['a2'] == val2
-    assert not 'a3' in item
-
-# Test that item operations on a non-existant table name fail with correct
-# error code.
-def test_item_operations_nonexistent_table(dynamodb):
-    with pytest.raises(ClientError, match='ResourceNotFoundException'):
-        dynamodb.meta.client.put_item(TableName='non_existent_table',
-            Item={'a':{'S':'b'}})
-
-# Fetching a non-existant item. According to the DynamoDB doc, "If there is no
-# matching item, GetItem does not return any data and there will be no Item
-# element in the response."
-def test_get_item_missing_item(test_table):
-    p = random_string()
-    c = random_string()
-    assert not "Item" in test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)
-
-# Test that if we have a table with string hash and sort keys, we can't read
-# or write items with other key types to it.
-def test_put_item_wrong_key_type(test_table):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.put_item(Item={'p': s, 'c': s})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s})
-def test_update_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.update_item(Key={'p': s, 'c': s}, AttributeUpdates={})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': b, 'c': s}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': n, 'c': s}, AttributeUpdates={})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': b}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': n}, AttributeUpdates={})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'c': s}, AttributeUpdates={})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s}, AttributeUpdates={})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_get_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types) but have empty result
-    assert not "Item" in test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_delete_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.delete_item(Key={'p': s, 'c': s})
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': s, 'c': s})
-
-# Most of the tests here arbitrarily used a table with both hash and sort keys
-# (both strings). Let's check that a table with *only* a hash key works ok
-# too, for PutItem, GetItem, and UpdateItem.
-def test_only_hash_key(test_table_s):
-    s = random_string()
-    test_table_s.put_item(Item={'p': s, 'hello': 'world'})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world'}
-    test_table_s.update_item(Key={'p': s}, AttributeUpdates={'hi': {'Value': 'there', 'Action': 'PUT'}})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world', 'hi': 'there'}
-
-# Tests for item operations in tables with non-string hash or sort keys.
-# These tests focus only on the type of the key - everything else is as
-# simple as we can (string attributes, no special options for GetItem
-# and PutItem). These tests also focus on individual items only, and
-# not about the sort order of sort keys - this should be verified in
-# test_query.py, for example.
-def test_bytes_hash_key(test_table_b):
-    # Bytes values are passed using base64 encoding, which has weird cases
-    # depending on len%3 and len%4. So let's try various lengths.
-    for len in range(10,18):
-        p = random_bytes(len)
-        val = random_string()
-        test_table_b.put_item(Item={'p': p, 'attribute': val})
-        assert test_table_b.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'attribute': val}
-def test_bytes_sort_key(test_table_sb):
-    p = random_string()
-    c = random_bytes()
-    val = random_string()
-    test_table_sb.put_item(Item={'p': p, 'c': c, 'attribute': val})
-    assert test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': val}
-
-# Tests for using a large binary blob as hash key, sort key, or attribute.
-# DynamoDB strictly limits the size of the binary hash key to 2048 bytes,
-# and binary sort key to 1024 bytes, and refuses anything larger. The total
-# size of an item is limited to 400KB, which also limits the size of the
-# largest attributes. For more details on these limits, see
-# https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html
-# Alternator currently does *not* have these limitations, and can accept much
-# larger keys and attributes, but what we do in the following tests is to verify
-# that items up to DynamoDB's maximum sizes also work well in Alternator.
-def test_large_blob_hash_key(test_table_b):
-    b = random_bytes(2048)
-    test_table_b.put_item(Item={'p': b})
-    assert test_table_b.get_item(Key={'p': b}, ConsistentRead=True)['Item'] == {'p': b}
-def test_large_blob_sort_key(test_table_sb):
-    s = random_string()
-    b = random_bytes(1024)
-    test_table_sb.put_item(Item={'p': s, 'c': b})
-    assert test_table_sb.get_item(Key={'p': s, 'c': b}, ConsistentRead=True)['Item'] == {'p': s, 'c': b}
-def test_large_blob_attribute(test_table):
-    p = random_string()
-    c = random_string()
-    b = random_bytes(409500)  # a bit less than 400KB
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': b })
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': b}
-
-# Checks what it is not allowed to use in a single UpdateItem request both
-# old-style AttributeUpdates and new-style UpdateExpression.
-def test_update_item_two_update_methods(test_table_s):
-    p = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            AttributeUpdates={'a': {'Value': 3, 'Action': 'PUT'}},
-            UpdateExpression='SET b = :val1',
-            ExpressionAttributeValues={':val1': 4})
-
-# Verify that having neither AttributeUpdates nor UpdateExpression is
-# allowed, and results in creation of an empty item.
-def test_update_item_no_update_method(test_table_s):
-    p = random_string()
-    assert not "Item" in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.update_item(Key={'p': p})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p}
-
-# Test GetItem with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_getitem_attributes_to_get(dynamodb, test_table):
-    p = random_string()
-    c = random_string()
-    item = {'p': p, 'c': c, 'a': 'hello', 'b': 'hi'}
-    test_table.put_item(Item=item)
-    for wanted in [ ['a'],             # only non-key attribute
-                    ['c', 'a'],        # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # Our item doesn't have this
-                   ]:
-        got_item = test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=wanted, ConsistentRead=True)['Item']
-        expected_item = {k: item[k] for k in wanted if k in item}
-        assert expected_item == got_item
-
-# Basic test for DeleteItem, with hash key only
-def test_delete_item_hash(test_table_s):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p})
-    assert 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.delete_item(Key={'p': p})
-    assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-
-# Basic test for DeleteItem, with hash and sort key
-def test_delete_item_sort(test_table):
-    p = random_string()
-    c = random_string()
-    key = {'p': p, 'c': c}
-    test_table.put_item(Item=key)
-    assert 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-    test_table.delete_item(Key=key)
-    assert not 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-
-# Test that PutItem completely replaces an existing item. It shouldn't merge
-# it with a previously existing value, as UpdateItem does!
-# We test for a table with just hash key, and for a table with both hash and
-# sort keys.
-def test_put_item_replace(test_table_s, test_table):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 'hi'}
-    test_table_s.put_item(Item={'p': p, 'b': 'hello'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hello'}
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'a': 'hi'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'a': 'hi'}
-    test_table.put_item(Item={'p': p, 'c': c, 'b': 'hello'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'b': 'hello'}
--- a/alternator-test/test_query.py
+++ b/alternator-test/test_query.py
@@ -1,516 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the Query operation
-
-import random
-import pytest
-from botocore.exceptions import ClientError, ParamValidationError
-from decimal import Decimal
-from util import random_string, random_bytes, full_query, multiset
-from boto3.dynamodb.conditions import Key, Attr
-
-# Test that scanning works fine with in-stock paginator
-def test_query_basic_restrictions(dynamodb, filled_test_table):
-    test_table, items = filled_test_table
-    paginator = dynamodb.meta.client.get_paginator('query')
-
-    # EQ
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long']) == multiset(got_items)
-
-    # LT
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['12'], 'ComparisonOperator': 'LT'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] < '12']) == multiset(got_items)
-
-    # LE
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['14'], 'ComparisonOperator': 'LE'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] <= '14']) == multiset(got_items)
-
-    # GT
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['15'], 'ComparisonOperator': 'GT'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] > '15']) == multiset(got_items)
-
-    # GE
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['14'], 'ComparisonOperator': 'GE'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] >= '14']) == multiset(got_items)
-
-    # BETWEEN
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['155', '164'], 'ComparisonOperator': 'BETWEEN'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] >= '155' and item['c'] <= '164']) == multiset(got_items)
-
-    # BEGINS_WITH
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['11'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        print([item for item in items if item['p'] == 'long' and item['c'].startswith('11')])
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'].startswith('11')]) == multiset(got_items)
-
-# Test that KeyConditionExpression parameter is supported
-@pytest.mark.xfail(reason="KeyConditionExpression not supported yet")
-def test_query_key_condition_expression(dynamodb, filled_test_table):
-    test_table, items = filled_test_table
-    paginator = dynamodb.meta.client.get_paginator('query')
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditionExpression=Key("p").eq("long") & Key("c").lt("12")):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] < '12']) == multiset(got_items)
-
-def test_begins_with(dynamodb, test_table):
-    paginator = dynamodb.meta.client.get_paginator('query')
-    items = [{'p': 'unorthodox_chars', 'c': sort_key, 'str': 'a'} for sort_key in [u'ÿÿÿ', u'cÿbÿ', u'cÿbÿÿabg'] ]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-
-    # TODO(sarna): Once bytes type is supported, /xFF character should be tested
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': [u'ÿÿ'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert sorted([d['c'] for d in got_items]) == sorted([d['c'] for d in items if d['c'].startswith(u'ÿÿ')])
-
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': [u'cÿbÿ'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert sorted([d['c'] for d in got_items]) == sorted([d['c'] for d in items if d['c'].startswith(u'cÿbÿ')])
-
-def test_begins_with_wrong_type(dynamodb, test_table_sn):
-    paginator = dynamodb.meta.client.get_paginator('query')
-    with pytest.raises(ClientError, match='ValidationException'):
-        for page in paginator.paginate(TableName=test_table_sn.name, KeyConditions={
-                'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-                'c' : {'AttributeValueList': [17], 'ComparisonOperator': 'BEGINS_WITH'}
-                }):
-            pass
-
-# Items returned by Query should be sorted by the sort key. The following
-# tests verify that this is indeed the case, for the three allowed key types:
-# strings, binary, and numbers. These tests test not just the Query operation,
-# but inherently that the sort-key sorting works.
-def test_query_sort_order_string(test_table):
-    # Insert a lot of random items in one new partition:
-    # str(i) has a non-obvious sort order (e.g., "100" comes before "2") so is a nice test.
-    p = random_string()
-    items = [{'p': p, 'c': str(i)} for i in range(128)]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    assert len(items) == len(got_items)
-    # Extract just the sort key ("c") from the items
-    sort_keys = [x['c'] for x in items]
-    got_sort_keys = [x['c'] for x in got_items]
-    # Verify that got_sort_keys are already sorted (in string order)
-    assert sorted(got_sort_keys) == got_sort_keys
-    # Verify that got_sort_keys are a sorted version of the expected sort_keys
-    assert sorted(sort_keys) == got_sort_keys
-def test_query_sort_order_bytes(test_table_sb):
-    # Insert a lot of random items in one new partition:
-    # We arbitrarily use random_bytes with a random length.
-    p = random_string()
-    items = [{'p': p, 'c': random_bytes(10)} for i in range(128)]
-    with test_table_sb.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    got_items = full_query(test_table_sb, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    assert len(items) == len(got_items)
-    sort_keys = [x['c'] for x in items]
-    got_sort_keys = [x['c'] for x in got_items]
-    # Boto3's "Binary" objects are sorted as if bytes are signed integers.
-    # This isn't the order that DynamoDB itself uses (byte 0 should be first,
-    # not byte -128). Sorting the byte array ".value" works.
-    assert sorted(got_sort_keys, key=lambda x: x.value) == got_sort_keys
-    assert sorted(sort_keys) == got_sort_keys
-def test_query_sort_order_number(test_table_sn):
-    # This is a list of numbers, sorted in correct order, and each suitable
-    # for accurate representation by Alternator's number type.
-    numbers = [
-        Decimal("-2e10"),
-        Decimal("-7.1e2"),
-        Decimal("-4.1"),
-        Decimal("-0.1"),
-        Decimal("-1e-5"),
-        Decimal("0"),
-        Decimal("2e-5"),
-        Decimal("0.15"),
-        Decimal("1"),
-        Decimal("1.00000000000000000000000001"),
-        Decimal("3.14159"),
-        Decimal("3.1415926535897932384626433832795028841"),
-        Decimal("31.4"),
-        Decimal("1.4e10"),
-    ]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Finally, verify that we get back exactly the same numbers (with identical
-    # precision), and in their original sorted order.
-    got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-
-def test_query_filtering_attributes_equality(filled_test_table):
-    test_table, items = filled_test_table
-
-    query_filter = {
-        "attribute" : {
-            "AttributeValueList" : [ "xxxx" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx']) == multiset(got_items)
-
-    query_filter = {
-        "attribute" : {
-            "AttributeValueList" : [ "xxxx" ],
-            "ComparisonOperator": "EQ"
-        },
-        "another" : {
-            "AttributeValueList" : [ "yy" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx' and item['another'] == 'yy']) == multiset(got_items)
-
-# Test that FilterExpression works as expected
-@pytest.mark.xfail(reason="FilterExpression not supported yet")
-def test_query_filter_expression(filled_test_table):
-    test_table, items = filled_test_table
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, FilterExpression=Attr("attribute").eq("xxxx"))
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx']) == multiset(got_items)
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, FilterExpression=Attr("attribute").eq("xxxx") & Attr("another").eq("yy"))
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx' and item['another'] == 'yy']) == multiset(got_items)
-
-# QueryFilter can only contain non-key attributes in order to be compatible
-def test_query_filtering_key_equality(filled_test_table):
-    test_table, items = filled_test_table
-
-    with pytest.raises(ClientError, match='ValidationException'):
-        query_filter = {
-            "c" : {
-                "AttributeValueList" : [ "5" ],
-                "ComparisonOperator": "EQ"
-            }
-        }
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-        print(got_items)
-
-    with pytest.raises(ClientError, match='ValidationException'):
-        query_filter = {
-            "attribute" : {
-                "AttributeValueList" : [ "x" ],
-                "ComparisonOperator": "EQ"
-            },
-            "p" : {
-                "AttributeValueList" : [ "5" ],
-                "ComparisonOperator": "EQ"
-            }
-        }
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-        print(got_items)
-
-# Test Query with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_query_attributes_to_get(dynamodb, test_table):
-    p = random_string()
-    items = [{'p': p, 'c': str(i), 'a': str(i*10), 'b': str(i*100) } for i in range(10)]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    for wanted in [ ['a'],             # only non-key attributes
-                    ['c', 'a'],        # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # none of the items have this attribute!
-                   ]:
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, AttributesToGet=wanted)
-        expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
-        assert multiset(expected_items) == multiset(got_items)
-
-# Test that in a table with both hash key and sort key, which keys we can
-# Query by: We can Query by the hash key, by a combination of both hash and
-# sort keys, but *cannot* query by just the sort key, and obviously not
-# by any non-key column.
-def test_query_which_key(test_table):
-    p = random_string()
-    c = random_string()
-    p2 = random_string()
-    c2 = random_string()
-    item1 = {'p': p, 'c': c}
-    item2 = {'p': p, 'c': c2}
-    item3 = {'p': p2, 'c': c}
-    for i in [item1, item2, item3]:
-        test_table.put_item(Item=i)
-    # Query by hash key only:
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    expected_items = [item1, item2]
-    assert multiset(expected_items) == multiset(got_items)
-    # Query by hash key *and* sort key (this is basically a GetItem):
-    got_items = full_query(test_table, KeyConditions={
-        'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-        'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-    })
-    expected_items = [item1]
-    assert multiset(expected_items) == multiset(got_items)
-    # Query by sort key alone is not allowed. DynamoDB reports:
-    # "Query condition missed key schema element: p".
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # Query by a non-key isn't allowed, for the same reason - that the
-    # actual hash key (p) is missing in the query:
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # If we try both p and a non-key we get a complaint that the sort
-    # key is missing: "Query condition missed key schema element: c"
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # If we try p, c and another key, we get an error that
-    # "Conditions can be of length 1 or 2 only".
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-            'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'},
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-
-# Test the "Select" parameter of Query. The default Select mode,
-# ALL_ATTRIBUTES, returns items with all their attributes. Other modes
-# allow returning just specific attributes or just counting the results
-# without returning items at all.
-@pytest.mark.xfail(reason="Select not supported yet")
-def test_query_select(test_table_sn):
-    numbers = [Decimal(i) for i in range(10)]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num, 'x': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Verify that we get back the numbers in their sorted order. By default,
-    # query returns all attributes:
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})['Items']
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-    got_x_attributes = [x['x'] for x in got_items]
-    assert got_x_attributes == numbers
-    # Select=ALL_ATTRIBUTES does exactly the same as the default - return
-    # all attributes:
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='ALL_ATTRIBUTES')['Items']
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-    got_x_attributes = [x['x'] for x in got_items]
-    assert got_x_attributes == numbers
-    # Select=ALL_PROJECTED_ATTRIBUTES is not allowed on a base table (it
-    # is just for indexes, when IndexName is specified)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='ALL_PROJECTED_ATTRIBUTES')
-    # Select=SPECIFIC_ATTRIBUTES requires that either a AttributesToGet
-    # or ProjectionExpression appears, but then really does nothing:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES')
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES', AttributesToGet=['x'])['Items']
-    expected_items = [{'x': i} for i in numbers]
-    assert got_items == expected_items
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='SPECIFIC_ATTRIBUTES', ProjectionExpression='x')['Items']
-    assert got_items == expected_items
-    # Select=COUNT just returns a count - not any items
-    got = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='COUNT')
-    assert got['Count'] == len(numbers)
-    assert not 'Items' in got
-    # Check again that we also get a count - not just with Select=COUNT,
-    # but without Select=COUNT we also get the items:
-    got = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    assert got['Count'] == len(numbers)
-    assert 'Items' in got
-    # Select with some unknown string generates a validation exception:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Select='UNKNOWN')
-
-# Test that the "Limit" parameter can be used to return only some of the
-# items in a single partition. The items returned are the first in the
-# sorted order.
-def test_query_limit(test_table_sn):
-    numbers = [Decimal(i) for i in range(10)]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Verify that we get back the numbers in their sorted order.
-    # First, no Limit so we should get all numbers (we have few of them, so
-    # it all fits in the default 1MB limitation)
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})['Items']
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-    # Now try a few different Limit values, and verify that the query
-    # returns exactly the first Limit sorted numbers.
-    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
-        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit)['Items']
-        assert len(got_items) == min(limit, len(numbers))
-        got_sort_keys = [x['c'] for x in got_items]
-        assert got_sort_keys == numbers[0:limit]
-    # Unfortunately, the boto3 library forbids a Limit of 0 on its own,
-    # before even sending a request, so we can't test how the server responds.
-    with pytest.raises(ParamValidationError):
-        test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=0)
-
-# In test_query_limit we tested just that Limit allows to stop the result
-# after right right number of items. Here we test that such a stopped result
-# can be resumed, via the LastEvaluatedKey/ExclusiveStartKey paging mechanism.
-def test_query_limit_paging(test_table_sn):
-    numbers = [Decimal(i) for i in range(20)]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Verify that full_query() returns all these numbers, in sorted order.
-    # full_query() will do a query with the given limit, and resume it again
-    # and again until the last page.
-    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
-        got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit)
-        got_sort_keys = [x['c'] for x in got_items]
-        assert got_sort_keys == numbers
-
-# Test that the ScanIndexForward parameter works, and can be used to
-# return items sorted in reverse order. Combining this with Limit can
-# be used to return the last items instead of the first items of the
-# partition.
-@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
-def test_query_reverse(test_table_sn):
-    numbers = [Decimal(i) for i in range(20)]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Verify that we get back the numbers in their sorted order or reverse
-    # order, depending on the ScanIndexForward parameter being True or False.
-    # First, no Limit so we should get all numbers (we have few of them, so
-    # it all fits in the default 1MB limitation)
-    reversed_numbers = list(reversed(numbers))
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=True)['Items']
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-    got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=False)['Items']
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == reversed_numbers
-    # Now try a few different Limit values, and verify that the query
-    # returns exactly the first Limit sorted numbers - in regular or
-    # reverse order, depending on ScanIndexForward.
-    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
-        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit, ScanIndexForward=True)['Items']
-        assert len(got_items) == min(limit, len(numbers))
-        got_sort_keys = [x['c'] for x in got_items]
-        assert got_sort_keys == numbers[0:limit]
-        got_items = test_table_sn.query(KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, Limit=limit, ScanIndexForward=False)['Items']
-        assert len(got_items) == min(limit, len(numbers))
-        got_sort_keys = [x['c'] for x in got_items]
-        assert got_sort_keys == reversed_numbers[0:limit]
-
-# Test that paging also works properly with reverse order
-# (ScanIndexForward=false), i.e., reverse-order queries can be resumed
-@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
-def test_query_reverse_paging(test_table_sn):
-    numbers = [Decimal(i) for i in range(20)]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    reversed_numbers = list(reversed(numbers))
-    # Verify that with ScanIndexForward=False, full_query() returns all
-    # these numbers in reversed sorted order - getting pages of Limit items
-    # at a time and resuming the query.
-    for limit in [1, 2, 3, 7, 10, 17, 100, 10000]:
-        got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=False, Limit=limit)
-        got_sort_keys = [x['c'] for x in got_items]
-        assert got_sort_keys == reversed_numbers
--- a/alternator-test/test_returnvalues.py
+++ b/alternator-test/test_returnvalues.py
@@ -1,226 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the ReturnValues parameter for the different update operations
-# (PutItem, UpdateItem, DeleteItem).
-
-import pytest
-from botocore.exceptions import ClientError
-from util import random_string
-
-# Test trivial support for the ReturnValues parameter in PutItem, UpdateItem
-# and DeleteItem - test that "NONE" works (and changes nothing), while a
-# completely unsupported value gives an error.
-# This test is useful to check that before the ReturnValues parameter is fully
-# implemented, it returns an error when a still-unsupported ReturnValues
-# option is attempted in the request - instead of simply being ignored.
-def test_trivial_returnvalues(test_table_s):
-    # PutItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
-    # UpdateItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
-    # DeleteItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
-
-# Test the ReturnValues parameter on a PutItem operation. Only two settings
-# are supported for this parameter for this operation: NONE (the default)
-# and ALL_OLD.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_put_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'})
-    assert not 'Attributes' in ret
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    # With ReturnValues=ALL_OLD, the old value of the item is returned
-    # in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_OLD')
-    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
-    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
-    # are supported by other operations but not by PutItem:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_OLD')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_NEW')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_NEW')
-    # Also, obviously, a non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='none')
-
-# Test the ReturnValues parameter on a DeleteItem operation. Only two settings
-# are supported for this parameter for this operation: NONE (the default)
-# and ALL_OLD.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_delete_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p})
-    assert not 'Attributes' in ret
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    # With ReturnValues=ALL_OLD, the old value of the item is returned
-    # in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_OLD')
-    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
-    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
-    # are supported by other operations but not by PutItem:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_OLD')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_NEW')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_NEW')
-    # Also, obviously, a non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='none')
-
-# Test the ReturnValues parameter on a UpdateItem operation. All five
-# settings are supported for this parameter for this operation: NONE
-# (the default), ALL_OLD, UPDATED_OLD, ALL_NEW and UPDATED_NEW.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_update_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-
-    # With ReturnValues=ALL_OLD, the entire old value of the item (even
-    # attributes we did not modify) is returned in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_OLD',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'dog'}
-
-    # With ReturnValues=UPDATED_OLD, only the overwritten attributes of the
-    # old item are returned in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='SET b = :val, c = :val2',
-        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
-    assert ret['Attributes'] == {'b': 'dog'}
-    # Even if an update overwrites an attribute by the same value again,
-    # this is considered an update, and the old value (identical to the
-    # new one) is returned:
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'b': 'cat'}
-    # Deleting an attribute also counts as overwriting it, of course:
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='REMOVE b')
-    assert ret['Attributes'] == {'b': 'cat'}
-
-    # With ReturnValues=ALL_NEW, the entire new value of the item (including
-    # old attributes we did not modify) is returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'cat'}
-
-    # With ReturnValues=UPDATED_NEW, only the new value of the updated
-    # attributes are returned. Note that "updated attributes" means
-    # the newly set attributes - it doesn't require that these attributes
-    # have any previous values
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='SET b = :val, c = :val2',
-        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
-    assert ret['Attributes'] == {'b': 'cat', 'c': 'hello'}
-    # Deleting an attribute also counts as overwriting it, but the delete
-    # column is not returned in the response - so it's empty in this case.
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='REMOVE b')
-    assert not 'Attributes' in ret
-    # In the above examples, UPDATED_NEW is not useful because it just
-    # returns the new values we already know from the request... UPDATED_NEW
-    # becomes more useful in read-modify-write operations:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 1})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='SET a = a + :val',
-        ExpressionAttributeValues={':val': 1})
-    assert ret['Attributes'] == {'a': 2}
-
-    # A non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='none',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
--- a/alternator-test/util.py
+++ b/alternator-test/util.py
@@ -1,141 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Various utility functions which are useful for multiple tests
-
-import string
-import random
-import collections
-import time
-
-def random_string(length=10, chars=string.ascii_uppercase + string.digits):
-    return ''.join(random.choice(chars) for x in range(length))
-
-def random_bytes(length=10):
-    return bytearray(random.getrandbits(8) for _ in range(length))
-
-# Utility functions for scan and query into an array of items:
-# TODO: add to full_scan and full_query by default ConsistentRead=True, as
-# it's not useful for tests without it!
-def full_scan(table, **kwargs):
-    response = table.scan(**kwargs)
-    items = response['Items']
-    while 'LastEvaluatedKey' in response:
-        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
-        items.extend(response['Items'])
-    return items
-
-# full_scan_and_count returns both items and count as returned by the server.
-# Note that count isn't simply len(items) - the server returns them
-# independently. e.g., with Select='COUNT' the items are not returned, but
-# count is.
-def full_scan_and_count(table, **kwargs):
-    response = table.scan(**kwargs)
-    items = []
-    count = 0
-    if 'Items' in response:
-        items.extend(response['Items'])
-    if 'Count' in response:
-        count = count + response['Count']
-    while 'LastEvaluatedKey' in response:
-        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
-        if 'Items' in response:
-            items.extend(response['Items'])
-        if 'Count' in response:
-            count = count + response['Count']
-    return (count, items)
-
-# Utility function for fetching the entire results of a query into an array of items
-def full_query(table, **kwargs):
-    response = table.query(**kwargs)
-    items = response['Items']
-    while 'LastEvaluatedKey' in response:
-        response = table.query(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
-        items.extend(response['Items'])
-    return items
-
-# To compare two lists of items (each is a dict) without regard for order,
-# "==" is not good enough because it will fail if the order is different.
-# The following function, multiset() converts the list into a multiset
-# (set with duplicates) where order doesn't matter, so the multisets can
-# be compared.
-
-def freeze(item):
-    if isinstance(item, dict):
-        return frozenset((key, freeze(value)) for key, value in item.items())
-    elif isinstance(item, list):
-        return tuple(freeze(value) for value in item)
-    return item
-
-def multiset(items):
-    return collections.Counter([freeze(item) for item in items])
-
-
-test_table_prefix = 'alternator_test_'
-def test_table_name():
-    current_ms = int(round(time.time() * 1000))
-    # In the off chance that test_table_name() is called twice in the same millisecond...
-    if test_table_name.last_ms >= current_ms:
-        current_ms = test_table_name.last_ms + 1
-    test_table_name.last_ms = current_ms
-    return test_table_prefix + str(current_ms)
-test_table_name.last_ms = 0
-
-def create_test_table(dynamodb, **kwargs):
-    name = test_table_name()
-    print("fixture creating new table {}".format(name))
-    table = dynamodb.create_table(TableName=name,
-        BillingMode='PAY_PER_REQUEST', **kwargs)
-    waiter = table.meta.client.get_waiter('table_exists')
-    # recheck every second instead of the default, lower, frequency. This can
-    # save a few seconds on AWS with its very slow table creation, but can
-    # more on tests on Scylla with its faster table creation turnaround.
-    waiter.config.delay = 1
-    waiter.config.max_attempts = 200
-    waiter.wait(TableName=name)
-    return table
-
-# DynamoDB's ListTables request returns up to a single page of table names
-# (e.g., up to 100) and it is up to the caller to call it again and again
-# to get the next page. This is a utility function which calls it repeatedly
-# as much as necessary to get the entire list.
-# We deliberately return a list and not a set, because we want the caller
-# to be able to recognize bugs in ListTables which causes the same table
-# to be returned twice.
-def list_tables(dynamodb, limit=100):
-    ret = []
-    pos = None
-    while True:
-        if pos:
-            page = dynamodb.meta.client.list_tables(Limit=limit, ExclusiveStartTableName=pos);
-        else:
-            page = dynamodb.meta.client.list_tables(Limit=limit);
-        results = page.get('TableNames', None)
-        assert(results)
-        ret = ret + results
-        newpos = page.get('LastEvaluatedTableName', None)
-        if not newpos:
-            break;
-        # It doesn't make sense for Dynamo to tell us we need more pages, but
-        # not send anything in *this* page!
-        assert len(results) > 0
-        assert newpos != pos
-        # Note that we only checked that we got back tables, not that we got
-        # any new tables not already in ret. So a buggy implementation might
-        # still cause an endless loop getting the same tables again and again.
-        pos = newpos
-    return ret
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
    time_t time_point_repr = db_clock::to_time_t(tp);
    std::string time_point_str;
    time_point_str.resize(17);
+    ::tm time_buf;
    // strftime prints the terminating null character as well
-    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
+    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
    time_point_str.resize(16);
    return time_point_str;
 }
@@ -128,8 +129,8 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
-    auto timeout = auth::internal_distributed_timeout_config();
-    return qp.process(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
+    auto& timeout = auth::internal_distributed_timeout_config();
+    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
--- a/alternator/base64.cc
+++ b/alternator/base64.cc
@@ -77,7 +77,7 @@ std::string base64_encode(bytes_view in) {
    return ret;
 }

-bytes base64_decode(std::string_view in) {
+static std::string base64_decode_string(std::string_view in) {
    int i = 0;
    int8_t chunk4[4]; // chunk of input, each byte converted to 0..63;
    std::string ret;
@@ -104,8 +104,42 @@ bytes base64_decode(std::string_view in) {
        if (i==3)
            ret += ((chunk4[1] & 0xf) << 4) + ((chunk4[2] & 0x3c) >> 2);
    }
+    return ret;
+}
+
+bytes base64_decode(std::string_view in) {
    // FIXME: This copy is sad. The problem is we need back "bytes"
    // but "bytes" doesn't have efficient append and std::string.
    // To fix this we need to use bytes' "uninitialized" feature.
+    std::string ret = base64_decode_string(in);
    return bytes(ret.begin(), ret.end());
 }
+
+static size_t base64_padding_len(std::string_view str) {
+    size_t padding = 0;
+    padding += (!str.empty() && str.back() == '=');
+    padding += (str.size() > 1 && *(str.end() - 2) == '=');
+    return padding;
+}
+
+size_t base64_decoded_len(std::string_view str) {
+    return str.size() / 4 * 3 - base64_padding_len(str);
+}
+
+bool base64_begins_with(std::string_view base, std::string_view operand) {
+    if (base.size() < operand.size() || base.size() % 4 != 0 || operand.size() % 4 != 0) {
+        return false;
+    }
+    if (base64_padding_len(operand) == 0) {
+        return base.starts_with(operand);
+    }
+    const std::string_view unpadded_base_prefix = base.substr(0, operand.size() - 4);
+    const std::string_view unpadded_operand = operand.substr(0, operand.size() - 4);
+    if (unpadded_base_prefix != unpadded_operand) {
+        return false;
+    }
+    // Decode the operand's final 4-character group and the part of base starting at the same offset, then compare the decoded bytes
+    const std::string base_remainder = base64_decode_string(base.substr(operand.size() - 4, operand.size()));
+    const std::string operand_remainder = base64_decode_string(operand.substr(operand.size() - 4));
+    return base_remainder.starts_with(operand_remainder);
+}
--- a/alternator/base64.hh
+++ b/alternator/base64.hh
@@ -32,3 +32,7 @@ bytes base64_decode(std::string_view);
 inline bytes base64_decode(const rjson::value& v) {
  return base64_decode(std::string_view(v.GetString(), v.GetStringLength()));
 }
+
+size_t base64_decoded_len(std::string_view str);
+
+bool base64_begins_with(std::string_view base, std::string_view operand);
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -30,6 +30,11 @@
 #include "serialization.hh"
 #include "base64.hh"
 #include <stdexcept>
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include "utils/overloaded_functor.hh"
+
+#include "expressions.hh"

 namespace alternator {

@@ -62,49 +67,6 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
    return it->second;
 }

-static ::shared_ptr<cql3::restrictions::single_column_restriction::contains> make_map_element_restriction(const column_definition& cdef, std::string_view key, const rjson::value& value) {
-    bytes raw_key = utf8_type->from_string(sstring_view(key.data(), key.size()));
-    auto key_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_key)));
-    bytes raw_value = serialize_item(value);
-    auto entry_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_value)));
-    return make_shared<cql3::restrictions::single_column_restriction::contains>(cdef, std::move(key_value), std::move(entry_value));
-}
-
-static ::shared_ptr<cql3::restrictions::single_column_restriction::EQ> make_key_eq_restriction(const column_definition& cdef, const rjson::value& value) {
-    bytes raw_value = get_key_from_typed_value(value, cdef, type_to_string(cdef.type));
-    auto restriction_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_value)));
-    return make_shared<cql3::restrictions::single_column_restriction::EQ>(cdef, std::move(restriction_value));
-}
-
-::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter) {
-    clogger.trace("Getting filtering restrictions for: {}", rjson::print(query_filter));
-    auto filtering_restrictions = ::make_shared<cql3::restrictions::statement_restrictions>(schema, true);
-    for (auto it = query_filter.MemberBegin(); it != query_filter.MemberEnd(); ++it) {
-        std::string_view column_name(it->name.GetString(), it->name.GetStringLength());
-        const rjson::value& condition = it->value;
-
-        const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
-        const rjson::value& attr_list = rjson::get(condition, "AttributeValueList");
-        comparison_operator_type op = get_comparison_operator(comp_definition);
-
-        if (op != comparison_operator_type::EQ) {
-            throw api_error("ValidationException", "Filtering is currently implemented for EQ operator only");
-        }
-        if (attr_list.Size() != 1) {
-            throw api_error("ValidationException", format("EQ restriction needs exactly 1 attribute value: {}", rjson::print(attr_list)));
-        }
-        if (const column_definition* cdef = schema->get_column_definition(to_bytes(column_name.data()))) {
-            // Primary key restriction
-            filtering_restrictions->add_restriction(make_key_eq_restriction(*cdef, attr_list[0]), false, true);
-        } else {
-            // Regular column restriction
-            filtering_restrictions->add_restriction(make_map_element_restriction(attrs_col, column_name, attr_list[0]), false, true);
-        }
-
-    }
-    return filtering_restrictions;
-}
-
 namespace {

 struct size_check {
@@ -136,6 +98,11 @@ struct nonempty : public size_check {

 // Check that array has the expected number of elements
 static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
    if (!array || !array->IsArray()) {
        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
    }
@@ -192,61 +159,63 @@ static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error("ValidationException", format("BEGINS_WITH operator requires String or Binary in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error("ValidationException", format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error("ValidationException", format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
    if (it2->name == "S") {
-        std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
-        std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
-        return val1.substr(0, val2.size()) == val2;
+        return rjson::to_string_view(it1->value).starts_with(rjson::to_string_view(it2->value));
    } else /* it2->name == "B" */ {
-        // TODO (optimization): Check the begins_with condition directly on
-        // the base64-encoded string, without making a decoded copy.
-        bytes val1 = base64_decode(it1->value);
-        bytes val2 = base64_decode(it2->value);
-        return val1.substr(0, val2.size()) == val2;
+        return base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
    }
 }

-static std::string_view to_string_view(const rjson::value& v) {
-    return std::string_view(v.GetString(), v.GetStringLength());
-}
-
 static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
    return (type2 == "S" && type1 == "SS") || (type2 == "N" && type1 == "NS") || (type2 == "B" && type1 == "BS");
 }

 // Check if two JSON-encoded values match with the CONTAINS relation
-static bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
    if (!v1) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" &&  kv2.name != "B") {
-        throw api_error("ValidationException",
-                        format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
-                               "got {} instead", kv2.name));
-    }
    if (kv1.name == "S" && kv2.name == "S") {
-        return to_string_view(kv1.value).find(to_string_view(kv2.value)) != std::string_view::npos;
+        return rjson::to_string_view(kv1.value).find(rjson::to_string_view(kv2.value)) != std::string_view::npos;
    } else if (kv1.name == "B" && kv2.name == "B") {
        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
    } else if (is_set_of(kv1.name, kv2.name)) {
@@ -306,6 +275,19 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    return have_match;
 }

+// Another variant of check_IN, this one for ConditionExpression. It needs to
+// check whether the first element in the given vector is equal to any of the
+// others.
+static bool check_IN(const std::vector<rjson::value>& array) {
+    const rjson::value* first = &array[0];
+    for (unsigned i = 1; i < array.size(); i++) {
+        if (check_EQ(first, array[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static bool check_NULL(const rjson::value* val) {
    return val == nullptr;
 }
@@ -314,24 +296,38 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -345,84 +341,103 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a specialization for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
    static constexpr const char* diagnostic = "LT operator";
 };

 struct cmp_le {
-    // bytes only has <, so we cannot use <=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
    static constexpr const char* diagnostic = "LE operator";
 };

 struct cmp_ge {
-    // bytes only has <, so we cannot use >=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <, so we cannot use >.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
-    if (ub < lb) {
-        throw api_error("ValidationException",
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
+    if (cmp_lt()(ub, lb)) {
+        if (bounds_from_query) {
+            throw api_error("ValidationException",
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error("ValidationException", "between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error(
-                "ValidationException",
+        if (bounds_from_query) {
+           throw api_error("ValidationException",
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error("ValidationException",
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error("ValidationException",
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -469,19 +484,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -493,72 +508,196 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
-            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_CONTAINS(got, (*attribute_value_list)[0]);
+            {
+                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+                // Expected's "CONTAINS" has this artificial limitation.
+                // ConditionExpression's "contains()" does not...
+                const rjson::value& arg = (*attribute_value_list)[0];
+                const auto& argtype = (*arg.MemberBegin()).name;
+                if (argtype != "S" && argtype != "N" && argtype != "B") {
+                    throw api_error("ValidationException",
+                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
+                                    "got {} instead", argtype));
+                }
+                return check_CONTAINS(got, arg);
+            }
        case comparison_operator_type::NOT_CONTAINS:
-            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_NOT_CONTAINS(got, (*attribute_value_list)[0]);
+            {
+                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+                // Expected's "NOT_CONTAINS" has this artificial limitation.
+                // ConditionExpression's "contains()" does not...
+                const rjson::value& arg = (*attribute_value_list)[0];
+                const auto& argtype = (*arg.MemberBegin()).name;
+                if (argtype != "S" && argtype != "N" && argtype != "B") {
+                    throw api_error("ValidationException",
+                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
+                                    "got {} instead", argtype));
+                }
+                return check_NOT_CONTAINS(got, arg);
+            }
        }
        throw std::logic_error(format("Internal error: corrupted operator enum: {}", int(op)));
    }
 }

-// Verify that the existing values of the item (previous_item) match the
+conditional_operator_type get_conditional_operator(const rjson::value& req) {
+    const rjson::value* conditional_operator = rjson::find(req, "ConditionalOperator");
+    if (!conditional_operator) {
+        return conditional_operator_type::MISSING;
+    }
+    if (!conditional_operator->IsString()) {
+        throw api_error("ValidationException", "'ConditionalOperator' parameter, if given, must be a string");
+    }
+    auto s = rjson::to_string_view(*conditional_operator);
+    if (s == "AND") {
+        return conditional_operator_type::AND;
+    } else if (s == "OR") {
+        return conditional_operator_type::OR;
+    } else {
+        throw api_error("ValidationException",
+                format("'ConditionalOperator' parameter must be AND, OR or missing. Found {}.", s));
+    }
+}
+
+// Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function will throw a ConditionalCheckFailedException API error
-// if the values do not match the condition, or ValidationException if there
+// This function can throw a ValidationException API error if there
 // are errors in the format of the condition itself.
-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
+bool verify_expected(const rjson::value& req, const rjson::value* previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
+    auto conditional_operator = get_conditional_operator(req);
+    if (conditional_operator != conditional_operator_type::MISSING &&
+        (!expected || (expected->IsObject() && expected->GetObject().ObjectEmpty()))) {
+            throw api_error("ValidationException", "'ConditionalOperator' parameter cannot be specified for missing or empty Expression");
+    }
    if (!expected) {
-        return;
+        return true;
    }
    if (!expected->IsObject()) {
        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
    }
-    // ConditionalOperator can be "AND" for requiring all conditions, or
-    // "OR" for requiring one condition, and defaults to "AND" if missing.
-    const rjson::value* conditional_operator = rjson::find(req, "ConditionalOperator");
-    bool require_all = true;
-    if (conditional_operator) {
-        if (!conditional_operator->IsString()) {
-            throw api_error("ValidationException", "'ConditionalOperator' parameter, if given, must be a string");
-        }
-        std::string_view s(conditional_operator->GetString(), conditional_operator->GetStringLength());
-        if (s == "AND") {
-            // require_all is already true
-        } else if (s == "OR") {
-            require_all = false;
-        } else {
-            throw api_error("ValidationException", "'ConditionalOperator' parameter must be AND, OR or missing");
-        }
-        if (expected->GetObject().ObjectEmpty()) {
-            throw api_error("ValidationException", "'ConditionalOperator' parameter cannot be specified for empty Expression");
-        }
-    }
+    bool require_all = conditional_operator != conditional_operator_type::OR;
+    return verify_condition(*expected, require_all, previous_item);
+}

-    for (auto it = expected->MemberBegin(); it != expected->MemberEnd(); ++it) {
+bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item) {
+    for (auto it = condition.MemberBegin(); it != condition.MemberEnd(); ++it) {
        const rjson::value* got = nullptr;
-        if (previous_item && previous_item->IsObject() && previous_item->HasMember("Item")) {
-            got = rjson::find((*previous_item)["Item"], rjson::string_ref_type(it->name.GetString()));
+        if (previous_item) {
+            got = rjson::find(*previous_item, rjson::to_string_view(it->name));
        }
        bool success = verify_expected_one(it->value, got);
        if (success && !require_all) {
            // When !require_all, one success is enough!
-            return;
+            return true;
        } else if (!success && require_all) {
            // When require_all, one failure is enough!
-            throw api_error("ConditionalCheckFailedException", "Failed condition.");
+            return false;
        }
    }
    // If we got here and require_all, none of the checks failed, so succeed.
    // If we got here and !require_all, all of the checks failed, so fail.
-    if (!require_all) {
-        throw api_error("ConditionalCheckFailedException", "None of ORed Expect conditions were successful.");
+    return require_all;
+}
+
+static bool calculate_primitive_condition(const parsed::primitive_condition& cond,
+        const rjson::value* previous_item) {
+    std::vector<rjson::value> calculated_values;
+    calculated_values.reserve(cond._values.size());
+    for (const parsed::value& v : cond._values) {
+        calculated_values.push_back(calculate_value(v,
+                cond._op == parsed::primitive_condition::type::VALUE ?
+                        calculate_value_caller::ConditionExpressionAlone :
+                        calculate_value_caller::ConditionExpression,
+                previous_item));
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::BETWEEN:
+        if (calculated_values.size() != 3) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
+        }
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
+    case parsed::primitive_condition::type::IN:
+        return check_IN(calculated_values);
+    case parsed::primitive_condition::type::VALUE:
+        if (calculated_values.size() != 1) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
+        }
+        // Unwrap the boolean wrapped as the value (if it is a boolean)
+        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
+            auto it = calculated_values[0].MemberBegin();
+            if (it->name == "BOOL" && it->value.IsBool()) {
+                return it->value.GetBool();
+            }
+        }
+        throw api_error("ValidationException",
+                format("ConditionExpression: condition results in a non-boolean value: {}",
+                        calculated_values[0]));
+    default:
+        // All the rest of the operators have exactly two parameters (and unless
+        // we have a bug in the parser, that's what we have in the parsed object):
+        if (calculated_values.size() != 2) {
+            throw std::logic_error(format("Wrong number of values {} in primitive_condition object", cond._values.size()));
+        }
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::EQ:
+        return check_EQ(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::NE:
+        return check_NE(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::GT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
+    case parsed::primitive_condition::type::GE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
+    case parsed::primitive_condition::type::LT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
+    case parsed::primitive_condition::type::LE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
+    default:
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
    }
 }

+// Check if the existing values of the item (previous_item) match the
+// conditions given by the given parsed ConditionExpression.
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        const rjson::value* previous_item) {
+    if (condition_expression.empty()) {
+        return true;
+    }
+    bool ret = std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) -> bool {
+            return calculate_primitive_condition(cond, previous_item);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) -> bool {
+            auto verify_condition = [&] (const parsed::condition_expression& e) {
+                return verify_condition_expression(e, previous_item);
+            };
+            switch (list.op) {
+            case '&':
+                return boost::algorithm::all_of(list.conditions, verify_condition);
+            case '|':
+                return boost::algorithm::any_of(list.conditions, verify_condition);
+            default:
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("bad operator in condition_list");
+            }
+        }
+    }, condition_expression._expression);
+    return condition_expression._negated ? !ret : ret;
+}
+
 }
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -33,6 +33,7 @@

 #include "cql3/restrictions/statement_restrictions.hh"
 #include "serialization.hh"
+#include "expressions_types.hh"

 namespace alternator {

@@ -42,8 +43,19 @@ enum class comparison_operator_type {

 comparison_operator_type get_comparison_operator(const rjson::value& comparison_operator);

-::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter);
+enum class conditional_operator_type {
+    AND, OR, MISSING
+};
+conditional_operator_type get_conditional_operator(const rjson::value& req);

-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);
+bool verify_expected(const rjson::value& req, const rjson::value* previous_item);
+bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);
+
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);
+
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        const rjson::value* previous_item);

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -25,47 +25,59 @@
 #include <seastar/http/httpd.hh>
 #include "seastarx.hh"
 #include <seastar/json/json_elements.hh>
+#include <seastar/core/sharded.hh>

 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"

+#include "alternator/error.hh"
 #include "stats.hh"
+#include "rjson.hh"

 namespace alternator {

-class executor {
+class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
+    // An smp_service_group to be used for limiting the concurrency when
+    // forwarding Alternator requests between shards - if necessary for LWT.
+    smp_service_group _ssg;

 public:
    using client_state = service::client_state;
+    using request_return_type = std::variant<json::json_return_type, api_error>;
    stats _stats;
    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
-    static constexpr auto KEYSPACE_NAME = "alternator";
+    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
+    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm) : _proxy(proxy), _mm(mm) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _ssg(ssg) {}

-    future<json::json_return_type> create_table(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_table(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_table(client_state& client_state, std::string content);
-    future<json::json_return_type> put_item(client_state& client_state, std::string content);
-    future<json::json_return_type> get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_item(client_state& client_state, std::string content);
-    future<json::json_return_type> update_item(client_state& client_state, std::string content);
-    future<json::json_return_type> list_tables(client_state& client_state, std::string content);
-    future<json::json_return_type> scan(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_endpoints(client_state& client_state, std::string content, std::string host_header);
-    future<json::json_return_type> batch_write_item(client_state& client_state, std::string content);
-    future<json::json_return_type> batch_get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> query(client_state& client_state, std::string content);
+    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tables(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header);
+    future<request_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);

    future<> start();
    future<> stop() { return make_ready_future<>(); }

-    future<> maybe_create_keyspace();
+    future<> create_keyspace(std::string_view keyspace_name);

-    static void maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
 };

 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -20,15 +20,24 @@
 */

 #include "expressions.hh"
+#include "serialization.hh"
+#include "base64.hh"
+#include "conditions.hh"
 #include "alternator/expressionsLexer.hpp"
 #include "alternator/expressionsParser.hpp"
+#include "utils/overloaded_functor.hh"
+#include "error.hh"

-#include <seastarx.hh>
+#include "seastarx.hh"

 #include <seastar/core/print.hh>
 #include <seastar/util/log.hh>

+#include <boost/algorithm/cxx11/any_of.hpp>
+#include <boost/algorithm/cxx11/all_of.hpp>
+
 #include <functional>
+#include <unordered_map>

 namespace alternator {

@@ -65,13 +74,19 @@ parse_projection_expression(std::string query) {
    }
 }

-template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
-template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+parsed::condition_expression
+parse_condition_expression(std::string query) {
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
+}

 namespace parsed {

 void update_expression::add(update_expression::action a) {
-    std::visit(overloaded {
+    std::visit(overloaded_functor {
        [&] (action::set&)    { seen_set = true; },
        [&] (action::remove&) { seen_remove = true; },
        [&] (action::add&)    { seen_add = true; },
@@ -94,5 +109,576 @@ void update_expression::append(update_expression other) {
    seen_del |= other.seen_del;
 }

+void condition_expression::append(condition_expression&& a, char op) { // Adds sub-expression 'a' to this expression's condition list joined by operator 'op'.
+    std::visit(overloaded_functor {
+        [&] (condition_list& x) {
+            // If 'a' has a single condition, we could, instead of inserting
+            // it, insert its single condition (possibly negated if a._negated).
+            // But considering that we don't evaluate these expressions many
+            // times, this optimization is not worth the extra code complexity.
+            if (!x.conditions.empty() && x.op != op) {
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("condition_expression::append called with mixed operators");
+            }
+            x.conditions.push_back(std::move(a));
+            x.op = op; // all conditions in one list share a single operator
+        },
+        [&] (primitive_condition& x) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error("condition_expression::append called on primitive_condition");
+        }
+    }, _expression);
+}
+
 } // namespace parsed
+
+// The following resolve_*() functions resolve references in parsed
+// expressions of different types. Resolving a parsed expression means
+// replacing:
+//  1. In parsed::path objects, replace references like "#name" with the
+//     attribute name from ExpressionAttributeNames,
+//  2. In parsed::constant objects, replace references like ":value" with
+//     the value from ExpressionAttributeValues.
+// These functions also track which name and value references were used, to
+// allow complaining if some remain unused.
+// Note that the resolve_*() functions modify the expressions in-place,
+// so if we ever intend to cache parsed expression, we need to pass a copy
+// into this function.
+//
+// Doing the "resolving" stage before the evaluation stage has two benefits.
+// First, it allows us to be compatible with DynamoDB in catching unused
+// names and values (see issue #6572). Second, in the FilterExpression case,
+// we need to resolve the expression just once but then use it many times
+// (once for each item to be filtered).
+
+static void resolve_path(parsed::path& p, // Replaces a "#name" root of 'p' with the name found in ExpressionAttributeNames.
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) {
+    const std::string& column_name = p.root();
+    if (column_name.size() > 0 && column_name.front() == '#') { // only roots starting with '#' are references
+        if (!expression_attribute_names) {
+            throw api_error("ValidationException",
+                    format("ExpressionAttributeNames missing, entry '{}' required by expression", column_name));
+        }
+        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
+        if (!value || !value->IsString()) { // a non-string entry is reported the same way as a missing one
+            throw api_error("ValidationException",
+                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
+        }
+        used_attribute_names.emplace(column_name); // remember the reference (not the resolved name) as used
+        p.set_root(std::string(rjson::to_string_view(*value)));
+    }
+}
+
+static void resolve_constant(parsed::constant& c, // Replaces a value reference held by 'c' with its entry from ExpressionAttributeValues.
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_values) {
+    std::visit(overloaded_functor {
+        [&] (const std::string& valref) { // 'c' still holds an unresolved reference name
+            if (!expression_attribute_values) {
+                throw api_error("ValidationException",
+                        format("ExpressionAttributeValues missing, entry '{}' required by expression", valref));
+            }
+            const rjson::value* value = rjson::find(*expression_attribute_values, valref);
+            if (!value) {
+                throw api_error("ValidationException",
+                        format("ExpressionAttributeValues missing entry '{}' required by expression", valref));
+            }
+            if (value->IsNull()) {
+                throw api_error("ValidationException",
+                        format("ExpressionAttributeValues null value for entry '{}' required by expression", valref));
+            }
+            validate_value(*value, "ExpressionAttributeValues");
+            used_attribute_values.emplace(valref); // record usage before 'c' is overwritten below
+            c.set(*value);
+        },
+        [&] (const parsed::constant::literal& lit) {
+            // Nothing to do, already resolved
+        }
+    }, c._value);
+
+}
+
+void resolve_value(parsed::value& rhs, // Resolves name/value references in 'rhs', whichever alternative it holds.
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values) {
+    std::visit(overloaded_functor {
+        [&] (parsed::constant& c) {
+            resolve_constant(c, expression_attribute_values, used_attribute_values);
+        },
+        [&] (parsed::value::function_call& f) { // a function call is resolved by resolving each of its parameters
+            for (parsed::value& value : f._parameters) {
+                resolve_value(value, expression_attribute_names, expression_attribute_values,
+                        used_attribute_names, used_attribute_values);
+            }
+        },
+        [&] (parsed::path& p) {
+            resolve_path(p, expression_attribute_names, used_attribute_names);
+        }
+    }, rhs._value);
+}
+
+void resolve_set_rhs(parsed::set_rhs& rhs, // Resolves references in the right-hand side of a SET action.
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values) {
+    resolve_value(rhs._v1, expression_attribute_names, expression_attribute_values,
+            used_attribute_names, used_attribute_values);
+    if (rhs._op != 'v') { // any op other than 'v' has a second operand, _v2, to resolve as well
+        resolve_value(rhs._v2, expression_attribute_names, expression_attribute_values,
+                used_attribute_names, used_attribute_values);
+    }
+}
+
+void resolve_update_expression(parsed::update_expression& ue, // Resolves references in every action of 'ue', in place.
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values) {
+    for (parsed::update_expression::action& action : ue.actions()) {
+        resolve_path(action._path, expression_attribute_names, used_attribute_names); // every action has a target path
+        std::visit(overloaded_functor {
+            [&] (parsed::update_expression::action::set& a) {
+                resolve_set_rhs(a._rhs, expression_attribute_names, expression_attribute_values,
+                        used_attribute_names, used_attribute_values);
+            },
+            [&] (parsed::update_expression::action::remove& a) {
+                // nothing to do
+            },
+            [&] (parsed::update_expression::action::add& a) {
+                resolve_constant(a._valref, expression_attribute_values, used_attribute_values);
+            },
+            [&] (parsed::update_expression::action::del& a) {
+                resolve_constant(a._valref, expression_attribute_values, used_attribute_values);
+            }
+        }, action._action);
+    }
+}
+
+static void resolve_primitive_condition(parsed::primitive_condition& pc, // Resolves references in each value of 'pc'.
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values) {
+    for (parsed::value& value : pc._values) {
+        resolve_value(value,
+                expression_attribute_names, expression_attribute_values,
+                used_attribute_names, used_attribute_values);
+    }
+}
+
+void resolve_condition_expression(parsed::condition_expression& ce, // Resolves references throughout 'ce', recursing into nested lists.
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values) {
+    std::visit(overloaded_functor {
+        [&] (parsed::primitive_condition& cond) {
+            resolve_primitive_condition(cond,
+                    expression_attribute_names, expression_attribute_values,
+                    used_attribute_names, used_attribute_values);
+        },
+        [&] (parsed::condition_expression::condition_list& list) { // each list member is itself a condition_expression
+            for (parsed::condition_expression& cond : list.conditions) {
+                resolve_condition_expression(cond,
+                        expression_attribute_names, expression_attribute_values,
+                            used_attribute_names, used_attribute_values);
+
+            }
+        }
+    }, ce._expression);
+}
+
+void resolve_projection_expression(std::vector<parsed::path>& pe, // Resolves "#name" references in every path of 'pe'.
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) {
+    for (parsed::path& p : pe) {
+        resolve_path(p, expression_attribute_names, used_attribute_names);
+    }
+}
+
+// condition_expression_on() checks whether a condition_expression places any
+// condition on the given attribute. It can be useful, for example, for
+// checking whether the condition tries to restrict a key column.
+
+static bool value_on(const parsed::value& v, std::string_view attribute) { // True if 'v' mentions a path whose root equals 'attribute'.
+    return std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) {
+            return false; // a constant never refers to an attribute
+        },
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) { // search the call's parameters recursively
+                if (value_on(value, attribute)) {
+                    return true;
+                }
+            }
+            return false;
+        },
+        [&] (const parsed::path& p) {
+            return p.root() == attribute; // only the path's root is compared
+        }
+    }, v._value);
+}
+
+static bool primitive_condition_on(const parsed::primitive_condition& pc, std::string_view attribute) { // True if any value of 'pc' mentions 'attribute'.
+    for (const parsed::value& value : pc._values) {
+        if (value_on(value, attribute)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute) { // True if any part of 'ce' mentions 'attribute'.
+    return std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            return primitive_condition_on(cond, attribute);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) { // recurse into each sub-expression
+                if (condition_expression_on(cond, attribute)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    }, ce._expression);
+}
+
+// for_condition_expression_on() runs a given function over all the attributes
+// mentioned in the expression. If the same attribute is mentioned more than
+// once, the function will be called more than once for the same attribute.
+
+static void for_value_on(const parsed::value& v, const noncopyable_function<void(std::string_view)>& func) { // Calls 'func' with the root of every path mentioned in 'v'.
+    std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) { }, // constants mention no attribute
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::path& p) {
+            func(p.root());
+        }
+    }, v._value);
+}
+
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func) { // Walks 'ce' and calls 'func' per path root found.
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            for (const parsed::value& value : cond._values) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) { // recurse into each sub-expression
+                for_condition_expression_on(cond, func);
+            }
+        }
+    }, ce._expression);
+}
+
+// The following calculate_value() functions calculate, or evaluate, a parsed
+// expression. The parsed expression is assumed to have been "resolved", with
+// the matching resolve_* function.
+
+// Take two JSON-encoded list values (remember that a list value is
+// {"L": [...the actual list]}) and return the concatenation, again as
+// a list value.
+static rjson::value list_concatenate(const rjson::value& v1, const rjson::value& v2) {
+    const rjson::value* list1 = unwrap_list(v1);
+    const rjson::value* list2 = unwrap_list(v2);
+    if (!list1 || !list2) { // a null result from unwrap_list() is treated as "not a list"
+        throw api_error("ValidationException", "UpdateExpression: list_append() given a non-list");
+    }
+    rjson::value cat = rjson::copy(*list1); // start from a copy of the first list...
+    for (const auto& a : list2->GetArray()) {
+        rjson::push_back(cat, rjson::copy(a)); // ...and append a copy of each element of the second
+    }
+    rjson::value ret = rjson::empty_object();
+    rjson::set(ret, "L", std::move(cat)); // re-wrap as {"L": [...]}
+    return ret;
+}
+
+// calculate_size() is ConditionExpression's size() function, i.e., it takes
+// a JSON-encoded value and returns its "size" as defined differently for the
+// different types - also as a JSON-encoded number.
+// It returns a JSON-encoded "null" value if this value's type has no size
+// defined. Comparisons against this non-numeric value will later fail.
+static rjson::value calculate_size(const rjson::value& v) {
+    // NOTE: If v is improperly formatted for our JSON value encoding, it
+    // must come from the request itself, not from the database, so it makes
+    // sense to throw a ValidationException if we see such a problem.
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        throw api_error("ValidationException", format("invalid object: {}", v));
+    }
+    auto it = v.MemberBegin(); // the single member: its name is the type tag, its value the payload
+    int ret;
+    if (it->name == "S") {
+        if (!it->value.IsString()) {
+            throw api_error("ValidationException", format("invalid string: {}", v));
+        }
+        ret = it->value.GetStringLength();
+    } else if (it->name == "NS" || it->name == "SS" || it->name == "BS" || it->name == "L") { // sets and lists: number of elements
+        if (!it->value.IsArray()) {
+            throw api_error("ValidationException", format("invalid set: {}", v));
+        }
+        ret = it->value.Size();
+    } else if (it->name == "M") { // map: number of members
+        if (!it->value.IsObject()) {
+            throw api_error("ValidationException", format("invalid map: {}", v));
+        }
+        ret = it->value.MemberCount();
+    } else if (it->name == "B") {
+        if (!it->value.IsString()) {
+            throw api_error("ValidationException", format("invalid byte string: {}", v));
+        }
+        ret = base64_decoded_len(rjson::to_string_view(it->value)); // length as reported by base64_decoded_len()
+    } else { // any other type tag has no size: return {"null": true}
+        rjson::value json_ret = rjson::empty_object();
+        rjson::set(json_ret, "null", rjson::value(true));
+        return json_ret;
+    }
+    rjson::value json_ret = rjson::empty_object();
+    rjson::set(json_ret, "N", rjson::from_string(std::to_string(ret))); // encode the size as {"N": <number>}
+    return json_ret;
+}
+
+static const rjson::value& calculate_value(const parsed::constant& c) { // Returns a reference to the literal held by an already-resolved constant.
+    return std::visit(overloaded_functor {
+        [&] (const parsed::constant::literal& v) -> const rjson::value& {
+            return *v;
+        },
+        [&] (const std::string& valref) -> const rjson::value& {
+            // Shouldn't happen, we should have called resolve_value() earlier
+            // and replaced the value reference by the literal constant.
+            throw std::logic_error("calculate_value() called before resolve_value()");
+        }
+    }, c._value);
+}
+
+static rjson::value to_bool_json(bool b) { // Wraps 'b' as the JSON object {"BOOL": b}.
+    rjson::value json_ret = rjson::empty_object();
+    rjson::set(json_ret, "BOOL", rjson::value(b));
+    return json_ret;
+}
+
+static bool known_type(std::string_view type) { // True if 'type' is one of the type tags listed below.
+    static thread_local const std::unordered_set<std::string_view> types = {
+            "N", "S", "B", "NS", "SS", "BS", "L", "M", "NULL", "BOOL"
+    };
+    return types.contains(type);
+}
+
+using function_handler_type = rjson::value(calculate_value_caller, const rjson::value*, const parsed::value::function_call&); // signature shared by all handlers below
+static const
+std::unordered_map<std::string_view, function_handler_type*> function_handlers { // expression function name -> implementation
+    {"list_append", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::UpdateExpression) {
+                throw api_error("ValidationException",
+                        format("{}: list_append() not allowed here", caller));
+            }
+            if (f._parameters.size() != 2) {
+                throw api_error("ValidationException",
+                        format("{}: list_append() accepts 2 parameters, got {}", caller, f._parameters.size()));
+            }
+            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
+            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
+            return list_concatenate(v1, v2);
+        }
+    },
+    {"if_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::UpdateExpression) {
+                throw api_error("ValidationException",
+                        format("{}: if_not_exists() not allowed here", caller));
+            }
+            if (f._parameters.size() != 2) {
+                throw api_error("ValidationException",
+                        format("{}: if_not_exists() accepts 2 parameters, got {}", caller, f._parameters.size()));
+            }
+            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
+                throw api_error("ValidationException",
+                        format("{}: if_not_exists() must include path as its first argument", caller));
+            }
+            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
+            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
+            return v1.IsNull() ? std::move(v2) : std::move(v1); // fall back to the second argument when the path evaluated to null
+        }
+    },
+    {"size", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpression) {
+                throw api_error("ValidationException",
+                        format("{}: size() not allowed here", caller));
+            }
+            if (f._parameters.size() != 1) {
+                throw api_error("ValidationException",
+                        format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
+            }
+            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
+            return calculate_size(v);
+        }
+    },
+    {"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpressionAlone) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_exists() not allowed here", caller));
+            }
+            if (f._parameters.size() != 1) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
+            }
+            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_exists()'s parameter must be a path", caller));
+            }
+            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
+            return to_bool_json(!v.IsNull());
+        }
+    },
+    {"attribute_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpressionAlone) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_not_exists() not allowed here", caller));
+            }
+            if (f._parameters.size() != 1) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_not_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
+            }
+            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_not_exists()'s parameter must be a path", caller));
+            }
+            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
+            return to_bool_json(v.IsNull());
+        }
+    },
+    {"attribute_type", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpressionAlone) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_type() not allowed here", caller));
+            }
+            if (f._parameters.size() != 2) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_type() accepts 2 parameters, got {}", caller, f._parameters.size()));
+            }
+            // There is no real reason for the following check (not
+            // allowing the type to come from a document attribute), but
+            // DynamoDB does this check, so we do too...
+            if (!f._parameters[1].is_constant()) {
+                throw api_error("ValidationException",
+                        format("{}: attribute_type()'s second parameter must be an expression attribute", caller));
+            }
+            rjson::value v0 = calculate_value(f._parameters[0], caller, previous_item);
+            rjson::value v1 = calculate_value(f._parameters[1], caller, previous_item);
+            if (v1.IsObject() && v1.MemberCount() == 1 && v1.MemberBegin()->name == "S") {
+                // If the type parameter is not one of the legal types
+                // we should generate an error, not a failed condition:
+                if (!known_type(rjson::to_string_view(v1.MemberBegin()->value))) {
+                    throw api_error("ValidationException",
+                            format("{}: attribute_type()'s second parameter, {}, is not a known type",
+                                    caller, v1.MemberBegin()->value));
+                }
+                if (v0.IsObject() && v0.MemberCount() == 1) {
+                    return to_bool_json(v1.MemberBegin()->value == v0.MemberBegin()->name); // requested type vs. the value's own type tag
+                } else {
+                    return to_bool_json(false);
+                }
+            } else {
+                throw api_error("ValidationException",
+                        format("{}: attribute_type() second parameter must refer to a string, got {}", caller, v1));
+            }
+        }
+    },
+    {"begins_with", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpressionAlone) {
+                throw api_error("ValidationException",
+                        format("{}: begins_with() not allowed here", caller));
+            }
+            if (f._parameters.size() != 2) {
+                throw api_error("ValidationException",
+                        format("{}: begins_with() accepts 2 parameters, got {}", caller, f._parameters.size()));
+            }
+            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
+            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
+        }
+    },
+    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
+            if (caller != calculate_value_caller::ConditionExpressionAlone) {
+                throw api_error("ValidationException",
+                        format("{}: contains() not allowed here", caller));
+            }
+            if (f._parameters.size() != 2) {
+                throw api_error("ValidationException",
+                        format("{}: contains() accepts 2 parameters, got {}", caller, f._parameters.size()));
+            }
+            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
+            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
+            return to_bool_json(check_CONTAINS(v1.IsNull() ? nullptr : &v1,  v2));
+        }
+    },
+};
+
+// Given a parsed::value, which can refer either to a constant value from
+// ExpressionAttributeValues, to the value of some attribute, or to a function
+// of other values, this function calculates the resulting value.
+// "caller" determines which expression - ConditionExpression or
+// UpdateExpression - is asking for this value. We need to know this because
+// DynamoDB allows a different choice of functions for different expressions.
+rjson::value calculate_value(const parsed::value& v,
+        calculate_value_caller caller,
+        const rjson::value* previous_item) {
+    return std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) -> rjson::value {
+            return rjson::copy(calculate_value(c));
+        },
+        [&] (const parsed::value::function_call& f) -> rjson::value {
+            auto function_it = function_handlers.find(std::string_view(f._function_name));
+            if (function_it == function_handlers.end()) { // name the actual calling expression type, as the handlers do
+                throw api_error("ValidationException",
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
+            }
+            return function_it->second(caller, previous_item, f);
+        },
+        [&] (const parsed::path& p) -> rjson::value {
+            if (!previous_item) { // with no previous item, every attribute reads as null
+                return rjson::null_value();
+            }
+            std::string update_path = p.root();
+            if (p.has_operators()) {
+                // FIXME: support this
+                throw api_error("ValidationException", "Reading attribute paths not yet implemented");
+            }
+            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
+            return previous_value ? rjson::copy(*previous_value) : rjson::null_value(); // a missing attribute also reads as null
+        }
+    }, v._value);
+}
+
+// Same as calculate_value() above, except takes a set_rhs, which may be
+// either a single value, or v1+v2 or v1-v2.
+rjson::value calculate_value(const parsed::set_rhs& rhs,
+        const rjson::value* previous_item) {
+    switch(rhs._op) {
+    case 'v': // a single value: only _v1 is evaluated
+        return calculate_value(rhs._v1, calculate_value_caller::UpdateExpression, previous_item);
+    case '+': { // v1 + v2
+        rjson::value v1 = calculate_value(rhs._v1, calculate_value_caller::UpdateExpression, previous_item);
+        rjson::value v2 = calculate_value(rhs._v2, calculate_value_caller::UpdateExpression, previous_item);
+        return number_add(v1, v2);
+    }
+    case '-': { // v1 - v2
+        rjson::value v1 = calculate_value(rhs._v1, calculate_value_caller::UpdateExpression, previous_item);
+        rjson::value v2 = calculate_value(rhs._v2, calculate_value_caller::UpdateExpression, previous_item);
+        return number_subtract(v1, v2);
+    }
+    }
+    // Can't happen
+    return rjson::null_value();
+}
+
 } // namespace alternator
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -145,6 +145,12 @@ REMOVE: R E M O V E;
 ADD: A D D;
 DELETE: D E L E T E;

+AND: A N D;
+OR: O R;
+NOT: N O T;
+BETWEEN: B E T W E E N;
+IN: I N;
+
 fragment ALPHA: 'A'..'Z' | 'a'..'z';
 fragment DIGIT: '0'..'9';
 fragment ALNUM: ALPHA | DIGIT | '_';
@@ -165,19 +171,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-update_expression_set_value returns [parsed::value v]:
-      VALREF                             { $v.set_valref($VALREF.text); }
-    | path                               { $v.set_path($path.p); }
-    | NAME                               { $v.set_func_name($NAME.text); }
-     '(' x=update_expression_set_value   { $v.add_func_parameter($x.v); }
-     (',' x=update_expression_set_value  { $v.add_func_parameter($x.v); })*
+value returns [parsed::value v]:
+      VALREF       { $v.set_valref($VALREF.text); }
+    | path         { $v.set_path($path.p); }
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=update_expression_set_value  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=update_expression_set_value  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=update_expression_set_value  { $rhs.set_minus(std::move($v.v)); }
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -212,3 +218,48 @@ update_expression returns [parsed::update_expression e]:
 projection_expression returns [std::vector<parsed::path> v]:
    p=path      { $v.push_back(std::move($p.p)); }
    (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
+
+
+primitive_condition returns [parsed::primitive_condition c]: // a value, optionally followed by a comparison, BETWEEN..AND, or IN (...)
+      v=value         { $c.add_value(std::move($v.v));
+                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
+      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
+          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
+          | '<'       { $c.set_operator(parsed::primitive_condition::type::LT); }
+          | '<' '='   { $c.set_operator(parsed::primitive_condition::type::LE); }
+          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
+          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
+         )
+         v=value      { $c.add_value(std::move($v.v)); }
+       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         AND
+         v=value      { $c.add_value(std::move($v.v)); }
+       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
+         ')'
+      )?
+    ;
+
+// The following rules for parsing boolean expressions are verbose and
+// somewhat strange because of Antlr 3's limitations on recursive rules,
+// common rule prefixes, and (lack of) support for operator precedence.
+// These rules could have been written more clearly using a more powerful
+// parser generator - such as Yacc.
+boolean_expression returns [parsed::condition_expression e]: // one or more boolean_expression_1 operands joined by OR, appended with op '|'
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
+	;
+boolean_expression_1 returns [parsed::condition_expression e]: // one or more boolean_expression_2 operands joined by AND, appended with op '&'
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
+	;
+boolean_expression_2 returns [parsed::condition_expression e]: // a primitive condition, a NOT-ed operand, or a parenthesized boolean_expression
+	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
+    ;
+
+condition_expression returns [parsed::condition_expression e]: // top-level rule: a boolean_expression that must be followed by EOF
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -24,8 +24,13 @@
 #include <string>
 #include <stdexcept>
 #include <vector>
+#include <unordered_set>
+#include <string_view>
+
+#include <seastar/util/noncopyable_function.hh>

 #include "expressions_types.hh"
+#include "rjson.hh"

 namespace alternator {

@@ -36,6 +41,62 @@ public:

 parsed::update_expression parse_update_expression(std::string query);
 std::vector<parsed::path> parse_projection_expression(std::string query);
+parsed::condition_expression parse_condition_expression(std::string query);
+
+void resolve_update_expression(parsed::update_expression& ue,
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values);
+void resolve_projection_expression(std::vector<parsed::path>& pe,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names);
+void resolve_condition_expression(parsed::condition_expression& ce,
+        const rjson::value* expression_attribute_names,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values);
+
+void validate_value(const rjson::value& v, const char* caller);
+
+bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute);
+
+// for_condition_expression_on() runs the given function on the attributes
+// that the expression uses. It may run for the same attribute more than once
+// if the same attribute is used more than once in the expression.
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func);
+
+// calculate_value() behaves slightly differently (in particular, it
+// supports different functions) when used in different types of
+// expressions, as enumerated in this enum:
+enum class calculate_value_caller {
+    UpdateExpression, ConditionExpression, ConditionExpressionAlone
+};
+
+inline std::ostream& operator<<(std::ostream& out, calculate_value_caller caller) {
+    switch (caller) {
+        case calculate_value_caller::UpdateExpression:
+            out << "UpdateExpression";
+            break;
+        case calculate_value_caller::ConditionExpression:
+            out << "ConditionExpression";
+            break;
+        case calculate_value_caller::ConditionExpressionAlone:
+            out << "ConditionExpression";
+            break;
+        default:
+            out << "unknown type of expression";
+            break;
+    }
+    return out;
+}
+
+rjson::value calculate_value(const parsed::value& v,
+        calculate_value_caller caller,
+        const rjson::value* previous_item);
+
+rjson::value calculate_value(const parsed::set_rhs& rhs,
+        const rjson::value* previous_item);


 } /* namespace alternator */
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -25,6 +25,10 @@
 #include <string>
 #include <variant>

+#include <seastar/core/shared_ptr.hh>
+
+#include "rjson.hh"
+
 /*
 * Parsed representation of expressions and their components.
 *
@@ -63,10 +67,27 @@ public:
    }
 };

+// When an expression is first parsed, all constants are references, like
+// ":val1", into ExpressionAttributeValues. This uses std::string() variant.
+// The resolve_value() function replaces these constants by the JSON item
+// extracted from the ExpressionAttributeValues.
+struct constant {
+    // We use lw_shared_ptr<rjson::value> just to make rjson::value copyable,
+    // to make this entire object copyable as ANTLR needs.
+    using literal = lw_shared_ptr<rjson::value>;
+    std::variant<std::string, literal> _value;
+    void set(const rjson::value& v) {
+        _value = make_lw_shared<rjson::value>(rjson::copy(v));
+    }
+    void set(std::string& s) {
+        _value = s;
+    }
+};
+
 // "value" is is a value used in the right hand side of an assignment
-// expression, "SET a = ...". It can be a reference to a value included in
-// the request (":val"), a path to an attribute from the existing item
-// (e.g., "a.b[3].c"), or a function of other such values.
+// expression, "SET a = ...". It can be a constant (a reference to a value
+// included in the request, e.g., ":val"), a path to an attribute from the
+// existing item (e.g., "a.b[3].c"), or a function of other such values.
 // Note that the real right-hand-side of an assignment is actually a bit
 // more general - it allows either a value, or a value+value or value-value -
 // see class set_rhs below.
@@ -75,9 +96,12 @@ struct value {
        std::string _function_name;
        std::vector<value> _parameters;
    };
-    std::variant<std::string, path, function_call> _value;
+    std::variant<constant, path, function_call> _value;
+    void set_constant(constant c) {
+        _value = std::move(c);
+    }
    void set_valref(std::string s) {
-        _value = std::move(s);
+        _value = constant { std::move(s) };
    }
    void set_path(path p) {
        _value = std::move(p);
@@ -88,6 +112,15 @@ struct value {
    void add_func_parameter(value v) {
        std::get<function_call>(_value)._parameters.emplace_back(std::move(v));
    }
+    bool is_constant() const {
+        return std::holds_alternative<constant>(_value);
+    }
+    bool is_path() const {
+        return std::holds_alternative<path>(_value);
+    }
+    bool is_func() const {
+        return std::holds_alternative<function_call>(_value);
+    }
 };

 // The right-hand-side of a SET in an update expression can be either a
@@ -121,10 +154,10 @@ public:
        struct remove {
        };
        struct add {
-            std::string _valref;
+            constant _valref;
        };
        struct del {
-            std::string _valref;
+            constant _valref;
        };
        std::variant<set, remove, add, del> _action;

@@ -138,11 +171,11 @@ public:
        }
        void assign_add(path p, std::string v) {
            _path = std::move(p);
-            _action = add { std::move(v) };
+            _action = add { constant { std::move(v) } };
        }
        void assign_del(path p, std::string v) {
            _path = std::move(p);
-            _action = del { std::move(v) };
+            _action = del { constant { std::move(v) } };
        }
    };
 private:
@@ -160,6 +193,62 @@ public:
    const std::vector<action>& actions() const {
        return _actions;
    }
+    std::vector<action>& actions() {
+        return _actions;
+    }
+};
+
+// A primitive_condition is a condition expression involving one condition,
+// while the full condition_expression below adds boolean logic over these
+// primitive conditions.
+// The supported primitive conditions are:
+// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
+//    v1 and v2 are values - from the item (an attribute path), the query
+//    (a ":val" reference), or a function of the above (only the size()
+//    function is supported).
+// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
+// 3. N-ary operator - v1 IN ( v2, v3, ... )
+// 4. A single function call (attribute_exists etc.). The parser actually
+//    accepts a more general "value" here but later stages reject a value
+//    which is not a function call (because DynamoDB does it too).
+class primitive_condition {
+public:
+    enum class type {
+        UNDEFINED, VALUE, EQ, NE, LT, LE, GT, GE, BETWEEN, IN
+    };
+    type _op = type::UNDEFINED;
+    std::vector<value> _values;
+    void set_operator(type op) {
+        _op = op;
+    }
+    void add_value(value&& v) {
+        _values.push_back(std::move(v));
+    }
+    bool empty() const {
+        return _op == type::UNDEFINED;
+    }
+};
+
+class condition_expression {
+public:
+    bool _negated = false; // If true, the entire condition is negated
+    struct condition_list {
+        char op = '|'; // '&' or '|'
+        std::vector<condition_expression> conditions;
+    };
+    std::variant<primitive_condition, condition_list> _expression = condition_list();
+
+    void set_primitive(primitive_condition&& p) {
+        _expression = std::move(p);
+    }
+    void append(condition_expression&& c, char op);
+    void apply_not() {
+        _negated = !_negated;
+    }
+    bool empty() const {
+        return std::holds_alternative<condition_list>(_expression) &&
+               std::get<condition_list>(_expression).conditions.empty();
+    }
 };

 } // namespace parsed
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -22,14 +22,108 @@
 #include "rjson.hh"
 #include "error.hh"
 #include <seastar/core/print.hh>
+#include <seastar/core/thread.hh>

 namespace rjson {

 static allocator the_allocator;

+/*
+ * This wrapper class adds nested level checks to rapidjson's handlers.
+ * Each rapidjson handler implements functions for accepting JSON values,
+ * which includes strings, numbers, objects, arrays, etc.
+ * Parsing objects and arrays needs to be performed carefully with regard
+ * to stack overflow - each object/array layer adds another stack frame
+ * to parsing, printing and destroying the parent JSON document.
+ * To prevent stack overflow, a rapidjson handler can be wrapped with
+ * guarded_yieldable_json_handler, which accepts an additional max_nested_level
+ * parameter. If the max nested level is exceeded, an rjson::error is thrown.
+ */
+template<typename Handler, bool EnableYield>
+struct guarded_yieldable_json_handler : public Handler {
+    size_t _nested_level = 0;
+    size_t _max_nested_level;
+public:
+    using handler_base = Handler;
+
+    explicit guarded_yieldable_json_handler(size_t max_nested_level) : _max_nested_level(max_nested_level) {}
+    guarded_yieldable_json_handler(string_buffer& buf, size_t max_nested_level)
+            : handler_base(buf), _max_nested_level(max_nested_level) {}
+
+    void Parse(const char* str, size_t length) {
+        rapidjson::MemoryStream ms(static_cast<const char*>(str), length * sizeof(typename encoding::Ch));
+        rapidjson::EncodedInputStream<encoding, rapidjson::MemoryStream> is(ms);
+        rapidjson::GenericReader<encoding, encoding, allocator> reader(&the_allocator);
+        reader.Parse(is, *this);
+        if (reader.HasParseError()) {
+            throw rjson::error(format("Parsing JSON failed: {}", rapidjson::GetParseError_En(reader.GetParseErrorCode())));
+        }
+        //NOTICE: The handler has parsed the string, but in case of rapidjson::GenericDocument
+        // the data now resides in an internal stack_ variable, which is private instead of
+        // protected... which means we cannot simply access its data. Fortunately, another
+        // function for populating documents from SAX events can be abused to extract the data
+        // from the stack via gadget-oriented programming - we use an empty event generator
+        // which does nothing, and use it to call Populate(), which assumes that the generator
+        // will fill the stack with something. It won't, but our stack is already filled with
+        // data we want to steal, so once Populate() ends, our document will be properly parsed.
+        // A proper solution could be programmed once rapidjson declares this stack_ variable
+        // as protected instead of private, so that this class can access it.
+        auto dummy_generator = [](handler_base&){return true;};
+        handler_base::Populate(dummy_generator);
+    }
+
+    bool StartObject() {
+        ++_nested_level;
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartObject();
+    }
+
+    bool EndObject(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndObject(elements_count);
+    }
+
+    bool StartArray() {
+        ++_nested_level;
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartArray();
+    }
+
+    bool EndArray(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndArray(elements_count);
+    }
+
+    bool Null()                 { maybe_yield(); return handler_base::Null(); }
+    bool Bool(bool b)           { maybe_yield(); return handler_base::Bool(b); }
+    bool Int(int i)             { maybe_yield(); return handler_base::Int(i); }
+    bool Uint(unsigned u)       { maybe_yield(); return handler_base::Uint(u); }
+    bool Int64(int64_t i64)     { maybe_yield(); return handler_base::Int64(i64); }
+    bool Uint64(uint64_t u64)   { maybe_yield(); return handler_base::Uint64(u64); }
+    bool Double(double d)       { maybe_yield(); return handler_base::Double(d); }
+    bool String(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::String(str, length, copy); }
+    bool Key(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::Key(str, length, copy); }
+
+
+protected:
+    static void maybe_yield() {
+        if constexpr (EnableYield) {
+            thread::maybe_yield();
+        }
+    }
+
+    void check_nested_level() const {
+        if (RAPIDJSON_UNLIKELY(_nested_level > _max_nested_level)) {
+            throw rjson::error(format("Max nested level reached: {}", _max_nested_level));
+        }
+    }
+};
+
 std::string print(const rjson::value& value) {
    string_buffer buffer;
-    writer writer(buffer);
+    guarded_yieldable_json_handler<writer, false> writer(buffer, 78);
    value.Accept(writer);
    return std::string(buffer.GetString());
 }
@@ -38,13 +132,9 @@ rjson::value copy(const rjson::value& value) {
    return rjson::value(value, the_allocator);
 }

-rjson::value parse(const std::string& str) {
-    return parse_raw(str.c_str(), str.size());
-}
-
-rjson::value parse_raw(const char* c_str, size_t size) {
-    rjson::document d;
-    d.Parse(c_str, size);
+rjson::value parse(std::string_view str) {
+    guarded_yieldable_json_handler<document, false> d(78);
+    d.Parse(str.data(), str.size());
    if (d.HasParseError()) {
        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
    }
@@ -52,8 +142,22 @@ rjson::value parse_raw(const char* c_str, size_t size) {
    return std::move(v);
 }

-rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value parse_yieldable(std::string_view str) {
+    guarded_yieldable_json_handler<document, true> d(78);
+    d.Parse(str.data(), str.size());
+    if (d.HasParseError()) {
+        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
+    }
+    rjson::value& v = d;
+    return std::move(v);
+}
+
+rjson::value& get(rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -61,8 +165,8 @@ rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
    }
 }

-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+const rjson::value& get(const rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -82,24 +186,48 @@ rjson::value from_string(const char* str, size_t size) {
    return rjson::value(str, size, the_allocator);
 }

-const rjson::value* find(const rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value from_string(std::string_view view) {
+    return rjson::value(view.data(), view.size(), the_allocator);
+}
+
+const rjson::value* find(const rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

-rjson::value* find(rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value* find(rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

+bool remove_member(rjson::value& value, std::string_view name) {
+    // Although RemoveMember() has a variant taking a StringRef, it ignores
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    return value.RemoveMember(rjson::value(name.data(), name.size()));
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), std::move(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), std::move(member), the_allocator);
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), rjson::value(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), rjson::value(member), the_allocator);
+}
+
 void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member) {
    base.AddMember(name, std::move(member), the_allocator);
 }
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -104,38 +104,49 @@ inline rjson::value empty_string() {
 // The representation is dense - without any redundant indentation.
 std::string print(const rjson::value& value);

+// Returns a string_view to the string held in a JSON value (which is
+// assumed to hold a string, i.e., v.IsString() == true). This is a view
+// to the existing data - no copying is done.
+inline std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength());
+}
+
 // Copies given JSON value - involves allocation
 rjson::value copy(const rjson::value& value);

 // Parses a JSON value from given string or raw character array.
 // The string/char array liveness does not need to be persisted,
-// as both parse() and parse_raw() will allocate member names and values.
+// as parse() will allocate member names and values.
 // Throws rjson::error if parsing failed.
-rjson::value parse(const std::string& str);
-rjson::value parse_raw(const char* c_str, size_t size);
+rjson::value parse(std::string_view str);
+// Needs to be run in thread context
+rjson::value parse_yieldable(std::string_view str);

 // Creates a JSON value (of JSON string type) out of internal string representations.
 // The string value is copied, so str's liveness does not need to be persisted.
 rjson::value from_string(const std::string& str);
 rjson::value from_string(const sstring& str);
 rjson::value from_string(const char* str, size_t size);
+rjson::value from_string(std::string_view view);

 // Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, rjson::string_ref_type name);
-const rjson::value* find(const rjson::value& value, rjson::string_ref_type name);
+rjson::value* find(rjson::value& value, std::string_view name);
+const rjson::value* find(const rjson::value& value, std::string_view name);

 // Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, rjson::string_ref_type name);
-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name);
+rjson::value& get(rjson::value& value, std::string_view name);
+const rjson::value& get(const rjson::value& value, std::string_view name);

 // Sets a member in given JSON object by moving the member - allocates the name.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);

 // Sets a string member in given JSON object by assigning its reference - allocates the name.
 // NOTICE: member string liveness must be ensured to be at least as long as base's.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);

 // Sets a member in given JSON object by moving the member.
 // NOTICE: name liveness must be ensured to be at least as long as base's.
@@ -152,6 +163,9 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
 // Throws if base_array is not a JSON array.
 void push_back(rjson::value& base_array, rjson::value&& item);

+// Remove a member from a JSON object. Throws if value isn't an object.
+bool remove_member(rjson::value& value, std::string_view name);
+
 struct single_value_comp {
    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
 };
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "seastarx.hh"
+#include "service/storage_proxy.hh"
+#include "service/storage_proxy.hh"
+#include "rjson.hh"
+#include "executor.hh"
+
+namespace alternator {
+
+// An rmw_operation encapsulates the common logic of all the item update
+// operations which may involve a read of the item before the write
+// (so-called Read-Modify-Write operations). These operations include PutItem,
+// UpdateItem and DeleteItem: All of these may be conditional operations (the
+// "Expected" parameter) which require a read before the write, and UpdateItem
+// may also have an update expression which refers to the item's old value.
+//
+// The code below supports running the read and the write together as one
+// transaction using LWT (this is why rmw_operation is a subclass of
+// cas_request, as required by storage_proxy::cas()), but also has optional
+// modes not using LWT.
+class rmw_operation : public service::cas_request, public enable_shared_from_this<rmw_operation> {
+public:
+    // The following options choose which mechanism to use for isolating
+    // parallel write operations:
+    // * The FORBID_RMW option forbids RMW (read-modify-write) operations
+    //   such as conditional updates. For the remaining write-only
+    //   operations, ordinary quorum writes are isolated enough.
+    // * The LWT_ALWAYS option always uses LWT (lightweight transactions)
+    //   for any write operation - whether or not it also has a read.
+    // * The LWT_RMW_ONLY option uses LWT only for RMW operations, and uses
+    //   ordinary quorum writes for write-only operations.
+    //   This option is not safe if the user may send both RMW and write-only
+    //   operations on the same item.
+    // * The UNSAFE_RMW option does read-modify-write operations as separate
+    //   read and write. It is unsafe - concurrent RMW operations are not
+    //   isolated at all. This option will likely be removed in the future.
+    enum class write_isolation {
+        FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
+    };
+    static constexpr auto WRITE_ISOLATION_TAG_KEY = "system:write_isolation";
+
+    static write_isolation get_write_isolation_for_schema(schema_ptr schema);
+
+    static write_isolation default_write_isolation;
+public:
+    static void set_default_write_isolation(std::string_view mode);
+
+protected:
+    // The full request JSON
+    rjson::value _request;
+    // All RMW operations involve a single item with a specific partition
+    // and optional clustering key, in a single table, so the following
+    // information is common to all of them:
+    schema_ptr _schema;
+    partition_key _pk = partition_key::make_empty();
+    clustering_key _ck = clustering_key::make_empty();
+    write_isolation _write_isolation;
+
+    // All RMW operations can have a ReturnValues parameter from the following
+    // choices. But note that only UpdateItem actually supports all of them:
+    enum class returnvalues {
+        NONE, ALL_OLD, UPDATED_OLD, ALL_NEW, UPDATED_NEW
+    } _returnvalues;
+    static returnvalues parse_returnvalues(const rjson::value& request);
+    // When _returnvalues != NONE, apply() should store here, in JSON form,
+    // the values which are to be returned in the "Attributes" field.
+    // The default null JSON means do not return an Attributes field at all.
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
+public:
+    // The constructor of a rmw_operation subclass should parse the request
+    // and try to discover as many input errors as it can before really
+    // attempting the read or write operations.
+    rmw_operation(service::storage_proxy& proxy, rjson::value&& request);
+    // rmw_operation subclasses (update_item_operation, put_item_operation
+    // and delete_item_operation) shall implement an apply() function which
+    // takes the previous value of the item (if it was read) and creates the
+    // write mutation. If the previous value of item does not pass the needed
+    // conditional expression, apply() should return an empty optional.
+    // apply() may throw if it encounters input errors not discovered during
+    // the constructor.
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
+    // Convert the above apply() into the signature needed by cas_request:
+    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
+    virtual ~rmw_operation() = default;
+    schema_ptr schema() const { return _schema; }
+    const rjson::value& request() const { return _request; }
+    rjson::value&& move_request() && { return std::move(_request); }
+    future<executor::request_return_type> execute(service::storage_proxy& proxy,
+            service::client_state& client_state,
+            tracing::trace_state_ptr trace_state,
+            service_permit permit,
+            bool needs_read_before_write,
+            stats& stats);
+    std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
+};
+
+} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -31,8 +31,8 @@ static logging::logger slogger("alternator-serialization");

 namespace alternator {

-type_info type_info_from_string(std::string type) {
-    static thread_local const std::unordered_map<std::string, type_info> type_infos = {
+type_info type_info_from_string(std::string_view type) {
+    static thread_local const std::unordered_map<std::string_view, type_info> type_infos = {
        {"S", {alternator_type::S, utf8_type}},
        {"B", {alternator_type::B, bytes_type}},
        {"BOOL", {alternator_type::BOOL, boolean_type}},
@@ -87,7 +87,7 @@ bytes serialize_item(const rjson::value& item) {
        throw api_error("ValidationException", format("An item can contain only one attribute definition: {}", item));
    }
    auto it = item.MemberBegin();
-    type_info type_info = type_info_from_string(it->name.GetString()); // JSON keys are guaranteed to be strings
+    type_info type_info = type_info_from_string(rjson::to_string_view(it->name)); // JSON keys are guaranteed to be strings

    if (type_info.atype == alternator_type::NOT_SUPPORTED_YET) {
        slogger.trace("Non-optimal serialization of type {}", it->name.GetString());
@@ -121,7 +121,7 @@ struct to_json_visitor {
    }
    // default
    void operator()(const abstract_type& t) const {
-        rjson::set_with_string_name(deserialized, type_ident, rjson::parse(t.to_string(bytes(bv))));
+        rjson::set_with_string_name(deserialized, type_ident, rjson::parse(to_json_string(t, bytes(bv))));
    }
 };

@@ -136,7 +136,7 @@ rjson::value deserialize_item(bytes_view bv) {

    if (atype == alternator_type::NOT_SUPPORTED_YET) {
        slogger.trace("Non-optimal deserialization of alternator type {}", int8_t(atype));
-        return rjson::parse_raw(reinterpret_cast<const char *>(bv.data()), bv.size());
+        return rjson::parse(std::string_view(reinterpret_cast<const char *>(bv.data()), bv.size()));
    }
    type_representation type_representation = represent_type(atype);
    visit(*type_representation.dtype, to_json_visitor{deserialized, type_representation.ident, bv});
@@ -153,34 +153,48 @@ std::string type_to_string(data_type type) {
    };
    auto it = types.find(type);
    if (it == types.end()) {
-        throw std::runtime_error(format("Unknown type {}", type->name()));
+        // fall back to string, in order to be able to present
+        // internal Scylla types in a human-readable way
+        return "S";
    }
    return it->second;
 }

 bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
    std::string column_name = column.name_as_text();
-    std::string expected_type = type_to_string(column.type);
-
-    const rjson::value& key_typed_value = rjson::get(item, rjson::value::StringRefType(column_name.c_str()));
-    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                format("Missing or invalid value object for key column {}: {}", column_name, item));
+    const rjson::value* key_typed_value = rjson::find(item, column_name);
+    if (!key_typed_value) {
+        throw api_error("ValidationException", format("Key column {} not found", column_name));
    }
-    return get_key_from_typed_value(key_typed_value, column, expected_type);
+    return get_key_from_typed_value(*key_typed_value, column);
 }

-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type) {
+// Parses the JSON encoding for a key value, which is a map with a single
+// entry, whose key is the type (expected to match the key column's type)
+// and the value is the encoded value.
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
+    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
+            !key_typed_value.MemberBegin()->value.IsString()) {
+        throw api_error("ValidationException",
+                format("Malformed value object for key column {}: {}",
+                        column.name_as_text(), key_typed_value));
+    }
+
    auto it = key_typed_value.MemberBegin();
-    if (it->name.GetString() != expected_type) {
+    if (it->name != type_to_string(column.type)) {
        throw api_error("ValidationException",
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        expected_type, column.name_as_text(), it->name.GetString()));
+                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
+    }
+    std::string_view value_view = rjson::to_string_view(it->value);
+    if (value_view.empty()) {
+        throw api_error("ValidationException",
+                format("The AttributeValue for a key attribute cannot contain an empty string value. Key: {}", column.name_as_text()));
    }
    if (column.type == bytes_type) {
        return base64_decode(it->value);
    } else {
-        return column.type->from_string(it->value.GetString());
+        return column.type->from_string(rjson::to_string_view(it->value));
    }

 }
@@ -198,8 +212,11 @@ rjson::value json_key_column_value(bytes_view cell, const column_definition& col
        auto s = to_json_string(*decimal_type, bytes(cell));
        return rjson::from_string(s);
    } else {
-        // We shouldn't get here, we shouldn't see such key columns.
-        throw std::runtime_error(format("Unexpected key type: {}", column.type->name()));
+        // Support for arbitrary key types is useful for parsing values of virtual tables,
+        // which can involve any type supported by Scylla.
+        // In order to guarantee that the returned type is parsable by alternator clients,
+        // they are represented simply as strings.
+        return rjson::from_string(column.type->to_string(bytes(cell)));
    }
 }

@@ -258,4 +275,93 @@ const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value&
    return std::make_pair(it_key, &(it->value));
 }

+// Returns the payload of a {"L": ...} object, or nullptr for anything else.
+const rjson::value* unwrap_list(const rjson::value& v) {
+    const bool single_member_object = v.IsObject() && v.MemberCount() == 1;
+    if (!single_member_object) {
+        return nullptr;
+    }
+    const auto member = v.MemberBegin();
+    const bool tagged_as_list = (member->name == std::string("L"));
+    return tagged_as_list ? &(member->value) : nullptr;
+}
+
+// Adds two JSON-encoded numbers of the form {"N": "thenumber"} and returns
+// the sum in the same {"N": "..."} encoding.
+rjson::value number_add(const rjson::value& v1, const rjson::value& v2) {
+    auto lhs = unwrap_number(v1, "UpdateExpression");
+    auto rhs = unwrap_number(v2, "UpdateExpression");
+    std::string sum_text = std::string((lhs + rhs).to_string());
+    rjson::value result = rjson::empty_object();
+    rjson::set(result, "N", rjson::from_string(sum_text));
+    return result;
+}
+
+// Takes two JSON-encoded numbers ({"N": "thenumber"}) and returns v1 - v2, again JSON-encoded.
+rjson::value number_subtract(const rjson::value& v1, const rjson::value& v2) {
+    auto minuend = unwrap_number(v1, "UpdateExpression");
+    auto subtrahend = unwrap_number(v2, "UpdateExpression");
+    rjson::value result = rjson::empty_object();
+    rjson::set(result, "N", rjson::from_string(std::string((minuend - subtrahend).to_string())));
+    return result;
+}
+
+// Computes the union of two JSON-encoded sets (e.g. {"SS": [...the actual set]}) of
+// the same type. Members of v2 that v1 already holds are not appended a second time.
+rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
+    auto [set1_type, set1] = unwrap_set(v1);
+    auto [set2_type, set2] = unwrap_set(v2);
+    if (set1_type != set2_type) {
+        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+    }
+    if (!(set1 && set2)) {
+        throw api_error("ValidationException", "UpdateExpression: ADD operation for sets must be given sets as arguments");
+    }
+    std::set<rjson::value, rjson::single_value_comp> known_members;
+    for (const auto& member : set1->GetArray()) {
+        known_members.insert(rjson::copy(member));
+    }
+    rjson::value merged = rjson::copy(*set1);
+    for (const auto& candidate : set2->GetArray()) {
+        if (known_members.find(candidate) == known_members.end()) {
+            rjson::push_back(merged, rjson::copy(candidate));
+        }
+    }
+    rjson::value ret = rjson::empty_object();
+    rjson::set_with_string_name(ret, set1_type, std::move(merged));
+    return ret;
+}
+
+// Take two JSON-encoded set values (e.g. {"SS": [...the actual set]}) and
+// return the members of the first that are absent from the second (v1 - v2),
+// again as a set value. DynamoDB does not allow empty sets, so if nothing
+// is left an unset optional is returned instead.
+std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value& v2) {
+    auto [set1_type, set1] = unwrap_set(v1);
+    auto [set2_type, set2] = unwrap_set(v2);
+    if (set1_type != set2_type) {
+        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+    }
+    if (!(set1 && set2)) {
+        throw api_error("ValidationException", "UpdateExpression: DELETE operation can only be performed on a set");
+    }
+    std::set<rjson::value, rjson::single_value_comp> remaining;
+    for (const auto& member : set1->GetArray()) {
+        remaining.insert(rjson::copy(member));
+    }
+    for (const auto& removed : set2->GetArray()) {
+        remaining.erase(removed);
+    }
+    if (remaining.empty()) {
+        return std::nullopt;
+    }
+    rjson::value survivors = rjson::empty_array();
+    for (const auto& member : remaining) {
+        rjson::push_back(survivors, rjson::copy(member));
+    }
+    rjson::value ret = rjson::empty_object();
+    rjson::set_with_string_name(ret, set1_type, std::move(survivors));
+    return ret;
+}
+
 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -24,7 +24,7 @@
 #include <string>
 #include <string_view>
 #include "types.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "keys.hh"
 #include "rjson.hh"
 #include "utils/big_decimal.hh"
@@ -45,7 +45,7 @@ struct type_representation {
    data_type dtype;
 };

-type_info type_info_from_string(std::string type);
+type_info type_info_from_string(std::string_view type);
 type_representation represent_type(alternator_type atype);

 bytes serialize_item(const rjson::value& item);
@@ -54,7 +54,7 @@ rjson::value deserialize_item(bytes_view bv);
 std::string type_to_string(data_type type);

 bytes get_key_column_value(const rjson::value& item, const column_definition& column);
-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type);
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column);
 rjson::value json_key_column_value(bytes_view cell, const column_definition& column);

 partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
@@ -69,4 +69,21 @@ big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);
 // returned value is {"", nullptr}
 const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v);

+// Check if a given JSON object encodes a list (i.e., it is a {"L": [...]})
+// and if so, return a pointer to that list. Otherwise, return nullptr.
+const rjson::value* unwrap_list(const rjson::value& v);
+
+// Take two JSON-encoded numeric values ({"N": "thenumber"}) and return the
+// sum, again as a JSON-encoded number.
+rjson::value number_add(const rjson::value& v1, const rjson::value& v2);
+rjson::value number_subtract(const rjson::value& v1, const rjson::value& v2);
+// Take two JSON-encoded set values (e.g. {"SS": [...the actual set]}) and
+// return the sum of both sets, again as a set value.
+rjson::value set_sum(const rjson::value& v1, const rjson::value& v2);
+// Take two JSON-encoded set values (e.g. {"SS": [...the actual set]}) and
+// return the difference v1 - v2, again as a set value.
+// DynamoDB does not allow empty sets, so if resulting set is empty, return
+// an unset optional instead.
+std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value& v2);
+
 }
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -23,12 +23,14 @@
 #include "log.hh"
 #include <seastar/http/function_handlers.hh>
 #include <seastar/json/json_elements.hh>
-#include <seastarx.hh>
+#include "seastarx.hh"
 #include "error.hh"
 #include "rjson.hh"
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
+#include "service/storage_service.hh"
+#include "utils/overloaded_functor.hh"

 static logging::logger slogger("alternator-server");

@@ -65,9 +67,9 @@ inline std::vector<std::string_view> split(std::string_view text, char separator
 // Internal Server Error.
 class api_handler : public handler_base {
 public:
-    api_handler(const future_json_function& _handle) : _f_handle(
-         [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
-         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([rep = std::move(rep)](future<json::json_return_type> resf) mutable {
+    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
+         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+         return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
             if (resf.failed()) {
                 // Exceptions of type api_error are wrapped as JSON and
                 // returned to the client as expected. Other types of
@@ -86,20 +88,24 @@ public:
                             format("Internal server error: {}", std::current_exception()),
                             reply::status_type::internal_server_error);
                 }
-                 // FIXME: what is this version number?
-                 rep->_content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + ret._type + "\"," +
-                         "\"message\":\"" + ret._msg + "\"}";
-                 rep->_status = ret._http_code;
-                 slogger.trace("api_handler error case: {}", rep->_content);
+                 generate_error_reply(*rep, ret);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
-             slogger.trace("api_handler success case");
             auto res = resf.get0();
-             if (res._body_writer) {
-                 rep->write_body("json", std::move(res._body_writer));
-             } else {
-                 rep->_content += res._res;
-             }
+             std::visit(overloaded_functor {
+                 [&] (const json::json_return_type& json_return_value) {
+                     slogger.trace("api_handler success case");
+                     if (json_return_value._body_writer) {
+                         rep->write_body("json", std::move(json_return_value._body_writer));
+                     } else {
+                         rep->_content += json_return_value._res;
+                     }
+                 },
+                 [&] (const api_error& err) {
+                     generate_error_reply(*rep, err);
+                 }
+             }, res);
+
             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
    }), _type("json") { }
@@ -115,18 +121,66 @@ public:
    }

 protected:
+    void generate_error_reply(reply& rep, const api_error& err) {
+        rep._content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + err._type + "\"," +
+                "\"message\":\"" + err._msg + "\"}";
+        rep._status = err._http_code;
+        slogger.trace("api_handler error case: {}", rep._content);
+    }
+
    future_handler_function _f_handle;
    sstring _type;
 };

-class health_handler : public handler_base {
-    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+// Handler base class: handle() runs the subclass's do_handle() inside with_gate() on the gate given at construction.
+class gated_handler : public handler_base {
+    seastar::gate& _gate;
+public:
+    gated_handler(seastar::gate& gate) : _gate(gate) {}
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0;
+    future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) final override {
+        auto guarded = [this, &path, req = std::move(req), rep = std::move(rep)] () mutable { return do_handle(path, std::move(req), std::move(rep)); };
+        return with_gate(_gate, std::move(guarded));
+    }
+};
+
+class health_handler : public gated_handler { // replies with status ok and the text "healthy: <Host header>"
+public:
+    health_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        rep->set_status(reply::status_type::ok);
        rep->write_body("txt", format("healthy: {}", req->get_header("Host")));
        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
    }
 };

+// Replies with a JSON array holding the addresses of the nodes in this
+// node's data center which the gossiper currently reports as alive.
+class local_nodelist_handler : public gated_handler {
+public:
+    local_nodelist_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        // All live cluster members are readily available from the gossiper (get_live_members()),
+        // but restricting the list to this DC takes the snitch and the topology as well:
+        sstring this_dc = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(utils::fb_utilities::get_broadcast_address());
+        std::unordered_set<gms::inet_address> dc_members = service::get_local_storage_service().get_token_metadata().
+                get_topology().get_datacenter_endpoints().at(this_dc);
+        rjson::value live_nodes = rjson::empty_array();
+        for (const auto& node : dc_members) {
+            if (!gms::get_local_gossiper().is_alive(node)) {
+                continue;
+            }
+            rjson::push_back(live_nodes, rjson::from_string(node.to_sstring()));
+        }
+        rep->set_status(reply::status_type::ok);
+        rep->set_content_type("json");
+        rep->_content = rjson::print(live_nodes);
+        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+    }
+};
+
 future<> server::verify_signature(const request& req) {
    if (!_enforce_authorization) {
        slogger.debug("Skipping authorization");
@@ -137,7 +191,7 @@ future<> server::verify_signature(const request& req) {
        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
-    if (host_it == req._headers.end()) {
+    if (authorization_it == req._headers.end()) {
        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
@@ -214,7 +268,8 @@ future<> server::verify_signature(const request& req) {
    });
 }

-future<json::json_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+    _executor._stats.total_operations++;
    sstring target = req->get_header(TARGET);
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
@@ -223,17 +278,32 @@ future<json::json_return_type> server::handle_api_request(std::unique_ptr<reques
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
-            _executor.local()._stats.unsupported_operations++;
+            _executor._stats.unsupported_operations++;
            throw api_error("UnknownOperationException",
                    format("Unsupported operation {}", op));
        }
-        //FIXME: Client state can provide more context, e.g. client's endpoint address
-        // We use unique_ptr because client_state cannot be moved or copied
-        return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()), [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
-            client_state->set_raw_keyspace(executor::KEYSPACE_NAME);
-            executor::maybe_trace_query(*client_state, op, req->content);
-            tracing::trace(client_state->get_trace_state(), op);
-            return callback_it->second(_executor.local(), *client_state, std::move(req));
+        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
+            //FIXME: Client state can provide more context, e.g. client's endpoint address
+            // We use unique_ptr because client_state cannot be moved or copied
+            return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()),
+                    [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
+                tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+                tracing::trace(trace_state, op);
+                // JSON parsing can allocate up to roughly 2x the size of the raw document, + a couple of bytes for maintenance.
+                // FIXME: by this time, the whole HTTP request was already read, so some memory is already occupied.
+                // Once HTTP allows working on streams, we should grab the permit *before* reading the HTTP payload.
+                size_t mem_estimate = req->content.size() * 3 + 8000;
+                auto units_fut = get_units(*_memory_limiter, mem_estimate);
+                if (_memory_limiter->waiters()) {
+                    ++_executor._stats.requests_blocked_memory;
+                }
+                return units_fut.then([this, callback_it = std::move(callback_it), &client_state, trace_state, req = std::move(req)] (semaphore_units<> units) mutable {
+                    return _json_parser.parse(req->content).then([this, callback_it = std::move(callback_it), &client_state, trace_state,
+                            units = std::move(units), req = std::move(req)] (rjson::value json_request) mutable {
+                        return callback_it->second(_executor, *client_state, trace_state, make_service_permit(std::move(units)), std::move(json_request), std::move(req)).finally([trace_state] {});
+                    });
+                });
+            });
        });
    });
 }
@@ -243,35 +313,88 @@ void server::set_routes(routes& r) {
        return handle_api_request(std::move(req));
    });

-    r.add(operation_type::POST, url("/"), req_handler);
-    r.add(operation_type::GET, url("/"), new health_handler);
+    r.put(operation_type::POST, "/", req_handler);
+    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
+    // The "/localnodes" request is a new Alternator feature, not supported by
+    // DynamoDB and not required for DynamoDB compatibility. It allows a
+    // client to enquire - using a trivial HTTP request without requiring
+    // authentication - the list of all live nodes in the same data center of
+    // the Alternator cluster. The client can use this list to balance its
+    // request load to all the nodes in the same geographical region.
+    // Note that this API exposes - openly without authentication - the
+    // information on the cluster's members inside one data center. We do not
+    // consider this to be a security risk, because an attacker can already
+    // scan an entire subnet for nodes responding to the health request,
+    // or even just scan for open ports.
+    r.put(operation_type::GET, "/localnodes", new local_nodelist_handler(_pending_requests));
 }

 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(seastar::sharded<executor>& e)
-        : _executor(e), _key_cache(1024, 1min, slogger), _enforce_authorization(false)
+server::server(executor& exec)
+        : _http_server("http-alternator")
+        , _https_server("https-alternator")
+        , _executor(exec)
+        , _key_cache(1024, 1min, slogger)
+        , _enforce_authorization(false)
+        , _enabled_servers{}
+        , _pending_requests{}
      , _callbacks{
-        {"CreateTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) {
-            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req)] { return e.create_table(client_state, req->content); }); }
-        },
-        {"DescribeTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_table(client_state, req->content); }},
-        {"DeleteTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_table(client_state, req->content); }},
-        {"PutItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.put_item(client_state, req->content); }},
-        {"UpdateItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.update_item(client_state, req->content); }},
-        {"GetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.get_item(client_state, req->content); }},
-        {"DeleteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_item(client_state, req->content); }},
-        {"ListTables", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
-        {"Scan", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.scan(client_state, req->content); }},
-        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
-        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, req->content); }},
-        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, req->content); }},
-        {"Query", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.query(client_state, req->content); }},
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tables(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.scan(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_endpoints(client_state, std::move(permit), std::move(json_request), req->get_header("Host"));
+        }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_write_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.query(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"TagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.tag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"UntagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.untag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
+        }},
    } {
 }

-future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization) {
+future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+        bool enforce_authorization, semaphore* memory_limiter) {
+    _memory_limiter = memory_limiter;
    _enforce_authorization = enforce_authorization;
    if (!port && !https_port) {
        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
@@ -279,25 +402,26 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    }
    return seastar::async([this, addr, port, https_port, creds] {
        try {
-            _executor.invoke_on_all([] (executor& e) {
-                return e.start();
-            }).get();
+            _executor.start().get();

            if (port) {
-                _control.start().get();
-                _control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _control.listen(socket_address{addr, *port}).get();
-                slogger.info("Alternator HTTP server listening on {} port {}", addr, *port);
+                set_routes(_http_server._routes);
+                _http_server.set_content_length_limit(server::content_length_limit);
+                _http_server.listen(socket_address{addr, *port}).get();
+                _enabled_servers.push_back(std::ref(_http_server));
            }
            if (https_port) {
-                _https_control.start().get();
-                _https_control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _https_control.server().invoke_on_all([creds] (http_server& serv) {
-                    return serv.set_tls_credentials(creds->build_server_credentials());
-                }).get();
-
-                _https_control.listen(socket_address{addr, *https_port}).get();
-                slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
+                set_routes(_https_server._routes);
+                _https_server.set_content_length_limit(server::content_length_limit);
+                _https_server.set_tls_credentials(creds->build_reloadable_server_credentials([](const std::unordered_set<sstring>& files, std::exception_ptr ep) {
+                    if (ep) {
+                        slogger.warn("Exception loading {}: {}", files, ep);
+                    } else {
+                        slogger.info("Reloaded {}", files);
+                    }
+                }).get0());
+                _https_server.listen(socket_address{addr, *https_port}).get();
+                _enabled_servers.push_back(std::ref(_https_server));
            }
        } catch (...) {
            slogger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
@@ -309,5 +433,55 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    });
 }

+future<> server::stop() { // Shuts the server down in three ordered steps: stop HTTP servers, close the request gate, stop the JSON parser.
+    return parallel_for_each(_enabled_servers, [] (http_server& server) { // _enabled_servers holds only the servers that init() actually started listening on
+        return server.stop();
+    }).then([this] {
+        return _pending_requests.close(); // reached only once every server.stop() future has resolved successfully
+    }).then([this] {
+        return _json_parser.stop(); // final step: asks the parsing loop to exit and returns its future
+    });
+}
+
+server::json_parser::json_parser() : _run_parse_json_thread(async([this] { // Launches, via async(), the loop that parses documents handed over by parse().
+        while (true) {
+            _document_waiting.wait().get(); // block until parse() or stop() signals
+            if (_as.abort_requested()) { // set by stop(); this is the only way out of the loop
+                return;
+            }
+            try {
+                _parsed_document = rjson::parse_yieldable(_raw_document);
+                _current_exception = nullptr; // clear any error left over from a previous document
+            } catch (...) {
+                _current_exception = std::current_exception(); // kept so parse() can report it as a failed future
+            }
+            _document_parsed.signal(); // wake the parse() call waiting for this result (success or failure)
+        }
+    })) {
+}
+
+future<rjson::value> server::json_parser::parse(std::string_view content) { // Parses content into an rjson::value: small inputs directly, larger ones through the loop started by the constructor.
+    if (content.size() < yieldable_parsing_threshold) {
+        return make_ready_future<rjson::value>(rjson::parse(content)); // NOTE(review): a throw from rjson::parse would escape synchronously here, not as a failed future - confirm callers cope
+    }
+    return with_semaphore(_parsing_sem, 1, [this, content] { // _parsing_sem is initialised with 1 unit, so only one document is handed over at a time
+        _raw_document = content; // stores a view, not a copy - assumes the caller keeps the buffer alive until the returned future resolves (TODO confirm)
+        _document_waiting.signal(); // hand the document to the parsing loop
+        return _document_parsed.wait().then([this] {
+            if (_current_exception) { // the loop caught an exception while parsing this document
+                return make_exception_future<rjson::value>(_current_exception);
+            }
+            return make_ready_future<rjson::value>(std::move(_parsed_document));
+        });
+    });
+}
+
+future<> server::json_parser::stop() { // Requests abort of the parsing loop and returns the future of that loop.
+    _as.request_abort();
+    _document_waiting.signal(); // wake the loop so it observes abort_requested() and returns
+    _document_parsed.broken(); // NOTE(review): presumably fails any parse() still waiting on this condition variable - confirm
+    return std::move(_run_parse_json_thread);
+}
+
 }

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -26,28 +26,57 @@
 #include <seastar/http/httpd.hh>
 #include <seastar/net/tls.hh>
 #include <optional>
-#include <alternator/auth.hh>
+#include "alternator/auth.hh"
+#include "utils/small_vector.hh"
+#include <seastar/core/units.hh>

 namespace alternator {

 class server {
-    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, std::unique_ptr<request>)>;
+    static constexpr size_t content_length_limit = 16*MB;
+    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
+            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

-    seastar::httpd::http_server_control _control;
-    seastar::httpd::http_server_control _https_control;
-    seastar::sharded<executor>& _executor;
+    http_server _http_server;
+    http_server _https_server;
+    executor& _executor;
+
    key_cache _key_cache;
    bool _enforce_authorization;
+    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
+    gate _pending_requests;
    alternator_callbacks_map _callbacks;
-public:
-    server(seastar::sharded<executor>& executor);

-    seastar::future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization);
+    semaphore* _memory_limiter;
+
+    class json_parser { // JSON parsing helper: parse() handles small inputs directly and hands larger ones to a loop started in the constructor.
+        static constexpr size_t yieldable_parsing_threshold = 16*KB; // inputs smaller than this are parsed with a direct rjson::parse call
+        std::string_view _raw_document; // input handed from parse() to the parsing loop (a view, not a copy)
+        rjson::value _parsed_document; // result handed back from the parsing loop to parse()
+        std::exception_ptr _current_exception; // non-null when parsing the last document threw
+        semaphore _parsing_sem{1}; // single unit: parse() holds it while a document is in flight
+        condition_variable _document_waiting; // signalled by parse() (new document) and by stop()
+        condition_variable _document_parsed; // signalled by the loop after each document; broken by stop()
+        abort_source _as; // stop() requests abort through it; the loop checks it after every wake-up
+        future<> _run_parse_json_thread; // future of the async() loop started by the constructor; stop() returns it
+    public:
+        json_parser();
+        future<rjson::value> parse(std::string_view content);
+        future<> stop();
+    };
+    json_parser _json_parser;
+
+public:
+    server(executor& executor);
+
+    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+            bool enforce_authorization, semaphore* memory_limiter);
+    future<> stop();
 private:
    void set_routes(seastar::httpd::routes& r);
    future<> verify_signature(const seastar::httpd::request& r);
-    future<json::json_return_type> handle_api_request(std::unique_ptr<request>&& req);
+    future<executor::request_return_type> handle_api_request(std::unique_ptr<request>&& req);
 };

 }
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -85,6 +85,12 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of total operations via Alternator API")),
            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
                    seastar::metrics::description("number of performed read-before-write operations")),
+            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
+                    seastar::metrics::description("number of writes that used LWT")),
+            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
+                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements")),
+            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
+                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure.")),
            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
                    seastar::metrics::description("number of rows read during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -84,6 +84,9 @@ public:
    uint64_t total_operations = 0;
    uint64_t unsupported_operations = 0;
    uint64_t reads_before_write = 0;
+    uint64_t write_using_lwt = 0;
+    uint64_t shard_bounce_for_lwt = 0;
+    uint64_t requests_blocked_memory = 0;
    // CQL-derived stats
    cql3::cql_stats cql_stats;
 private:
--- a/alternator/tags_extension.hh
+++ b/alternator/tags_extension.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "schema.hh"
+#include "db/extensions.hh"
+
+namespace alternator {
+
+class tags_extension : public schema_extension { // Schema extension that carries a string-to-string map and is registered under NAME.
+public:
+    static constexpr auto NAME = "scylla_tags";
+
+    tags_extension() = default;
+    explicit tags_extension(std::map<sstring, sstring> tags) : _tags(std::move(tags)) {} // by value so std::move really moves; on the former const& it silently copied
+    explicit tags_extension(bytes b) : _tags(tags_extension::deserialize(b)) {} // rebuilds the map from the buffer produced by serialize()
+    explicit tags_extension(const sstring& s) { // always throws: there is no string form to build tags from
+        throw std::logic_error("Cannot create tags from string");
+    }
+    bytes serialize() const override { // serializes the whole map into a bytes buffer
+        return ser::serialize_to_buffer<bytes>(_tags);
+    }
+    static std::map<sstring, sstring> deserialize(bytes_view buffer) { // counterpart of serialize()
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const std::map<sstring, sstring>& tags() const { // read-only access to the stored map
+        return _tags;
+    }
+private:
+    std::map<sstring, sstring> _tags;
+};
+
+}
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -70,7 +70,7 @@
            {
               "method":"POST",
               "summary":"Force a major compaction of this column family",
-               "type":"string",
+               "type":"void",
               "nickname":"force_major_compaction",
               "produces":[
                  "application/json"
@@ -380,16 +380,54 @@
         "operations":[
            {
               "method":"GET",
-               "summary":"check if the auto compaction disabled",
+               "summary":"check if the auto_compaction property is enabled for a given table",
               "type":"boolean",
-               "nickname":"is_auto_compaction_disabled",
+               "nickname":"get_auto_compaction",
               "produces":[
                  "application/json"
               ],
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keyspace:name format",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"POST",
+               "summary":"Enable table auto compaction",
+               "type":"void",
+               "nickname":"enable_auto_compaction",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Disable table auto compaction",
+               "type":"void",
+               "nickname":"disable_auto_compaction",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -0,0 +1,90 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/error_injection",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/v2/error_injection/injection/{injection}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Activate an injection that triggers an error in code",
+               "type":"void",
+               "nickname":"enable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name, should correspond to an injection added in code",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"one_shot",
+                     "description":"boolean flag indicating whether the injection should be enabled to trigger only once",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate an injection previously activated by the API",
+               "type":"void",
+               "nickname":"disable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/v2/error_injection/injection",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"List all enabled injections on all shards, i.e. injections that will trigger an error in the code",
+               "type":"array",
+               "items":{
+                  "type":"string"
+               },
+               "nickname":"get_enabled_injections_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate all injections previously activated on all shards by the API",
+               "type":"void",
+               "nickname":"disable_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -641,6 +641,21 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/cas_write/failed_read_round_optimization",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get cas write metrics",
+          "type": "long",
+          "nickname": "get_cas_write_metrics_failed_read_round_optimization",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/cas_read/unfinished_commit",
      "operations": [
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -511,6 +511,21 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/cdc_streams_check_and_repair",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Checks that CDC streams reflect current cluster topology and regenerates them if not.",
+               "type":"void",
+               "nickname":"cdc_streams_check_and_repair",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      },
      {
         "path":"/storage_service/snapshots",
         "operations":[
@@ -582,7 +597,15 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Comma-separated keyspace names whose snapshots will be deleted",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"An optional table name whose snapshot will be deleted",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api.cc
+++ b/api/api.cc
@@ -36,6 +36,7 @@
 #include "endpoint_snitch.hh"
 #include "compaction_manager.hh"
 #include "hinted_handoff.hh"
+#include "error_injection.hh"
 #include <seastar/http/exception.hh>
 #include "stream_manager.hh"
 #include "system.hh"
@@ -68,13 +69,19 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        set_config(rb02, ctx, r);
        rb->register_function(r, "system",
                "The system related API");
        set_system(ctx, r);
    });
 }

+future<> set_server_config(http_context& ctx) { // Registers the config routes (set_config) on ctx.http_server using a "/v2" registry builder.
+    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
+    return ctx.http_server.set_routes([&ctx, rb02](routes& r) { // rb02 is captured by value so the shared_ptr stays alive inside the callback
+        set_config(rb02, ctx, r);
+    });
+}
+
 static future<> register_api(http_context& ctx, const sstring& api_name,
        const sstring api_desc,
        std::function<void(http_context& ctx, routes& r)> f) {
@@ -86,10 +93,30 @@ static future<> register_api(http_context& ctx, const sstring& api_name,
    });
 }

+future<> set_transport_controller(http_context& ctx, cql_transport::controller& ctl) { // Installs the routes that use the CQL transport controller on ctx.http_server.
+    return ctx.http_server.set_routes([&ctx, &ctl] (routes& r) { set_transport_controller(ctx, r, ctl); }); // ctl is captured by reference - NOTE(review): presumably must stay alive until unset_transport_controller(), confirm
+}
+
+future<> unset_transport_controller(http_context& ctx) { // Counterpart of set_transport_controller(): runs the routes-level unset_transport_controller() on ctx.http_server.
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_transport_controller(ctx, r); });
+}
+
+future<> set_rpc_controller(http_context& ctx, thrift_controller& ctl) { // Installs the routes that use the thrift controller on ctx.http_server.
+    return ctx.http_server.set_routes([&ctx, &ctl] (routes& r) { set_rpc_controller(ctx, r, ctl); }); // ctl is captured by reference - NOTE(review): presumably must stay alive until unset_rpc_controller(), confirm
+}
+
+future<> unset_rpc_controller(http_context& ctx) { // Counterpart of set_rpc_controller(): runs the routes-level unset_rpc_controller() on ctx.http_server.
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_rpc_controller(ctx, r); });
+}
+
 future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

+future<> set_server_snapshot(http_context& ctx) { // Registers the snapshot routes (set_snapshot) on ctx.http_server; returns the future from set_routes().
+    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
+}
+
 future<> set_server_snitch(http_context& ctx) {
    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
 }
@@ -153,6 +180,9 @@ future<> set_server_done(http_context& ctx) {
        rb->register_function(r, "collectd",
                "The collectd API");
        set_collectd(ctx, r);
+        rb->register_function(r, "error_injection",
+                "The error injection API");
+        set_error_injection(ctx, r);
    });
 }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -24,6 +24,9 @@
 #include <seastar/http/httpd.hh>

 namespace service { class load_meter; }
+namespace locator { class token_metadata; }
+namespace cql_transport { class controller; }
+class thrift_controller;

 namespace api {

@@ -34,16 +37,24 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
+    sharded<locator::token_metadata>& token_metadata;
+
    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm)
-            : db(_db), sp(_sp), lmeter(_lm) {
+            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
+            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
    }
 };

 future<> set_server_init(http_context& ctx);
+future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_transport_controller(http_context& ctx, cql_transport::controller& ctl);
+future<> unset_transport_controller(http_context& ctx);
+future<> set_rpc_controller(http_context& ctx, thrift_controller& ctl);
+future<> unset_rpc_controller(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx);
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -208,9 +208,11 @@ void set_cache_service(http_context& ctx, routes& r) {
    });

    cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().region().occupancy().used_space();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_row_hits.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -251,15 +253,19 @@ void set_cache_service(http_context& ctx, routes& r) {
    cs::get_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        // In origin row size is the weighted size.
        // We currently do not support weights, so we use num entries instead
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().partitions();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().partitions();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().partitions();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().partitions();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_counter_capacity.set(r, [] (std::unique_ptr<request> req) {
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -64,7 +64,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

-        auto id = make_shared<scollectd::type_instance_id>(req->param["pluginid"],
+        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -650,7 +650,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -658,7 +658,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -666,7 +666,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -674,7 +674,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -682,7 +682,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -690,7 +690,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -804,14 +804,14 @@ void set_column_family(http_context& ctx, routes& r) {

    cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
-            return cf.get_stats().estimated_cas_propose;
+            return cf.get_stats().estimated_cas_accept;
        },
        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });

    cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
-            return cf.get_stats().estimated_cas_commit;
+            return cf.get_stats().estimated_cas_learn;
        },
        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });
@@ -839,11 +839,26 @@ void set_column_family(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    cf::is_auto_compaction_disabled.set(r, [] (const_req req) {
-        // FIXME
-        // currently auto compaction is disable
-        // it should be changed when it would have an API
-        return true;
+    cf::get_auto_compaction.set(r, [&ctx] (const_req req) {
+        const utils::UUID& uuid = get_uuid(req.param["name"], ctx.db.local());
+        column_family& cf = ctx.db.local().find_column_family(uuid);
+        return !cf.is_auto_compaction_disabled_by_user();
+    });
+
+    cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            cf.enable_auto_compaction();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            cf.disable_auto_compaction();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    cf::get_built_indexes.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -994,5 +1009,15 @@ void set_column_family(http_context& ctx, routes& r) {
        });
    });

+    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        if (req->get_query_param("split_output") != "") {
+            fail(unimplemented::cause::API);
+        }
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            return cf.compact_all_sstables();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
 }
 }
--- a/api/commitlog.cc
+++ b/api/commitlog.cc
@@ -20,7 +20,7 @@
 */

 #include "commitlog.hh"
-#include <db/commitlog/commitlog.hh>
+#include "db/commitlog/commitlog.hh"
 #include "api/api-doc/commitlog.json.hh"
 #include "database.hh"
 #include <vector>
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "api/api-doc/error_injection.json.hh"
+#include "api/api.hh"
+
+#include <seastar/http/exception.hh>
+#include "log.hh"
+#include "utils/error_injection.hh"
+#include "seastar/core/future-util.hh"
+
+namespace api {
+
+namespace hf = httpd::error_injection_json;
+
+void set_error_injection(http_context& ctx, routes& r) {
+
+    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+        bool one_shot = req->get_query_param("one_shot") == "True";
+        auto& errinj = utils::get_local_injector();
+        return errinj.enable_on_all(injection, one_shot).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        auto ret = errinj.enabled_injections_on_all();
+        return make_ready_future<json::json_return_type>(ret);
+    });
+
+    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+
+        auto& errinj = utils::get_local_injector();
+        return errinj.disable_on_all(injection).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        return errinj.disable_on_all().then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+}
+
+} // namespace api
--- a/api/error_injection.hh
+++ b/api/error_injection.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "api.hh"
+
+namespace api {
+
+void set_error_injection(http_context& ctx, routes& r);
+
+}
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -21,7 +21,7 @@

 #include "gossiper.hh"
 #include "api/api-doc/gossiper.json.hh"
-#include <gms/gossiper.hh>
+#include "gms/gossiper.hh"

 namespace api {
 using namespace json;
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -27,6 +27,7 @@
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "database.hh"
+#include "seastar/core/scheduling_specific.hh"

 namespace api {

@@ -34,12 +35,70 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

-static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
-    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(),
-            std::plus<utils::rate_moving_average>());
+
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param mapper -  the internal mapper that is used to map the internal
+ * stat class into a value of type `V`.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename InnerMapper>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        InnerMapper mapper, Reducer reducer, V initial_value) {
+    return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) {
+        return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>(
+                mapper, reducer, initial_value, sp.get_stats_key());
+    }, initial_value, reducer);
 }

-static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param f - a field pointer which is the implicit internal mapper.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename F>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        V F::*f, Reducer reducer, V initial_value) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) {
+        return stats.*f;
+    }, reducer, initial_value);
+}
+
+/**
+ * A partial specialization of sum_stats for the storage proxy
+ * case, where the get stats function doesn't return a
+ * stats object with fields but a per scheduling group
+ * stats object. The name was also changed, since partial
+ * specialization of functions is not supported in C++.
+ *
+ */
+template<typename V, typename F>
+future<json::json_return_type>  sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) {
+        return make_ready_future<json::json_return_type>(val);
+    });
+}
+
+
+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average>(), utils::rate_moving_average());
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        httpd::utils_json::rate_moving_average m;
        m = val;
@@ -51,29 +110,89 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
    return timer_to_json(utils::rate_moving_average_and_histogram());
 }

-static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        return make_ready_future<json::json_return_type>(val.count);
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, utils::estimated_histogram(),
-            utils::estimated_histogram_merge).then([](const utils::estimated_histogram& val) {
+utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimated_histogram& val) {
+    utils_json::estimated_histogram res;
+    for (size_t i = 0; i < val.size(); i++) {
+        res.buckets.push(val.get(i));
+        res.bucket_offsets.push(val.get_bucket_lower_limit(i));
+    }
+    return res;
+}
+
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::time_estimated_histogram service::storage_proxy_stats::stats::*f) {
+
+    return two_dimensional_map_reduce(ctx.sp, f, utils::time_estimated_histogram_merge,
+            utils::time_estimated_histogram()).then([](const utils::time_estimated_histogram& val) {
+        return make_ready_future<json::json_return_type>(time_to_json_histogram(val));
+    });
+}
+
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
+            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
-            std::plus<double>()).then([](double val) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
+            return (stats.*f).hist.mean * (stats.*f).hist.count;
+        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

+/**
+ * A partial specialization of sum_histogram_stats
+ * for the storage proxy case, where the get stats
+ * function doesn't return a stats object with
+ * fields but a per scheduling group stats object.
+ * The name was also changed, since function partial
+ * specialization is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_histogram_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).hist;
+    }, std::plus<utils::ihistogram>(), utils::ihistogram()).
+            then([](const utils::ihistogram& val) {
+        return make_ready_future<json::json_return_type>(to_json(val));
+    });
+}
+
+/**
+ * A partial specialization of sum_timer_stats for the
+ * storage proxy case, where the get stats function
+ * doesn't return a stats object with fields but a
+ * per scheduling group stats object. The name
+ * was also changed, since partial function specialization
+ * is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_timer_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average_and_histogram>(),
+            utils::rate_moving_average_and_histogram()).then([](const utils::rate_moving_average_and_histogram& val) {
+        return make_ready_future<json::json_return_type>(timer_to_json(val));
+    });
+}
+
 void set_storage_proxy(http_context& ctx, routes& r) {
    sp::get_total_hints.set(r, [](std::unique_ptr<request> req)  {
        //TBD
@@ -223,15 +342,15 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_attempts);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_blocking);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_background);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

    sp::get_schema_versions.set(r, [](std::unique_ptr<request> req)  {
@@ -275,6 +394,10 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
+    });
+
    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });
@@ -284,71 +407,71 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::write);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });
    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
@@ -367,30 +490,30 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_read);
    });

    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::read);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_write);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_write);
    });

    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::write);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::range);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -41,8 +41,8 @@
 #include "sstables/sstables.hh"
 #include "database.hh"
 #include "db/extensions.hh"
-
-sstables::sstable::version_types get_highest_supported_format();
+#include "transport/controller.hh"
+#include "thrift/controller.hh"

 namespace api {

@@ -56,57 +56,115 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r;
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (auto det : d._endpoint_details) {
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") {
+            ed.rack = det._rack;
        }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
    }
-    return res;
+    return r;
+}
+
+using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
+
+static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
+    return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto column_families = split_cf(req->get_query_param("cf"));
+        if (column_families.empty()) {
+            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+        }
+        return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
+    };
+}
+
+future<json::json_return_type> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
+    if (tables.empty()) {
+        tables = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+    }
+
+    return service::get_local_storage_service().set_tables_autocompaction(keyspace, tables, enabled).then([]{
+        return make_ready_future<json::json_return_type>(json_void());
+    });
+}
+
+void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl) {
+    ss::start_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+        return ctl.start_server().then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::stop_native_transport.set(r, [&ctl](std::unique_ptr<request> req) {
+        return ctl.stop_server().then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::is_native_transport_running.set(r, [&ctl] (std::unique_ptr<request> req) {
+        return ctl.is_server_running().then([] (bool running) {
+            return make_ready_future<json::json_return_type>(running);
+        });
+    });
+}
+
+void unset_transport_controller(http_context& ctx, routes& r) {
+    ss::start_native_transport.unset(r);
+    ss::stop_native_transport.unset(r);
+    ss::is_native_transport_running.unset(r);
+}
+
+void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl) {
+    ss::stop_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
+        return ctl.stop_server().then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::start_rpc_server.set(r, [&ctl](std::unique_ptr<request> req) {
+        return ctl.start_server().then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::is_rpc_server_running.set(r, [&ctl] (std::unique_ptr<request> req) {
+        return ctl.is_server_running().then([] (bool running) {
+            return make_ready_future<json::json_return_type>(running);
+        });
+    });
+}
+
+void unset_rpc_controller(http_context& ctx, routes& r) {
+    ss::stop_rpc_server.unset(r);
+    ss::start_rpc_server.unset(r);
+    ss::is_rpc_server_running.unset(r);
 }

 void set_storage_service(http_context& ctx, routes& r) {
-    using ks_cf_func = std::function<future<json::json_return_type>(std::unique_ptr<request>, sstring, std::vector<sstring>)>;
-
-    auto wrap_ks_cf = [&ctx](ks_cf_func f) {
-        return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
-            auto keyspace = validate_keyspace(ctx, req->param);
-            auto column_families = split_cf(req->get_query_param("cf"));
-            if (column_families.empty()) {
-                column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
-            }
-            return f(std::move(req), std::move(keyspace), std::move(column_families));
-        };
-    };
-
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
            return make_ready_future<json::json_return_type>(id.to_sstring());
        });
    });

-    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

-    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -124,8 +182,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        }));
    });

-    ss::get_leaving_nodes.set(r, [](const_req req) {
-        return container_to_vec(service::get_local_storage_service().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -133,8 +191,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -177,19 +235,18 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

-    ss::get_host_id_map.set(r, [](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(service::get_local_storage_service().
-                get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -222,64 +279,12 @@ void set_storage_service(http_context& ctx, routes& r) {
                req.get_query_param("key")));
    });

-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_snapshot_details().then([] (auto result) {
-            std::vector<ss::snapshots> res;
-            for (auto& map: result) {
-                ss::snapshots all_snapshots;
-                all_snapshots.key = map.first;
-
-                std::vector<ss::snapshot> snapshot;
-                for (auto& cf: map.second) {
-                    ss::snapshot s;
-                    s.ks = cf.ks;
-                    s.cf = cf.cf;
-                    s.live = cf.live;
-                    s.total = cf.total;
-                    snapshot.push_back(std::move(s));
-                }
-                all_snapshots.value = std::move(snapshot);
-                res.push_back(std::move(all_snapshots));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-
-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_family.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
-        } else {
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
-        }
-        return resp.then([] {
+    ss::cdc_streams_check_and_repair.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return service::get_local_storage_service().check_and_repair_cdc_streams().then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
-            return make_ready_future<json::json_return_type>(size);
-        });
-    });
-
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = split_cf(req->get_query_param("cf"));
@@ -317,8 +322,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -326,32 +331,7 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::scrub.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
-        // TODO: respect this
-        auto skip_corrupted = req->get_query_param("skip_corrupted");
-
-        auto f = make_ready_future<>();
-        if (!req_param<bool>(*req, "disable_snapshot", false)) {
-            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
-            });
-        }
-
-        return f.then([&ctx, keyspace, column_families] {
-            return ctx.db.invoke_on_all([=] (database& db) {
-                return do_for_each(column_families, [=, &db](sstring cfname) {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(&cf);
-                });
-            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
-        });
-    }));
-
-    ss::upgrade_sstables.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

        return ctx.db.invoke_on_all([=] (database& db) {
@@ -569,42 +549,6 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::stop_rpc_server.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().stop_rpc_server().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::start_rpc_server.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().start_rpc_server().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::is_rpc_server_running.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().is_rpc_server_running().then([] (bool running) {
-            return make_ready_future<json::json_return_type>(running);
-        });
-    });
-
-    ss::start_native_transport.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().start_native_transport().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::stop_native_transport.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().stop_native_transport().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::is_native_transport_running.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().is_native_transport_running().then([] (bool running) {
-            return make_ready_future<json::json_return_type>(running);
-        });
-    });
-
    ss::join_ring.set(r, [](std::unique_ptr<request> req) {
        return make_ready_future<json::json_return_type>(json_void());
    });
@@ -734,7 +678,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
-        return futurize<json::json_return_type>::apply([probability] {
+        return futurize_invoke([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
                local_tracing.set_trace_probability(real_prob);
@@ -789,19 +733,17 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
-        auto column_family = req->get_query_param("cf");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto tables = split_cf(req->get_query_param("cf"));
+
+        return set_tables_autocompaction(ctx, keyspace, tables, true);
    });

    ss::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
-        auto column_family = req->get_query_param("cf");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto tables = split_cf(req->get_query_param("cf"));
+
+        return set_tables_autocompaction(ctx, keyspace, tables, false);
    });

    ss::deliver_hints.set(r, [](std::unique_ptr<request> req) {
@@ -1037,4 +979,107 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

+void set_snapshot(http_context& ctx, routes& r) {
+    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
+        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
+            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
+                return s.write("[").then([&s, &first] {
+                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
+                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
+                            return do_for_each(result, [&s, &result,&first](std::tuple<sstring, std::vector<service::storage_service::snapshot_details>>&& map){
+                                return do_with(ss::snapshots(), [&s, &first, &result, &map](ss::snapshots& all_snapshots) {
+                                    all_snapshots.key = std::get<0>(map);
+                                    future<> f = first ? make_ready_future<>() : s.write(", ");
+                                    first = false;
+                                    std::vector<ss::snapshot> snapshot;
+                                    for (auto& cf: std::get<1>(map)) {
+                                        ss::snapshot snp;
+                                        snp.ks = cf.ks;
+                                        snp.cf = cf.cf;
+                                        snp.live = cf.live;
+                                        snp.total = cf.total;
+                                        snapshot.push_back(std::move(snp));
+                                    }
+                                    all_snapshots.value = std::move(snapshot);
+                                    return f.then([&s, &all_snapshots] {
+                                        return all_snapshots.write(s);
+                                    });
+                                });
+                            });
+                        });
+                    }).then([&s] {
+                        return s.write("]").then([&s] {
+                            return s.close();
+                        });
+                    });
+                });
+            });
+        };
+        return make_ready_future<json::json_return_type>(std::move(f));
+    });
+
+    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_families = split(req->get_query_param("cf"), ",");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+
+        auto resp = make_ready_future<>();
+        if (column_families.empty()) {
+            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+        } else {
+            if (keynames.empty()) {
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
+            if (keynames.size() > 1) {
+                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+            }
+            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_families, tag);
+        }
+        return resp.then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
+        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+            return make_ready_future<json::json_return_type>(size);
+        });
+    });
+
+    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);
+
+        auto f = make_ready_future<>();
+        if (!req_param<bool>(*req, "disable_snapshot", false)) {
+            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
+            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
+                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            });
+        }
+
+        return f.then([&ctx, keyspace, column_families, skip_corrupted] {
+            return ctx.db.invoke_on_all([=] (database& db) {
+                return do_for_each(column_families, [=, &db](sstring cfname) {
+                    auto& cm = db.get_compaction_manager();
+                    auto& cf = db.find_column_family(keyspace, cfname);
+                    return cm.perform_sstable_scrub(&cf, skip_corrupted);
+                });
+            });
+        }).then([]{
+            return make_ready_future<json::json_return_type>(0);
+        });
+    }));
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -23,8 +23,16 @@

 #include "api.hh"

+namespace cql_transport { class controller; }
+class thrift_controller;
+
 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl);
+void unset_transport_controller(http_context& ctx, routes& r);
+void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl);
+void unset_rpc_controller(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r);

 }
--- a/api/system.cc
+++ b/api/system.cc
@@ -22,6 +22,7 @@
 #include "api/api-doc/system.json.hh"
 #include "api/api.hh"

+#include <seastar/core/reactor.hh>
 #include <seastar/http/exception.hh>
 #include "log.hh"

--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -21,6 +21,7 @@

 #include "atomic_cell.hh"
 #include "atomic_cell_or_collection.hh"
+#include "counters.hh"
 #include "types.hh"

 /// LSA mirator for cells with irrelevant type
@@ -218,7 +219,9 @@ std::ostream&
 operator<<(std::ostream& os, const atomic_cell_view& acv) {
    if (acv.is_live()) {
        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
-            to_hex(acv.value().linearize()),
+            acv.is_counter_update()
+                    ? "counter_update_value=" + to_sstring(acv.counter_update_value())
+                    : to_hex(acv.value().linearize()),
            acv.timestamp(),
            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
@@ -238,8 +241,21 @@ operator<<(std::ostream& os, const atomic_cell_view::printer& acvp) {
    auto& type = acvp._type;
    auto& acv = acvp._cell;
    if (acv.is_live()) {
+        std::ostringstream cell_value_string_builder;
+        if (type.is_counter()) {
+            if (acv.is_counter_update()) {
+                cell_value_string_builder << "counter_update_value=" << acv.counter_update_value();
+            } else {
+                cell_value_string_builder << "shards: ";
+                counter_cell_view::with_linearized(acv, [&cell_value_string_builder] (counter_cell_view& ccv) {
+                    cell_value_string_builder << ::join(", ", ccv.shards());
+                });
+            }
+        } else {
+            cell_value_string_builder << type.to_string(acv.value().linearize());
+        }
        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
-            type.to_string(acv.value().linearize()),
+            cell_value_string_builder.str(),
            acv.timestamp(),
            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -29,7 +29,6 @@
 #include <seastar/net//byteorder.hh>
 #include <cstdint>
 #include <iosfwd>
-#include <seastar/util/gcc6-concepts.hh>
 #include "data/cell.hh"
 #include "data/schema_info.hh"
 #include "imr/utils.hh"
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -52,7 +52,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authenticator_name();
    }

--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -49,7 +49,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authorizer_name();
    }

--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -96,7 +96,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    virtual bool require_authentication() const = 0;

--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -100,7 +100,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    ///
    /// Query for the permissions granted directly to a role for a particular \ref resource (and not any of its
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -59,22 +59,22 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-future<> create_metadata_table_if_missing(
+static future<> create_metadata_table_if_missing_impl(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
        ::service::migration_manager& mm) {
    static auto ignore_existing = [] (seastar::noncopyable_function<future<>()> func) {
-        return futurize_apply(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
+        return futurize_invoke(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
    };
    auto& db = qp.db();
-    auto parsed_statement = static_pointer_cast<cql3::statements::raw::cf_statement>(
-            cql3::query_processor::parse_statement(cql));
+    auto parsed_statement = cql3::query_processor::parse_statement(cql);
+    auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);

-    parsed_statement->prepare_keyspace(meta::AUTH_KS);
+    parsed_cf_statement.prepare_keyspace(meta::AUTH_KS);

    auto statement = static_pointer_cast<cql3::statements::create_table_statement>(
-            parsed_statement->prepare(db, qp.get_cql_stats())->statement);
+            parsed_cf_statement.prepare(db, qp.get_cql_stats())->statement);

    const auto schema = statement->get_cf_meta_data(qp.db());
    const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
@@ -85,7 +85,14 @@ future<> create_metadata_table_if_missing(
    return ignore_existing([&mm, table = std::move(table)] () {
        return mm.announce_new_column_family(table, false);
    });
+}

+future<> create_metadata_table_if_missing(
+        std::string_view table_name,
+        cql3::query_processor& qp,
+        std::string_view cql,
+        ::service::migration_manager& mm) noexcept {
+    return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
 }

 future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -27,9 +27,10 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/util/noncopyable_function.hh>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>
 #include <seastar/core/resource.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/smp.hh>

 #include "log.hh"
 #include "seastarx.hh"
@@ -61,7 +62,7 @@ extern const sstring AUTH_PACKAGE_NAME;

 template <class Task>
 future<> once_among_shards(Task&& f) {
-    if (engine().cpu_id() == 0u) {
+    if (this_shard_id() == 0u) {
        return f();
    }

@@ -79,7 +80,7 @@ future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor&,
        std::string_view cql,
-        ::service::migration_manager&);
+        ::service::migration_manager&) noexcept;

 future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -51,7 +51,7 @@ extern "C" {

 #include <boost/algorithm/string/join.hpp>
 #include <boost/range.hpp>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
@@ -101,7 +101,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 future<bool> default_authorizer::any_granted() const {
    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::AUTH_KS, PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -115,7 +115,7 @@ future<> default_authorizer::migrate_legacy_metadata() const {
    alogger.info("Starting migration of legacy permissions metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -195,7 +195,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
            ROLE_NAME,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -224,7 +224,7 @@ default_authorizer::modify(
                    ROLE_NAME,
                    RESOURCE_NAME),
            [this, &role_name, set, &resource](const auto& query) {
-        return _qp.process(
+        return _qp.execute_internal(
                query,
                db::consistency_level::ONE,
                internal_distributed_timeout_config(),
@@ -249,7 +249,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            meta::AUTH_KS,
            PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -276,7 +276,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) const {
            PERMISSIONS_CF,
            ROLE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -296,7 +296,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
            PERMISSIONS_CF,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -313,7 +313,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
                        ROLE_NAME,
                        RESOURCE_NAME);

-                return _qp.process(
+                return _qp.execute_internal(
                        query,
                        db::consistency_level::LOCAL_ONE,
                        infinite_timeout_config,
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return default_authorizer_name();
    }

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -48,7 +48,7 @@
 #include <optional>

 #include <boost/algorithm/cxx11/all_of.hpp>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
@@ -96,10 +96,13 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }

-static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-        meta::roles_table::qualified_name(),
-        SALTED_HASH,
-        meta::roles_table::role_col_name);
+static const sstring& update_row_query() {
+    static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
+            meta::roles_table::qualified_name(),
+            SALTED_HASH,
+            meta::roles_table::role_col_name);
+    return update_row_query;
+}

 static const sstring legacy_table_name{"credentials"};

@@ -111,7 +114,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -119,8 +122,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);

-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    consistency_for_user(username),
                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
@@ -136,8 +139,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 future<> password_authenticator::create_default_if_missing() const {
    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
@@ -194,7 +197,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
    return db::consistency_level::LOCAL_ONE;
 }

-const sstring& password_authenticator::qualified_java_name() const {
+std::string_view password_authenticator::qualified_java_name() const {
    return password_authenticator_name();
 }

@@ -227,13 +230,13 @@ future<authenticated_user> password_authenticator::authenticate(
    // obsolete prepared statements pretty quickly.
    // Rely on query processing caching statements instead, and lets assume
    // that a map lookup string->statement is not gonna kill us much.
-    return futurize_apply([this, username, password] {
+    return futurize_invoke([this, username, password] {
        static const sstring query = format("SELECT {} FROM {} WHERE {} = ?",
                SALTED_HASH,
                meta::roles_table::qualified_name(),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_user(username),
                internal_distributed_timeout_config(),
@@ -267,8 +270,8 @@ future<> password_authenticator::create(std::string_view role_name, const authen
        return make_ready_future<>();
    }

-    return _qp.process(
-            update_row_query,
+    return _qp.execute_internal(
+            update_row_query(),
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
@@ -284,7 +287,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
            SALTED_HASH,
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
@@ -297,7 +300,7 @@ future<> password_authenticator::drop(std::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query, consistency_for_user(name),
            internal_distributed_timeout_config(),
            {sstring(name)}).discard_result();
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override;
+    virtual std::string_view qualified_java_name() const override;

    virtual bool require_authentication() const override;

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception {
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception;
 };

 class role_already_exists : public roles_argument_exception {
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -68,14 +68,14 @@ future<bool> default_role_row_satisfies(
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
                infinite_timeout_config,
                {meta::DEFAULT_SUPERUSER_NAME},
                true).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
-                return qp.process(
+                return qp.execute_internal(
                        query,
                        db::consistency_level::QUORUM,
                        internal_distributed_timeout_config(),
@@ -100,7 +100,7 @@ future<bool> any_nondefault_role_row_satisfies(
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -178,7 +178,7 @@ future<> service::start(::service::migration_manager& mm) {
        return create_keyspace_if_missing(mm);
    }).then([this] {
        return _role_manager->start().then([this] {
-            return when_all_succeed(_authorizer->start(), _authenticator->start());
+            return when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
        });
    }).then([this] {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
@@ -193,10 +193,13 @@ future<> service::start(::service::migration_manager& mm) {
 future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
-    _mnotifier.unregister_listener(_migration_listener.get());
-
-    return _permissions_cache->stop().then([this] {
-        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
+    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
+        if (_permissions_cache) {
+            return _permissions_cache->stop();
+        }
+        return make_ready_future<>();
+    }).then([this] {
+        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
    });
 }

@@ -217,7 +220,7 @@ future<bool> service::has_existing_legacy_users() const {
    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
    // can potentially avoid doing a range query with a high consistency level.

-    return _qp.process(
+    return _qp.execute_internal(
            default_user_query,
            db::consistency_level::ONE,
            infinite_timeout_config,
@@ -227,7 +230,7 @@ future<bool> service::has_existing_legacy_users() const {
            return make_ready_future<bool>(true);
        }

-        return _qp.process(
+        return _qp.execute_internal(
                default_user_query,
                db::consistency_level::QUORUM,
                infinite_timeout_config,
@@ -237,7 +240,7 @@ future<bool> service::has_existing_legacy_users() const {
                return make_ready_future<bool>(true);
            }

-            return _qp.process(
+            return _qp.execute_internal(
                    all_users_query,
                    db::consistency_level::QUORUM,
                    infinite_timeout_config).then([](auto results) {
@@ -416,7 +419,7 @@ future<> create_role(
            return make_ready_future<>();
        }

-        return futurize_apply(
+        return futurize_invoke(
                &validate_authentication_options_are_supported,
                options,
                ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {
@@ -440,7 +443,7 @@ future<> alter_role(
            return make_ready_future<>();
        }

-        return futurize_apply(
+        return futurize_invoke(
                &validate_authentication_options_are_supported,
                options,
                ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {
@@ -455,7 +458,9 @@ future<> drop_role(const service& ser, std::string_view name) {

        return when_all_succeed(
                a.revoke_all(name),
-                a.revoke_all(r)).handle_exception_type([](const unsupported_authorization_operation&) {
+                a.revoke_all(r))
+                    .discard_result()
+                    .handle_exception_type([](const unsupported_authorization_operation&) {
            // Nothing.
        });
    }).then([&ser, name] {
@@ -468,7 +473,7 @@ future<> drop_role(const service& ser, std::string_view name) {
 future<bool> has_role(const service& ser, std::string_view grantee, std::string_view name) {
    return when_all_succeed(
            validate_role_exists(ser, name),
-            ser.get_roles(grantee)).then([name](role_set all_roles) {
+            ser.get_roles(grantee)).then_unpack([name](role_set all_roles) {
        return make_ready_future<bool>(all_roles.count(sstring(name)) != 0);
    });
 }
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -35,6 +35,7 @@
 #include "auth/common.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
@@ -86,7 +87,7 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return qp.process(
+    return qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -160,7 +161,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
                    meta::role_members_table::name,
                    _qp,
                    create_role_members_query,
-                    _migration_manager));
+                    _migration_manager)).discard_result();
 }

 future<> standard_role_manager::create_default_role_if_missing() const {
@@ -170,7 +171,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
@@ -197,7 +198,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -258,7 +259,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -298,7 +299,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
            return make_ready_future<>();
        }

-        return _qp.process(
+        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
                        meta::roles_table::qualified_name(),
                        build_column_assignments(u),
@@ -320,7 +321,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
                    meta::role_members_table::qualified_name());

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -359,14 +360,14 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
                    {sstring(role_name)}).discard_result();
        };

-        return when_all_succeed(revoke_from_members(), revoke_members_of()).then([delete_role = std::move(delete_role)] {
+        return when_all_succeed(revoke_from_members(), revoke_members_of()).then_unpack([delete_role = std::move(delete_role)] {
            return delete_role();
        });
    });
@@ -386,7 +387,7 @@ standard_role_manager::modify_membership(
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_timeout_config(),
@@ -396,7 +397,7 @@ standard_role_manager::modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] {
        switch (ch) {
            case membership_change::add:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -404,7 +405,7 @@ standard_role_manager::modify_membership(
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -415,7 +416,7 @@ standard_role_manager::modify_membership(
        return make_ready_future<>();
    };

-    return when_all_succeed(modify_roles(), modify_role_members());
+    return when_all_succeed(modify_roles(), modify_role_members()).discard_result();
 }

 future<>
@@ -444,7 +445,7 @@ standard_role_manager::grant(std::string_view grantee_name, std::string_view rol
        });
    };

-   return when_all_succeed(check_redundant(), check_cycle()).then([this, role_name, grantee_name] {
+   return when_all_succeed(check_redundant(), check_cycle()).then_unpack([this, role_name, grantee_name] {
       return this->modify_membership(grantee_name, role_name, membership_change::add);
   });
 }
@@ -508,7 +509,7 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -82,7 +82,7 @@ public:
        return _authenticator->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authenticator_name();
    }

@@ -158,7 +158,7 @@ public:
            }

            virtual future<authenticated_user> get_authenticated_user() const {
-                return futurize_apply([this] {
+                return futurize_invoke([this] {
                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
                        try {
                            std::rethrow_exception(ep);
@@ -201,7 +201,7 @@ public:
        return _authorizer->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authorizer_name();
    }

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -23,7 +23,11 @@
 #include <seastar/core/scheduling.hh>
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/file.hh>
 #include <chrono>
+#include <cmath>
+
+#include "seastarx.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
--- a/build_id.cc
+++ b/build_id.cc
@@ -7,6 +7,7 @@
 #include <link.h>
 #include <seastar/core/align.hh>
 #include <sstream>
+#include <cassert>

 using namespace seastar;

--- a/bytes.cc
+++ b/bytes.cc
@@ -64,7 +64,7 @@ bytes from_hex(sstring_view s) {

 sstring to_hex(bytes_view b) {
    static char digits[] = "0123456789abcdef";
-    sstring out(sstring::initialized_later(), b.size() * 2);
+    sstring out = uninitialized_string(b.size() * 2);
    unsigned end = b.size();
    for (unsigned i = 0; i != end; ++i) {
        uint8_t x = b[i];
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -176,7 +176,7 @@ public:
        return make_ready_future<>();
    }
    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
-        throw std::bad_function_call();
+        return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
    }
 };

--- a/caching_options.hh
+++ b/caching_options.hh
@@ -39,7 +39,10 @@ class caching_options {

    sstring _key_cache;
    sstring _row_cache;
-    caching_options(sstring k, sstring r) : _key_cache(k), _row_cache(r) {
+    bool _enabled = true;
+    caching_options(sstring k, sstring r, bool enabled)
+        : _key_cache(k), _row_cache(r), _enabled(enabled)
+    {
        if ((k != "ALL") && (k != "NONE")) {
            throw exceptions::configuration_exception("Invalid key value: " + k); 
        }
@@ -59,36 +62,53 @@ class caching_options {
    caching_options() : _key_cache(default_key), _row_cache(default_row) {}
 public:

+    bool enabled() const {
+        return _enabled;
+    }
+
    std::map<sstring, sstring> to_map() const {
-        return {{ "keys", _key_cache }, { "rows_per_partition", _row_cache }};
+        std::map<sstring, sstring> res = {{ "keys", _key_cache },
+                { "rows_per_partition", _row_cache }};
+        if (!_enabled) {
+            res.insert({"enabled", "false"});
+        }
+        return res;
    }

    sstring to_sstring() const {
        return json::to_json(to_map());
    }

+    static caching_options get_disabled_caching_options() {
+        return caching_options("NONE", "NONE", false);
+    }
+
    template<typename Map>
    static caching_options from_map(const Map & map) {
        sstring k = default_key;
        sstring r = default_row;
+        bool e = true;

        for (auto& p : map) {
            if (p.first == "keys") {
                k = p.second;
            } else if (p.first == "rows_per_partition") {
                r = p.second;
+            } else if (p.first == "enabled") {
+                e = p.second == "true";
            } else {
                throw exceptions::configuration_exception("Invalid caching option: " + p.first);
            }
        }
-        return caching_options(k, r);
+        return caching_options(k, r, e);
    }
    static caching_options from_sstring(const sstring& str) {
        return from_map(json::to_map(str));
    }

    bool operator==(const caching_options& other) const {
-        return _key_cache == other._key_cache && _row_cache == other._row_cache;
+        return _key_cache == other._key_cache && _row_cache == other._row_cache
+            && _enabled == other._enabled;
    }
    bool operator!=(const caching_options& other) const {
        return !(*this == other);
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -92,7 +92,7 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
 }

 static sstring bytes_to_text(bytes_view bv) {
-    sstring ret(sstring::initialized_later(), bv.size());
+    sstring ret = uninitialized_string(bv.size());
    std::copy_n(reinterpret_cast<const char*>(bv.data()), bv.size(), ret.data());
    return ret;
 }
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -22,7 +22,7 @@
 #pragma once

 #include "bytes.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -22,6 +22,9 @@

 #pragma once

+#include <vector>
+#include <sys/types.h>
+
 // Single-pass range over cartesian product of vectors.

 // Note:
--- a/cdc/cdc.cc
+++ b/cdc/cdc.cc
@@ -1,818 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <utility>
-#include <algorithm>
-
-#include <boost/range/irange.hpp>
-#include <seastar/util/defer.hh>
-#include <seastar/core/thread.hh>
-
-#include "cdc/cdc.hh"
-#include "bytes.hh"
-#include "database.hh"
-#include "db/config.hh"
-#include "dht/murmur3_partitioner.hh"
-#include "partition_slice_builder.hh"
-#include "schema.hh"
-#include "schema_builder.hh"
-#include "service/migration_listener.hh"
-#include "service/storage_service.hh"
-#include "types/tuple.hh"
-#include "cql3/statements/select_statement.hh"
-#include "cql3/multi_column_relation.hh"
-#include "cql3/tuples.hh"
-#include "log.hh"
-#include "json.hh"
-
-using locator::snitch_ptr;
-using locator::token_metadata;
-using locator::topology;
-using seastar::sstring;
-using service::migration_notifier;
-using service::storage_proxy;
-
-namespace std {
-
-template<> struct hash<std::pair<net::inet_address, unsigned int>> {
-    std::size_t operator()(const std::pair<net::inet_address, unsigned int> &p) const {
-        return std::hash<net::inet_address>{}(p.first) ^ std::hash<int>{}(p.second);
-    }
-};
-
-}
-
-using namespace std::chrono_literals;
-
-static logging::logger cdc_log("cdc");
-
-namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
-static schema_ptr create_stream_description_table_schema(const schema&, std::optional<utils::UUID> = {});
-static future<> populate_desc(db_context ctx, const schema& s);
-}
-
-class cdc::cdc_service::impl : service::migration_listener::empty_listener {
-    friend cdc_service;
-    db_context _ctxt;
-public:
-    impl(db_context ctxt)
-        : _ctxt(std::move(ctxt))
-    {
-        _ctxt._migration_notifier.register_listener(this);
-    }
-    ~impl() {
-        _ctxt._migration_notifier.unregister_listener(this);
-    }
-
-    void on_before_create_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        if (schema.cdc_options().enabled()) {
-            auto& db = _ctxt._proxy.get_db().local();
-            auto logname = log_name(schema.cf_name());
-            if (!db.has_schema(schema.ks_name(), logname)) {
-                // in seastar thread
-                auto log_schema = create_log_schema(schema);
-                auto stream_desc_schema = create_stream_description_table_schema(schema);
-                auto& keyspace = db.find_keyspace(schema.ks_name());
-
-                auto log_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), log_schema, timestamp);
-                auto stream_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-            }
-        }
-    }
-
-    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        bool is_cdc = new_schema.cdc_options().enabled();
-        bool was_cdc = old_schema.cdc_options().enabled();
-
-        // we need to create or modify the log & stream schemas iff either we changed cdc status (was != is)
-        // or if cdc is on now unconditionally, since then any actual base schema changes will affect the column 
-        // etc.
-        if (was_cdc || is_cdc) {
-            auto logname = log_name(old_schema.cf_name());
-            auto descname = desc_name(old_schema.cf_name());
-            auto& db = _ctxt._proxy.get_db().local();
-            auto& keyspace = db.find_keyspace(old_schema.ks_name());
-            auto log_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), logname).schema() : nullptr;
-            auto stream_desc_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), descname).schema() : nullptr;
-
-            if (!is_cdc) {
-                auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
-                auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-                return;
-            }
-
-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
-            auto new_stream_desc_schema = create_stream_description_table_schema(new_schema, stream_desc_schema ? std::make_optional(stream_desc_schema->id()) : std::nullopt);
-
-            auto log_mut = log_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
-                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_log_schema, timestamp)
-                ;
-            auto stream_mut = stream_desc_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), stream_desc_schema, new_stream_desc_schema, timestamp, false)
-                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_stream_desc_schema, timestamp)
-                ;
-
-            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-        }
-    }
-
-    void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        if (schema.cdc_options().enabled()) {
-            auto logname = log_name(schema.cf_name());
-            auto descname = desc_name(schema.cf_name());
-            auto& db = _ctxt._proxy.get_db().local();
-            auto& keyspace = db.find_keyspace(schema.ks_name());
-            auto log_schema = db.find_column_family(schema.ks_name(), logname).schema();
-            auto stream_desc_schema = db.find_column_family(schema.ks_name(), descname).schema();
-
-            auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
-            auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-        }
-    }
-
-    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {
-        // This callback is done on all shards. Only do the work once. 
-        if (engine().cpu_id() != 0) {
-            return; 
-        }
-        auto& db = _ctxt._proxy.get_db().local();
-        auto& cf = db.find_column_family(ks_name, cf_name);
-        auto schema = cf.schema();
-        if (schema->cdc_options().enabled()) {
-            populate_desc(_ctxt, *schema).get();
-        }
-    }
-
-    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) override {
-        on_create_column_family(ks_name, cf_name);
-    }
-
-    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
-
-    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
-        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations
-    );
-
-    template<typename Iter>
-    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
-};
-
-cdc::cdc_service::cdc_service(service::storage_proxy& proxy)
-    : cdc_service(db_context::builder(proxy).build())
-{}
-
-cdc::cdc_service::cdc_service(db_context ctxt)
-    : _impl(std::make_unique<impl>(std::move(ctxt)))
-{
-    _impl->_ctxt._proxy.set_cdc_service(this);
-}
-
-cdc::cdc_service::~cdc_service() = default;
-
-cdc::options::options(const std::map<sstring, sstring>& map) {
-    if (map.find("enabled") == std::end(map)) {
-        return;
-    }
-
-    for (auto& p : map) {
-        if (p.first == "enabled") {
-            _enabled = p.second == "true";
-        } else if (p.first == "preimage") {
-            _preimage = p.second == "true";
-        } else if (p.first == "postimage") {
-            _postimage = p.second == "true";
-        } else if (p.first == "ttl") {
-            _ttl = std::stoi(p.second);
-        } else {
-            throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
-        }
-    }
-}
-
-std::map<sstring, sstring> cdc::options::to_map() const {
-    if (!_enabled) {
-        return {};
-    }
-    return {
-        { "enabled", _enabled ? "true" : "false" },
-        { "preimage", _preimage ? "true" : "false" },
-        { "postimage", _postimage ? "true" : "false" },
-        { "ttl", std::to_string(_ttl) },
-    };
-}
-
-sstring cdc::options::to_sstring() const {
-    return json::to_json(to_map());
-}
-
-bool cdc::options::operator==(const options& o) const {
-    return _enabled == o._enabled && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl;
-}
-bool cdc::options::operator!=(const options& o) const {
-    return !(*this == o);
-}
-
-namespace cdc {
-
-using operation_native_type = std::underlying_type_t<operation>;
-using column_op_native_type = std::underlying_type_t<column_op>;
-
-sstring log_name(const sstring& table_name) {
-    static constexpr auto cdc_log_suffix = "_scylla_cdc_log";
-    return table_name + cdc_log_suffix;
-}
-
-sstring desc_name(const sstring& table_name) {
-    static constexpr auto cdc_desc_suffix = "_scylla_cdc_desc";
-    return table_name + cdc_desc_suffix;
-}
-
-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
-    schema_builder b(s.ks_name(), log_name(s.cf_name()));
-    b.set_default_time_to_live(gc_clock::duration{s.cdc_options().ttl()});
-    b.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("stream_id", uuid_type, column_kind::partition_key);
-    b.with_column("time", timeuuid_type, column_kind::clustering_key);
-    b.with_column("batch_seq_no", int32_type, column_kind::clustering_key);
-    b.with_column("operation", data_type_for<operation_native_type>());
-    b.with_column("ttl", long_type);
-    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
-        for (const auto& column : columns) {
-            auto type = column.type;
-            if (is_data_col) {
-                type = tuple_type_impl::get_instance({ /* op */ data_type_for<column_op_native_type>(), /* value */ type, /* ttl */long_type});
-            }
-            b.with_column("_" + column.name(), type);
-        }
-    };
-    add_columns(s.partition_key_columns());
-    add_columns(s.clustering_key_columns());
-    add_columns(s.static_columns(), true);
-    add_columns(s.regular_columns(), true);
-
-    if (uuid) {
-        b.set_uuid(*uuid);
-    }
-    
-    return b.build();
-}
-
-static schema_ptr create_stream_description_table_schema(const schema& s, std::optional<utils::UUID> uuid) {
-    schema_builder b(s.ks_name(), desc_name(s.cf_name()));
-    b.set_comment(sprint("CDC description for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("node_ip", inet_addr_type, column_kind::partition_key);
-    b.with_column("shard_id", int32_type, column_kind::partition_key);
-    b.with_column("created_at", timestamp_type, column_kind::clustering_key);
-    b.with_column("stream_id", uuid_type);
-
-    if (uuid) {
-        b.set_uuid(*uuid);
-    }
-
-    return b.build();
-}
-
-// This function assumes setup_stream_description_table was called on |s| before the call to this
-// function.
-static future<> populate_desc(db_context ctx, const schema& s) {
-    auto& db = ctx._proxy.get_db().local();
-    auto desc_schema =
-        db.find_schema(s.ks_name(), desc_name(s.cf_name()));
-    auto log_schema =
-        db.find_schema(s.ks_name(), log_name(s.cf_name()));
-    auto belongs_to = [&](const gms::inet_address& endpoint,
-                          const unsigned int shard_id,
-                          const int shard_count,
-                          const unsigned int ignore_msb_bits,
-                          const utils::UUID& stream_id) {
-        const auto log_pk = partition_key::from_singular(*log_schema,
-                                                         data_value(stream_id));
-        const auto token = ctx._partitioner.decorate_key(*log_schema, log_pk).token();
-        if (ctx._token_metadata.get_endpoint(ctx._token_metadata.first_token(token)) != endpoint) {
-            return false;
-        }
-        const auto owning_shard_id = dht::murmur3_partitioner(shard_count, ignore_msb_bits).shard_of(token);
-        return owning_shard_id == shard_id;
-    };
-
-    std::vector<mutation> mutations;
-    const auto ts = api::new_timestamp();
-    const auto ck = clustering_key::from_single_value(
-            *desc_schema, timestamp_type->decompose(ts));
-    auto cdef = desc_schema->get_column_definition(to_bytes("stream_id"));
-
-    for (const auto& dc : ctx._token_metadata.get_topology().get_datacenter_endpoints()) {
-        for (const auto& endpoint : dc.second) {
-            const auto decomposed_ip = inet_addr_type->decompose(endpoint.addr());
-            const unsigned int shard_count = ctx._snitch->get_shard_count(endpoint);
-            const unsigned int ignore_msb_bits = ctx._snitch->get_ignore_msb_bits(endpoint);
-            for (unsigned int shard_id = 0; shard_id < shard_count; ++shard_id) {
-                const auto pk = partition_key::from_exploded(
-                        *desc_schema, { decomposed_ip, int32_type->decompose(static_cast<int>(shard_id)) });
-                mutations.emplace_back(desc_schema, pk);
-
-                auto stream_id = utils::make_random_uuid();
-                while (!belongs_to(endpoint, shard_id, shard_count, ignore_msb_bits, stream_id)) {
-                    stream_id = utils::make_random_uuid();
-                }
-                auto value = atomic_cell::make_live(*uuid_type,
-                                                    ts,
-                                                    uuid_type->decompose(stream_id));
-                mutations.back().set_cell(ck, *cdef, std::move(value));
-            }
-        }
-    }
-    return ctx._proxy.mutate(std::move(mutations),
-                             db::consistency_level::QUORUM,
-                             db::no_timeout,
-                             nullptr,
-                             empty_service_permit());
-}
-
-db_context::builder::builder(service::storage_proxy& proxy) 
-    : _proxy(proxy) 
-{}
-
-db_context::builder& db_context::builder::with_migration_notifier(service::migration_notifier& migration_notifier) {
-    _migration_notifier = migration_notifier;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_token_metadata(locator::token_metadata& token_metadata) {
-    _token_metadata = token_metadata;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_snitch(locator::snitch_ptr& snitch) {
-    _snitch = snitch;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_partitioner(dht::i_partitioner& partitioner) {
-    _partitioner = partitioner;
-    return *this;
-}
-
-db_context db_context::builder::build() {
-    return db_context{
-        _proxy,
-        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
-        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
-        _snitch ? _snitch->get() : locator::i_endpoint_snitch::get_local_snitch_ptr(),
-        _partitioner ? _partitioner->get() : dht::global_partitioner()
-    };
-}
-
-class transformer final {
-public:
-    using streams_type = std::unordered_map<std::pair<net::inet_address, unsigned int>, utils::UUID>;
-private:
-    db_context _ctx;
-    schema_ptr _schema;
-    schema_ptr _log_schema;
-    utils::UUID _time;
-    bytes _decomposed_time;
-    ::shared_ptr<const transformer::streams_type> _streams;
-    const column_definition& _op_col;
-
-    clustering_key set_pk_columns(const partition_key& pk, int batch_no, mutation& m) const {
-        const auto log_ck = clustering_key::from_exploded(
-                *m.schema(), { _decomposed_time, int32_type->decompose(batch_no) });
-        auto pk_value = pk.explode(*_schema);
-        size_t pos = 0;
-        for (const auto& column : _schema->partition_key_columns()) {
-            assert (pos < pk_value.size());
-            auto cdef = m.schema()->get_column_definition(to_bytes("_" + column.name()));
-            auto value = atomic_cell::make_live(*column.type,
-                                                _time.timestamp(),
-                                                bytes_view(pk_value[pos]));
-            m.set_cell(log_ck, *cdef, std::move(value));
-            ++pos;
-        }
-        return log_ck;
-    }
-
-    void set_operation(const clustering_key& ck, operation op, mutation& m) const {
-        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op))));
-    }
-
-    partition_key stream_id(const net::inet_address& ip, unsigned int shard_id) const {
-        auto it = _streams->find(std::make_pair(ip, shard_id));
-        if (it == std::end(*_streams)) {
-                throw std::runtime_error(format("No stream found for node {} and shard {}", ip, shard_id));
-        }
-        return partition_key::from_exploded(*_log_schema, { uuid_type->decompose(it->second) });
-    }
-public:
-    transformer(db_context ctx, schema_ptr s, ::shared_ptr<const transformer::streams_type> streams)
-        : _ctx(ctx)
-        , _schema(std::move(s))
-        , _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
-        , _time(utils::UUID_gen::get_time_UUID())
-        , _decomposed_time(timeuuid_type->decompose(_time))
-        , _streams(std::move(streams))
-        , _op_col(*_log_schema->get_column_definition(to_bytes("operation")))
-    {}
-
-    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
-    // more details like tombstones/ttl? Probably not but keep in mind.
-    mutation transform(const mutation& m, const cql3::untyped_result_set* rs = nullptr) const {
-        auto& t = m.token();
-        auto&& ep = _ctx._token_metadata.get_endpoint(
-                _ctx._token_metadata.first_token(t));
-        if (!ep) {
-            throw std::runtime_error(format("No owner found for key {}", m.decorated_key()));
-        }
-        auto shard_id = dht::murmur3_partitioner(_ctx._snitch->get_shard_count(*ep), _ctx._snitch->get_ignore_msb_bits(*ep)).shard_of(t);
-        mutation res(_log_schema, stream_id(ep->addr(), shard_id));
-        auto& p = m.partition();
-        if (p.partition_tombstone()) {
-            // Partition deletion
-            auto log_ck = set_pk_columns(m.key(), 0, res);
-            set_operation(log_ck, operation::partition_delete, res);
-        } else if (!p.row_tombstones().empty()) {
-            // range deletion
-            int batch_no = 0;
-            for (auto& rt : p.row_tombstones()) {
-                auto set_bound = [&] (const clustering_key& log_ck, const clustering_key_prefix& ckp) {
-                    auto exploded = ckp.explode(*_schema);
-                    size_t pos = 0;
-                    for (const auto& column : _schema->clustering_key_columns()) {
-                        if (pos >= exploded.size()) {
-                            break;
-                        }
-                        auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                        auto value = atomic_cell::make_live(*column.type,
-                                                            _time.timestamp(),
-                                                            bytes_view(exploded[pos]));
-                        res.set_cell(log_ck, *cdef, std::move(value));
-                        ++pos;
-                    }
-                };
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.start);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_start, res);
-                    ++batch_no;
-                }
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.end);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_end, res);
-                    ++batch_no;
-                }
-            }
-        } else {
-            // should be update or deletion
-            int batch_no = 0;
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto ck_value = r.key().explode(*_schema);
-
-                std::optional<clustering_key> pikey;
-                const cql3::untyped_result_set_row * pirow = nullptr;
-
-                if (rs) {
-                    for (auto& utr : *rs) {
-                        bool match = true;
-                        for (auto& c : _schema->clustering_key_columns()) {
-                            auto rv = utr.get_view(c.name_as_text());
-                            auto cv = r.key().get_component(*_schema, c.component_index());
-                            if (rv != cv) {
-                                match = false;
-                                break;
-                            }
-                        }
-                        if (match) {
-                            pikey = set_pk_columns(m.key(), batch_no, res);
-                            set_operation(*pikey, operation::pre_image, res);
-                            pirow = &utr;
-                            ++batch_no;
-                            break;
-                        }
-                    }
-                }
-
-                auto log_ck = set_pk_columns(m.key(), batch_no, res);
-
-                size_t pos = 0;
-                for (const auto& column : _schema->clustering_key_columns()) {
-                    assert (pos < ck_value.size());
-                    auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-
-                    if (pirow) {
-                        assert(pirow->has(column.name_as_text()));
-                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-                    }
-
-                    ++pos;
-                }
-
-                std::vector<bytes_opt> values(3);
-
-                auto process_cells = [&](const row& r, column_kind ckind) {
-                    r.for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
-                        auto& cdef = _schema->column_at(ckind, id);
-                        auto* dst = _log_schema->get_column_definition(to_bytes("_" + cdef.name()));
-                        // todo: collections.
-                        if (cdef.is_atomic()) {
-                            column_op op;
-
-                            values[1] = values[2] = std::nullopt;
-                            auto view = cell.as_atomic_cell(cdef);
-                            if (view.is_live()) {
-                                op = column_op::set;
-                                values[1] = view.value().linearize();
-                                if (view.is_live_and_has_ttl()) {
-                                    values[2] = long_type->decompose(data_value(view.ttl().count()));
-                                }
-                            } else {
-                                op = column_op::del;
-                            }
-
-                            values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(op)));
-                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-
-                            if (pirow && pirow->has(cdef.name_as_text())) {
-                                values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(column_op::set)));
-                                values[1] = pirow->get_blob(cdef.name_as_text());
-                                values[2] = std::nullopt;
-
-                                assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
-                                assert(pikey->explode() != log_ck.explode());
-                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-                            }
-                        } else {
-                            cdc_log.warn("Non-atomic cell ignored {}.{}:{}", _schema->ks_name(), _schema->cf_name(), cdef.name_as_text());
-                        }
-                    });
-                };
-
-                process_cells(r.row().cells(), column_kind::regular_column);
-                process_cells(p.static_row().get(), column_kind::static_column);
-
-                set_operation(log_ck, operation::update, res);
-                ++batch_no;
-            }
-        }
-
-        return res;
-    }
-
-    static db::timeout_clock::time_point default_timeout() {
-        return db::timeout_clock::now() + 10s;
-    }
-
-    future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
-            service::client_state& client_state,
-            db::consistency_level cl,
-            const mutation& m)
-    {
-        auto& p = m.partition();
-        if (p.partition_tombstone() || !p.row_tombstones().empty() || p.clustered_rows().empty()) {
-            return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
-        }
-
-        dht::partition_range_vector partition_ranges{dht::partition_range(m.decorated_key())};
-
-        auto&& pc = _schema->partition_key_columns();
-        auto&& cc = _schema->clustering_key_columns();
-
-        std::vector<query::clustering_range> bounds;
-        if (cc.empty()) {
-            bounds.push_back(query::clustering_range::make_open_ended_both_sides());
-        } else {
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto& ck = r.key();
-                bounds.push_back(query::clustering_range::make_singular(ck));
-            }
-        }
-
-        std::vector<const column_definition*> columns;
-        columns.reserve(_schema->all_columns().size());
-
-        std::transform(pc.begin(), pc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-        std::transform(cc.begin(), cc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-
-        query::column_id_vector static_columns, regular_columns;
-
-        auto sk = column_kind::static_column;
-        auto rk = column_kind::regular_column;
-        // TODO: this assumes all mutations touch the same set of columns. This might not be true, and we may need to do more horrible set operation here.
-        for (auto& [r, cids, kind] : { std::tie(p.static_row().get(), static_columns, sk), std::tie(p.clustered_rows().begin()->row().cells(), regular_columns, rk) }) {
-            r.for_each_cell([&](column_id id, const atomic_cell_or_collection&) {
-                auto& cdef =_schema->column_at(kind, id);
-                cids.emplace_back(id);
-                columns.emplace_back(&cdef);
-            });
-        }
-
-        auto selection = cql3::selection::selection::for_columns(_schema, std::move(columns));
-        auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), selection->get_query_options());
-        auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_partitions);
-
-        return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
-                [s = _schema, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
-                    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
-                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *s, *selection));
-                    auto result_set = builder.build();
-                    if (!result_set || result_set->empty()) {
-                        return {};
-                    }
-                    return make_lw_shared<cql3::untyped_result_set>(*result_set);
-        });
-    }
-};
-
-// This class is used to build a mapping from <node ip, shard id> to stream_id
-// It is used as a consumer for rows returned by the query to CDC Description Table
-class streams_builder {
-    const schema& _schema;
-    transformer::streams_type _streams;
-    net::inet_address _node_ip = net::inet_address();
-    unsigned int _shard_id = 0;
-    api::timestamp_type _latest_row_timestamp = api::min_timestamp;
-    utils::UUID _latest_row_stream_id = utils::UUID();
-public:
-    streams_builder(const schema& s) : _schema(s) {}
-
-    void accept_new_partition(const partition_key& key, uint32_t row_count) {
-        auto exploded = key.explode(_schema);
-        _node_ip = value_cast<net::inet_address>(inet_addr_type->deserialize(exploded[0]));
-        _shard_id = static_cast<unsigned int>(value_cast<int>(int32_type->deserialize(exploded[1])));
-        _latest_row_timestamp = api::min_timestamp;
-        _latest_row_stream_id = utils::UUID();
-    }
-
-    void accept_new_partition(uint32_t row_count) {
-        assert(false);
-    }
-
-    void accept_new_row(
-            const clustering_key& key,
-            const query::result_row_view& static_row,
-            const query::result_row_view& row) {
-        auto row_iterator = row.iterator();
-        api::timestamp_type timestamp = value_cast<db_clock::time_point>(
-                timestamp_type->deserialize(key.explode(_schema)[0])).time_since_epoch().count();
-        if (timestamp <= _latest_row_timestamp) {
-            return;
-        }
-        _latest_row_timestamp = timestamp;
-        for (auto&& cdef : _schema.regular_columns()) {
-            if (cdef.name_as_text() != "stream_id") {
-                row_iterator.skip(cdef);
-                continue;
-            }
-            auto val_opt = row_iterator.next_atomic_cell();
-            assert(val_opt);
-            val_opt->value().with_linearized([&] (bytes_view bv) {
-                _latest_row_stream_id = value_cast<utils::UUID>(uuid_type->deserialize(bv));
-            });
-        }
-    }
-
-    void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
-        assert(false);
-    }
-
-    void accept_partition_end(const query::result_row_view& static_row) {
-        _streams.emplace(std::make_pair(_node_ip, _shard_id), _latest_row_stream_id);
-    }
-
-    transformer::streams_type build() {
-        return std::move(_streams);
-    }
-};
-
-static future<::shared_ptr<transformer::streams_type>> get_streams(
-        db_context ctx,
-        const sstring& ks_name,
-        const sstring& cf_name,
-        lowres_clock::time_point timeout,
-        service::query_state& qs) {
-    auto s =
-        ctx._proxy.get_db().local().find_schema(ks_name, desc_name(cf_name));
-    query::read_command cmd(
-            s->id(),
-            s->version(),
-            partition_slice_builder(*s).with_no_static_columns().build());
-    return ctx._proxy.query(
-            s,
-            make_lw_shared(std::move(cmd)),
-            {dht::partition_range::make_open_ended_both_sides()},
-            db::consistency_level::QUORUM,
-            {timeout, qs.get_permit(), qs.get_client_state()}).then([s = std::move(s)] (auto qr) mutable {
-        return query::result_view::do_with(*qr.query_result,
-                [s = std::move(s)] (query::result_view v) {
-            auto slice = partition_slice_builder(*s)
-                    .with_no_static_columns()
-                    .build();
-            streams_builder builder{ *s };
-            v.consume(slice, builder);
-            return ::make_shared<transformer::streams_type>(builder.build());
-        });
-    });
-}
-
-template <typename Func>
-future<std::vector<mutation>>
-transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
-    return parallel_for_each(
-            boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
-            std::move(f))
-        .then([&muts] () mutable { return std::move(muts); });
-}
-
-} // namespace cdc
-
-future<std::tuple<std::vector<mutation>, cdc::result_callback>>
-cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
-    // we do all this because in the case of batches, we can have mixed schemas.
-    auto e = mutations.end();
-    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
-        return m.schema()->cdc_options().enabled();
-    });
-
-    if (i == e) {
-        return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
-    }
-
-    mutations.reserve(2 * mutations.size());
-
-    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), [this, timeout, i](std::vector<mutation>& mutations, service::query_state& qs) {
-        return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs] (int idx) {
-            auto& m = mutations[idx];
-            auto s = m.schema();
-
-            if (!s->cdc_options().enabled()) {
-                return make_ready_future<>();
-            }
-            // for batches/multiple mutations this is super inefficient. either partition the mutation set by schema
-            // and re-use streams, or probably better: add a cache so this lookup is a noop on second mutation
-            return get_streams(_ctxt, s->ks_name(), s->cf_name(), timeout, qs).then([this, s = std::move(s), &qs, &mutations, idx](::shared_ptr<transformer::streams_type> streams) mutable {
-                auto& m = mutations[idx]; // should not really need because of reserve, but lets be conservative
-                transformer trans(_ctxt, s, streams);
-
-                if (!s->cdc_options().preimage()) {
-                    mutations.emplace_back(trans.transform(m));
-                    return make_ready_future<>();
-                }
-
-                // Note: further improvement here would be to coalesce the pre-image selects into one
-                // iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
-                // so this is premature.
-                auto f = trans.pre_image_select(qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m);
-                return f.then([trans = std::move(trans), &mutations, idx] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
-                    mutations.push_back(trans.transform(mutations[idx], rs.get()));
-                });
-            });
-        }).then([](std::vector<mutation> mutations) {
-            return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
-        });
-    });
-}
-
-bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
-    return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
-        return m.schema()->cdc_options().enabled();
-    });
-}
-
-future<std::tuple<std::vector<mutation>, cdc::result_callback>>
-cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
-    return _impl->augment_mutation_call(timeout, std::move(mutations));
-}
--- a/cdc/cdc.hh
+++ b/cdc/cdc.hh
@@ -1,141 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <map>
-#include <string>
-#include <vector>
-
-#include <seastar/core/future.hh>
-#include <seastar/core/lowres_clock.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-
-#include "exceptions/exceptions.hh"
-#include "timestamp.hh"
-#include "cdc_options.hh"
-
-class schema;
-using schema_ptr = seastar::lw_shared_ptr<const schema>;
-
-namespace locator {
-
-class snitch_ptr;
-class token_metadata;
-
-} // namespace locator
-
-namespace service {
-
-class migration_notifier;
-class storage_proxy;
-class query_state;
-
-} // namespace service
-
-namespace dht {
-
-class i_partitioner;
-
-} // namespace dht
-
-class mutation;
-class partition_key;
-
-namespace cdc {
-
-class db_context;
-
-// Callback to be invoked on mutation finish to fix
-// the whole bit about post-image.
-// TODO: decide on what the parameters are to be for this.
-using result_callback = std::function<future<>()>;
-
-/// \brief CDC service, responsible for schema listeners
-///
-/// CDC service will listen for schema changes and iff CDC is enabled/changed
-/// create/modify/delete corresponding log tables etc as part of the schema change. 
-///
-class cdc_service {
-    class impl;
-    std::unique_ptr<impl> _impl;
-public:
-    cdc_service(service::storage_proxy&);
-    cdc_service(db_context);
-    ~cdc_service();
-
-    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
-    // appropriate augments to set the log entries.
-    // Iff post-image is enabled for any of these, a non-empty callback is also
-    // returned to be invoked post the mutation query.
-    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
-        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations
-        );
-    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
-};
-
-struct db_context final {
-    service::storage_proxy& _proxy;
-    service::migration_notifier& _migration_notifier;
-    locator::token_metadata& _token_metadata;
-    locator::snitch_ptr& _snitch;
-    dht::i_partitioner& _partitioner;
-
-    class builder final {
-        service::storage_proxy& _proxy;
-        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
-        std::optional<std::reference_wrapper<locator::snitch_ptr>> _snitch;
-        std::optional<std::reference_wrapper<dht::i_partitioner>> _partitioner;
-    public:
-        builder(service::storage_proxy& proxy);
-
-        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(locator::token_metadata& token_metadata);
-        builder& with_snitch(locator::snitch_ptr& snitch);
-        builder& with_partitioner(dht::i_partitioner& partitioner);
-
-        db_context build();
-    };
-};
-
-// cdc log table operation
-enum class operation : int8_t {
-    // note: these values will eventually be read by a third party, probably not privvy to this
-    // enum decl, so don't change the constant values (or the datatype).
-    pre_image = 0, update = 1, row_delete = 2, range_delete_start = 3, range_delete_end = 4, partition_delete = 5
-};
-
-// cdc log data column operation
-enum class column_op : int8_t {
-    // same as "operation". Do not edit values or type/type unless you _really_ want to.
-    set = 0, del = 1, add = 2,
-};
-
-seastar::sstring log_name(const seastar::sstring& table_name);
-
-seastar::sstring desc_name(const seastar::sstring& table_name);
-
-} // namespace cdc
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "db/extensions.hh"
+#include "cdc/cdc_options.hh"
+#include "schema.hh"
+
+namespace cdc {
+
+class cdc_extension : public schema_extension { // schema_extension subclass that holds a cdc::options value
+    cdc::options _cdc_options; // the options carried by this extension
+public:
+    static constexpr auto NAME = "cdc"; // presumably the key this extension is looked up by -- confirm against db/extensions.hh
+
+    cdc_extension() = default; // default-constructed options
+    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {} // options built from a string->string map
+    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {} // options built from the serialized map (see deserialize())
+    explicit cdc_extension(const sstring& s) { // construction from a plain string is not supported: always throws std::logic_error
+        throw std::logic_error("Cannot create cdc info from string");
+    }
+    bytes serialize() const override { // serializes the options' map form (to_map()) into a bytes buffer
+        return ser::serialize_to_buffer<bytes>(_cdc_options.to_map());
+    }
+    static std::map<sstring, sstring> deserialize(const bytes_view& buffer) { // reads a string->string map back out of a serialized buffer
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const options& get_options() const { // read-only access to the held options
+        return _cdc_options;
+    }
+};
+
+}
--- a/cdc/cdc_partitioner.cc
+++ b/cdc/cdc_partitioner.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cdc_partitioner.hh"
+#include "dht/token.hh"
+#include "schema.hh"
+#include "sstables/key.hh"
+#include "utils/class_registrator.hh"
+#include "cdc/generation.hh"
+#include "keys.hh"
+
+static const sstring cdc_partitioner_name = "com.scylladb.dht.CDCPartitioner";
+
+namespace cdc {
+
+const sstring cdc_partitioner::name() const { // returns the file-level cdc_partitioner_name constant
+    return cdc_partitioner_name;
+}
+
+static dht::token to_token(int64_t value) { // wraps a raw 64-bit value in a dht::token of kind `key`
+    return dht::token(dht::token::kind::key, value);
+}
+
+static dht::token to_token(bytes_view key) { // token for a serialized key; the value comes from stream_id::token_from_bytes
+    // A valid key is exactly 16 bytes (2 * sizeof(int64_t)); only its first 8 bytes are used for token calculation
+    if (key.size() != 2*sizeof(int64_t)) {
+        return dht::minimum_token(); // any other key size falls back to the minimum token instead of failing
+    }
+    return to_token(stream_id::token_from_bytes(key));
+}
+
+dht::token // token for an sstable key: the key is viewed as raw bytes and passed to to_token(bytes_view)
+cdc_partitioner::get_token(const sstables::key_view& key) const {
+    return to_token(bytes_view(key));
+}
+
+dht::token // token for a partition key: computed from the first exploded key component only
+cdc_partitioner::get_token(const schema& s, partition_key_view key) const {
+    auto exploded_key = key.explode(s);
+    return to_token(exploded_key[0]); // NOTE(review): assumes the key has at least one component -- confirm callers guarantee this
+}
+
+using registry = class_registrator<dht::i_partitioner, cdc_partitioner>; // presumably registers cdc_partitioner as a dht::i_partitioner implementation -- confirm in utils/class_registrator.hh
+static registry registrator(cdc_partitioner_name); // registrator for the full name held in cdc_partitioner_name
+static registry registrator_short_name("CDCPartitioner"); // second registrator for the short name "CDCPartitioner"
+
+}
--- a/cdc/cdc_partitioner.hh
+++ b/cdc/cdc_partitioner.hh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
+#include "dht/i_partitioner.hh"
+
+class schema;
+class partition_key_view;
+
+namespace sstables {
+
+class key_view;
+
+}
+
+namespace cdc {
+
+struct cdc_partitioner final : public dht::i_partitioner { // dht::i_partitioner implementation for CDC; members are defined in cdc_partitioner.cc
+    cdc_partitioner() = default;
+    virtual const sstring name() const override; // name of this partitioner
+    virtual dht::token get_token(const schema& s, partition_key_view key) const override; // token for a partition key under schema `s`
+    virtual dht::token get_token(const sstables::key_view& key) const override; // token for an sstable key
+};
+
+
+}
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -0,0 +1,331 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/type.hpp>
+#include <random>
+#include <unordered_set>
+#include <seastar/core/sleep.hh>
+
+#include "keys.hh"
+#include "schema_builder.hh"
+#include "db/config.hh"
+#include "db/system_keyspace.hh"
+#include "db/system_distributed_keyspace.hh"
+#include "dht/token-sharding.hh"
+#include "locator/token_metadata.hh"
+#include "gms/application_state.hh"
+#include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
+
+#include "cdc/generation.hh"
+
+extern logging::logger cdc_log;
+
+// Reads the SHARD_COUNT application state that the gossiper holds for
+// `endpoint` and parses it as an integer.
+// Returns -1 when the gossiper has no such state for the endpoint.
+static int get_shard_count(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto shard_count_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
+    if (!shard_count_state) {
+        return -1;
+    }
+    return std::stoi(shard_count_state->value);
+}
+
+// Reads the IGNORE_MSB_BITS application state that the gossiper holds for
+// `endpoint` and parses it as an integer.
+// Returns 0 when the gossiper has no such state for the endpoint.
+static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto ignore_msb_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
+    if (!ignore_msb_state) {
+        return 0;
+    }
+    return std::stoi(ignore_msb_state->value);
+}
+
+namespace cdc {
+
+// A 5 second margin expressed in timestamp_clock ticks.
+// make_new_cdc_generation() adds it (on top of twice the ring delay) to the
+// timestamp of a newly proposed generation, and cdc::metadata::get_stream()
+// rejects writes whose timestamp is ahead of the local clock by more than it.
+extern const api::timestamp_clock::duration generation_leeway =
+    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+// Writes `i`, converted with net::hton (host to network byte order), into `b`
+// starting at byte `offset`. No bounds check is performed here: the caller
+// must make sure `b` holds at least `offset + sizeof(int64_t)` bytes.
+static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
+    i = net::hton(i);
+    auto* src = reinterpret_cast<int8_t*>(&i);
+    auto dst = b.begin() + offset;
+    std::copy_n(src, sizeof(int64_t), dst);
+}
+
+// Builds a 16-byte stream id: `first` occupies bytes [0, 8) and `second`
+// bytes [8, 16), each stored via copy_int_to_bytes().
+stream_id::stream_id(int64_t first, int64_t second)
+    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
+{
+    constexpr size_t first_offset = 0;
+    constexpr size_t second_offset = sizeof(int64_t);
+
+    copy_int_to_bytes(first, first_offset, _value);
+    copy_int_to_bytes(second, second_offset, _value);
+}
+
+// Wraps an already serialized stream id; `b` is stored as-is, without any
+// length or content validation.
+stream_id::stream_id(bytes b) : _value(std::move(b)) { }
+
+// A stream id counts as set when its underlying byte string is non-empty.
+bool stream_id::is_set() const {
+    return !_value.empty();
+}
+
+// Two stream ids are equal when their underlying byte strings are equal.
+bool stream_id::operator==(const stream_id& o) const {
+    return _value == o._value;
+}
+
+// Stream ids are ordered by comparing their underlying byte strings.
+bool stream_id::operator<(const stream_id& o) const {
+    return _value < o._value;
+}
+
+// Reads sizeof(int64_t) bytes of `b` starting at `offset` and converts them
+// with net::ntoh (network to host byte order).
+// Asserts that `b` is long enough to contain the requested range.
+static int64_t bytes_to_int64(bytes_view b, size_t offset) {
+    assert(b.size() >= offset + sizeof(int64_t));
+    int64_t raw;
+    auto* dst = reinterpret_cast<int8_t *>(&raw);
+    std::copy_n(b.begin() + offset, sizeof(int64_t), dst);
+    return net::ntoh(raw);
+}
+
+// Integer decoded from bytes [0, 8) of the id (see token_from_bytes()).
+// The underlying helper asserts that the id is at least 8 bytes long.
+int64_t stream_id::first() const {
+    return token_from_bytes(_value);
+}
+
+// Integer decoded from bytes [8, 16) of the id.
+// The underlying helper asserts that the id is at least 16 bytes long.
+int64_t stream_id::second() const {
+    return bytes_to_int64(_value, sizeof(int64_t));
+}
+
+// Decodes the first 8 bytes of `b` as an int64_t. For ids built by
+// stream_id(int64_t, int64_t) this is the `first` constructor argument.
+int64_t stream_id::token_from_bytes(bytes_view b) {
+    return bytes_to_int64(b, 0);
+}
+
+// Gives read-only access to the underlying byte string.
+const bytes& stream_id::to_bytes() const {
+    return _value;
+}
+
+// Builds a partition key for `log_schema` whose single value is the
+// underlying byte string of this id.
+partition_key stream_id::to_partition_key(const schema& log_schema) const {
+    return partition_key::from_single_value(log_schema, _value);
+}
+
+// Member-wise equality: range end, stream list and sharding_ignore_msb must all match.
+bool token_range_description::operator==(const token_range_description& o) const {
+    return token_range_end == o.token_range_end && streams == o.streams
+        && sharding_ignore_msb == o.sharding_ignore_msb;
+}
+
+// Takes ownership of `entries`; they are stored unchanged (no sorting or validation).
+topology_description::topology_description(std::vector<token_range_description> entries)
+    : _entries(std::move(entries)) {}
+
+// Two descriptions are equal when their entry vectors compare equal
+// (same entries in the same order).
+bool topology_description::operator==(const topology_description& o) const {
+    return _entries == o._entries;
+}
+
+// Read-only access to the stored token range descriptions.
+const std::vector<token_range_description>& topology_description::entries() const {
+    return _entries;
+}
+
+// Creates a stream id whose first half is the int64 representation of token
+// `t` and whose second half is a random 64-bit value. The random generator is
+// thread-local and seeded once per thread from std::random_device; the
+// distribution spans [int64 min, int64 max] (the upper bound is the default).
+static stream_id create_stream_id(dht::token t) {
+    static thread_local std::mt19937_64 rand_gen(std::random_device{}());
+    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
+
+    auto token_part = dht::token::to_int64(t);
+    auto random_part = rand_dist(rand_gen);
+    return {token_part, random_part};
+}
+
+// Builds a cdc::topology_description for the ring made of the tokens already
+// present in the token metadata plus the given bootstrap tokens.
+// All collaborators are held by reference and must outlive the generator.
+class topology_description_generator final {
+    const db::config& _cfg;
+    const std::unordered_set<dht::token>& _bootstrap_tokens;
+    const locator::token_metadata& _token_metadata;
+    const gms::gossiper& _gossiper;
+
+    // Compute a set of tokens that split the token ring into vnodes:
+    // the sorted, de-duplicated union of the known tokens and the bootstrap tokens.
+    auto get_tokens() const {
+        auto tokens = _token_metadata.sorted_tokens();
+        // `it` points at the first of the appended (still unsorted) bootstrap tokens.
+        auto it = tokens.insert(
+                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
+        std::sort(it, tokens.end());
+        std::inplace_merge(tokens.begin(), it, tokens.end());
+        tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
+        return tokens;
+    }
+
+    // Fetch sharding parameters for a node that owns the vnode ending with `end`.
+    // Returns <shard_count, ignore_msb> pair.
+    // For a bootstrap token the local values (smp::count and the configured
+    // murmur3_partitioner_ignore_msb_bits) are used; otherwise the values gossiped
+    // by the owning endpoint are used, with the shard count falling back to 1
+    // when the gossiped value is missing or not positive.
+    // Throws std::runtime_error if no endpoint owns `end`.
+    std::pair<size_t, uint8_t> get_sharding_info(dht::token end) const {
+        if (_bootstrap_tokens.count(end) > 0) {
+            return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
+        } else {
+            auto endpoint = _token_metadata.get_endpoint(end);
+            if (!endpoint) {
+                throw std::runtime_error(
+                        format("Can't find endpoint for token {}", end));
+            }
+            auto sc = get_shard_count(*endpoint, _gossiper);
+            return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
+        }
+    }
+
+    // Describe the vnode (start, end]: one stream id per shard of the node owning
+    // `end`, where the stream for shard i is derived from the token returned by
+    // dht::find_first_token_for_shard() for that shard within the vnode.
+    token_range_description create_description(dht::token start, dht::token end) const {
+        token_range_description desc;
+
+        desc.token_range_end = end;
+
+        auto [shard_count, ignore_msb] = get_sharding_info(end);
+        desc.streams.reserve(shard_count);
+        desc.sharding_ignore_msb = ignore_msb;
+
+        dht::sharder sharder(shard_count, ignore_msb);
+        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+            desc.streams.push_back(create_stream_id(t));
+        }
+
+        return desc;
+    }
+public:
+    topology_description_generator(
+            const db::config& cfg,
+            const std::unordered_set<dht::token>& bootstrap_tokens,
+            const locator::token_metadata& token_metadata,
+            const gms::gossiper& gossiper)
+        : _cfg(cfg)
+        , _bootstrap_tokens(bootstrap_tokens)
+        , _token_metadata(token_metadata)
+        , _gossiper(gossiper)
+    {}
+
+    /*
+     * Generate a set of CDC stream identifiers such that for each shard
+     * and vnode pair there exists a stream whose token falls into this vnode
+     * and is owned by this shard. It is sometimes not possible to generate
+     * a CDC stream identifier for some (vnode, shard) pair because not all
+     * shards have to own tokens in a vnode. Small vnode can be totally owned
+     * by a single shard. In such case, a stream identifier that maps to
+     * end of the vnode is generated.
+     *
+     * Then build a cdc::topology_description which maps tokens to generated
+     * stream identifiers, such that if token T is owned by shard S in vnode V,
+     * it gets mapped to the stream identifier generated for (S, V).
+     *
+     * Throws std::runtime_error if there are no tokens at all (neither in the
+     * token metadata nor among the bootstrap tokens).
+     */
+    // Run in seastar::async context.
+    topology_description generate() const {
+        const auto tokens = get_tokens();
+
+        // back()/front() below are undefined on an empty container, so fail
+        // loudly instead of generating a description for an empty ring.
+        if (tokens.empty()) {
+            throw std::runtime_error(
+                    "cdc: cannot generate a topology description: the token ring is empty");
+        }
+
+        std::vector<token_range_description> vnode_descriptions;
+        vnode_descriptions.reserve(tokens.size());
+
+        // The first vnode wraps around the ring: it starts after the last token
+        // and ends at the first one.
+        vnode_descriptions.push_back(
+                create_description(tokens.back(), tokens.front()));
+        for (size_t idx = 1; idx < tokens.size(); ++idx) {
+            vnode_descriptions.push_back(
+                    create_description(tokens[idx - 1], tokens[idx]));
+        }
+
+        return {std::move(vnode_descriptions)};
+    }
+};
+
+// Returns true iff no endpoint known to the gossiper has a host id greater
+// than the host id of `me`.
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
+    auto my_host_id = g.get_host_id(me);
+    auto& eps = g.get_endpoint_states();
+    // Take each element as `const auto&`: spelling the parameter as
+    // std::pair<K, V> does not match a map's value_type (std::pair<const K, V>),
+    // which makes every call bind to a temporary copy of the endpoint state.
+    return std::none_of(eps.begin(), eps.end(),
+            [&] (const auto& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
+}
+
+// Resolves to the CDC streams timestamp saved by db::system_keyspace.
+// If no timestamp was saved, the error is logged and the returned future
+// fails with std::runtime_error.
+future<db_clock::time_point> get_local_streams_timestamp() {
+    return db::system_keyspace::get_saved_cdc_streams_timestamp().then([] (std::optional<db_clock::time_point> saved_ts) {
+        if (saved_ts) {
+            return *saved_ts;
+        }
+
+        auto msg = format("get_local_streams_timestamp: tried to retrieve streams timestamp after bootstrapping, but it's not present");
+        cdc_log.error("{}", msg);
+        throw std::runtime_error(msg);
+    });
+}
+
+// Generates a topology description for the current ring plus `bootstrap_tokens`,
+// stores it through `sys_dist_ks` and returns the timestamp chosen for it.
+// The timestamp is the current db_clock time plus `2 * ring_delay` plus
+// `generation_leeway`; with `for_testing` set no delay is added.
+// Blocks on the insert, hence:
+// Run inside seastar::async context.
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing) {
+    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();
+
+    // How far into the future the new generation starts operating.
+    std::chrono::milliseconds start_delay(0);
+    if (!for_testing) {
+        start_delay = 2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway);
+    }
+
+    // Begin the race.
+    auto ts = db_clock::now() + start_delay;
+    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();
+
+    return ts;
+}
+
+// Fetches the CDC_STREAMS_TIMESTAMP application state value that the gossiper
+// holds for `endpoint`, traces it, and returns the result of parsing it with
+// gms::versioned_value::cdc_streams_timestamp_from_string().
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto gossiped_value = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
+    cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, gossiped_value);
+    return gms::versioned_value::cdc_streams_timestamp_from_string(gossiped_value);
+}
+
+// Makes sure the CDC description for generation `streams_ts` exists:
+// if it is not there yet, reads the generation's topology description,
+// collects the distinct stream ids of all its token ranges (in sorted order)
+// and stores them with create_cdc_desc().
+// Throws std::runtime_error if the topology description cannot be found.
+// Blocks on the futures it waits for, hence:
+// Run inside seastar::async context.
+static void do_update_streams_description(
+        db_clock::time_point streams_ts,
+        db::system_distributed_keyspace& sys_dist_ks,
+        db::system_distributed_keyspace::context ctx) {
+    auto already_described = sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0();
+    if (already_described) {
+        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
+        return;
+    }
+
+    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
+
+    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    if (!topo) {
+        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+    }
+
+    // The set both de-duplicates the ids and orders them.
+    std::set<cdc::stream_id> unique_streams;
+    for (auto& range_desc : topo->entries()) {
+        for (auto& id : range_desc.streams) {
+            unique_streams.insert(id);
+        }
+    }
+
+    std::vector<cdc::stream_id> streams_vec(unique_streams.begin(), unique_streams.end());
+
+    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
+}
+
+// Tries to update the CDC description table with generation `streams_ts`.
+// If the first attempt throws, a warning is logged and a background task is
+// started which retries every 60 seconds until it succeeds or `abort_src`
+// fires. Returning from this function therefore does not imply success.
+//
+// The first attempt blocks, so this must run inside seastar::async context.
+//
+// NOTE(review): the background task captures `abort_src` by reference, so the
+// abort source has to outlive the task -- confirm at the call sites.
+void update_streams_description(
+        db_clock::time_point streams_ts,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    try {
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+    } catch(...) {
+        cdc_log.warn(
+            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
+            streams_ts, std::current_exception());
+
+        // It is safe to discard this future: we keep system distributed keyspace alive.
+        (void)seastar::async([
+            streams_ts, sys_dist_ks, get_num_token_owners = std::move(get_num_token_owners), &abort_src
+        ] {
+            while (true) {
+                try {
+                    sleep_abortable(std::chrono::seconds(60), abort_src).get();
+                } catch (const seastar::sleep_aborted&) {
+                    // The future of this task is discarded, so an exception escaping
+                    // from here would only surface as an ignored exceptional future.
+                    // An abort simply means we should stop retrying.
+                    cdc_log.debug(
+                        "Aborted while waiting to retry the CDC description table update with generation {}.",
+                        streams_ts);
+                    return;
+                }
+                try {
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    return;
+                } catch (...) {
+                    cdc_log.warn(
+                        "Could not update CDC description table with generation {}: {}. Will try again.",
+                        streams_ts, std::current_exception());
+                }
+            }
+        });
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* This module contains classes and functions used to manage CDC generations:
+ * sets of CDC stream identifiers used by the cluster to choose partition keys for CDC log writes.
+ * Each CDC generation begins operating at a specific time point, called the generation's timestamp
+ * (`cdc_streams_timestamp` or `streams_timestamp` in the code).
+ * The generation is used by all nodes in the cluster to pick CDC streams until superseded by a new generation.
+ *
+ * Functions from this module are used by the node joining procedure to introduce new CDC generations to the cluster
+ * (which is necessary due to new tokens being inserted into the token ring), or during rolling upgrade
+ * if CDC is enabled for the first time.
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_set>
+#include <seastar/util/noncopyable_function.hh>
+
+#include "database_fwd.hh"
+#include "db_clock.hh"
+#include "dht/token.hh"
+
+namespace seastar {
+    class abort_source;
+} // namespace seastar
+
+namespace db {
+    class config;
+    class system_distributed_keyspace;
+} // namespace db
+
+namespace gms {
+    class inet_address;
+    class gossiper;
+} // namespace gms
+
+namespace locator {
+    class token_metadata;
+} // namespace locator
+
+namespace cdc {
+
+// Identifier of a CDC stream, stored as a byte string.
+class stream_id final {
+    bytes _value;
+public:
+    // Creates a stream id holding a default-constructed byte string.
+    stream_id() = default;
+    // Creates a stream id from two integers (retrievable with first() and second()).
+    stream_id(int64_t, int64_t);
+    // Creates a stream id from an already serialized value.
+    stream_id(bytes);
+    // True when the underlying byte string is non-empty.
+    bool is_set() const;
+    bool operator==(const stream_id&) const;
+    bool operator<(const stream_id&) const;
+
+    // The two integers encoded in the id.
+    int64_t first() const;
+    int64_t second() const;
+
+    // The underlying byte string.
+    const bytes& to_bytes() const;
+
+    // Partition key (for the given CDC log schema) made of this id's bytes.
+    partition_key to_partition_key(const schema& log_schema) const;
+    // Decodes the integer stored at the beginning of a serialized stream id.
+    static int64_t token_from_bytes(bytes_view);
+};
+
+/* Describes a mapping of tokens to CDC streams in a token range.
+ *
+ * The range ends with `token_range_end`. A vector of `token_range_description`s defines the ranges entirely
+ * (the end of the `i`th range is the beginning of the `(i+1) % size()`th range). Ranges are left-open, right-closed.
+ *
+ * Tokens in the range ending with `token_range_end` are mapped to streams in the `streams` vector as follows:
+ * token `T` is mapped to `streams[j]` if and only if the used partitioner maps `T` to the `j`th shard,
+ * assuming that the partitioner is configured for `streams.size()` shards and (partitioner's) `sharding_ignore_msb`
+ * equals to the given `sharding_ignore_msb`.
+ */
+struct token_range_description {
+    // Last token of the range (inclusive).
+    dht::token token_range_end;
+    // One stream per shard; `streams[j]` serves the `j`th shard.
+    std::vector<stream_id> streams;
+    // Sharding parameter that must be used when mapping tokens to shards for this range.
+    uint8_t sharding_ignore_msb;
+
+    bool operator==(const token_range_description&) const;
+};
+
+
+/* Describes a mapping of tokens to CDC streams in a whole token ring.
+ *
+ * Division of the ring to token ranges is defined in terms of `token_range_end`s
+ * in the `_entries` vector. See the comment above `token_range_description` for explanation.
+ */
+class topology_description {
+    std::vector<token_range_description> _entries;
+public:
+    // Takes ownership of the given range descriptions.
+    topology_description(std::vector<token_range_description> entries);
+    bool operator==(const topology_description&) const;
+
+    // Read-only access to the range descriptions.
+    const std::vector<token_range_description>& entries() const;
+};
+
+/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
+ * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
+ * that there's a bug, or the user messed with our local tables).
+ *
+ * It checks whether we should be the node to propose the first generation of CDC streams.
+ * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
+ * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
+ */
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
+
+/*
+ * Read this node's streams generation timestamp stored in the LOCAL table.
+ * Assumes that the node has successfully bootstrapped, and we're not upgrading from a non-CDC version,
+ * so the timestamp is present.
+ */
+future<db_clock::time_point> get_local_streams_timestamp();
+
+/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
+ * Returns the timestamp of this new generation.
+ *
+ * Should be called when starting the node for the first time (i.e., joining the ring).
+ *
+ * Assumes that the system_distributed keyspace is initialized.
+ *
+ * The caller of this function is expected to insert this timestamp into the gossiper as fast as possible,
+ * so that other nodes learn about the generation before their clocks cross the timestamp
+ * (not guaranteed in the current implementation, but expected to be the common case;
+ *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
+ */
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing);
+
+/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
+ * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
+ * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
+ * which means it will gossip the generation's timestamp.
+ */
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
+
+/* Inform CDC users about a generation of streams (identified by the given timestamp)
+ * by inserting it into the cdc_streams table.
+ *
+ * Assumes that the cdc_generations table contains this generation.
+ *
+ * Returning from this function does not mean that the table update was successful: the function
+ * might run an asynchronous task in the background.
+ *
+ * Run inside seastar::async context.
+ */
+void update_streams_description(
+        db_clock::time_point,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
+} // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This module manages CDC log tables. It contains facilities used to:
+ * - perform schema changes to CDC log tables correspondingly when base tables are changed,
+ * - perform writes to CDC log tables correspondingly when writes to base tables are made.
+ */
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+
+#include "exceptions/exceptions.hh"
+#include "timestamp.hh"
+#include "tracing/trace_state.hh"
+#include "cdc_options.hh"
+#include "utils/UUID.hh"
+
+class schema;
+using schema_ptr = seastar::lw_shared_ptr<const schema>;
+
+namespace locator {
+
+class token_metadata;
+
+} // namespace locator
+
+namespace service {
+
+class migration_notifier;
+class storage_proxy;
+class query_state;
+
+} // namespace service
+
+class mutation;
+class partition_key;
+
+namespace cdc {
+
+// Forward declarations. The class-key of each declaration should match the
+// definition: db_context is defined as a struct further down in this header.
+struct operation_result_tracker;
+struct db_context;
+class metadata;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// CDC service will listen for schema changes and iff CDC is enabled/changed
+/// create/modify/delete corresponding log tables etc as part of the schema change.
+///
+class cdc_service final : public async_sharded_service<cdc::cdc_service> {
+    // The implementation is hidden behind a pointer; `impl` is only declared here.
+    class impl;
+    std::unique_ptr<impl> _impl;
+public:
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();
+
+    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
+    // appropriate augments to set the log entries.
+    // Iff post-image is enabled for any of these, a non-empty callback is also
+    // returned to be invoked post the mutation query.
+    // NOTE(review): the second element of the returned tuple is a pointer to an
+    // operation_result_tracker; the "callback" wording above presumably refers
+    // to it -- confirm against the implementation.
+    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations,
+        tracing::trace_state_ptr tr_state,
+        db::consistency_level write_cl
+        );
+    // Tells whether augment_mutation_call() needs to be invoked for these mutations.
+    // NOTE(review): inferred from the name; confirm against the implementation.
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
+};
+
+// Bundle of references to the services the CDC service works with.
+// It only stores references, so the referenced objects must outlive it.
+struct db_context final {
+    service::storage_proxy& _proxy;
+    service::migration_notifier& _migration_notifier;
+    locator::token_metadata& _token_metadata;
+    cdc::metadata& _cdc_metadata;
+
+    // Step-by-step construction of a db_context: only the storage proxy is
+    // mandatory, the remaining dependencies may be supplied through the
+    // with_*() setters before calling build().
+    // NOTE(review): how build() fills in dependencies that were not supplied
+    // is not visible in this header -- see the implementation.
+    class builder final {
+        service::storage_proxy& _proxy;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
+        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
+        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
+    public:
+        builder(service::storage_proxy& proxy);
+
+        // Each setter returns *this-style builder reference to allow chaining.
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_cdc_metadata(cdc::metadata&);
+
+        db_context build();
+    };
+};
+
+// cdc log table operation
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
+    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
+    post_image = 9,
+};
+
+bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
+seastar::sstring log_name(const seastar::sstring& table_name);
+seastar::sstring log_data_column_name(std::string_view column_name);
+seastar::sstring log_meta_column_name(std::string_view column_name);
+bytes log_data_column_name_bytes(const bytes& column_name);
+bytes log_meta_column_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_name(std::string_view column_name);
+bytes log_data_column_deleted_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
+bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);
+
+utils::UUID generate_timeuuid(api::timestamp_type t);
+
+} // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dht/token-sharding.hh"
+#include "utils/exceptions.hh"
+#include "exceptions/exceptions.hh"
+
+#include "cdc/generation.hh"
+#include "cdc/metadata.hh"
+
+extern logging::logger cdc_log;
+
+namespace cdc {
+    extern const api::timestamp_clock::duration generation_leeway;
+} // namespace cdc
+
+// Converts a db_clock time point into an api::timestamp_type, i.e. the number
+// of timestamp_clock ticks elapsed since the epoch.
+static api::timestamp_type to_ts(db_clock::time_point tp) {
+    // This assumes that timestamp_clock and db_clock have the same epochs.
+    auto since_epoch = tp.time_since_epoch();
+    return std::chrono::duration_cast<api::timestamp_clock::duration>(since_epoch).count();
+}
+
+// Picks, within a single token range description, the stream that serves
+// token `tok`: the stream at the index of the shard that dht::shard_of()
+// assigns to the token, given the range's shard count and ignore_msb setting.
+// Reports an internal error if the computed shard index is out of bounds.
+static cdc::stream_id get_stream(
+        const cdc::token_range_description& entry,
+        dht::token tok) {
+    // The ith stream is the stream for the ith shard.
+    const auto& streams = entry.streams;
+    auto shard_cnt = streams.size();
+    auto shard_id = dht::shard_of(shard_cnt, entry.sharding_ignore_msb, tok);
+
+    if (shard_id >= shard_cnt) {
+        on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
+    }
+
+    return streams[shard_id];
+}
+
+// Picks the stream that serves token `tok` in a whole-ring description.
+// The owning range is the first one whose end is not less than `tok`; when
+// there is none, the token belongs to the wrap-around range, i.e. the first entry.
+// Reports an internal error if `entries` is empty.
+static cdc::stream_id get_stream(
+        const std::vector<cdc::token_range_description>& entries,
+        dht::token tok) {
+    if (entries.empty()) {
+        on_internal_error(cdc_log, "get_stream: entries empty");
+    }
+
+    auto ends_before = [] (const cdc::token_range_description& e, dht::token t) {
+        return e.token_range_end < t;
+    };
+    auto owner = std::lower_bound(entries.begin(), entries.end(), tok, ends_before);
+
+    return get_stream(owner != entries.end() ? *owner : *entries.begin(), tok);
+}
+
+// Finds the generation in use at time `ts`: the known generation with the
+// greatest timestamp that is not greater than `ts`.
+// Returns _gens.end() when every known generation starts after `ts`
+// (which includes the case of no known generations).
+cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
+    auto first_later = _gens.upper_bound(ts);
+    // If the first generation later than `ts` is also the first one overall,
+    // all known generations have higher timestamps than `ts`.
+    return first_later == _gens.begin() ? _gens.end() : std::prev(first_later);
+}
+
+// Chooses the CDC stream for a write with timestamp `ts` to token `tok`.
+//
+// Throws exceptions::invalid_request_exception when `ts` is ahead of the local
+// clock by more than `generation_leeway`, or when `ts` is earlier than the
+// start of the generation currently in use.
+// Throws std::runtime_error when no generation is in use at the current time,
+// or when the selected generation is known only by timestamp (its description
+// has not been inserted yet).
+//
+// Side effects: generations older than the one currently in use are erased,
+// and on success `ts` is remembered in `_last_stream_timestamp`.
+cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
+    auto now = api::new_timestamp();
+    if (ts > now + generation_leeway.count()) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
+                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
+                " know what streams will be used at that time.\n"
+                "We *do* allow sending writes into the near future, but our ability to do that is limited."
+                " If you really must use your own timestamps, then make sure your clocks are well-synchronized"
+               "  with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        // Note that we might still send a write to a wrong generation, if we learn about the current
+        // generation too late (we might think that an earlier generation is the current one).
+        // Nothing protects us from that until we start using transactions for generation switching.
+    }
+
+    // The generation in use according to our own clock (not according to `ts`).
+    auto it = gen_used_at(now);
+    if (it == _gens.end()) {
+        throw std::runtime_error(format(
+                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+    }
+
+    // Garbage-collect generations that will no longer be used.
+    // erase() returns an iterator to the first element not erased, i.e. the current generation.
+    it = _gens.erase(_gens.begin(), it);
+
+    if (it->first > ts) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream from an earlier generation than the currently used one."
+                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                " consistency properties (write timestamp: {}, current generation started at: {})",
+                format_timestamp(ts), format_timestamp(it->first)));
+    }
+
+    // With `generation_leeway` we allow sending writes to the near future. It might happen
+    // that `ts` doesn't belong to the current generation ("current" according to our clock),
+    // but to the next generation. Adjust for this case:
+    {
+        // Advance to the last known generation whose timestamp is not greater than `ts`.
+        auto next_it = std::next(it);
+        while (next_it != _gens.end() && next_it->first <= ts) {
+            it = next_it++;
+        }
+    }
+    // Note: if there is a next generation that `ts` belongs to, but we don't know about it,
+    // then too bad. This is no different from the situation in which we didn't manage to learn
+    // about the current generation in time. We won't be able to prevent it until we introduce transactions.
+
+    // An empty optional means the generation was only announced (see prepare()).
+    if (!it->second) {
+        throw std::runtime_error(format(
+                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
+                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
+                " this generation's data are alive and reachable from this node.", format_timestamp(it->first), format_timestamp(ts)));
+    }
+
+    auto& gen = *it->second;
+    // `::get_stream` is the file-local helper working on a vector of range descriptions.
+    auto ret = ::get_stream(gen.entries(), tok);
+    _last_stream_timestamp = ts;
+    return ret;
+}
+
+// Tells whether the generation with timestamp `tp` needs no further handling:
+// either its description has already been inserted, or some generation with a
+// greater timestamp has already started operating according to the local clock.
+// A generation that is known only by timestamp (no description yet) does not
+// count as known by itself.
+bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
+    auto ts = to_ts(tp);
+    auto it = _gens.lower_bound(ts);
+
+    if (it == _gens.end()) {
+        // No known generations with timestamp >= ts.
+        return false;
+    }
+
+    if (it->first == ts) {
+        if (it->second) {
+            // We already inserted this particular generation.
+            return true;
+        }
+        // Known by timestamp only: move on to the first strictly newer generation.
+        ++it;
+    }
+
+    // Check if some new generation has already superseded this one.
+    return it != _gens.end() && it->first <= api::new_timestamp();
+}
+
+// Stores the description `gen` of the generation with timestamp `tp`,
+// replacing a timestamp-only entry for it if there is one.
+// Returns false, leaving everything untouched, when the generation is already
+// known or obsolete; returns true otherwise.
+// As a side effect, generations older than the one currently in use are erased.
+bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    auto current_gen = gen_used_at(api::new_timestamp());
+    if (current_gen != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        _gens.erase(_gens.begin(), current_gen);
+    }
+
+    _gens.insert_or_assign(to_ts(tp), std::move(gen));
+    return true;
+}
+
+// Records that a generation with timestamp `tp` exists, without its
+// description (the entry holds an empty optional until insert() is called).
+// Returns false when the generation is already known or obsolete, or when an
+// entry for this timestamp was already present; returns true when a new entry
+// was added.
+// When a new entry is added and it turns out to be the generation that should
+// have served the last successful get_stream() call, an error is logged.
+bool cdc::metadata::prepare(db_clock::time_point tp) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    auto ts = to_ts(tp);
+    auto emplaced = _gens.emplace(ts, std::nullopt).second;
+
+    // Nothing to cross-check if no entry was added or no stream was ever retrieved.
+    if (!emplaced || _last_stream_timestamp == api::missing_timestamp) {
+        return emplaced;
+    }
+
+    auto last_correct_gen = gen_used_at(_last_stream_timestamp);
+    if (last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
+        cdc_log.error(
+            "just learned about a CDC generation newer than the one used the last time"
+            " streams were retrieved. This generation, or some newer one, should have"
+            " been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
+            " The new generation probably arrived too late due to a network partition"
+            " and we've made a write using the wrong set streams.",
+            format_timestamp(ts), format_timestamp(_last_stream_timestamp));
+    }
+
+    return emplaced;
+}
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
+namespace dht {
+    class token;
+}
+
+namespace cdc {
+
+class stream_id;
+class topology_description;
+
+/* Represents the node's knowledge about CDC generations used in the cluster.
+ * Used during writes to pick streams to which CDC log writes should be sent to
+ * (i.e., to pick partition keys for these writes).
+ */
+class metadata final {
+    // Note: we use db_clock (1ms resolution) for generation timestamps
+    // (because we need to insert them into tables using columns of timestamp types,
+    //  and the native type of our columns' timestamp_type is db_clock::time_point).
+    // On the other hand, timestamp_clock (1us resolution) is used for mutation timestamps,
+    // and api::timestamp_type represents the number of ticks of a timestamp_clock::time_point since epoch.
+
+    // Generations keyed by timestamp. An entry holds std::nullopt while the generation has only
+    // been announced through `prepare` and its topology_description has not been inserted yet.
+    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
+    container_t _gens;
+
+    /* The timestamp used in the last successful `get_stream` call. */
+    api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
+
+    // Iterator to the generation in use at `ts`, or _gens.end() if there is none
+    // (presumably the latest generation whose timestamp is <= ts; the definition lives in the .cc file).
+    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
+public:
+    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    bool known_or_obsolete(db_clock::time_point) const;
+
+    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
+     * according to the generation used at time `ts` (i.e, the latest generation whose timestamp is less than or equal to `ts`).
+     *
+     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
+     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
+     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
+     * by the `cdc::generation_leeway` constant.
+     */
+    stream_id get_stream(api::timestamp_type ts, dht::token tok);
+
+    /* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
+     * if the generation is not already known or older than the currently known ones.
+     *
+     * Returns true if the generation was inserted,
+     * meaning that `get_stream` might return a stream from this generation (at some time points).
+     */
+    bool insert(db_clock::time_point ts, topology_description&& gen);
+
+    /* Prepare for inserting a new generation whose timestamp is `ts`.
+     * This method is not required to be called before `insert`, but it's here
+     * to increase safety of `get_stream` calls in some situations. Use it if you:
+     * 1. know that there is a new generation, but
+     * 2. you didn't yet retrieve the generation's topology_description.
+     *
+     * After preparing a generation, if `get_stream` is supposed to return a stream from this generation
+     * but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
+     *
+     * Returns true iff this generation is not obsolete and wasn't previously prepared nor inserted.
+     */
+    bool prepare(db_clock::time_point ts);
+};
+
+} // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -0,0 +1,493 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mutation.hh"
+#include "schema.hh"
+
+#include "split.hh"
+#include "log.hh"
+
+struct atomic_column_update {
+    column_id id;
+    atomic_cell cell;
+};
+
+struct nonatomic_column_update {
+    column_id id;
+    tombstone t; // optional
+    utils::chunked_vector<std::pair<bytes, atomic_cell>> cells;
+};
+
+struct static_row_update {
+    gc_clock::duration ttl;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_update> nonatomic_entries;
+};
+
+struct clustered_row_insert {
+    gc_clock::duration ttl;
+    clustering_key key;
+    row_marker marker;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_update> nonatomic_entries;
+};
+
+struct clustered_row_update {
+    gc_clock::duration ttl;
+    clustering_key key;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_update> nonatomic_entries;
+};
+
+struct clustered_row_deletion {
+    clustering_key key;
+    tombstone t;
+};
+
+struct clustered_range_deletion {
+    range_tombstone rt;
+};
+
+struct partition_deletion {
+    tombstone t;
+};
+
+struct batch {
+    std::vector<static_row_update> static_updates;
+    std::vector<clustered_row_insert> clustered_inserts;
+    std::vector<clustered_row_update> clustered_updates;
+    std::vector<clustered_row_deletion> clustered_row_deletions;
+    std::vector<clustered_range_deletion> clustered_range_deletions;
+    std::optional<partition_deletion> partition_deletions;
+};
+
+using set_of_changes = std::map<api::timestamp_type, batch>;
+
+struct row_update {
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_update> nonatomic_entries;
+};
+
+// Groups the cells of row `r` (columns of kind `ckind` in `schema`) by their
+// (timestamp, TTL) pair. Cells that are not live-with-TTL are filed under TTL 0.
+// Each group lists its atomic cells and, per collection column, the collection
+// cells and/or collection tombstone belonging to that pair.
+static
+std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
+extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
+    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> result;
+    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        auto& cdef = schema.column_at(ckind, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            auto timestamp_and_ttl = std::pair(
+                    view.timestamp(),
+                    view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0)
+                );
+            result[timestamp_and_ttl].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
+            return;
+        }
+
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                auto timestamp_and_ttl = std::pair(
+                        v.timestamp(),
+                        v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0)
+                    );
+                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
+                // Reuse the entry for this column if it is the most recently added one.
+                if (updates.empty() || updates.back().id != id) {
+                    updates.push_back({id, {}});
+                }
+                updates.back().cells.push_back({std::move(k), std::move(v)});
+            }
+
+            if (desc.tomb) {
+                // The tombstone is filed under its timestamp + 1 so that it lands in the same
+                // group as cells written one tick later than the tombstone.
+                auto timestamp_and_ttl = std::pair(desc.tomb.timestamp + 1, gc_clock::duration(0));
+                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
+                if (updates.empty() || updates.back().id != id) {
+                    updates.push_back({id, {}});
+                }
+                updates.back().t = std::move(desc.tomb);
+            }
+        });
+    });
+    return result;
+}
+
+// Decomposes `base_mutation` into a map from timestamp to `batch`, where each batch collects
+// the static-row updates, clustered-row inserts/updates, row deletions, range deletions and
+// partition deletion that carry that timestamp.
+set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
+    set_of_changes res;
+    auto& p = base_mutation.partition();
+
+    auto sr_updates = extract_row_updates(p.static_row().get(), column_kind::static_column, base_schema);
+    for (auto& [k, up]: sr_updates) {
+        auto [timestamp, ttl] = k;
+        res[timestamp].static_updates.push_back({
+                ttl,
+                std::move(up.atomic_entries),
+                std::move(up.nonatomic_entries)
+            });
+    }
+
+    for (const rows_entry& cr : p.clustered_rows()) {
+        auto cr_updates = extract_row_updates(cr.row().cells(), column_kind::regular_column, base_schema);
+
+        const auto& marker = cr.row().marker();
+        auto marker_timestamp = marker.timestamp();
+        auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
+        if (marker.is_live()) {
+            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
+            (void)cr_updates[std::pair(marker_timestamp, marker_ttl)];
+        }
+
+        // A group is an INSERT iff the row has a live marker with exactly this timestamp and TTL.
+        auto is_insert = [&] (api::timestamp_type timestamp, gc_clock::duration ttl) {
+            if (!marker.is_live()) {
+                return false;
+            }
+
+            return timestamp == marker_timestamp && ttl == marker_ttl;
+        };
+
+        for (auto& [k, up]: cr_updates) {
+            // It is important that changes in the resulting `set_of_changes` are listed
+            // in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
+            // search for "#6070".
+            auto [timestamp, ttl] = k;
+
+            if (is_insert(timestamp, ttl)) {
+                res[timestamp].clustered_inserts.push_back({
+                        ttl,
+                        cr.key(),
+                        marker,
+                        std::move(up.atomic_entries),
+                        {}
+                    });
+
+                auto& cr_insert = res[timestamp].clustered_inserts.back();
+                bool clustered_update_exists = false;
+                for (auto& nonatomic_up: up.nonatomic_entries) {
+                    // Updating a collection column with an INSERT statement implies inserting a tombstone.
+                    //
+                    // For example, suppose that we have:
+                    //     CREATE TABLE t (a int primary key, b map<int, int>);
+                    // Then the following statement:
+                    //     INSERT INTO t (a, b) VALUES (0, {0:0}) USING TIMESTAMP T;
+                    // creates a tombstone in column b with timestamp T-1.
+                    // It also creates a cell (0, 0) with timestamp T.
+                    //
+                    // There is no way to create just the cell using an INSERT statement.
+                    // This can only be done using an UPDATE, as follows:
+                    //     UPDATE t USING TIMESTAMP T SET b = b + {0:0} WHERE a = 0;
+                    // note that this is different from
+                    //     UPDATE t USING TIMESTAMP T SET b = {0:0} WHERE a = 0;
+                    // which also creates a tombstone with timestamp T-1.
+                    //
+                    // It follows that:
+                    // - if `nonatomic_up` has a tombstone, it can be merged with our `cr_insert`,
+                    //   which represents an INSERT change.
+                    // - but if `nonatomic_up` only has cells, we must create a separate UPDATE change
+                    //   for the cells alone.
+                    if (nonatomic_up.t) {
+                        cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
+                    } else {
+                        if (!clustered_update_exists) {
+                            res[timestamp].clustered_updates.push_back({
+                                ttl,
+                                cr.key(),
+                                {},
+                                {}
+                            });
+
+                            // Multiple iterations of this `for` loop (for different collection columns)
+                            // might want to put their `nonatomic_up`s into an UPDATE change;
+                            // but we don't want to create a separate change for each of them, reusing one instead.
+                            //
+                            // Example:
+                            // CREATE TABLE t (a int primary key, b map<int, int>, c map <int, int>) with cdc = {'enabled':true};
+                            // insert into t (a, b, c) values (0, {1:1}, {2:2}) USING TTL 5;
+                            //
+                            // this should create 3 delta rows:
+                            // 1. one for the row marker (indicating an INSERT), with TTL 5
+                            // 2. one for the b and c tombstones, without TTL (cdc$ttl = null)
+                            // 3. one for the b and c cells, with TTL 5
+                            // This logic takes care that b cells and c cells are put into a single change (3. above).
+                            clustered_update_exists = true;
+                        }
+
+                        auto& cr_update = res[timestamp].clustered_updates.back();
+                        cr_update.nonatomic_entries.push_back(std::move(nonatomic_up));
+                    }
+                }
+            } else {
+                res[timestamp].clustered_updates.push_back({
+                        ttl,
+                        cr.key(),
+                        std::move(up.atomic_entries),
+                        std::move(up.nonatomic_entries)
+                    });
+            }
+        }
+
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            res[row_tomb.timestamp].clustered_row_deletions.push_back({cr.key(), row_tomb});
+        }
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb.timestamp != api::missing_timestamp) {
+            res[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
+        }
+    }
+
+    auto partition_tomb_timestamp = p.partition_tombstone().timestamp;
+    if (partition_tomb_timestamp != api::missing_timestamp) {
+        res[partition_tomb_timestamp].partition_deletions = {p.partition_tombstone()};
+    }
+
+    return res;
+}
+
+namespace cdc {
+
+// Tells whether `base_mutation` must be broken up before it is processed.
+//
+// Returns true when the mutation contains more than one (timestamp, TTL) pair, mixes
+// different kinds of changes (static row, clustered rows, row/range/partition tombstones),
+// contains collection cells in a row with a live marker, or carries no timestamp at all.
+// Returns false otherwise.
+bool should_split(const mutation& base_mutation, const schema& base_schema) {
+    auto& p = base_mutation.partition();
+
+    // The single timestamp / TTL seen so far; both are shared by every part of the partition.
+    api::timestamp_type found_ts = api::missing_timestamp;
+    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+
+    // Records (ts, ttl) on first use; returns true if it conflicts with what was recorded before.
+    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
+        if (found_ts != api::missing_timestamp && found_ts != ts) {
+            return true;
+        }
+        found_ts = ts;
+
+        if (found_ttl && *found_ttl != ttl) {
+            return true;
+        }
+        found_ttl = ttl;
+
+        return false;
+    };
+
+    bool had_static_row = false;
+
+    bool should_split = false;
+    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        had_static_row = true;
+
+        auto& cdef = base_schema.column_at(column_kind::static_column, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                should_split = true;
+            }
+            return;
+        }
+
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+
+            if (desc.tomb) {
+                if (check_or_set(desc.tomb.timestamp + 1, gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+        });
+    });
+
+    if (should_split) {
+        return true;
+    }
+
+    bool had_clustered_row = false;
+
+    if (!p.clustered_rows().empty() && had_static_row) {
+        return true;
+    }
+    for (const rows_entry& cr : p.clustered_rows()) {
+        had_clustered_row = true;
+
+        const auto& marker = cr.row().marker();
+        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
+            return true;
+        }
+
+        bool is_insert = marker.is_live();
+
+        bool had_cells = false;
+        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+            had_cells = true;
+
+            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
+            if (cdef.is_atomic()) {
+                auto view = cell.as_atomic_cell(cdef);
+                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                }
+                return;
+            }
+
+            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+                for (auto& [k, v]: mview.cells) {
+                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+
+                    if (is_insert) {
+                        // nonatomic updates cannot be expressed with an INSERT.
+                        should_split = true;
+                        return;
+                    }
+                }
+
+                if (mview.tomb) {
+                    if (check_or_set(mview.tomb.timestamp + 1, gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+                }
+            });
+        });
+
+        if (should_split) {
+            return true;
+        }
+
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            if (had_cells) {
+                return true;
+            }
+
+            // `found_ttl` is shared by all rows: even though this row had no cells, its own live
+            // marker or an earlier row may already have set it. Ask for a split in that case
+            // instead of asserting, which would abort on such (valid) mutations.
+            if (found_ttl || (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp)) {
+                return true;
+            }
+
+            found_ts = row_tomb.timestamp;
+        }
+    }
+
+    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb) {
+            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
+                return true;
+            }
+
+            found_ts = rt.tomb.timestamp;
+        }
+    }
+
+    if (p.partition_tombstone().timestamp != api::missing_timestamp
+            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    // A mutation with no timestamp will be split into 0 mutations
+    return found_ts == api::missing_timestamp;
+}
+
+// Splits `base_mutation` with extract_changes() and calls `f` once for every resulting change.
+//
+// Changes are visited in increasing timestamp order; within one timestamp the order is:
+// static row updates, clustered inserts, clustered updates, row deletions, range deletions,
+// partition deletion. `f` receives a fresh mutation holding only that change, the change's
+// timestamp, the serialized timeuuid generated from that timestamp, and a reference to a
+// counter that starts at 0 for each timestamp (this function itself never modifies it).
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
+    auto changes = extract_changes(base_mutation, *base_schema);
+    auto pk = base_mutation.key();
+
+    for (auto& [change_ts, btch] : changes) {
+        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
+        int batch_no = 0;
+
+        for (auto& sr_update : btch.static_updates) {
+            mutation m(base_schema, pk);
+            for (auto& atomic_update : sr_update.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::static_column, atomic_update.id);
+                m.set_static_cell(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_update : sr_update.nonatomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_update.id);
+                m.set_static_cell(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
+            }
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_insert : btch.clustered_inserts) {
+            mutation m(base_schema, pk);
+
+            auto& row = m.partition().clustered_row(*base_schema, cr_insert.key);
+            for (auto& atomic_update : cr_insert.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
+                row.cells().apply(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_update : cr_insert.nonatomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_update.id);
+                row.cells().apply(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
+            }
+            // Only inserts carry the row marker.
+            row.apply(cr_insert.marker);
+
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_update : btch.clustered_updates) {
+            mutation m(base_schema, pk);
+
+            auto& row = m.partition().clustered_row(*base_schema, cr_update.key).cells();
+            for (auto& atomic_update : cr_update.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
+                row.apply(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_update : cr_update.nonatomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_update.id);
+                row.apply(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
+            }
+
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_delete : btch.clustered_row_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(*base_schema, cr_delete.key, cr_delete.t);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& crange_delete : btch.clustered_range_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(*base_schema, crange_delete.rt);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        if (btch.partition_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply(btch.partition_deletions->t);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+    }
+}
+
+} // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include "schema_fwd.hh"
+#include "timestamp.hh"
+#include "bytes.hh"
+#include <seastar/util/noncopyable_function.hh>
+
+class mutation;
+
+namespace cdc {
+
+bool should_split(const mutation& base_mutation, const schema& base_schema);
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>);
+
+}
--- a/cdc/stats.hh
+++ b/cdc/stats.hh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <seastar/core/metrics_registration.hh>
+#include "enum_set.hh"
+#include "utils/histogram.hh"
+#include "utils/estimated_histogram.hh"
+
+namespace cdc {
+
+class stats final {
+    seastar::metrics::metric_groups _metrics;
+
+public:
+    enum class part_type {
+        STATIC_ROW,
+        CLUSTERING_ROW,
+        MAP,
+        SET,
+        LIST,
+        UDT,
+        RANGE_TOMBSTONE,
+        PARTITION_DELETE,
+        ROW_DELETE,
+
+        MAX
+    };
+
+    using part_type_set = enum_set<super_enum<part_type,
+        part_type::STATIC_ROW,
+        part_type::CLUSTERING_ROW,
+        part_type::MAP,
+        part_type::SET,
+        part_type::LIST,
+        part_type::UDT,
+        part_type::RANGE_TOMBSTONE,
+        part_type::PARTITION_DELETE,
+        part_type::ROW_DELETE
+    >>;
+
+    struct parts_touched_stats final {
+        std::array<uint64_t, (size_t)part_type::MAX> count = {};
+
+        inline void apply(part_type_set parts_set) {
+            for (part_type idx : parts_set) {
+                count[(size_t)idx]++;
+            }
+        }
+
+        void register_metrics(seastar::metrics::metric_groups& metrics, std::string_view suffix);
+    };
+
+    struct counters final {
+        uint64_t unsplit_count = 0;
+        uint64_t split_count = 0;
+        uint64_t preimage_selects = 0;
+        uint64_t with_preimage_count = 0;
+        uint64_t with_postimage_count = 0;
+
+        parts_touched_stats touches;
+    };
+
+    counters counters_total;
+    counters counters_failed;
+
+    stats();
+};
+
+// Contains the details on what happened during a CDC operation.
+struct operation_details final {
+    stats::part_type_set touched_parts;
+    bool was_split = false;
+    bool had_preimage = false;
+    bool had_postimage = false;
+};
+
+// This object tracks the lifetime of write handlers related to one CDC operation. After all
+// write handlers for the operation finish, CDC metrics are updated.
+class operation_result_tracker final {
+    stats& _stats;
+    operation_details _details;
+    bool _failed;
+
+public:
+    operation_result_tracker(stats& stats, operation_details details)
+        : _stats(stats)
+        , _details(details)
+        , _failed(false)
+    {}
+    ~operation_result_tracker();
+
+    void on_mutation_failed() {
+        _failed = true;
+    }
+};
+
+}
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -22,7 +22,10 @@
 #pragma once

 #include "seastar/core/file.hh"
-#include "disk-error-handler.hh"
+#include "seastar/core/seastar.hh"
+#include "utils/disk-error-handler.hh"
+
+#include "seastarx.hh"

 class checked_file_impl : public file_impl {
 public:
@@ -144,7 +147,7 @@ inline open_checked_directory(const io_error_handler& error_handler,
                              sstring name)
 {
    return do_io_check(error_handler, [&] {
-        return engine().open_directory(name).then([&] (file f) {
+        return open_directory(name).then([&] (file f) {
            return make_ready_future<file>(make_checked_file(error_handler, f));
        });
    });
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -19,6 +19,23 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <seastar/core/print.hh>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
 #include "clocks-impl.hh"

 std::atomic<int64_t> clocks_offset;
+
+// Prints `tp` as a UTC calendar time in the form "%Y/%m/%d %T" (second precision).
+std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
+    auto t = db_clock::to_time_t(tp);
+    ::tm t_buf;
+    return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
+}
+
+// Formats `ts`, taken as a tick count of api::timestamp_clock::duration, as a UTC calendar
+// time in the form "%Y/%m/%d %T"; the sub-second part is dropped by the cast to seconds.
+std::string format_timestamp(api::timestamp_type ts) {
+    auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
+    ::tm t_buf;
+    return format("{}", std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T"));
+}
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -24,7 +24,7 @@

 #include <functional>
 #include "keys.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "range.hh"

 /**
@@ -122,26 +122,26 @@ public:
        return {_empty_prefix, bound_kind::incl_end};
    }
    template<template<typename> typename R>
-    GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
+    requires Range<R, clustering_key_prefix_view>
    static bound_view from_range_start(const R<clustering_key_prefix>& range) {
        return range.start()
               ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start)
               : bottom();
    }
    template<template<typename> typename R>
-    GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
+    requires Range<R, clustering_key_prefix>
    static bound_view from_range_end(const R<clustering_key_prefix>& range) {
        return range.end()
               ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end)
               : top();
    }
    template<template<typename> typename R>
-    GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
+    requires Range<R, clustering_key_prefix>
    static std::pair<bound_view, bound_view> from_range(const R<clustering_key_prefix>& range) {
        return {from_range_start(range), from_range_end(range)};
    }
    template<template<typename> typename R>
-    GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
+    requires Range<R, clustering_key_prefix_view>
    static std::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
        if (&bv._prefix.get() == &_empty_prefix) {
            return {};
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "schema_fwd.hh"
+#include "position_in_partition.hh"
+#include <boost/icl/interval_set.hpp>
+#include <boost/range/algorithm/equal.hpp>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+
+// Represents a non-contiguous subset of clustering_key domain of a particular schema.
+// Can be treated like an ordered and non-overlapping sequence of position_range:s.
+class clustering_interval_set {
+    // Needed to make position_in_partition comparable, required by boost::icl::interval_set.
+    class position_in_partition_with_schema {
+        schema_ptr _schema;
+        position_in_partition _pos;
+    public:
+        // Leaves _schema default-initialized.
+        // NOTE(review): operator< and operator== dereference _schema, so a
+        // default-constructed object presumably must never be the left-hand
+        // operand of a comparison -- confirm against boost::icl's usage.
+        position_in_partition_with_schema()
+            : _pos(position_in_partition::for_static_row())
+        { }
+        position_in_partition_with_schema(schema_ptr s, position_in_partition pos)
+            : _schema(std::move(s))
+            , _pos(std::move(pos))
+        { }
+        // Both comparisons use this object's schema only; other's schema is not consulted.
+        bool operator<(const position_in_partition_with_schema& other) const {
+            return position_in_partition::less_compare(*_schema)(_pos, other._pos);
+        }
+        bool operator==(const position_in_partition_with_schema& other) const {
+            return position_in_partition::equal_compare(*_schema)(_pos, other._pos);
+        }
+        const position_in_partition& position() const { return _pos; }
+    };
+private:
+    // We want to represent intervals of clustering keys, not position_in_partitions,
+    // but clustering_key domain is not enough to represent all kinds of clustering ranges.
+    // All intervals in this set are of the form [x, y).
+    using set_type = boost::icl::interval_set<position_in_partition_with_schema>;
+    using interval = boost::icl::interval<position_in_partition_with_schema>;
+    set_type _set;
+public:
+    clustering_interval_set() = default;
+    // Constructs from legacy clustering_row_ranges
+    clustering_interval_set(const schema& s, const query::clustering_row_ranges& ranges) {
+        for (auto&& r : ranges) {
+            add(s, position_range::from_range(r));
+        }
+    }
+    // Converts every interval of this set into an entry of a legacy clustering_row_ranges.
+    query::clustering_row_ranges to_clustering_row_ranges() const {
+        query::clustering_row_ranges result;
+        for (position_range r : *this) {
+            // The flag passed with the start key is true unless its weight is after_all_prefixed;
+            // the flag passed with the end key is true only when its weight is after_all_prefixed.
+            result.push_back(query::clustering_range::make(
+                {r.start().key(), r.start()._bound_weight != bound_weight::after_all_prefixed},
+                {r.end().key(), r.end()._bound_weight == bound_weight::after_all_prefixed}));
+        }
+        return result;
+    }
+    // Input iterator over the intervals of the set, yielding each one as a position_range by value.
+    class position_range_iterator {
+    public:
+        // Spelled out explicitly because the std::iterator helper base is deprecated since C++17.
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const position_range;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const position_range*;
+        using reference = const position_range&;
+    private:
+        set_type::iterator _i;
+    public:
+        position_range_iterator(set_type::iterator i) : _i(i) {}
+        position_range operator*() const {
+            // FIXME: Produce position_range view. Not performance critical yet.
+            const interval::interval_type& iv = *_i;
+            return position_range{iv.lower().position(), iv.upper().position()};
+        }
+        bool operator==(const position_range_iterator& other) const { return _i == other._i; }
+        bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
+        position_range_iterator& operator++() {
+            ++_i;
+            return *this;
+        }
+        position_range_iterator operator++(int) {
+            auto tmp = *this;
+            ++_i;
+            return tmp;
+        }
+    };
+    // Builds the right-open interval [r.start(), r.end()) over schema-aware positions.
+    // Asserts that both ends of r have a clustering key.
+    static interval::type make_interval(const schema& s, const position_range& r) {
+        assert(r.start().has_clustering_key());
+        assert(r.end().has_clustering_key());
+        return interval::right_open(
+            position_in_partition_with_schema(s.shared_from_this(), r.start()),
+            position_in_partition_with_schema(s.shared_from_this(), r.end()));
+    }
+public:
+    // Returns true iff both sets hold equal sequences of intervals. The schema argument is unused.
+    bool equals(const schema& s, const clustering_interval_set& other) const {
+        return boost::equal(_set, other._set);
+    }
+    // Returns true iff a lookup of pos in the underlying interval set finds an interval.
+    bool contains(const schema& s, position_in_partition_view pos) const {
+        // FIXME: Avoid copy
+        return _set.find(position_in_partition_with_schema(s.shared_from_this(), position_in_partition(pos))) != _set.end();
+    }
+    // Returns true iff this set is fully contained in the other set.
+    bool contained_in(const clustering_interval_set& other) const {
+        return boost::icl::within(_set, other._set);
+    }
+    // Returns true iff the interval built from range matches at least one interval of this set.
+    bool overlaps(const schema& s, const position_range& range) const {
+        // FIXME: Avoid copy
+        auto r = _set.equal_range(make_interval(s, range));
+        return r.first != r.second;
+    }
+    // Adds given clustering range to this interval set.
+    // The range may overlap with this set.
+    void add(const schema& s, const position_range& r) {
+        _set += make_interval(s, r);
+    }
+    // Adds every interval of the other set to this one.
+    void add(const schema& s, const clustering_interval_set& other) {
+        for (auto&& r : other) {
+            add(s, r);
+        }
+    }
+    position_range_iterator begin() const { return {_set.begin()}; }
+    position_range_iterator end() const { return {_set.end()}; }
+    friend std::ostream& operator<<(std::ostream&, const clustering_interval_set&);
+};
+
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -23,7 +23,7 @@

 #pragma once

-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "query-request.hh"

 namespace query {
--- a/collection_mutation.cc
+++ b/collection_mutation.cc
@@ -61,7 +61,7 @@ bool collection_mutation_view::is_empty() const {
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>)
+requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>
 static bool is_any_live(const atomic_cell_value_view& data, tombstone tomb, gc_clock::time_point now, F&& read_cell_type_info) {
    auto in = collection_mutation_input_stream(data);
    auto has_tomb = in.read_trivial<bool>();
@@ -108,7 +108,7 @@ bool collection_mutation_view::is_any_live(const abstract_type& type, tombstone
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>)
+requires std::is_invocable_r_v<const data::type_info&, F, collection_mutation_input_stream&>
 static api::timestamp_type last_update(const atomic_cell_value_view& data, F&& read_cell_type_info) {
    auto in = collection_mutation_input_stream(data);
    api::timestamp_type max = api::missing_timestamp;
@@ -313,7 +313,7 @@ collection_mutation collection_mutation_view_description::serialize(const abstra
 }

 template <typename C>
-GCC6_CONCEPT(requires std::is_base_of_v<abstract_type, std::remove_reference_t<C>>)
+requires std::is_base_of_v<abstract_type, std::remove_reference_t<C>>
 static collection_mutation_view_description
 merge(collection_mutation_view_description a, collection_mutation_view_description b, C&& key_type) {
    using element_type = std::pair<bytes_view, atomic_cell_view>;
@@ -375,7 +375,7 @@ collection_mutation merge(const abstract_type& type, collection_mutation_view a,
 }

 template <typename C>
-GCC6_CONCEPT(requires std::is_base_of_v<abstract_type, std::remove_reference_t<C>>)
+requires std::is_base_of_v<abstract_type, std::remove_reference_t<C>>
 static collection_mutation_view_description
 difference(collection_mutation_view_description a, collection_mutation_view_description b, C&& key_type)
 {
@@ -421,7 +421,7 @@ collection_mutation difference(const abstract_type& type, collection_mutation_vi
 }

 template <typename F>
-GCC6_CONCEPT(requires std::is_invocable_r_v<std::pair<bytes_view, atomic_cell_view>, F, collection_mutation_input_stream&>)
+requires std::is_invocable_r_v<std::pair<bytes_view, atomic_cell_view>, F, collection_mutation_input_stream&>
 static collection_mutation_view_description
 deserialize_collection_mutation(collection_mutation_input_stream& in, F&& read_kv) {
    collection_mutation_view_description ret;
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -21,6 +21,8 @@

 #pragma once

+#include <json/json.h>
+
 #include "bytes.hh"

 class schema;
--- a/Show More
+++ b/Show More