release: prepare for 4.0.11

Update seastar submodule
* seastar 065a40b34a...748428930a (1): > append_challenged_posix_file_impl: allow destructing file with no queued work Fixes #7285.
2020-10-26 18:12:47 +02:00 · 2020-10-19 15:06:24 +03:00 · 2020-10-19 15:05:13 +03:00 · 2020-10-18 15:03:04 +03:00 · 2020-10-06 17:12:28 +03:00 · 2020-10-04 18:05:00 +03:00
4964 changed files with 68715 additions and 30666 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,4 @@
 .git
 build
 seastar/build
+testlog
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,5 @@ resources
 .pytest_cache
 /expressions.tokens
 tags
+testlog/*
+test/*/*.reject
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,3 +15,6 @@
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,13 +5,25 @@
 cmake_minimum_required(VERSION 3.7)
 project(scylla)

+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "Dev" "Sanitize")
+endif()
+
+if(CMAKE_BUILD_TYPE)
+    string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE)
+else()
+    set(BUILD_TYPE "release")
+endif()
+
 if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
 endif()

-# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-set(SEASTAR_INCLUDE_DIRS "seastar")
-
 # These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
 # Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
 set(SEASTAR_DPDK_INCLUDE_DIRS
@@ -22,9 +34,14 @@ set(SEASTAR_DPDK_INCLUDE_DIRS

 find_package(PkgConfig REQUIRED)

-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
 pkg_check_modules(SEASTAR seastar)

+if(NOT SEASTAR_INCLUDE_DIRS)
+    # Fallback value, used only when the `pkg-config` lookup above did not find `seastar.pc`.
+    set(SEASTAR_INCLUDE_DIRS "seastar/include")
+endif()
+
 find_package(Boost COMPONENTS filesystem program_options system thread)

 ##
@@ -70,7 +87,7 @@ scan_scylla_source_directories(
          seastar/json
          seastar/net
          seastar/rpc
-          seastar/tests
+          seastar/testing
          seastar/util)

 scan_scylla_source_directories(
@@ -97,7 +114,7 @@ scan_scylla_source_directories(
          service
          sstables
          streaming
-          tests
+          test
          thrift
          tracing
          transport
@@ -106,7 +123,7 @@ scan_scylla_source_directories(
 scan_scylla_source_directories(
        VAR SCYLLA_GEN_SOURCE_FILES
        RECURSIVE
-        PATHS build/release/gen)
+        PATHS build/${BUILD_TYPE}/gen)

 set(SCYLLA_SOURCE_FILES
        ${SCYLLA_ROOT_SOURCE_FILES}
@@ -139,4 +156,4 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
-        build/release/gen)
+        build/${BUILD_TYPE}/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -141,7 +141,7 @@ In v3:
 "Tests: unit ({mode}), dtest ({smp})"
 ```

-The usual is "Tests: unit (release)", although running debug tests is encouraged.
+The usual is "Tests: unit (dev)", although running debug tests is encouraged.

 5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.

--- a/31
+++ b/31
@@ -5,8 +5,6 @@ F: Filename, directory, or pattern for the subsystem
 ---

 AUTH
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 R: Vlad Zolotarov <vladz@scylladb.com>
 R: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
@@ -14,22 +12,17 @@ F: auth/*

 CACHE
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 R: Piotr Jastrzebski <piotr@scylladb.com>
 F: row_cache*
 F: *mutation*
 F: tests/mvcc*

 COMMITLOG / BATCHLOGa
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 F: db/commitlog/*
 F: db/batch*

 COORDINATOR
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Gleb Natapov <gleb@scylladb.com>
 F: service/storage_proxy*

@@ -49,12 +42,10 @@ M: Pekka Enberg <penberg@scylladb.com>
 F: cql3/*

 COUNTERS
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: counters*
 F: tests/counter_test*

 GOSSIP
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: gms/*
@@ -65,14 +56,11 @@ F: dist/docker/*

 LSA
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: utils/logalloc*

 MATERIALIZED VIEWS
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-R: Duarte Nunes <duarte@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: db/view/*
 F: cql3/statements/*view*

@@ -82,14 +70,12 @@ F: dist/*

 REPAIR
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
 F: repair/*

 SCHEMA MANAGEMENT
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
 F: db/schema_tables*
 F: db/legacy_schema_migrator*
@@ -98,15 +84,13 @@ F: schema*

 SECONDARY INDEXES
 M: Pekka Enberg <penberg@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 R: Pekka Enberg <penberg@scylladb.com>
 F: db/index/*
 F: cql3/statements/*index*

 SSTABLES
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Raphael S. Carvalho <raphaelsc@scylladb.com>
 R: Glauber Costa <glauber@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
@@ -114,18 +98,17 @@ F: sstables/*

 STREAMING
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: streaming/*
 F: service/storage_service.*

-THRIFT TRANSPORT LAYER
-M: Duarte Nunes <duarte@scylladb.com>
-F: thrift/*
+ALTERNATOR
+M: Nadav Har'El <nyh@scylladb.com>
+F: alternator/*
+F: alternator-test/*

 THE REST
 M: Avi Kivity <avi@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: *
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev

 ```

-* run Scylla with one CPU and ./tmp as data directory
+* run Scylla with one CPU and ./tmp as work directory

 ```
-./build/release/scylla --datadir tmp --commitlog-directory tmp --smp 1
+./build/release/scylla --workdir tmp --smp 1
 ```

 * For more run options:
@@ -38,6 +38,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev
 ./build/release/scylla --help
 ```

+## Testing
+
+See [test.py manual](docs/testing.md).
+
 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
 Thrift. There is also experimental support for the API of Amazon DynamoDB,
@@ -56,31 +60,12 @@ both.
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

-## Building Fedora RPM
+## Training

-As a pre-requisite, you need to install [Mock](https://fedoraproject.org/wiki/Mock) on your machine:
-
-```
-# Install mock:
-sudo yum install mock
-
-# Add user to the "mock" group:
-usermod -a -G mock $USER && newgrp mock
-```
-
-Then, to build an RPM, run:
-
-```
-./dist/redhat/build_rpm.sh
-```
-
-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
-For example, on Fedora 21 mock reports the following:
-
-```
-INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
-```
+Training material and online courses can be found at [Scylla University](https://university.scylladb.com/).
+The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling,
+administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions,
+multi-datacenters and how Scylla integrates with third-party applications.

 ## Building Fedora-based Docker image

--- a/10
+++ b/10
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=3.2.5
+VERSION=4.0.11

 if test -f version
 then
@@ -19,6 +19,14 @@ else
 	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

+if [ -f build/SCYLLA-RELEASE-FILE ]; then
+	RELEASE_FILE=$(cat build/SCYLLA-RELEASE-FILE)
+	GIT_COMMIT_FILE=$(cat build/SCYLLA-RELEASE-FILE |cut -d . -f 3)
+	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
+		exit 0
+	fi
+fi
+
 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p build
 echo "$SCYLLA_VERSION" > build/SCYLLA-VERSION-FILE
--- a/1
+++ b/1
--- a/alternator-test/test_condition_expression.py
+++ b/alternator-test/test_condition_expression.py
@@ -1,40 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the ConditionExpression parameter
-
-import pytest
-from botocore.exceptions import ClientError
-from util import random_string
-
-# Test that ConditionExpression works as expected
-@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
-def test_update_condition_expression(test_table_s):
-    p = random_string()
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET b = :val1',
-        ExpressionAttributeValues={':val1': 4})
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET b = :val1',
-        ConditionExpression='b = :oldval',
-        ExpressionAttributeValues={':val1': 6, ':oldval': 4})
-    with pytest.raises(ClientError, match='ConditionalCheckFailedException.*'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET b = :val1',
-            ConditionExpression='b = :oldval',
-            ExpressionAttributeValues={':val1': 8, ':oldval': 4})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 6}
--- a/alternator-test/test_item.py
+++ b/alternator-test/test_item.py
@@ -1,402 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the CRUD item operations: PutItem, GetItem, UpdateItem, DeleteItem
-
-import pytest
-from botocore.exceptions import ClientError
-from decimal import Decimal
-from util import random_string, random_bytes
-
-# Basic test for creating a new item with a random name, and reading it back
-# with strong consistency.
-# Only the string type is used for keys and attributes. None of the various
-# optional PutItem features (Expected, ReturnValues, ReturnConsumedCapacity,
-# ReturnItemCollectionMetrics, ConditionalOperator, ConditionExpression,
-# ExpressionAttributeNames, ExpressionAttributeValues) are used, and
-# for GetItem strong consistency is requested as well as all attributes,
-# but no other optional features (AttributesToGet, ReturnConsumedCapacity,
-# ProjectionExpression, ExpressionAttributeNames)
-def test_basic_string_put_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': val, 'another': val2})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Similar to test_basic_string_put_and_get, just uses UpdateItem instead of
-# PutItem. Because the item does not yet exist, it should work the same.
-def test_basic_string_update_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'PUT'}, 'another': {'Value': val2, 'Action': 'PUT'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Test put_item and get_item of various types for the *attributes*,
-# including both scalars as well as nested documents, lists and sets.
-# The full list of types tested here:
-#    number, boolean, bytes, null, list, map, string set, number set,
-#    binary set.
-# The keys are still strings.
-# Note that only top-level attributes are written and read in this test -
-# this test does not attempt to modify *nested* attributes.
-# See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/dynamodb.html
-# on how to pass these various types to Boto3's put_item().
-def test_put_and_get_attribute_types(test_table):
-    key = {'p': random_string(), 'c': random_string()}
-    test_items = [
-        Decimal("12.345"),
-        42,
-        True,
-        False,
-        b'xyz',
-        None,
-        ['hello', 'world', 42],
-        {'hello': 'world', 'life': 42},
-        {'hello': {'test': 'hi', 'hello': True, 'list': [1, 2, 'hi']}},
-        set(['hello', 'world', 'hi']),
-        set([1, 42, Decimal("3.14")]),
-        set([b'xyz', b'hi']),
-    ]
-    item = { str(i) : test_items[i] for i in range(len(test_items)) }
-    item.update(key)
-    test_table.put_item(Item=item)
-    got_item = test_table.get_item(Key=key, ConsistentRead=True)['Item']
-    assert item == got_item
-
-# The test_empty_* tests below verify support for empty items, with no
-# attributes except the key. This is a difficult case for Scylla, because
-# for an empty row to exist, Scylla needs to add a "CQL row marker".
-# There are several ways to create empty items - via PutItem, UpdateItem
-# and deleting attributes from non-empty items, and we need to check them
-# all, in several test_empty_* tests:
-def test_empty_put(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_put_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'hello': 'world'})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Value': 'world', 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-
-# Test error handling of UpdateItem passed a bad "Action" field.
-def test_update_bad_action(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'NONEXISTENT'}})
-
-# A more elaborate UpdateItem test, updating different attributes at different
-# times. Includes PUT and DELETE operations.
-def test_basic_string_more_update(test_table):
-    p = random_string()
-    c = random_string()
-    val1 = random_string()
-    val2 = random_string()
-    val3 = random_string()
-    val4 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val3, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['a1'] == val3
-    assert item['a2'] == val2
-    assert not 'a3' in item
-
-# Test that item operations on a non-existant table name fail with correct
-# error code.
-def test_item_operations_nonexistent_table(dynamodb):
-    with pytest.raises(ClientError, match='ResourceNotFoundException'):
-        dynamodb.meta.client.put_item(TableName='non_existent_table',
-            Item={'a':{'S':'b'}})
-
-# Fetching a non-existant item. According to the DynamoDB doc, "If there is no
-# matching item, GetItem does not return any data and there will be no Item
-# element in the response."
-def test_get_item_missing_item(test_table):
-    p = random_string()
-    c = random_string()
-    assert not "Item" in test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)
-
-# Test that if we have a table with string hash and sort keys, we can't read
-# or write items with other key types to it.
-def test_put_item_wrong_key_type(test_table):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.put_item(Item={'p': s, 'c': s})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s})
-def test_update_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.update_item(Key={'p': s, 'c': s}, AttributeUpdates={})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': b, 'c': s}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': n, 'c': s}, AttributeUpdates={})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': b}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': n}, AttributeUpdates={})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'c': s}, AttributeUpdates={})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s}, AttributeUpdates={})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_get_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types) but have empty result
-    assert not "Item" in test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_delete_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.delete_item(Key={'p': s, 'c': s})
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': s, 'c': s})
-
-# Most of the tests here arbitrarily used a table with both hash and sort keys
-# (both strings). Let's check that a table with *only* a hash key works ok
-# too, for PutItem, GetItem, and UpdateItem.
-def test_only_hash_key(test_table_s):
-    s = random_string()
-    test_table_s.put_item(Item={'p': s, 'hello': 'world'})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world'}
-    test_table_s.update_item(Key={'p': s}, AttributeUpdates={'hi': {'Value': 'there', 'Action': 'PUT'}})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world', 'hi': 'there'}
-
-# Tests for item operations in tables with non-string hash or sort keys.
-# These tests focus only on the type of the key - everything else is as
-# simple as we can (string attributes, no special options for GetItem
-# and PutItem). These tests also focus on individual items only, and
-# not about the sort order of sort keys - this should be verified in
-# test_query.py, for example.
-def test_bytes_hash_key(test_table_b):
-    # Bytes values are passed using base64 encoding, which has weird cases
-    # depending on len%3 and len%4. So let's try various lengths.
-    for len in range(10,18):
-        p = random_bytes(len)
-        val = random_string()
-        test_table_b.put_item(Item={'p': p, 'attribute': val})
-        assert test_table_b.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'attribute': val}
-def test_bytes_sort_key(test_table_sb):
-    p = random_string()
-    c = random_bytes()
-    val = random_string()
-    test_table_sb.put_item(Item={'p': p, 'c': c, 'attribute': val})
-    assert test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': val}
-
-# Tests for using a large binary blob as hash key, sort key, or attribute.
-# DynamoDB strictly limits the size of the binary hash key to 2048 bytes,
-# and binary sort key to 1024 bytes, and refuses anything larger. The total
-# size of an item is limited to 400KB, which also limits the size of the
-# largest attributes. For more details on these limits, see
-# https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html
-# Alternator currently does *not* have these limitations, and can accept much
-# larger keys and attributes, but what we do in the following tests is to verify
-# that items up to DynamoDB's maximum sizes also work well in Alternator.
-def test_large_blob_hash_key(test_table_b):
-    b = random_bytes(2048)
-    test_table_b.put_item(Item={'p': b})
-    assert test_table_b.get_item(Key={'p': b}, ConsistentRead=True)['Item'] == {'p': b}
-def test_large_blob_sort_key(test_table_sb):
-    s = random_string()
-    b = random_bytes(1024)
-    test_table_sb.put_item(Item={'p': s, 'c': b})
-    assert test_table_sb.get_item(Key={'p': s, 'c': b}, ConsistentRead=True)['Item'] == {'p': s, 'c': b}
-def test_large_blob_attribute(test_table):
-    p = random_string()
-    c = random_string()
-    b = random_bytes(409500)  # a bit less than 400KB
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': b })
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': b}
-
-# Checks what it is not allowed to use in a single UpdateItem request both
-# old-style AttributeUpdates and new-style UpdateExpression.
-def test_update_item_two_update_methods(test_table_s):
-    p = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            AttributeUpdates={'a': {'Value': 3, 'Action': 'PUT'}},
-            UpdateExpression='SET b = :val1',
-            ExpressionAttributeValues={':val1': 4})
-
-# Verify that having neither AttributeUpdates nor UpdateExpression is
-# allowed, and results in creation of an empty item.
-def test_update_item_no_update_method(test_table_s):
-    p = random_string()
-    assert not "Item" in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.update_item(Key={'p': p})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p}
-
-# Test GetItem with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_getitem_attributes_to_get(dynamodb, test_table):
-    p = random_string()
-    c = random_string()
-    item = {'p': p, 'c': c, 'a': 'hello', 'b': 'hi'}
-    test_table.put_item(Item=item)
-    for wanted in [ ['a'],             # only non-key attribute
-                    ['c', 'a'],        # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # Our item doesn't have this
-                   ]:
-        got_item = test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=wanted, ConsistentRead=True)['Item']
-        expected_item = {k: item[k] for k in wanted if k in item}
-        assert expected_item == got_item
-
-# Basic test for DeleteItem, with hash key only
-def test_delete_item_hash(test_table_s):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p})
-    assert 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.delete_item(Key={'p': p})
-    assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-
-# Basic test for DeleteItem, with hash and sort key
-def test_delete_item_sort(test_table):
-    p = random_string()
-    c = random_string()
-    key = {'p': p, 'c': c}
-    test_table.put_item(Item=key)
-    assert 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-    test_table.delete_item(Key=key)
-    assert not 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-
-# Test that PutItem completely replaces an existing item. It shouldn't merge
-# it with a previously existing value, as UpdateItem does!
-# We test for a table with just hash key, and for a table with both hash and
-# sort keys.
-def test_put_item_replace(test_table_s, test_table):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 'hi'}
-    test_table_s.put_item(Item={'p': p, 'b': 'hello'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hello'}
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'a': 'hi'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'a': 'hi'}
-    test_table.put_item(Item={'p': p, 'c': c, 'b': 'hello'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'b': 'hello'}
--- a/alternator-test/test_query.py
+++ b/alternator-test/test_query.py
@@ -1,358 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the Query operation
-
-import random
-import pytest
-from botocore.exceptions import ClientError
-from decimal import Decimal
-from util import random_string, random_bytes, full_query, multiset
-from boto3.dynamodb.conditions import Key, Attr
-
-# Test that scanning works fine with in-stock paginator
-def test_query_basic_restrictions(dynamodb, filled_test_table):
-    test_table, items = filled_test_table
-    paginator = dynamodb.meta.client.get_paginator('query')
-
-    # EQ
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long']) == multiset(got_items)
-
-    # LT
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['12'], 'ComparisonOperator': 'LT'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] < '12']) == multiset(got_items)
-
-    # LE
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['14'], 'ComparisonOperator': 'LE'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] <= '14']) == multiset(got_items)
-
-    # GT
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['15'], 'ComparisonOperator': 'GT'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] > '15']) == multiset(got_items)
-
-    # GE
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['14'], 'ComparisonOperator': 'GE'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] >= '14']) == multiset(got_items)
-
-    # BETWEEN
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['155', '164'], 'ComparisonOperator': 'BETWEEN'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] >= '155' and item['c'] <= '164']) == multiset(got_items)
-
-    # BEGINS_WITH
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['11'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        print([item for item in items if item['p'] == 'long' and item['c'].startswith('11')])
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'].startswith('11')]) == multiset(got_items)
-
-# Test that KeyConditionExpression parameter is supported
-@pytest.mark.xfail(reason="KeyConditionExpression not supported yet")
-def test_query_key_condition_expression(dynamodb, filled_test_table):
-    test_table, items = filled_test_table
-    paginator = dynamodb.meta.client.get_paginator('query')
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditionExpression=Key("p").eq("long") & Key("c").lt("12")):
-        got_items += page['Items']
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['c'] < '12']) == multiset(got_items)
-
-def test_begins_with(dynamodb, test_table):
-    paginator = dynamodb.meta.client.get_paginator('query')
-    items = [{'p': 'unorthodox_chars', 'c': sort_key, 'str': 'a'} for sort_key in [u'ÿÿÿ', u'cÿbÿ', u'cÿbÿÿabg'] ]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-
-    # TODO(sarna): Once bytes type is supported, /xFF character should be tested
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': [u'ÿÿ'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert sorted([d['c'] for d in got_items]) == sorted([d['c'] for d in items if d['c'].startswith(u'ÿÿ')])
-
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name, KeyConditions={
-            'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': [u'cÿbÿ'], 'ComparisonOperator': 'BEGINS_WITH'}
-        }):
-        got_items += page['Items']
-    print(got_items)
-    assert sorted([d['c'] for d in got_items]) == sorted([d['c'] for d in items if d['c'].startswith(u'cÿbÿ')])
-
-def test_begins_with_wrong_type(dynamodb, test_table_sn):
-    paginator = dynamodb.meta.client.get_paginator('query')
-    with pytest.raises(ClientError, match='ValidationException'):
-        for page in paginator.paginate(TableName=test_table_sn.name, KeyConditions={
-                'p' : {'AttributeValueList': ['unorthodox_chars'], 'ComparisonOperator': 'EQ'},
-                'c' : {'AttributeValueList': [17], 'ComparisonOperator': 'BEGINS_WITH'}
-                }):
-            pass
-
-# Items returned by Query should be sorted by the sort key. The following
-# tests verify that this is indeed the case, for the three allowed key types:
-# strings, binary, and numbers. These tests test not just the Query operation,
-# but inherently that the sort-key sorting works.
-def test_query_sort_order_string(test_table):
-    # Insert a lot of random items in one new partition:
-    # str(i) has a non-obvious sort order (e.g., "100" comes before "2") so is a nice test.
-    p = random_string()
-    items = [{'p': p, 'c': str(i)} for i in range(128)]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    assert len(items) == len(got_items)
-    # Extract just the sort key ("c") from the items
-    sort_keys = [x['c'] for x in items]
-    got_sort_keys = [x['c'] for x in got_items]
-    # Verify that got_sort_keys are already sorted (in string order)
-    assert sorted(got_sort_keys) == got_sort_keys
-    # Verify that got_sort_keys are a sorted version of the expected sort_keys
-    assert sorted(sort_keys) == got_sort_keys
-def test_query_sort_order_bytes(test_table_sb):
-    # Insert a lot of random items in one new partition:
-    # We arbitrarily use random_bytes with a random length.
-    p = random_string()
-    items = [{'p': p, 'c': random_bytes(10)} for i in range(128)]
-    with test_table_sb.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    got_items = full_query(test_table_sb, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    assert len(items) == len(got_items)
-    sort_keys = [x['c'] for x in items]
-    got_sort_keys = [x['c'] for x in got_items]
-    # Boto3's "Binary" objects are sorted as if bytes are signed integers.
-    # This isn't the order that DynamoDB itself uses (byte 0 should be first,
-    # not byte -128). Sorting the byte array ".value" works.
-    assert sorted(got_sort_keys, key=lambda x: x.value) == got_sort_keys
-    assert sorted(sort_keys) == got_sort_keys
-def test_query_sort_order_number(test_table_sn):
-    # This is a list of numbers, sorted in correct order, and each suitable
-    # for accurate representation by Alternator's number type.
-    numbers = [
-        Decimal("-2e10"),
-        Decimal("-7.1e2"),
-        Decimal("-4.1"),
-        Decimal("-0.1"),
-        Decimal("-1e-5"),
-        Decimal("0"),
-        Decimal("2e-5"),
-        Decimal("0.15"),
-        Decimal("1"),
-        Decimal("1.00000000000000000000000001"),
-        Decimal("3.14159"),
-        Decimal("3.1415926535897932384626433832795028841"),
-        Decimal("31.4"),
-        Decimal("1.4e10"),
-    ]
-    # Insert these numbers, in random order, into one partition:
-    p = random_string()
-    items = [{'p': p, 'c': num} for num in random.sample(numbers, len(numbers))]
-    with test_table_sn.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    # Finally, verify that we get back exactly the same numbers (with identical
-    # precision), and in their original sorted order.
-    got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    got_sort_keys = [x['c'] for x in got_items]
-    assert got_sort_keys == numbers
-
-def test_query_filtering_attributes_equality(filled_test_table):
-    test_table, items = filled_test_table
-
-    query_filter = {
-        "attribute" : {
-            "AttributeValueList" : [ "xxxx" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx']) == multiset(got_items)
-
-    query_filter = {
-        "attribute" : {
-            "AttributeValueList" : [ "xxxx" ],
-            "ComparisonOperator": "EQ"
-        },
-        "another" : {
-            "AttributeValueList" : [ "yy" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx' and item['another'] == 'yy']) == multiset(got_items)
-
-# Test that FilterExpression works as expected
-@pytest.mark.xfail(reason="FilterExpression not supported yet")
-def test_query_filter_expression(filled_test_table):
-    test_table, items = filled_test_table
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, FilterExpression=Attr("attribute").eq("xxxx"))
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx']) == multiset(got_items)
-
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, FilterExpression=Attr("attribute").eq("xxxx") & Attr("another").eq("yy"))
-    print(got_items)
-    assert multiset([item for item in items if item['p'] == 'long' and item['attribute'] == 'xxxx' and item['another'] == 'yy']) == multiset(got_items)
-
-# QueryFilter can only contain non-key attributes in order to be compatible
-def test_query_filtering_key_equality(filled_test_table):
-    test_table, items = filled_test_table
-
-    with pytest.raises(ClientError, match='ValidationException'):
-        query_filter = {
-            "c" : {
-                "AttributeValueList" : [ "5" ],
-                "ComparisonOperator": "EQ"
-            }
-        }
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-        print(got_items)
-
-    with pytest.raises(ClientError, match='ValidationException'):
-        query_filter = {
-            "attribute" : {
-                "AttributeValueList" : [ "x" ],
-                "ComparisonOperator": "EQ"
-            },
-            "p" : {
-                "AttributeValueList" : [ "5" ],
-                "ComparisonOperator": "EQ"
-            }
-        }
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'}}, QueryFilter=query_filter)
-        print(got_items)
-
-# Test Query with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_query_attributes_to_get(dynamodb, test_table):
-    p = random_string()
-    items = [{'p': p, 'c': str(i), 'a': str(i*10), 'b': str(i*100) } for i in range(10)]
-    with test_table.batch_writer() as batch:
-        for item in items:
-            batch.put_item(item)
-    for wanted in [ ['a'],             # only non-key attributes
-                    ['c', 'a'],        # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # none of the items have this attribute!
-                   ]:
-        got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, AttributesToGet=wanted)
-        expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
-        assert multiset(expected_items) == multiset(got_items)
-
-# Test that in a table with both hash key and sort key, which keys we can
-# Query by: We can Query by the hash key, by a combination of both hash and
-# sort keys, but *cannot* query by just the sort key, and obviously not
-# by any non-key column.
-def test_query_which_key(test_table):
-    p = random_string()
-    c = random_string()
-    p2 = random_string()
-    c2 = random_string()
-    item1 = {'p': p, 'c': c}
-    item2 = {'p': p, 'c': c2}
-    item3 = {'p': p2, 'c': c}
-    for i in [item1, item2, item3]:
-        test_table.put_item(Item=i)
-    # Query by hash key only:
-    got_items = full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}})
-    expected_items = [item1, item2]
-    assert multiset(expected_items) == multiset(got_items)
-    # Query by hash key *and* sort key (this is basically a GetItem):
-    got_items = full_query(test_table, KeyConditions={
-        'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-        'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-    })
-    expected_items = [item1]
-    assert multiset(expected_items) == multiset(got_items)
-    # Query by sort key alone is not allowed. DynamoDB reports:
-    # "Query condition missed key schema element: p".
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # Query by a non-key isn't allowed, for the same reason - that the
-    # actual hash key (p) is missing in the query:
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # If we try both p and a non-key we get a complaint that the sort
-    # key is missing: "Query condition missed key schema element: c"
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
-    # If we try p, c and another key, we get an error that
-    # "Conditions can be of length 1 or 2 only".
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={
-            'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
-            'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'},
-            'z': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}
-        })
--- a/alternator-test/test_scan.py
+++ b/alternator-test/test_scan.py
@@ -1,191 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the Scan operation
-
-import pytest
-from botocore.exceptions import ClientError
-from util import random_string, full_scan, multiset
-from boto3.dynamodb.conditions import Attr
-
-# Test that scanning works fine with/without pagination
-def test_scan_basic(filled_test_table):
-    test_table, items = filled_test_table
-    for limit in [None,1,2,4,33,50,100,9007,16*1024*1024]:
-        pos = None
-        got_items = []
-        while True:
-            if limit:
-                response = test_table.scan(Limit=limit, ExclusiveStartKey=pos) if pos else test_table.scan(Limit=limit)
-                assert len(response['Items']) <= limit
-            else:
-                response = test_table.scan(ExclusiveStartKey=pos) if pos else test_table.scan()
-            pos = response.get('LastEvaluatedKey', None)
-            got_items += response['Items']
-            if not pos:
-                break
-
-        assert len(items) == len(got_items)
-        assert multiset(items) == multiset(got_items)
-
-def test_scan_with_paginator(dynamodb, filled_test_table):
-    test_table, items = filled_test_table
-    paginator = dynamodb.meta.client.get_paginator('scan')
-
-    got_items = []
-    for page in paginator.paginate(TableName=test_table.name):
-        got_items += page['Items']
-
-    assert len(items) == len(got_items)
-    assert multiset(items) == multiset(got_items)
-
-    for page_size in [1, 17, 1234]:
-        got_items = []
-        for page in paginator.paginate(TableName=test_table.name, PaginationConfig={'PageSize': page_size}):
-            got_items += page['Items']
-
-    assert len(items) == len(got_items)
-    assert multiset(items) == multiset(got_items)
-
-# Although partitions are scanned in seemingly-random order, inside a
-# partition items must be returned by Scan sorted in sort-key order.
-# This test verifies this, for string sort key. We'll need separate
-# tests for the other sort-key types (number and binary)
-def test_scan_sort_order_string(filled_test_table):
-    test_table, items = filled_test_table
-    got_items = full_scan(test_table)
-    assert len(items) == len(got_items)
-    # Extract just the sort key ("c") from the partition "long"
-    items_long = [x['c'] for x in items if x['p'] == 'long']
-    got_items_long = [x['c'] for x in got_items if x['p'] == 'long']
-    # Verify that got_items_long are already sorted (in string order)
-    assert sorted(got_items_long) == got_items_long
-    # Verify that got_items_long are a sorted version of the expected items_long
-    assert sorted(items_long) == got_items_long
-
-# Test Scan with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_scan_attributes_to_get(dynamodb, filled_test_table):
-    table, items = filled_test_table
-    for wanted in [ ['another'],       # only non-key attributes (one item doesn't have it!)
-                    ['c', 'another'],  # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # none of the items have this attribute!
-                   ]:
-        print(wanted)
-        got_items = full_scan(table, AttributesToGet=wanted)
-        expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
-        assert multiset(expected_items) == multiset(got_items)
-
-def test_scan_with_attribute_equality_filtering(dynamodb, filled_test_table):
-    table, items = filled_test_table
-    scan_filter = {
-        "attribute" : {
-            "AttributeValueList" : [ "xxxxx" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-
-    got_items = full_scan(table, ScanFilter=scan_filter)
-    expected_items = [item for item in items if "attribute" in item.keys() and item["attribute"] == "xxxxx" ]
-    assert multiset(expected_items) == multiset(got_items)
-
-    scan_filter = {
-        "another" : {
-            "AttributeValueList" : [ "y" ],
-            "ComparisonOperator": "EQ"
-        },
-        "attribute" : {
-            "AttributeValueList" : [ "xxxxx" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-
-    got_items = full_scan(table, ScanFilter=scan_filter)
-    expected_items = [item for item in items if "attribute" in item.keys() and item["attribute"] == "xxxxx" and item["another"] == "y" ]
-    assert multiset(expected_items) == multiset(got_items)
-
-# Test that FilterExpression works as expected
-@pytest.mark.xfail(reason="FilterExpression not supported yet")
-def test_scan_filter_expression(filled_test_table):
-    test_table, items = filled_test_table
-
-    got_items = full_scan(test_table, FilterExpression=Attr("attribute").eq("xxxx"))
-    print(got_items)
-    assert multiset([item for item in items if 'attribute' in item.keys() and item['attribute'] == 'xxxx']) == multiset(got_items)
-
-    got_items = full_scan(test_table, FilterExpression=Attr("attribute").eq("xxxx") & Attr("another").eq("yy"))
-    print(got_items)
-    assert multiset([item for item in items if 'attribute' in item.keys() and 'another' in item.keys() and item['attribute'] == 'xxxx' and item['another'] == 'yy']) == multiset(got_items)
-
-def test_scan_with_key_equality_filtering(dynamodb, filled_test_table):
-    table, items = filled_test_table
-    scan_filter_p = {
-        "p" : {
-            "AttributeValueList" : [ "7" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-    scan_filter_c = {
-        "c" : {
-            "AttributeValueList" : [ "9" ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-    scan_filter_p_and_attribute = {
-        "p" : {
-            "AttributeValueList" : [ "7" ],
-            "ComparisonOperator": "EQ"
-        },
-        "attribute" : {
-            "AttributeValueList" : [ "x"*7 ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-    scan_filter_c_and_another = {
-        "c" : {
-            "AttributeValueList" : [ "9" ],
-            "ComparisonOperator": "EQ"
-        },
-        "another" : {
-            "AttributeValueList" : [ "y"*16 ],
-            "ComparisonOperator": "EQ"
-        }
-    }
-
-    # Filtering on the hash key
-    got_items = full_scan(table, ScanFilter=scan_filter_p)
-    expected_items = [item for item in items if "p" in item.keys() and item["p"] == "7" ]
-    assert multiset(expected_items) == multiset(got_items)
-
-    # Filtering on the sort key
-    got_items = full_scan(table, ScanFilter=scan_filter_c)
-    expected_items = [item for item in items if "c" in item.keys() and item["c"] == "9"]
-    assert multiset(expected_items) == multiset(got_items)
-
-    # Filtering on the hash key and an attribute
-    got_items = full_scan(table, ScanFilter=scan_filter_p_and_attribute)
-    expected_items = [item for item in items if "p" in item.keys() and "another" in item.keys() and item["p"] == "7" and item["another"] == "y"*16]
-    assert multiset(expected_items) == multiset(got_items)
-
-    # Filtering on the sort key and an attribute
-    got_items = full_scan(table, ScanFilter=scan_filter_c_and_another)
-    expected_items = [item for item in items if "c" in item.keys() and "another" in item.keys() and item["c"] == "9" and item["another"] == "y"*16]
-    assert multiset(expected_items) == multiset(got_items)
--- a/alternator-test/util.py
+++ b/alternator-test/util.py
@@ -1,121 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Various utility functions which are useful for multiple tests
-
-import string
-import random
-import collections
-import time
-
-def random_string(length=10, chars=string.ascii_uppercase + string.digits):
-    return ''.join(random.choice(chars) for x in range(length))
-
-def random_bytes(length=10):
-    return bytearray(random.getrandbits(8) for _ in range(length))
-
-# Utility functions for scan and query into an array of items:
-# TODO: add to full_scan and full_query by default ConsistentRead=True, as
-# it's not useful for tests without it!
-def full_scan(table, **kwargs):
-    response = table.scan(**kwargs)
-    items = response['Items']
-    while 'LastEvaluatedKey' in response:
-        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
-        items.extend(response['Items'])
-    return items
-
-# Utility function for fetching the entire results of a query into an array of items
-def full_query(table, **kwargs):
-    response = table.query(**kwargs)
-    items = response['Items']
-    while 'LastEvaluatedKey' in response:
-        response = table.query(ExclusiveStartKey=response['LastEvaluatedKey'], **kwargs)
-        items.extend(response['Items'])
-    return items
-
-# To compare two lists of items (each is a dict) without regard for order,
-# "==" is not good enough because it will fail if the order is different.
-# The following function, multiset() converts the list into a multiset
-# (set with duplicates) where order doesn't matter, so the multisets can
-# be compared.
-
-def freeze(item):
-    if isinstance(item, dict):
-        return frozenset((key, freeze(value)) for key, value in item.items())
-    elif isinstance(item, list):
-        return tuple(freeze(value) for value in item)
-    return item
-
-def multiset(items):
-    return collections.Counter([freeze(item) for item in items])
-
-
-test_table_prefix = 'alternator_test_'
-def test_table_name():
-    current_ms = int(round(time.time() * 1000))
-    # In the off chance that test_table_name() is called twice in the same millisecond...
-    if test_table_name.last_ms >= current_ms:
-        current_ms = test_table_name.last_ms + 1
-    test_table_name.last_ms = current_ms
-    return test_table_prefix + str(current_ms)
-test_table_name.last_ms = 0
-
-def create_test_table(dynamodb, **kwargs):
-    name = test_table_name()
-    print("fixture creating new table {}".format(name))
-    table = dynamodb.create_table(TableName=name,
-        BillingMode='PAY_PER_REQUEST', **kwargs)
-    waiter = table.meta.client.get_waiter('table_exists')
-    # recheck every second instead of the default, lower, frequency. This can
-    # save a few seconds on AWS with its very slow table creation, but can
-    # more on tests on Scylla with its faster table creation turnaround.
-    waiter.config.delay = 1
-    waiter.config.max_attempts = 200
-    waiter.wait(TableName=name)
-    return table
-
-# DynamoDB's ListTables request returns up to a single page of table names
-# (e.g., up to 100) and it is up to the caller to call it again and again
-# to get the next page. This is a utility function which calls it repeatedly
-# as much as necessary to get the entire list.
-# We deliberately return a list and not a set, because we want the caller
-# to be able to recognize bugs in ListTables which causes the same table
-# to be returned twice.
-def list_tables(dynamodb, limit=100):
-    ret = []
-    pos = None
-    while True:
-        if pos:
-            page = dynamodb.meta.client.list_tables(Limit=limit, ExclusiveStartTableName=pos);
-        else:
-            page = dynamodb.meta.client.list_tables(Limit=limit);
-        results = page.get('TableNames', None)
-        assert(results)
-        ret = ret + results
-        newpos = page.get('LastEvaluatedTableName', None)
-        if not newpos:
-            break;
-        # It doesn't make sense for Dynamo to tell us we need more pages, but
-        # not send anything in *this* page!
-        assert len(results) > 0
-        assert newpos != pos
-        # Note that we only checked that we got back tables, not that we got
-        # any new tables not already in ret. So a buggy implementation might
-        # still cause an endless loop getting the same tables again and again.
-        pos = newpos
-    return ret
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
    time_t time_point_repr = db_clock::to_time_t(tp);
    std::string time_point_str;
    time_point_str.resize(17);
+    ::tm time_buf;
    // strftime prints the terminating null character as well
-    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
+    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
    time_point_str.resize(16);
    return time_point_str;
 }
@@ -129,7 +130,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us

    auto cl = auth::password_authenticator::consistency_for_user(username);
    auto timeout = auth::internal_distributed_timeout_config();
-    return qp.process(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
+    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -29,6 +29,12 @@
 #include "rjson.hh"
 #include "serialization.hh"
 #include "base64.hh"
+#include <stdexcept>
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include "utils/overloaded_functor.hh"
+
+#include "expressions_eval.hh"

 namespace alternator {

@@ -47,7 +53,9 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
            {"NOT_NULL", comparison_operator_type::NOT_NULL},
            {"BETWEEN", comparison_operator_type::BETWEEN},
            {"BEGINS_WITH", comparison_operator_type::BEGINS_WITH},
-    }; //TODO: CONTAINS
+            {"CONTAINS", comparison_operator_type::CONTAINS},
+            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
+    };
    if (!comparison_operator.IsString()) {
        throw api_error("ValidationException", format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
@@ -68,7 +76,7 @@ static ::shared_ptr<cql3::restrictions::single_column_restriction::contains> mak
 }

 static ::shared_ptr<cql3::restrictions::single_column_restriction::EQ> make_key_eq_restriction(const column_definition& cdef, const rjson::value& value) {
-    bytes raw_value = get_key_from_typed_value(value, cdef, type_to_string(cdef.type));
+    bytes raw_value = get_key_from_typed_value(value, cdef);
    auto restriction_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_value)));
    return make_shared<cql3::restrictions::single_column_restriction::EQ>(cdef, std::move(restriction_value));
 }
@@ -143,9 +151,44 @@ static void verify_operand_count(const rjson::value* array, const size_check& ex
    }
 }

+struct rjson_engaged_ptr_comp {
+    bool operator()(const rjson::value* p1, const rjson::value* p2) const {
+        return rjson::single_value_comp()(*p1, *p2);
+    }
+};
+
+// It's not enough to compare underlying JSON objects when comparing sets,
+// as internally they're stored in an array, and the order of elements is
+// not important in set equality. See issue #5021
+static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
+    if (set1.Size() != set2.Size()) {
+        return false;
+    }
+    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
+    for (auto it = set1.Begin(); it != set1.End(); ++it) {
+        set1_raw.insert(&*it);
+    }
+    for (const auto& a : set2.GetArray()) {
+        if (set1_raw.count(&a) == 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    return v1 && *v1 == v2;
+    if (!v1) {
+        return false;
+    }
+    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+        auto it1 = v1->MemberBegin();
+        auto it2 = v2.MemberBegin();
+        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
+            return check_EQ_for_sets(it1->value, it2->value);
+        }
+    }
+    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
@@ -174,9 +217,66 @@ static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
    if (it1->name != it2->name) {
        return false;
    }
-    std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
-    std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
-    return val1.substr(0, val2.size()) == val2;
+    if (it2->name == "S") {
+        std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
+        std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
+        return val1.substr(0, val2.size()) == val2;
+    } else /* it2->name == "B" */ {
+        // TODO (optimization): Check the begins_with condition directly on
+        // the base64-encoded string, without making a decoded copy.
+        bytes val1 = base64_decode(it1->value);
+        bytes val2 = base64_decode(it2->value);
+        return val1.substr(0, val2.size()) == val2;
+    }
+}
+
+static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
+    return (type2 == "S" && type1 == "SS") || (type2 == "N" && type1 == "NS") || (type2 == "B" && type1 == "BS");
+}
+
+// Check if two JSON-encoded values match with the CONTAINS relation
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    if (!v1) {
+        return false;
+    }
+    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
+    if (kv2.name != "S" && kv2.name != "N" &&  kv2.name != "B") {
+        throw api_error("ValidationException",
+                        format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
+                               "got {} instead", kv2.name));
+    }
+    if (kv1.name == "S" && kv2.name == "S") {
+        return rjson::to_string_view(kv1.value).find(rjson::to_string_view(kv2.value)) != std::string_view::npos;
+    } else if (kv1.name == "B" && kv2.name == "B") {
+        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
+    } else if (is_set_of(kv1.name, kv2.name)) {
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (*i == kv2.value) {
+                return true;
+            }
+        }
+    } else if (kv1.name == "L") {
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (!i->IsObject() || i->MemberCount() != 1) {
+                clogger.error("check_CONTAINS received a list whose element is malformed");
+                return false;
+            }
+            const auto& el = *i->MemberBegin();
+            if (el.name == kv2.name && el.value == kv2.value) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// Check if two JSON-encoded values match with the NOT_CONTAINS relation
+static bool check_NOT_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    if (!v1) {
+        return false;
+    }
+    return !check_CONTAINS(v1, v2);
 }

 // Check if a JSON-encoded value equals any element of an array, which must have at least one element.
@@ -207,6 +307,19 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    return have_match;
 }

+// Another variant of check_IN, this one for ConditionExpression. It needs to
+// check whether the first element in the given vector is equal to any of the
+// others.
+static bool check_IN(const std::vector<rjson::value>& array) {
+    const rjson::value* first = &array[0];
+    for (unsigned i = 1; i < array.size(); i++) {
+        if (check_EQ(first, array[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static bool check_NULL(const rjson::value* val) {
    return val == nullptr;
 }
@@ -221,13 +334,13 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (!v2.IsObject() || v2.MemberCount() != 1) {
        throw api_error("ValidationException",
                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic()));
+                               cmp.diagnostic));
    }
    const auto& kv2 = *v2.MemberBegin();
    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
        throw api_error("ValidationException",
                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic()));
+                               cmp.diagnostic));
    }
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
        return false;
@@ -237,7 +350,7 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
        return false;
    }
    if (kv1.name == "N") {
-        return cmp(unwrap_number(*v1, cmp.diagnostic()), unwrap_number(v2, cmp.diagnostic()));
+        return cmp(unwrap_number(*v1, cmp.diagnostic), unwrap_number(v2, cmp.diagnostic));
    }
    if (kv1.name == "S") {
        return cmp(std::string_view(kv1.value.GetString(), kv1.value.GetStringLength()),
@@ -252,15 +365,84 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
-    const char* diagnostic() const { return "LT operator"; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a specialization for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
+    static constexpr const char* diagnostic = "LT operator";
+};
+
+struct cmp_le {
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
+    static constexpr const char* diagnostic = "LE operator";
+};
+
+struct cmp_ge {
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
+    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
-    const char* diagnostic() const { return "GT operator"; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
+    static constexpr const char* diagnostic = "GT operator";
 };

+// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+template <typename T>
+bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+    if (cmp_lt()(ub, lb)) {
+        throw api_error("ValidationException",
+                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+    }
+    return cmp_ge()(v, lb) && cmp_le()(v, ub);
+}
+
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
+    if (!v) {
+        return false;
+    }
+    if (!v->IsObject() || v->MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
+    }
+    if (!lb.IsObject() || lb.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
+    }
+    if (!ub.IsObject() || ub.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
+    }
+
+    const auto& kv_v = *v->MemberBegin();
+    const auto& kv_lb = *lb.MemberBegin();
+    const auto& kv_ub = *ub.MemberBegin();
+    if (kv_lb.name != kv_ub.name) {
+        throw api_error(
+                "ValidationException",
+                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
+                       kv_lb.name, kv_ub.name));
+    }
+    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
+        return false;
+    }
+    if (kv_v.name == "N") {
+        const char* diag = "BETWEEN operator";
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+    }
+    if (kv_v.name == "S") {
+        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
+                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+    }
+    if (kv_v.name == "B") {
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+    }
+    throw api_error("ValidationException",
+        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+               kv_lb.name));
+}
+
 // Verify one Expect condition on one attribute (whose content is "got")
 // for the verify_expected() below.
 // This function returns true or false depending on whether the condition
@@ -306,9 +488,15 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+        case comparison_operator_type::LE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+        case comparison_operator_type::GE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
@@ -321,23 +509,29 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
        case comparison_operator_type::NOT_NULL:
            verify_operand_count(attribute_value_list, empty(), *comparison_operator);
            return check_NOT_NULL(got);
-        default:
-            // FIXME: implement all the missing types, so there will be no default here.
-            throw api_error("ValidationException", format("ComparisonOperator {} is not yet supported", *comparison_operator));
+        case comparison_operator_type::BETWEEN:
+            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+        case comparison_operator_type::CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_CONTAINS(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::NOT_CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_NOT_CONTAINS(got, (*attribute_value_list)[0]);
        }
+        throw std::logic_error(format("Internal error: corrupted operator enum: {}", int(op)));
    }
 }

-// Verify that the existing values of the item (previous_item) match the
+// Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function will throw a ConditionalCheckFailedException API error
-// if the values do not match the condition, or ValidationException if there
+// This function can throw a ValidationException API error if there
 // are errors in the format of the condition itself.
-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
    if (!expected) {
-        return;
+        return true;
    }
    if (!expected->IsObject()) {
        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
@@ -366,22 +560,123 @@ void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value
    for (auto it = expected->MemberBegin(); it != expected->MemberEnd(); ++it) {
        const rjson::value* got = nullptr;
        if (previous_item && previous_item->IsObject() && previous_item->HasMember("Item")) {
-            got = rjson::find((*previous_item)["Item"], rjson::string_ref_type(it->name.GetString()));
+            got = rjson::find((*previous_item)["Item"], rjson::to_string_view(it->name));
        }
        bool success = verify_expected_one(it->value, got);
        if (success && !require_all) {
            // When !require_all, one success is enough!
-            return;
+            return true;
        } else if (!success && require_all) {
            // When require_all, one failure is enough!
-            throw api_error("ConditionalCheckFailedException", "Failed condition.");
+            return false;
        }
    }
    // If we got here and require_all, none of the checks failed, so succeed.
    // If we got here and !require_all, all of the checks failed, so fail.
-    if (!require_all) {
-        throw api_error("ConditionalCheckFailedException", "None of ORed Expect conditions were successful.");
+    return require_all;
+}
+
+bool calculate_primitive_condition(const parsed::primitive_condition& cond,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    std::vector<rjson::value> calculated_values;
+    calculated_values.reserve(cond._values.size());
+    for (const parsed::value& v : cond._values) {
+        calculated_values.push_back(calculate_value(v,
+                cond._op == parsed::primitive_condition::type::VALUE ?
+                        calculate_value_caller::ConditionExpressionAlone :
+                        calculate_value_caller::ConditionExpression,
+                rjson::find(req, "ExpressionAttributeValues"),
+                used_attribute_names, used_attribute_values,
+                req, schema, previous_item));
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::BETWEEN:
+        if (calculated_values.size() != 3) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
+        }
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+    case parsed::primitive_condition::type::IN:
+        return check_IN(calculated_values);
+    case parsed::primitive_condition::type::VALUE:
+        if (calculated_values.size() != 1) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Unexpected number of values {} in primitive_condition", cond._values.size()));
+        }
+        // Unwrap the boolean wrapped as the value (if it is a boolean)
+        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
+            auto it = calculated_values[0].MemberBegin();
+            if (it->name == "BOOL" && it->value.IsBool()) {
+                return it->value.GetBool();
+            }
+        }
+        throw api_error("ValidationException",
+                format("ConditionExpression: condition results in a non-boolean value: {}",
+                        calculated_values[0]));
+    default:
+        // All the rest of the operators have exactly two parameters (and unless
+        // we have a bug in the parser, that's what we have in the parsed object):
+        if (calculated_values.size() != 2) {
+            throw std::logic_error(format("Wrong number of values {} in primitive_condition object", cond._values.size()));
+        }
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::EQ:
+        return check_EQ(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::NE:
+        return check_NE(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::GT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+    case parsed::primitive_condition::type::GE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+    case parsed::primitive_condition::type::LT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+    case parsed::primitive_condition::type::LE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+    default:
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
     }
 }

+// Check if the existing values of the item (previous_item) match the
+// conditions given by the given parsed ConditionExpression.
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    if (condition_expression.empty()) {
+        return true;
+    }
+    bool ret = std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) -> bool {
+            return calculate_primitive_condition(cond, used_attribute_values,
+                    used_attribute_names, req, schema, previous_item);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) -> bool {
+            auto verify_condition = [&] (const parsed::condition_expression& e) {
+                return verify_condition_expression(e, used_attribute_values,
+                        used_attribute_names, req, schema, previous_item);
+            };
+            switch (list.op) {
+            case '&':
+                return boost::algorithm::all_of(list.conditions, verify_condition);
+            case '|':
+                return boost::algorithm::any_of(list.conditions, verify_condition);
+            default:
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("bad operator in condition_list");
+            }
+        }
+    }, condition_expression._expression);
+    return condition_expression._negated ? !ret : ret;
+}
+
 }
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -37,13 +37,13 @@
 namespace alternator {

 enum class comparison_operator_type {
-    EQ, NE, LE, LT, GE, GT, IN, BETWEEN, CONTAINS, IS_NULL, NOT_NULL, BEGINS_WITH
+    EQ, NE, LE, LT, GE, GT, IN, BETWEEN, CONTAINS, NOT_CONTAINS, IS_NULL, NOT_NULL, BEGINS_WITH
 };

 comparison_operator_type get_comparison_operator(const rjson::value& comparison_operator);

 ::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter);

-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -25,47 +25,58 @@
 #include <seastar/http/httpd.hh>
 #include "seastarx.hh"
 #include <seastar/json/json_elements.hh>
+#include <seastar/core/sharded.hh>

 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"

+#include "alternator/error.hh"
 #include "stats.hh"
+#include "rjson.hh"

 namespace alternator {

-class executor {
+class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
+    // An smp_service_group to be used for limiting the concurrency when
+    // forwarding Alternator request between shards - if necessary for LWT.
+    smp_service_group _ssg;

 public:
    using client_state = service::client_state;
+    using request_return_type = std::variant<json::json_return_type, api_error>;
    stats _stats;
    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
-    static constexpr auto KEYSPACE_NAME = "alternator";
+    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm) : _proxy(proxy), _mm(mm) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _ssg(ssg) {}

-    future<json::json_return_type> create_table(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_table(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_table(client_state& client_state, std::string content);
-    future<json::json_return_type> put_item(client_state& client_state, std::string content);
-    future<json::json_return_type> get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_item(client_state& client_state, std::string content);
-    future<json::json_return_type> update_item(client_state& client_state, std::string content);
-    future<json::json_return_type> list_tables(client_state& client_state, std::string content);
-    future<json::json_return_type> scan(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_endpoints(client_state& client_state, std::string content, std::string host_header);
-    future<json::json_return_type> batch_write_item(client_state& client_state, std::string content);
-    future<json::json_return_type> batch_get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> query(client_state& client_state, std::string content);
+    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tables(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header);
+    future<request_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);

    future<> start();
    future<> stop() { return make_ready_future<>(); }

-    future<> maybe_create_keyspace();
+    future<> create_keyspace(std::string_view keyspace_name);

-    static void maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
 };

 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -22,6 +22,7 @@
 #include "expressions.hh"
 #include "alternator/expressionsLexer.hpp"
 #include "alternator/expressionsParser.hpp"
+#include "utils/overloaded_functor.hh"

 #include <seastarx.hh>

@@ -65,13 +66,19 @@ parse_projection_expression(std::string query) {
    }
 }

-template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
-template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+parsed::condition_expression
+parse_condition_expression(std::string query) {
+    try {
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
+}

 namespace parsed {

 void update_expression::add(update_expression::action a) {
-    std::visit(overloaded {
+    std::visit(overloaded_functor {
        [&] (action::set&)    { seen_set = true; },
        [&] (action::remove&) { seen_remove = true; },
        [&] (action::add&)    { seen_add = true; },
@@ -94,5 +101,27 @@ void update_expression::append(update_expression other) {
    seen_del |= other.seen_del;
 }

+void condition_expression::append(condition_expression&& a, char op) {
+    std::visit(overloaded_functor {
+        [&] (condition_list& x) {
+            // If 'a' holds a single condition we could, instead of inserting
+            // 'a' itself, insert that condition (negated if a._negated).
+            // But since these expressions are not evaluated many times,
+            // this optimization is not worth the extra code complexity.
+            if (!x.conditions.empty() && x.op != op) {
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("condition_expression::append called with mixed operators");
+            }
+            x.conditions.push_back(std::move(a));
+            x.op = op;
+        },
+        [&] (primitive_condition& x) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error("condition_expression::append called on primitive_condition");
+        }
+    }, _expression);
+}
+
+
 } // namespace parsed
 } // namespace alternator
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -145,6 +145,12 @@ REMOVE: R E M O V E;
 ADD: A D D;
 DELETE: D E L E T E;

+AND: A N D;
+OR: O R;
+NOT: N O T;
+BETWEEN: B E T W E E N;
+IN: I N;
+
 fragment ALPHA: 'A'..'Z' | 'a'..'z';
 fragment DIGIT: '0'..'9';
 fragment ALNUM: ALPHA | DIGIT | '_';
@@ -165,19 +171,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-update_expression_set_value returns [parsed::value v]:
-      VALREF                             { $v.set_valref($VALREF.text); }
-    | path                               { $v.set_path($path.p); }
-    | NAME                               { $v.set_func_name($NAME.text); }
-     '(' x=update_expression_set_value   { $v.add_func_parameter($x.v); }
-     (',' x=update_expression_set_value  { $v.add_func_parameter($x.v); })*
+value returns [parsed::value v]:
+      VALREF       { $v.set_valref($VALREF.text); }
+    | path         { $v.set_path($path.p); }
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=update_expression_set_value  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=update_expression_set_value  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=update_expression_set_value  { $rhs.set_minus(std::move($v.v)); }
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -212,3 +218,48 @@ update_expression returns [parsed::update_expression e]:
 projection_expression returns [std::vector<parsed::path> v]:
    p=path      { $v.push_back(std::move($p.p)); }
    (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
+
+
+primitive_condition returns [parsed::primitive_condition c]:
+      v=value         { $c.add_value(std::move($v.v));
+                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
+      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
+          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
+          | '<'       { $c.set_operator(parsed::primitive_condition::type::LT); }
+          | '<' '='   { $c.set_operator(parsed::primitive_condition::type::LE); }
+          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
+          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
+         )
+         v=value      { $c.add_value(std::move($v.v)); }
+       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         AND
+         v=value      { $c.add_value(std::move($v.v)); }
+       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
+         ')'
+      )?
+    ;
+
+// The following rules for parsing boolean expressions are verbose and
+// somewhat strange because of Antlr 3's limitations on recursive rules,
+// common rule prefixes, and (lack of) support for operator precedence.
+// These rules could have been written more clearly using a more powerful
+// parser generator - such as Yacc.
+boolean_expression returns [parsed::condition_expression e]:
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
+	;
+boolean_expression_1 returns [parsed::condition_expression e]:
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
+	;
+boolean_expression_2 returns [parsed::condition_expression e]:
+	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
+    ;
+
+condition_expression returns [parsed::condition_expression e]:
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -36,6 +36,6 @@ public:

 parsed::update_expression parse_update_expression(std::string query);
 std::vector<parsed::path> parse_projection_expression(std::string query);
-
+parsed::condition_expression parse_condition_expression(std::string query);

 } /* namespace alternator */
--- a/alternator/expressions_eval.hh
+++ b/alternator/expressions_eval.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+
+#include "rjson.hh"
+#include "schema_fwd.hh"
+
+#include "expressions_types.hh"
+
+namespace alternator {
+
+// calculate_value() behaves slightly differently (in particular, it supports
+// different functions) when used in different types of expressions, as
+// enumerated in this enum:
+enum class calculate_value_caller {
+    UpdateExpression, ConditionExpression, ConditionExpressionAlone
+};
+
+inline std::ostream& operator<<(std::ostream& out, calculate_value_caller caller) {
+    switch (caller) {
+        case calculate_value_caller::UpdateExpression:
+            out << "UpdateExpression";
+            break;
+        case calculate_value_caller::ConditionExpression:
+            out << "ConditionExpression";
+            break;
+        case calculate_value_caller::ConditionExpressionAlone:
+            out << "ConditionExpression";
+            break;
+        default:
+            out << "unknown type of expression";
+            break;
+    }
+    return out;
+}
+
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+
+rjson::value calculate_value(const parsed::value& v,
+        calculate_value_caller caller,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values,
+        const rjson::value& update_info,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+
+} /* namespace alternator */
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -88,6 +88,15 @@ struct value {
    void add_func_parameter(value v) {
        std::get<function_call>(_value)._parameters.emplace_back(std::move(v));
    }
+    bool is_valref() const {
+        return std::holds_alternative<std::string>(_value);
+    }
+    bool is_path() const {
+        return std::holds_alternative<path>(_value);
+    }
+    bool is_func() const {
+        return std::holds_alternative<function_call>(_value);
+    }
 };

 // The right-hand-side of a SET in an update expression can be either a
@@ -162,5 +171,58 @@ public:
    }
 };

+// A primitive_condition is a condition expression involving one condition,
+// while the full condition_expression below adds boolean logic over these
+// primitive conditions.
+// The supported primitive conditions are:
+// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
+//    v1 and v2 are values - from the item (an attribute path), the query
+//    (a ":val" reference), or a function of the above (only the size()
+//    function is supported).
+// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
+// 3. N-ary operator - v1 IN ( v2, v3, ... )
+// 4. A single function call (attribute_exists etc.). The parser actually
+//    accepts a more general "value" here but later stages reject a value
+//    which is not a function call (because DynamoDB does it too).
+class primitive_condition {
+public:
+    enum class type {
+        UNDEFINED, VALUE, EQ, NE, LT, LE, GT, GE, BETWEEN, IN
+    };
+    type _op = type::UNDEFINED;
+    std::vector<value> _values;
+    void set_operator(type op) {
+        _op = op;
+    }
+    void add_value(value&& v) {
+        _values.push_back(std::move(v));
+    }
+    bool empty() const {
+        return _op == type::UNDEFINED;
+    }
+};
+
+class condition_expression {
+public:
+    bool _negated = false; // If true, the entire condition is negated
+    struct condition_list {
+        char op = '|'; // '&' or '|'
+        std::vector<condition_expression> conditions;
+    };
+    std::variant<primitive_condition, condition_list> _expression = condition_list();
+
+    void set_primitive(primitive_condition&& p) {
+        _expression = std::move(p);
+    }
+    void append(condition_expression&& c, char op);
+    void apply_not() {
+        _negated = !_negated;
+    }
+    bool empty() const {
+        return std::holds_alternative<condition_list>(_expression) &&
+               std::get<condition_list>(_expression).conditions.empty();
+    }
+};
+
 } // namespace parsed
 } // namespace alternator
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -22,14 +22,108 @@
 #include "rjson.hh"
 #include "error.hh"
 #include <seastar/core/print.hh>
+#include <seastar/core/thread.hh>

 namespace rjson {

 static allocator the_allocator;

+/*
+ * This wrapper class adds nested level checks to rapidjson's handlers.
+ * Each rapidjson handler implements functions for accepting JSON values,
+ * which includes strings, numbers, objects, arrays, etc.
+ * Parsing objects and arrays needs to be performed carefully with regard
+ * to stack overflow - each object/array layer adds another stack frame
+ * to parsing, printing and destroying the parent JSON document.
+ * To prevent stack overflow, a rapidjson handler can be wrapped with
+ * guarded_yieldable_json_handler, which takes a max_nested_level parameter and throws
+ * rjson::error once nesting exceeds it; with EnableYield, non-End* events also call thread::maybe_yield().
+ */
+template<typename Handler, bool EnableYield>
+struct guarded_yieldable_json_handler : public Handler {
+    size_t _nested_level = 0;
+    size_t _max_nested_level;
+public:
+    using handler_base = Handler;
+
+    explicit guarded_yieldable_json_handler(size_t max_nested_level) : _max_nested_level(max_nested_level) {}
+    guarded_yieldable_json_handler(string_buffer& buf, size_t max_nested_level)
+            : handler_base(buf), _max_nested_level(max_nested_level) {}
+
+    void Parse(const char* str, size_t length) {
+        rapidjson::MemoryStream ms(static_cast<const char*>(str), length * sizeof(typename encoding::Ch));
+        rapidjson::EncodedInputStream<encoding, rapidjson::MemoryStream> is(ms);
+        rapidjson::GenericReader<encoding, encoding, allocator> reader(&the_allocator);
+        reader.Parse(is, *this);
+        if (reader.HasParseError()) {
+            throw rjson::error(format("Parsing JSON failed: {}", rapidjson::GetParseError_En(reader.GetParseErrorCode())));
+        }
+        //NOTICE: The handler has parsed the string, but in case of rapidjson::GenericDocument
+        // the data now resides in an internal stack_ variable, which is private instead of
+        // protected... which means we cannot simply access its data. Fortunately, another
+        // function for populating documents from SAX events can be abused to extract the data
+        // from the stack via gadget-oriented programming - we use an empty event generator
+        // which does nothing, and use it to call Populate(), which assumes that the generator
+        // will fill the stack with something. It won't, but our stack is already filled with
+        // data we want to steal, so once Populate() ends, our document will be properly parsed.
+        // A proper solution could be programmed once rapidjson declares this stack_ variable
+        // as protected instead of private, so that this class can access it.
+        auto dummy_generator = [](handler_base&){return true;};
+        handler_base::Populate(dummy_generator);
+    }
+
+    bool StartObject() {
+        ++_nested_level;
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartObject();
+    }
+
+    bool EndObject(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndObject(elements_count);
+    }
+
+    bool StartArray() {
+        ++_nested_level;
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartArray();
+    }
+
+    bool EndArray(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndArray(elements_count);
+    }
+
+    bool Null()                 { maybe_yield(); return handler_base::Null(); }
+    bool Bool(bool b)           { maybe_yield(); return handler_base::Bool(b); }
+    bool Int(int i)             { maybe_yield(); return handler_base::Int(i); }
+    bool Uint(unsigned u)       { maybe_yield(); return handler_base::Uint(u); }
+    bool Int64(int64_t i64)     { maybe_yield(); return handler_base::Int64(i64); }
+    bool Uint64(uint64_t u64)   { maybe_yield(); return handler_base::Uint64(u64); }
+    bool Double(double d)       { maybe_yield(); return handler_base::Double(d); }
+    bool String(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::String(str, length, copy); }
+    bool Key(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::Key(str, length, copy); }
+
+
+protected:
+    static void maybe_yield() {
+        if constexpr (EnableYield) {
+            thread::maybe_yield();
+        }
+    }
+
+    void check_nested_level() const {
+        if (RAPIDJSON_UNLIKELY(_nested_level > _max_nested_level)) {
+            throw rjson::error(format("Max nested level reached: {}", _max_nested_level));
+        }
+    }
+};
+
 std::string print(const rjson::value& value) {
    string_buffer buffer;
-    writer writer(buffer);
+    guarded_yieldable_json_handler<writer, false> writer(buffer, 39);
    value.Accept(writer);
    return std::string(buffer.GetString());
 }
@@ -38,13 +132,9 @@ rjson::value copy(const rjson::value& value) {
    return rjson::value(value, the_allocator);
 }

-rjson::value parse(const std::string& str) {
-    return parse_raw(str.c_str(), str.size());
-}
-
-rjson::value parse_raw(const char* c_str, size_t size) {
-    rjson::document d;
-    d.Parse(c_str, size);
+rjson::value parse(std::string_view str) {
+    guarded_yieldable_json_handler<document, false> d(39);
+    d.Parse(str.data(), str.size());
    if (d.HasParseError()) {
        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
    }
@@ -52,8 +142,22 @@ rjson::value parse_raw(const char* c_str, size_t size) {
    return std::move(v);
 }

-rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value parse_yieldable(std::string_view str) {
+    guarded_yieldable_json_handler<document, true> d(39);
+    d.Parse(str.data(), str.size());
+    if (d.HasParseError()) {
+        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
+    }
+    rjson::value& v = d;
+    return std::move(v);
+}
+
+rjson::value& get(rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -61,8 +165,8 @@ rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
    }
 }

-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+const rjson::value& get(const rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -82,24 +186,48 @@ rjson::value from_string(const char* str, size_t size) {
    return rjson::value(str, size, the_allocator);
 }

-const rjson::value* find(const rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value from_string(std::string_view view) {
+    return rjson::value(view.data(), view.size(), the_allocator);
+}
+
+const rjson::value* find(const rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

-rjson::value* find(rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value* find(rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

+bool remove_member(rjson::value& value, std::string_view name) {
+    // Although RemoveMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    return value.RemoveMember(rjson::value(name.data(), name.size()));
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), std::move(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), std::move(member), the_allocator);
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), rjson::value(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), rjson::value(member), the_allocator);
+}
+
 void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member) {
    base.AddMember(name, std::move(member), the_allocator);
 }
@@ -113,6 +241,58 @@ void push_back(rjson::value& base_array, rjson::value&& item) {

 }

+// Returns true if r1 orders before r2. Pairs involving null, and false/true pairs, are ordered by
+// rapidjson type tag; other mixed-type pairs, and object or array pairs, throw rjson::error.
+bool single_value_comp::operator()(const rjson::value& r1, const rjson::value& r2) const {
+   auto r1_type = r1.GetType();
+   auto r2_type = r2.GetType();
+
+   // null is the smallest type and compares with every other type; nothing is less than null
+   if (r1_type == rjson::type::kNullType || r2_type == rjson::type::kNullType) {
+       return r1_type < r2_type;
+   }
+   // only null, true, and false are comparable with each other, other types are not compatible
+   if (r1_type != r2_type) {
+       if (r1_type > rjson::type::kTrueType || r2_type > rjson::type::kTrueType) {
+           throw rjson::error(format("Types are not comparable: {} {}", r1, r2));
+       }
+   }
+
+   switch (r1_type) {
+   case rjson::type::kNullType:
+   case rjson::type::kFalseType:
+   case rjson::type::kTrueType:
+       return r1_type < r2_type;
+   case rjson::type::kObjectType:
+       throw rjson::error("Object type comparison is not supported");
+   case rjson::type::kArrayType:
+       throw rjson::error("Array type comparison is not supported");
+   case rjson::type::kStringType: {
+       const size_t r1_len = r1.GetStringLength();
+       const size_t r2_len = r2.GetStringLength();
+       // memcmp, unlike strncmp, does not stop at a NUL byte embedded in both strings
+       int result = std::memcmp(r1.GetString(), r2.GetString(), std::min(r1_len, r2_len));
+       return result < 0 || (result == 0 && r1_len < r2_len);
+   }
+   case rjson::type::kNumberType: {
+       if (r1.IsInt() && r2.IsInt()) {
+           return r1.GetInt() < r2.GetInt();
+       } else if (r1.IsUint() && r2.IsUint()) {
+           return r1.GetUint() < r2.GetUint();
+       } else if (r1.IsInt64() && r2.IsInt64()) {
+           return r1.GetInt64() < r2.GetInt64();
+       } else if (r1.IsUint64() && r2.IsUint64()) {
+           return r1.GetUint64() < r2.GetUint64();
+       } else {
+           // it's safe to call GetDouble() on any number type
+           return r1.GetDouble() < r2.GetDouble();
+       }
+   }
+   default:
+       return false;
+   }
+}
+
 } // end namespace rjson

 std::ostream& std::operator<<(std::ostream& os, const rjson::value& v) {
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -104,38 +104,49 @@ inline rjson::value empty_string() {
 // The representation is dense - without any redundant indentation.
 std::string print(const rjson::value& value);

+// Returns a string_view to the string held in a JSON value (which is
+// assumed to hold a string, i.e., v.IsString() == true). This is a view
+// to the existing data - no copying is done.
+inline std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength());
+}
+
 // Copies given JSON value - involves allocation
 rjson::value copy(const rjson::value& value);

 // Parses a JSON value from given string or raw character array.
 // The string/char array liveness does not need to be persisted,
-// as both parse() and parse_raw() will allocate member names and values.
+// as parse() will allocate member names and values.
 // Throws rjson::error if parsing failed.
-rjson::value parse(const std::string& str);
-rjson::value parse_raw(const char* c_str, size_t size);
+rjson::value parse(std::string_view str);
+// Needs to be run in thread context
+rjson::value parse_yieldable(std::string_view str);

 // Creates a JSON value (of JSON string type) out of internal string representations.
 // The string value is copied, so str's liveness does not need to be persisted.
 rjson::value from_string(const std::string& str);
 rjson::value from_string(const sstring& str);
 rjson::value from_string(const char* str, size_t size);
+rjson::value from_string(std::string_view view);

 // Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, rjson::string_ref_type name);
-const rjson::value* find(const rjson::value& value, rjson::string_ref_type name);
+rjson::value* find(rjson::value& value, std::string_view name);
+const rjson::value* find(const rjson::value& value, std::string_view name);

 // Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, rjson::string_ref_type name);
-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name);
+rjson::value& get(rjson::value& value, std::string_view name);
+const rjson::value& get(const rjson::value& value, std::string_view name);

 // Sets a member in given JSON object by moving the member - allocates the name.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);

 // Sets a string member in given JSON object by assigning its reference - allocates the name.
 // NOTICE: member string liveness must be ensured to be at least as long as base's.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);

 // Sets a member in given JSON object by moving the member.
 // NOTICE: name liveness must be ensured to be at least as long as base's.
@@ -152,6 +163,13 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
 // Throws if base_array is not a JSON array.
 void push_back(rjson::value& base_array, rjson::value&& item);

+// Removes a member from a JSON object and returns whether it existed. Throws if value isn't an object.
+bool remove_member(rjson::value& value, std::string_view name);
+
+struct single_value_comp {
+    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
+};
+
 } // end namespace rjson

 namespace std {
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastarx.hh>
+#include <service/storage_proxy.hh>
+#include <service/storage_proxy.hh>
+#include "rjson.hh"
+#include "executor.hh"
+
+namespace alternator {
+
+// An rmw_operation encapsulates the common logic of all the item update
+// operations which may involve a read of the item before the write
+// (so-called Read-Modify-Write operations). These operations include PutItem,
+// UpdateItem and DeleteItem: All of these may be conditional operations (the
+// "Expected" parameter) which require a read before the write, and UpdateItem
+// may also have an update expression which refers to the item's old value.
+//
+// The code below supports running the read and the write together as one
+// transaction using LWT (this is why rmw_operation is a subclass of
+// cas_request, as required by storage_proxy::cas()), but also has optional
+// modes not using LWT.
+class rmw_operation : public service::cas_request, public enable_shared_from_this<rmw_operation> {
+public:
+    // The following options choose which mechanism to use for isolating
+    // parallel write operations:
+    // * The FORBID_RMW option forbids RMW (read-modify-write) operations
+    //   such as conditional updates. For the remaining write-only
+    //   operations, ordinary quorum writes are isolated enough.
+    // * The LWT_ALWAYS option always uses LWT (lightweight transactions)
+    //   for any write operation - whether or not it also has a read.
+    // * The LWT_RMW_ONLY option uses LWT only for RMW operations, and uses
+    //   ordinary quorum writes for write-only operations.
+    //   This option is not safe if the user may send both RMW and write-only
+    //   operations on the same item.
+    // * The UNSAFE_RMW option does read-modify-write operations as separate
+    //   read and write. It is unsafe - concurrent RMW operations are not
+    //   isolated at all. This option will likely be removed in the future.
+    enum class write_isolation {
+        FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
+    };
+    static constexpr auto WRITE_ISOLATION_TAG_KEY = "system:write_isolation";
+
+    static write_isolation get_write_isolation_for_schema(schema_ptr schema);
+
+protected:
+    // The full request JSON
+    rjson::value _request;
+    // All RMW operations involve a single item with a specific partition
+    // and optional clustering key, in a single table, so the following
+    // information is common to all of them:
+    schema_ptr _schema;
+    partition_key _pk = partition_key::make_empty();
+    clustering_key _ck = clustering_key::make_empty();
+    write_isolation _write_isolation;
+
+    // All RMW operations can have a ReturnValues parameter from the following
+    // choices. But note that only UpdateItem actually supports all of them:
+    enum class returnvalues {
+        NONE, ALL_OLD, UPDATED_OLD, ALL_NEW, UPDATED_NEW
+    } _returnvalues;
+    static returnvalues parse_returnvalues(const rjson::value& request);
+    // When _returnvalues != NONE, apply() should store here, in JSON form,
+    // the values which are to be returned in the "Attributes" field.
+    // The default null JSON means do not return an Attributes field at all.
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
+public:
+    // The constructor of a rmw_operation subclass should parse the request
+    // and try to discover as many input errors as it can before really
+    // attempting the read or write operations.
+    rmw_operation(service::storage_proxy& proxy, rjson::value&& request);
+    // rmw_operation subclasses (update_item_operation, put_item_operation
+    // and delete_item_operation) shall implement an apply() function which
+    // takes the previous value of the item (if it was read) and creates the
+    // write mutation. If the previous value of item does not pass the needed
+    // conditional expression, apply() should return an empty optional.
+    // apply() may throw if it encounters input errors not discovered during
+    // the constructor.
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
+    // Convert the above apply() into the signature needed by cas_request:
+    virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
+    virtual ~rmw_operation() = default;
+    schema_ptr schema() const { return _schema; }
+    const rjson::value& request() const { return _request; }
+    rjson::value&& move_request() && { return std::move(_request); }
+    future<executor::request_return_type> execute(service::storage_proxy& proxy,
+            service::client_state& client_state,
+            tracing::trace_state_ptr trace_state,
+            service_permit permit,
+            bool needs_read_before_write,
+            stats& stats);
+    std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
+};
+
+} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -25,6 +25,7 @@
 #include "error.hh"
 #include "rapidjson/writer.h"
 #include "concrete_types.hh"
+#include "cql3/type_json.hh"

 static logging::logger slogger("alternator-serialization");

@@ -77,7 +78,7 @@ struct from_json_visitor {
    }
    // default
    void operator()(const abstract_type& t) const {
-        bo.write(t.from_json_object(Json::Value(rjson::print(v)), cql_serialization_format::internal()));
+        bo.write(from_json_object(t, Json::Value(rjson::print(v)), cql_serialization_format::internal()));
    }
 };

@@ -107,7 +108,7 @@ struct to_json_visitor {

    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), to_json_visitor{deserialized, type_ident, bv}); };
    void operator()(const decimal_type_impl& t) const {
-        auto s = decimal_type->to_json_string(bytes(bv));
+        auto s = to_json_string(*decimal_type, bytes(bv));
        //FIXME(sarna): unnecessary copy
        rjson::set_with_string_name(deserialized, type_ident, rjson::from_string(s));
    }
@@ -135,7 +136,7 @@ rjson::value deserialize_item(bytes_view bv) {

    if (atype == alternator_type::NOT_SUPPORTED_YET) {
        slogger.trace("Non-optimal deserialization of alternator type {}", int8_t(atype));
-        return rjson::parse_raw(reinterpret_cast<const char *>(bv.data()), bv.size());
+        return rjson::parse(std::string_view(reinterpret_cast<const char *>(bv.data()), bv.size()));
    }
    type_representation type_representation = represent_type(atype);
    visit(*type_representation.dtype, to_json_visitor{deserialized, type_representation.ident, bv});
@@ -159,27 +160,34 @@ std::string type_to_string(data_type type) {

 bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
    std::string column_name = column.name_as_text();
-    std::string expected_type = type_to_string(column.type);
-
-    const rjson::value& key_typed_value = rjson::get(item, rjson::value::StringRefType(column_name.c_str()));
-    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                format("Missing or invalid value object for key column {}: {}", column_name, item));
+    const rjson::value* key_typed_value = rjson::find(item, column_name);
+    if (!key_typed_value) {
+        throw api_error("ValidationException", format("Key column {} not found", column_name));
    }
-    return get_key_from_typed_value(key_typed_value, column, expected_type);
+    return get_key_from_typed_value(*key_typed_value, column);
 }

-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type) {
+// Parses the JSON encoding for a key value, which is a map with a single
+// entry, whose key is the type (expected to match the key column's type)
+// and the value is the encoded value.
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
+    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
+            !key_typed_value.MemberBegin()->value.IsString()) {
+        throw api_error("ValidationException",
+                format("Malformed value object for key column {}: {}",
+                        column.name_as_text(), key_typed_value));
+    }
+
    auto it = key_typed_value.MemberBegin();
-    if (it->name.GetString() != expected_type) {
+    if (it->name != type_to_string(column.type)) {
        throw api_error("ValidationException",
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        expected_type, column.name_as_text(), it->name.GetString()));
+                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
    }
    if (column.type == bytes_type) {
        return base64_decode(it->value);
    } else {
-        return column.type->from_string(it->value.GetString());
+        return column.type->from_string(rjson::to_string_view(it->value));
    }

 }
@@ -194,7 +202,7 @@ rjson::value json_key_column_value(bytes_view cell, const column_definition& col
        // FIXME: use specialized Alternator number type, not the more
        // general "decimal_type". A dedicated type can be more efficient
        // in storage space and in parsing speed.
-        auto s = decimal_type->to_json_string(bytes(cell));
+        auto s = to_json_string(*decimal_type, bytes(cell));
        return rjson::from_string(s);
    } else {
        // We shouldn't get here, we shouldn't see such key columns.
@@ -245,4 +253,16 @@ big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    return big_decimal(it->value.GetString());
 }

+// Inspects v and, when it is an object with exactly one member whose name is
+// "SS", "BS" or "NS", returns that name together with a pointer to the
+// member's value. For any other input the result is {"", nullptr}.
+// The returned pointer refers into v, so it is only usable while v is alive.
+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
+    if (v.IsObject() && v.MemberCount() == 1) {
+        auto member = v.MemberBegin();
+        const std::string set_type = member->name.GetString();
+        if (set_type == "SS" || set_type == "BS" || set_type == "NS") {
+            return std::make_pair(set_type, &(member->value));
+        }
+    }
+    return {"", nullptr};
+}
+
 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -24,7 +24,7 @@
 #include <string>
 #include <string_view>
 #include "types.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "keys.hh"
 #include "rjson.hh"
 #include "utils/big_decimal.hh"
@@ -54,7 +54,7 @@ rjson::value deserialize_item(bytes_view bv);
 std::string type_to_string(data_type type);

 bytes get_key_column_value(const rjson::value& item, const column_definition& column);
-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type);
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column);
 rjson::value json_key_column_value(bytes_view cell, const column_definition& column);

 partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
@@ -63,4 +63,10 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
 // If v encodes a number (i.e., it is a {"N": [...]}, returns an object representing it.  Otherwise,
 // raises ValidationException with diagnostic.
 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);
+
+// Check if a given JSON object encodes a set (i.e., it is a {"SS": [...]}, {"NS": [...]}
+// or {"BS": [...]}) and return the set's type and a pointer to that set. If the object
+// does not encode a set, the returned value is {"", nullptr}.
+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v);
+
 }
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -29,6 +29,8 @@
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
+#include "service/storage_service.hh"
+#include "utils/overloaded_functor.hh"

 static logging::logger slogger("alternator-server");

@@ -65,9 +67,9 @@ inline std::vector<std::string_view> split(std::string_view text, char separator
 // Internal Server Error.
 class api_handler : public handler_base {
 public:
-    api_handler(const future_json_function& _handle) : _f_handle(
-         [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
-         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([rep = std::move(rep)](future<json::json_return_type> resf) mutable {
+    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
+         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
             if (resf.failed()) {
                 // Exceptions of type api_error are wrapped as JSON and
                 // returned to the client as expected. Other types of
@@ -86,20 +88,24 @@ public:
                             format("Internal server error: {}", std::current_exception()),
                             reply::status_type::internal_server_error);
                 }
-                 // FIXME: what is this version number?
-                 rep->_content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + ret._type + "\"," +
-                         "\"message\":\"" + ret._msg + "\"}";
-                 rep->_status = ret._http_code;
-                 slogger.trace("api_handler error case: {}", rep->_content);
+                 generate_error_reply(*rep, ret);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
-             slogger.trace("api_handler success case");
             auto res = resf.get0();
-             if (res._body_writer) {
-                 rep->write_body("json", std::move(res._body_writer));
-             } else {
-                 rep->_content += res._res;
-             }
+             std::visit(overloaded_functor {
+                 [&] (const json::json_return_type& json_return_value) {
+                     slogger.trace("api_handler success case");
+                     if (json_return_value._body_writer) {
+                         rep->write_body("json", std::move(json_return_value._body_writer));
+                     } else {
+                         rep->_content += json_return_value._res;
+                     }
+                 },
+                 [&] (const api_error& err) {
+                     generate_error_reply(*rep, err);
+                 }
+             }, res);
+
             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
    }), _type("json") { }
@@ -115,18 +121,66 @@ public:
    }

 protected:
+    // Appends to rep a DynamoDB-style error body of the form
+    // {"__type": "...#<type>", "message": "<msg>"} describing err, and sets
+    // the reply's HTTP status to err._http_code.
+    // Both strings are serialized through rjson rather than pasted between
+    // quotes: error messages can embed printed JSON (and therefore quote
+    // characters), and emitting them verbatim made the reply body invalid
+    // JSON that clients could not parse.
+    void generate_error_reply(reply& rep, const api_error& err) {
+        // rjson::print() of a string value yields a quoted, escaped JSON string.
+        rep._content += "{\"__type\":" + rjson::print(rjson::from_string("com.amazonaws.dynamodb.v20120810#" + err._type)) + "," +
+                "\"message\":" + rjson::print(rjson::from_string(err._msg)) + "}";
+        rep._status = err._http_code;
+        slogger.trace("api_handler error case: {}", rep._content);
+    }
+
    future_handler_function _f_handle;
    sstring _type;
 };

-class health_handler : public handler_base {
-    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+// Base class for HTTP handlers whose requests are tracked by a seastar::gate.
+// handle() is final: it runs the subclass's do_handle() under with_gate() on
+// the gate given to the constructor.
+class gated_handler : public handler_base {
+    seastar::gate& _gate;
+public:
+    gated_handler(seastar::gate& gate) : _gate(gate) {}
+    // The actual request processing, supplied by each subclass.
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0;
+    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) final override {
+        auto guarded = [this, &path, req = std::move(req), rep = std::move(rep)] () mutable {
+            return do_handle(path, std::move(req), std::move(rep));
+        };
+        return with_gate(_gate, std::move(guarded));
+    }
+};
+
+class health_handler : public gated_handler {
+public:
+    health_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        rep->set_status(reply::status_type::ok);
        rep->write_body("txt", format("healthy: {}", req->get_header("Host")));
        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
    }
 };

+// Handler that replies with a JSON array of the addresses of the nodes in
+// this node's own data center that the gossiper reports as alive.
+class local_nodelist_handler : public gated_handler {
+public:
+    local_nodelist_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        // gms::get_local_gossiper().get_live_members() would give every live
+        // node in the cluster in one call, but only this DC's nodes are
+        // wanted. So: ask the snitch which DC this node is in, take that DC's
+        // endpoints from the topology, and filter them by liveness below.
+        sstring our_dc = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(
+                utils::fb_utilities::get_broadcast_address());
+        std::unordered_set<gms::inet_address> dc_endpoints =
+                service::get_local_storage_service().get_token_metadata().
+                get_topology().get_datacenter_endpoints().at(our_dc);
+        rjson::value live_nodes = rjson::empty_array();
+        for (auto& endpoint : dc_endpoints) {
+            if (!gms::get_local_gossiper().is_alive(endpoint)) {
+                continue;
+            }
+            rjson::push_back(live_nodes, rjson::from_string(endpoint.to_sstring()));
+        }
+        rep->set_status(reply::status_type::ok);
+        rep->set_content_type("json");
+        rep->_content = rjson::print(live_nodes);
+        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+    }
+};
+
 future<> server::verify_signature(const request& req) {
    if (!_enforce_authorization) {
        slogger.debug("Skipping authorization");
@@ -137,7 +191,7 @@ future<> server::verify_signature(const request& req) {
        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
-    if (host_it == req._headers.end()) {
+    if (authorization_it == req._headers.end()) {
        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
@@ -214,7 +268,8 @@ future<> server::verify_signature(const request& req) {
    });
 }

-future<json::json_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+    _executor._stats.total_operations++;
    sstring target = req->get_header(TARGET);
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
@@ -223,17 +278,32 @@ future<json::json_return_type> server::handle_api_request(std::unique_ptr<reques
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
-            _executor.local()._stats.unsupported_operations++;
+            _executor._stats.unsupported_operations++;
            throw api_error("UnknownOperationException",
                    format("Unsupported operation {}", op));
        }
-        //FIXME: Client state can provide more context, e.g. client's endpoint address
-        // We use unique_ptr because client_state cannot be moved or copied
-        return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()), [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
-            client_state->set_raw_keyspace(executor::KEYSPACE_NAME);
-            executor::maybe_trace_query(*client_state, op, req->content);
-            tracing::trace(client_state->get_trace_state(), op);
-            return callback_it->second(_executor.local(), *client_state, std::move(req));
+        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
+            //FIXME: Client state can provide more context, e.g. client's endpoint address
+            // We use unique_ptr because client_state cannot be moved or copied
+            return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()),
+                    [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
+                tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+                tracing::trace(trace_state, op);
+                // JSON parsing can allocate up to roughly 2x the size of the raw document, + a couple of bytes for maintenance.
+                // FIXME: by this time, the whole HTTP request was already read, so some memory is already occupied.
+                // Once HTTP allows working on streams, we should grab the permit *before* reading the HTTP payload.
+                size_t mem_estimate = req->content.size() * 3 + 8000;
+                auto units_fut = get_units(*_memory_limiter, mem_estimate);
+                if (_memory_limiter->waiters()) {
+                    ++_executor._stats.requests_blocked_memory;
+                }
+                return units_fut.then([this, callback_it = std::move(callback_it), &client_state, trace_state, req = std::move(req)] (semaphore_units<> units) mutable {
+                    return _json_parser.parse(req->content).then([this, callback_it = std::move(callback_it), &client_state, trace_state,
+                            units = std::move(units), req = std::move(req)] (rjson::value json_request) mutable {
+                        return callback_it->second(_executor, *client_state, trace_state, make_service_permit(std::move(units)), std::move(json_request), std::move(req)).finally([trace_state] {});
+                    });
+                });
+            });
        });
    });
 }
@@ -243,35 +313,88 @@ void server::set_routes(routes& r) {
        return handle_api_request(std::move(req));
    });

-    r.add(operation_type::POST, url("/"), req_handler);
-    r.add(operation_type::GET, url("/"), new health_handler);
+    r.put(operation_type::POST, "/", req_handler);
+    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
+    // The "/localnodes" request is a new Alternator feature, not supported by
+    // DynamoDB and not required for DynamoDB compatibility. It allows a
+    // client to enquire - using a trivial HTTP request without requiring
+    // authentication - the list of all live nodes in the same data center of
+    // the Alternator cluster. The client can use this list to balance its
+    // request load to all the nodes in the same geographical region.
+    // Note that this API exposes - openly without authentication - the
+    // information on the cluster's members inside one data center. We do not
+    // consider this to be a security risk, because an attacker can already
+    // scan an entire subnet for nodes responding to the health request,
+    // or even just scan for open ports.
+    r.put(operation_type::GET, "/localnodes", new local_nodelist_handler(_pending_requests));
 }

 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(seastar::sharded<executor>& e)
-        : _executor(e), _key_cache(1024, 1min, slogger), _enforce_authorization(false)
+server::server(executor& exec)
+        : _http_server("http-alternator")
+        , _https_server("https-alternator")
+        , _executor(exec)
+        , _key_cache(1024, 1min, slogger)
+        , _enforce_authorization(false)
+        , _enabled_servers{}
+        , _pending_requests{}
      , _callbacks{
-        {"CreateTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) {
-            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req)] { return e.create_table(client_state, req->content); }); }
-        },
-        {"DescribeTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_table(client_state, req->content); }},
-        {"DeleteTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_table(client_state, req->content); }},
-        {"PutItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.put_item(client_state, req->content); }},
-        {"UpdateItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.update_item(client_state, req->content); }},
-        {"GetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.get_item(client_state, req->content); }},
-        {"DeleteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_item(client_state, req->content); }},
-        {"ListTables", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
-        {"Scan", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.scan(client_state, req->content); }},
-        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
-        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, req->content); }},
-        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, req->content); }},
-        {"Query", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.query(client_state, req->content); }},
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tables(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.scan(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_endpoints(client_state, std::move(permit), std::move(json_request), req->get_header("Host"));
+        }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_write_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.query(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"TagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.tag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"UntagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.untag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
+        }},
    } {
 }

-future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization) {
+future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+        bool enforce_authorization, semaphore* memory_limiter) {
+    _memory_limiter = memory_limiter;
    _enforce_authorization = enforce_authorization;
    if (!port && !https_port) {
        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
@@ -279,33 +402,82 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    }
    return seastar::async([this, addr, port, https_port, creds] {
        try {
-            _executor.invoke_on_all([] (executor& e) {
-                return e.start();
-            }).get();
+            _executor.start().get();

            if (port) {
-                _control.start().get();
-                _control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _control.listen(socket_address{addr, *port}).get();
+                set_routes(_http_server._routes);
+                _http_server.set_content_length_limit(server::content_length_limit);
+                _http_server.listen(socket_address{addr, *port}).get();
+                _enabled_servers.push_back(std::ref(_http_server));
                slogger.info("Alternator HTTP server listening on {} port {}", addr, *port);
            }
            if (https_port) {
-                _https_control.start().get();
-                _https_control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _https_control.server().invoke_on_all([creds] (http_server& serv) {
-                    return serv.set_tls_credentials(creds->build_server_credentials());
-                }).get();
-
-                _https_control.listen(socket_address{addr, *https_port}).get();
+                set_routes(_https_server._routes);
+                _https_server.set_content_length_limit(server::content_length_limit);
+                _https_server.set_tls_credentials(creds->build_server_credentials());
+                _https_server.listen(socket_address{addr, *https_port}).get();
+                _enabled_servers.push_back(std::ref(_https_server));
                slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
            }
        } catch (...) {
-            slogger.warn("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
+            slogger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
                    addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF", std::current_exception());
-            throw;
+            std::throw_with_nested(std::runtime_error(
+                    format("Failed to set up Alternator HTTP server on {} port {}, TLS port {}",
+                            addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF")));
        }
    });
 }

+// Shuts the server down in three ordered steps: stop every enabled HTTP
+// server (in parallel), then close the pending-requests gate, and finally
+// stop the JSON parser.
+future<> server::stop() {
+    auto servers_stopped = parallel_for_each(_enabled_servers, [] (http_server& srv) {
+        return srv.stop();
+    });
+    return servers_stopped.then([this] {
+        return _pending_requests.close();
+    }).then([this] {
+        return _json_parser.stop();
+    });
+}
+
+// Starts the background task (launched with async()) that performs the
+// yieldable JSON parsing, and keeps its future in _run_parse_json_thread.
+// The task loops: it waits on _document_waiting, parses _raw_document with
+// rjson::parse_yieldable(), stores either the result in _parsed_document or
+// the thrown exception in _current_exception, and then signals
+// _document_parsed. It returns when it is woken after _as has been aborted.
+server::json_parser::json_parser() : _run_parse_json_thread(async([this] {
+        while (true) {
+            _document_waiting.wait().get();
+            // stop() requests the abort before waking us, so check it first.
+            if (_as.abort_requested()) {
+                return;
+            }
+            try {
+                _parsed_document = rjson::parse_yieldable(_raw_document);
+                // Clear any exception left over from a previous document.
+                _current_exception = nullptr;
+            } catch (...) {
+                _current_exception = std::current_exception();
+            }
+            _document_parsed.signal();
+        }
+    })) {
+}
+
+// Parses content into an rjson::value.
+// Documents shorter than yieldable_parsing_threshold are parsed inline with
+// rjson::parse(). Larger ones are handed to the background parsing task:
+// one unit of _parsing_sem is held for the whole handoff, so only one
+// document at a time occupies the shared _raw_document / _parsed_document /
+// _current_exception slots.
+// content is stored as a std::string_view, so the caller must keep the
+// underlying buffer alive until the returned future resolves.
+// NOTE(review): on the inline path an exception thrown by rjson::parse()
+// would propagate out of this call rather than through the returned future;
+// confirm callers invoke parse() from within a continuation.
+future<rjson::value> server::json_parser::parse(std::string_view content) {
+    if (content.size() < yieldable_parsing_threshold) {
+        return make_ready_future<rjson::value>(rjson::parse(content));
+    }
+    return with_semaphore(_parsing_sem, 1, [this, content] {
+        _raw_document = content;
+        // Wake the parsing task, then wait for it to report completion.
+        _document_waiting.signal();
+        return _document_parsed.wait().then([this] {
+            if (_current_exception) {
+                return make_exception_future<rjson::value>(_current_exception);
+            }
+            return make_ready_future<rjson::value>(std::move(_parsed_document));
+        });
+    });
+}
+
+// Shuts the background parsing task down: requests an abort on _as, signals
+// _document_waiting so the task wakes up and observes the abort, and marks
+// _document_parsed as broken. Returns the task's future (moved out of
+// _run_parse_json_thread).
+// NOTE(review): breaking _document_parsed presumably fails any parse() call
+// still waiting on it - confirm against seastar::condition_variable::broken().
+future<> server::json_parser::stop() {
+    _as.request_abort();
+    _document_waiting.signal();
+    _document_parsed.broken();
+    return std::move(_run_parse_json_thread);
+}
+
 }

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -27,27 +27,56 @@
 #include <seastar/net/tls.hh>
 #include <optional>
 #include <alternator/auth.hh>
+#include <utils/small_vector.hh>
+#include <seastar/core/units.hh>

 namespace alternator {

 class server {
-    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, std::unique_ptr<request>)>;
+    static constexpr size_t content_length_limit = 16*MB;
+    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
+            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

-    seastar::httpd::http_server_control _control;
-    seastar::httpd::http_server_control _https_control;
-    seastar::sharded<executor>& _executor;
+    http_server _http_server;
+    http_server _https_server;
+    executor& _executor;
+
    key_cache _key_cache;
    bool _enforce_authorization;
+    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
+    gate _pending_requests;
    alternator_callbacks_map _callbacks;
-public:
-    server(seastar::sharded<executor>& executor);

-    seastar::future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization);
+    semaphore* _memory_limiter;
+
+    class json_parser { // Parses request bodies; large documents go through a loop that uses rjson::parse_yieldable.
+        static constexpr size_t yieldable_parsing_threshold = 16*KB; // parse() handles smaller documents inline.
+        std::string_view _raw_document; // Non-owning view of the document handed to the parsing loop.
+        rjson::value _parsed_document; // Result of the last successful parse; moved out by parse().
+        std::exception_ptr _current_exception; // Error from the last parse, or null if it succeeded.
+        semaphore _parsing_sem{1}; // Taken by parse() so one document at a time uses the fields above.
+        condition_variable _document_waiting; // Signalled when a document is ready for the loop, and by stop().
+        condition_variable _document_parsed; // Signalled by the loop when a result or an error is ready.
+        abort_source _as; // Triggered by stop() to make the loop exit.
+        future<> _run_parse_json_thread; // Future of the loop started in the constructor; returned by stop().
+    public:
+        json_parser();
+        future<rjson::value> parse(std::string_view content); // The viewed buffer is read after this returns; keep it alive until the future resolves.
+        future<> stop();
+    };
+    json_parser _json_parser;
+
+public:
+    server(executor& executor);
+
+    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+            bool enforce_authorization, semaphore* memory_limiter);
+    future<> stop();
 private:
    void set_routes(seastar::httpd::routes& r);
    future<> verify_signature(const seastar::httpd::request& r);
-    future<json::json_return_type> handle_api_request(std::unique_ptr<request>&& req);
+    future<executor::request_return_type> handle_api_request(std::unique_ptr<request>&& req);
 };

 }
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -85,6 +85,12 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of total operations via Alternator API")),
            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
                    seastar::metrics::description("number of performed read-before-write operations")),
+            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
+                    seastar::metrics::description("number of writes that used LWT")),
+            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
+                    seastar::metrics::description("number of writes that had to be bounced from this shard because of LWT requirements")),
+            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
+                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure.")),
            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
                    seastar::metrics::description("number of rows read during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -84,6 +84,9 @@ public:
    uint64_t total_operations = 0;
    uint64_t unsupported_operations = 0;
    uint64_t reads_before_write = 0;
+    uint64_t write_using_lwt = 0;
+    uint64_t shard_bounce_for_lwt = 0;
+    uint64_t requests_blocked_memory = 0;
    // CQL-derived stats
    cql3::cql_stats cql_stats;
 private:
--- a/alternator/tags_extension.hh
+++ b/alternator/tags_extension.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "schema.hh"
+#include "db/extensions.hh"
+
+namespace alternator {
+
+class tags_extension : public schema_extension { // Schema extension that carries a string-to-string map of tags.
+public:
+    static constexpr auto NAME = "scylla_tags";
+
+    tags_extension() = default;
+    explicit tags_extension(std::map<sstring, sstring> tags) : _tags(std::move(tags)) {} // By value: rvalue arguments are moved, lvalues copied once.
+    explicit tags_extension(bytes b) : _tags(tags_extension::deserialize(b)) {} // Rebuilds the map from the form produced by serialize().
+    explicit tags_extension(const sstring& s) { // Always throws: construction from a string is not supported.
+        throw std::logic_error("Cannot create tags from string");
+    }
+    bytes serialize() const override { // Encodes the tag map into a bytes buffer.
+        return ser::serialize_to_buffer<bytes>(_tags);
+    }
+    static std::map<sstring, sstring> deserialize(bytes_view buffer) { // Inverse of serialize().
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const std::map<sstring, sstring>& tags() const { // Read-only access to the stored tags.
+        return _tags;
+    }
+private:
+    std::map<sstring, sstring> _tags;
+};
+
+}
--- a/api/api-doc/cache_service.json
+++ b/api/api-doc/cache_service.json
@@ -13,7 +13,7 @@
            {
               "method":"GET",
               "summary":"get row cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -35,7 +35,7 @@
                     "description":"row cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"get key cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -70,7 +70,7 @@
                     "description":"key cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -83,7 +83,7 @@
            {
               "method":"GET",
               "summary":"get counter cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -105,7 +105,7 @@
                     "description":"counter cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -118,7 +118,7 @@
            {
               "method":"GET",
               "summary":"get row cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -140,7 +140,7 @@
                     "description":"row cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -153,7 +153,7 @@
            {
               "method":"GET",
               "summary":"get key cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -175,7 +175,7 @@
                     "description":"key cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -188,7 +188,7 @@
            {
               "method":"GET",
               "summary":"get counter cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -210,7 +210,7 @@
                     "description":"counter cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -448,7 +448,7 @@
        {
          "method": "GET",
          "summary": "Get key entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_key_entries",
          "produces": [
            "application/json"
@@ -568,7 +568,7 @@
        {
          "method": "GET",
          "summary": "Get row entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_row_entries",
          "produces": [
            "application/json"
@@ -688,7 +688,7 @@
        {
          "method": "GET",
          "summary": "Get counter entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_counter_entries",
          "produces": [
            "application/json"
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -70,7 +70,7 @@
            {
               "method":"POST",
               "summary":"Force a major compaction of this column family",
-               "type":"string",
+               "type":"void",
               "nickname":"force_major_compaction",
               "produces":[
                  "application/json"
@@ -121,7 +121,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -172,7 +172,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -223,7 +223,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -231,7 +231,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -544,7 +544,7 @@
               "summary":"sstable count for each level. empty unless leveled compaction is used",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_sstable_count_per_level",
               "produces":[
@@ -636,7 +636,7 @@
                     "description":"Duration (in milliseconds) of monitoring operation",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -644,7 +644,7 @@
                    "description":"number of the top partitions to list",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 },
                 {
@@ -652,7 +652,7 @@
                    "description":"capacity of stream summary: determines amount of resources used in query processing",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 }
              ]
@@ -921,7 +921,7 @@
            {
               "method":"GET",
               "summary":"Get memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -945,7 +945,7 @@
            {
               "method":"GET",
               "summary":"Get all memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -1082,7 +1082,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1235,7 +1235,7 @@
            {
               "method":"GET",
               "summary":"Get all read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_read_latency",
               "produces":[
                  "application/json"
@@ -1251,7 +1251,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
@@ -1275,7 +1275,7 @@
            {
               "method":"GET",
               "summary":"Get all range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_range_latency",
               "produces":[
                  "application/json"
@@ -1291,7 +1291,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1444,7 +1444,7 @@
            {
               "method":"GET",
               "summary":"Get all write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_write_latency",
               "produces":[
                  "application/json"
@@ -1460,7 +1460,7 @@
            {
               "method":"GET",
               "summary":"Get pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_flushes",
               "produces":[
                  "application/json"
@@ -1484,7 +1484,7 @@
            {
               "method":"GET",
               "summary":"Get all pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_flushes",
               "produces":[
                  "application/json"
@@ -1500,7 +1500,7 @@
            {
               "method":"GET",
               "summary":"Get pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_compactions",
               "produces":[
                  "application/json"
@@ -1524,7 +1524,7 @@
            {
               "method":"GET",
               "summary":"Get all pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_compactions",
               "produces":[
                  "application/json"
@@ -1540,7 +1540,7 @@
            {
               "method":"GET",
               "summary":"Get live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1564,7 +1564,7 @@
            {
               "method":"GET",
               "summary":"Get all live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1580,7 +1580,7 @@
            {
               "method":"GET",
               "summary":"Get live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1604,7 +1604,7 @@
            {
               "method":"GET",
               "summary":"Get all live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1620,7 +1620,7 @@
            {
               "method":"GET",
               "summary":"Get total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -1644,7 +1644,7 @@
            {
               "method":"GET",
               "summary":"Get all total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -2100,7 +2100,7 @@
            {
               "method":"GET",
               "summary":"Get speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_speculative_retries",
               "produces":[
                  "application/json"
@@ -2124,7 +2124,7 @@
            {
               "method":"GET",
               "summary":"Get all speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_speculative_retries",
               "produces":[
                  "application/json"
@@ -2204,7 +2204,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2228,7 +2228,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2244,7 +2244,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2268,7 +2268,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2284,7 +2284,7 @@
            {
               "method":"GET",
               "summary":"Get row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2308,7 +2308,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2324,7 +2324,7 @@
            {
               "method":"GET",
               "summary":"Get cas prepare",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_prepare",
               "produces":[
                  "application/json"
@@ -2348,7 +2348,7 @@
            {
               "method":"GET",
               "summary":"Get cas propose",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_propose",
               "produces":[
                  "application/json"
@@ -2372,7 +2372,7 @@
            {
               "method":"GET",
               "summary":"Get cas commit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_commit",
               "produces":[
                  "application/json"
--- a/api/api-doc/compaction_manager.json
+++ b/api/api-doc/compaction_manager.json
@@ -118,7 +118,7 @@
        {
          "method": "GET",
          "summary": "Get pending tasks",
-          "type": "int",
+          "type": "long",
          "nickname": "get_pending_tasks",
          "produces": [
            "application/json"
@@ -181,7 +181,7 @@
        {
          "method": "GET",
          "summary": "Get bytes compacted",
-          "type": "int",
+          "type": "long",
          "nickname": "get_bytes_compacted",
          "produces": [
            "application/json"
@@ -197,7 +197,7 @@
         "description":"A row merged information",
         "properties":{
            "key":{
-               "type":"int",
+               "type": "long",
               "description":"The number of sstable"
            },
            "value":{
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -0,0 +1,90 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/error_injection",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/v2/error_injection/injection/{injection}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Activate an injection that triggers an error in code",
+               "type":"void",
+               "nickname":"enable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name, should correspond to an injection added in code",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"one_shot",
+                     "description":"boolean flag indicating whether the injection should be enabled to trigger only once",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate an injection previously activated by the API",
+               "type":"void",
+               "nickname":"disable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/v2/error_injection/injection",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"List all enabled injections on all shards, i.e. injections that will trigger an error in the code",
+               "type":"array",
+               "items":{
+                  "type":"string"
+               },
+               "nickname":"get_enabled_injections_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate all injections previously activated on all shards by the API",
+               "type":"void",
+               "nickname":"disable_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/failure_detector.json
+++ b/api/api-doc/failure_detector.json
@@ -110,7 +110,7 @@
            {
               "method":"GET",
               "summary":"Get count down endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_down_endpoint_count",
               "produces":[
                  "application/json"
@@ -126,7 +126,7 @@
            {
               "method":"GET",
               "summary":"Get count up endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_up_endpoint_count",
               "produces":[
                  "application/json"
@@ -180,11 +180,11 @@
                    "description": "The endpoint address"
                },
                "generation": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat generation"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat version"
                },
                "update_time": {
@@ -209,7 +209,7 @@
           "description": "Holds a version value for an application state",
               "properties": {
                "application_state": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state enum index"
                },
                "value": {
@@ -217,7 +217,7 @@
                    "description": "The version value"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state version"
                }
            }
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -75,7 +75,7 @@
            {
               "method":"GET",
               "summary":"Returns files which are pending for archival attempt. Does NOT include failed archive attempts",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -99,7 +99,7 @@
            {
               "method":"GET",
               "summary":"Get heart beat version for a node",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_heart_beat_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/hinted_handoff.json
+++ b/api/api-doc/hinted_handoff.json
@@ -99,7 +99,7 @@
        {
          "method": "GET",
          "summary": "Get create hint count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_create_hint_count",
          "produces": [
            "application/json"
@@ -123,7 +123,7 @@
        {
          "method": "GET",
          "summary": "Get not stored hints count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_not_stored_hints_count",
          "produces": [
            "application/json"
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -191,7 +191,7 @@
            {
               "method":"GET",
               "summary":"Get the version number",
-               "type":"int",
+               "type": "long",
               "nickname":"get_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -105,7 +105,7 @@
            {
               "method":"GET",
               "summary":"Get the max hint window",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hint_window",
               "produces":[
                  "application/json"
@@ -128,7 +128,7 @@
                     "description":"max hint window in ms",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -141,7 +141,7 @@
            {
               "method":"GET",
               "summary":"Get max hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hints_in_progress",
               "produces":[
                  "application/json"
@@ -164,7 +164,7 @@
                     "description":"max hints in progress",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -177,7 +177,7 @@
            {
               "method":"GET",
               "summary":"get hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_hints_in_progress",
               "produces":[
                  "application/json"
@@ -602,7 +602,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -632,7 +632,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_condition_not_met",
          "produces": [
            "application/json"
@@ -641,13 +641,28 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/cas_write/failed_read_round_optimization",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get cas write metrics",
+          "type": "long",
+          "nickname": "get_cas_write_metrics_failed_read_round_optimization",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/cas_read/unfinished_commit",
      "operations": [
        {
          "method": "GET",
          "summary": "Get cas read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_read_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -677,7 +692,7 @@
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_timeouts",
          "produces": [
            "application/json"
@@ -692,7 +707,7 @@
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_unavailables",
          "produces": [
            "application/json"
@@ -827,7 +842,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_timeouts",
          "produces": [
            "application/json"
@@ -842,7 +857,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_unavailables",
          "produces": [
            "application/json"
@@ -887,7 +902,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_timeouts",
          "produces": [
            "application/json"
@@ -902,7 +917,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_unavailables",
          "produces": [
            "application/json"
@@ -1008,7 +1023,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1040,7 +1055,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1072,7 +1087,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -458,7 +458,7 @@
            {
               "method":"GET",
               "summary":"Return the generation value for this node.",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -582,7 +582,15 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Comma separated keyspace names whose snapshots will be deleted",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"an optional table name whose snapshot will be deleted",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -646,7 +654,7 @@
            {
               "method":"POST",
               "summary":"Trigger a cleanup of keys on a single keyspace",
-               "type":"int",
+               "type": "long",
               "nickname":"force_keyspace_cleanup",
               "produces":[
                  "application/json"
@@ -678,7 +686,7 @@
            {
               "method":"GET",
               "summary":"Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace. If columnFamilies array is empty, all CFs are scrubbed. Scrubbed CFs will be snapshotted first, if disableSnapshot is false",
-               "type":"int",
+               "type": "long",
               "nickname":"scrub",
               "produces":[
                  "application/json"
@@ -726,7 +734,7 @@
            {
               "method":"GET",
               "summary":"Rewrite all sstables to the latest version. Unlike scrub, it doesn't skip bad rows and do not snapshot sstables first.",
-               "type":"int",
+               "type": "long",
               "nickname":"upgrade_sstables",
               "produces":[
                  "application/json"
@@ -800,7 +808,7 @@
               "summary":"Return an array with the ids of the currently active repairs",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_active_repair_async",
               "produces":[
@@ -816,7 +824,7 @@
            {
               "method":"POST",
               "summary":"Invoke repair asynchronously. You can track repair progress by using the get supplying id",
-               "type":"int",
+               "type": "long",
               "nickname":"repair_async",
               "produces":[
                  "application/json"
@@ -947,7 +955,7 @@
                     "description":"The repair ID to check for status",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1277,18 +1285,18 @@
                  },
                  {
                     "name":"dynamic_update_interval",
-                     "description":"integer, in ms (default 100)",
+                     "description":"interval in ms (default 100)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
                     "name":"dynamic_reset_interval",
-                     "description":"integer, in ms (default 600,000)",
+                     "description":"interval in ms (default 600,000)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
@@ -1493,7 +1501,7 @@
                     "description":"Stream throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1501,7 +1509,7 @@
            {
               "method":"GET",
               "summary":"Get stream throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_stream_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1517,7 +1525,7 @@
            {
               "method":"GET",
               "summary":"get compaction throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_compaction_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1539,7 +1547,7 @@
                     "description":"compaction throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1943,7 +1951,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for warning of queries with many tombstones",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_warn_threshold",
               "produces":[
                  "application/json"
@@ -1965,7 +1973,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1978,7 +1986,7 @@
            {
               "method":"GET",
               "summary":"",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_failure_threshold",
               "produces":[
                  "application/json"
@@ -2000,7 +2008,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2013,7 +2021,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for rejecting queries due to a large batch size",
-               "type":"int",
+               "type": "long",
               "nickname":"get_batch_size_failure_threshold",
               "produces":[
                  "application/json"
@@ -2035,7 +2043,7 @@
                     "description":"batch size debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2059,7 +2067,7 @@
                     "description":"throttle in kb",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2072,7 +2080,7 @@
            {
               "method":"GET",
               "summary":"Get load",
-               "type":"int",
+               "type": "long",
               "nickname":"get_metrics_load",
               "produces":[
                  "application/json"
@@ -2088,7 +2096,7 @@
            {
               "method":"GET",
               "summary":"Get exceptions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_exceptions",
               "produces":[
                  "application/json"
@@ -2104,7 +2112,7 @@
            {
               "method":"GET",
               "summary":"Get total hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints_in_progress",
               "produces":[
                  "application/json"
@@ -2120,7 +2128,7 @@
            {
               "method":"GET",
               "summary":"Get total hints",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints",
               "produces":[
                  "application/json"
--- a/api/api-doc/stream_manager.json
+++ b/api/api-doc/stream_manager.json
@@ -32,7 +32,7 @@
            {
               "method":"GET",
               "summary":"Get number of active outbound streams",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_active_streams_outbound",
               "produces":[
                  "application/json"
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"Get total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -72,7 +72,7 @@
            {
               "method":"GET",
               "summary":"Get all total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -88,7 +88,7 @@
            {
               "method":"GET",
               "summary":"Get total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -112,7 +112,7 @@
            {
               "method":"GET",
               "summary":"Get all total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -154,7 +154,7 @@
               "description":"The peer"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "connecting":{
@@ -211,7 +211,7 @@
               "description":"The ID"
            },
            "files":{
-               "type":"int",
+               "type": "long",
               "description":"Number of files to transfer. Can be 0 if nothing to transfer for some streaming request."
            },
            "total_size":{
@@ -242,7 +242,7 @@
               "description":"The peer address"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "file_name":{
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -52,6 +52,21 @@
            }
         ]
      },
+      {
+         "path":"/system/uptime_ms",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get system uptime, in milliseconds",
+               "type":"long",
+               "nickname":"get_system_uptime",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      },
      {
         "path":"/system/logger/{name}",
         "operations":[
--- a/api/api.cc
+++ b/api/api.cc
@@ -36,6 +36,7 @@
 #include "endpoint_snitch.hh"
 #include "compaction_manager.hh"
 #include "hinted_handoff.hh"
+#include "error_injection.hh"
 #include <seastar/http/exception.hh>
 #include "stream_manager.hh"
 #include "system.hh"
@@ -68,13 +69,19 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        set_config(rb02, ctx, r);
        rb->register_function(r, "system",
                "The system related API");
        set_system(ctx, r);
    });
 }

+future<> set_server_config(http_context& ctx) {
+    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
+    return ctx.http_server.set_routes([&ctx, rb02](routes& r) {
+        set_config(rb02, ctx, r);
+    });
+}
+
 static future<> register_api(http_context& ctx, const sstring& api_name,
        const sstring api_desc,
        std::function<void(http_context& ctx, routes& r)> f) {
@@ -90,6 +97,10 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

+future<> set_server_snapshot(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
+}
+
 future<> set_server_snitch(http_context& ctx) {
    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
 }
@@ -153,6 +164,9 @@ future<> set_server_done(http_context& ctx) {
        rb->register_function(r, "collectd",
                "The collectd API");
        set_collectd(ctx, r);
+        rb->register_function(r, "error_injection",
+                "The error injection API");
+        set_error_injection(ctx, r);
    });
 }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -23,6 +23,9 @@
 #include "service/storage_proxy.hh"
 #include <seastar/http/httpd.hh>

+namespace service { class load_meter; }
+namespace locator { class token_metadata; }
+
 namespace api {

 struct http_context {
@@ -31,15 +34,21 @@ struct http_context {
    httpd::http_server_control http_server;
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
+    service::load_meter& lmeter;
+    sharded<locator::token_metadata>& token_metadata;
+
    http_context(distributed<database>& _db,
-            distributed<service::storage_proxy>& _sp)
-            : db(_db), sp(_sp) {
+            distributed<service::storage_proxy>& _sp,
+            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
+            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
    }
 };

 future<> set_server_init(http_context& ctx);
+future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx);
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -64,7 +64,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

-        auto id = make_shared<scollectd::type_instance_id>(req->param["pluginid"],
+        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -994,5 +994,15 @@ void set_column_family(http_context& ctx, routes& r) {
        });
    });

+    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        if (req->get_query_param("split_output") != "") {
+            fail(unimplemented::cause::API);
+        }
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            return cf.compact_all_sstables();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
 }
 }
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "api/api-doc/error_injection.json.hh"
+#include "api/api.hh"
+
+#include <seastar/http/exception.hh>
+#include "log.hh"
+#include "utils/error_injection.hh"
+#include "seastar/core/future-util.hh"
+
+namespace api {
+
+namespace hf = httpd::error_injection_json;
+
+void set_error_injection(http_context& ctx, routes& r) {
+
+    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+        bool one_shot = req->get_query_param("one_shot") == "True";
+        auto& errinj = utils::get_local_injector();
+        errinj.enable_on_all(injection, one_shot);
+        return make_ready_future<json::json_return_type>(json::json_void());
+    });
+
+    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        auto ret = errinj.enabled_injections_on_all();
+        return make_ready_future<json::json_return_type>(ret);
+    });
+
+    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+
+        auto& errinj = utils::get_local_injector();
+        errinj.disable_on_all(injection);
+        return make_ready_future<json::json_return_type>(json::json_void());
+    });
+
+    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        errinj.disable_on_all();
+        return make_ready_future<json::json_return_type>(json::json_void());
+    });
+
+}
+
+} // namespace api
--- a/api/error_injection.hh
+++ b/api/error_injection.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "api.hh"
+
+namespace api {
+
+void set_error_injection(http_context& ctx, routes& r);
+
+}
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -27,6 +27,7 @@
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "database.hh"
+#include "seastar/core/scheduling_specific.hh"

 namespace api {

@@ -34,12 +35,70 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

-static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
-    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(),
-            std::plus<utils::rate_moving_average>());
+
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param mapper -  the internal mapper that is used to map the internal
+ * stat class into a value of type `V`.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename InnerMapper>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        InnerMapper mapper, Reducer reducer, V initial_value) {
+    return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) {
+        return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>(
+                mapper, reducer, initial_value, sp.get_stats_key());
+    }, initial_value, reducer);
 }

-static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param f - a field pointer which is the implicit internal mapper.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename F>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        V F::*f, Reducer reducer, V initial_value) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) {
+        return stats.*f;
+    }, reducer, initial_value);
+}
+
+/**
+ * A partial specialization of sum_stats for the storage proxy
+ * case, where the get stats function doesn't return a
+ * stats object with fields but a per scheduling group
+ * stats object. The name was also changed, since partial
+ * specialization of functions is not supported in C++.
+ *
+ */
+template<typename V, typename F>
+future<json::json_return_type>  sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) {
+        return make_ready_future<json::json_return_type>(val);
+    });
+}
+
+
+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average>(), utils::rate_moving_average());
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        httpd::utils_json::rate_moving_average m;
        m = val;
@@ -51,29 +110,72 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
    return timer_to_json(utils::rate_moving_average_and_histogram());
 }

-static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        return make_ready_future<json::json_return_type>(val.count);
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, utils::estimated_histogram(),
-            utils::estimated_histogram_merge).then([](const utils::estimated_histogram& val) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
+            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
-            std::plus<double>()).then([](double val) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
+            return (stats.*f).hist.mean * (stats.*f).hist.count;
+        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

+/**
+ * A partial Specialization of sum_histogram_stats
+ * for the storage proxy case where the get stats
+ * function doesn't return a stats object with
+ * fields but a per scheduling group stats object,
+ * the name was also changed since function partial
+ * specialization is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_histogram_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).hist;
+    }, std::plus<utils::ihistogram>(), utils::ihistogram()).
+            then([](const utils::ihistogram& val) {
+        return make_ready_future<json::json_return_type>(to_json(val));
+    });
+}
+
+/**
+ * A partial Specialization of sum_timer_stats for the
+ * storage proxy case where the get stats function
+ * doesn't return a stats object with fields but a
+ * per scheduling group stats object, the name
+ * was also changed since partial function specialization
+ * is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_timer_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average_and_histogram>(),
+            utils::rate_moving_average_and_histogram()).then([](const utils::rate_moving_average_and_histogram& val) {
+        return make_ready_future<json::json_return_type>(timer_to_json(val));
+    });
+}
+
 void set_storage_proxy(http_context& ctx, routes& r) {
    sp::get_total_hints.set(r, [](std::unique_ptr<request> req)  {
        //TBD
@@ -223,15 +325,15 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_attempts);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_blocking);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_background);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

    sp::get_schema_versions.set(r, [](std::unique_ptr<request> req)  {
@@ -275,6 +377,10 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
+    });
+
    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });
@@ -284,71 +390,71 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::write);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });
    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
@@ -367,30 +473,30 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_read);
    });

    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::read);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_write);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_write);
    });

    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::write);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::range);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -27,6 +27,7 @@
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/adaptor/filtered.hpp>
 #include "service/storage_service.hh"
+#include "service/load_meter.hh"
 #include "db/commitlog/commitlog.hh"
 #include "gms/gossiper.hh"
 #include "db/system_keyspace.hh"
@@ -41,8 +42,6 @@
 #include "database.hh"
 #include "db/extensions.hh"

-sstables::sstable::version_types get_highest_supported_format();
-
 namespace api {

 namespace ss = httpd::storage_service_json;
@@ -55,57 +54,53 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r; // JSON-side counterpart of d, filled field by field below
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (const auto& det : d._endpoint_details) { // bind by reference: no per-endpoint copy
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") { // rack stays unset when the source value is empty
+            ed.rack = det._rack;
        }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
    }
-    return res;
+    return r;
+}
+
+using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
+
+static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) { // adapts f to a plain request handler
+    return [&ctx, handler = std::move(f)](std::unique_ptr<request> req) { // ctx is held by reference
+        auto ks = validate_keyspace(ctx, req->param);
+        auto tables = split_cf(req->get_query_param("cf"));
+        if (tables.empty()) { // nothing selected via "cf": use every key of the keyspace's cf_meta_data()
+            tables = map_keys(ctx.db.local().find_keyspace(ks).metadata().get()->cf_meta_data());
+        }
+        return handler(ctx, std::move(req), std::move(ks), std::move(tables));
+    };
 }

 void set_storage_service(http_context& ctx, routes& r) {
-    using ks_cf_func = std::function<future<json::json_return_type>(std::unique_ptr<request>, sstring, std::vector<sstring>)>;
-
-    auto wrap_ks_cf = [&ctx](ks_cf_func f) {
-        return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
-            auto keyspace = validate_keyspace(ctx, req->param);
-            auto column_families = split_cf(req->get_query_param("cf"));
-            if (column_families.empty()) {
-                column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
-            }
-            return f(std::move(req), std::move(keyspace), std::move(column_families));
-        };
-    };
-
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
            return make_ready_future<json::json_return_type>(id.to_sstring());
        });
    });

-    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

-    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -123,8 +118,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        }));
    });

-    ss::get_leaving_nodes.set(r, [](const_req req) {
-        return container_to_vec(service::get_local_storage_service().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -132,8 +127,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -176,27 +171,26 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

-    ss::get_host_id_map.set(r, [](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(service::get_local_storage_service().
-                get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
        return get_cf_stats(ctx, &column_family_stats::live_disk_space_used);
    });

-    ss::get_load_map.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_load_map().then([] (auto&& load_map) {
+    ss::get_load_map.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return ctx.lmeter.get_load_map().then([] (auto&& load_map) {
            std::vector<ss::map_string_double> res;
            for (auto i : load_map) {
                ss::map_string_double val;
@@ -221,67 +215,6 @@ void set_storage_service(http_context& ctx, routes& r) {
                req.get_query_param("key")));
    });

-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_snapshot_details().then([] (auto result) {
-            std::vector<ss::snapshots> res;
-            for (auto& map: result) {
-                ss::snapshots all_snapshots;
-                all_snapshots.key = map.first;
-
-                std::vector<ss::snapshot> snapshot;
-                for (auto& cf: map.second) {
-                    ss::snapshot s;
-                    s.ks = cf.ks;
-                    s.cf = cf.cf;
-                    s.live = cf.live;
-                    s.total = cf.total;
-                    snapshot.push_back(std::move(s));
-                }
-                all_snapshots.value = std::move(snapshot);
-                res.push_back(std::move(all_snapshots));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-
-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_family.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
-        } else {
-            if (keynames.empty()) {
-                throw httpd::bad_param_exception("The keyspace of column families must be specified");
-            }
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
-        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
-            return make_ready_future<json::json_return_type>(size);
-        });
-    });
-
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = split_cf(req->get_query_param("cf"));
@@ -319,8 +252,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -328,32 +261,7 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::scrub.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
-        // TODO: respect this
-        auto skip_corrupted = req->get_query_param("skip_corrupted");
-
-        auto f = make_ready_future<>();
-        if (!req_param<bool>(*req, "disable_snapshot", false)) {
-            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
-            });
-        }
-
-        return f.then([&ctx, keyspace, column_families] {
-            return ctx.db.invoke_on_all([=] (database& db) {
-                return do_for_each(column_families, [=, &db](sstring cfname) {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(&cf);
-                });
-            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
-        });
-    }));
-
-    ss::upgrade_sstables.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

        return ctx.db.invoke_on_all([=] (database& db) {
@@ -608,9 +516,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::join_ring.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().join_ring().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        return make_ready_future<json::json_return_type>(json_void());
    });

    ss::is_joined.set(r, [] (std::unique_ptr<request> req) {
@@ -1041,4 +947,107 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

+// Registers the snapshot REST handlers (details, take, delete, true size) and the scrub handler on r.
+void set_snapshot(http_context& ctx, routes& r) {
+    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
+        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
+            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
+                return s.write("[").then([&s, &first] {
+                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
+                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
+                            // Take each map entry by const reference; a tuple parameter would copy the key and the details vector.
+                            return do_for_each(result, [&s, &first](const std::pair<const sstring, std::vector<service::storage_service::snapshot_details>>& entry){
+                                return do_with(ss::snapshots(), [&s, &first, &entry](ss::snapshots& all_snapshots) {
+                                    all_snapshots.key = entry.first;
+                                    future<> f = first ? make_ready_future<>() : s.write(", ");
+                                    first = false;
+                                    std::vector<ss::snapshot> snapshot;
+                                    for (auto& cf: entry.second) {
+                                        ss::snapshot snp;
+                                        snp.ks = cf.ks;
+                                        snp.cf = cf.cf;
+                                        snp.live = cf.live;
+                                        snp.total = cf.total;
+                                        snapshot.push_back(std::move(snp));
+                                    }
+                                    all_snapshots.value = std::move(snapshot);
+                                    return f.then([&s, &all_snapshots] {
+                                        return all_snapshots.write(s);
+                                    });
+                                });
+                            });
+                        });
+                    }).then([&s] {
+                        return s.write("]").then([&s] {
+                            return s.close();
+                        });
+                    });
+                });
+            });
+        };
+        return make_ready_future<json::json_return_type>(std::move(f));
+    });
+
+    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        // With a "cf" given, "kn" must name exactly one keyspace; without it, all of "kn" goes to take_snapshot().
+        auto resp = make_ready_future<>();
+        if (column_family.empty()) {
+            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+        } else {
+            if (keynames.empty()) {
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
+            if (keynames.size() > 1) {
+                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+            }
+            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
+        }
+        return resp.then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
+        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+            return make_ready_future<json::json_return_type>(size);
+        });
+    });
+
+    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);
+        // Unless "disable_snapshot" is set, snapshot every selected table under a "pre-scrub-<clock count>" tag first.
+        auto f = make_ready_future<>();
+        if (!req_param<bool>(*req, "disable_snapshot", false)) {
+            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
+            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
+                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            });
+        }
+        // Then run the scrub through ctx.db.invoke_on_all, one table at a time.
+        return f.then([&ctx, keyspace, column_families, skip_corrupted] {
+            return ctx.db.invoke_on_all([=] (database& db) {
+                return do_for_each(column_families, [=, &db](sstring cfname) {
+                    auto& cm = db.get_compaction_manager();
+                    auto& cf = db.find_column_family(keyspace, cfname);
+                    return cm.perform_sstable_scrub(&cf, skip_corrupted);
+                });
+            });
+        }).then([]{
+            return make_ready_future<json::json_return_type>(0);
+        });
+    }));
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -26,5 +26,6 @@
 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r);

 }
--- a/api/system.cc
+++ b/api/system.cc
@@ -30,6 +30,10 @@ namespace api {
 namespace hs = httpd::system_json;

 void set_system(http_context& ctx, routes& r) {
+    hs::get_system_uptime.set(r, [](const_req req) {
+        return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
+    });
+
    hs::get_all_logger_names.set(r, [](const_req req) {
        return logging::logger_registry().get_all_logger_names();
    });
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -21,6 +21,7 @@

 #include "atomic_cell.hh"
 #include "atomic_cell_or_collection.hh"
+#include "counters.hh"
 #include "types.hh"

 /// LSA mirator for cells with irrelevant type
@@ -214,6 +215,61 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
        + imr_object_type::size_overhead + external_value_size;
 }

+std::ostream&
+operator<<(std::ostream& os, const atomic_cell_view& acv) {
+    // Type-agnostic rendering: the value is dumped as hex unless the cell is a counter update.
+    if (!acv.is_live()) {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            acv.timestamp(), acv.deletion_time().time_since_epoch().count());
+    }
+    const bool expiring = acv.is_live_and_has_ttl();
+    auto value_text = acv.is_counter_update()
+            ? "counter_update_value=" + to_sstring(acv.counter_update_value())
+            : to_hex(acv.value().linearize());
+    return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}", value_text, acv.timestamp(),
+        expiring ? acv.expiry().time_since_epoch().count() : -1,
+        expiring ? acv.ttl().count() : 0);
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell& ac) {
+    return os << atomic_cell_view(ac); // print through a view of the cell, reusing the atomic_cell_view overload
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell_view::printer& acvp) {
+    // Type-aware rendering: counters print their update value or shards, other cells use the type's to_string().
+    const auto& cell = acvp._cell;
+    // Dead cells carry only a timestamp and a deletion time.
+    if (!cell.is_live()) {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            cell.timestamp(), cell.deletion_time().time_since_epoch().count());
+    }
+    std::ostringstream value_text;
+    if (!acvp._type.is_counter()) {
+        value_text << acvp._type.to_string(cell.value().linearize());
+    } else if (cell.is_counter_update()) {
+        value_text << "counter_update_value=" << cell.counter_update_value();
+    } else {
+        value_text << "shards: ";
+        counter_cell_view::with_linearized(cell, [&value_text] (counter_cell_view& ccv) {
+            value_text << ::join(", ", ccv.shards());
+        });
+    }
+    // Cells without a TTL print expiry=-1 and ttl=0.
+    const bool expiring = cell.is_live_and_has_ttl();
+    return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
+        value_text.str(),
+        cell.timestamp(),
+        expiring ? cell.expiry().time_since_epoch().count() : -1,
+        expiring ? cell.ttl().count() : 0);
+}
+
+std::ostream&
+operator<<(std::ostream& os, const atomic_cell::printer& acp) {
+    return operator<<(os, static_cast<const atomic_cell_view::printer&>(acp)); // cast to the base so the view printer's overload does the formatting
+}
+
 std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::printer& p) {
    if (!p._cell._data.get()) {
        return os << "{ null atomic_cell_or_collection }";
@@ -223,9 +279,9 @@ std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::prin
    if (dc::structure::get_member<dc::tags::flags>(p._cell._data.get()).get<dc::tags::collection>()) {
        os << "collection ";
        auto cmv = p._cell.as_collection_mutation();
-        os << to_hex(cmv.data.linearize());
+        os << collection_mutation_view::printer(*p._cdef.type, cmv);
    } else {
-        os << p._cell.as_atomic_cell(p._cdef);
+        os << atomic_cell_view::printer(*p._cdef.type, p._cell.as_atomic_cell(p._cdef));
    }
    return os << " }";
 }
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -153,6 +153,14 @@ public:
    }

    friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
+
+    class printer { // pairs a cell view with its type so operator<< can format the value using the type
+        const abstract_type& _type; // both members are references: the referents must outlive the printer
+        const atomic_cell_view& _cell;
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : _type(type), _cell(cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp);
+    };
 };

 class atomic_cell_mutable_view final : public basic_atomic_cell_view<mutable_view::yes> {
@@ -219,6 +227,12 @@ public:
    static atomic_cell make_live_uninitialized(const abstract_type& type, api::timestamp_type timestamp, size_t size);
    friend class atomic_cell_or_collection;
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell& ac);
+
+    class printer : atomic_cell_view::printer { // no access specifier on a class base, so the inheritance is private
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : atomic_cell_view::printer(type, cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp); // friendship gives the operator access to the private base
+    };
 };

 class column_definition;
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -52,7 +52,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authenticator_name();
    }

--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -49,7 +49,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authorizer_name();
    }

--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -96,7 +96,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    virtual bool require_authentication() const = 0;

--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -100,7 +100,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    ///
    /// Query for the permissions granted directly to a role for a particular \ref resource (and not any of its
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -59,7 +59,7 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-future<> create_metadata_table_if_missing(
+static future<> create_metadata_table_if_missing_impl(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
@@ -85,7 +85,14 @@ future<> create_metadata_table_if_missing(
    return ignore_existing([&mm, table = std::move(table)] () {
        return mm.announce_new_column_family(table, false);
    });
+}

+future<> create_metadata_table_if_missing(
+        std::string_view table_name,
+        cql3::query_processor& qp,
+        std::string_view cql,
+        ::service::migration_manager& mm) noexcept { // noexcept entry point; the real work lives in the _impl function
+    return futurize_apply(create_metadata_table_if_missing_impl, table_name, qp, cql, mm); // presumably reports an exception from the impl through the returned future — confirm against seastar's futurize_apply
 }

 future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -79,7 +79,7 @@ future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor&,
        std::string_view cql,
-        ::service::migration_manager&);
+        ::service::migration_manager&) noexcept;

 future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -101,7 +101,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 future<bool> default_authorizer::any_granted() const {
    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::AUTH_KS, PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -115,7 +115,7 @@ future<> default_authorizer::migrate_legacy_metadata() const {
    alogger.info("Starting migration of legacy permissions metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -195,7 +195,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
            ROLE_NAME,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -224,7 +224,7 @@ default_authorizer::modify(
                    ROLE_NAME,
                    RESOURCE_NAME),
            [this, &role_name, set, &resource](const auto& query) {
-        return _qp.process(
+        return _qp.execute_internal(
                query,
                db::consistency_level::ONE,
                internal_distributed_timeout_config(),
@@ -249,7 +249,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            meta::AUTH_KS,
            PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -276,7 +276,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) const {
            PERMISSIONS_CF,
            ROLE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -296,7 +296,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
            PERMISSIONS_CF,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -313,7 +313,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
                        ROLE_NAME,
                        RESOURCE_NAME);

-                return _qp.process(
+                return _qp.execute_internal(
                        query,
                        db::consistency_level::LOCAL_ONE,
                        infinite_timeout_config,
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return default_authorizer_name();
    }

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -96,10 +96,13 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }

-static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-        meta::roles_table::qualified_name(),
-        SALTED_HASH,
-        meta::roles_table::role_col_name);
+// Text of the UPDATE statement that sets the SALTED_HASH column of the row
+// selected by role_col_name. Built on first call via a function-local static.
+static const sstring& update_row_query() {
+    static const sstring query = format("UPDATE {} SET {} = ? WHERE {} = ?",
+            meta::roles_table::qualified_name(), SALTED_HASH, meta::roles_table::role_col_name);
+    return query;
+}

 static const sstring legacy_table_name{"credentials"};

@@ -111,7 +114,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -119,8 +122,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);

-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    consistency_for_user(username),
                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
@@ -136,8 +139,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 future<> password_authenticator::create_default_if_missing() const {
    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
@@ -194,7 +197,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
    return db::consistency_level::LOCAL_ONE;
 }

-const sstring& password_authenticator::qualified_java_name() const {
+std::string_view password_authenticator::qualified_java_name() const {
    return password_authenticator_name();
 }

@@ -233,7 +236,7 @@ future<authenticated_user> password_authenticator::authenticate(
                meta::roles_table::qualified_name(),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_user(username),
                internal_distributed_timeout_config(),
@@ -267,8 +270,8 @@ future<> password_authenticator::create(std::string_view role_name, const authen
        return make_ready_future<>();
    }

-    return _qp.process(
-            update_row_query,
+    return _qp.execute_internal(
+            update_row_query(),
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
@@ -284,7 +287,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
            SALTED_HASH,
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
@@ -297,7 +300,7 @@ future<> password_authenticator::drop(std::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query, consistency_for_user(name),
            internal_distributed_timeout_config(),
            {sstring(name)}).discard_result();
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override;
+    virtual std::string_view qualified_java_name() const override;

    virtual bool require_authentication() const override;

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception {
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception;
 };

 class role_already_exists : public roles_argument_exception {
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -68,14 +68,14 @@ future<bool> default_role_row_satisfies(
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
                infinite_timeout_config,
                {meta::DEFAULT_SUPERUSER_NAME},
                true).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
-                return qp.process(
+                return qp.execute_internal(
                        query,
                        db::consistency_level::QUORUM,
                        internal_distributed_timeout_config(),
@@ -100,7 +100,7 @@ future<bool> any_nondefault_role_row_satisfies(
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -39,7 +39,7 @@
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
-#include "service/migration_listener.hh"
+#include "service/migration_manager.hh"
 #include "utils/class_registrator.hh"
 #include "database.hh"

@@ -114,14 +114,14 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
-        ::service::migration_manager& mm,
+        ::service::migration_notifier& mn,
        std::unique_ptr<authorizer> z,
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r)
            : _permissions_cache_config(std::move(c))
            , _permissions_cache(nullptr)
            , _qp(qp)
-            , _migration_manager(mm)
+            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
@@ -141,18 +141,19 @@ service::service(
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
+        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc)
            : service(
                      std::move(c),
                      qp,
-                      mm,
+                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, mm),
                      create_object<authenticator>(sc.authenticator_java_name, qp, mm),
                      create_object<role_manager>(sc.role_manager_java_name, qp, mm)) {
 }

-future<> service::create_keyspace_if_missing() const {
+future<> service::create_keyspace_if_missing(::service::migration_manager& mm) const {
    auto& db = _qp.db();

    if (!db.has_keyspace(meta::AUTH_KS)) {
@@ -166,15 +167,15 @@ future<> service::create_keyspace_if_missing() const {

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return _migration_manager.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
    }

    return make_ready_future<>();
 }

-future<> service::start() {
-    return once_among_shards([this] {
-        return create_keyspace_if_missing();
+future<> service::start(::service::migration_manager& mm) {
+    return once_among_shards([this, &mm] {
+        return create_keyspace_if_missing(mm);
    }).then([this] {
        return _role_manager->start().then([this] {
            return when_all_succeed(_authorizer->start(), _authenticator->start());
@@ -183,7 +184,7 @@ future<> service::start() {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
        return once_among_shards([this] {
-            _migration_manager.register_listener(_migration_listener.get());
+            _mnotifier.register_listener(_migration_listener.get());
            return make_ready_future<>();
        });
    });
@@ -192,9 +193,12 @@ future<> service::start() {
 future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
-    _migration_manager.unregister_listener(_migration_listener.get());
-
-    return _permissions_cache->stop().then([this] {
+    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
+        if (_permissions_cache) {
+            return _permissions_cache->stop();
+        }
+        return make_ready_future<>();
+    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
 }
@@ -216,7 +220,7 @@ future<bool> service::has_existing_legacy_users() const {
    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
    // can potentially avoid doing a range query with a high consistency level.

-    return _qp.process(
+    return _qp.execute_internal(
            default_user_query,
            db::consistency_level::ONE,
            infinite_timeout_config,
@@ -226,7 +230,7 @@ future<bool> service::has_existing_legacy_users() const {
            return make_ready_future<bool>(true);
        }

-        return _qp.process(
+        return _qp.execute_internal(
                default_user_query,
                db::consistency_level::QUORUM,
                infinite_timeout_config,
@@ -236,7 +240,7 @@ future<bool> service::has_existing_legacy_users() const {
                return make_ready_future<bool>(true);
            }

-            return _qp.process(
+            return _qp.execute_internal(
                    all_users_query,
                    db::consistency_level::QUORUM,
                    infinite_timeout_config).then([](auto results) {
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -28,6 +28,7 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/util/bool_class.hh>
+#include <seastar/core/sharded.hh>

 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
@@ -42,6 +43,7 @@ class query_processor;

 namespace service {
 class migration_manager;
+class migration_notifier;
 class migration_listener;
 }

@@ -76,13 +78,15 @@ public:
 ///
 /// All state associated with access-control is stored externally to any particular instance of this class.
 ///
-class service final {
+/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
+/// given an object from another shard. Used for bouncing lwt requests to correct shard.
+class service final : public seastar::peering_sharded_service<service> {
    permissions_cache_config _permissions_cache_config;
    std::unique_ptr<permissions_cache> _permissions_cache;

    cql3::query_processor& _qp;

-    ::service::migration_manager& _migration_manager;
+    ::service::migration_notifier& _mnotifier;

    std::unique_ptr<authorizer> _authorizer;

@@ -97,7 +101,7 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
-            ::service::migration_manager&,
+            ::service::migration_notifier&,
            std::unique_ptr<authorizer>,
            std::unique_ptr<authenticator>,
            std::unique_ptr<role_manager>);
@@ -110,10 +114,11 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
+            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&);

-    future<> start();
+    future<> start(::service::migration_manager&);

    future<> stop();

@@ -159,7 +164,7 @@ public:
 private:
    future<bool> has_existing_legacy_users() const;

-    future<> create_keyspace_if_missing() const;
+    future<> create_keyspace_if_missing(::service::migration_manager& mm) const;
 };

 future<bool> has_superuser(const service&, const authenticated_user&);
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -35,6 +35,7 @@
 #include "auth/common.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
@@ -86,7 +87,7 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return qp.process(
+    return qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -170,7 +171,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
@@ -197,7 +198,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -258,7 +259,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -298,7 +299,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
            return make_ready_future<>();
        }

-        return _qp.process(
+        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
                        meta::roles_table::qualified_name(),
                        build_column_assignments(u),
@@ -320,7 +321,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
                    meta::role_members_table::qualified_name());

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -359,7 +360,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -386,7 +387,7 @@ standard_role_manager::modify_membership(
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_timeout_config(),
@@ -396,7 +397,7 @@ standard_role_manager::modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] {
        switch (ch) {
            case membership_change::add:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -404,7 +405,7 @@ standard_role_manager::modify_membership(
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -508,7 +509,7 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -82,7 +82,7 @@ public:
        return _authenticator->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authenticator_name();
    }

@@ -201,7 +201,7 @@ public:
        return _authorizer->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authorizer_name();
    }

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -23,7 +23,11 @@
 #include <seastar/core/scheduling.hh>
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/file.hh>
 #include <chrono>
+#include <cmath>
+
+#include "seastarx.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
--- a/build_id.cc
+++ b/build_id.cc
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#include "build_id.hh"
+#include <fmt/printf.h>
+#include <link.h>
+#include <seastar/core/align.hh>
+#include <sstream>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+using namespace seastar;
+
+// Returns a pointer to the descriptor (payload) of the ELF note `n`.
+// The descriptor follows the note header and the name, with the name padded
+// to a 4-byte boundary.
+static const char* note_desc(const Elf64_Nhdr* n) {
+    auto* p = reinterpret_cast<const char*>(n);
+    p += sizeof(Elf64_Nhdr);
+    p += n->n_namesz;
+    return align_up(p, 4);
+}
+
+// Scans the PT_NOTE segments of the object described by `info` and returns
+// the first note whose type is NT_GNU_BUILD_ID. Aborts the process if there
+// is none.
+static const Elf64_Nhdr* get_nt_build_id(dl_phdr_info* info) {
+    auto base = info->dlpi_addr;
+    const auto* h = info->dlpi_phdr;
+    auto num_headers = info->dlpi_phnum;
+    for (int i = 0; i != num_headers; ++i, ++h) {
+        if (h->p_type != PT_NOTE) {
+            continue;
+        }
+
+        auto* p = reinterpret_cast<const char*>(base) + h->p_vaddr;
+        auto* e = p + h->p_memsz;
+        // Keep going only while a whole note header still fits in the segment,
+        // so a malformed or padded size cannot walk us past `e`.
+        while (e - p >= static_cast<std::ptrdiff_t>(sizeof(Elf64_Nhdr))) {
+            const auto* n = reinterpret_cast<const Elf64_Nhdr*>(p);
+            if (n->n_type == NT_GNU_BUILD_ID) {
+                return n;
+            }
+
+            // Advance past this note: name, then descriptor, each 4-byte aligned.
+            p = note_desc(n);
+            p += n->n_descsz;
+            p = align_up(p, 4);
+        }
+    }
+
+    assert(0 && "no NT_GNU_BUILD_ID note");
+    // Reached only when assert() is compiled out (NDEBUG); never fall off the
+    // end of a non-void function.
+    std::abort();
+}
+
+// dl_iterate_phdr() callback. `data` must point to the std::string that
+// receives the build-id as lowercase hex. Returns 1 (non-zero) so that only
+// the first object is visited.
+static int callback(dl_phdr_info* info, size_t size, void* data) {
+    std::string& ret = *static_cast<std::string*>(data);
+    std::ostringstream os;
+
+    // The first DSO is always the main program, which has an empty name.
+    assert(strlen(info->dlpi_name) == 0);
+
+    auto* n = get_nt_build_id(info);
+    const char* desc = note_desc(n);
+    for (unsigned i = 0; i < n->n_descsz; ++i) {
+        fmt::fprintf(os, "%02x", (unsigned char)*(desc + i));
+    }
+    ret = os.str();
+    return 1;
+}
+
+// Returns the NT_GNU_BUILD_ID note payload of the main program, formatted as
+// lowercase hex.
+std::string get_build_id() {
+    std::string ret;
+    int r = dl_iterate_phdr(callback, &ret);
+    assert(r == 1);
+    (void)r; // keeps -Wunused quiet when assert() is compiled out
+    return ret;
+}
--- a/build_id.hh
+++ b/build_id.hh
@@ -0,0 +1,9 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <string>
+
+std::string get_build_id(); // hex string of the main program's NT_GNU_BUILD_ID note payload
--- a/bytes.cc
+++ b/bytes.cc
@@ -64,7 +64,7 @@ bytes from_hex(sstring_view s) {

 sstring to_hex(bytes_view b) {
    static char digits[] = "0123456789abcdef";
-    sstring out(sstring::initialized_later(), b.size() * 2);
+    sstring out = uninitialized_string(b.size() * 2);
    unsigned end = b.size();
    for (unsigned i = 0; i != end; ++i) {
        uint8_t x = b[i];
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,6 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
+    using fragment_type = bytes_view;
    static constexpr size_type max_chunk_size() { return 128 * 1024; }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
@@ -93,6 +94,29 @@ public:
            return _current != other._current;
        }
    };
+    using const_iterator = fragment_iterator;
+
+    // Output iterator over a bytes_ostream; see operator* for the write semantics.
+    class output_iterator {
+        friend class bytes_ostream;
+
+        bytes_ostream* _ostream = nullptr;
+
+        // Constructible from a stream only by bytes_ostream itself (private ctor + friend).
+        explicit output_iterator(bytes_ostream& os) : _ostream(&os) { }
+
+    public:
+        using iterator_category = std::output_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+        using value_type = bytes_ostream::value_type;
+        using pointer = bytes_ostream::value_type*;
+        using reference = bytes_ostream::value_type&;
+
+        // Every dereference calls write_place_holder(1) — presumably a fresh one-byte slot each time; ++ is a no-op.
+        reference operator*() const { return *_ostream->write_place_holder(1); }
+        output_iterator& operator++() { return *this; }
+        output_iterator operator++(int) { return *this; }
+    };
 private:
    inline size_type current_space_left() const {
        if (!_current) {
@@ -289,6 +313,11 @@ public:
        return _size;
    }

+    // For the FragmentRange concept: reports the `_size` member.
+    size_type size_bytes() const {
+        return _size;
+    }
+
    bool empty() const {
        return _size == 0;
    }
@@ -326,6 +355,8 @@ public:
    fragment_iterator begin() const { return { _begin.get() }; }
    fragment_iterator end() const { return { nullptr }; }

+    output_iterator write_begin() { return output_iterator(*this); } // output iterator bound to this stream
+
    boost::iterator_range<fragment_iterator> fragments() const {
        return { begin(), end() };
    }
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -35,6 +35,7 @@
 #include "idl/uuid.dist.impl.hh"
 #include "idl/keys.dist.impl.hh"
 #include "idl/mutation.dist.impl.hh"
+#include <iostream>

 canonical_mutation::canonical_mutation(bytes data)
        : _data(std::move(data))
@@ -89,3 +90,81 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
    }
    return m;
 }
+
+static sstring bytes_to_text(bytes_view bv) {
+    // Reinterpret the raw bytes as characters and copy them verbatim; no validation or transcoding is done.
+    const char* chars = reinterpret_cast<const char*>(bv.data());
+    return sstring(chars, bv.size());
+}
+
+// Prints a human-readable dump of cm: table id, schema version, partition key, then every item the partition view reports.
+std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm) {
+    auto in = ser::as_input_stream(cm._data);
+    auto mv = ser::deserialize(in, boost::type<ser::canonical_mutation_view>());
+    column_mapping mapping = mv.mapping();
+    auto partition_view = mutation_partition_view::from_view(mv.partition());
+    fmt::print(os, "{{canonical_mutation: ");
+    fmt::print(os, "table_id {} schema_version {} ", mv.table_id(), mv.schema_version());
+    fmt::print(os, "partition_key {} ", mv.key());
+    class printing_visitor : public mutation_partition_view_virtual_visitor {
+        std::ostream& _os;
+        const column_mapping& _cm;
+        bool _first = true;    // no item has been printed yet, so no separator is due
+        bool _in_row = false;  // a "{row ..." group is open and still needs its closing "}"
+        void print_separator() {  // emits ", " before every item except the very first one
+            if (!_first) {
+                fmt::print(_os, ", ");
+            }
+            _first = false;
+        }
+    public:
+        printing_visitor(std::ostream& os, const column_mapping& cm) : _os(os), _cm(cm) {}
+        virtual void accept_partition_tombstone(tombstone t) override {
+            print_separator();
+            fmt::print(_os, "partition_tombstone {}", t);
+        }
+        virtual void accept_static_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        virtual void accept_row_tombstone(range_tombstone rt) override {
+            print_separator();
+            fmt::print(_os, "row tombstone {}", rt);
+        }
+        virtual void accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) override {
+            if (_in_row) {
+                fmt::print(_os, "}}");  // close the previous row
+            }
+            print_separator();  // ", " after a previous row or partition-level item (was missing before the first row); clears _first
+            fmt::print(_os, "{{row {} tombstone {} marker {}", pipv, rt, rm);
+            _in_row = true;
+        }
+        virtual void accept_row_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        void finalize() {
+            if (_in_row) {
+                fmt::print(_os, "}}");
+            }
+        }
+    };
+    printing_visitor pv(os, mapping);
+    partition_view.accept(mapping, pv);
+    pv.finalize();
+    fmt::print(os, "}}");
+    return os;
+}
+
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -22,10 +22,11 @@
 #pragma once

 #include "bytes.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
+#include <iosfwd>

 // Immutable mutation form which can be read using any schema version of the same table.
 // Safe to access from other shards via const&.
@@ -52,4 +53,5 @@ public:

    const bytes& representation() const { return _data; }

+    friend std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm);
 };
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -22,6 +22,9 @@

 #pragma once

+#include <vector>
+#include <sys/types.h>
+
 // Single-pass range over cartesian product of vectors.

 // Note:
--- a/cdc/cdc.cc
+++ b/cdc/cdc.cc
@@ -1,604 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <utility>
-#include <algorithm>
-
-#include <seastar/util/defer.hh>
-#include <seastar/core/thread.hh>
-
-#include "cdc/cdc.hh"
-#include "bytes.hh"
-#include "database.hh"
-#include "db/config.hh"
-#include "dht/murmur3_partitioner.hh"
-#include "partition_slice_builder.hh"
-#include "schema.hh"
-#include "schema_builder.hh"
-#include "service/migration_manager.hh"
-#include "service/storage_service.hh"
-#include "types/tuple.hh"
-#include "cql3/statements/select_statement.hh"
-#include "cql3/multi_column_relation.hh"
-#include "cql3/tuples.hh"
-#include "log.hh"
-
-using locator::snitch_ptr;
-using locator::token_metadata;
-using locator::topology;
-using seastar::sstring;
-using service::migration_manager;
-using service::storage_proxy;
-
-namespace std {
-
-template<> struct hash<std::pair<net::inet_address, unsigned int>> {
-    std::size_t operator()(const std::pair<net::inet_address, unsigned int> &p) const {
-        return std::hash<net::inet_address>{}(p.first) ^ std::hash<int>{}(p.second);
-    }
-};
-
-}
-
-using namespace std::chrono_literals;
-
-static logging::logger cdc_log("cdc");
-
-namespace cdc {
-
-using operation_native_type = std::underlying_type_t<operation>;
-using column_op_native_type = std::underlying_type_t<column_op>;
-
-sstring log_name(const sstring& table_name) {
-    static constexpr auto cdc_log_suffix = "_scylla_cdc_log";
-    return table_name + cdc_log_suffix;
-}
-
-sstring desc_name(const sstring& table_name) {
-    static constexpr auto cdc_desc_suffix = "_scylla_cdc_desc";
-    return table_name + cdc_desc_suffix;
-}
-
-static future<>
-remove_log(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    try {
-        return ctx._migration_manager.announce_column_family_drop(
-                ks_name, log_name(table_name), false);
-    } catch (exceptions::configuration_exception& e) {
-        // It's fine if the table does not exist.
-        return make_ready_future<>();
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
-}
-
-static future<>
-remove_desc(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    try {
-        return ctx._migration_manager.announce_column_family_drop(
-                ks_name, desc_name(table_name), false);
-    } catch (exceptions::configuration_exception& e) {
-        // It's fine if the table does not exist.
-        return make_ready_future<>();
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
-}
-
-future<>
-remove(db_context ctx, const sstring& ks_name, const sstring& table_name) {
-    return when_all(remove_log(ctx, ks_name, table_name),
-                    remove_desc(ctx, ks_name, table_name)).discard_result();
-}
-
-static future<> setup_log(db_context ctx, const schema& s) {
-    schema_builder b(s.ks_name(), log_name(s.cf_name()));
-    b.set_default_time_to_live(gc_clock::duration{s.cdc_options().ttl()});
-    b.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("stream_id", uuid_type, column_kind::partition_key);
-    b.with_column("time", timeuuid_type, column_kind::clustering_key);
-    b.with_column("batch_seq_no", int32_type, column_kind::clustering_key);
-    b.with_column("operation", data_type_for<operation_native_type>());
-    b.with_column("ttl", long_type);
-    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
-        for (const auto& column : columns) {
-            auto type = column.type;
-            if (is_data_col) {
-                type = tuple_type_impl::get_instance({ /* op */ data_type_for<column_op_native_type>(), /* value */ type, /* ttl */long_type});
-            }
-            b.with_column("_" + column.name(), type);
-        }
-    };
-    add_columns(s.partition_key_columns());
-    add_columns(s.clustering_key_columns());
-    add_columns(s.static_columns(), true);
-    add_columns(s.regular_columns(), true);
-    return ctx._migration_manager.announce_new_column_family(b.build(), false);
-}
-
-static future<> setup_stream_description_table(db_context ctx, const schema& s) {
-    schema_builder b(s.ks_name(), desc_name(s.cf_name()));
-    b.set_comment(sprint("CDC description for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("node_ip", inet_addr_type, column_kind::partition_key);
-    b.with_column("shard_id", int32_type, column_kind::partition_key);
-    b.with_column("created_at", timestamp_type, column_kind::clustering_key);
-    b.with_column("stream_id", uuid_type);
-    return ctx._migration_manager.announce_new_column_family(b.build(), false);
-}
-
-// This function assumes setup_stream_description_table was called on |s| before the call to this
-// function.
-static future<> populate_desc(db_context ctx, const schema& s) {
-    auto& db = ctx._proxy.get_db().local();
-    auto desc_schema =
-        db.find_schema(s.ks_name(), desc_name(s.cf_name()));
-    auto log_schema =
-        db.find_schema(s.ks_name(), log_name(s.cf_name()));
-    auto belongs_to = [&](const gms::inet_address& endpoint,
-                          const unsigned int shard_id,
-                          const int shard_count,
-                          const unsigned int ignore_msb_bits,
-                          const utils::UUID& stream_id) {
-        const auto log_pk = partition_key::from_singular(*log_schema,
-                                                         data_value(stream_id));
-        const auto token = ctx._partitioner.decorate_key(*log_schema, log_pk).token();
-        if (ctx._token_metadata.get_endpoint(ctx._token_metadata.first_token(token)) != endpoint) {
-            return false;
-        }
-        const auto owning_shard_id = dht::murmur3_partitioner(shard_count, ignore_msb_bits).shard_of(token);
-        return owning_shard_id == shard_id;
-    };
-
-    std::vector<mutation> mutations;
-    const auto ts = api::new_timestamp();
-    const auto ck = clustering_key::from_single_value(
-            *desc_schema, timestamp_type->decompose(ts));
-    auto cdef = desc_schema->get_column_definition(to_bytes("stream_id"));
-
-    for (const auto& dc : ctx._token_metadata.get_topology().get_datacenter_endpoints()) {
-        for (const auto& endpoint : dc.second) {
-            const auto decomposed_ip = inet_addr_type->decompose(endpoint.addr());
-            const unsigned int shard_count = ctx._snitch->get_shard_count(endpoint);
-            const unsigned int ignore_msb_bits = ctx._snitch->get_ignore_msb_bits(endpoint);
-            for (unsigned int shard_id = 0; shard_id < shard_count; ++shard_id) {
-                const auto pk = partition_key::from_exploded(
-                        *desc_schema, { decomposed_ip, int32_type->decompose(static_cast<int>(shard_id)) });
-                mutations.emplace_back(desc_schema, pk);
-
-                auto stream_id = utils::make_random_uuid();
-                while (!belongs_to(endpoint, shard_id, shard_count, ignore_msb_bits, stream_id)) {
-                    stream_id = utils::make_random_uuid();
-                }
-                auto value = atomic_cell::make_live(*uuid_type,
-                                                    ts,
-                                                    uuid_type->decompose(stream_id));
-                mutations.back().set_cell(ck, *cdef, std::move(value));
-            }
-        }
-    }
-    return ctx._proxy.mutate(std::move(mutations),
-                             db::consistency_level::QUORUM,
-                             db::no_timeout,
-                             nullptr,
-                             empty_service_permit());
-}
-
-future<> setup(db_context ctx, schema_ptr s) {
-    return seastar::async([ctx = std::move(ctx), s = std::move(s)] {
-        setup_log(ctx, *s).get();
-        auto log_guard = seastar::defer([&] { remove_log(ctx, s->ks_name(), s->cf_name()).get(); });
-        setup_stream_description_table(ctx, *s).get();
-        auto desc_guard = seastar::defer([&] { remove_desc(ctx, s->ks_name(), s->cf_name()).get(); });
-        populate_desc(ctx, *s).get();
-        desc_guard.cancel();
-        log_guard.cancel();
-    });
-}
-
-db_context db_context::builder::build() {
-    return db_context{
-        _proxy,
-        _migration_manager ? _migration_manager->get() : service::get_local_migration_manager(),
-        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
-        _snitch ? _snitch->get() : locator::i_endpoint_snitch::get_local_snitch_ptr(),
-        _partitioner ? _partitioner->get() : dht::global_partitioner()
-    };
-}
-
-class transformer final {
-public:
-    using streams_type = std::unordered_map<std::pair<net::inet_address, unsigned int>, utils::UUID>;
-private:
-    db_context _ctx;
-    schema_ptr _schema;
-    schema_ptr _log_schema;
-    utils::UUID _time;
-    bytes _decomposed_time;
-    ::shared_ptr<const transformer::streams_type> _streams;
-    const column_definition& _op_col;
-
-    clustering_key set_pk_columns(const partition_key& pk, int batch_no, mutation& m) const {
-        const auto log_ck = clustering_key::from_exploded(
-                *m.schema(), { _decomposed_time, int32_type->decompose(batch_no) });
-        auto pk_value = pk.explode(*_schema);
-        size_t pos = 0;
-        for (const auto& column : _schema->partition_key_columns()) {
-            assert (pos < pk_value.size());
-            auto cdef = m.schema()->get_column_definition(to_bytes("_" + column.name()));
-            auto value = atomic_cell::make_live(*column.type,
-                                                _time.timestamp(),
-                                                bytes_view(pk_value[pos]));
-            m.set_cell(log_ck, *cdef, std::move(value));
-            ++pos;
-        }
-        return log_ck;
-    }
-
-    void set_operation(const clustering_key& ck, operation op, mutation& m) const {
-        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op))));
-    }
-
-    partition_key stream_id(const net::inet_address& ip, unsigned int shard_id) const {
-        auto it = _streams->find(std::make_pair(ip, shard_id));
-        if (it == std::end(*_streams)) {
-                throw std::runtime_error(format("No stream found for node {} and shard {}", ip, shard_id));
-        }
-        return partition_key::from_exploded(*_log_schema, { uuid_type->decompose(it->second) });
-    }
-public:
-    transformer(db_context ctx, schema_ptr s, ::shared_ptr<const transformer::streams_type> streams)
-        : _ctx(ctx)
-        , _schema(std::move(s))
-        , _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
-        , _time(utils::UUID_gen::get_time_UUID())
-        , _decomposed_time(timeuuid_type->decompose(_time))
-        , _streams(std::move(streams))
-        , _op_col(*_log_schema->get_column_definition(to_bytes("operation")))
-    {}
-
-    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
-    // more details like tombstones/ttl? Probably not but keep in mind.
-    mutation transform(const mutation& m, const cql3::untyped_result_set* rs = nullptr) const {
-        auto& t = m.token();
-        auto&& ep = _ctx._token_metadata.get_endpoint(
-                _ctx._token_metadata.first_token(t));
-        if (!ep) {
-            throw std::runtime_error(format("No owner found for key {}", m.decorated_key()));
-        }
-        auto shard_id = dht::murmur3_partitioner(_ctx._snitch->get_shard_count(*ep), _ctx._snitch->get_ignore_msb_bits(*ep)).shard_of(t);
-        mutation res(_log_schema, stream_id(ep->addr(), shard_id));
-        auto& p = m.partition();
-        if (p.partition_tombstone()) {
-            // Partition deletion
-            auto log_ck = set_pk_columns(m.key(), 0, res);
-            set_operation(log_ck, operation::partition_delete, res);
-        } else if (!p.row_tombstones().empty()) {
-            // range deletion
-            int batch_no = 0;
-            for (auto& rt : p.row_tombstones()) {
-                auto set_bound = [&] (const clustering_key& log_ck, const clustering_key_prefix& ckp) {
-                    auto exploded = ckp.explode(*_schema);
-                    size_t pos = 0;
-                    for (const auto& column : _schema->clustering_key_columns()) {
-                        if (pos >= exploded.size()) {
-                            break;
-                        }
-                        auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                        auto value = atomic_cell::make_live(*column.type,
-                                                            _time.timestamp(),
-                                                            bytes_view(exploded[pos]));
-                        res.set_cell(log_ck, *cdef, std::move(value));
-                        ++pos;
-                    }
-                };
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.start);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_start, res);
-                    ++batch_no;
-                }
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.end);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_end, res);
-                    ++batch_no;
-                }
-            }
-        } else {
-            // should be update or deletion
-            int batch_no = 0;
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto ck_value = r.key().explode(*_schema);
-
-                std::optional<clustering_key> pikey;
-                const cql3::untyped_result_set_row * pirow = nullptr;
-
-                if (rs) {
-                    for (auto& utr : *rs) {
-                        bool match = true;
-                        for (auto& c : _schema->clustering_key_columns()) {
-                            auto rv = utr.get_view(c.name_as_text());
-                            auto cv = r.key().get_component(*_schema, c.component_index());
-                            if (rv != cv) {
-                                match = false;
-                                break;
-                            }
-                        }
-                        if (match) {
-                            pikey = set_pk_columns(m.key(), batch_no, res);
-                            set_operation(*pikey, operation::pre_image, res);
-                            pirow = &utr;
-                            ++batch_no;
-                            break;
-                        }
-                    }
-                }
-
-                auto log_ck = set_pk_columns(m.key(), batch_no, res);
-
-                size_t pos = 0;
-                for (const auto& column : _schema->clustering_key_columns()) {
-                    assert (pos < ck_value.size());
-                    auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-
-                    if (pirow) {
-                        assert(pirow->has(column.name_as_text()));
-                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-                    }
-
-                    ++pos;
-                }
-
-                std::vector<bytes_opt> values(3);
-
-                auto process_cells = [&](const row& r, column_kind ckind) {
-                    r.for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
-                        auto& cdef = _schema->column_at(ckind, id);
-                        auto* dst = _log_schema->get_column_definition(to_bytes("_" + cdef.name()));
-                        // todo: collections.
-                        if (cdef.is_atomic()) {
-                            column_op op;
-
-                            values[1] = values[2] = std::nullopt;
-                            auto view = cell.as_atomic_cell(cdef);
-                            if (view.is_live()) {
-                                op = column_op::set;
-                                values[1] = view.value().linearize();
-                                if (view.is_live_and_has_ttl()) {
-                                    values[2] = long_type->decompose(data_value(view.ttl().count()));
-                                }
-                            } else {
-                                op = column_op::del;
-                            }
-
-                            values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(op)));
-                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-
-                            if (pirow && pirow->has(cdef.name_as_text())) {
-                                values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(column_op::set)));
-                                values[1] = pirow->get_blob(cdef.name_as_text());
-                                values[2] = std::nullopt;
-
-                                assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
-                                assert(pikey->explode() != log_ck.explode());
-                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-                            }
-                        } else {
-                            cdc_log.warn("Non-atomic cell ignored {}.{}:{}", _schema->ks_name(), _schema->cf_name(), cdef.name_as_text());
-                        }
-                    });
-                };
-
-                process_cells(r.row().cells(), column_kind::regular_column);
-                process_cells(p.static_row().get(), column_kind::static_column);
-
-                set_operation(log_ck, operation::update, res);
-                ++batch_no;
-            }
-        }
-
-        return res;
-    }
-
-    static db::timeout_clock::time_point default_timeout() {
-        return db::timeout_clock::now() + 10s;
-    }
-
-    future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
-            service::storage_proxy& proxy,
-            service::client_state& client_state,
-            db::consistency_level cl,
-            const mutation& m)
-    {
-        auto& p = m.partition();
-        if (p.partition_tombstone() || !p.row_tombstones().empty() || p.clustered_rows().empty()) {
-            return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
-        }
-
-        dht::partition_range_vector partition_ranges{dht::partition_range(m.decorated_key())};
-
-        auto&& pc = _schema->partition_key_columns();
-        auto&& cc = _schema->clustering_key_columns();
-
-        std::vector<query::clustering_range> bounds;
-        if (cc.empty()) {
-            bounds.push_back(query::clustering_range::make_open_ended_both_sides());
-        } else {
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto& ck = r.key();
-                bounds.push_back(query::clustering_range::make_singular(ck));
-            }
-        }
-
-        std::vector<const column_definition*> columns;
-        columns.reserve(_schema->all_columns().size());
-
-        std::transform(pc.begin(), pc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-        std::transform(cc.begin(), cc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-
-        query::column_id_vector static_columns, regular_columns;
-
-        auto sk = column_kind::static_column;
-        auto rk = column_kind::regular_column;
-        // TODO: this assumes all mutations touch the same set of columns. This might not be true, and we may need to do more horrible set operation here.
-        for (auto& [r, cids, kind] : { std::tie(p.static_row().get(), static_columns, sk), std::tie(p.clustered_rows().begin()->row().cells(), regular_columns, rk) }) {
-            r.for_each_cell([&](column_id id, const atomic_cell_or_collection&) {
-                auto& cdef =_schema->column_at(kind, id);
-                cids.emplace_back(id);
-                columns.emplace_back(&cdef);
-            });
-        }
-
-        auto selection = cql3::selection::selection::for_columns(_schema, std::move(columns));
-        auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), selection->get_query_options());
-        auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_partitions);
-
-        return proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
-                [this, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
-                    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
-                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *selection));
-                    auto result_set = builder.build();
-                    if (!result_set || result_set->empty()) {
-                        return {};
-                    }
-                    return make_lw_shared<cql3::untyped_result_set>(*result_set);
-        });
-    }
-};
-
-// This class is used to build a mapping from <node ip, shard id> to stream_id
-// It is used as a consumer for rows returned by the query to CDC Description Table
-class streams_builder {
-    const schema& _schema;
-    transformer::streams_type _streams;
-    net::inet_address _node_ip = net::inet_address();
-    unsigned int _shard_id = 0;
-    api::timestamp_type _latest_row_timestamp = api::min_timestamp;
-    utils::UUID _latest_row_stream_id = utils::UUID();
-public:
-    streams_builder(const schema& s) : _schema(s) {}
-
-    void accept_new_partition(const partition_key& key, uint32_t row_count) {
-        auto exploded = key.explode(_schema);
-        _node_ip = value_cast<net::inet_address>(inet_addr_type->deserialize(exploded[0]));
-        _shard_id = static_cast<unsigned int>(value_cast<int>(int32_type->deserialize(exploded[1])));
-        _latest_row_timestamp = api::min_timestamp;
-        _latest_row_stream_id = utils::UUID();
-    }
-
-    void accept_new_partition(uint32_t row_count) {
-        assert(false);
-    }
-
-    void accept_new_row(
-            const clustering_key& key,
-            const query::result_row_view& static_row,
-            const query::result_row_view& row) {
-        auto row_iterator = row.iterator();
-        api::timestamp_type timestamp = value_cast<db_clock::time_point>(
-                timestamp_type->deserialize(key.explode(_schema)[0])).time_since_epoch().count();
-        if (timestamp <= _latest_row_timestamp) {
-            return;
-        }
-        _latest_row_timestamp = timestamp;
-        for (auto&& cdef : _schema.regular_columns()) {
-            if (cdef.name_as_text() != "stream_id") {
-                row_iterator.skip(cdef);
-                continue;
-            }
-            auto val_opt = row_iterator.next_atomic_cell();
-            assert(val_opt);
-            val_opt->value().with_linearized([&] (bytes_view bv) {
-                _latest_row_stream_id = value_cast<utils::UUID>(uuid_type->deserialize(bv));
-            });
-        }
-    }
-
-    void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
-        assert(false);
-    }
-
-    void accept_partition_end(const query::result_row_view& static_row) {
-        _streams.emplace(std::make_pair(_node_ip, _shard_id), _latest_row_stream_id);
-    }
-
-    transformer::streams_type build() {
-        return std::move(_streams);
-    }
-};
-
-static future<::shared_ptr<transformer::streams_type>> get_streams(
-        db_context ctx,
-        const sstring& ks_name,
-        const sstring& cf_name,
-        lowres_clock::time_point timeout,
-        service::query_state& qs) {
-    auto s =
-        ctx._proxy.get_db().local().find_schema(ks_name, desc_name(cf_name));
-    query::read_command cmd(
-            s->id(),
-            s->version(),
-            partition_slice_builder(*s).with_no_static_columns().build());
-    return ctx._proxy.query(
-            s,
-            make_lw_shared(std::move(cmd)),
-            {dht::partition_range::make_open_ended_both_sides()},
-            db::consistency_level::QUORUM,
-            {timeout, qs.get_permit(), qs.get_client_state()}).then([s = std::move(s)] (auto qr) mutable {
-        return query::result_view::do_with(*qr.query_result,
-                [s = std::move(s)] (query::result_view v) {
-            auto slice = partition_slice_builder(*s)
-                    .with_no_static_columns()
-                    .build();
-            streams_builder builder{ *s };
-            v.consume(slice, builder);
-            return ::make_shared<transformer::streams_type>(builder.build());
-        });
-    });
-}
-
-future<std::vector<mutation>> append_log_mutations(
-        db_context ctx,
-        schema_ptr s,
-        service::storage_proxy::clock_type::time_point timeout,
-        service::query_state& qs,
-        std::vector<mutation> muts) {
-    auto mp = ::make_lw_shared<std::vector<mutation>>(std::move(muts));
-
-    return get_streams(ctx, s->ks_name(), s->cf_name(), timeout, qs).then([ctx, s = std::move(s), mp, &qs](::shared_ptr<transformer::streams_type> streams) mutable {
-        mp->reserve(2 * mp->size());
-        auto trans = make_lw_shared<transformer>(ctx, s, std::move(streams));
-        auto i = mp->begin();
-        auto e = mp->end();
-        return parallel_for_each(i, e, [ctx, &qs, trans, mp](mutation& m) {
-            return trans->pre_image_select(ctx._proxy, qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m).then([trans, mp, &m](lw_shared_ptr<cql3::untyped_result_set> rs) {
-                mp->push_back(trans->transform(m, rs.get()));
-            });
-        }).then([mp] {
-            return std::move(*mp);
-        });
-    });
-}
-
-} // namespace cdc
--- a/cdc/cdc.hh
+++ b/cdc/cdc.hh
@@ -1,233 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <map>
-#include <string>
-#include <vector>
-
-#include <seastar/core/future.hh>
-#include <seastar/core/lowres_clock.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-
-#include "exceptions/exceptions.hh"
-#include "json.hh"
-#include "timestamp.hh"
-
-class schema;
-using schema_ptr = seastar::lw_shared_ptr<const schema>;
-
-namespace locator {
-
-class snitch_ptr;
-class token_metadata;
-
-} // namespace locator
-
-namespace service {
-
-class migration_manager;
-class storage_proxy;
-class query_state;
-
-} // namespace service
-
-namespace dht {
-
-class i_partitioner;
-
-} // namespace dht
-
-class mutation;
-class partition_key;
-
-namespace cdc {
-
-class options final {
-    bool _enabled = false;
-    bool _preimage = false;
-    bool _postimage = false;
-    int _ttl = 86400; // 24h in seconds
-public:
-    options() = default;
-    options(const std::map<sstring, sstring>& map) {
-        if (map.find("enabled") == std::end(map)) {
-            return;
-        }
-
-        for (auto& p : map) {
-            if (p.first == "enabled") {
-                _enabled = p.second == "true";
-            } else if (p.first == "preimage") {
-                _preimage = p.second == "true";
-            } else if (p.first == "postimage") {
-                _postimage = p.second == "true";
-            } else if (p.first == "ttl") {
-                _ttl = std::stoi(p.second);
-            } else {
-                throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
-            }
-        }
-    }
-    std::map<sstring, sstring> to_map() const {
-        if (!_enabled) {
-            return {};
-        }
-        return {
-            { "enabled", _enabled ? "true" : "false" },
-            { "preimage", _preimage ? "true" : "false" },
-            { "postimage", _postimage ? "true" : "false" },
-            { "ttl", std::to_string(_ttl) },
-        };
-    }
-
-    sstring to_sstring() const {
-        return json::to_json(to_map());
-    }
-
-    bool enabled() const { return _enabled; }
-    bool preimage() const { return _preimage; }
-    bool postimage() const { return _postimage; }
-    int ttl() const { return _ttl; }
-
-    bool operator==(const options& o) const {
-        return _enabled == o._enabled && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl;
-    }
-    bool operator!=(const options& o) const {
-        return !(*this == o);
-    }
-};
-
-struct db_context final {
-    service::storage_proxy& _proxy;
-    service::migration_manager& _migration_manager;
-    locator::token_metadata& _token_metadata;
-    locator::snitch_ptr& _snitch;
-    dht::i_partitioner& _partitioner;
-
-    class builder final {
-        service::storage_proxy& _proxy;
-        std::optional<std::reference_wrapper<service::migration_manager>> _migration_manager;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
-        std::optional<std::reference_wrapper<locator::snitch_ptr>> _snitch;
-        std::optional<std::reference_wrapper<dht::i_partitioner>> _partitioner;
-    public:
-        builder(service::storage_proxy& proxy) : _proxy(proxy) { }
-
-        builder& with_migration_manager(service::migration_manager& migration_manager) {
-            _migration_manager = migration_manager;
-            return *this;
-        }
-
-        builder& with_token_metadata(locator::token_metadata& token_metadata) {
-            _token_metadata = token_metadata;
-            return *this;
-        }
-
-        builder& with_snitch(locator::snitch_ptr& snitch) {
-            _snitch = snitch;
-            return *this;
-        }
-
-        builder& with_partitioner(dht::i_partitioner& partitioner) {
-            _partitioner = partitioner;
-            return *this;
-        }
-
-        db_context build();
-    };
-};
-
-/// \brief Sets up CDC related tables for a given table
-///
-/// This function not only creates CDC Log and CDC Description for a given table
-/// but also populates CDC Description with a list of change streams.
-///
-/// param[in] ctx object with references to database components
-/// param[in] schema schema of a table for which CDC tables are being created
-seastar::future<> setup(db_context ctx, schema_ptr schema);
-
-// cdc log table operation
-enum class operation : int8_t {
-    // note: these values will eventually be read by a third party, probably not privvy to this
-    // enum decl, so don't change the constant values (or the datatype).
-    pre_image = 0, update = 1, row_delete = 2, range_delete_start = 3, range_delete_end = 4, partition_delete = 5
-};
-
-// cdc log data column operation
-enum class column_op : int8_t {
-    // same as "operation". Do not edit values or type/type unless you _really_ want to.
-    set = 0, del = 1, add = 2,
-};
-
-/// \brief Deletes CDC Log and CDC Description tables for a given table
-///
-/// This function cleans up all CDC related tables created for a given table.
-/// At the moment, CDC Log and CDC Description are the only affected tables.
-/// It's ok if some/all of them don't exist.
-///
-/// \param[in] ctx object with references to database components
-/// \param[in] ks_name keyspace name of a table for which CDC tables are removed
-/// \param[in] table_name name of a table for which CDC tables are removed
-///
-/// \pre This function works correctly no matter if CDC Log and/or CDC Description
-///      exist.
-seastar::future<>
-remove(db_context ctx, const seastar::sstring& ks_name, const seastar::sstring& table_name);
-
-seastar::sstring log_name(const seastar::sstring& table_name);
-
-seastar::sstring desc_name(const seastar::sstring& table_name);
-
-/// \brief For each mutation in the set appends related CDC Log mutation
-///
-/// This function should be called with a set of mutations of a table
-/// with CDC enabled. Returned set of mutations contains all original mutations
-/// and for each original mutation appends a mutation to CDC Log that reflects
-/// the change.
-///
-/// \param[in] ctx object with references to database components
-/// \param[in] s schema of a CDC enabled table which is being modified
-/// \param[in] timeout period of time after which a request is considered timed out
-/// \param[in] qs the state of the query that's being executed
-/// \param[in] mutations set of changes of a CDC enabled table
-///
-/// \return set of mutations from input parameter with relevant CDC Log mutations appended
-///
-/// \pre CDC Log and CDC Description have to exist
-/// \pre CDC Description has to be in sync with cluster topology
-///
-/// \note At the moment, cluster topology changes are not supported
-//        so the assumption that CDC Description is in sync with cluster topology
-//        is easy to enforce. When support for cluster topology changes is added
-//        it has to make sure the assumption holds.
-seastar::future<std::vector<mutation>>append_log_mutations(
-        db_context ctx,
-        schema_ptr s,
-        lowres_clock::time_point timeout,
-        service::query_state& qs,
-        std::vector<mutation> mutations);
-
-} // namespace cdc
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "db/extensions.hh"
+#include "cdc/cdc_options.hh"
+#include "schema.hh"
+
+namespace cdc {
+
+class cdc_extension : public schema_extension { // schema extension that carries a table's cdc::options
+    cdc::options _cdc_options;
+public:
+    static constexpr auto NAME = "cdc";
+
+    cdc_extension() = default;
+    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {}
+    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {} // decode b, then build options
+    explicit cdc_extension(const sstring& s) { // always throws: no string form is accepted
+        throw std::logic_error("Cannot create cdc info from string");
+    }
+    bytes serialize() const override { // byte encoding of the options' string-map form
+        return ser::serialize_to_buffer<bytes>(_cdc_options.to_map());
+    }
+    static std::map<sstring, sstring> deserialize(const bytes_view& buffer) { // decode a buffer into the string-map form
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const options& get_options() const {
+        return _cdc_options;
+    }
+};
+
+}
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
+namespace cdc {
+
+class options final { // CDC settings of a table; non-trivial members are only declared in this header
+    bool _enabled = false;
+    bool _preimage = false;
+    bool _postimage = false;
+    int _ttl = 86400; // 24h in seconds
+public:
+    options() = default; // all flags off, ttl at its 24h default
+    options(const std::map<sstring, sstring>& map);
+
+    std::map<sstring, sstring> to_map() const;
+    sstring to_sstring() const;
+
+    bool enabled() const { return _enabled; }
+    bool preimage() const { return _preimage; }
+    bool postimage() const { return _postimage; }
+    int ttl() const { return _ttl; } // same unit as _ttl (seconds)
+
+    bool operator==(const options& o) const;
+    bool operator!=(const options& o) const;
+};
+
+} // namespace cdc
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -0,0 +1,405 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/type.hpp>
+#include <random>
+#include <unordered_set>
+#include <seastar/core/sleep.hh>
+
+#include "keys.hh"
+#include "schema_builder.hh"
+#include "db/config.hh"
+#include "db/system_keyspace.hh"
+#include "db/system_distributed_keyspace.hh"
+#include "dht/token-sharding.hh"
+#include "locator/token_metadata.hh"
+#include "gms/application_state.hh"
+#include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
+
+#include "cdc/generation.hh"
+
+extern logging::logger cdc_log;
+
+static int get_shard_count(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    const auto shard_count_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
+    return !shard_count_state ? -1 : std::stoi(shard_count_state->value); // -1: no SHARD_COUNT state for endpoint
+}
+
+static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    const auto msb_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
+    return !msb_state ? 0 : std::stoi(msb_state->value); // 0: no IGNORE_MSB_BITS state for endpoint
+}
+
+namespace cdc {
+
+extern const api::timestamp_clock::duration generation_leeway = // 5 seconds, expressed in timestamp_clock's duration
+    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
+    const int64_t net_order = net::hton(i); // value converted to network byte order before copying
+    std::copy_n(reinterpret_cast<const int8_t*>(&net_order), sizeof(net_order), b.begin() + offset);
+}
+
+stream_id::stream_id(int64_t first, int64_t second)
+    : _value(bytes::initialized_later(), sizeof(first) + sizeof(second)) {
+    // Layout: `first` occupies the leading bytes, `second` follows immediately after it.
+    copy_int_to_bytes(second, sizeof(first), _value);
+    copy_int_to_bytes(first, 0, _value);
+}
+
+stream_id::stream_id(bytes b) : _value(std::move(b)) { } // adopts b as the raw id; its length is not checked here
+
+bool stream_id::is_set() const {
+    return _value.size() != 0; // an id counts as set as soon as it holds any bytes
+}
+
+bool stream_id::operator==(const stream_id& o) const {
+    return _value == o._value; // equality is that of the underlying bytes
+}
+
+bool stream_id::operator<(const stream_id& o) const {
+    return _value < o._value; // ordering is delegated to the underlying bytes
+}
+
+static int64_t bytes_to_int64(const bytes& b, size_t offset) {
+    int64_t net_order; // filled by the copy below, then converted with ntoh
+    assert(offset + sizeof(net_order) <= b.size());
+    std::copy_n(b.begin() + offset, sizeof(net_order), reinterpret_cast<int8_t*>(&net_order));
+    return net::ntoh(net_order);
+}
+
+int64_t stream_id::first() const {
+    return bytes_to_int64(_value, 0); // decodes the int64 stored at offset 0
+}
+
+int64_t stream_id::second() const {
+    return bytes_to_int64(_value, sizeof(int64_t)); // decodes the int64 stored right after the first one
+}
+
+const bytes& stream_id::to_bytes() const {
+    return _value; // reference to the stored bytes, no copy
+}
+
+partition_key stream_id::to_partition_key(const schema& log_schema) const {
+    return partition_key::from_single_value(log_schema, _value); // the raw id bytes are the key's single value
+}
+
+bool token_range_description::operator==(const token_range_description& o) const {
+    const bool same_range_end = token_range_end == o.token_range_end;
+    return same_range_end && streams == o.streams && sharding_ignore_msb == o.sharding_ignore_msb; // all three must match
+}
+
+topology_description::topology_description(std::vector<token_range_description> entries)
+    : _entries(std::move(entries)) {} // stores the entries exactly as given
+
+bool topology_description::operator==(const topology_description& o) const {
+    return _entries == o._entries; // element-wise comparison of the entry vectors
+}
+
+const std::vector<token_range_description>& topology_description::entries() const {
+    return _entries; // read-only view of the stored entries
+}
+
+static stream_id make_random_stream_id() {
+    // Per-thread engine seeded once from std::random_device; the distribution starts at INT64_MIN.
+    static thread_local std::mt19937_64 engine(std::random_device{}());
+    static thread_local std::uniform_int_distribution<int64_t> any_int64(std::numeric_limits<int64_t>::min());
+    return {any_int64(engine), any_int64(engine)};
+}
+
+/* Given:
+ * 1. a set of tokens which split the token ring into token ranges (vnodes),
+ * 2. information on how each token range is distributed among its owning node's shards
+ * this function tries to generate a set of CDC stream identifiers such that for each
+ * shard and vnode pair there exists a stream whose token falls into this
+ * vnode and is owned by this shard.
+ *
+ * It then builds a cdc::topology_description which maps tokens to these
+ * found stream identifiers, such that if token T is owned by shard S in vnode V,
+ * it gets mapped to the stream identifier generated for (S, V).
+ */
+// Run in seastar::async context.
+topology_description generate_topology_description(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& token_metadata,
+        const gms::gossiper& gossiper) {
+    if (bootstrap_tokens.empty()) {
+        throw std::runtime_error(
+                "cdc: bootstrap tokens is empty in generate_topology_description");
+    }
+
+    auto tokens = token_metadata.sorted_tokens();
+    tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
+    std::sort(tokens.begin(), tokens.end());
+    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
+
+    std::vector<token_range_description> entries(tokens.size());
+    int spots_to_fill = 0;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        auto& entry = entries[i];
+        entry.token_range_end = tokens[i];
+
+        if (bootstrap_tokens.count(entry.token_range_end) > 0) {
+            entry.streams.resize(smp::count);
+            entry.sharding_ignore_msb = cfg.murmur3_partitioner_ignore_msb_bits();
+        } else {
+            auto endpoint = token_metadata.get_endpoint(entry.token_range_end);
+            if (!endpoint) {
+                throw std::runtime_error(format("Can't find endpoint for token {}", entry.token_range_end));
+            }
+            auto sc = get_shard_count(*endpoint, gossiper);
+            entry.streams.resize(sc > 0 ? sc : 1);
+            entry.sharding_ignore_msb = get_sharding_ignore_msb(*endpoint, gossiper);
+        }
+
+        spots_to_fill += entry.streams.size();
+    }
+
+    auto schema = schema_builder("fake_ks", "fake_table")
+        .with_column("stream_id", bytes_type, column_kind::partition_key)
+        .build();
+
+    auto quota = std::chrono::seconds(spots_to_fill / 2000 + 1); // time budget: 1s plus 1s per 2000 spots
+    auto start_time = std::chrono::steady_clock::now(); // monotonic: immune to wall-clock adjustments
+
+    // For each pair (i, j), 0 <= i < streams.size(), 0 <= j < streams[i].size(),
+    // try to find a stream (stream[i][j]) such that the token of this stream will get mapped to this stream
+    // (refer to the comments above topology_description's definition to understand how it describes the mapping).
+    // We find the streams by randomly generating them and checking into which pairs they get mapped.
+    // NOTE: this algorithm is temporary and will be replaced after per-table-partitioner feature gets merged in.
+    repeat([&] {
+        for (int i = 0; i < 500; ++i) {
+            auto stream_id = make_random_stream_id();
+            auto token = dht::get_token(*schema, stream_id.to_partition_key(*schema));
+
+            // Find the token range into which our stream_id's token landed.
+            auto it = std::lower_bound(tokens.begin(), tokens.end(), token);
+            auto& entry = entries[it != tokens.end() ? std::distance(tokens.begin(), it) : 0];
+
+            auto shard_id = dht::shard_of(entry.streams.size(), entry.sharding_ignore_msb, token);
+            assert(shard_id < entry.streams.size());
+
+            if (!entry.streams[shard_id].is_set()) {
+                --spots_to_fill;
+                entry.streams[shard_id] = stream_id;
+            }
+        }
+
+        if (!spots_to_fill) {
+            return stop_iteration::yes;
+        }
+
+        auto now = std::chrono::steady_clock::now();
+        auto passed = std::chrono::duration_cast<std::chrono::seconds>(now - start_time);
+        if (passed > quota) {
+            return stop_iteration::yes;
+        }
+
+        return stop_iteration::no;
+    }).get();
+
+    if (spots_to_fill) {
+        // We were not able to generate stream ids for each (token range, shard) pair.
+
+        // For each range that has a stream, for each shard for this range that doesn't have a stream,
+        // use the stream id of the next shard for this range.
+
+        // For each range that doesn't have any stream,
+        // use streams of the first range to the left which does have a stream.
+
+        cdc_log.warn("Generation of CDC streams failed to create streams for some (vnode, shard) pair."
+                     " This can lead to worse performance.");
+
+        stream_id some_stream;
+        size_t idx = 0;
+        for (; idx < entries.size(); ++idx) {
+            for (auto s: entries[idx].streams) {
+                if (s.is_set()) {
+                    some_stream = s;
+                    break;
+                }
+            }
+            if (some_stream.is_set()) {
+                break;
+            }
+        }
+
+        assert(idx != entries.size() && some_stream.is_set());
+
+        // Iterate over all ranges in the clockwise direction, starting with the one we found a stream for.
+        for (size_t off = 0; off < entries.size(); ++off) {
+            auto& ss = entries[(idx + off) % entries.size()].streams;
+
+            int last_set_stream_idx = ss.size() - 1;
+            while (last_set_stream_idx > -1 && !ss[last_set_stream_idx].is_set()) {
+                --last_set_stream_idx;
+            }
+
+            if (last_set_stream_idx == -1) {
+                cdc_log.warn(
+                        "CDC wasn't able to generate any stream for vnode ({}, {}]. We'll use another vnode's streams"
+                        " instead. This might lead to inconsistencies.",
+                        tokens[(idx + off + entries.size() - 1) % entries.size()], tokens[(idx + off) % entries.size()]);
+
+                ss[0] = some_stream;
+                last_set_stream_idx = 0;
+            }
+
+            some_stream = ss[last_set_stream_idx];
+
+            // Replace 'unset' stream ids with indexes below last_set_stream_idx
+            for (int s_idx = last_set_stream_idx - 1; s_idx > -1; --s_idx) {
+                if (ss[s_idx].is_set()) {
+                    some_stream = ss[s_idx];
+                } else {
+                    ss[s_idx] = some_stream;
+                }
+            }
+            // Replace 'unset' stream ids with indexes above last_set_stream_idx
+            for (int s_idx = ss.size() - 1; s_idx > last_set_stream_idx; --s_idx) {
+                if (ss[s_idx].is_set()) {
+                    some_stream = ss[s_idx];
+                } else {
+                    ss[s_idx] = some_stream;
+                }
+            }
+        }
+    }
+
+    return {std::move(entries)};
+}
+
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
+    auto my_host_id = g.get_host_id(me);
+    auto& eps = g.get_endpoint_states();
+    // True when no known endpoint has a host id greater than ours. `const auto&` avoids a per-element copy.
+    return std::none_of(eps.begin(), eps.end(), [&] (const auto& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
+}
+
+future<db_clock::time_point> get_local_streams_timestamp() {
+    // Resolves to the streams timestamp saved in the local tables.
+    // When none is saved, the error is logged and rethrown as std::runtime_error.
+    return db::system_keyspace::get_saved_cdc_streams_timestamp().then([] (std::optional<db_clock::time_point> saved) {
+        if (saved) { return *saved; }
+        auto msg = format("get_local_streams_timestamp: tried to retrieve streams timestamp after bootstrapping, but it's not present");
+        cdc_log.error("{}", msg);
+        throw std::runtime_error(msg);
+    });
+}
+
+// Run inside seastar::async context.
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing) {
+    assert(!bootstrap_tokens.empty());
+    auto description = generate_topology_description(cfg, bootstrap_tokens, tm, g);
+    // Offset from "now" to the generation's timestamp: zero for tests, else 2 * ring_delay + generation_leeway.
+    std::chrono::milliseconds offset(0);
+    if (!for_testing) {
+        offset = 2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway);
+    }
+    // Begin the race.
+    auto ts = db_clock::now() + offset;
+    sys_dist_ks.insert_cdc_topology_description(ts, std::move(description), { tm.count_normal_token_owners() }).get();
+    return ts;
+}
+
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    // An empty CDC_STREAMS_TIMESTAMP value yields an empty optional; otherwise the value is
+    // parsed with std::stoll and used as a db_clock::duration count.
+    auto gossiped = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
+    cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, gossiped);
+    if (!gossiped.empty()) {
+        return db_clock::time_point(db_clock::duration(std::stoll(gossiped)));
+    }
+    return std::nullopt;
+}
+
+// Run inside seastar::async context.
+static void do_update_streams_description(
+        db_clock::time_point streams_ts,
+        db::system_distributed_keyspace& sys_dist_ks,
+        db::system_distributed_keyspace::context ctx) {
+    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
+        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
+        return;
+    }
+
+    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
+    auto topology = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    if (!topology) {
+        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+    }
+
+    // Gather the stream ids of all token ranges; std::set keeps each id once, in sorted order.
+    std::set<cdc::stream_id> unique_streams;
+    for (const auto& range : topology->entries()) {
+        unique_streams.insert(range.streams.begin(), range.streams.end());
+    }
+    std::vector<cdc::stream_id> streams_vec(unique_streams.begin(), unique_streams.end());
+
+    // Write the flattened list to the description table.
+    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
+}
+
+void update_streams_description(
+        db_clock::time_point streams_ts,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    try {
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+    } catch(...) {
+        cdc_log.warn("Could not update CDC description table with generation {}: {}. Will retry in the background.",
+            streams_ts, std::current_exception());
+        // It is safe to discard this future: we keep system distributed keyspace alive.
+        (void)seastar::async([streams_ts, sys_dist_ks, &abort_src,
+                get_num_token_owners = std::move(get_num_token_owners)] {
+            while (true) {
+                try {
+                    sleep_abortable(std::chrono::seconds(60), abort_src).get();
+                } catch (seastar::sleep_aborted&) { // abort requested: stop retrying, keep the discarded future clean
+                    return;
+                }
+                try {
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    return;
+                } catch (...) {
+                    cdc_log.warn("Could not update CDC description table with generation {}: {}. Will try again.",
+                        streams_ts, std::current_exception());
+                }
+            }
+        });
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* This module contains classes and functions used to manage CDC generations:
+ * sets of CDC stream identifiers used by the cluster to choose partition keys for CDC log writes.
+ * Each CDC generation begins operating at a specific time point, called the generation's timestamp
+ * (`cdc_streams_timestamp` or `streams_timestamp` in the code).
+ * The generation is used by all nodes in the cluster to pick CDC streams until superseded by a new generation.
+ *
+ * Functions from this module are used by the node joining procedure to introduce new CDC generations to the cluster
+ * (which is necessary due to new tokens being inserted into the token ring), or during rolling upgrade
+ * if CDC is enabled for the first time.
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_set>
+#include <seastar/util/noncopyable_function.hh>
+
+#include "database_fwd.hh"
+#include "db_clock.hh"
+#include "dht/token.hh"
+
+namespace seastar {
+    class abort_source;
+} // namespace seastar
+
+namespace db {
+    class config;
+    class system_distributed_keyspace;
+} // namespace db
+
+namespace gms {
+    class inet_address;
+    class gossiper;
+} // namespace gms
+
+namespace locator {
+    class token_metadata;
+} // namespace locator
+
+namespace cdc {
+
+class stream_id final { // identifier of a CDC stream, kept as raw bytes
+    bytes _value;
+public:
+    stream_id() = default;
+    stream_id(int64_t, int64_t); // stores both integers back to back, each converted with hton
+    stream_id(bytes); // adopts the given bytes as the raw id
+    bool is_set() const; // true when the stored bytes are non-empty
+    bool operator==(const stream_id&) const;
+    bool operator<(const stream_id&) const;
+
+    int64_t first() const; // int64 decoded from offset 0
+    int64_t second() const; // int64 decoded from offset sizeof(int64_t)
+
+    const bytes& to_bytes() const;
+
+    partition_key to_partition_key(const schema& log_schema) const; // single-value key built from the raw bytes
+};
+
+/* Describes a mapping of tokens to CDC streams in a token range.
+ *
+ * The range ends with `token_range_end`. A vector of `token_range_description`s defines the ranges entirely
+ * (the end of the `i`th range is the beginning of the `(i+1) % size()`th range). Ranges are left-open, right-closed.
+ *
+ * Tokens in the range ending with `token_range_end` are mapped to streams in the `streams` vector as follows:
+ * token `T` is mapped to `streams[j]` if and only if the used partitioner maps `T` to the `j`th shard,
+ * assuming that the partitioner is configured for `streams.size()` shards and (partitioner's) `sharding_ignore_msb`
+ * equals to the given `sharding_ignore_msb`.
+*/
+struct token_range_description {
+    dht::token token_range_end; // last token of the range
+    std::vector<stream_id> streams; // one stream per shard; size() is the shard count used for the mapping
+    uint8_t sharding_ignore_msb; // partitioner's ignore-msb setting assumed by the mapping
+
+    bool operator==(const token_range_description&) const; // compares all three members
+};
+
+
+/* Describes a mapping of tokens to CDC streams in a whole token ring.
+ *
+ * Division of the ring to token ranges is defined in terms of `token_range_end`s
+ * in the `_entries` vector. See the comment above `token_range_description` for explanation.
+ */
+class topology_description {
+    std::vector<token_range_description> _entries; // one description per token range
+public:
+    topology_description(std::vector<token_range_description> entries); // stores entries as given
+    bool operator==(const topology_description&) const; // compares the entry vectors
+
+    const std::vector<token_range_description>& entries() const; // read-only access to the stored entries
+};
+
+/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
+ * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
+ * that there's a bug, or the user messed with our local tables).
+ *
+ * It checks whether we should be the node to propose the first generation of CDC streams.
+ * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
+ * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
+ */
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
+
+/*
+ * Read this node's streams generation timestamp stored in the LOCAL table.
+ * Assumes that the node has successfully bootstrapped, and we're not upgrading from a non-CDC version,
+ * so the timestamp is present.
+ */
+future<db_clock::time_point> get_local_streams_timestamp();
+
+/* Generate a new set of CDC streams and insert it into the distributed cdc_topology_description table.
+ * Returns the timestamp of this new generation.
+ *
+ * Should be called when starting the node for the first time (i.e., joining the ring).
+ *
+ * Assumes that the system_distributed keyspace is initialized.
+ *
+ * The caller of this function is expected to insert this timestamp into the gossiper as fast as possible,
+ * so that other nodes learn about the generation before their clocks cross the timestamp
+ * (not guaranteed in the current implementation, but expected to be the common case;
+ *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
+ */
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing);
+
+/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
+ * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
+ * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
+ * which means it will gossip the generation's timestamp.
+ */
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
+
+/* Inform CDC users about a generation of streams (identified by the given timestamp)
+ * by inserting it into the cdc_description table.
+ *
+ * Assumes that the cdc_topology_description table contains this generation.
+ *
+ * Returning from this function does not mean that the table update was successful: the function
+ * might run an asynchronous task in the background.
+ *
+ * Run inside seastar::async context.
+ */
+void update_streams_description(
+        db_clock::time_point,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
+} // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This module manages CDC log tables. It contains facilities used to:
+ * - perform schema changes to CDC log tables correspondingly when base tables are changed,
+ * - perform writes to CDC log tables correspondingly when writes to base tables are made.
+ */
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+
+#include "exceptions/exceptions.hh"
+#include "timestamp.hh"
+#include "tracing/trace_state.hh"
+#include "cdc_options.hh"
+#include "utils/UUID.hh"
+
+class schema;
+using schema_ptr = seastar::lw_shared_ptr<const schema>;
+
+namespace locator {
+
+class token_metadata;
+
+} // namespace locator
+
+namespace service {
+
+class migration_notifier;
+class storage_proxy;
+class query_state;
+
+} // namespace service
+
+class mutation;
+class partition_key;
+
+namespace cdc {
+
+struct operation_result_tracker;
+class db_context;
+class metadata;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// CDC service will listen for schema changes and, iff CDC is enabled/changed,
+/// create/modify/delete the corresponding log tables etc. as part of the schema change.
+///
+class cdc_service {
+    class impl;
+    std::unique_ptr<impl> _impl; // pimpl: `impl` is only forward-declared in this header
+public:
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();
+
+    // If any of the mutations are CDC-enabled, optionally selects the preimage and adds the
+    // appropriate augments to set the log entries.
+    // Iff post-image is enabled for any of these, a non-empty callback is also
+    // returned to be invoked post the mutation query.
+    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations,
+        tracing::trace_state_ptr tr_state
+        );
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
+};
+
+struct db_context final { // bundle of references accepted by one of cdc_service's constructors
+    service::storage_proxy& _proxy;
+    service::migration_notifier& _migration_notifier;
+    locator::token_metadata& _token_metadata;
+    cdc::metadata& _cdc_metadata;
+
+    class builder final { // only the proxy is mandatory; the other references are set via with_*()
+        service::storage_proxy& _proxy;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
+        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
+        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
+    public:
+        builder(service::storage_proxy& proxy);
+
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_cdc_metadata(cdc::metadata&);
+
+        db_context build(); // NOTE(review): presumably supplies defaults for unset references - defined elsewhere, confirm
+    };
+};
+
+// cdc log table operation
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
+    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
+    post_image = 9,
+};
+
+bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
+seastar::sstring log_name(const seastar::sstring& table_name);
+seastar::sstring log_data_column_name(std::string_view column_name);
+seastar::sstring log_meta_column_name(std::string_view column_name);
+bytes log_data_column_name_bytes(const bytes& column_name);
+bytes log_meta_column_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_name(std::string_view column_name);
+bytes log_data_column_deleted_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
+bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);
+
+utils::UUID generate_timeuuid(api::timestamp_type t);
+
+} // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dht/token-sharding.hh"
+#include "utils/exceptions.hh"
+#include "exceptions/exceptions.hh"
+
+#include "cdc/generation.hh"
+#include "cdc/metadata.hh"
+
+extern logging::logger cdc_log;
+
+namespace cdc {
+    extern const api::timestamp_clock::duration generation_leeway;
+} // namespace cdc
+
+static api::timestamp_type to_ts(db_clock::time_point tp) {
+    const auto since_epoch = tp.time_since_epoch(); // db_clock and timestamp_clock are assumed to share an epoch
+    return std::chrono::duration_cast<api::timestamp_clock::duration>(since_epoch).count();
+}
+
+static cdc::stream_id get_stream(
+        const cdc::token_range_description& entry,
+        dht::token tok) {
+    // Stream i of the entry serves shard i, so the stream count doubles as the shard count.
+    const auto& streams = entry.streams;
+    auto num_shards = streams.size();
+    auto shard = dht::shard_of(num_shards, entry.sharding_ignore_msb, tok);
+
+    if (shard >= num_shards) {
+        on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
+    }
+    return streams[shard];
+}
+
+static cdc::stream_id get_stream(
+        const std::vector<cdc::token_range_description>& entries,
+        dht::token tok) {
+    if (entries.empty()) {
+        on_internal_error(cdc_log, "get_stream: entries empty");
+    }
+
+    // Find the first range whose end is not before `tok`; a token past the last range end
+    // wraps around to the first entry.
+    const auto ends_before = [] (const cdc::token_range_description& e, dht::token t) { return e.token_range_end < t; };
+    const auto match = std::lower_bound(entries.begin(), entries.end(), tok, ends_before);
+    const auto& owner = (match != entries.end()) ? *match : entries.front();
+
+    return get_stream(owner, tok);
+}
+
+cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
+    // The generation in use at `ts` is the last one whose timestamp is <= ts,
+    // i.e. the element directly preceding the first generation with a timestamp > ts.
+    const auto first_later = _gens.upper_bound(ts);
+    if (first_later != _gens.begin()) {
+        return std::prev(first_later);
+    }
+    return _gens.end(); // every known generation starts after `ts` (or none is known)
+}
+
+// Picks the stream for a write with timestamp `ts` to the base partition whose token is `tok`.
+// Throws if `ts` is too far in the future or in the past, or if the generation's data is missing.
+cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
+    auto now = api::new_timestamp();
+    if (ts > now + generation_leeway.count()) { // further ahead of the local clock than the leeway allows
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
+                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
+                " know what streams will be used at that time.\n"
+                "We *do* allow sending writes into the near future, but our ability to do that is limited."
+                " If you really must use your own timestamps, then make sure your clocks are well-synchronized"
+               "  with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        // Note that we might still send a write to a wrong generation, if we learn about the current
+        // generation too late (we might think that an earlier generation is the current one).
+        // Nothing protects us from that until we start using transactions for generation switching.
+    }
+    auto it = gen_used_at(now); // the generation in effect right now, according to our clock
+    if (it == _gens.end()) {
+        throw std::runtime_error(format(
+                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+    }
+
+    // Garbage-collect generations that will no longer be used.
+    it = _gens.erase(_gens.begin(), it); // `it` still designates the current generation afterwards
+
+    if (it->first > ts) { // the write predates the start of the current generation
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream from an earlier generation than the currently used one."
+                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                " consistency properties (write timestamp: {}, current generation started at: {})",
+                format_timestamp(ts), format_timestamp(it->first)));
+    }
+
+    // With `generation_leeway` we allow sending writes to the near future. It might happen
+    // that `ts` doesn't belong to the current generation ("current" according to our clock),
+    // but to the next generation. Adjust for this case:
+    {
+        auto next_it = std::next(it);
+        while (next_it != _gens.end() && next_it->first <= ts) {
+            it = next_it++;
+        }
+    }
+    // Note: if there is a next generation that `ts` belongs to, but we don't know about it,
+    // then too bad. This is no different from the situation in which we didn't manage to learn
+    // about the current generation in time. We won't be able to prevent it until we introduce transactions.
+
+    if (!it->second) { // the generation was prepare()d, but its data has not been insert()ed yet
+        throw std::runtime_error(format(
+                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
+                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
+                " this generation's data are alive and reachable from this node.", format_timestamp(it->first), format_timestamp(ts)));
+    }
+    auto& gen = *it->second;
+    auto ret = ::get_stream(gen.entries(), tok);
+    _last_stream_timestamp = ts; // consulted by prepare() to detect generations that arrive too late
+    return ret;
+}
+
+bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
+    const auto ts = to_ts(tp);
+    auto candidate = _gens.lower_bound(ts);
+
+    // Nothing at or after `ts` is known: the generation is new and cannot have been superseded.
+    if (candidate == _gens.end()) {
+        return false;
+    }
+
+    if (candidate->first == ts) {
+        if (candidate->second.has_value()) {
+            return true; // this exact generation was already inserted together with its data
+        }
+        // Only a data-less slot exists for it: look at the generation that follows instead.
+        ++candidate;
+    }
+
+    // Obsolete iff a later generation exists whose timestamp has already been reached.
+    return candidate != _gens.end() && candidate->first <= api::new_timestamp();
+}
+
+bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    // Garbage-collect: generations preceding the one in use right now will no longer be used.
+    // When no generation is in use yet (gen_used_at returns end()), nothing is dropped.
+    const auto now = api::new_timestamp();
+    const auto current = gen_used_at(now);
+    if (current != _gens.end()) {
+        _gens.erase(_gens.begin(), current);
+    }
+
+    // Either fills in a previously prepared (data-less) slot or creates a new entry.
+    _gens.insert_or_assign(to_ts(tp), std::move(gen));
+    return true;
+}
+
+bool cdc::metadata::prepare(db_clock::time_point tp) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    const auto ts = to_ts(tp);
+    const bool emplaced = _gens.emplace(ts, std::nullopt).second; // reserves a slot without generation data
+    // If the new slot covers the timestamp of the last successful get_stream call, that call used a stale generation.
+    if (emplaced && _last_stream_timestamp != api::missing_timestamp) {
+        const auto last_correct_gen = gen_used_at(_last_stream_timestamp);
+        if (last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
+            cdc_log.error(
+                "just learned about a CDC generation newer than the one used the last time"
+                " streams were retrieved. This generation, or some newer one, should have"
+                " been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
+                " The new generation probably arrived too late due to a network partition"
+                " and we've made a write using the wrong set streams.",
+                format_timestamp(ts), format_timestamp(_last_stream_timestamp));
+        }
+    }
+
+    return emplaced;
+}
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
+namespace dht {
+    class token;
+}
+
+namespace cdc {
+
+class stream_id;
+class topology_description;
+
+/* Represents the node's knowledge about CDC generations used in the cluster.
+ * Used during writes to pick the streams to which CDC log writes should be sent
+ * (i.e., to pick partition keys for these writes).
+ */
+class metadata final {
+    // Note: we use db_clock (1ms resolution) for generation timestamps
+    // (because we need to insert them into tables using columns of timestamp types,
+    //  and the native type of our columns' timestamp_type is db_clock::time_point).
+    // On the other hand, timestamp_clock (1us resolution) is used for mutation timestamps,
+    // and api::timestamp_type represents the number of ticks of a timestamp_clock::time_point since epoch.
+
+    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
+    container_t _gens;
+
+    /* The timestamp used in the last successful `get_stream` call. */
+    api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
+
+    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
+public:
+    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    bool known_or_obsolete(db_clock::time_point) const;
+
+    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
+     * according to the generation used at time `ts` (i.e., the latest generation whose timestamp is less than or equal to `ts`).
+     *
+     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
+     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
+     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
+     * by the `cdc::generation_leeway` constant.
+     */
+    stream_id get_stream(api::timestamp_type ts, dht::token tok);
+
+    /* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
+     * unless the generation is already known or obsolete (see `known_or_obsolete`).
+     *
+     * Returns true if the generation was inserted,
+     * meaning that `get_stream` might return a stream from this generation (at some time points).
+     */
+    bool insert(db_clock::time_point ts, topology_description&& gen);
+
+    /* Prepare for inserting a new generation whose timestamp is `ts`.
+     * This method is not required to be called before `insert`, but it's here
+     * to increase safety of `get_stream` calls in some situations. Use it if you:
+     * 1. know that there is a new generation, but
+     * 2. you didn't yet retrieve the generation's topology_description.
+     *
+     * After preparing a generation, if `get_stream` is supposed to return a stream from this generation
+     * but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
+     *
+     * Returns true iff this generation is not obsolete and was neither prepared nor inserted before.
+     */
+    bool prepare(db_clock::time_point ts);
+};
+
+} // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mutation.hh"
+#include "schema.hh"
+
+#include "split.hh"
+#include "log.hh"
+
+struct atomic_column_update { // the written cell of one atomic column
+    column_id id;
+    atomic_cell cell;
+};
+
+// see the comment inside `clustered_row_insert` for motivation for separating
+// nonatomic deletions from nonatomic updates
+struct nonatomic_column_deletion {
+    column_id id;
+    tombstone t; // the tombstone of the whole collection
+};
+
+struct nonatomic_column_update {
+    column_id id;
+    utils::chunked_vector<std::pair<bytes, atomic_cell>> cells; // (key, cell) pairs of the collection's elements
+};
+
+struct static_row_update {
+    gc_clock::duration ttl; // 0 means "no TTL" (here and in the row structs below)
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_insert {
+    gc_clock::duration ttl;
+    clustering_key key;
+    row_marker marker;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    // INSERTs can't express updates of individual cells inside a nonatomic column
+    // (without deleting the entire column first), so there is no `nonatomic_updates` field:
+    // overwriting a nonatomic column inside an INSERT will be split into two changes,
+    // one with a nonatomic deletion, and one with a nonatomic update
+};
+
+struct clustered_row_update {
+    gc_clock::duration ttl;
+    clustering_key key;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_deletion {
+    clustering_key key;
+    tombstone t;
+};
+
+struct clustered_range_deletion {
+    range_tombstone rt;
+};
+
+struct partition_deletion {
+    tombstone t;
+};
+
+struct batch { // every change of a mutation that carries one particular write timestamp
+    std::vector<static_row_update> static_updates;
+    std::vector<clustered_row_insert> clustered_inserts;
+    std::vector<clustered_row_update> clustered_updates;
+    std::vector<clustered_row_deletion> clustered_row_deletions;
+    std::vector<clustered_range_deletion> clustered_range_deletions;
+    std::optional<partition_deletion> partition_deletions;
+};
+
+using set_of_changes = std::map<api::timestamp_type, batch>; // keyed by write timestamp
+
+struct row_update { // cells of one row sharing a (timestamp, ttl) pair; produced by extract_row_updates
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+static
+std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
+extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
+    // A row's cells are grouped by (write timestamp, TTL); a zero TTL stands for "no TTL".
+    using change_key = std::pair<api::timestamp_type, gc_clock::duration>;
+    const auto key_of = [] (const auto& c) {
+        return change_key(c.timestamp(), c.is_live_and_has_ttl() ? c.ttl() : gc_clock::duration(0));
+    };
+
+    std::map<change_key, row_update> grouped;
+    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        auto& cdef = schema.column_at(ckind, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            grouped[key_of(view)].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
+            return;
+        }
+
+        // Collection (non-atomic) column: every element carries its own timestamp and TTL.
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [elem_key, elem] : desc.cells) {
+                auto& col_updates = grouped[key_of(elem)].nonatomic_updates;
+                // Elements of this column that land in the same group are appended to one entry.
+                if (col_updates.empty() || col_updates.back().id != id) {
+                    col_updates.push_back({id, {}});
+                }
+                col_updates.back().cells.push_back({std::move(elem_key), std::move(elem)});
+            }
+
+            // The collection tombstone, if any, is recorded as a deletion and never has a TTL.
+            if (desc.tomb) {
+                grouped[change_key(desc.tomb.timestamp, gc_clock::duration(0))].nonatomic_deletions.push_back({id, desc.tomb});
+            }
+        });
+    });
+    return grouped;
+}
+
+set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
+    set_of_changes changes; // one batch per distinct write timestamp
+    const auto& part = base_mutation.partition();
+
+    // Static row: every (timestamp, TTL) group becomes one static update.
+    auto static_updates = extract_row_updates(part.static_row().get(), column_kind::static_column, base_schema);
+    for (auto& [ts_and_ttl, upd] : static_updates) {
+        const auto& [ts, ttl] = ts_and_ttl;
+        changes[ts].static_updates.push_back({
+                ttl,
+                std::move(upd.atomic_entries),
+                std::move(upd.nonatomic_deletions),
+                std::move(upd.nonatomic_updates)
+            });
+    }
+
+    for (const rows_entry& entry : part.clustered_rows()) {
+        const auto& cl_row = entry.row();
+        auto row_updates = extract_row_updates(cl_row.cells(), column_kind::regular_column, base_schema);
+
+        // A live row marker turns the group sharing its timestamp and TTL into an INSERT. Make sure
+        // that group exists so that the INSERT is produced even when it carries no cells.
+        const auto& marker = cl_row.marker();
+        const bool has_live_marker = marker.is_live();
+        const auto marker_key = std::pair(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0));
+        if (has_live_marker) {
+            row_updates.try_emplace(marker_key);
+        }
+
+        for (auto& [ts_and_ttl, upd] : row_updates) {
+            const auto& [ts, ttl] = ts_and_ttl;
+            auto& bucket = changes[ts];
+
+            if (!has_live_marker || ts_and_ttl != marker_key) {
+                // Not the marker's group: a plain UPDATE holding everything found in the group.
+                bucket.clustered_updates.push_back({
+                        ttl,
+                        entry.key(),
+                        std::move(upd.atomic_entries),
+                        std::move(upd.nonatomic_deletions),
+                        std::move(upd.nonatomic_updates)
+                    });
+                continue;
+            }
+
+            bucket.clustered_inserts.push_back({
+                    ttl,
+                    entry.key(),
+                    marker,
+                    std::move(upd.atomic_entries),
+                    std::move(upd.nonatomic_deletions)
+                });
+            if (!upd.nonatomic_updates.empty()) {
+                // nonatomic updates cannot be expressed with an INSERT.
+                bucket.clustered_updates.push_back({
+                        ttl,
+                        entry.key(),
+                        {},
+                        {},
+                        std::move(upd.nonatomic_updates)
+                    });
+            }
+        }
+
+        // A row tombstone is a change of its own, filed under the tombstone's timestamp.
+        if (const auto row_tomb = cl_row.deleted_at().regular(); row_tomb) {
+            changes[row_tomb.timestamp].clustered_row_deletions.push_back({entry.key(), row_tomb});
+        }
+    }
+
+    // Range tombstones; entries whose tombstone has no timestamp are skipped.
+    for (const auto& rt : part.row_tombstones()) {
+        if (rt.tomb.timestamp == api::missing_timestamp) {
+            continue;
+        }
+        changes[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
+    }
+
+    // The partition tombstone, if it has a timestamp, is filed under that timestamp.
+    if (const auto part_tomb = part.partition_tombstone(); part_tomb.timestamp != api::missing_timestamp) {
+        changes[part_tomb.timestamp].partition_deletions = {part_tomb};
+    }
+
+    return changes;
+}
+
+namespace cdc {
+
+// Returns true when the changes in `base_mutation` cannot be treated as one change: they differ in
+// timestamp or TTL, or mix kinds of changes (see the individual checks). A mutation without any
+// timestamped change also yields true. NOTE(review): presumably `true` routes to for_each_change.
+bool should_split(const mutation& base_mutation, const schema& base_schema) {
+    auto& p = base_mutation.partition();
+
+    api::timestamp_type found_ts = api::missing_timestamp;
+    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+
+    // Returns true if `ts` or `ttl` disagrees with what was recorded earlier; otherwise records it.
+    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
+        if (found_ts != api::missing_timestamp && found_ts != ts) {
+            return true;
+        }
+        found_ts = ts;
+
+        if (found_ttl && *found_ttl != ttl) {
+            return true;
+        }
+        found_ttl = ttl;
+        return false;
+    };
+
+    bool had_static_row = false;
+    bool should_split = false; // set from inside the cell visitors, which cannot return from this function
+    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        had_static_row = true;
+
+        auto& cdef = base_schema.column_at(column_kind::static_column, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                should_split = true;
+            }
+            return;
+        }
+
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+
+            if (desc.tomb) {
+                if (check_or_set(desc.tomb.timestamp, gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+        });
+    });
+
+    if (should_split) {
+        return true;
+    }
+
+    bool had_clustered_row = false;
+    if (!p.clustered_rows().empty() && had_static_row) { // static and clustered changes never share one change
+        return true;
+    }
+    for (const rows_entry& cr : p.clustered_rows()) {
+        had_clustered_row = true;
+
+        const auto& marker = cr.row().marker();
+        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
+            return true;
+        }
+
+        bool is_insert = marker.is_live();
+        bool had_cells = false;
+        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+            had_cells = true;
+
+            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
+            if (cdef.is_atomic()) {
+                auto view = cell.as_atomic_cell(cdef);
+                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                }
+                return;
+            }
+
+            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+                for (auto& [k, v]: mview.cells) {
+                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+
+                    if (is_insert) {
+                        // nonatomic updates cannot be expressed with an INSERT.
+                        should_split = true;
+                        return;
+                    }
+                }
+
+                if (mview.tomb) {
+                    if (check_or_set(mview.tomb.timestamp, gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+                }
+            });
+        });
+
+        if (should_split) {
+            return true;
+        }
+
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            if (had_cells) {
+                return true;
+            }
+
+            // This row has no cells, but `found_ttl` may still be set by this row's live marker or by an
+            // earlier row or the static row. Report that mix as a split instead of asserting it cannot occur.
+            if (found_ttl || (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp)) {
+                return true;
+            }
+
+            found_ts = row_tomb.timestamp;
+        }
+    }
+
+    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb) {
+            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
+                return true;
+            }
+
+            found_ts = rt.tomb.timestamp;
+        }
+    }
+
+    if (p.partition_tombstone().timestamp != api::missing_timestamp
+            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    // A mutation with no timestamp will be split into 0 mutations
+    return found_ts == api::missing_timestamp;
+}
+
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
+    // Replays `base_mutation` as a series of single-change mutations in ascending timestamp order,
+    // handing each to `f` with the change's timestamp, the serialized timeuuid generated from it and
+    // a counter that is reset to 0 for every timestamp and otherwise left for `f` to maintain.
+    const schema& s = *base_schema;
+    auto changes = extract_changes(base_mutation, s);
+    auto pk = base_mutation.key();
+
+    for (auto& ts_and_batch : changes) {
+        const auto change_ts = ts_and_batch.first;
+        auto& btch = ts_and_batch.second;
+        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
+        int batch_no = 0;
+        const auto emit = [&] (mutation&& m) { f(std::move(m), change_ts, tuuid, batch_no); };
+
+        for (auto& upd : btch.static_updates) {
+            mutation m(base_schema, pk);
+            for (auto& a : upd.atomic_entries) {
+                m.set_static_cell(s.column_at(column_kind::static_column, a.id), std::move(a.cell));
+            }
+            for (auto& del : upd.nonatomic_deletions) {
+                auto& cdef = s.column_at(column_kind::static_column, del.id);
+                m.set_static_cell(cdef, collection_mutation_description{del.t, {}}.serialize(*cdef.type));
+            }
+            for (auto& cu : upd.nonatomic_updates) {
+                auto& cdef = s.column_at(column_kind::static_column, cu.id);
+                m.set_static_cell(cdef, collection_mutation_description{{}, std::move(cu.cells)}.serialize(*cdef.type));
+            }
+            emit(std::move(m));
+        }
+
+        for (auto& ins : btch.clustered_inserts) {
+            mutation m(base_schema, pk);
+            auto& row = m.partition().clustered_row(s, ins.key);
+            for (auto& a : ins.atomic_entries) {
+                row.cells().apply(s.column_at(column_kind::regular_column, a.id), std::move(a.cell));
+            }
+            for (auto& del : ins.nonatomic_deletions) {
+                auto& cdef = s.column_at(column_kind::regular_column, del.id);
+                row.cells().apply(cdef, collection_mutation_description{del.t, {}}.serialize(*cdef.type));
+            }
+            row.apply(ins.marker);
+            emit(std::move(m));
+        }
+
+        for (auto& upd : btch.clustered_updates) {
+            mutation m(base_schema, pk);
+            auto& cells = m.partition().clustered_row(s, upd.key).cells();
+            for (auto& a : upd.atomic_entries) {
+                cells.apply(s.column_at(column_kind::regular_column, a.id), std::move(a.cell));
+            }
+            for (auto& del : upd.nonatomic_deletions) {
+                auto& cdef = s.column_at(column_kind::regular_column, del.id);
+                cells.apply(cdef, collection_mutation_description{del.t, {}}.serialize(*cdef.type));
+            }
+            for (auto& cu : upd.nonatomic_updates) {
+                auto& cdef = s.column_at(column_kind::regular_column, cu.id);
+                cells.apply(cdef, collection_mutation_description{{}, std::move(cu.cells)}.serialize(*cdef.type));
+            }
+            emit(std::move(m));
+        }
+
+        for (auto& del : btch.clustered_row_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(s, del.key, del.t);
+            emit(std::move(m));
+        }
+
+        for (auto& range_del : btch.clustered_range_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(s, range_del.rt);
+            emit(std::move(m));
+        }
+
+        if (btch.partition_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply(btch.partition_deletions->t);
+            emit(std::move(m));
+        }
+    }
+}
+
+} // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include "schema_fwd.hh"
+#include "timestamp.hh"
+#include "bytes.hh"
+#include <seastar/util/noncopyable_function.hh>
+
+class mutation;
+
+namespace cdc {
+
+// Declared here, defined elsewhere (the definition is not visible from this header).
+// NOTE(review): judging by the name, this reports whether base_mutation has to be
+// broken up by for_each_change() -- confirm against cdc/split.cc.
+bool should_split(const mutation& base_mutation, const schema& base_schema);
+// Hands pieces of base_mutation to the supplied callback.
+//
+// The callback receives, in order: a mutation (by value), an api::timestamp_type,
+// a bytes value and an int passed by non-const reference, so the callback is able
+// to modify that integer.
+// NOTE(review): the end of the definition in cdc/split.cc appears to invoke the
+// callback once per clustered-row update, row deletion, range deletion and
+// partition deletion, passing a change timestamp, a time UUID and a batch number --
+// confirm against the full definition.
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>);
+
+}
--- a/cdc/stats.hh
+++ b/cdc/stats.hh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <seastar/core/metrics_registration.hh>
+#include "enum_set.hh"
+#include "utils/histogram.hh"
+#include "utils/estimated_histogram.hh"
+
+namespace cdc {
+
+// Holds the CDC counters together with the seastar metric groups object
+// they are associated with. The constructor is defined outside this header.
+class stats final {
+    seastar::metrics::metric_groups _metrics;
+
+public:
+    // Kinds of mutation parts that are counted separately. MAX is not a part
+    // itself; it is only used to size the per-part counter array.
+    enum class part_type {
+        STATIC_ROW,
+        CLUSTERING_ROW,
+        MAP,
+        SET,
+        LIST,
+        UDT,
+        RANGE_TOMBSTONE,
+        PARTITION_DELETE,
+        ROW_DELETE,
+
+        MAX
+    };
+
+    // Set type able to hold any combination of the part types above (MAX excluded).
+    using part_type_set = enum_set<super_enum<part_type,
+        part_type::STATIC_ROW,
+        part_type::CLUSTERING_ROW,
+        part_type::MAP,
+        part_type::SET,
+        part_type::LIST,
+        part_type::UDT,
+        part_type::RANGE_TOMBSTONE,
+        part_type::PARTITION_DELETE,
+        part_type::ROW_DELETE
+    >>;
+
+    // One counter per part_type, indexed by the enumerator's numeric value.
+    struct parts_touched_stats final {
+        std::array<uint64_t, static_cast<size_t>(part_type::MAX)> count{};
+
+        // Increments the counter of every part type contained in parts_set.
+        void apply(part_type_set parts_set) {
+            for (part_type touched : parts_set) {
+                ++count[static_cast<size_t>(touched)];
+            }
+        }
+
+        // Defined outside this header.
+        void register_metrics(seastar::metrics::metric_groups& metrics, std::string_view suffix);
+    };
+
+    // A group of zero-initialized counters plus the per-part touch counters.
+    // The code that updates them is not part of this header.
+    struct counters final {
+        uint64_t unsplit_count = 0;
+        uint64_t split_count = 0;
+        uint64_t preimage_selects = 0;
+        uint64_t with_preimage_count = 0;
+        uint64_t with_postimage_count = 0;
+
+        parts_touched_stats touches;
+    };
+
+    // Two independent counter groups: one named "total", one named "failed".
+    counters counters_total;
+    counters counters_failed;
+
+    stats();
+};
+
+// Summary of a single CDC operation: which parts it touched and three flags
+// that all start out false.
+struct operation_details final {
+    stats::part_type_set touched_parts;
+    bool was_split{false};
+    bool had_preimage{false};
+    bool had_postimage{false};
+};
+
+// Tracks the lifetime of the write handlers that belong to one CDC operation.
+// Once all write handlers of the operation have finished, the CDC metrics are
+// updated (the destructor is defined outside this header).
+class operation_result_tracker final {
+    stats& _stats;
+    operation_details _details;
+    // Set once by on_mutation_failed(); never reset.
+    bool _failed = false;
+
+public:
+    // Keeps a reference to `stats` (it has to outlive the tracker) and a copy
+    // of `details`.
+    operation_result_tracker(stats& stats, operation_details details)
+        : _stats(stats), _details(details) {}
+    ~operation_result_tracker();
+
+    // Records that at least one mutation of the operation failed.
+    void on_mutation_failed() { _failed = true; }
+};
+
+}
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -22,7 +22,10 @@
 #pragma once

 #include "seastar/core/file.hh"
-#include "disk-error-handler.hh"
+#include "seastar/core/reactor.hh"
+#include "utils/disk-error-handler.hh"
+
+#include "seastarx.hh"

 class checked_file_impl : public file_impl {
 public:
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -19,6 +19,23 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <seastar/core/print.hh>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
 #include "clocks-impl.hh"

 std::atomic<int64_t> clocks_offset;
+
+// Prints a db_clock time point as a UTC calendar time in the form
+// "YYYY/MM/DD HH:MM:SS".
+std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
+    const auto secs = db_clock::to_time_t(tp);
+    ::tm storage;
+    // gmtime_r fills the caller-provided buffer instead of a shared static one.
+    const ::tm* utc = ::gmtime_r(&secs, &storage);
+    os << std::put_time(utc, "%Y/%m/%d %T");
+    return os;
+}
+
+// Renders an API timestamp as a UTC calendar time in the form
+// "YYYY/MM/DD HH:MM:SS". The value is interpreted as a tick count of
+// api::timestamp_clock::duration and truncated to whole seconds.
+std::string format_timestamp(api::timestamp_type ts) {
+    const auto since_epoch = api::timestamp_clock::duration(ts);
+    const auto whole_seconds = std::chrono::duration_cast<std::chrono::seconds>(since_epoch);
+    const auto secs = std::time_t(whole_seconds.count());
+    ::tm storage;
+    // gmtime_r fills the caller-provided buffer instead of a shared static one.
+    const ::tm* utc = ::gmtime_r(&secs, &storage);
+    return format("{}", std::put_time(utc, "%Y/%m/%d %T"));
+}
+}
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -24,7 +24,7 @@

 #include <functional>
 #include "keys.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "range.hh"

 /**
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "schema_fwd.hh"
+#include "position_in_partition.hh"
+#include <boost/icl/interval_set.hpp>
+#include <cstddef>
+#include <iterator>
+
+// Represents a non-contiguous subset of clustering_key domain of a particular schema.
+// Can be treated like an ordered and non-overlapping sequence of position_range:s.
+//
+// Ranges are stored as right-open intervals in a boost::icl::interval_set. Most
+// operations take the schema explicitly; it is attached to every stored position so
+// that the positions can be ordered.
+class clustering_interval_set {
+    // Needed to make position_in_partition comparable, required by boost::icl::interval_set.
+    // Bundles a position with the schema whose comparators are used to order it.
+    class position_in_partition_with_schema {
+        schema_ptr _schema;
+        position_in_partition _pos;
+    public:
+        // Default value: the static-row position with a default-constructed schema_ptr.
+        // NOTE(review): operator< and operator== dereference the left operand's _schema;
+        // presumably a default-constructed schema_ptr is null, so such a value must
+        // never be the left-hand side of a comparison -- confirm.
+        position_in_partition_with_schema()
+            : _pos(position_in_partition::for_static_row())
+        { }
+        position_in_partition_with_schema(schema_ptr s, position_in_partition pos)
+            : _schema(std::move(s))
+            , _pos(std::move(pos))
+        { }
+        // Both comparisons use the comparator built from this object's own schema.
+        bool operator<(const position_in_partition_with_schema& other) const {
+            return position_in_partition::less_compare(*_schema)(_pos, other._pos);
+        }
+        bool operator==(const position_in_partition_with_schema& other) const {
+            return position_in_partition::equal_compare(*_schema)(_pos, other._pos);
+        }
+        const position_in_partition& position() const { return _pos; }
+    };
+private:
+    // We want to represent intervals of clustering keys, not position_in_partitions,
+    // but clustering_key domain is not enough to represent all kinds of clustering ranges.
+    // All intervals in this set are of the form [x, y).
+    using set_type = boost::icl::interval_set<position_in_partition_with_schema>;
+    using interval = boost::icl::interval<position_in_partition_with_schema>;
+    set_type _set;
+public:
+    clustering_interval_set() = default;
+    // Constructs from legacy clustering_row_ranges
+    clustering_interval_set(const schema& s, const query::clustering_row_ranges& ranges) {
+        for (auto&& r : ranges) {
+            add(s, position_range::from_range(r));
+        }
+    }
+    // Converts the set back to legacy clustering_row_ranges, one range per stored
+    // interval. The second element of each bound (presumably its inclusiveness flag)
+    // is derived from the position's bound weight: for the start it is true unless
+    // the weight is after_all_prefixed, for the end it is true only when the weight
+    // is after_all_prefixed.
+    query::clustering_row_ranges to_clustering_row_ranges() const {
+        query::clustering_row_ranges result;
+        for (position_range r : *this) {
+            result.push_back(query::clustering_range::make(
+                {r.start().key(), r.start()._bound_weight != bound_weight::after_all_prefixed},
+                {r.end().key(), r.end()._bound_weight == bound_weight::after_all_prefixed}));
+        }
+        return result;
+    }
+    // Input iterator over the stored intervals; dereferencing yields a position_range
+    // by value. The member types are spelled out explicitly instead of inheriting from
+    // std::iterator, which is deprecated since C++17. They are exactly the types that
+    // std::iterator<std::input_iterator_tag, const position_range> would provide.
+    class position_range_iterator {
+        set_type::iterator _i;
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const position_range;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const position_range*;
+        using reference = const position_range&;
+
+        position_range_iterator(set_type::iterator i) : _i(i) {}
+        position_range operator*() const {
+            // FIXME: Produce position_range view. Not performance critical yet.
+            const interval::interval_type& iv = *_i;
+            return position_range{iv.lower().position(), iv.upper().position()};
+        }
+        bool operator==(const position_range_iterator& other) const { return _i == other._i; }
+        bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
+        position_range_iterator& operator++() {
+            ++_i;
+            return *this;
+        }
+        position_range_iterator operator++(int) {
+            auto tmp = *this;
+            ++_i;
+            return tmp;
+        }
+    };
+    // Builds the right-open interval [r.start(), r.end()) with the schema attached
+    // to both ends. Both ends of r must have a clustering key (asserted).
+    static interval::type make_interval(const schema& s, const position_range& r) {
+        assert(r.start().has_clustering_key());
+        assert(r.end().has_clustering_key());
+        return interval::right_open(
+            position_in_partition_with_schema(s.shared_from_this(), r.start()),
+            position_in_partition_with_schema(s.shared_from_this(), r.end()));
+    }
+public:
+    // Element-wise comparison of the two interval sequences. The schema argument
+    // is not used by the comparison itself.
+    bool equals(const schema& s, const clustering_interval_set& other) const {
+        return boost::equal(_set, other._set);
+    }
+    // Returns true iff pos falls into one of the stored intervals.
+    bool contains(const schema& s, position_in_partition_view pos) const {
+        // FIXME: Avoid copy
+        return _set.find(position_in_partition_with_schema(s.shared_from_this(), position_in_partition(pos))) != _set.end();
+    }
+    // Returns true iff this set is fully contained in the other set.
+    // The other set is only read, hence it is taken by const reference so that
+    // const sets and temporaries can be passed as well.
+    bool contained_in(const clustering_interval_set& other) const {
+        return boost::icl::within(_set, other._set);
+    }
+    // Returns true iff at least one stored interval matches the given range in
+    // the underlying set's equal_range lookup.
+    bool overlaps(const schema& s, const position_range& range) const {
+        // FIXME: Avoid copy
+        auto r = _set.equal_range(make_interval(s, range));
+        return r.first != r.second;
+    }
+    // Adds given clustering range to this interval set.
+    // The range may overlap with this set.
+    void add(const schema& s, const position_range& r) {
+        _set += make_interval(s, r);
+    }
+    // Adds every range of the other set to this one.
+    void add(const schema& s, const clustering_interval_set& other) {
+        for (auto&& r : other) {
+            add(s, r);
+        }
+    }
+    position_range_iterator begin() const { return {_set.begin()}; }
+    position_range_iterator end() const { return {_set.end()}; }
+    friend std::ostream& operator<<(std::ostream&, const clustering_interval_set&);
+};
+
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -23,7 +23,7 @@

 #pragma once

-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "query-request.hh"

 namespace query {
--- a/Show More
+++ b/Show More