storage_proxy: do not touch all_replicas.front() if it's empty.

The list of all endpoints for a query can be empty if we have replication_factor 0 or there are no live endpoints for this token. Do not access all_replicas.front() in this case. Fixes #5935. Message-Id: <20200306192521.73486-2-kostja@scylladb.com> (cherry picked from commit 9827efe554)
cql transport: do not log broken pipe error when a client closes its side of a connection abruptly
2020-06-22 18:29:15 +03:00 · 2020-06-21 13:09:22 +03:00 · 2020-06-21 13:07:21 +03:00 · 2020-06-21 13:03:05 +03:00 · 2020-06-21 12:57:48 +03:00 · 2020-06-21 12:47:05 +03:00
3646 changed files with 17410 additions and 42672 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +1,3 @@
 .git
 build
 seastar/build
-testlog
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,6 +15,3 @@
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
-[submodule "abseil"]
-	path = abseil
-	url = ../abseil-cpp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,25 +5,13 @@
 cmake_minimum_required(VERSION 3.7)
 project(scylla)

-if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  message(STATUS "Setting build type to 'Release' as none was specified.")
-  set(CMAKE_BUILD_TYPE "Release" CACHE
-      STRING "Choose the type of build." FORCE)
-  # Set the possible values of build type for cmake-gui
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-    "Debug" "Release" "Dev" "Sanitize")
-endif()
-
-if(CMAKE_BUILD_TYPE)
-    string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE)
-else()
-    set(BUILD_TYPE "release")
-endif()
-
 if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
 endif()

+# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
+set(SEASTAR_INCLUDE_DIRS "seastar")
+
 # These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
 # Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
 set(SEASTAR_DPDK_INCLUDE_DIRS
@@ -34,14 +22,9 @@ set(SEASTAR_DPDK_INCLUDE_DIRS

 find_package(PkgConfig REQUIRED)

-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
 pkg_check_modules(SEASTAR seastar)

-if(NOT SEASTAR_INCLUDE_DIRS)
-    # Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-    set(SEASTAR_INCLUDE_DIRS "seastar/include")
-endif()
-
 find_package(Boost COMPONENTS filesystem program_options system thread)

 ##
@@ -87,7 +70,7 @@ scan_scylla_source_directories(
          seastar/json
          seastar/net
          seastar/rpc
-          seastar/testing
+          seastar/tests
          seastar/util)

 scan_scylla_source_directories(
@@ -123,7 +106,7 @@ scan_scylla_source_directories(
 scan_scylla_source_directories(
        VAR SCYLLA_GEN_SOURCE_FILES
        RECURSIVE
-        PATHS build/${BUILD_TYPE}/gen)
+        PATHS build/release/gen)

 set(SCYLLA_SOURCE_FILES
        ${SCYLLA_ROOT_SOURCE_FILES}
@@ -156,4 +139,4 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
-        build/${BUILD_TYPE}/gen)
+        build/release/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -141,7 +141,7 @@ In v3:
 "Tests: unit ({mode}), dtest ({smp})"
 ```

-The usual is "Tests: unit (dev)", although running debug tests is encouraged.
+The usual is "Tests: unit (release)", although running debug tests is encouraged.

 5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.

--- a/README.md
+++ b/README.md
@@ -38,10 +38,6 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev
 ./build/release/scylla --help
 ```

-## Testing
-
-See [test.py manual](docs/testing.md).
-
 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
 Thrift. There is also experimental support for the API of Amazon DynamoDB,
@@ -60,12 +56,31 @@ both.
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

-## Training 
+## Building Fedora RPM

-Training material and online courses can be found at [Scylla University](https://university.scylladb.com/). 
-The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling, 
-administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
-multi-datacenters and how Scylla integrates with third-party applications.
+As a pre-requisite, you need to install [Mock](https://fedoraproject.org/wiki/Mock) on your machine:
+
+```
+# Install mock:
+sudo yum install mock
+
+# Add user to the "mock" group:
+usermod -a -G mock $USER && newgrp mock
+```
+
+Then, to build an RPM, run:
+
+```
+./dist/redhat/build_rpm.sh
+```
+
+The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+For example, on Fedora 21 mock reports the following:
+
+```
+INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
+INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+```

 ## Building Fedora-based Docker image

--- a/10
+++ b/10
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.0.11
+VERSION=3.3.4

 if test -f version
 then
@@ -19,14 +19,6 @@ else
 	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

-if [ -f build/SCYLLA-RELEASE-FILE ]; then
-	RELEASE_FILE=$(cat build/SCYLLA-RELEASE-FILE)
-	GIT_COMMIT_FILE=$(cat build/SCYLLA-RELEASE-FILE |cut -d . -f 3)
-	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
-		exit 0
-	fi
-fi
-
 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p build
 echo "$SCYLLA_VERSION" > build/SCYLLA-VERSION-FILE
--- a/1
+++ b/1
--- a/alternator-test/README.md
+++ b/alternator-test/README.md
--- a/alternator-test/conftest.py
+++ b/alternator-test/conftest.py
@@ -26,14 +26,6 @@ import pytest
 import boto3
 from util import create_test_table

-# When tests are run with HTTPS, the server often won't have its SSL
-# certificate signed by a known authority. So we will disable certificate
-# verification with the "verify=False" request option. However, once we do
-# that, we start getting scary-looking warning messages, saying that this
-# makes HTTPS insecure. The following silences those warnings:
-import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
 # Test that the Boto libraries are new enough. These tests want to test a
 # large variety of DynamoDB API features, and to do this we need a new-enough
 # version of the the Boto libraries (boto3 and botocore) so that they can
@@ -54,8 +46,6 @@ def pytest_addoption(parser):
    parser.addoption("--https", action="store_true",
        help="communicate via HTTPS protocol on port 8043 instead of HTTP when"
            " running against a local Scylla installation")
-    parser.addoption("--url", action="store",
-        help="communicate with given URL instead of defaults")

 # "dynamodb" fixture: set up client object for communicating with the DynamoDB
 # API. Currently this chooses either Amazon's DynamoDB in the default region
@@ -72,15 +62,15 @@ def dynamodb(request):
        # requires us to specify dummy region and credential parameters,
        # otherwise the user is forced to properly configure ~/.aws even
        # for local runs.
-        if request.config.getoption('url') != None:
-            local_url = request.config.getoption('url')
-        else:
-            local_url = 'https://localhost:8043' if request.config.getoption('https') else 'http://localhost:8000'
+        local_url = 'https://localhost:8043' if request.config.getoption('https') else 'http://localhost:8000'
        # Disable verifying in order to be able to use self-signed TLS certificates
        verify = not request.config.getoption('https')
+        # Silencing the 'Unverified HTTPS request warning'
+        if request.config.getoption('https'):
+            import urllib3
+            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return boto3.resource('dynamodb', endpoint_url=local_url, verify=verify,
-            region_name='us-east-1', aws_access_key_id='alternator', aws_secret_access_key='secret_pass',
-            config=botocore.client.Config(retries={"max_attempts": 3}))
+            region_name='us-east-1', aws_access_key_id='alternator', aws_secret_access_key='secret_pass')

 # "test_table" fixture: Create and return a temporary table to be used in tests
 # that need a table to work on. The table is automatically deleted at the end.
@@ -125,15 +115,6 @@ def test_table_s(dynamodb):
        AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ])
    yield table
    table.delete()
-# test_table_s_2 has exactly the same schema as test_table_s, and is useful
-# for tests which need two different tables with the same schema.
-@pytest.fixture(scope="session")
-def test_table_s_2(dynamodb):
-    table = create_test_table(dynamodb,
-        KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, ],
-        AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' } ])
-    yield table
-    table.delete()
@pytest.fixture(scope="session")
 def test_table_b(dynamodb):
    table = create_test_table(dynamodb,
@@ -196,11 +177,3 @@ def filled_test_table(dynamodb):

    yield table, items
    table.delete()
-
-# The "scylla_only" fixture can be used by tests for Scylla-only features,
-# which do not exist on AWS DynamoDB. A test using this fixture will be
-# skipped if running with "--aws".
-@pytest.fixture(scope="session")
-def scylla_only(dynamodb):
-    if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'):
-        pytest.skip('Scylla-only feature not supported by AWS')
--- a/alternator-test/test_authorization.py
+++ b/alternator-test/test_authorization.py
@@ -59,18 +59,6 @@ def test_expired_signature(dynamodb, test_table):
    assert not response.ok
    assert "InvalidSignatureException" in response.text and "Signature expired" in response.text

-# A test verifying that missing Authorization header is handled properly
-def test_no_authorization_header(dynamodb, test_table):
-    url = dynamodb.meta.client._endpoint.host
-    print(url)
-    headers = {'Content-Type': 'application/x-amz-json-1.0',
-               'X-Amz-Date': '20170101T010101Z',
-               'X-Amz-Target': 'DynamoDB_20120810.DescribeEndpoints',
-    }
-    response = requests.post(url, headers=headers, verify=False)
-    assert not response.ok
-    assert "InvalidSignatureException" in response.text and "Authorization header" in response.text
-
 # A test ensuring that signatures that exceed current time too much are not accepted.
 # Watch out - this test is valid only for around next 1000 years, it needs to be updated later.
 def test_signature_too_futuristic(dynamodb, test_table):
--- a/alternator-test/test_batch.py
+++ b/alternator-test/test_batch.py
@@ -20,7 +20,6 @@
 # so they are actually tested by other tests as well.

 import pytest
-import random
 from botocore.exceptions import ClientError
 from util import random_string, full_scan, full_query, multiset

@@ -45,19 +44,6 @@ def test_basic_batch_write_item(test_table):
        assert item['attribute'] == str(i)
        assert item['another'] == 'xyz' 

-# Try a batch which includes both multiple writes to the same partition
-# and several partitions. The LWT code collects multiple mutations to the
-# same partition together, and we want to test that this worked correctly.
-def test_batch_write_item_mixed(test_table):
-    partitions = [random_string() for i in range(4)]
-    items = [{'p': p, 'c': str(i)} for p in partitions for i in range(4)]
-    with test_table.batch_writer() as batch:
-        # Reorder items randomly, just for the heck of it
-        for item in random.sample(items, len(items)):
-            batch.put_item(item)
-    for item in items:
-        assert test_table.get_item(Key={'p': item['p'], 'c': item['c']}, ConsistentRead=True)['Item'] == item
-
 # Test batch write to a table with only a hash key
 def test_batch_write_hash_only(test_table_s):
    items = [{'p': random_string(), 'val': random_string()} for i in range(10)]
@@ -152,20 +138,6 @@ def test_batch_write_duplicate_write_and_delete(test_table_s, test_table):
        batch.put_item({'p': p, 'c': other})
        batch.put_item({'p': other, 'c': c})

-# The BatchWriteIem API allows writing to more than one table in the same
-# batch. This test verifies that the duplicate-key checking doesn't mistake
-# updates to the same key in different tables to be duplicates.
-def test_batch_write_nonduplicate_multiple_tables(test_table_s, test_table_s_2):
-    p = random_string()
-    # The batch_writer() function used in previous tests can't write to more
-    # than one table. So we use the lower level interface boto3 gives us.
-    reply = test_table_s.meta.client.batch_write_item(RequestItems = {
-        test_table_s.name: [{'PutRequest': {'Item': {'p': p, 'a': 'hi'}}}],
-        test_table_s_2.name: [{'PutRequest': {'Item': {'p': p, 'b': 'hello'}}}]
-    })
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 'hi'}
-    assert test_table_s_2.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hello'}
-
 # Test that BatchWriteItem's PutRequest completely replaces an existing item.
 # It shouldn't merge it with a previously existing value. See also the same
 # test for PutItem - test_put_item_replace().
@@ -210,32 +182,6 @@ def test_batch_write_invalid_operation(test_table_s):
    for p in [p1, p2]:
        assert not 'item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)

-# In test_item.py we have a bunch of test_empty_* tests on different ways to
-# create an empty item (which in Scylla requires the special CQL row marker
-# to be supported correctly). BatchWriteItems provides yet another way of
-# creating items, so check the empty case here too:
-def test_empty_batch_write(test_table):
-    p = random_string()
-    c = random_string()
-    with test_table.batch_writer() as batch:
-        batch.put_item({'p': p, 'c': c})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c}
-
-# Test that BatchWriteItems allows writing to multiple tables in one operation
-def test_batch_write_multiple_tables(test_table_s, test_table):
-    p1 = random_string()
-    c1 = random_string()
-    p2 = random_string()
-    # We use the low-level batch_write_item API for lack of a more convenient
-    # API (the batch_writer() API can only write to one table). At least it
-    # spares us the need to encode the key's types...
-    reply = test_table.meta.client.batch_write_item(RequestItems = {
-        test_table.name: [{'PutRequest': {'Item': {'p': p1, 'c': c1, 'a': 'hi'}}}],
-        test_table_s.name: [{'PutRequest': {'Item': {'p': p2, 'b': 'hello'}}}]
-    })
-    assert test_table.get_item(Key={'p': p1, 'c': c1}, ConsistentRead=True)['Item'] == {'p': p1, 'c': c1, 'a': 'hi'}
-    assert test_table_s.get_item(Key={'p': p2}, ConsistentRead=True)['Item'] == {'p': p2, 'b': 'hello'}
-
 # Basic test for BatchGetItem, reading several entire items.
 # Schema has both hash and sort keys.
 def test_batch_get_item(test_table):
@@ -305,16 +251,3 @@ def test_batch_get_item_projection_expression(test_table):
        got_items = reply['Responses'][test_table.name]
        expected_items = [{k: item[k] for k in wanted if k in item} for item in items]
        assert multiset(got_items) == multiset(expected_items)
-
-# Test that we return the required UnprocessedKeys/UnprocessedItems parameters
-def test_batch_unprocessed(test_table_s):
-    p = random_string()
-    write_reply = test_table_s.meta.client.batch_write_item(RequestItems = {
-        test_table_s.name: [{'PutRequest': {'Item': {'p': p, 'a': 'hi'}}}],
-    })
-    assert 'UnprocessedItems' in write_reply and write_reply['UnprocessedItems'] == dict()
-
-    read_reply = test_table_s.meta.client.batch_get_item(RequestItems = {
-        test_table_s.name: {'Keys': [{'p': p}], 'ProjectionExpression': 'p, a', 'ConsistentRead': True}
-    })
-    assert 'UnprocessedKeys' in read_reply and read_reply['UnprocessedKeys'] == dict()
--- a/alternator-test/test_condition_expression.py
+++ b/alternator-test/test_condition_expression.py
@@ -22,36 +22,9 @@
 # test_condition_expression.py. Many of the tests there are very similar to
 # the ones included here.

-# NOTE: In this file, we use the b'xyz' syntax to represent DynamoDB's binary
-# values. This syntax works as expected only in Python3. In Python2 it
-# appears to work, but the "b" is actually ignored and the result is a normal
-# string 'xyz'. That means that we end up testing the string type instead of
-# the binary type as intended. So this test can run on Python2 but doesn't
-# cover testing binary types. The test should be run in Python3 to ensure full
-# coverage.
-
 import pytest
 from botocore.exceptions import ClientError
 from util import random_string
-from sys import version_info
-
-# A helper function for changing write isolation policies
-def set_write_isolation(table, isolation):
-    got = table.meta.client.describe_table(TableName=table.name)['Table']
-    arn =  got['TableArn']
-    tags = [
-        {
-            'Key': 'system:write_isolation',
-            'Value': isolation
-        }
-    ]
-    table.meta.client.tag_resource(ResourceArn=arn, Tags=tags)
-
-# A helper function to clear previous isolation tags
-def clear_write_isolation(table):
-    got = table.meta.client.describe_table(TableName=table.name)['Table']
-    arn =  got['TableArn']
-    table.meta.client.untag_resource(ResourceArn=arn, TagKeys=['system:write_isolation'])

 # Most of the tests in this file check that the ConditionExpression
 # parameter works for the UpdateItem operation. It should also work the
@@ -88,6 +61,7 @@ def test_condition_expression_attribute_updates(test_table_s):
 # attribute from the request, and the case of comparing two different
 # attributes of the same item (the latter case wasn't possible to express
 # with Expected, and becomes possible with ConditionExpression).
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_eq_success(test_table_s):
    p = random_string()
    values = (1, "hello", True, b'xyz', None, ['hello', 42], {'hello': 'world'}, set(['hello', 'world']), set([1, 2, 3]), set([b'xyz', b'hi']))
@@ -111,6 +85,7 @@ def test_update_condition_eq_success(test_table_s):

 # Comparing values of *different* types should always fail. Check all the
 # combination of different types.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_eq_different(test_table_s):
    p = random_string()
    values = (1, "hello", True, b'xyz', None, ['hello', 42], {'hello': 'world'}, set(['hello', 'world']), set([1, 2, 3]), set([b'xyz', b'hi']))
@@ -137,6 +112,7 @@ def test_update_condition_eq_different(test_table_s):
                        ExpressionAttributeValues={':val1': val1, ':val2': val2})

 # Also check an actual case of same time, but inequality.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_eq_unequal(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -150,6 +126,7 @@ def test_update_condition_eq_unequal(test_table_s):
 # Check that set equality is checked correctly. Unlike string equality (for
 # example), it cannot be done with just naive string comparison of the JSON
 # representation, and we need to allow for any order. (see issue #5021)
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_eq_set(test_table_s):
    p = random_string()
    # Because boto3 sorts the set values we give it, in order to generate a
@@ -169,6 +146,7 @@ def test_update_condition_eq_set(test_table_s):
    assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']

 # Test for ConditionExpression with operator "<>" (non-equality),
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_ne(test_table_s):
    p = random_string()
    # We only check here one type of attributes (numbers), assuming that the
@@ -209,6 +187,7 @@ def test_update_condition_ne(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 3

 # Test for ConditionExpression with operator "<"
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_lt(test_table_s):
    p = random_string()
    # The < operator should work for string, number and binary types
@@ -281,6 +260,7 @@ def test_update_condition_lt(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator "<="
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_le(test_table_s):
    p = random_string()
    # The <= operator should work for string, number and binary types
@@ -344,6 +324,7 @@ def test_update_condition_le(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 7

 # Test for ConditionExpression with operator ">"
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_gt(test_table_s):
    p = random_string()
    # The > operator should work for string, number and binary types
@@ -407,6 +388,7 @@ def test_update_condition_gt(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4

 # Test for ConditionExpression with operator ">="
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_ge(test_table_s):
    p = random_string()
    # The >= operator should work for string, number and binary types
@@ -472,6 +454,7 @@ def test_update_condition_ge(test_table_s):
 # Test for ConditionExpression with ternary operator "BETWEEN" (checking
 # if a value is between two others, equality included). The keywords
 # "BETWEEN" and "AND" are case insensitive.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_between(test_table_s):
    p = random_string()
    # The BETWEEN operator should work for string, number and binary types
@@ -553,6 +536,7 @@ def test_update_condition_between(test_table_s):
 # Test for ConditionExpression with multi-operand operator "IN", checking
 # whether a value is equal to one of possibly many values (up to 100 should
 # be supported, according to the DynamoDB documentation).
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_in(test_table_s):
    p = random_string()
    
@@ -599,12 +583,6 @@ def test_update_condition_in(test_table_s):
        ConditionExpression='a IN (:x, :y)',
        ExpressionAttributeValues={':val': 1, ':x': 'dog', ':y': 174})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 1
-    # IN with zero arguments results in a syntax error, not a failed condition
-    with pytest.raises(ClientError, match='ValidationException.*yntax error'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET c = :val37',
-            ConditionExpression='a IN ()',
-            ExpressionAttributeValues=values)

 # Beyond the above operators, there are also test functions supported -
 # attribute_exists, attribute_not_exists, attribute_type, begins_with,
@@ -612,6 +590,7 @@ def test_update_condition_in(test_table_s):
 # These functions are listed and described in
 # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Expressions.OperatorsAndFunctions.html

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_attribute_exists(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -626,34 +605,8 @@ def test_update_condition_attribute_exists(test_table_s):
            UpdateExpression='SET c = :val',
                ConditionExpression='attribute_exists (z)',
                ExpressionAttributeValues={':val': 3})
-    # Somewhat artificially, attribute_exists() requires that its parameter
-    # be a path - it cannot be a different sort of value.
-    with pytest.raises(ClientError, match='ValidationException.*path'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET c = :val',
-                ConditionExpression='attribute_exists (:val)',
-                ExpressionAttributeValues={':val': 3})
-
-# Primitive conditions usually look like an operator between two (<, <=,
-# etc.), three (BETWEEN) or more (IN) values. Can just a single value be
-# a condition? The special case of a single function call *can* be - we saw
-# an example attribute_exists(z) in the previous test. However that only
-# function calls are supported in this context - not general values (i.e.,
-# attribute or value references).
-# While DynamoDB does not accept a non-function-call value as a condition
-# (it results with with a syntax error), in Alternator currently, for
-# simplicity of the parser, this case is parsed correctly and only fails
-# later when the calculated value ends up to not be a boolean.
-def test_update_condition_single_value_attribute(test_table_s):
-    p = random_string()
-    test_table_s.update_item(Key={'p': p},
-            AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET c = :val',
-                ConditionExpression='a',
-                ExpressionAttributeValues={':val': 1})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_attribute_not_exists(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -669,6 +622,7 @@ def test_update_condition_attribute_not_exists(test_table_s):
                ConditionExpression='attribute_not_exists (a)',
                ExpressionAttributeValues={':val': 3})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_attribute_type(test_table_s):
    p = random_string()
    type_values = [
@@ -686,10 +640,6 @@ def test_update_condition_attribute_type(test_table_s):
    test_table_s.update_item(Key={'p': p}, AttributeUpdates=updates)
    for i in range(len(type_values)):
        expected_type = type_values[i][0]
-        # As explained in a comment in the top of the file, the binary types
-        # cannot be tested with Python 2
-        if expected_type in ('B', 'BS') and version_info[0] == 2:
-            continue
        test_table_s.update_item(Key={'p': p},
            UpdateExpression='SET c = :val',
            ConditionExpression='attribute_type (a{}, :type)'.format(i),
@@ -701,32 +651,25 @@ def test_update_condition_attribute_type(test_table_s):
                UpdateExpression='SET c = :val',
                ConditionExpression='attribute_type (a{}, :type)'.format(i),
                ExpressionAttributeValues={':val': i, ':type': wrong_type})
-    # The DynamoDB documentation suggests that attribute_type()'s first
-    # parameter must be a path (as we saw above, this is indeed the case for
-    # attribute_exists()). But in fact, attribute_type() does work fine also
-    # for an expression attribute.
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET c = :val',
-            ConditionExpression='attribute_type (:val, :type)',
-            ExpressionAttributeValues={':val': 0, ':type': 'N'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 0

 # The DynamoDB documentation explicitly states that the second argument
 # of the attribute_type function - the type to compare to - *must* be an
 # expression attribute (:name) - it cannot be an item attribute.
 # I don't know why this was important to forbid, but this test confirms that
 # DynamoDB does forbid it.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_attribute_type_second_arg(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
            AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'},
                              'b': {'Value': 'N', 'Action': 'PUT'}})
-    with pytest.raises(ClientError, match='ValidationException'):
+    with pytest.raises(ClientError, match='ValidationException.*Incorrect'):
        test_table_s.update_item(Key={'p': p},
            UpdateExpression='SET c = :val',
                ConditionExpression='attribute_type (a, b)',
                ExpressionAttributeValues={':val': 1})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_begins_with(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -769,11 +712,12 @@ def test_update_condition_begins_with(test_table_s):
                ConditionExpression='begins_with(c, a)',
                ExpressionAttributeValues={':val': 3})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_contains(test_table_s):
    p = random_string()
    # contains() can be used for two unrelated things: check substring (in
    # string or binary) and membership (in set or a list). The DynamoDB
-    # documentation only mention string and set (not binary or list) but
+    # documentation only mention string and set (not binary or list) but
    # the fact is that binary and list are also support.
    test_table_s.update_item(Key={'p': p},
        AttributeUpdates={'a': {'Value': 'hello', 'Action': 'PUT'},
@@ -808,19 +752,11 @@ def test_update_condition_contains(test_table_s):
            UpdateExpression='SET z = :val',
                ConditionExpression='contains(d, :arg)',
                ExpressionAttributeValues={':val': 4, ':arg': b'dog'})
-
-
-# While both operands of contains() may be item attributes, strangely
-# it is explicitly forbidden to have the same attribute as both and
-# trying to do so results in a ValidationException. I don't know why it's
-# important to make this query fail, when it could have just worked...
-# TODO: Is this limitation only for contains() or other functions as well?
-@pytest.mark.xfail(reason="extra check for same attribute not implemented yet")
-def test_update_condition_contains_same_attribute(test_table_s):
-    p = random_string()
+    # While both operands of contains may be item attributes, strangely
+    # it is explicitly forbidden to have the same attribute as both and
+    # trying to do so results in a ValidationException.
    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'a1': {'Value': 'hello', 'Action': 'PUT'},
-                          'a': {'Value': 'hello', 'Action': 'PUT'}})
+        AttributeUpdates={'a1': {'Value': 'hello', 'Action': 'PUT'}})
    test_table_s.update_item(Key={'p': p},
        UpdateExpression='SET z = :val',
            ConditionExpression='contains(a, a1)',
@@ -838,6 +774,7 @@ def test_update_condition_contains_same_attribute(test_table_s):
 # function whose return value needs to be further combined with another
 # operand using a comparison operation - and it isn't specified which is
 # supported.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_size(test_table_s):
    p = random_string()
    # First verify what size() returns for various types. We use only the
@@ -847,7 +784,7 @@ def test_update_condition_size(test_table_s):
                          'b': {'Value': set([2, 4, 7]), 'Action': 'PUT'},
                          'c': {'Value': [2, 'dog', 7], 'Action': 'PUT'},
                          'd': {'Value': b'hi there', 'Action': 'PUT'},
-                          'e': {'Value': {'x': 2, 'y': {'m': 3, 'n': 4}}, 'Action': 'PUT'},
+                          'e': {'Value': {'x': 2, 'y': 3}, 'Action': 'PUT'},
                          'f': {'Value': 5, 'Action': 'PUT'},
                          'g': {'Value': True, 'Action': 'PUT'},
                          'h': {'Value': None, 'Action': 'PUT'}})
@@ -932,70 +869,6 @@ def test_update_condition_size(test_table_s):
            ConditionExpression='size(a)>=:arg',
            ExpressionAttributeValues={':val': 11, ':arg': 2})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 11
-    # size() is only allowed one operand; More operands are allowed by the
-    # parser, but later result in an error:
-    with pytest.raises(ClientError, match='ValidationException.*2'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='size(a, a)=:arg',
-            ExpressionAttributeValues={':val': 1, ':arg': 5})
-
-# The above test tested conditions involving size() in a comparison.
-# Trying to use just size(a) as a condition (as we use the rest of the
-# functions supported by ConditionExpression) does not work - DynamoDB
-# reports # that "The function is not allowed to be used this way in an
-# expression; function: size".
-def test_update_condition_size_alone(test_table_s):
-    p = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='size(a)',
-            ExpressionAttributeValues={':val': 1})
-
-# Similarly, while attribute_exists(a) works alone, it cannot be used in
-# a comparison, e.g., attribute_exists(a) < 1 also causes DynamoDB to
-# complain about "The function is not allowed to be used in this way in an
-# expression.".
-def test_update_condition_attribute_exists_in_comparison(test_table_s):
-    p = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='attribute_exists(a) < :val',
-            ExpressionAttributeValues={':val': 1})
-
-# In essense, the size() function tested in the previous test behaves
-# exactly like the functions of UpdateExpressions, i.e., it transforms a
-# value (attribute from the item or the query) into a new value, which
-# can than be operated (in our case, compared). In this test we check
-# that other functions supported by UpdateExpression - if_not_exists()
-# and list_append() - are not supported.
-def test_update_condition_other_funcs(test_table_s):
-    p = random_string()
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'a': {'Value': 'hello', 'Action': 'PUT'}})
-    # dog() is an unknown function name:
-    with pytest.raises(ClientError, match='ValidationException.*function'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='dog(a)=:arg',
-            ExpressionAttributeValues={':val': 1, ':arg': 5})
-    # The functions if_not_exists() and list_append() are known functions
-    # (they are supported in UpdateExpression) but not allowed in
-    # ConditionExpression. This means we can have a single function for
-    # evaluation a parsed::value, but it needs to know whether it is
-    # called for a UpdateExpression or a ConditionExpression.
-    with pytest.raises(ClientError, match='ValidationException.*not allowed'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='if_not_exists(a, a)=:arg',
-            ExpressionAttributeValues={':val': 1, ':arg': 5})
-    with pytest.raises(ClientError, match='ValidationException.*not allowed'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET z = :val',
-            ConditionExpression='list_append(a, a)=:arg',
-            ExpressionAttributeValues={':val': 1, ':arg': 5})

 # All the previous tests involved top-level attributes to be tested. But
 # ConditionExpressions also allows reading nested attributes, and we should
@@ -1021,6 +894,7 @@ def test_update_condition_nested_attributes(test_table_s):
 # But the DynamoDB API also allows to refer to attributes using a #reference.
 # Among other things this allows using attribute names which are usually
 # reserved keywords in condition expressions.
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_attribute_reference(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1050,6 +924,7 @@ def test_update_condition_nested_attribute_reference(test_table_s):
 # precedence involved, and should be tested (see the definitions in
 # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Expressions.OperatorsAndFunctions.html

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_and(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1074,6 +949,7 @@ def test_update_condition_and(test_table_s):
                ConditionExpression='a < b AND c < b',
                ExpressionAttributeValues={':val': 1})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_or(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1103,6 +979,7 @@ def test_update_condition_or(test_table_s):
                ConditionExpression='b < a OR c < b',
                ExpressionAttributeValues={':val': 1})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_not(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1125,13 +1002,8 @@ def test_update_condition_not(test_table_s):
            UpdateExpression='SET z = :val',
                ConditionExpression='NOT a < b',
                ExpressionAttributeValues={':val': 1})
-    # NOT NOT NOT NOT also works (and does nothing) :-)
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :val',
-            ConditionExpression='NOT NOT NOT NOT a < b',
-            ExpressionAttributeValues={':val': 3})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 3

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_parentheses(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1147,6 +1019,7 @@ def test_update_condition_parentheses(test_table_s):
 # There is operator precedence that allows a user to use less parentheses.
 # We need to implement it correctly:

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_and_before_or(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1159,6 +1032,7 @@ def test_update_condition_and_before_or(test_table_s):
            ExpressionAttributeValues={':val': 1})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 1

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_not_before_and(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1171,6 +1045,7 @@ def test_update_condition_not_before_and(test_table_s):
                ConditionExpression='NOT a < b AND c < b',
                ExpressionAttributeValues={':val': 1})

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_between_before_and(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1185,6 +1060,7 @@ def test_update_condition_between_before_and(test_table_s):

 # An empty ConditionExpression is not allowed - resulting in a validation
 # error, not a failed condition:
+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_update_condition_empty(test_table_s):
    p = random_string()
    with pytest.raises(ClientError, match='ValidationException.*empty'):
@@ -1200,6 +1076,7 @@ def test_update_condition_empty(test_table_s):
 # used to test the condition. So we just need one test for each operation,
 # to verify that this code actually gets called.

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_delete_item_condition(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1214,6 +1091,7 @@ def test_delete_item_condition(test_table_s):
            ExpressionAttributeValues={':oldval': 1})
    assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)

+@pytest.mark.xfail(reason="ConditionExpression not yet implemented")
 def test_put_item_condition(test_table_s):
    p = random_string()
    test_table_s.update_item(Key={'p': p},
@@ -1226,162 +1104,3 @@ def test_put_item_condition(test_table_s):
        test_table_s.put_item(Item={'p': p, 'a': 3},
            ConditionExpression='a = :oldval',
            ExpressionAttributeValues={':oldval': 1})
-
-# DynamoDB frowns upon unused entries in ExpressionAttributeValues and
-# ExpressionAttributeNames. Check that we do too (in all three operations),
-# although it's not terribly important that we be compatible with DynamoDB
-# here...
-# There's one delicate issue, though. Should we check for unused entries
-# during parsing, or during evaluation? The stage we check this changes
-# our behavior when the condition was supposed to fail. So we have two
-# separate tests here, one for failed condition and one for successful.
-# Because Alternator does this check at a different stage from DynamoDB,
-# this test currently fails.
-@pytest.mark.xfail(reason="unused entries are checked too late")
-def test_update_condition_unused_entries_failed(test_table_s):
-    p = random_string()
-    # unused val3:
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET #name1 = :val1',
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val1': 1, ':val2': 2, ':val3': 3},
-            ExpressionAttributeNames={'#name1': 'a', '#name2': 'b'})
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.delete_item(Key={'p': p},
-            ConditionExpression='#name1 = :val1',
-            ExpressionAttributeValues={':val1': 1, ':val3': 3},
-            ExpressionAttributeNames={'#name1': 'a'})
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.put_item(Item={'p': p, 'a': 3},
-            ConditionExpression='#name1 = :val1',
-            ExpressionAttributeValues={':val1': 1, ':val3': 3},
-            ExpressionAttributeNames={'#name1': 'a'})
-    # unused name3:
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET #name1 = :val1',
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val1': 1, ':val2': 2},
-            ExpressionAttributeNames={'#name1': 'a', '#name2': 'b', '#name3': 'c'})
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.delete_item(Key={'p': p},
-            ConditionExpression='#name1 = :val1',
-            ExpressionAttributeValues={':val1': 1},
-            ExpressionAttributeNames={'#name1': 'a', '#name3': 'c'})
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.put_item(Item={'p': p, 'a': 3},
-            ConditionExpression='#name1 = :val1',
-            ExpressionAttributeValues={':val1': 1},
-            ExpressionAttributeNames={'#name1': 'a', '#name3': 'c'})
-def test_update_condition_unused_entries_succeeded(test_table_s):
-    p = random_string()
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'b': {'Value': 2, 'Action': 'PUT'}})
-    # unused val3:
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET #name1 = :val1',
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val1': 1, ':val2': 2, ':val3': 3},
-            ExpressionAttributeNames={'#name1': 'a', '#name2': 'b'})
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.delete_item(Key={'p': p},
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val2': 2, ':val3': 3},
-            ExpressionAttributeNames={'#name2': 'b'})
-    with pytest.raises(ClientError, match='ValidationException.*val3'):
-        test_table_s.put_item(Item={'p': p, 'a': 3},
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val2': 2, ':val3': 3},
-            ExpressionAttributeNames={'#name2': 'b'})
-    # unused name3:
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='SET #name1 = :val1',
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val1': 1, ':val2': 2},
-            ExpressionAttributeNames={'#name1': 'a', '#name2': 'b', '#name3': 'c'})
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.delete_item(Key={'p': p},
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val2': 2},
-            ExpressionAttributeNames={'#name2': 'b', '#name3': 'c'})
-    with pytest.raises(ClientError, match='ValidationException.*name3'):
-        test_table_s.put_item(Item={'p': p, 'a': 3},
-            ConditionExpression='#name2 = :val2',
-            ExpressionAttributeValues={':val2': 2},
-            ExpressionAttributeNames={'#name2': 'b', '#name3': 'c'})
-
-# Test a bunch of cases with permissive write isolation levels,
-# i.e. LWT_ALWAYS, LWT_RMW_ONLY and UNSAFE_RMW.
-# These test cases make sense only for alternator, so they're skipped
-# when run on AWS
-def test_condition_expression_with_permissive_write_isolation(scylla_only, dynamodb, test_table_s):
-    def do_test_with_permissive_isolation_levels(test_case, table, *args):
-        try:
-            for isolation in ['a', 'o', 'u']:
-                set_write_isolation(table, isolation)
-                test_case(table, *args)
-        finally:
-            clear_write_isolation(table)
-    for test_case in [test_update_condition_eq_success, test_update_condition_attribute_exists,
-                      test_delete_item_condition, test_put_item_condition, test_update_condition_attribute_reference]:
-        do_test_with_permissive_isolation_levels(test_case, test_table_s)
-
-# Test that the forbid_rmw isolation level prevents read-modify-write requests
-# from working. These test cases make sense only for alternator, so they're skipped
-# when run on AWS
-def test_condition_expression_with_forbidden_rmw(scylla_only, dynamodb, test_table_s):
-    def do_test_with_forbidden_rmw(test_case, table, *args):
-        try:
-            set_write_isolation(table, 'f')
-            test_case(table, *args)
-            assert False, "Expected an exception when running {}".format(test_case.__name__)
-        except ClientError:
-            pass
-        finally:
-            clear_write_isolation(table)
-    for test_case in [test_update_condition_eq_success, test_update_condition_attribute_exists,
-                      test_put_item_condition, test_update_condition_attribute_reference]:
-        do_test_with_forbidden_rmw(test_case, test_table_s)
-    # Ensure that regular writes (without rmw) work just fine
-    s = random_string()
-    test_table_s.put_item(Item={'p': s, 'regular': 'write'})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'regular': 'write'}
-    test_table_s.update_item(Key={'p': s}, AttributeUpdates={'write': {'Value': 'regular', 'Action': 'PUT'}})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'regular': 'write', 'write': 'regular'}
-
-# Reproducer for issue #6573: binary strings should be ordered as unsigned
-# bytes, i.e., byte 128 comes after 127, not before as with signed bytes.
-# Test the five ordering operators: <, <=, >, >=, between
-def test_condition_expression_unsigned_bytes(test_table_s):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'b': bytearray([127])})
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :newval',
-        ConditionExpression='b < :oldval',
-        ExpressionAttributeValues={':newval': 1, ':oldval': bytearray([128])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 1
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :newval',
-        ConditionExpression='b <= :oldval',
-        ExpressionAttributeValues={':newval': 2, ':oldval': bytearray([128])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 2
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :newval',
-        ConditionExpression='b between :oldval1 and :oldval2',
-        ExpressionAttributeValues={':newval': 3, ':oldval1': bytearray([126]), ':oldval2': bytearray([128])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 3
-
-    test_table_s.put_item(Item={'p': p, 'b': bytearray([128])})
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :newval',
-        ConditionExpression='b > :oldval',
-        ExpressionAttributeValues={':newval': 4, ':oldval': bytearray([127])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET z = :newval',
-        ConditionExpression='b >= :oldval',
-        ExpressionAttributeValues={':newval': 5, ':oldval': bytearray([127])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 5
--- a/alternator-test/test_describe_endpoints.py
+++ b/alternator-test/test_describe_endpoints.py
--- a/alternator-test/test_describe_table.py
+++ b/alternator-test/test_describe_table.py
@@ -141,6 +141,7 @@ def test_describe_table_stream_specification(test_table):
 # includes which zone it is on, which account, and of course the table's
 # name. The ARN format is described in
 # https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html#genref-arns
+@pytest.mark.xfail(reason="DescribeTable does not return ARN")
 def test_describe_table_arn(test_table):
    got = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
    assert 'TableArn' in got and got['TableArn'].startswith('arn:')
--- a/alternator-test/test_expected.py
+++ b/alternator-test/test_expected.py
@@ -1077,42 +1077,3 @@ def test_put_item_expected(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 2}
    with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
        test_table_s.put_item(Item={'p': p, 'a': 3}, Expected={'a': {'Value': 1}})
-
-# Reproducer for issue #6573: binary strings should be ordered as unsigned
-# bytes, i.e., byte 128 comes after 127, not before as with signed bytes.
-# Test the five ordering operators: LT, LE, GT, GE, BETWEEN
-def test_update_expected_unsigned_bytes(test_table_s):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'b': bytearray([127])})
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'z': {'Value': 1, 'Action': 'PUT'}},
-        Expected={'b': {'ComparisonOperator': 'LT',
-                        'AttributeValueList': [bytearray([128])]}}
-    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 1
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'z': {'Value': 2, 'Action': 'PUT'}},
-        Expected={'b': {'ComparisonOperator': 'LE',
-                        'AttributeValueList': [bytearray([128])]}}
-    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 2
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'z': {'Value': 3, 'Action': 'PUT'}},
-        Expected={'b': {'ComparisonOperator': 'BETWEEN',
-                        'AttributeValueList': [bytearray([126]), bytearray([128])]}}
-    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 3
-
-    test_table_s.put_item(Item={'p': p, 'b': bytearray([128])})
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'z': {'Value': 4, 'Action': 'PUT'}},
-        Expected={'b': {'ComparisonOperator': 'GT',
-                        'AttributeValueList': [bytearray([127])]}}
-    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
-    test_table_s.update_item(Key={'p': p},
-        AttributeUpdates={'z': {'Value': 5, 'Action': 'PUT'}},
-        Expected={'b': {'ComparisonOperator': 'GE',
-                        'AttributeValueList': [bytearray([127])]}}
-    )
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 5
--- a/alternator-test/test_gsi.py
+++ b/alternator-test/test_gsi.py
@@ -71,7 +71,7 @@ def test_gsi_identical(dynamodb):
    # results (in different order).
    assert multiset(items) == multiset(full_scan(table))
    assert_index_scan(table, 'hello', items)
-    # We can't scan a non-existent index
+    # We can't scan a non-existant index
    with pytest.raises(ClientError, match='ValidationException'):
        full_scan(table, IndexName='wrong')
    table.delete()
@@ -150,6 +150,7 @@ def test_gsi_missing_table(dynamodb):
        dynamodb.meta.client.scan(TableName='nonexistent_table', IndexName='any_name')

 # Verify that strongly-consistent reads on GSI are *not* allowed.
+@pytest.mark.xfail(reason="GSI strong consistency not checked")
 def test_gsi_strong_consistency(test_table_gsi_1):
    with pytest.raises(ClientError, match='ValidationException.*Consistent'):
        full_query(test_table_gsi_1, KeyConditions={'c': {'AttributeValueList': ['hi'], 'ComparisonOperator': 'EQ'}}, IndexName='hello', ConsistentRead=True)
--- a/alternator-test/test_health.py
+++ b/alternator-test/test_health.py
@@ -22,7 +22,7 @@ import requests
 # Test that a health check can be performed with a GET packet
 def test_health_works(dynamodb):
    url = dynamodb.meta.client._endpoint.host
-    response = requests.get(url, verify=False)
+    response = requests.get(url)
    assert response.ok
    assert response.content.decode('utf-8').strip()  == 'healthy: {}'.format(url.replace('https://', '').replace('http://', ''))

--- a/alternator-test/test_item.py
+++ b/alternator-test/test_item.py
@@ -0,0 +1,402 @@
+# Copyright 2019 ScyllaDB
+#
+# This file is part of Scylla.
+#
+# Scylla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Scylla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for the CRUD item operations: PutItem, GetItem, UpdateItem, DeleteItem
+
+import pytest
+from botocore.exceptions import ClientError
+from decimal import Decimal
+from util import random_string, random_bytes
+
+# Basic test for creating a new item with a random name, and reading it back
+# with strong consistency.
+# Only the string type is used for keys and attributes. None of the various
+# optional PutItem features (Expected, ReturnValues, ReturnConsumedCapacity,
+# ReturnItemCollectionMetrics, ConditionalOperator, ConditionExpression,
+# ExpressionAttributeNames, ExpressionAttributeValues) are used, and
+# for GetItem strong consistency is requested as well as all attributes,
+# but no other optional features (AttributesToGet, ReturnConsumedCapacity,
+# ProjectionExpression, ExpressionAttributeNames)
+def test_basic_string_put_and_get(test_table):
+    p = random_string()
+    c = random_string()
+    val = random_string()
+    val2 = random_string()
+    test_table.put_item(Item={'p': p, 'c': c, 'attribute': val, 'another': val2})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item['p'] == p
+    assert item['c'] == c
+    assert item['attribute'] == val
+    assert item['another'] == val2
+
+# Similar to test_basic_string_put_and_get, just uses UpdateItem instead of
+# PutItem. Because the item does not yet exist, it should work the same.
+def test_basic_string_update_and_get(test_table):
+    p = random_string()
+    c = random_string()
+    val = random_string()
+    val2 = random_string()
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'PUT'}, 'another': {'Value': val2, 'Action': 'PUT'}})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item['p'] == p
+    assert item['c'] == c
+    assert item['attribute'] == val
+    assert item['another'] == val2
+
+# Test put_item and get_item of various types for the *attributes*,
+# including both scalars as well as nested documents, lists and sets.
+# The full list of types tested here:
+#    number, boolean, bytes, null, list, map, string set, number set,
+#    binary set.
+# The keys are still strings.
+# Note that only top-level attributes are written and read in this test -
+# this test does not attempt to modify *nested* attributes.
+# See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/dynamodb.html
+# on how to pass these various types to Boto3's put_item().
+def test_put_and_get_attribute_types(test_table):
+    key = {'p': random_string(), 'c': random_string()}
+    test_items = [
+        Decimal("12.345"),
+        42,
+        True,
+        False,
+        b'xyz',
+        None,
+        ['hello', 'world', 42],
+        {'hello': 'world', 'life': 42},
+        {'hello': {'test': 'hi', 'hello': True, 'list': [1, 2, 'hi']}},
+        set(['hello', 'world', 'hi']),
+        set([1, 42, Decimal("3.14")]),
+        set([b'xyz', b'hi']),
+    ]
+    item = { str(i) : test_items[i] for i in range(len(test_items)) }
+    item.update(key)
+    test_table.put_item(Item=item)
+    got_item = test_table.get_item(Key=key, ConsistentRead=True)['Item']
+    assert item == got_item
+
+# The test_empty_* tests below verify support for empty items, with no
+# attributes except the key. This is a difficult case for Scylla, because
+# for an empty row to exist, Scylla needs to add a "CQL row marker".
+# There are several ways to create empty items - via PutItem, UpdateItem
+# and deleting attributes from non-empty items, and we need to check them
+# all, in several test_empty_* tests:
+def test_empty_put(test_table):
+    p = random_string()
+    c = random_string()
+    test_table.put_item(Item={'p': p, 'c': c})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item == {'p': p, 'c': c}
+def test_empty_put_delete(test_table):
+    p = random_string()
+    c = random_string()
+    test_table.put_item(Item={'p': p, 'c': c, 'hello': 'world'})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item == {'p': p, 'c': c}
+def test_empty_update(test_table):
+    p = random_string()
+    c = random_string()
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item == {'p': p, 'c': c}
+def test_empty_update_delete(test_table):
+    p = random_string()
+    c = random_string()
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Value': 'world', 'Action': 'PUT'}})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item == {'p': p, 'c': c}
+
+# Test error handling of UpdateItem passed a bad "Action" field.
+def test_update_bad_action(test_table):
+    p = random_string()
+    c = random_string()
+    val = random_string()
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'NONEXISTENT'}})
+
+# A more elaborate UpdateItem test, updating different attributes at different
+# times. Includes PUT and DELETE operations.
+def test_basic_string_more_update(test_table):
+    p = random_string()
+    c = random_string()
+    val1 = random_string()
+    val2 = random_string()
+    val3 = random_string()
+    val4 = random_string()
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val3, 'Action': 'PUT'}})
+    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Action': 'DELETE'}})
+    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
+    assert item['p'] == p
+    assert item['c'] == c
+    assert item['a1'] == val3
+    assert item['a2'] == val2
+    assert not 'a3' in item
+
+# Test that item operations on a non-existent table name fail with the correct
+# error code.
+def test_item_operations_nonexistent_table(dynamodb):
+    with pytest.raises(ClientError, match='ResourceNotFoundException'):
+        dynamodb.meta.client.put_item(TableName='non_existent_table',
+            Item={'a':{'S':'b'}})
+
+# Fetching a non-existent item. According to the DynamoDB doc, "If there is no
+# matching item, GetItem does not return any data and there will be no Item
+# element in the response."
+def test_get_item_missing_item(test_table):
+    p = random_string()
+    c = random_string()
+    assert not "Item" in test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)
+
+# Test that if we have a table with string hash and sort keys, we can't read
+# or write items with other key types to it.
+def test_put_item_wrong_key_type(test_table):
+    blob = random_bytes()
+    text = random_string()
+    number = Decimal("3.14")
+    # Sanity check: string hash and sort keys are accepted and read back
+    test_table.put_item(Item={'p': text, 'c': text})
+    assert test_table.get_item(Key={'p': text, 'c': text}, ConsistentRead=True)['Item'] == {'p': text, 'c': text}
+    # A hash key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'p': blob, 'c': text})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'p': number, 'c': text})
+    # A sort key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'p': text, 'c': blob})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'p': text, 'c': number})
+    # An item lacking the hash key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'c': text})
+    # An item lacking the sort key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.put_item(Item={'p': text})
+def test_update_item_wrong_key_type(test_table, test_table_s):
+    b = random_bytes()
+    s = random_string()
+    n = Decimal("3.14")
+    # Should succeed (correct key types); an empty update creates the item
+    test_table.update_item(Key={'p': s, 'c': s}, AttributeUpdates={})
+    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
+    # Should fail (incorrect hash key types)
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': b, 'c': s}, AttributeUpdates={})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': n, 'c': s}, AttributeUpdates={})
+    # Should fail (incorrect sort key types)
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': s, 'c': b}, AttributeUpdates={})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': s, 'c': n}, AttributeUpdates={})
+    # Should fail (missing hash key)
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'c': s}, AttributeUpdates={})
+    # Should fail (missing sort key)
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': s}, AttributeUpdates={})
+    # Should fail (spurious key columns) - exercised through UpdateItem itself
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.update_item(Key={'p': s, 'c': s, 'spurious': s}, AttributeUpdates={})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': s, 'c': s}, AttributeUpdates={})
+def test_get_item_wrong_key_type(test_table, test_table_s):
+    blob = random_bytes()
+    text = random_string()
+    number = Decimal("3.14")
+    # Well-typed key: the request is accepted, but nothing is stored under it
+    assert "Item" not in test_table.get_item(Key={'p': text, 'c': text}, ConsistentRead=True)
+    # A hash key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': blob, 'c': text})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': number, 'c': text})
+    # A sort key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': text, 'c': blob})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': text, 'c': number})
+    # A key lacking the hash key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'c': text})
+    # A key lacking the sort key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': text})
+    # A key carrying columns that are not part of the table's key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.get_item(Key={'p': text, 'c': text, 'spurious': text})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.get_item(Key={'p': text, 'c': text})
+def test_delete_item_wrong_key_type(test_table, test_table_s):
+    blob = random_bytes()
+    text = random_string()
+    number = Decimal("3.14")
+    # Well-typed key: the delete request is accepted
+    test_table.delete_item(Key={'p': text, 'c': text})
+    # A hash key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': blob, 'c': text})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': number, 'c': text})
+    # A sort key of a non-string type must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': text, 'c': blob})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': text, 'c': number})
+    # A key lacking the hash key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'c': text})
+    # A key lacking the sort key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': text})
+    # A key carrying columns that are not part of the table's key must be rejected
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table.delete_item(Key={'p': text, 'c': text, 'spurious': text})
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': text, 'c': text})
+
+# Most of the tests here arbitrarily used a table with both hash and sort keys
+# (both strings). Let's check that a table with *only* a hash key works ok
+# too, for PutItem, GetItem, and UpdateItem.
+def test_only_hash_key(test_table_s):
+    hash_key = random_string()
+    test_table_s.put_item(Item={'p': hash_key, 'hello': 'world'})
+    assert test_table_s.get_item(Key={'p': hash_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'hello': 'world'}
+    test_table_s.update_item(Key={'p': hash_key}, AttributeUpdates={'hi': {'Value': 'there', 'Action': 'PUT'}})
+    assert test_table_s.get_item(Key={'p': hash_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'hello': 'world', 'hi': 'there'}
+
+# Tests for item operations in tables with non-string hash or sort keys.
+# These tests focus only on the type of the key - everything else is as
+# simple as we can (string attributes, no special options for GetItem
+# and PutItem). These tests also focus on individual items only, and
+# not about the sort order of sort keys - this should be verified in
+# test_query.py, for example.
+def test_bytes_hash_key(test_table_b):
+    # Bytes values are passed using base64 encoding, which has weird cases
+    # depending on len%3 and len%4. So let's try various lengths.
+    for length in range(10,18):
+        p = random_bytes(length)
+        val = random_string()
+        test_table_b.put_item(Item={'p': p, 'attribute': val})
+        assert test_table_b.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'attribute': val}
+def test_bytes_sort_key(test_table_sb):
+    hash_key = random_string()
+    sort_key = random_bytes()
+    payload = random_string()
+    test_table_sb.put_item(Item={'p': hash_key, 'c': sort_key, 'attribute': payload})
+    assert test_table_sb.get_item(Key={'p': hash_key, 'c': sort_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'c': sort_key, 'attribute': payload}
+
+# Tests for using a large binary blob as hash key, sort key, or attribute.
+# DynamoDB strictly limits the size of the binary hash key to 2048 bytes,
+# and binary sort key to 1024 bytes, and refuses anything larger. The total
+# size of an item is limited to 400KB, which also limits the size of the
+# largest attributes. For more details on these limits, see
+# https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html
+# Alternator currently does *not* have these limitations, and can accept much
+# larger keys and attributes, but what we do in the following tests is to verify
+# that items up to DynamoDB's maximum sizes also work well in Alternator.
+def test_large_blob_hash_key(test_table_b):
+    key = {'p': random_bytes(2048)}
+    test_table_b.put_item(Item=key)
+    assert test_table_b.get_item(Key=key, ConsistentRead=True)['Item'] == key
+def test_large_blob_sort_key(test_table_sb):
+    hash_key = random_string()
+    sort_key = random_bytes(1024)
+    test_table_sb.put_item(Item={'p': hash_key, 'c': sort_key})
+    assert test_table_sb.get_item(Key={'p': hash_key, 'c': sort_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'c': sort_key}
+def test_large_blob_attribute(test_table):
+    hash_key = random_string()
+    sort_key = random_string()
+    big_blob = random_bytes(409500)  # just under the 400KB item size limit
+    test_table.put_item(Item={'p': hash_key, 'c': sort_key, 'attribute': big_blob })
+    assert test_table.get_item(Key={'p': hash_key, 'c': sort_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'c': sort_key, 'attribute': big_blob}
+
+# Checks that it is not allowed to use both old-style AttributeUpdates and
+# new-style UpdateExpression in a single UpdateItem request.
+def test_update_item_two_update_methods(test_table_s):
+    key = {'p': random_string()}
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key=key,
+            AttributeUpdates={'a': {'Value': 3, 'Action': 'PUT'}},
+            UpdateExpression='SET b = :val1',
+            ExpressionAttributeValues={':val1': 4})
+
+# Verify that having neither AttributeUpdates nor UpdateExpression is
+# allowed, and results in creation of an empty item.
+def test_update_item_no_update_method(test_table_s):
+    key = {'p': random_string()}
+    assert "Item" not in test_table_s.get_item(Key=key, ConsistentRead=True)
+    test_table_s.update_item(Key=key)
+    assert test_table_s.get_item(Key=key, ConsistentRead=True)['Item'] == key
+
+# Test GetItem with the AttributesToGet parameter. Result should include the
+# selected attributes only - if one wants the key attributes as well, one
+# needs to select them explicitly. When no key attributes are selected,
+# some items may have *none* of the selected attributes. Those items are
+# returned too, as empty items - they are not outright missing.
+def test_getitem_attributes_to_get(dynamodb, test_table):
+    stored = {'p': random_string(), 'c': random_string(), 'a': 'hello', 'b': 'hi'}
+    key = {'p': stored['p'], 'c': stored['c']}
+    test_table.put_item(Item=stored)
+    selections = [
+        ['a'],             # a single non-key attribute
+        ['c', 'a'],        # the sort key plus a non-key attribute
+        ['p', 'c'],        # exactly the key attributes
+        ['nonexistent'],   # an attribute the stored item lacks
+    ]
+    for names in selections:
+        response = test_table.get_item(Key=key, AttributesToGet=names, ConsistentRead=True)
+        assert {k: stored[k] for k in names if k in stored} == response['Item']
+
+# Basic test for DeleteItem, with hash key only
+def test_delete_item_hash(test_table_s):
+    key = {'p': random_string()}
+    test_table_s.put_item(Item=key)
+    assert 'Item' in test_table_s.get_item(Key=key, ConsistentRead=True)
+    test_table_s.delete_item(Key=key)
+    assert 'Item' not in test_table_s.get_item(Key=key, ConsistentRead=True)
+
+# Basic test for DeleteItem, with hash and sort key
+def test_delete_item_sort(test_table):
+    hash_key = random_string()
+    sort_key = random_string()
+    full_key = {'p': hash_key, 'c': sort_key}
+    test_table.put_item(Item=full_key)
+    assert 'Item' in test_table.get_item(Key=full_key, ConsistentRead=True)
+    test_table.delete_item(Key=full_key)
+    assert 'Item' not in test_table.get_item(Key=full_key, ConsistentRead=True)
+
+# Test that PutItem completely replaces an existing item. It shouldn't merge
+# it with a previously existing value, as UpdateItem does!
+# We test for a table with just hash key, and for a table with both hash and
+# sort keys.
+def test_put_item_replace(test_table_s, test_table):
+    hash_key = random_string()
+    test_table_s.put_item(Item={'p': hash_key, 'a': 'hi'})
+    assert test_table_s.get_item(Key={'p': hash_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'a': 'hi'}
+    test_table_s.put_item(Item={'p': hash_key, 'b': 'hello'})
+    assert test_table_s.get_item(Key={'p': hash_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'b': 'hello'}
+    sort_key = random_string()
+    test_table.put_item(Item={'p': hash_key, 'c': sort_key, 'a': 'hi'})
+    assert test_table.get_item(Key={'p': hash_key, 'c': sort_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'c': sort_key, 'a': 'hi'}
+    test_table.put_item(Item={'p': hash_key, 'c': sort_key, 'b': 'hello'})
+    assert test_table.get_item(Key={'p': hash_key, 'c': sort_key}, ConsistentRead=True)['Item'] == {'p': hash_key, 'c': sort_key, 'b': 'hello'}
--- a/alternator-test/test_lsi.py
+++ b/alternator-test/test_lsi.py
@@ -26,23 +26,26 @@ import time
 from botocore.exceptions import ClientError, ParamValidationError
 from util import create_test_table, random_string, full_scan, full_query, multiset, list_tables

-# LSIs support strongly-consistent reads, so the following functions do not
-# need to retry like we did in test_gsi.py for GSIs:
+# Currently, Alternator's LSIs only support eventually consistent reads, so tests
+# that involve writing to a table and then expect to read something from it cannot
+# be guaranteed to succeed without retrying the read. The following utility
+# functions make it easy to write such tests.
 def assert_index_query(table, index_name, expected_items, **kwargs):
-    assert multiset(expected_items) == multiset(full_query(table, IndexName=index_name, ConsistentRead=True, **kwargs))
-def assert_index_scan(table, index_name, expected_items, **kwargs):
-    assert multiset(expected_items) == multiset(full_scan(table, IndexName=index_name, ConsistentRead=True, **kwargs))
-
-# A version doing retries instead of ConsistentRead, to be used just for the
-# one test below which has both GSI and LSI:
-def retrying_assert_index_query(table, index_name, expected_items, **kwargs):
    for i in range(3):
        if multiset(expected_items) == multiset(full_query(table, IndexName=index_name, **kwargs)):
            return
-        print('retrying_assert_index_query retrying')
+        print('assert_index_query retrying')
        time.sleep(1)
    assert multiset(expected_items) == multiset(full_query(table, IndexName=index_name, **kwargs))

+def assert_index_scan(table, index_name, expected_items, **kwargs):
+    for i in range(3):
+        if multiset(expected_items) == multiset(full_scan(table, IndexName=index_name, **kwargs)):
+            return
+        print('assert_index_scan retrying')
+        time.sleep(1)
+    assert multiset(expected_items) == multiset(full_scan(table, IndexName=index_name, **kwargs))
+
 # Although quite silly, it is actually allowed to create an index which is
 # identical to the base table.
 def test_lsi_identical(dynamodb):
@@ -63,7 +66,7 @@ def test_lsi_identical(dynamodb):
    # results (in different order).
    assert multiset(items) == multiset(full_scan(table))
    assert_index_scan(table, 'hello', items)
-    # We can't scan a non-existent index
+    # We can't scan a non-existant index
    with pytest.raises(ClientError, match='ValidationException'):
        full_scan(table, IndexName='wrong')
    table.delete()
@@ -299,11 +302,13 @@ def test_lsi_consistent_read(test_table_lsi_1):
    expected_items = [i for i in items if i['p'] == p1 and i['b'] == b1]
    assert_index_query(test_table_lsi_1, 'hello', expected_items,
        KeyConditions={'p': {'AttributeValueList': [p1], 'ComparisonOperator': 'EQ'},
-                       'b': {'AttributeValueList': [b1], 'ComparisonOperator': 'EQ'}})
+                       'b': {'AttributeValueList': [b1], 'ComparisonOperator': 'EQ'}},
+        ConsistentRead=True)
    expected_items = [i for i in items if i['p'] == p2 and i['b'] == b2]
    assert_index_query(test_table_lsi_1, 'hello', expected_items,
        KeyConditions={'p': {'AttributeValueList': [p2], 'ComparisonOperator': 'EQ'},
-                       'b': {'AttributeValueList': [b2], 'ComparisonOperator': 'EQ'}})
+                       'b': {'AttributeValueList': [b2], 'ComparisonOperator': 'EQ'}},
+        ConsistentRead=True)

 # A table with both gsi and lsi present
@pytest.fixture(scope="session")
@@ -355,6 +360,6 @@ def test_lsi_and_gsi(test_table_lsi_gsi):

    for index in ['hello_g1', 'hello_l1']:
        expected_items = [i for i in items if i['p'] == p1 and i['x1'] == x1]
-        retrying_assert_index_query(test_table_lsi_gsi, index, expected_items,
+        assert_index_query(test_table_lsi_gsi, index, expected_items,
            KeyConditions={'p': {'AttributeValueList': [p1], 'ComparisonOperator': 'EQ'},
                           'x1': {'AttributeValueList': [x1], 'ComparisonOperator': 'EQ'}})
--- a/alternator-test/test_nested.py
+++ b/alternator-test/test_nested.py
--- a/alternator-test/test_projection_expression.py
+++ b/alternator-test/test_projection_expression.py
@@ -134,10 +134,10 @@ def test_projection_expression_path(test_table_s):
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[0]')['Item'] == {'a': {'b': [2]}}
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[2]')['Item'] == {'a': {'b': [{'x': 'hi', 'y': 'yo'}]}}
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[2].y')['Item'] == {'a': {'b': [{'y': 'yo'}]}}
-    # Trying to read any sort of non-existent attribute returns an empty item.
+    # Trying to read any sort of non-existant attribute returns an empty item.
    # This includes a non-existing top-level attribute, an attempt to read
-    # beyond the end of an array or a non-existent member of a dictionary, as
-    # well as paths which begin with a non-existent prefix.
+    # beyond the end of an array or a non-existant member of a dictionary, as
+    # well as paths which begin with a non-existant prefix.
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='x')['Item'] == {}
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.b[3]')['Item'] == {}
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True, ProjectionExpression='a.x')['Item'] == {}
--- a/alternator-test/test_query.py
+++ b/alternator-test/test_query.py
@@ -100,13 +100,16 @@ def test_query_basic_restrictions(dynamodb, filled_test_table):
    print(got_items)
    assert multiset([item for item in items if item['p'] == 'long' and item['c'].startswith('11')]) == multiset(got_items)

-def test_query_nonexistent_table(dynamodb):
-    client = dynamodb.meta.client
-    with pytest.raises(ClientError, match="ResourceNotFoundException"):
-        client.query(TableName="i_do_not_exist", KeyConditions={
-            'p' : {'AttributeValueList': ['long'], 'ComparisonOperator': 'EQ'},
-            'c' : {'AttributeValueList': ['11'], 'ComparisonOperator': 'BEGINS_WITH'}
-        })
+# Test that KeyConditionExpression parameter is supported
+@pytest.mark.xfail(reason="KeyConditionExpression not supported yet")
+def test_query_key_condition_expression(dynamodb, filled_test_table):
+    table, all_items = filled_test_table
+    query_pages = dynamodb.meta.client.get_paginator('query')
+    condition = Key("p").eq("long") & Key("c").lt("12")
+    found = [it for page in query_pages.paginate(TableName=table.name, KeyConditionExpression=condition) for it in page['Items']]
+    print(found)
+    wanted = [item for item in all_items if item['p'] == 'long' and item['c'] < '12']
+    assert multiset(wanted) == multiset(found)

 def test_begins_with(dynamodb, test_table):
    paginator = dynamodb.meta.client.get_paginator('query')
@@ -459,6 +462,7 @@ def test_query_limit_paging(test_table_sn):
 # return items sorted in reverse order. Combining this with Limit can
 # be used to return the last items instead of the first items of the
 # partition.
+@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
 def test_query_reverse(test_table_sn):
    numbers = [Decimal(i) for i in range(20)]
    # Insert these numbers, in random order, into one partition:
@@ -493,6 +497,7 @@ def test_query_reverse(test_table_sn):

 # Test that paging also works properly with reverse order
 # (ScanIndexForward=false), i.e., reverse-order queries can be resumed
+@pytest.mark.xfail(reason="ScanIndexForward not supported yet")
 def test_query_reverse_paging(test_table_sn):
    numbers = [Decimal(i) for i in range(20)]
    # Insert these numbers, in random order, into one partition:
@@ -509,11 +514,3 @@ def test_query_reverse_paging(test_table_sn):
        got_items = full_query(test_table_sn, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, ScanIndexForward=False, Limit=limit)
        got_sort_keys = [x['c'] for x in got_items]
        assert got_sort_keys == reversed_numbers
-
-# A query without a KeyConditions or KeyConditionExpress is, or an empty
-# one, is obviously not allowed:
-def test_query_missing_key(test_table):
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table, KeyConditions={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        full_query(test_table)
--- a/alternator-test/test_returnvalues.py
+++ b/alternator-test/test_returnvalues.py
@@ -0,0 +1,226 @@
+# Copyright 2019 ScyllaDB
+#
+# This file is part of Scylla.
+#
+# Scylla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Scylla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+
+# Tests for the ReturnValues parameter for the different update operations
+# (PutItem, UpdateItem, DeleteItem).
+
+import pytest
+from botocore.exceptions import ClientError
+from util import random_string
+
+# Test trivial support for the ReturnValues parameter in PutItem, UpdateItem
+# and DeleteItem - test that "NONE" works (and changes nothing), while a
+# completely unsupported value gives an error.
+# This test is useful to check that before the ReturnValues parameter is fully
+# implemented, it returns an error when a still-unsupported ReturnValues
+# option is attempted in the request - instead of simply being ignored.
+def test_trivial_returnvalues(test_table_s):
+    # PutItem: 'NONE' is accepted and yields no Attributes; 'DOG' is rejected
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi'})
+    response = test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='NONE')
+    assert 'Attributes' not in response
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='DOG')
+    # UpdateItem: same two checks
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='NONE',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert 'Attributes' not in response
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': pk}, ReturnValues='DOG',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
+    # DeleteItem: same two checks
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi'})
+    response = test_table_s.delete_item(Key={'p': pk}, ReturnValues='NONE')
+    assert 'Attributes' not in response
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': pk}, ReturnValues='DOG')
+
+# Test the ReturnValues parameter on a PutItem operation. Only two settings
+# are supported for this parameter for this operation: NONE (the default)
+# and ALL_OLD.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_put_item_returnvalues(test_table_s):
+    # Without ReturnValues, the overwritten item is not returned:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi'})
+    response = test_table_s.put_item(Item={'p': pk, 'a': 'hello'})
+    assert 'Attributes' not in response
+    # An explicit ReturnValues=NONE behaves the same way:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi'})
+    response = test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='NONE')
+    assert 'Attributes' not in response
+    # ReturnValues=ALL_OLD returns the item's previous content under
+    # the "Attributes" key of the response:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi'})
+    response = test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='ALL_OLD')
+    assert response['Attributes'] == {'p': pk, 'a': 'hi'}
+    # The remaining options - UPDATED_OLD, ALL_NEW, UPDATED_NEW - exist
+    # for other operations but must be refused by PutItem:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='UPDATED_OLD')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='ALL_NEW')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='UPDATED_NEW')
+    # An unknown setting such as "DOG" must result in an error as well:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='DOG')
+    # ReturnValues is case sensitive: "NONE" is accepted (checked above)
+    # while "none" is not:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.put_item(Item={'p': pk, 'a': 'hello'}, ReturnValues='none')
+
+# Test the ReturnValues parameter on a DeleteItem operation. Only two settings
+# are supported for this parameter for this operation: NONE (the default)
+# and ALL_OLD.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_delete_item_returnvalues(test_table_s):
+    # By default, the previous value of an item is not returned:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p})
+    assert not 'Attributes' in ret
+    # Using ReturnValues=NONE is the same:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
+    assert not 'Attributes' in ret
+    # With ReturnValues=ALL_OLD, the old value of the item is returned
+    # in an "Attributes" attribute:
+    p = random_string()
+    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
+    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_OLD')
+    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
+    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
+    # are supported by other operations but not by DeleteItem:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATED_OLD')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_NEW')
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATED_NEW')
+    # Also, obviously, a non-supported setting "DOG" also results in an error:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
+    # The ReturnValues value is case sensitive, so while "NONE" is supported
+    # (and tested above), "none" isn't:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.delete_item(Key={'p': p}, ReturnValues='none')
+
+# Test the ReturnValues parameter on a UpdateItem operation. All five
+# settings are supported for this parameter for this operation: NONE
+# (the default), ALL_OLD, UPDATED_OLD, ALL_NEW and UPDATED_NEW.
+@pytest.mark.xfail(reason="ReturnValues not supported")
+def test_update_item_returnvalues(test_table_s):
+    # Without ReturnValues, nothing about the item is returned:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk},
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert 'Attributes' not in response
+
+    # An explicit ReturnValues=NONE behaves the same way:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='NONE',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert 'Attributes' not in response
+
+    # ReturnValues=ALL_OLD returns the whole previous item, including the
+    # attributes the update left alone, under the "Attributes" key:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='ALL_OLD',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert response['Attributes'] == {'p': pk, 'a': 'hi', 'b': 'dog'}
+
+    # ReturnValues=UPDATED_OLD returns just the previous values of the
+    # attributes that the update overwrote:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='SET b = :val, c = :val2',
+        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
+    assert response['Attributes'] == {'b': 'dog'}
+    # Overwriting an attribute with the value it already holds still
+    # counts as an update, so the old value (equal to the new one)
+    # is returned:
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert response['Attributes'] == {'b': 'cat'}
+    # Removing an attribute counts as overwriting it too:
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_OLD',
+        UpdateExpression='REMOVE b')
+    assert response['Attributes'] == {'b': 'cat'}
+
+    # ReturnValues=ALL_NEW returns the whole item as it is after the
+    # update, including attributes the update left alone:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='ALL_NEW',
+        UpdateExpression='SET b = :val',
+        ExpressionAttributeValues={':val': 'cat'})
+    assert response['Attributes'] == {'p': pk, 'a': 'hi', 'b': 'cat'}
+
+    # ReturnValues=UPDATED_NEW returns only the new values of the
+    # attributes the update set. An attribute counts as "updated"
+    # even when it had no previous value (like 'c' below), so it is
+    # returned as well.
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 'hi', 'b': 'dog'})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='SET b = :val, c = :val2',
+        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
+    assert response['Attributes'] == {'b': 'cat', 'c': 'hello'}
+    # Removing an attribute counts as overwriting it, but a removed attribute
+    # has no new value to report - so the response has no Attributes here.
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='REMOVE b')
+    assert 'Attributes' not in response
+    # In the cases above UPDATED_NEW only echoes values the request already
+    # contained. It is more useful for read-modify-write updates, where
+    # the new value is computed from the old one:
+    pk = random_string()
+    test_table_s.put_item(Item={'p': pk, 'a': 1})
+    response = test_table_s.update_item(Key={'p': pk}, ReturnValues='UPDATED_NEW',
+        UpdateExpression='SET a = a + :val',
+        ExpressionAttributeValues={':val': 1})
+    assert response['Attributes'] == {'a': 2}
+
+    # An unknown setting such as "DOG" must result in an error:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': pk}, ReturnValues='DOG',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
+    # ReturnValues is case sensitive: "NONE" is accepted (checked above)
+    # while "none" is not:
+    with pytest.raises(ClientError, match='ValidationException'):
+        test_table_s.update_item(Key={'p': pk}, ReturnValues='none',
+            UpdateExpression='SET a = a + :val',
+            ExpressionAttributeValues={':val': 1})
--- a/alternator-test/test_scan.py
+++ b/alternator-test/test_scan.py
@@ -42,11 +42,6 @@ def test_scan_basic(filled_test_table):
        assert len(items) == len(got_items)
        assert multiset(items) == multiset(got_items)

-def test_scan_nonexistent_table(dynamodb):
-    client = dynamodb.meta.client
-    with pytest.raises(ClientError, match="ResourceNotFoundException"):
-        client.scan(TableName="i_do_not_exist")
-
 def test_scan_with_paginator(dynamodb, filled_test_table):
    test_table, items = filled_test_table
    paginator = dynamodb.meta.client.get_paginator('scan')
@@ -244,6 +239,7 @@ def test_scan_select(filled_test_table):
 # a scan into multiple parts, and that these parts are in fact disjoint,
 # and their union is the entire contents of the table. We do not actually
 # try to run these queries in *parallel* in this test.
+@pytest.mark.xfail(reason="parallel scan not supported yet")
 def test_scan_parallel(filled_test_table):
    test_table, items = filled_test_table
    for nsegments in [1, 2, 17]:
@@ -254,14 +250,3 @@ def test_scan_parallel(filled_test_table):
        # The following comparison verifies that each of the expected item
        # in items was returned in one - and just one - of the segments.
        assert multiset(items) == multiset(got_items)
-
-# Test correct handling of incorrect parallel scan parameters.
-# Most of the corner cases (like TotalSegments=0) are validated
-# by boto3 itself, but some checks can still be performed.
-def test_scan_parallel_incorrect(filled_test_table):
-    test_table, items = filled_test_table
-    with pytest.raises(ClientError, match='ValidationException.*Segment'):
-        full_scan(test_table, TotalSegments=1000001, Segment=0)
-    for segment in [7, 9]:
-        with pytest.raises(ClientError, match='ValidationException.*Segment'):
-            full_scan(test_table, TotalSegments=5, Segment=segment)
--- a/alternator-test/test_table.py
+++ b/alternator-test/test_table.py
@@ -74,11 +74,6 @@ def create_and_delete_table(dynamodb, name, **kwargs):
 def test_create_and_delete_table(dynamodb):
    create_and_delete_table(dynamodb, 'alternator_test')

-# Test that recreating a table right after deleting it works without issues
-def test_recreate_table(dynamodb):
-    create_and_delete_table(dynamodb, 'alternator_recr_test')
-    create_and_delete_table(dynamodb, 'alternator_recr_test')
-
 # DynamoDB documentation specifies that table names must be 3-255 characters,
 # and match the regex [a-zA-Z0-9._-]+. Names not matching these rules should
 # be rejected, and no table be created.
@@ -232,35 +227,6 @@ def test_create_table_billing_mode_errors(dynamodb, test_table):
            KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
            AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }])

-# Even before Alternator gains full support for the DynamoDB stream API
-# and CreateTable's StreamSpecification option, we should support the
-# options which mean it is turned *off*.
-def test_table_streams_off(dynamodb):
-    # If StreamSpecification is given, but has StreamEnabled=false, it's as
-    # if StreamSpecification was missing. StreamViewType isn't needed.
-    table = create_test_table(dynamodb, StreamSpecification={'StreamEnabled': False},
-        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
-        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
-    table.delete();
-    # DynamoDB doesn't allow StreamSpecification to be empty map - if it
-    # exists, it must have a StreamEnabled
-    # Unfortunately, new versions of boto3 doesn't let us pass this...
-    #with pytest.raises(ClientError, match='ValidationException'):
-    #    table = create_test_table(dynamodb, StreamSpecification={},
-    #        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
-    #        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
-    #    table.delete();
-    # Unfortunately, boto3 doesn't allow us to pass StreamSpecification=None.
-    # This is what we had in issue #5796.
-
-@pytest.mark.xfail(reason="streams not yet implemented")
-def test_table_streams_on(dynamodb):
-    table = create_test_table(dynamodb,
-        StreamSpecification={'StreamEnabled': True, 'StreamViewType': 'OLD_IMAGE'},
-        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
-        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
-    table.delete();
-
 # Our first implementation had a special column name called "attrs" where
 # we stored a map for all non-key columns. If the user tried to name one
 # of the key columns with this same name, the result was a disaster - Scylla
--- a/alternator-test/test_update_expression.py
+++ b/alternator-test/test_update_expression.py
@@ -370,7 +370,7 @@ def test_update_expression_cannot_modify_key(test_table):

 # Test that trying to start an expression with some nonsense like HELLO
 # instead of SET, REMOVE, ADD or DELETE, fails.
-def test_update_expression_non_existent_clause(test_table_s):
+def test_update_expression_non_existant_clause(test_table_s):
    p = random_string()
    with pytest.raises(ClientError, match='ValidationException'):
        test_table_s.update_item(Key={'p': p},
@@ -717,28 +717,10 @@ def test_update_expression_delete_sets(test_table_s):
        UpdateExpression='DELETE a :val1',
        ExpressionAttributeValues={':val1': set(['pig'])})
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == set(['dog'])
-    # Deleting all the elements cannot leave an empty set (which isn't
-    # supported). Rather, it deletes the attribute altogether:
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='DELETE a :val1',
-        ExpressionAttributeValues={':val1': set(['dog'])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hi'}
-    # Deleting elements from a non-existent attribute is allowed, and
-    # simply does nothing:
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='DELETE a :val1',
-        ExpressionAttributeValues={':val1': set(['dog'])})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hi'}
-    # An empty set parameter is not allowed
-    with pytest.raises(ClientError, match='ValidationException.*empty'):
-        test_table_s.update_item(Key={'p': p},
-            UpdateExpression='DELETE a :val1',
-            ExpressionAttributeValues={':val1': set([])})
    # The value to be deleted must be a set of the same type - it can't
    # be a single element or anything else. If the value has the wrong type,
    # we get an error like "Invalid UpdateExpression: Incorrect operand type
    # for operator or function; operator: DELETE, operand type: STRING".
-    test_table_s.put_item(Item={'p': p, 'a': set(['dog', 'cat', 'mouse']), 'b': 'hi'})
    with pytest.raises(ClientError, match='ValidationException.*type'):
        test_table_s.update_item(Key={'p': p},
            UpdateExpression='DELETE a :val1',
@@ -870,25 +852,3 @@ def test_nested_attribute_update_bad_path_array(test_table_s):
    with pytest.raises(ClientError, match='ValidationException.*path'):
        test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a[0] = :val1',
            ExpressionAttributeValues={':val1': 7})
-
-# DynamoDB Does not allow empty strings, empty byte arrays, or empty sets.
-# Trying to ask UpdateItem to put one of these in an attribute should be
-# forbidden. Empty lists and maps *are* allowed.
-# Note that in test_item.py::test_update_item_empty_attribute we checked
-# this with the AttributeUpdates syntax. Here we check the same with the
-# UpdateExpression syntax.
-def test_update_expression_empty_attribute(test_table_s):
-    p = random_string()
-    # Empty string, byte array and set are *not* allowed
-    for v in ['', bytearray('', 'utf-8'), set()]:
-        with pytest.raises(ClientError, match='ValidationException.*empty'):
-            test_table_s.update_item(Key={'p': p},
-                UpdateExpression='SET a = :v',
-                ExpressionAttributeValues={':v': v})
-    assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    # But empty lists and maps *are* allowed:
-    test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET d = :v1, e = :v2',
-        ExpressionAttributeValues={':v1': [], ':v2': {}})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'd': [], 'e': {}}
-#
--- a/alternator-test/util.py
+++ b/alternator-test/util.py
@@ -84,9 +84,8 @@ def freeze(item):
 def multiset(items):
    return collections.Counter([freeze(item) for item in items])

-# NOTE: alternator_Test prefix contains a capital letter on purpose,
-#in order to validate case sensitivity in alternator
-test_table_prefix = 'alternator_Test_'
+
+test_table_prefix = 'alternator_test_'
 def test_table_name():
    current_ms = int(round(time.time() * 1000))
    # In the off chance that test_table_name() is called twice in the same millisecond...
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -130,7 +130,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us

    auto cl = auth::password_authenticator::consistency_for_user(username);
    auto timeout = auth::internal_distributed_timeout_config();
-    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
+    return qp.process(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -30,11 +30,6 @@
 #include "serialization.hh"
 #include "base64.hh"
 #include <stdexcept>
-#include <boost/algorithm/cxx11/all_of.hpp>
-#include <boost/algorithm/cxx11/any_of.hpp>
-#include "utils/overloaded_functor.hh"
-
-#include "expressions_eval.hh"

 namespace alternator {

@@ -76,7 +71,7 @@ static ::shared_ptr<cql3::restrictions::single_column_restriction::contains> mak
 }

 static ::shared_ptr<cql3::restrictions::single_column_restriction::EQ> make_key_eq_restriction(const column_definition& cdef, const rjson::value& value) {
-    bytes raw_value = get_key_from_typed_value(value, cdef);
+    bytes raw_value = get_key_from_typed_value(value, cdef, type_to_string(cdef.type));
    auto restriction_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_value)));
    return make_shared<cql3::restrictions::single_column_restriction::EQ>(cdef, std::move(restriction_value));
 }
@@ -230,12 +225,16 @@ static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
    }
 }

+static std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength());
+}
+
 static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
    return (type2 == "S" && type1 == "SS") || (type2 == "N" && type1 == "NS") || (type2 == "B" && type1 == "BS");
 }

 // Check if two JSON-encoded values match with the CONTAINS relation
-bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+static bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
    if (!v1) {
        return false;
    }
@@ -247,7 +246,7 @@ bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
                               "got {} instead", kv2.name));
    }
    if (kv1.name == "S" && kv2.name == "S") {
-        return rjson::to_string_view(kv1.value).find(rjson::to_string_view(kv2.value)) != std::string_view::npos;
+        return to_string_view(kv1.value).find(to_string_view(kv2.value)) != std::string_view::npos;
    } else if (kv1.name == "B" && kv2.name == "B") {
        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
    } else if (is_set_of(kv1.name, kv2.name)) {
@@ -307,19 +306,6 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    return have_match;
 }

-// Another variant of check_IN, this one for ConditionExpression. It needs to
-// check whether the first element in the given vector is equal to any of the
-// others.
-static bool check_IN(const std::vector<rjson::value>& array) {
-    const rjson::value* first = &array[0];
-    for (unsigned i = 1; i < array.size(); i++) {
-        if (check_EQ(first, array[i])) {
-            return true;
-        }
-    }
-    return false;
-}
-
 static bool check_NULL(const rjson::value* val) {
    return val == nullptr;
 }
@@ -365,35 +351,31 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
-    // We cannot use the normal comparison operators like "<" on the bytes
-    // type, because they treat individual bytes as signed but we need to
-    // compare them as *unsigned*. So we need a specialization for bytes.
-    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
    static constexpr const char* diagnostic = "LT operator";
 };

 struct cmp_le {
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
-    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
+    // bytes only has <, so we cannot use <=.
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
    static constexpr const char* diagnostic = "LE operator";
 };

 struct cmp_ge {
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
-    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
+    // bytes only has <, so we cannot use >=.
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
-    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
+    // bytes only has <, so we cannot use >.
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
    static constexpr const char* diagnostic = "GT operator";
 };

 // True if v is between lb and ub, inclusive.  Throws if lb > ub.
 template <typename T>
 bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
-    if (cmp_lt()(ub, lb)) {
+    if (ub < lb) {
        throw api_error("ValidationException",
                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
    }
@@ -523,15 +505,16 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
    }
 }

-// Check if the existing values of the item (previous_item) match the
+// Verify that the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function can throw an ValidationException API error if there
+// This function will throw a ConditionalCheckFailedException API error
+// if the values do not match the condition, or ValidationException if there
 // are errors in the format of the condition itself.
-bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
+void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
    if (!expected) {
-        return true;
+        return;
    }
    if (!expected->IsObject()) {
        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
@@ -560,123 +543,22 @@ bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value
    for (auto it = expected->MemberBegin(); it != expected->MemberEnd(); ++it) {
        const rjson::value* got = nullptr;
        if (previous_item && previous_item->IsObject() && previous_item->HasMember("Item")) {
-            got = rjson::find((*previous_item)["Item"], rjson::to_string_view(it->name));
+            got = rjson::find((*previous_item)["Item"], rjson::string_ref_type(it->name.GetString()));
        }
        bool success = verify_expected_one(it->value, got);
        if (success && !require_all) {
            // When !require_all, one success is enough!
-            return true;
+            return;
        } else if (!success && require_all) {
            // When require_all, one failure is enough!
-            return false;
+            throw api_error("ConditionalCheckFailedException", "Failed condition.");
        }
    }
    // If we got here and require_all, none of the checks failed, so succeed.
    // If we got here and !require_all, all of the checks failed, so fail.
-    return require_all;
-}
-
-bool calculate_primitive_condition(const parsed::primitive_condition& cond,
-        std::unordered_set<std::string>& used_attribute_values,
-        std::unordered_set<std::string>& used_attribute_names,
-        const rjson::value& req,
-        schema_ptr schema,
-        const std::unique_ptr<rjson::value>& previous_item) {
-    std::vector<rjson::value> calculated_values;
-    calculated_values.reserve(cond._values.size());
-    for (const parsed::value& v : cond._values) {
-        calculated_values.push_back(calculate_value(v,
-                cond._op == parsed::primitive_condition::type::VALUE ?
-                        calculate_value_caller::ConditionExpressionAlone :
-                        calculate_value_caller::ConditionExpression,
-                rjson::find(req, "ExpressionAttributeValues"),
-                used_attribute_names, used_attribute_values,
-                req, schema, previous_item));
-    }
-    switch (cond._op) {
-    case parsed::primitive_condition::type::BETWEEN:
-        if (calculated_values.size() != 3) {
-            // Shouldn't happen unless we have a bug in the parser
-            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
-        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
-    case parsed::primitive_condition::type::IN:
-        return check_IN(calculated_values);
-    case parsed::primitive_condition::type::VALUE:
-        if (calculated_values.size() != 1) {
-            // Shouldn't happen unless we have a bug in the parser
-            throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
-        }
-        // Unwrap the boolean wrapped as the value (if it is a boolean)
-        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
-            auto it = calculated_values[0].MemberBegin();
-            if (it->name == "BOOL" && it->value.IsBool()) {
-                return it->value.GetBool();
-            }
-        }
-        throw api_error("ValidationException",
-                format("ConditionExpression: condition results in a non-boolean value: {}",
-                        calculated_values[0]));
-    default:
-        // All the rest of the operators have exactly two parameters (and unless
-        // we have a bug in the parser, that's what we have in the parsed object:
-        if (calculated_values.size() != 2) {
-            throw std::logic_error(format("Wrong number of values {} in primitive_condition object", cond._values.size()));
-        }
-    }
-    switch (cond._op) {
-    case parsed::primitive_condition::type::EQ:
-        return check_EQ(&calculated_values[0], calculated_values[1]);
-    case parsed::primitive_condition::type::NE:
-        return check_NE(&calculated_values[0], calculated_values[1]);
-    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
-    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
-    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
-    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
-    default:
-        // Shouldn't happen unless we have a bug in the parser
-        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
+    if (!require_all) {
+        throw api_error("ConditionalCheckFailedException", "None of ORed Expect conditions were successful.");
    }
 }

-// Check if the existing values of the item (previous_item) match the
-// conditions given by the given parsed ConditionExpression.
-bool verify_condition_expression(
-        const parsed::condition_expression& condition_expression,
-        std::unordered_set<std::string>& used_attribute_values,
-        std::unordered_set<std::string>& used_attribute_names,
-        const rjson::value& req,
-        schema_ptr schema,
-        const std::unique_ptr<rjson::value>& previous_item) {
-    if (condition_expression.empty()) {
-        return true;
-    }
-    bool ret = std::visit(overloaded_functor {
-        [&] (const parsed::primitive_condition& cond) -> bool {
-            return calculate_primitive_condition(cond, used_attribute_values,
-                    used_attribute_names, req, schema, previous_item);
-        },
-        [&] (const parsed::condition_expression::condition_list& list) -> bool {
-            auto verify_condition = [&] (const parsed::condition_expression& e) {
-                return verify_condition_expression(e, used_attribute_values,
-                        used_attribute_names, req, schema, previous_item);
-            };
-            switch (list.op) {
-            case '&':
-                return boost::algorithm::all_of(list.conditions, verify_condition);
-            case '|':
-                return boost::algorithm::any_of(list.conditions, verify_condition);
-            default:
-                // Shouldn't happen unless we have a bug in the parser
-                throw std::logic_error("bad operator in condition_list");
-            }
-        }
-    }, condition_expression._expression);
-    return condition_expression._negated ? !ret : ret;
-}
-
 }
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -44,6 +44,6 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_

 ::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter);

-bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);
+void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -25,56 +25,45 @@
 #include <seastar/http/httpd.hh>
 #include "seastarx.hh"
 #include <seastar/json/json_elements.hh>
-#include <seastar/core/sharded.hh>

 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"

-#include "alternator/error.hh"
 #include "stats.hh"
-#include "rjson.hh"

 namespace alternator {

-class executor : public peering_sharded_service<executor> {
+class executor {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
-    // An smp_service_group to be used for limiting the concurrency when
-    // forwarding Alternator request between shards - if necessary for LWT.
-    smp_service_group _ssg;

 public:
    using client_state = service::client_state;
-    using request_return_type = std::variant<json::json_return_type, api_error>;
    stats _stats;
    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
-    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
+    static constexpr auto KEYSPACE_NAME = "alternator";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
-        : _proxy(proxy), _mm(mm), _ssg(ssg) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm) : _proxy(proxy), _mm(mm) {}

-    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> list_tables(client_state& client_state, service_permit permit, rjson::value request);
-    future<request_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header);
-    future<request_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
-    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
-    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
-    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<json::json_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> list_tables(client_state& client_state, std::string content);
+    future<json::json_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> describe_endpoints(client_state& client_state, std::string content, std::string host_header);
+    future<json::json_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);
+    future<json::json_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, std::string content);

    future<> start();
    future<> stop() { return make_ready_future<>(); }

-    future<> create_keyspace(std::string_view keyspace_name);
+    future<> maybe_create_keyspace();

    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
 };
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -22,7 +22,6 @@
 #include "expressions.hh"
 #include "alternator/expressionsLexer.hpp"
 #include "alternator/expressionsParser.hpp"
-#include "utils/overloaded_functor.hh"

 #include <seastarx.hh>

@@ -66,19 +65,13 @@ parse_projection_expression(std::string query) {
    }
 }

-parsed::condition_expression
-parse_condition_expression(std::string query) {
-    try {
-        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
-    } catch (...) {
-        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
-    }
-}
+template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
+template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

 namespace parsed {

 void update_expression::add(update_expression::action a) {
-    std::visit(overloaded_functor {
+    std::visit(overloaded {
        [&] (action::set&)    { seen_set = true; },
        [&] (action::remove&) { seen_remove = true; },
        [&] (action::add&)    { seen_add = true; },
@@ -101,27 +94,5 @@ void update_expression::append(update_expression other) {
    seen_del |= other.seen_del;
 }

-void condition_expression::append(condition_expression&& a, char op) {
-    std::visit(overloaded_functor {
-        [&] (condition_list& x) {
-            // If 'a' has a single condition, we could, instead of inserting
-            // it insert its single condition (possibly negated if a._negated)
-            // But considering it we don't evaluate these expressions many
-            // times, this optimization is not worth extra code complexity.
-            if (!x.conditions.empty() && x.op != op) {
-                // Shouldn't happen unless we have a bug in the parser
-                throw std::logic_error("condition_expression::append called with mixed operators");
-            }
-            x.conditions.push_back(std::move(a));
-            x.op = op;
-        },
-        [&] (primitive_condition& x) {
-            // Shouldn't happen unless we have a bug in the parser
-            throw std::logic_error("condition_expression::append called on primitive_condition");
-        }
-    }, _expression);
-}
-
-
 } // namespace parsed
 } // namespace alternator
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -145,12 +145,6 @@ REMOVE: R E M O V E;
 ADD: A D D;
 DELETE: D E L E T E;

-AND: A N D;
-OR: O R;
-NOT: N O T;
-BETWEEN: B E T W E E N;
-IN: I N;
-
 fragment ALPHA: 'A'..'Z' | 'a'..'z';
 fragment DIGIT: '0'..'9';
 fragment ALNUM: ALPHA | DIGIT | '_';
@@ -171,19 +165,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-value returns [parsed::value v]:
-      VALREF       { $v.set_valref($VALREF.text); }
-    | path         { $v.set_path($path.p); }
-    | NAME         { $v.set_func_name($NAME.text); }
-     '(' x=value   { $v.add_func_parameter($x.v); }
-     (',' x=value  { $v.add_func_parameter($x.v); })*
+update_expression_set_value returns [parsed::value v]:
+      VALREF                             { $v.set_valref($VALREF.text); }
+    | path                               { $v.set_path($path.p); }
+    | NAME                               { $v.set_func_name($NAME.text); }
+     '(' x=update_expression_set_value   { $v.add_func_parameter($x.v); }
+     (',' x=update_expression_set_value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=value  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
+    v=update_expression_set_value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=update_expression_set_value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=update_expression_set_value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -218,48 +212,3 @@ update_expression returns [parsed::update_expression e]:
 projection_expression returns [std::vector<parsed::path> v]:
    p=path      { $v.push_back(std::move($p.p)); }
    (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
-
-
-primitive_condition returns [parsed::primitive_condition c]:
-      v=value         { $c.add_value(std::move($v.v));
-                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
-      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
-          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
-          | '<'       { $c.set_operator(parsed::primitive_condition::type::LT); }
-          | '<' '='   { $c.set_operator(parsed::primitive_condition::type::LE); }
-          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
-          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
-         )
-         v=value      { $c.add_value(std::move($v.v)); }
-       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
-         v=value      { $c.add_value(std::move($v.v)); }
-         AND
-         v=value      { $c.add_value(std::move($v.v)); }
-       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
-         v=value      { $c.add_value(std::move($v.v)); }
-         (',' v=value { $c.add_value(std::move($v.v)); })*
-         ')'
-      )?
-    ;
-
-// The following rules for parsing boolean expressions are verbose and
-// somewhat strange because of Antlr 3's limitations on recursive rules,
-// common rule prefixes, and (lack of) support for operator precedence.
-// These rules could have been written more clearly using a more powerful
-// parser generator - such as Yacc.
-boolean_expression returns [parsed::condition_expression e]:
-	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
-	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
-	;
-boolean_expression_1 returns [parsed::condition_expression e]:
-	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
-	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
-	;
-boolean_expression_2 returns [parsed::condition_expression e]:
-	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
-	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
-	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
-    ;
-
-condition_expression returns [parsed::condition_expression e]:
-    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -36,6 +36,6 @@ public:

 parsed::update_expression parse_update_expression(std::string query);
 std::vector<parsed::path> parse_projection_expression(std::string query);
-parsed::condition_expression parse_condition_expression(std::string query);
+

 } /* namespace alternator */
--- a/alternator/expressions_eval.hh
+++ b/alternator/expressions_eval.hh
@@ -1,78 +0,0 @@
-/*
- * Copyright 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <string>
-#include <unordered_set>
-
-#include "rjson.hh"
-#include "schema_fwd.hh"
-
-#include "expressions_types.hh"
-
-namespace alternator {
-
-// calculate_value() behaves slightly different (especially, different
-// functions supported) when used in different types of expressions, as
-// enumerated in this enum:
-enum class calculate_value_caller {
-    UpdateExpression, ConditionExpression, ConditionExpressionAlone
-};
-
-inline std::ostream& operator<<(std::ostream& out, calculate_value_caller caller) {
-    switch (caller) {
-        case calculate_value_caller::UpdateExpression:
-            out << "UpdateExpression";
-            break;
-        case calculate_value_caller::ConditionExpression:
-            out << "ConditionExpression";
-            break;
-        case calculate_value_caller::ConditionExpressionAlone:
-            out << "ConditionExpression";
-            break;
-        default:
-            out << "unknown type of expression";
-            break;
-    }
-    return out;
-}
-
-bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
-
-rjson::value calculate_value(const parsed::value& v,
-        calculate_value_caller caller,
-        const rjson::value* expression_attribute_values,
-        std::unordered_set<std::string>& used_attribute_names,
-        std::unordered_set<std::string>& used_attribute_values,
-        const rjson::value& update_info,
-        schema_ptr schema,
-        const std::unique_ptr<rjson::value>& previous_item);
-
-bool verify_condition_expression(
-        const parsed::condition_expression& condition_expression,
-        std::unordered_set<std::string>& used_attribute_values,
-        std::unordered_set<std::string>& used_attribute_names,
-        const rjson::value& req,
-        schema_ptr schema,
-        const std::unique_ptr<rjson::value>& previous_item);
-
-} /* namespace alternator */
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -88,15 +88,6 @@ struct value {
    void add_func_parameter(value v) {
        std::get<function_call>(_value)._parameters.emplace_back(std::move(v));
    }
-    bool is_valref() const {
-        return std::holds_alternative<std::string>(_value);
-    }
-    bool is_path() const {
-        return std::holds_alternative<path>(_value);
-    }
-    bool is_func() const {
-        return std::holds_alternative<function_call>(_value);
-    }
 };

 // The right-hand-side of a SET in an update expression can be either a
@@ -171,58 +162,5 @@ public:
    }
 };

-// A primitive_condition is a condition expression involving one condition,
-// while the full condition_expression below adds boolean logic over these
-// primitive conditions.
-// The supported primitive conditions are:
-// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
-//    v1 and v2 are values - from the item (an attribute path), the query
-//    (a ":val" reference), or a function of the the above (only the size()
-//    function is supported).
-// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
-// 3. N-ary operator - v1 IN ( v2, v3, ... )
-// 4. A single function call (attribute_exists etc.). The parser actually
-//    accepts a more general "value" here but later stages reject a value
-//    which is not a function call (because DynamoDB does it too).
-class primitive_condition {
-public:
-    enum class type {
-        UNDEFINED, VALUE, EQ, NE, LT, LE, GT, GE, BETWEEN, IN
-    };
-    type _op = type::UNDEFINED;
-    std::vector<value> _values;
-    void set_operator(type op) {
-        _op = op;
-    }
-    void add_value(value&& v) {
-        _values.push_back(std::move(v));
-    }
-    bool empty() const {
-        return _op == type::UNDEFINED;
-    }
-};
-
-class condition_expression {
-public:
-    bool _negated = false; // If true, the entire condition is negated
-    struct condition_list {
-        char op = '|'; // '&' or '|'
-        std::vector<condition_expression> conditions;
-    };
-    std::variant<primitive_condition, condition_list> _expression = condition_list();
-
-    void set_primitive(primitive_condition&& p) {
-        _expression = std::move(p);
-    }
-    void append(condition_expression&& c, char op);
-    void apply_not() {
-        _negated = !_negated;
-    }
-    bool empty() const {
-        return std::holds_alternative<condition_list>(_expression) &&
-               std::get<condition_list>(_expression).conditions.empty();
-    }
-};
-
 } // namespace parsed
 } // namespace alternator
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -22,108 +22,14 @@
 #include "rjson.hh"
 #include "error.hh"
 #include <seastar/core/print.hh>
-#include <seastar/core/thread.hh>

 namespace rjson {

 static allocator the_allocator;

-/*
- * This wrapper class adds nested level checks to rapidjson's handlers.
- * Each rapidjson handler implements functions for accepting JSON values,
- * which includes strings, numbers, objects, arrays, etc.
- * Parsing objects and arrays needs to be performed carefully with regard
- * to stack overflow - each object/array layer adds another stack frame
- * to parsing, printing and destroying the parent JSON document.
- * To prevent stack overflow, a rapidjson handler can be wrapped with
- * guarded_json_handler, which accepts an additional max_nested_level parameter.
- * After trying to exceed the max nested level, a proper rjson::error will be thrown.
- */
-template<typename Handler, bool EnableYield>
-struct guarded_yieldable_json_handler : public Handler {
-    size_t _nested_level = 0;
-    size_t _max_nested_level;
-public:
-    using handler_base = Handler;
-
-    explicit guarded_yieldable_json_handler(size_t max_nested_level) : _max_nested_level(max_nested_level) {}
-    guarded_yieldable_json_handler(string_buffer& buf, size_t max_nested_level)
-            : handler_base(buf), _max_nested_level(max_nested_level) {}
-
-    void Parse(const char* str, size_t length) {
-        rapidjson::MemoryStream ms(static_cast<const char*>(str), length * sizeof(typename encoding::Ch));
-        rapidjson::EncodedInputStream<encoding, rapidjson::MemoryStream> is(ms);
-        rapidjson::GenericReader<encoding, encoding, allocator> reader(&the_allocator);
-        reader.Parse(is, *this);
-        if (reader.HasParseError()) {
-            throw rjson::error(format("Parsing JSON failed: {}", rapidjson::GetParseError_En(reader.GetParseErrorCode())));
-        }
-        //NOTICE: The handler has parsed the string, but in case of rapidjson::GenericDocument
-        // the data now resides in an internal stack_ variable, which is private instead of
-        // protected... which means we cannot simply access its data. Fortunately, another
-        // function for populating documents from SAX events can be abused to extract the data
-        // from the stack via gadget-oriented programming - we use an empty event generator
-        // which does nothing, and use it to call Populate(), which assumes that the generator
-        // will fill the stack with something. It won't, but our stack is already filled with
-        // data we want to steal, so once Populate() ends, our document will be properly parsed.
-        // A proper solution could be programmed once rapidjson declares this stack_ variable
-        // as protected instead of private, so that this class can access it.
-        auto dummy_generator = [](handler_base&){return true;};
-        handler_base::Populate(dummy_generator);
-    }
-
-    bool StartObject() {
-        ++_nested_level;
-        check_nested_level();
-        maybe_yield();
-        return handler_base::StartObject();
-    }
-
-    bool EndObject(rapidjson::SizeType elements_count = 0) {
-        --_nested_level;
-        return handler_base::EndObject(elements_count);
-    }
-
-    bool StartArray() {
-        ++_nested_level;
-        check_nested_level();
-        maybe_yield();
-        return handler_base::StartArray();
-    }
-
-    bool EndArray(rapidjson::SizeType elements_count = 0) {
-        --_nested_level;
-        return handler_base::EndArray(elements_count);
-    }
-
-    bool Null()                 { maybe_yield(); return handler_base::Null(); }
-    bool Bool(bool b)           { maybe_yield(); return handler_base::Bool(b); }
-    bool Int(int i)             { maybe_yield(); return handler_base::Int(i); }
-    bool Uint(unsigned u)       { maybe_yield(); return handler_base::Uint(u); }
-    bool Int64(int64_t i64)     { maybe_yield(); return handler_base::Int64(i64); }
-    bool Uint64(uint64_t u64)   { maybe_yield(); return handler_base::Uint64(u64); }
-    bool Double(double d)       { maybe_yield(); return handler_base::Double(d); }
-    bool String(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::String(str, length, copy); }
-    bool Key(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::Key(str, length, copy); }
-
-
-protected:
-    static void maybe_yield() {
-        if constexpr (EnableYield) {
-            thread::maybe_yield();
-        }
-    }
-
-    void check_nested_level() const {
-        if (RAPIDJSON_UNLIKELY(_nested_level > _max_nested_level)) {
-            throw rjson::error(format("Max nested level reached: {}", _max_nested_level));
-        }
-    }
-};
-
 std::string print(const rjson::value& value) {
    string_buffer buffer;
-    guarded_yieldable_json_handler<writer, false> writer(buffer, 39);
+    writer writer(buffer);
    value.Accept(writer);
    return std::string(buffer.GetString());
 }
@@ -132,9 +38,13 @@ rjson::value copy(const rjson::value& value) {
    return rjson::value(value, the_allocator);
 }

-rjson::value parse(std::string_view str) {
-    guarded_yieldable_json_handler<document, false> d(39);
-    d.Parse(str.data(), str.size());
+rjson::value parse(const std::string& str) {
+    return parse_raw(str.c_str(), str.size());
+}
+
+rjson::value parse_raw(const char* c_str, size_t size) {
+    rjson::document d;
+    d.Parse(c_str, size);
    if (d.HasParseError()) {
        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
    }
@@ -142,22 +52,8 @@ rjson::value parse(std::string_view str) {
    return std::move(v);
 }

-rjson::value parse_yieldable(std::string_view str) {
-    guarded_yieldable_json_handler<document, true> d(39);
-    d.Parse(str.data(), str.size());
-    if (d.HasParseError()) {
-        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
-    }
-    rjson::value& v = d;
-    return std::move(v);
-}
-
-rjson::value& get(rjson::value& value, std::string_view name) {
-    // Although FindMember() has a variant taking a StringRef, it ignores the
-    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
-    // Luckily, the variant taking a GenericValue doesn't share this bug,
-    // and we can create a string GenericValue without copying the string.
-    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
+rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
+    auto member_it = value.FindMember(name);
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -165,8 +61,8 @@ rjson::value& get(rjson::value& value, std::string_view name) {
    }
 }

-const rjson::value& get(const rjson::value& value, std::string_view name) {
-    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
+const rjson::value& get(const rjson::value& value, rjson::string_ref_type name) {
+    auto member_it = value.FindMember(name);
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -186,48 +82,24 @@ rjson::value from_string(const char* str, size_t size) {
    return rjson::value(str, size, the_allocator);
 }

-rjson::value from_string(std::string_view view) {
-    return rjson::value(view.data(), view.size(), the_allocator);
-}
-
-const rjson::value* find(const rjson::value& value, std::string_view name) {
-    // Although FindMember() has a variant taking a StringRef, it ignores the
-    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
-    // Luckily, the variant taking a GenericValue doesn't share this bug,
-    // and we can create a string GenericValue without copying the string.
-    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
+const rjson::value* find(const rjson::value& value, string_ref_type name) {
+    auto member_it = value.FindMember(name);
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

-rjson::value* find(rjson::value& value, std::string_view name) {
-    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
+rjson::value* find(rjson::value& value, string_ref_type name) {
+    auto member_it = value.FindMember(name);
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

-bool remove_member(rjson::value& value, std::string_view name) {
-    // Although RemoveMember() has a variant taking a StringRef, it ignores
-    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
-    // Luckily, the variant taking a GenericValue doesn't share this bug,
-    // and we can create a string GenericValue without copying the string.
-    return value.RemoveMember(rjson::value(name.data(), name.size()));
-}
-
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), std::move(member), the_allocator);
 }

-void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member) {
-    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), std::move(member), the_allocator);
-}
-
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), rjson::value(member), the_allocator);
 }

-void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member) {
-    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), rjson::value(member), the_allocator);
-}
-
 void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member) {
    base.AddMember(name, std::move(member), the_allocator);
 }
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -104,49 +104,38 @@ inline rjson::value empty_string() {
 // The representation is dense - without any redundant indentation.
 std::string print(const rjson::value& value);

-// Returns a string_view to the string held in a JSON value (which is
-// assumed to hold a string, i.e., v.IsString() == true). This is a view
-// to the existing data - no copying is done.
-inline std::string_view to_string_view(const rjson::value& v) {
-    return std::string_view(v.GetString(), v.GetStringLength());
-}
-
 // Copies given JSON value - involves allocation
 rjson::value copy(const rjson::value& value);

 // Parses a JSON value from given string or raw character array.
 // The string/char array liveness does not need to be persisted,
-// as parse() will allocate member names and values.
+// as both parse() and parse_raw() will allocate member names and values.
 // Throws rjson::error if parsing failed.
-rjson::value parse(std::string_view str);
-// Needs to be run in thread context
-rjson::value parse_yieldable(std::string_view str);
+rjson::value parse(const std::string& str);
+rjson::value parse_raw(const char* c_str, size_t size);

 // Creates a JSON value (of JSON string type) out of internal string representations.
 // The string value is copied, so str's liveness does not need to be persisted.
 rjson::value from_string(const std::string& str);
 rjson::value from_string(const sstring& str);
 rjson::value from_string(const char* str, size_t size);
-rjson::value from_string(std::string_view view);

 // Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, std::string_view name);
-const rjson::value* find(const rjson::value& value, std::string_view name);
+rjson::value* find(rjson::value& value, rjson::string_ref_type name);
+const rjson::value* find(const rjson::value& value, rjson::string_ref_type name);

 // Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, std::string_view name);
-const rjson::value& get(const rjson::value& value, std::string_view name);
+rjson::value& get(rjson::value& value, rjson::string_ref_type name);
+const rjson::value& get(const rjson::value& value, rjson::string_ref_type name);

 // Sets a member in given JSON object by moving the member - allocates the name.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);

 // Sets a string member in given JSON object by assigning its reference - allocates the name.
 // NOTICE: member string liveness must be ensured to be at least as long as base's.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);

 // Sets a member in given JSON object by moving the member.
 // NOTICE: name liveness must be ensured to be at least as long as base's.
@@ -163,9 +152,6 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
 // Throws if base_array is not a JSON array.
 void push_back(rjson::value& base_array, rjson::value&& item);

-// Remove a member from a JSON object. Throws if value isn't an object.
-bool remove_member(rjson::value& value, std::string_view name);
-
 struct single_value_comp {
    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
 };
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -1,124 +0,0 @@
-/*
- * Copyright 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <seastarx.hh>
-#include <service/storage_proxy.hh>
-#include <service/storage_proxy.hh>
-#include "rjson.hh"
-#include "executor.hh"
-
-namespace alternator {
-
-// An rmw_operation encapsulates the common logic of all the item update
-// operations which may involve a read of the item before the write
-// (so-called Read-Modify-Write operations). These operations include PutItem,
-// UpdateItem and DeleteItem: All of these may be conditional operations (the
-// "Expected" parameter) which requir a read before the write, and UpdateItem
-// may also have an update expression which refers to the item's old value.
-//
-// The code below supports running the read and the write together as one
-// transaction using LWT (this is why rmw_operation is a subclass of
-// cas_request, as required by storage_proxy::cas()), but also has optional
-// modes not using LWT.
-class rmw_operation : public service::cas_request, public enable_shared_from_this<rmw_operation> {
-public:
-    // The following options choose which mechanism to use for isolating
-    // parallel write operations:
-    // * The FORBID_RMW option forbids RMW (read-modify-write) operations
-    //   such as conditional updates. For the remaining write-only
-    //   operations, ordinary quorum writes are isolated enough.
-    // * The LWT_ALWAYS option always uses LWT (lightweight transactions)
-    //   for any write operation - whether or not it also has a read.
-    // * The LWT_RMW_ONLY option uses LWT only for RMW operations, and uses
-    //   ordinary quorum writes for write-only operations.
-    //   This option is not safe if the user may send both RMW and write-only
-    //   operations on the same item.
-    // * The UNSAFE_RMW option does read-modify-write operations as separate
-    //   read and write. It is unsafe - concurrent RMW operations are not
-    //   isolated at all. This option will likely be removed in the future.
-    enum class write_isolation {
-        FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
-    };
-    static constexpr auto WRITE_ISOLATION_TAG_KEY = "system:write_isolation";
-
-    static write_isolation get_write_isolation_for_schema(schema_ptr schema);
-
-protected:
-    // The full request JSON
-    rjson::value _request;
-    // All RMW operations involve a single item with a specific partition
-    // and optional clustering key, in a single table, so the following
-    // information is common to all of them:
-    schema_ptr _schema;
-    partition_key _pk = partition_key::make_empty();
-    clustering_key _ck = clustering_key::make_empty();
-    write_isolation _write_isolation;
-
-    // All RMW operations can have a ReturnValues parameter from the following
-    // choices. But note that only UpdateItem actually supports all of them:
-    enum class returnvalues {
-        NONE, ALL_OLD, UPDATED_OLD, ALL_NEW, UPDATED_NEW
-    } _returnvalues;
-    static returnvalues parse_returnvalues(const rjson::value& request);
-    // When _returnvalues != NONE, apply() should store here, in JSON form,
-    // the values which are to be returned in the "Attributes" field.
-    // The default null JSON means do not return an Attributes field at all.
-    // This field is marked "mutable" so that the const apply() can modify
-    // it (see explanation below), but note that because apply() may be
-    // called more than once, if apply() will sometimes set this field it
-    // must set it (even if just to the default empty value) every time.
-    mutable rjson::value _return_attributes;
-public:
-    // The constructor of a rmw_operation subclass should parse the request
-    // and try to discover as many input errors as it can before really
-    // attempting the read or write operations.
-    rmw_operation(service::storage_proxy& proxy, rjson::value&& request);
-    // rmw_operation subclasses (update_item_operation, put_item_operation
-    // and delete_item_operation) shall implement an apply() function which
-    // takes the previous value of the item (if it was read) and creates the
-    // write mutation. If the previous value of item does not pass the needed
-    // conditional expression, apply() should return an empty optional.
-    // apply() may throw if it encounters input errors not discovered during
-    // the constructor.
-    // apply() may be called more than once in case of contention, so it must
-    // not change the state saved in the object (issue #7218 was caused by
-    // violating this). We mark apply() "const" to let the compiler validate
-    // this for us. The output-only field _return_attributes is marked
-    // "mutable" above so that apply() can still write to it.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
-    // Convert the above apply() into the signature needed by cas_request:
-    virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
-    virtual ~rmw_operation() = default;
-    schema_ptr schema() const { return _schema; }
-    const rjson::value& request() const { return _request; }
-    rjson::value&& move_request() && { return std::move(_request); }
-    future<executor::request_return_type> execute(service::storage_proxy& proxy,
-            service::client_state& client_state,
-            tracing::trace_state_ptr trace_state,
-            service_permit permit,
-            bool needs_read_before_write,
-            stats& stats);
-    std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
-};
-
-} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -136,7 +136,7 @@ rjson::value deserialize_item(bytes_view bv) {

    if (atype == alternator_type::NOT_SUPPORTED_YET) {
        slogger.trace("Non-optimal deserialization of alternator type {}", int8_t(atype));
-        return rjson::parse(std::string_view(reinterpret_cast<const char *>(bv.data()), bv.size()));
+        return rjson::parse_raw(reinterpret_cast<const char *>(bv.data()), bv.size());
    }
    type_representation type_representation = represent_type(atype);
    visit(*type_representation.dtype, to_json_visitor{deserialized, type_representation.ident, bv});
@@ -160,34 +160,27 @@ std::string type_to_string(data_type type) {

 bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
    std::string column_name = column.name_as_text();
-    const rjson::value* key_typed_value = rjson::find(item, column_name);
-    if (!key_typed_value) {
-        throw api_error("ValidationException", format("Key column {} not found", column_name));
+    std::string expected_type = type_to_string(column.type);
+
+    const rjson::value& key_typed_value = rjson::get(item, rjson::value::StringRefType(column_name.c_str()));
+    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1) {
+        throw api_error("ValidationException",
+                format("Missing or invalid value object for key column {}: {}", column_name, item));
    }
-    return get_key_from_typed_value(*key_typed_value, column);
+    return get_key_from_typed_value(key_typed_value, column, expected_type);
 }

-// Parses the JSON encoding for a key value, which is a map with a single
-// entry, whose key is the type (expected to match the key column's type)
-// and the value is the encoded value.
-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
-    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
-            !key_typed_value.MemberBegin()->value.IsString()) {
-        throw api_error("ValidationException",
-                format("Malformed value object for key column {}: {}",
-                        column.name_as_text(), key_typed_value));
-    }
-
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type) {
    auto it = key_typed_value.MemberBegin();
-    if (it->name != type_to_string(column.type)) {
+    if (it->name.GetString() != expected_type) {
        throw api_error("ValidationException",
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
+                        expected_type, column.name_as_text(), it->name.GetString()));
    }
    if (column.type == bytes_type) {
        return base64_decode(it->value);
    } else {
-        return column.type->from_string(rjson::to_string_view(it->value));
+        return column.type->from_string(it->value.GetString());
    }

 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -24,7 +24,7 @@
 #include <string>
 #include <string_view>
 #include "types.hh"
-#include "schema_fwd.hh"
+#include "schema.hh"
 #include "keys.hh"
 #include "rjson.hh"
 #include "utils/big_decimal.hh"
@@ -54,7 +54,7 @@ rjson::value deserialize_item(bytes_view bv);
 std::string type_to_string(data_type type);

 bytes get_key_column_value(const rjson::value& item, const column_definition& column);
-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column);
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type);
 rjson::value json_key_column_value(bytes_view cell, const column_definition& column);

 partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -29,8 +29,6 @@
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
-#include "service/storage_service.hh"
-#include "utils/overloaded_functor.hh"

 static logging::logger slogger("alternator-server");

@@ -67,9 +65,9 @@ inline std::vector<std::string_view> split(std::string_view text, char separator
 // Internal Server Error.
 class api_handler : public handler_base {
 public:
-    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
-         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
-         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
+    api_handler(const future_json_function& _handle) : _f_handle(
+         [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([rep = std::move(rep)](future<json::json_return_type> resf) mutable {
             if (resf.failed()) {
                 // Exceptions of type api_error are wrapped as JSON and
                 // returned to the client as expected. Other types of
@@ -88,24 +86,20 @@ public:
                             format("Internal server error: {}", std::current_exception()),
                             reply::status_type::internal_server_error);
                 }
-                 generate_error_reply(*rep, ret);
+                 // FIXME: what is this version number?
+                 rep->_content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + ret._type + "\"," +
+                         "\"message\":\"" + ret._msg + "\"}";
+                 rep->_status = ret._http_code;
+                 slogger.trace("api_handler error case: {}", rep->_content);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
+             slogger.trace("api_handler success case");
             auto res = resf.get0();
-             std::visit(overloaded_functor {
-                 [&] (const json::json_return_type& json_return_value) {
-                     slogger.trace("api_handler success case");
-                     if (json_return_value._body_writer) {
-                         rep->write_body("json", std::move(json_return_value._body_writer));
-                     } else {
-                         rep->_content += json_return_value._res;
-                     }
-                 },
-                 [&] (const api_error& err) {
-                     generate_error_reply(*rep, err);
-                 }
-             }, res);
-
+             if (res._body_writer) {
+                 rep->write_body("json", std::move(res._body_writer));
+             } else {
+                 rep->_content += res._res;
+             }
             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
    }), _type("json") { }
@@ -121,66 +115,18 @@ public:
    }

 protected:
-    void generate_error_reply(reply& rep, const api_error& err) {
-        rep._content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + err._type + "\"," +
-                "\"message\":\"" + err._msg + "\"}";
-        rep._status = err._http_code;
-        slogger.trace("api_handler error case: {}", rep._content);
-    }
-
    future_handler_function _f_handle;
    sstring _type;
 };

-class gated_handler : public handler_base {
-    seastar::gate& _gate;
-public:
-    gated_handler(seastar::gate& gate) : _gate(gate) {}
-    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0;
-    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) final override {
-        return with_gate(_gate, [this, &path, req = std::move(req), rep = std::move(rep)] () mutable {
-            return do_handle(path, std::move(req), std::move(rep));
-        });
-    }
-};
-
-class health_handler : public gated_handler {
-public:
-    health_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
-protected:
-    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+class health_handler : public handler_base {
+    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        rep->set_status(reply::status_type::ok);
        rep->write_body("txt", format("healthy: {}", req->get_header("Host")));
        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
    }
 };

-class local_nodelist_handler : public gated_handler {
-public:
-    local_nodelist_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
-protected:
-    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
-        rjson::value results = rjson::empty_array();
-        // It's very easy to get a list of all live nodes on the cluster,
-        // using gms::get_local_gossiper().get_live_members(). But getting
-        // just the list of live nodes in this DC needs more elaborate code:
-        sstring local_dc = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(
-                utils::fb_utilities::get_broadcast_address());
-        std::unordered_set<gms::inet_address> local_dc_nodes =
-                service::get_local_storage_service().get_token_metadata().
-                get_topology().get_datacenter_endpoints().at(local_dc);
-        for (auto& ip : local_dc_nodes) {
-            if (gms::get_local_gossiper().is_alive(ip)) {
-                rjson::push_back(results, rjson::from_string(ip.to_sstring()));
-            }
-        }
-        rep->set_status(reply::status_type::ok);
-        rep->set_content_type("json");
-        rep->_content = rjson::print(results);
-        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
-    }
-};
-
 future<> server::verify_signature(const request& req) {
    if (!_enforce_authorization) {
        slogger.debug("Skipping authorization");
@@ -191,7 +137,7 @@ future<> server::verify_signature(const request& req) {
        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
-    if (authorization_it == req._headers.end()) {
+    if (host_it == req._headers.end()) {
        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
@@ -268,8 +214,8 @@ future<> server::verify_signature(const request& req) {
    });
 }

-future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
-    _executor._stats.total_operations++;
+future<json::json_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+    _executor.local()._stats.total_operations++;
    sstring target = req->get_header(TARGET);
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
@@ -278,32 +224,17 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
-            _executor._stats.unsupported_operations++;
+            _executor.local()._stats.unsupported_operations++;
            throw api_error("UnknownOperationException",
                    format("Unsupported operation {}", op));
        }
-        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
-            //FIXME: Client state can provide more context, e.g. client's endpoint address
-            // We use unique_ptr because client_state cannot be moved or copied
-            return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()),
-                    [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
-                tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
-                tracing::trace(trace_state, op);
-                // JSON parsing can allocate up to roughly 2x the size of the raw document, + a couple of bytes for maintenance.
-                // FIXME: by this time, the whole HTTP request was already read, so some memory is already occupied.
-                // Once HTTP allows working on streams, we should grab the permit *before* reading the HTTP payload.
-                size_t mem_estimate = req->content.size() * 3 + 8000;
-                auto units_fut = get_units(*_memory_limiter, mem_estimate);
-                if (_memory_limiter->waiters()) {
-                    ++_executor._stats.requests_blocked_memory;
-                }
-                return units_fut.then([this, callback_it = std::move(callback_it), &client_state, trace_state, req = std::move(req)] (semaphore_units<> units) mutable {
-                    return _json_parser.parse(req->content).then([this, callback_it = std::move(callback_it), &client_state, trace_state,
-                            units = std::move(units), req = std::move(req)] (rjson::value json_request) mutable {
-                        return callback_it->second(_executor, *client_state, trace_state, make_service_permit(std::move(units)), std::move(json_request), std::move(req)).finally([trace_state] {});
-                    });
-                });
-            });
+        //FIXME: Client state can provide more context, e.g. client's endpoint address
+        // We use unique_ptr because client_state cannot be moved or copied
+        return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()), [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
+            client_state->set_raw_keyspace(executor::KEYSPACE_NAME);
+            tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+            tracing::trace(trace_state, op);
+            return callback_it->second(_executor.local(), *client_state, trace_state, std::move(req)).finally([trace_state] {});
        });
    });
 }
@@ -313,88 +244,35 @@ void server::set_routes(routes& r) {
        return handle_api_request(std::move(req));
    });

-    r.put(operation_type::POST, "/", req_handler);
-    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
-    // The "/localnodes" request is a new Alternator feature, not supported by
-    // DynamoDB and not required for DynamoDB compatibility. It allows a
-    // client to enquire - using a trivial HTTP request without requiring
-    // authentication - the list of all live nodes in the same data center of
-    // the Alternator cluster. The client can use this list to balance its
-    // request load to all the nodes in the same geographical region.
-    // Note that this API exposes - openly without authentication - the
-    // information on the cluster's members inside one data center. We do not
-    // consider this to be a security risk, because an attacker can already
-    // scan an entire subnet for nodes responding to the health request,
-    // or even just scan for open ports.
-    r.put(operation_type::GET, "/localnodes", new local_nodelist_handler(_pending_requests));
+    r.add(operation_type::POST, url("/"), req_handler);
+    r.add(operation_type::GET, url("/"), new health_handler);
 }

 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(executor& exec)
-        : _http_server("http-alternator")
-        , _https_server("https-alternator")
-        , _executor(exec)
-        , _key_cache(1024, 1min, slogger)
-        , _enforce_authorization(false)
-        , _enabled_servers{}
-        , _pending_requests{}
+server::server(seastar::sharded<executor>& e)
+        : _executor(e), _key_cache(1024, 1min, slogger), _enforce_authorization(false)
      , _callbacks{
-        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.describe_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.update_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.delete_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.list_tables(client_state, std::move(permit), std::move(json_request));
-        }},
-        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.scan(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.describe_endpoints(client_state, std::move(permit), std::move(json_request), req->get_header("Host"));
-        }},
-        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.batch_write_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.batch_get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.query(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
-        }},
-        {"TagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.tag_resource(client_state, std::move(permit), std::move(json_request));
-        }},
-        {"UntagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.untag_resource(client_state, std::move(permit), std::move(json_request));
-        }},
-        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
-            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
-        }},
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) {
+            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req), trace_state = std::move(trace_state)] () mutable { return e.create_table(client_state, std::move(trace_state), req->content); }); }
+        },
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.describe_table(client_state, std::move(trace_state), req->content); }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.delete_table(client_state, std::move(trace_state), req->content); }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.put_item(client_state, std::move(trace_state), req->content); }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.update_item(client_state, std::move(trace_state), req->content); }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.get_item(client_state, std::move(trace_state), req->content); }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.delete_item(client_state, std::move(trace_state), req->content); }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.scan(client_state, std::move(trace_state), req->content); }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, std::move(trace_state), req->content); }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, std::move(trace_state), req->content); }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, std::unique_ptr<request> req) { return e.query(client_state, std::move(trace_state), req->content); }},
    } {
 }

-future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
-        bool enforce_authorization, semaphore* memory_limiter) {
-    _memory_limiter = memory_limiter;
+future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization) {
    _enforce_authorization = enforce_authorization;
    if (!port && !https_port) {
        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
@@ -402,21 +280,24 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    }
    return seastar::async([this, addr, port, https_port, creds] {
        try {
-            _executor.start().get();
+            _executor.invoke_on_all([] (executor& e) {
+                return e.start();
+            }).get();

            if (port) {
-                set_routes(_http_server._routes);
-                _http_server.set_content_length_limit(server::content_length_limit);
-                _http_server.listen(socket_address{addr, *port}).get();
-                _enabled_servers.push_back(std::ref(_http_server));
+                _control.start().get();
+                _control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
+                _control.listen(socket_address{addr, *port}).get();
                slogger.info("Alternator HTTP server listening on {} port {}", addr, *port);
            }
            if (https_port) {
-                set_routes(_https_server._routes);
-                _https_server.set_content_length_limit(server::content_length_limit);
-                _https_server.set_tls_credentials(creds->build_server_credentials());
-                _https_server.listen(socket_address{addr, *https_port}).get();
-                _enabled_servers.push_back(std::ref(_https_server));
+                _https_control.start().get();
+                _https_control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
+                _https_control.server().invoke_on_all([creds] (http_server& serv) {
+                    return serv.set_tls_credentials(creds->build_server_credentials());
+                }).get();
+
+                _https_control.listen(socket_address{addr, *https_port}).get();
                slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
            }
        } catch (...) {
@@ -429,55 +310,5 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    });
 }

-future<> server::stop() {
-    return parallel_for_each(_enabled_servers, [] (http_server& server) {
-        return server.stop();
-    }).then([this] {
-        return _pending_requests.close();
-    }).then([this] {
-        return _json_parser.stop();
-    });
-}
-
-server::json_parser::json_parser() : _run_parse_json_thread(async([this] {
-        while (true) {
-            _document_waiting.wait().get();
-            if (_as.abort_requested()) {
-                return;
-            }
-            try {
-                _parsed_document = rjson::parse_yieldable(_raw_document);
-                _current_exception = nullptr;
-            } catch (...) {
-                _current_exception = std::current_exception();
-            }
-            _document_parsed.signal();
-        }
-    })) {
-}
-
-future<rjson::value> server::json_parser::parse(std::string_view content) {
-    if (content.size() < yieldable_parsing_threshold) {
-        return make_ready_future<rjson::value>(rjson::parse(content));
-    }
-    return with_semaphore(_parsing_sem, 1, [this, content] {
-        _raw_document = content;
-        _document_waiting.signal();
-        return _document_parsed.wait().then([this] {
-            if (_current_exception) {
-                return make_exception_future<rjson::value>(_current_exception);
-            }
-            return make_ready_future<rjson::value>(std::move(_parsed_document));
-        });
-    });
-}
-
-future<> server::json_parser::stop() {
-    _as.request_abort();
-    _document_waiting.signal();
-    _document_parsed.broken();
-    return std::move(_run_parse_json_thread);
-}
-
 }

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -27,56 +27,27 @@
 #include <seastar/net/tls.hh>
 #include <optional>
 #include <alternator/auth.hh>
-#include <utils/small_vector.hh>
-#include <seastar/core/units.hh>

 namespace alternator {

 class server {
-    static constexpr size_t content_length_limit = 16*MB;
-    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
-            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
+    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, tracing::trace_state_ptr, std::unique_ptr<request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

-    http_server _http_server;
-    http_server _https_server;
-    executor& _executor;
-
+    seastar::httpd::http_server_control _control;
+    seastar::httpd::http_server_control _https_control;
+    seastar::sharded<executor>& _executor;
    key_cache _key_cache;
    bool _enforce_authorization;
-    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
-    gate _pending_requests;
    alternator_callbacks_map _callbacks;
-
-    semaphore* _memory_limiter;
-
-    class json_parser {
-        static constexpr size_t yieldable_parsing_threshold = 16*KB;
-        std::string_view _raw_document;
-        rjson::value _parsed_document;
-        std::exception_ptr _current_exception;
-        semaphore _parsing_sem{1};
-        condition_variable _document_waiting;
-        condition_variable _document_parsed;
-        abort_source _as;
-        future<> _run_parse_json_thread;
-    public:
-        json_parser();
-        future<rjson::value> parse(std::string_view content);
-        future<> stop();
-    };
-    json_parser _json_parser;
-
 public:
-    server(executor& executor);
+    server(seastar::sharded<executor>& executor);

-    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
-            bool enforce_authorization, semaphore* memory_limiter);
-    future<> stop();
+    seastar::future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization);
 private:
    void set_routes(seastar::httpd::routes& r);
    future<> verify_signature(const seastar::httpd::request& r);
-    future<executor::request_return_type> handle_api_request(std::unique_ptr<request>&& req);
+    future<json::json_return_type> handle_api_request(std::unique_ptr<request>&& req);
 };

 }
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -85,12 +85,6 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of total operations via Alternator API")),
            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
                    seastar::metrics::description("number of performed read-before-write operations")),
-            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
-                    seastar::metrics::description("number of writes that used LWT")),
-            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
-                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements")),
-            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
-                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure.")),
            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
                    seastar::metrics::description("number of rows read during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -84,9 +84,6 @@ public:
    uint64_t total_operations = 0;
    uint64_t unsupported_operations = 0;
    uint64_t reads_before_write = 0;
-    uint64_t write_using_lwt = 0;
-    uint64_t shard_bounce_for_lwt = 0;
-    uint64_t requests_blocked_memory = 0;
    // CQL-derived stats
    cql3::cql_stats cql_stats;
 private:
--- a/alternator/tags_extension.hh
+++ b/alternator/tags_extension.hh
@@ -1,53 +0,0 @@
-/*
- * Copyright 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "serializer.hh"
-#include "schema.hh"
-#include "db/extensions.hh"
-
-namespace alternator {
-
-class tags_extension : public schema_extension {
-public:
-    static constexpr auto NAME = "scylla_tags";
-
-    tags_extension() = default;
-    explicit tags_extension(const std::map<sstring, sstring>& tags) : _tags(std::move(tags)) {}
-    explicit tags_extension(bytes b) : _tags(tags_extension::deserialize(b)) {}
-    explicit tags_extension(const sstring& s) {
-        throw std::logic_error("Cannot create tags from string");
-    }
-    bytes serialize() const override {
-        return ser::serialize_to_buffer<bytes>(_tags);
-    }
-    static std::map<sstring, sstring> deserialize(bytes_view buffer) {
-        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
-    }
-    const std::map<sstring, sstring>& tags() const {
-        return _tags;
-    }
-private:
-    std::map<sstring, sstring> _tags;
-};
-
-}
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -70,7 +70,7 @@
            {
               "method":"POST",
               "summary":"Force a major compaction of this column family",
-               "type":"void",
+               "type":"string",
               "nickname":"force_major_compaction",
               "produces":[
                  "application/json"
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -1,90 +0,0 @@
-{
-   "apiVersion":"0.0.1",
-   "swaggerVersion":"1.2",
-   "basePath":"{{Protocol}}://{{Host}}",
-   "resourcePath":"/error_injection",
-   "produces":[
-      "application/json"
-   ],
-   "apis":[
-      {
-         "path":"/v2/error_injection/injection/{injection}",
-         "operations":[
-            {
-               "method":"POST",
-               "summary":"Activate an injection that triggers an error in code",
-               "type":"void",
-               "nickname":"enable_injection",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"injection",
-                     "description":"injection name, should correspond to an injection added in code",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  },
-                  {
-                     "name":"one_shot",
-                     "description":"boolean flag indicating whether the injection should be enabled to trigger only once",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"boolean",
-                     "paramType":"query"
-                  }
-               ]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Deactivate an injection previously activated by the API",
-               "type":"void",
-               "nickname":"disable_injection",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[
-                  {
-                     "name":"injection",
-                     "description":"injection name",
-                     "required":true,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"path"
-                  }
-               ]
-            }
-         ]
-      },
-      {
-         "path":"/v2/error_injection/injection",
-         "operations":[
-            {
-               "method":"GET",
-               "summary":"List all enabled injections on all shards, i.e. injections that will trigger an error in the code",
-               "type":"array",
-               "items":{
-                  "type":"string"
-               },
-               "nickname":"get_enabled_injections_on_all",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[]
-            },
-            {
-               "method":"DELETE",
-               "summary":"Deactivate all injections previously activated on all shards by the API",
-               "type":"void",
-               "nickname":"disable_on_all",
-               "produces":[
-                  "application/json"
-               ],
-               "parameters":[]
-            }
-         ]
-      }
-   ]
-}
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -641,21 +641,6 @@
        }
      ]
    },
-    {
-      "path": "/storage_proxy/metrics/cas_write/failed_read_round_optimization",
-      "operations": [
-        {
-          "method": "GET",
-          "summary": "Get cas write metrics",
-          "type": "long",
-          "nickname": "get_cas_write_metrics_failed_read_round_optimization",
-          "produces": [
-            "application/json"
-          ],
-          "parameters": []
-        }
-      ]
-    },
    {
      "path": "/storage_proxy/metrics/cas_read/unfinished_commit",
      "operations": [
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -582,15 +582,7 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name that their snapshot will be deleted",
-                     "required":false,
-                     "allowMultiple":false,
-                     "type":"string",
-                     "paramType":"query"
-                  },
-                  {
-                     "name":"cf",
-                     "description":"an optional table name that its snapshot will be deleted",
+                     "description":"Comma seperated keyspaces name to snapshot",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api.cc
+++ b/api/api.cc
@@ -36,7 +36,6 @@
 #include "endpoint_snitch.hh"
 #include "compaction_manager.hh"
 #include "hinted_handoff.hh"
-#include "error_injection.hh"
 #include <seastar/http/exception.hh>
 #include "stream_manager.hh"
 #include "system.hh"
@@ -69,19 +68,13 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
+        set_config(rb02, ctx, r);
        rb->register_function(r, "system",
                "The system related API");
        set_system(ctx, r);
    });
 }

-future<> set_server_config(http_context& ctx) {
-    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
-    return ctx.http_server.set_routes([&ctx, rb02](routes& r) {
-        set_config(rb02, ctx, r);
-    });
-}
-
 static future<> register_api(http_context& ctx, const sstring& api_name,
        const sstring api_desc,
        std::function<void(http_context& ctx, routes& r)> f) {
@@ -97,10 +90,6 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

-future<> set_server_snapshot(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
-}
-
 future<> set_server_snitch(http_context& ctx) {
    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
 }
@@ -164,9 +153,6 @@ future<> set_server_done(http_context& ctx) {
        rb->register_function(r, "collectd",
                "The collectd API");
        set_collectd(ctx, r);
-        rb->register_function(r, "error_injection",
-                "The error injection API");
-        set_error_injection(ctx, r);
    });
 }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -24,7 +24,6 @@
 #include <seastar/http/httpd.hh>

 namespace service { class load_meter; }
-namespace locator { class token_metadata; }

 namespace api {

@@ -35,20 +34,16 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
-    sharded<locator::token_metadata>& token_metadata;
-
    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
-            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
+            service::load_meter& _lm)
+            : db(_db), sp(_sp), lmeter(_lm) {
    }
 };

 future<> set_server_init(http_context& ctx);
-future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
-future<> set_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx);
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -64,7 +64,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

-        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
+        auto id = make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -994,15 +994,5 @@ void set_column_family(http_context& ctx, routes& r) {
        });
    });

-    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
-        if (req->get_query_param("split_output") != "") {
-            fail(unimplemented::cause::API);
-        }
-        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
-            return cf.compact_all_sstables();
-        }).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
 }
 }
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "api/api-doc/error_injection.json.hh"
-#include "api/api.hh"
-
-#include <seastar/http/exception.hh>
-#include "log.hh"
-#include "utils/error_injection.hh"
-#include "seastar/core/future-util.hh"
-
-namespace api {
-
-namespace hf = httpd::error_injection_json;
-
-void set_error_injection(http_context& ctx, routes& r) {
-
-    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->param["injection"];
-        bool one_shot = req->get_query_param("one_shot") == "True";
-        auto& errinj = utils::get_local_injector();
-        errinj.enable_on_all(injection, one_shot);
-        return make_ready_future<json::json_return_type>(json::json_void());
-    });
-
-    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
-        auto& errinj = utils::get_local_injector();
-        auto ret = errinj.enabled_injections_on_all();
-        return make_ready_future<json::json_return_type>(ret);
-    });
-
-    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
-        sstring injection = req->param["injection"];
-
-        auto& errinj = utils::get_local_injector();
-        errinj.disable_on_all(injection);
-        return make_ready_future<json::json_return_type>(json::json_void());
-    });
-
-    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
-        auto& errinj = utils::get_local_injector();
-        errinj.disable_on_all();
-        return make_ready_future<json::json_return_type>(json::json_void());
-    });
-
-}
-
-} // namespace api
--- a/api/error_injection.hh
+++ b/api/error_injection.hh
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "api.hh"
-
-namespace api {
-
-void set_error_injection(http_context& ctx, routes& r);
-
-}
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -27,7 +27,6 @@
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "database.hh"
-#include "seastar/core/scheduling_specific.hh"

 namespace api {

@@ -35,70 +34,12 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

-
-/**
- * This function implement a two dimentional map reduce where
- * the first level is a distributed storage_proxy class and the
- * second level is the stats per scheduling group class.
- * @param d -  a reference to the storage_proxy distributed class.
- * @param mapper -  the internal mapper that is used to map the internal
- * stat class into a value of type `V`.
- * @param reducer - the reducer that is used in both outer and inner
- * aggregations.
- * @param initial_value - the initial value to use for both aggregations
- * @return A future that resolves to the result of the aggregation.
- */
-template<typename V, typename Reducer, typename InnerMapper>
-future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
-        InnerMapper mapper, Reducer reducer, V initial_value) {
-    return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) {
-        return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>(
-                mapper, reducer, initial_value, sp.get_stats_key());
-    }, initial_value, reducer);
+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(),
+            std::plus<utils::rate_moving_average>());
 }

-/**
- * This function implement a two dimentional map reduce where
- * the first level is a distributed storage_proxy class and the
- * second level is the stats per scheduling group class.
- * @param d -  a reference to the storage_proxy distributed class.
- * @param f - a field pointer which is the implicit internal reducer.
- * @param reducer - the reducer that is used in both outer and inner
- * aggregations.
- * @param initial_value - the initial value to use for both aggregations* @return
- * @return A future that resolves to the result of the aggregation.
- */
-template<typename V, typename Reducer, typename F>
-future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
-        V F::*f, Reducer reducer, V initial_value) {
-    return two_dimensional_map_reduce(d, [f] (F& stats) {
-        return stats.*f;
-    }, reducer, initial_value);
-}
-
-/**
- * A partial Specialization of sum_stats for the storage proxy
- * case where the get stats function doesn't return a
- * stats object with fields but a per scheduling group
- * stats object, the name was also changed since functions
- * partial specialization is not supported in C++.
- *
- */
-template<typename V, typename F>
-future<json::json_return_type>  sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
-    return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) {
-        return make_ready_future<json::json_return_type>(val);
-    });
-}
-
-
-static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
-        return (stats.*f).rate();
-    }, std::plus<utils::rate_moving_average>(), utils::rate_moving_average());
-}
-
-static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        httpd::utils_json::rate_moving_average m;
        m = val;
@@ -110,72 +51,29 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
    return timer_to_json(utils::rate_moving_average_and_histogram());
 }

-static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        return make_ready_future<json::json_return_type>(val.count);
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
-
-    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
-            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram proxy::stats::*f) {
+    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, utils::estimated_histogram(),
+            utils::estimated_histogram_merge).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram service::storage_proxy_stats::stats::*f) {
-    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
-            return (stats.*f).hist.mean * (stats.*f).hist.count;
-        }, std::plus<double>(), 0.0).then([](double val) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
+    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
+            std::plus<double>()).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-/**
- * A partial Specialization of sum_histogram_stats
- * for the storage proxy case where the get stats
- * function doesn't return a stats object with
- * fields but a per scheduling group stats object,
- * the name was also changed since function partial
- * specialization is not supported in C++.
- */
-template<typename F>
-future<json::json_return_type>
-sum_histogram_stats_storage_proxy(distributed<proxy>& d,
-        utils::timed_rate_moving_average_and_histogram F::*f) {
-    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
-        return (stats.*f).hist;
-    }, std::plus<utils::ihistogram>(), utils::ihistogram()).
-            then([](const utils::ihistogram& val) {
-        return make_ready_future<json::json_return_type>(to_json(val));
-    });
-}
-
-/**
- * A partial Specialization of sum_timer_stats for the
- * storage proxy case where the get stats function
- * doesn't return a stats object with fields but a
- * per scheduling group stats object, the name
- * was also changed since partial function specialization
- * is not supported in C++.
- */
-template<typename F>
-future<json::json_return_type>
-sum_timer_stats_storage_proxy(distributed<proxy>& d,
-        utils::timed_rate_moving_average_and_histogram F::*f) {
-
-    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
-        return (stats.*f).rate();
-    }, std::plus<utils::rate_moving_average_and_histogram>(),
-            utils::rate_moving_average_and_histogram()).then([](const utils::rate_moving_average_and_histogram& val) {
-        return make_ready_future<json::json_return_type>(timer_to_json(val));
-    });
-}
-
 void set_storage_proxy(http_context& ctx, routes& r) {
    sp::get_total_hints.set(r, [](std::unique_ptr<request> req)  {
        //TBD
@@ -325,15 +223,15 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
+        return sum_stats(ctx.sp, &proxy::stats::read_repair_attempts);
    });

    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
+        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_blocking);
    });

    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
+        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_background);
    });

    sp::get_schema_versions.set(r, [](std::unique_ptr<request> req)  {
@@ -377,10 +275,6 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

-    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
-    });
-
    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });
@@ -390,71 +284,71 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
    });

    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
    });

    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
+        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
    });

    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
+        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
    });

    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
+        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
    });

    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
+        return sum_timer_stats(ctx.sp, &proxy::stats::range);
    });

    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
+        return sum_timer_stats(ctx.sp, &proxy::stats::write);
    });
    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
@@ -473,30 +367,30 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
+        return sum_timer_stats(ctx.sp, &proxy::stats::read);
    });

    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_read);
+        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
    });

    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
+        return total_latency(ctx, &proxy::stats::read);
    });
    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_write);
+        return sum_estimated_histogram(ctx, &proxy::stats::estimated_write);
    });

    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
+        return total_latency(ctx, &proxy::stats::write);
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
+        return sum_timer_stats(ctx.sp, &proxy::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
+        return total_latency(ctx, &proxy::stats::range);
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -42,6 +42,8 @@
 #include "database.hh"
 #include "db/extensions.hh"

+sstables::sstable::version_types get_highest_supported_format();
+
 namespace api {

 namespace ss = httpd::storage_service_json;
@@ -72,35 +74,35 @@ static ss::token_range token_range_endpoints_to_json(const dht::token_range_endp
    return r;
 }

-using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
-
-static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
-    return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
-        auto keyspace = validate_keyspace(ctx, req->param);
-        auto column_families = split_cf(req->get_query_param("cf"));
-        if (column_families.empty()) {
-            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
-        }
-        return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
-    };
-}
-
 void set_storage_service(http_context& ctx, routes& r) {
+    using ks_cf_func = std::function<future<json::json_return_type>(std::unique_ptr<request>, sstring, std::vector<sstring>)>;
+
+    auto wrap_ks_cf = [&ctx](ks_cf_func f) {
+        return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
+            auto keyspace = validate_keyspace(ctx, req->param);
+            auto column_families = split_cf(req->get_query_param("cf"));
+            if (column_families.empty()) {
+                column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+            }
+            return f(std::move(req), std::move(keyspace), std::move(column_families));
+        };
+    };
+
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
            return make_ready_future<json::json_return_type>(id.to_sstring());
        });
    });

-    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

-    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
+    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -118,8 +120,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        }));
    });

-    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
-        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [](const_req req) {
+        return container_to_vec(service::get_local_storage_service().get_token_metadata().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -127,8 +129,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
-        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [](const_req req) {
+        auto points = service::get_local_storage_service().get_token_metadata().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -180,9 +182,10 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

-    ss::get_host_id_map.set(r, [&ctx](const_req req) {
+    ss::get_host_id_map.set(r, [](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(service::get_local_storage_service().
+                get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -215,6 +218,67 @@ void set_storage_service(http_context& ctx, routes& r) {
                req.get_query_param("key")));
    });

+    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {  // builds the whole reply in memory as a vector of ss::snapshots
+        return service::get_local_storage_service().get_snapshot_details().then([] (auto result) {
+            std::vector<ss::snapshots> res;
+            for (auto& map: result) {  // one (key, details list) pair per entry; presumably keyed by snapshot name -- confirm
+                ss::snapshots all_snapshots;
+                all_snapshots.key = map.first;
+
+                std::vector<ss::snapshot> snapshot;
+                for (auto& cf: map.second) {  // copy ks/cf/live/total field by field into the JSON type
+                    ss::snapshot s;
+                    s.ks = cf.ks;
+                    s.cf = cf.cf;
+                    s.live = cf.live;
+                    s.total = cf.total;
+                    snapshot.push_back(std::move(s));
+                }
+                all_snapshots.value = std::move(snapshot);
+                res.push_back(std::move(all_snapshots));
+            }
+            return make_ready_future<json::json_return_type>(std::move(res));
+        });
+    });
+
+    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {  // query params used: "tag", "cf", "kn"
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");  // "kn" is split on commas
+
+        auto resp = make_ready_future<>();
+        if (column_family.empty()) {  // no "cf": hand the whole keynames list to take_snapshot()
+            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+        } else {  // a column family was named: exactly one keyspace must accompany it
+            if (keynames.empty()) {
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
+            if (keynames.size() > 1) {
+                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+            }
+            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
+        }
+        return resp.then([] {  // the reply carries no payload
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {  // calls clear_snapshot() with "tag" and the comma-separated names from "kn"
+        auto tag = req->get_query_param("tag");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        return service::get_local_storage_service().clear_snapshot(tag, keynames).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {  // replies with the int64_t produced by true_snapshots_size()
+        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+            return make_ready_future<json::json_return_type>(size);
+        });
+    });
+
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = split_cf(req->get_query_param("cf"));
@@ -252,8 +316,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
-                    return cm.perform_cleanup(db, cf);
+                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
+                    return cm.perform_cleanup(cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -261,7 +325,32 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::scrub.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+        // TODO: respect skip_corrupted; it is read here but never passed on to perform_sstable_scrub() below
+        auto skip_corrupted = req->get_query_param("skip_corrupted");
+
+        auto f = make_ready_future<>();
+        if (!req_param<bool>(*req, "disable_snapshot", false)) {  // snapshot each column family first unless "disable_snapshot" is set
+            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
+            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
+                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            });
+        }
+
+        return f.then([&ctx, keyspace, column_families] {  // scrubbing is chained after the snapshot future
+            return ctx.db.invoke_on_all([=] (database& db) {
+                return do_for_each(column_families, [=, &db](sstring cfname) {
+                    auto& cm = db.get_compaction_manager();
+                    auto& cf = db.find_column_family(keyspace, cfname);
+                    return cm.perform_sstable_scrub(&cf);
+                });
+            });
+        }).then([]{
+            return make_ready_future<json::json_return_type>(0);
+        });
+    }));
+
+    ss::upgrade_sstables.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

        return ctx.db.invoke_on_all([=] (database& db) {
@@ -947,107 +1036,4 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

-void set_snapshot(http_context& ctx, routes& r) {
-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
-            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
-                return s.write("[").then([&s, &first] {
-                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
-                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
-                            return do_for_each(result, [&s, &result,&first](std::tuple<sstring, std::vector<service::storage_service::snapshot_details>>&& map){
-                                return do_with(ss::snapshots(), [&s, &first, &result, &map](ss::snapshots& all_snapshots) {
-                                    all_snapshots.key = std::get<0>(map);
-                                    future<> f = first ? make_ready_future<>() : s.write(", ");
-                                    first = false;
-                                    std::vector<ss::snapshot> snapshot;
-                                    for (auto& cf: std::get<1>(map)) {
-                                        ss::snapshot snp;
-                                        snp.ks = cf.ks;
-                                        snp.cf = cf.cf;
-                                        snp.live = cf.live;
-                                        snp.total = cf.total;
-                                        snapshot.push_back(std::move(snp));
-                                    }
-                                    all_snapshots.value = std::move(snapshot);
-                                    return f.then([&s, &all_snapshots] {
-                                        return all_snapshots.write(s);
-                                    });
-                                });
-                            });
-                        });
-                    }).then([&s] {
-                        return s.write("]").then([&s] {
-                            return s.close();
-                        });
-                    });
-                });
-            });
-        };
-        return make_ready_future<json::json_return_type>(std::move(f));
-    });
-
-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_family.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
-        } else {
-            if (keynames.empty()) {
-                throw httpd::bad_param_exception("The keyspace of column families must be specified");
-            }
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
-        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
-            return make_ready_future<json::json_return_type>(size);
-        });
-    });
-
-    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
-        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);
-
-        auto f = make_ready_future<>();
-        if (!req_param<bool>(*req, "disable_snapshot", false)) {
-            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
-            });
-        }
-
-        return f.then([&ctx, keyspace, column_families, skip_corrupted] {
-            return ctx.db.invoke_on_all([=] (database& db) {
-                return do_for_each(column_families, [=, &db](sstring cfname) {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(&cf, skip_corrupted);
-                });
-            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
-        });
-    }));
-}
-
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -26,6 +26,5 @@
 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
-void set_snapshot(http_context& ctx, routes& r);

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -52,7 +52,7 @@ public:
        return make_ready_future<>();
    }

-    virtual std::string_view qualified_java_name() const override {
+    virtual const sstring& qualified_java_name() const override {
        return allow_all_authenticator_name();
    }

--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -49,7 +49,7 @@ public:
        return make_ready_future<>();
    }

-    virtual std::string_view qualified_java_name() const override {
+    virtual const sstring& qualified_java_name() const override {
        return allow_all_authorizer_name();
    }

--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -96,7 +96,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual std::string_view qualified_java_name() const = 0;
+    virtual const sstring& qualified_java_name() const = 0;

    virtual bool require_authentication() const = 0;

--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -100,7 +100,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual std::string_view qualified_java_name() const = 0;
+    virtual const sstring& qualified_java_name() const = 0;

    ///
    /// Query for the permissions granted directly to a role for a particular \ref resource (and not any of its
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -59,7 +59,7 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-static future<> create_metadata_table_if_missing_impl(
+future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
@@ -85,14 +85,7 @@ static future<> create_metadata_table_if_missing_impl(
    return ignore_existing([&mm, table = std::move(table)] () {
        return mm.announce_new_column_family(table, false);
    });
-}

-future<> create_metadata_table_if_missing(
-        std::string_view table_name,
-        cql3::query_processor& qp,
-        std::string_view cql,
-        ::service::migration_manager& mm) noexcept {
-    return futurize_apply(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
 }

 future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -79,7 +79,7 @@ future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor&,
        std::string_view cql,
-        ::service::migration_manager&) noexcept;
+        ::service::migration_manager&);

 future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -101,7 +101,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 future<bool> default_authorizer::any_granted() const {
    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::AUTH_KS, PERMISSIONS_CF);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -115,7 +115,7 @@ future<> default_authorizer::migrate_legacy_metadata() const {
    alogger.info("Starting migration of legacy permissions metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -195,7 +195,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
            ROLE_NAME,
            RESOURCE_NAME);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -224,7 +224,7 @@ default_authorizer::modify(
                    ROLE_NAME,
                    RESOURCE_NAME),
            [this, &role_name, set, &resource](const auto& query) {
-        return _qp.execute_internal(
+        return _qp.process(
                query,
                db::consistency_level::ONE,
                internal_distributed_timeout_config(),
@@ -249,7 +249,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            meta::AUTH_KS,
            PERMISSIONS_CF);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -276,7 +276,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) const {
            PERMISSIONS_CF,
            ROLE_NAME);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -296,7 +296,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
            PERMISSIONS_CF,
            RESOURCE_NAME);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -313,7 +313,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
                        ROLE_NAME,
                        RESOURCE_NAME);

-                return _qp.execute_internal(
+                return _qp.process(
                        query,
                        db::consistency_level::LOCAL_ONE,
                        infinite_timeout_config,
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual std::string_view qualified_java_name() const override {
+    virtual const sstring& qualified_java_name() const override {
        return default_authorizer_name();
    }

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -96,13 +96,10 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }

-static const sstring& update_row_query() {
-    static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-            meta::roles_table::qualified_name(),
-            SALTED_HASH,
-            meta::roles_table::role_col_name);
-    return update_row_query;
-}
+static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",  // CQL used to set a role's salted hash
+        meta::roles_table::qualified_name(),  // NOTE(review): evaluated during static initialization -- confirm nothing it reads is initialized later
+        SALTED_HASH,
+        meta::roles_table::role_col_name);

 static const sstring legacy_table_name{"credentials"};

@@ -114,7 +111,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -122,8 +119,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);

-            return _qp.execute_internal(
-                    update_row_query(),
+            return _qp.process(
+                    update_row_query,
                    consistency_for_user(username),
                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
@@ -139,8 +136,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 future<> password_authenticator::create_default_if_missing() const {
    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            return _qp.execute_internal(
-                    update_row_query(),
+            return _qp.process(
+                    update_row_query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
@@ -197,7 +194,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
    return db::consistency_level::LOCAL_ONE;
 }

-std::string_view password_authenticator::qualified_java_name() const {
+const sstring& password_authenticator::qualified_java_name() const {
    return password_authenticator_name();
 }

@@ -236,7 +233,7 @@ future<authenticated_user> password_authenticator::authenticate(
                meta::roles_table::qualified_name(),
                meta::roles_table::role_col_name);

-        return _qp.execute_internal(
+        return _qp.process(
                query,
                consistency_for_user(username),
                internal_distributed_timeout_config(),
@@ -270,8 +267,8 @@ future<> password_authenticator::create(std::string_view role_name, const authen
        return make_ready_future<>();
    }

-    return _qp.execute_internal(
-            update_row_query(),
+    return _qp.process(
+            update_row_query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
@@ -287,7 +284,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
            SALTED_HASH,
            meta::roles_table::role_col_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
@@ -300,7 +297,7 @@ future<> password_authenticator::drop(std::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query, consistency_for_user(name),
            internal_distributed_timeout_config(),
            {sstring(name)}).discard_result();
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual std::string_view qualified_java_name() const override;
+    virtual const sstring& qualified_java_name() const override;

    virtual bool require_authentication() const override;

--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -68,14 +68,14 @@ future<bool> default_role_row_satisfies(
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.execute_internal(
+        return qp.process(
                query,
                db::consistency_level::ONE,
                infinite_timeout_config,
                {meta::DEFAULT_SUPERUSER_NAME},
                true).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
-                return qp.execute_internal(
+                return qp.process(
                        query,
                        db::consistency_level::QUORUM,
                        internal_distributed_timeout_config(),
@@ -100,7 +100,7 @@ future<bool> any_nondefault_role_row_satisfies(
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.execute_internal(
+        return qp.process(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -194,10 +194,7 @@ future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
-        if (_permissions_cache) {
-            return _permissions_cache->stop();
-        }
-        return make_ready_future<>();
+        return _permissions_cache->stop();
    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
@@ -220,7 +217,7 @@ future<bool> service::has_existing_legacy_users() const {
    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
    // can potentially avoid doing a range query with a high consistency level.

-    return _qp.execute_internal(
+    return _qp.process(
            default_user_query,
            db::consistency_level::ONE,
            infinite_timeout_config,
@@ -230,7 +227,7 @@ future<bool> service::has_existing_legacy_users() const {
            return make_ready_future<bool>(true);
        }

-        return _qp.execute_internal(
+        return _qp.process(
                default_user_query,
                db::consistency_level::QUORUM,
                infinite_timeout_config,
@@ -240,7 +237,7 @@ future<bool> service::has_existing_legacy_users() const {
                return make_ready_future<bool>(true);
            }

-            return _qp.execute_internal(
+            return _qp.process(
                    all_users_query,
                    db::consistency_level::QUORUM,
                    infinite_timeout_config).then([](auto results) {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -35,7 +35,6 @@
 #include "auth/common.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
-#include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
@@ -87,7 +86,7 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return qp.execute_internal(
+    return qp.process(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -171,7 +170,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.execute_internal(
+            return _qp.process(
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
@@ -198,7 +197,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -259,7 +258,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -299,7 +298,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
            return make_ready_future<>();
        }

-        return _qp.execute_internal(
+        return _qp.process(
                format("UPDATE {} SET {} WHERE {} = ?",
                        meta::roles_table::qualified_name(),
                        build_column_assignments(u),
@@ -321,7 +320,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
                    meta::role_members_table::qualified_name());

-            return _qp.execute_internal(
+            return _qp.process(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -360,7 +359,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.execute_internal(
+            return _qp.process(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -387,7 +386,7 @@ standard_role_manager::modify_membership(
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

-        return _qp.execute_internal(
+        return _qp.process(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_timeout_config(),
@@ -397,7 +396,7 @@ standard_role_manager::modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] {
        switch (ch) {
            case membership_change::add:
-                return _qp.execute_internal(
+                return _qp.process(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -405,7 +404,7 @@ standard_role_manager::modify_membership(
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
-                return _qp.execute_internal(
+                return _qp.process(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -509,7 +508,7 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.execute_internal(
+    return _qp.process(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -82,7 +82,7 @@ public:
        return _authenticator->stop();
    }

-    virtual std::string_view qualified_java_name() const override {
+    virtual const sstring& qualified_java_name() const override {
        return transitional_authenticator_name();
    }

@@ -201,7 +201,7 @@ public:
        return _authorizer->stop();
    }

-    virtual std::string_view qualified_java_name() const override {
+    virtual const sstring& qualified_java_name() const override {
        return transitional_authorizer_name();
    }

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -23,11 +23,7 @@
 #include <seastar/core/scheduling.hh>
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>
-#include <seastar/core/file.hh>
 #include <chrono>
-#include <cmath>
-
-#include "seastarx.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
--- a/build_id.cc
+++ b/build_id.cc
@@ -7,7 +7,6 @@
 #include <link.h>
 #include <seastar/core/align.hh>
 #include <sstream>
-#include <cassert>

 using namespace seastar;

--- a/bytes.cc
+++ b/bytes.cc
@@ -64,7 +64,7 @@ bytes from_hex(sstring_view s) {

 sstring to_hex(bytes_view b) {
    static char digits[] = "0123456789abcdef";
-    sstring out = uninitialized_string(b.size() * 2);
+    sstring out(sstring::initialized_later(), b.size() * 2);
    unsigned end = b.size();
    for (unsigned i = 0; i != end; ++i) {
        uint8_t x = b[i];
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -92,7 +92,7 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
 }

 static sstring bytes_to_text(bytes_view bv) {
-    sstring ret = uninitialized_string(bv.size());
+    sstring ret(sstring::initialized_later(), bv.size());
    std::copy_n(reinterpret_cast<const char*>(bv.data()), bv.size(), ret.data());
    return ret;
 }
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -22,7 +22,7 @@
 #pragma once

 #include "bytes.hh"
-#include "schema_fwd.hh"
+#include "schema.hh"
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -22,9 +22,6 @@

 #pragma once

-#include <vector>
-#include <sys/types.h>
-
 // Single-pass range over cartesian product of vectors.

 // Note:
--- a/cdc/cdc.cc
+++ b/cdc/cdc.cc
@@ -0,0 +1,835 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <utility>
+#include <algorithm>
+
+#include <boost/range/irange.hpp>
+#include <seastar/util/defer.hh>
+#include <seastar/core/thread.hh>
+
+#include "cdc/cdc.hh"
+#include "bytes.hh"
+#include "database.hh"
+#include "db/config.hh"
+#include "dht/murmur3_partitioner.hh"
+#include "partition_slice_builder.hh"
+#include "schema.hh"
+#include "schema_builder.hh"
+#include "service/migration_listener.hh"
+#include "service/storage_service.hh"
+#include "types/tuple.hh"
+#include "cql3/statements/select_statement.hh"
+#include "cql3/multi_column_relation.hh"
+#include "cql3/tuples.hh"
+#include "log.hh"
+#include "json.hh"
+
+using locator::snitch_ptr;
+using locator::token_metadata;
+using locator::topology;
+using seastar::sstring;
+using service::migration_notifier;
+using service::storage_proxy;
+
+namespace std {
+
+// Hash support for (node address, shard id) pairs, so that such pairs can be used as keys
+// of unordered containers.
+template<> struct hash<std::pair<net::inet_address, unsigned int>> {
+    // Combines the hash of the address with the hash of the shard id.
+    std::size_t operator()(const std::pair<net::inet_address, unsigned int> &p) const {
+        // The shard id is unsigned, so hash it with the hasher of its own type instead of
+        // letting it be implicitly converted to a signed int first.
+        return std::hash<net::inet_address>{}(p.first) ^ std::hash<unsigned int>{}(p.second);
+    }
+};
+
+}
+
+using namespace std::chrono_literals;
+
+static logging::logger cdc_log("cdc");
+
+namespace cdc {
+// Forward declarations of file-local helpers that are defined further down in this file
+// but are already needed by cdc_service::impl.
+// The two schema factories take an optional UUID; when one is given it is set on the
+// schema being built.
+static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
+static schema_ptr create_stream_description_table_schema(const schema&, std::optional<utils::UUID> = {});
+// Writes the rows of the stream description table that belongs to the given base schema.
+static future<> populate_desc(db_context ctx, const schema& s);
+}
+
+// Private implementation of cdc_service.
+//
+// The object registers itself as a schema migration listener on construction and has to be
+// unregistered with stop() before it is destroyed. The on_before_* hooks extend the schema
+// mutations of a base table with the mutations that create, alter or drop its CDC log and
+// stream description tables; the on_create/on_update hooks populate the description table.
+class cdc::cdc_service::impl : service::migration_listener::empty_listener {
+    friend cdc_service;
+    db_context _ctxt;
+    bool _stopped = false;
+
+    // Moves every mutation held by `src` to the end of `dst`.
+    static void move_append(std::vector<mutation>& dst, std::vector<mutation>& src) {
+        dst.insert(dst.end(), std::make_move_iterator(src.begin()), std::make_move_iterator(src.end()));
+    }
+public:
+    impl(db_context ctxt)
+        : _ctxt(std::move(ctxt))
+    {
+        _ctxt._migration_notifier.register_listener(this);
+    }
+    // stop() must have completed before destruction.
+    ~impl() {
+        assert(_stopped);
+    }
+
+    // Unregisters this listener and, once that is done, marks the object as stopped.
+    future<> stop() {
+        auto unregistered = _ctxt._migration_notifier.unregister_listener(this);
+        return unregistered.then([this] {
+            _stopped = true;
+        });
+    }
+
+    // A CDC-enabled table is about to be created: unless a log table already exists, add the
+    // mutations creating the log and stream description tables to `mutations`.
+    void on_before_create_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        if (!schema.cdc_options().enabled()) {
+            return;
+        }
+        auto& db = _ctxt._proxy.get_db().local();
+        if (db.has_schema(schema.ks_name(), log_name(schema.cf_name()))) {
+            return;
+        }
+
+        // in seastar thread
+        auto new_log = create_log_schema(schema);
+        auto new_desc = create_stream_description_table_schema(schema);
+        auto& ks = db.find_keyspace(schema.ks_name());
+
+        auto log_muts = db::schema_tables::make_create_table_mutations(ks.metadata(), new_log, timestamp);
+        auto desc_muts = db::schema_tables::make_create_table_mutations(ks.metadata(), new_desc, timestamp);
+
+        move_append(mutations, log_muts);
+        move_append(mutations, desc_muts);
+    }
+
+    // A table is about to be altered. The log and stream description tables have to follow
+    // whenever CDC was enabled before the change or is enabled after it:
+    //  - enabled before, disabled now: both tables are dropped;
+    //  - disabled before, enabled now: both tables are created;
+    //  - enabled before and now: both tables are updated to match the new base schema.
+    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        const bool is_cdc = new_schema.cdc_options().enabled();
+        const bool was_cdc = old_schema.cdc_options().enabled();
+        if (!was_cdc && !is_cdc) {
+            return;
+        }
+
+        auto& db = _ctxt._proxy.get_db().local();
+        auto& ks = db.find_keyspace(old_schema.ks_name());
+
+        // The current schemas of the CDC tables; they stay null when CDC was disabled so far.
+        schema_ptr old_log;
+        schema_ptr old_desc;
+        if (was_cdc) {
+            old_log = db.find_column_family(old_schema.ks_name(), log_name(old_schema.cf_name())).schema();
+            old_desc = db.find_column_family(old_schema.ks_name(), desc_name(old_schema.cf_name())).schema();
+        }
+
+        if (!is_cdc) {
+            auto log_muts = db::schema_tables::make_drop_table_mutations(ks.metadata(), old_log, timestamp);
+            auto desc_muts = db::schema_tables::make_drop_table_mutations(ks.metadata(), old_desc, timestamp);
+
+            move_append(mutations, log_muts);
+            move_append(mutations, desc_muts);
+            return;
+        }
+
+        // Existing tables keep their id when they are rebuilt from the new base schema.
+        std::optional<utils::UUID> log_id;
+        if (old_log) {
+            log_id = old_log->id();
+        }
+        std::optional<utils::UUID> desc_id;
+        if (old_desc) {
+            desc_id = old_desc->id();
+        }
+        auto new_log = create_log_schema(new_schema, log_id);
+        auto new_desc = create_stream_description_table_schema(new_schema, desc_id);
+
+        std::vector<mutation> log_muts = old_log
+            ? db::schema_tables::make_update_table_mutations(ks.metadata(), old_log, new_log, timestamp, false)
+            : db::schema_tables::make_create_table_mutations(ks.metadata(), new_log, timestamp);
+        std::vector<mutation> desc_muts = old_desc
+            ? db::schema_tables::make_update_table_mutations(ks.metadata(), old_desc, new_desc, timestamp, false)
+            : db::schema_tables::make_create_table_mutations(ks.metadata(), new_desc, timestamp);
+
+        move_append(mutations, log_muts);
+        move_append(mutations, desc_muts);
+    }
+
+    // A CDC-enabled table is about to be dropped: drop its log and stream description tables
+    // along with it.
+    void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
+        if (!schema.cdc_options().enabled()) {
+            return;
+        }
+        auto& db = _ctxt._proxy.get_db().local();
+        auto& ks = db.find_keyspace(schema.ks_name());
+        auto log_schema = db.find_column_family(schema.ks_name(), log_name(schema.cf_name())).schema();
+        auto desc_schema = db.find_column_family(schema.ks_name(), desc_name(schema.cf_name())).schema();
+
+        auto log_muts = db::schema_tables::make_drop_table_mutations(ks.metadata(), log_schema, timestamp);
+        auto desc_muts = db::schema_tables::make_drop_table_mutations(ks.metadata(), desc_schema, timestamp);
+
+        move_append(mutations, log_muts);
+        move_append(mutations, desc_muts);
+    }
+
+    // Populates the stream description table of a CDC-enabled table once it exists.
+    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {
+        // This callback is done on all shards. Only do the work once.
+        if (engine().cpu_id() != 0) {
+            return;
+        }
+        auto schema = _ctxt._proxy.get_db().local().find_column_family(ks_name, cf_name).schema();
+        if (!schema->cdc_options().enabled()) {
+            return;
+        }
+        // Waits for the write to finish.
+        populate_desc(_ctxt, *schema).get();
+    }
+
+    // An altered table is handled exactly like a newly created one.
+    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) override {
+        on_create_column_family(ks_name, cf_name);
+    }
+
+    // Nothing to do after a drop.
+    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
+
+    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations
+    );
+
+    template<typename Iter>
+    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
+};
+
+// Creates the service with a db_context built from `proxy` alone; everything else in the
+// context is whatever db_context::builder::build() supplies.
+cdc::cdc_service::cdc_service(service::storage_proxy& proxy)
+    : cdc_service(db_context::builder(proxy).build())
+{}
+
+// Creates the service around an explicit context and announces it to the storage proxy
+// referenced by that context.
+cdc::cdc_service::cdc_service(db_context ctxt)
+    : _impl(std::make_unique<impl>(std::move(ctxt)))
+{
+    _impl->_ctxt._proxy.set_cdc_service(this);
+}
+
+// Stops the implementation object; impl asserts on destruction that this has completed.
+future<> cdc::cdc_service::stop() {
+    return _impl->stop();
+}
+
+// Defaulted in this translation unit, where `impl` is a complete type.
+cdc::cdc_service::~cdc_service() = default;
+
+// Parses CDC options from their map representation.
+//
+// A map without an "enabled" entry is treated as "no CDC options given": every member keeps
+// its default and the remaining entries are not looked at. Otherwise the recognized keys are
+// "enabled", "preimage", "postimage" (true only for the exact string "true") and "ttl"
+// (a non-negative integer).
+//
+// Throws exceptions::configuration_exception for an unknown key and for a "ttl" value that
+// is not a number, does not fit into an int or is negative.
+cdc::options::options(const std::map<sstring, sstring>& map) {
+    if (map.find("enabled") == std::end(map)) {
+        return;
+    }
+
+    for (auto& p : map) {
+        if (p.first == "enabled") {
+            _enabled = p.second == "true";
+        } else if (p.first == "preimage") {
+            _preimage = p.second == "true";
+        } else if (p.first == "postimage") {
+            _postimage = p.second == "true";
+        } else if (p.first == "ttl") {
+            try {
+                _ttl = std::stoi(p.second);
+            } catch (const std::logic_error&) {
+                // std::stoi reports malformed input with std::invalid_argument and overflow with
+                // std::out_of_range (both are logic_errors); report either one as a
+                // configuration error, like every other invalid option.
+                throw exceptions::configuration_exception("Invalid value for CDC option \"ttl\": " + p.second);
+            }
+            if (_ttl < 0) {
+                throw exceptions::configuration_exception("Invalid CDC option: ttl must be >= 0");
+            }
+        } else {
+            throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
+        }
+    }
+}
+
+// Returns the options as a string map with the keys "enabled", "preimage", "postimage"
+// and "ttl". Disabled options are represented by an empty map.
+std::map<sstring, sstring> cdc::options::to_map() const {
+    if (!_enabled) {
+        return {};
+    }
+    // Textual form of a boolean option.
+    const auto flag = [] (bool on) {
+        return on ? "true" : "false";
+    };
+    std::map<sstring, sstring> opts{
+        { "enabled", flag(_enabled) },
+        { "preimage", flag(_preimage) },
+        { "postimage", flag(_postimage) },
+        { "ttl", std::to_string(_ttl) },
+    };
+    return opts;
+}
+
+// Returns the JSON rendering of the map produced by to_map().
+sstring cdc::options::to_sstring() const {
+    const auto as_map = to_map();
+    return json::to_json(as_map);
+}
+
+// Two option sets are equal when all four settings (enabled, preimage, postimage, ttl) match.
+bool cdc::options::operator==(const options& o) const {
+    return std::tie(_enabled, _preimage, _postimage, _ttl)
+        == std::tie(o._enabled, o._preimage, o._postimage, o._ttl);
+}
+// Negation of operator==.
+bool cdc::options::operator!=(const options& o) const {
+    return !operator==(o);
+}
+
+namespace cdc {
+
+// Integer types underlying the `operation` and `column_op` enums; used in this file to pick
+// the column type for these values and to serialize them.
+using operation_native_type = std::underlying_type_t<operation>;
+using column_op_native_type = std::underlying_type_t<column_op>;
+
+// Name of the CDC log table of `table_name`: the base name followed by "_scylla_cdc_log".
+sstring log_name(const sstring& table_name) {
+    return table_name + "_scylla_cdc_log";
+}
+
+// Name of the CDC stream description table of `table_name`: the base name followed by
+// "_scylla_cdc_desc".
+sstring desc_name(const sstring& table_name) {
+    return table_name + "_scylla_cdc_desc";
+}
+
+// Builds the schema of the CDC log table for base table `s`.
+//
+// Layout: partition key "stream_id", clustering key ("time", "batch_seq_no"), the regular
+// columns "operation" and "ttl", plus one column named "_<name>" per base table column.
+// Key columns keep their type; static and regular columns are stored as a tuple of
+// (column operation, value, ttl). When `uuid` is given it is set on the built schema.
+static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
+    schema_builder builder(s.ks_name(), log_name(s.cf_name()));
+    builder.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
+
+    // Columns every log table has.
+    builder.with_column("stream_id", uuid_type, column_kind::partition_key);
+    builder.with_column("time", timeuuid_type, column_kind::clustering_key);
+    builder.with_column("batch_seq_no", int32_type, column_kind::clustering_key);
+    builder.with_column("operation", data_type_for<operation_native_type>());
+    builder.with_column("ttl", long_type);
+
+    // Mirrors a range of base columns into the log, optionally wrapping their type.
+    const auto mirror_columns = [&builder] (const schema::const_iterator_range_type& columns, bool wrap_in_tuple) {
+        for (const auto& base_col : columns) {
+            auto log_type = base_col.type;
+            if (wrap_in_tuple) {
+                log_type = tuple_type_impl::get_instance({ /* op */ data_type_for<column_op_native_type>(), /* value */ log_type, /* ttl */long_type});
+            }
+            builder.with_column("_" + base_col.name(), log_type);
+        }
+    };
+    mirror_columns(s.partition_key_columns(), false);
+    mirror_columns(s.clustering_key_columns(), false);
+    mirror_columns(s.static_columns(), true);
+    mirror_columns(s.regular_columns(), true);
+
+    if (uuid.has_value()) {
+        builder.set_uuid(uuid.value());
+    }
+
+    return builder.build();
+}
+
+// Builds the schema of the CDC stream description table for base table `s`.
+//
+// Layout: partition key ("node_ip", "shard_id"), clustering key "created_at" and the regular
+// column "stream_id". When `uuid` is given it is set on the built schema.
+static schema_ptr create_stream_description_table_schema(const schema& s, std::optional<utils::UUID> uuid) {
+    schema_builder builder(s.ks_name(), desc_name(s.cf_name()));
+    builder.set_comment(sprint("CDC description for %s.%s", s.ks_name(), s.cf_name()));
+
+    builder.with_column("node_ip", inet_addr_type, column_kind::partition_key);
+    builder.with_column("shard_id", int32_type, column_kind::partition_key);
+    builder.with_column("created_at", timestamp_type, column_kind::clustering_key);
+    builder.with_column("stream_id", uuid_type);
+
+    if (uuid.has_value()) {
+        builder.set_uuid(uuid.value());
+    }
+
+    return builder.build();
+}
+
+// Writes one row per (node, shard) of the cluster into the stream description table of |s|.
+//
+// Each row gets a freshly generated stream id; random ids are drawn until one is found whose
+// log partition is owned by exactly that node and shard. All rows share one timestamp and
+// are written with consistency level QUORUM and no timeout.
+//
+// The description table and the log table of |s| are looked up in the local database, so
+// both have to exist before this function is called.
+static future<> populate_desc(db_context ctx, const schema& s) {
+    auto& db = ctx._proxy.get_db().local();
+    auto desc_schema = db.find_schema(s.ks_name(), desc_name(s.cf_name()));
+    auto log_schema = db.find_schema(s.ks_name(), log_name(s.cf_name()));
+
+    // Tells whether the log partition keyed by `stream_id` lives on shard `shard_id` of `endpoint`.
+    auto belongs_to = [&](const gms::inet_address& endpoint,
+                          const unsigned int shard_id,
+                          const int shard_count,
+                          const unsigned int ignore_msb_bits,
+                          const utils::UUID& stream_id) {
+        const auto log_pk = partition_key::from_singular(*log_schema, data_value(stream_id));
+        const auto token = ctx._partitioner.decorate_key(*log_schema, log_pk).token();
+        const auto owner = ctx._token_metadata.get_endpoint(ctx._token_metadata.first_token(token));
+        if (owner != endpoint) {
+            return false;
+        }
+        return dht::murmur3_partitioner(shard_count, ignore_msb_bits).shard_of(token) == shard_id;
+    };
+
+    std::vector<mutation> mutations;
+    const auto ts = api::new_timestamp();
+    const auto ck = clustering_key::from_single_value(*desc_schema, timestamp_type->decompose(ts));
+    auto cdef = desc_schema->get_column_definition(to_bytes("stream_id"));
+
+    for (const auto& dc : ctx._token_metadata.get_topology().get_datacenter_endpoints()) {
+        const auto& endpoints = dc.second;
+        for (const auto& endpoint : endpoints) {
+            const auto decomposed_ip = inet_addr_type->decompose(endpoint.addr());
+            const unsigned int shard_count = ctx._snitch->get_shard_count(endpoint);
+            const unsigned int ignore_msb_bits = ctx._snitch->get_ignore_msb_bits(endpoint);
+            for (unsigned int shard_id = 0; shard_id < shard_count; ++shard_id) {
+                const auto pk = partition_key::from_exploded(
+                        *desc_schema, { decomposed_ip, int32_type->decompose(static_cast<int>(shard_id)) });
+                mutation row(desc_schema, pk);
+
+                // Keep drawing random ids until one maps to this very node and shard.
+                utils::UUID stream_id;
+                do {
+                    stream_id = utils::make_random_uuid();
+                } while (!belongs_to(endpoint, shard_id, shard_count, ignore_msb_bits, stream_id));
+
+                auto cell = atomic_cell::make_live(*uuid_type, ts, uuid_type->decompose(stream_id));
+                row.set_cell(ck, *cdef, std::move(cell));
+                mutations.push_back(std::move(row));
+            }
+        }
+    }
+
+    return ctx._proxy.mutate(std::move(mutations),
+                             db::consistency_level::QUORUM,
+                             db::no_timeout,
+                             nullptr,
+                             empty_service_permit());
+}
+
+// Starts a builder for a db_context that uses `proxy`; the remaining components are optional
+// and can be overridden with the with_*() setters below before calling build().
+db_context::builder::builder(service::storage_proxy& proxy) 
+    : _proxy(proxy) 
+{}
+
+// Overrides the migration notifier used by the context. Returns *this to allow chaining.
+db_context::builder& db_context::builder::with_migration_notifier(service::migration_notifier& migration_notifier) {
+    _migration_notifier = migration_notifier;
+    return *this;
+}
+
+// Overrides the token metadata used by the context. Returns *this to allow chaining.
+db_context::builder& db_context::builder::with_token_metadata(locator::token_metadata& token_metadata) {
+    _token_metadata = token_metadata;
+    return *this;
+}
+
+// Overrides the snitch used by the context. Returns *this to allow chaining.
+db_context::builder& db_context::builder::with_snitch(locator::snitch_ptr& snitch) {
+    _snitch = snitch;
+    return *this;
+}
+
+// Overrides the partitioner used by the context. Returns *this to allow chaining.
+db_context::builder& db_context::builder::with_partitioner(dht::i_partitioner& partitioner) {
+    _partitioner = partitioner;
+    return *this;
+}
+
+// Assembles the db_context. Every component that was not set explicitly through a with_*()
+// setter falls back to a global: the migration notifier and token metadata of the local
+// storage service, the local snitch pointer and the global partitioner.
+db_context db_context::builder::build() {
+    return db_context{
+        _proxy,
+        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
+        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
+        _snitch ? _snitch->get() : locator::i_endpoint_snitch::get_local_snitch_ptr(),
+        _partitioner ? _partitioner->get() : dht::global_partitioner()
+    };
+}
+
+// Turns mutations of a CDC-enabled base table into mutations of its CDC log table.
+//
+// An instance is bound to one base schema. All log rows produced by one instance share the
+// time UUID generated in the constructor; rows are told apart by a batch sequence number.
+class transformer final {
+public:
+    // Maps (node address, shard id) to the id of the stream assigned to that shard.
+    using streams_type = std::unordered_map<std::pair<net::inet_address, unsigned int>, utils::UUID>;
+private:
+    db_context _ctx;
+    // Schema of the base table.
+    schema_ptr _schema;
+    // Schema of the CDC log table of the base table.
+    schema_ptr _log_schema;
+    // Time UUID shared by all log rows written by this instance, and its serialized form.
+    utils::UUID _time;
+    bytes _decomposed_time;
+    ::shared_ptr<const transformer::streams_type> _streams;
+    // The "operation" column of the log table.
+    const column_definition& _op_col;
+    // TTL applied to every log cell; left unset when the configured CDC ttl is 0.
+    ttl_opt _cdc_ttl_opt;
+
+    // Builds the log clustering key (time, batch_no) and copies every component of the base
+    // partition key `pk` into the matching "_<name>" column of that log row in `m`.
+    // Returns the log clustering key.
+    clustering_key set_pk_columns(const partition_key& pk, int batch_no, mutation& m) const {
+        const auto log_ck = clustering_key::from_exploded(
+                *m.schema(), { _decomposed_time, int32_type->decompose(batch_no) });
+        auto pk_value = pk.explode(*_schema);
+        size_t pos = 0;
+        for (const auto& column : _schema->partition_key_columns()) {
+            assert (pos < pk_value.size());
+            auto cdef = m.schema()->get_column_definition(to_bytes("_" + column.name()));
+            auto value = atomic_cell::make_live(*column.type,
+                                                _time.timestamp(),
+                                                bytes_view(pk_value[pos]),
+                                                _cdc_ttl_opt);
+            m.set_cell(log_ck, *cdef, std::move(value));
+            ++pos;
+        }
+        return log_ck;
+    }
+
+    // Stores `op` in the "operation" column of the log row `ck` of `m`.
+    void set_operation(const clustering_key& ck, operation op, mutation& m) const {
+        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op)), _cdc_ttl_opt));
+    }
+
+    // Returns the log partition key (the stream id) assigned to shard `shard_id` of node `ip`.
+    // Throws std::runtime_error when no stream is known for that pair.
+    partition_key stream_id(const net::inet_address& ip, unsigned int shard_id) const {
+        auto it = _streams->find(std::make_pair(ip, shard_id));
+        if (it == std::end(*_streams)) {
+                throw std::runtime_error(format("No stream found for node {} and shard {}", ip, shard_id));
+        }
+        return partition_key::from_exploded(*_log_schema, { uuid_type->decompose(it->second) });
+    }
+public:
+    // `s` is the base table schema; the log schema is looked up in the local database under
+    // the name log_name() derives from it. `streams` maps (node, shard) to stream ids.
+    transformer(db_context ctx, schema_ptr s, ::shared_ptr<const transformer::streams_type> streams)
+        : _ctx(ctx)
+        , _schema(std::move(s))
+        , _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
+        , _time(utils::UUID_gen::get_time_UUID())
+        , _decomposed_time(timeuuid_type->decompose(_time))
+        , _streams(std::move(streams))
+        , _op_col(*_log_schema->get_column_definition(to_bytes("operation")))
+    {
+        // A ttl of 0 leaves _cdc_ttl_opt unset.
+        if (_schema->cdc_options().ttl()) {
+            _cdc_ttl_opt = std::chrono::seconds(_schema->cdc_options().ttl());
+        }
+    }
+
+    // Builds the log mutation describing base mutation `m`.
+    //
+    // The log partition is the stream of the node and shard that own the token of `m`.
+    // Depending on the content of `m` the log receives:
+    //  - one partition_delete row, when `m` carries a partition tombstone;
+    //  - a range_delete_start/range_delete_end row pair per range tombstone;
+    //  - otherwise one update row per clustered row of `m`, preceded by a pre_image row
+    //    when `rs` contains a row with the same clustering key.
+    // Only the first matching case is handled. Throws std::runtime_error when the token has
+    // no owner or no stream is known for the owning shard.
+    //
+    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
+    // more details like tombstones/ttl? Probably not but keep in mind.
+    mutation transform(const mutation& m, const cql3::untyped_result_set* rs = nullptr) const {
+        auto& t = m.token();
+        auto&& ep = _ctx._token_metadata.get_endpoint(
+                _ctx._token_metadata.first_token(t));
+        if (!ep) {
+            throw std::runtime_error(format("No owner found for key {}", m.decorated_key()));
+        }
+        auto shard_id = dht::murmur3_partitioner(_ctx._snitch->get_shard_count(*ep), _ctx._snitch->get_ignore_msb_bits(*ep)).shard_of(t);
+        mutation res(_log_schema, stream_id(ep->addr(), shard_id));
+        auto& p = m.partition();
+        if (p.partition_tombstone()) {
+            // Partition deletion
+            auto log_ck = set_pk_columns(m.key(), 0, res);
+            set_operation(log_ck, operation::partition_delete, res);
+        } else if (!p.row_tombstones().empty()) {
+            // range deletion
+            int batch_no = 0;
+            for (auto& rt : p.row_tombstones()) {
+                // Copies the components present in the bound prefix `ckp` into the matching
+                // "_<name>" columns of log row `log_ck`; missing trailing components are left unset.
+                auto set_bound = [&] (const clustering_key& log_ck, const clustering_key_prefix& ckp) {
+                    auto exploded = ckp.explode(*_schema);
+                    size_t pos = 0;
+                    for (const auto& column : _schema->clustering_key_columns()) {
+                        if (pos >= exploded.size()) {
+                            break;
+                        }
+                        auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
+                        auto value = atomic_cell::make_live(*column.type,
+                                                            _time.timestamp(),
+                                                            bytes_view(exploded[pos]),
+                                                            _cdc_ttl_opt);
+                        res.set_cell(log_ck, *cdef, std::move(value));
+                        ++pos;
+                    }
+                };
+                {
+                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
+                    set_bound(log_ck, rt.start);
+                    // TODO: separate inclusive/exclusive range
+                    set_operation(log_ck, operation::range_delete_start, res);
+                    ++batch_no;
+                }
+                {
+                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
+                    set_bound(log_ck, rt.end);
+                    // TODO: separate inclusive/exclusive range
+                    set_operation(log_ck, operation::range_delete_end, res);
+                    ++batch_no;
+                }
+            }
+        } else {
+            // should be update or deletion
+            int batch_no = 0;
+            for (const rows_entry& r : p.clustered_rows()) {
+                auto ck_value = r.key().explode(*_schema);
+
+                // Log clustering key and source row of the pre-image; set only when `rs`
+                // holds a row for the clustering key of `r`.
+                std::optional<clustering_key> pikey;
+                const cql3::untyped_result_set_row * pirow = nullptr;
+
+                if (rs) {
+                    // Find the pre-image row whose clustering columns all equal those of `r`.
+                    for (auto& utr : *rs) {
+                        bool match = true;
+                        for (auto& c : _schema->clustering_key_columns()) {
+                            auto rv = utr.get_view(c.name_as_text());
+                            auto cv = r.key().get_component(*_schema, c.component_index());
+                            if (rv != cv) {
+                                match = false;
+                                break;
+                            }
+                        }
+                        if (match) {
+                            // The pre-image row takes the current batch number; the update
+                            // row written below gets the next one.
+                            pikey = set_pk_columns(m.key(), batch_no, res);
+                            set_operation(*pikey, operation::pre_image, res);
+                            pirow = &utr;
+                            ++batch_no;
+                            break;
+                        }
+                    }
+                }
+
+                auto log_ck = set_pk_columns(m.key(), batch_no, res);
+
+                // Copy the clustering key of `r` into the update row and, if there is one,
+                // into the pre-image row.
+                size_t pos = 0;
+                for (const auto& column : _schema->clustering_key_columns()) {
+                    assert (pos < ck_value.size());
+                    auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
+                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos]), _cdc_ttl_opt));
+
+                    if (pirow) {
+                        assert(pirow->has(column.name_as_text()));
+                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos]), _cdc_ttl_opt));
+                    }
+
+                    ++pos;
+                }
+
+                // Scratch buffer for the (column operation, value, ttl) tuple of a data column.
+                std::vector<bytes_opt> values(3);
+
+                // Logs every atomic cell of `r` as such a tuple in the update row and, when the
+                // pre-image has that column, its old value in the pre-image row.
+                auto process_cells = [&](const row& r, column_kind ckind) {
+                    r.for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
+                        auto& cdef = _schema->column_at(ckind, id);
+                        auto* dst = _log_schema->get_column_definition(to_bytes("_" + cdef.name()));
+                        // todo: collections.
+                        if (cdef.is_atomic()) {
+                            column_op op;
+
+                            values[1] = values[2] = std::nullopt;
+                            auto view = cell.as_atomic_cell(cdef);
+                            if (view.is_live()) {
+                                op = column_op::set;
+                                values[1] = view.value().linearize();
+                                if (view.is_live_and_has_ttl()) {
+                                    values[2] = long_type->decompose(data_value(view.ttl().count()));
+                                }
+                            } else {
+                                // A dead cell is logged as a deletion without value or ttl.
+                                op = column_op::del;
+                            }
+
+                            values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(op)));
+                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values), _cdc_ttl_opt));
+
+                            if (pirow && pirow->has(cdef.name_as_text())) {
+                                // The pre-image is always recorded as a `set` of the old value, without ttl.
+                                values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(column_op::set)));
+                                values[1] = pirow->get_blob(cdef.name_as_text());
+                                values[2] = std::nullopt;
+
+                                assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
+                                assert(pikey->explode() != log_ck.explode());
+                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values), _cdc_ttl_opt));
+                            }
+                        } else {
+                            cdc_log.warn("Non-atomic cell ignored {}.{}:{}", _schema->ks_name(), _schema->cf_name(), cdef.name_as_text());
+                        }
+                    });
+                };
+
+                process_cells(r.row().cells(), column_kind::regular_column);
+                // NOTE(review): the static row is processed inside the loop over clustered
+                // rows, so static cells are written to every update row and are not logged at
+                // all when the mutation has no clustered row - confirm this is intended.
+                process_cells(p.static_row().get(), column_kind::static_column);
+
+                set_operation(log_ck, operation::update, res);
+                ++batch_no;
+            }
+        }
+
+        return res;
+    }
+
+    // Deadline used for the pre-image query: 10 seconds from now.
+    static db::timeout_clock::time_point default_timeout() {
+        return db::timeout_clock::now() + 10s;
+    }
+
+    // Reads the current content of the rows that `m` is about to change, to serve as pre-image.
+    //
+    // Resolves to an empty (default-constructed) pointer without querying when `m` carries a
+    // partition tombstone or range tombstones or has no clustered rows, and also when the
+    // query yields no rows. The query selects the key columns plus the columns touched by the
+    // static row and by the first clustered row of `m`, restricted to the clustering keys
+    // present in `m`, and runs at consistency level `cl` with default_timeout().
+    future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
+            service::client_state& client_state,
+            db::consistency_level cl,
+            const mutation& m)
+    {
+        auto& p = m.partition();
+        if (p.partition_tombstone() || !p.row_tombstones().empty() || p.clustered_rows().empty()) {
+            return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
+        }
+
+        dht::partition_range_vector partition_ranges{dht::partition_range(m.decorated_key())};
+
+        auto&& pc = _schema->partition_key_columns();
+        auto&& cc = _schema->clustering_key_columns();
+
+        // One singular range per clustered row, or the whole partition when the table has no
+        // clustering columns.
+        std::vector<query::clustering_range> bounds;
+        if (cc.empty()) {
+            bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+        } else {
+            for (const rows_entry& r : p.clustered_rows()) {
+                auto& ck = r.key();
+                bounds.push_back(query::clustering_range::make_singular(ck));
+            }
+        }
+
+        // Columns of the selection: all key columns first, then the touched data columns.
+        std::vector<const column_definition*> columns;
+        columns.reserve(_schema->all_columns().size());
+
+        std::transform(pc.begin(), pc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
+        std::transform(cc.begin(), cc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
+
+        query::column_id_vector static_columns, regular_columns;
+
+        auto sk = column_kind::static_column;
+        auto rk = column_kind::regular_column;
+        // TODO: this assumes all mutations touch the same set of columns. This might not be true, and we may need to do more horrible set operation here.
+        for (auto& [r, cids, kind] : { std::tie(p.static_row().get(), static_columns, sk), std::tie(p.clustered_rows().begin()->row().cells(), regular_columns, rk) }) {
+            r.for_each_cell([&](column_id id, const atomic_cell_or_collection&) {
+                auto& cdef =_schema->column_at(kind, id);
+                cids.emplace_back(id);
+                columns.emplace_back(&cdef);
+            });
+        }
+
+        auto selection = cql3::selection::selection::for_columns(_schema, std::move(columns));
+        auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), selection->get_query_options());
+        auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_partitions);
+
+        // Run the read and convert its result into an untyped result set.
+        return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
+                [s = _schema, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
+                    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
+                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *s, *selection));
+                    auto result_set = builder.build();
+                    if (!result_set || result_set->empty()) {
+                        return {};
+                    }
+                    return make_lw_shared<cql3::untyped_result_set>(*result_set);
+        });
+    }
+};
+
+// This class is used to build a mapping from <node ip, shard id> to stream_id
+// It is used as a consumer for rows returned by the query to CDC Description Table
+class streams_builder {
+    const schema& _schema;
+    transformer::streams_type _streams;
+    net::inet_address _node_ip = net::inet_address();
+    unsigned int _shard_id = 0;
+    api::timestamp_type _latest_row_timestamp = api::min_timestamp;
+    utils::UUID _latest_row_stream_id = utils::UUID();
+public:
+    streams_builder(const schema& s) : _schema(s) {}
+
+    void accept_new_partition(const partition_key& key, uint32_t row_count) {
+        auto exploded = key.explode(_schema);
+        _node_ip = value_cast<net::inet_address>(inet_addr_type->deserialize(exploded[0]));
+        _shard_id = static_cast<unsigned int>(value_cast<int>(int32_type->deserialize(exploded[1])));
+        _latest_row_timestamp = api::min_timestamp;
+        _latest_row_stream_id = utils::UUID();
+    }
+
+    void accept_new_partition(uint32_t row_count) {
+        assert(false);
+    }
+
+    void accept_new_row(
+            const clustering_key& key,
+            const query::result_row_view& static_row,
+            const query::result_row_view& row) {
+        auto row_iterator = row.iterator();
+        api::timestamp_type timestamp = value_cast<db_clock::time_point>(
+                timestamp_type->deserialize(key.explode(_schema)[0])).time_since_epoch().count();
+        if (timestamp <= _latest_row_timestamp) {
+            return;
+        }
+        _latest_row_timestamp = timestamp;
+        for (auto&& cdef : _schema.regular_columns()) {
+            if (cdef.name_as_text() != "stream_id") {
+                row_iterator.skip(cdef);
+                continue;
+            }
+            auto val_opt = row_iterator.next_atomic_cell();
+            assert(val_opt);
+            val_opt->value().with_linearized([&] (bytes_view bv) {
+                _latest_row_stream_id = value_cast<utils::UUID>(uuid_type->deserialize(bv));
+            });
+        }
+    }
+
+    void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
+        assert(false);
+    }
+
+    void accept_partition_end(const query::result_row_view& static_row) {
+        _streams.emplace(std::make_pair(_node_ip, _shard_id), _latest_row_stream_id);
+    }
+
+    transformer::streams_type build() {
+        return std::move(_streams);
+    }
+};
+
+static future<::shared_ptr<transformer::streams_type>> get_streams(
+        db_context ctx,
+        const sstring& ks_name,
+        const sstring& cf_name,
+        lowres_clock::time_point timeout,
+        service::query_state& qs) {
+    auto s =
+        ctx._proxy.get_db().local().find_schema(ks_name, desc_name(cf_name));
+    query::read_command cmd(
+            s->id(),
+            s->version(),
+            partition_slice_builder(*s).with_no_static_columns().build());
+    return ctx._proxy.query(
+            s,
+            make_lw_shared(std::move(cmd)),
+            {dht::partition_range::make_open_ended_both_sides()},
+            db::consistency_level::QUORUM,
+            {timeout, qs.get_permit(), qs.get_client_state()}).then([s = std::move(s)] (auto qr) mutable {
+        return query::result_view::do_with(*qr.query_result,
+                [s = std::move(s)] (query::result_view v) {
+            auto slice = partition_slice_builder(*s)
+                    .with_no_static_columns()
+                    .build();
+            streams_builder builder{ *s };
+            v.consume(slice, builder);
+            return ::make_shared<transformer::streams_type>(builder.build());
+        });
+    });
+}
+
+template <typename Func> // Calls f(index) via parallel_for_each for indices 0, batch_size, ... < muts.size().
+future<std::vector<mutation>> // Resolves to the contents of muts, moved out once every f has completed.
+transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
+    return parallel_for_each(
+            boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
+            std::forward<Func>(f)) // f is a forwarding reference: forward it, never move from a caller's lvalue
+        .then([&muts] () mutable { return std::move(muts); });
+}
+
+} // namespace cdc
+
+future<std::tuple<std::vector<mutation>, cdc::result_callback>>
+cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
+    // we do all this because in the case of batches, we can have mixed schemas.
+    auto e = mutations.end();
+    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
+        return m.schema()->cdc_options().enabled();
+    });
+
+    if (i == e) {
+        return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
+    }
+
+    mutations.reserve(2 * mutations.size()); // room for one appended log mutation per input mutation
+
+    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), [this, timeout](std::vector<mutation>& mutations, service::query_state& qs) {
+        return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs] (size_t idx) {
+            auto& m = mutations[idx];
+            auto s = m.schema();
+
+            if (!s->cdc_options().enabled()) {
+                return make_ready_future<>();
+            }
+            // for batches/multiple mutations this is super inefficient. either partition the mutation set by schema
+            // and re-use streams, or probably better: add a cache so this lookup is a noop on second mutation
+            return get_streams(_ctxt, s->ks_name(), s->cf_name(), timeout, qs).then([this, s = std::move(s), &qs, &mutations, idx](::shared_ptr<transformer::streams_type> streams) mutable {
+                auto& m = mutations[idx]; // should not really need because of reserve, but lets be conservative
+                transformer trans(_ctxt, s, streams);
+
+                if (!s->cdc_options().preimage()) {
+                    mutations.emplace_back(trans.transform(m));
+                    return make_ready_future<>();
+                }
+
+                // Note: further improvement here would be to coalesce the pre-image selects into one
+                // iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
+                // so this is premature.
+                auto f = trans.pre_image_select(qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m);
+                return f.then([trans = std::move(trans), &mutations, idx] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
+                    mutations.push_back(trans.transform(mutations[idx], rs.get()));
+                });
+            });
+        }).then([](std::vector<mutation> mutations) {
+            return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
+        });
+    });
+}
+
+bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
+    return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
+        return m.schema()->cdc_options().enabled();
+    });
+}
+
+future<std::tuple<std::vector<mutation>, cdc::result_callback>>
+cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
+    return _impl->augment_mutation_call(timeout, std::move(mutations));
+}
--- a/cdc/cdc.hh
+++ b/cdc/cdc.hh
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+
+#include "exceptions/exceptions.hh"
+#include "timestamp.hh"
+#include "cdc_options.hh"
+
+class schema;
+using schema_ptr = seastar::lw_shared_ptr<const schema>;
+
+namespace locator {
+
+class snitch_ptr;
+class token_metadata;
+
+} // namespace locator
+
+namespace service {
+
+class migration_notifier;
+class storage_proxy;
+class query_state;
+
+} // namespace service
+
+namespace dht {
+
+class i_partitioner;
+
+} // namespace dht
+
+class mutation;
+class partition_key;
+
+namespace cdc {
+
+struct db_context; // forward declaration; class-key matches the `struct db_context final` definition in this header
+
+// Callback to be invoked on mutation finish to fix
+// the whole bit about post-image.
+// TODO: decide on what the parameters are to be for this.
+using result_callback = std::function<future<>()>;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// CDC service will listen for schema changes and iff CDC is enabled/changed
+/// create/modify/delete corresponding log tables etc as part of the schema change.
+///
+class cdc_service {
+    class impl;
+    std::unique_ptr<impl> _impl;
+public:
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();
+
+    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
+    // appropriate augments to set the log entries.
+    // Iff post-image is enabled for any of these, a non-empty callback is also
+    // returned to be invoked post the mutation query.
+    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations
+        );
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
+};
+
+struct db_context final {
+    service::storage_proxy& _proxy;
+    service::migration_notifier& _migration_notifier;
+    locator::token_metadata& _token_metadata;
+    locator::snitch_ptr& _snitch;
+    dht::i_partitioner& _partitioner;
+
+    class builder final {
+        service::storage_proxy& _proxy;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
+        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
+        std::optional<std::reference_wrapper<locator::snitch_ptr>> _snitch;
+        std::optional<std::reference_wrapper<dht::i_partitioner>> _partitioner;
+    public:
+        builder(service::storage_proxy& proxy);
+
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_snitch(locator::snitch_ptr& snitch);
+        builder& with_partitioner(dht::i_partitioner& partitioner);
+
+        db_context build();
+    };
+};
+
+// cdc log table operation
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, row_delete = 2, range_delete_start = 3, range_delete_end = 4, partition_delete = 5
+};
+
+// cdc log data column operation
+enum class column_op : int8_t {
+    // same as "operation". Do not edit values or type unless you _really_ want to.
+    set = 0, del = 1, add = 2,
+};
+
+seastar::sstring log_name(const seastar::sstring& table_name);
+
+seastar::sstring desc_name(const seastar::sstring& table_name);
+
+} // namespace cdc
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -1,52 +0,0 @@
-/*
- * Copyright 2020 ScyllaDB
- */
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "serializer.hh"
-#include "db/extensions.hh"
-#include "cdc/cdc_options.hh"
-#include "schema.hh"
-
-namespace cdc {
-
-class cdc_extension : public schema_extension {
-    cdc::options _cdc_options;
-public:
-    static constexpr auto NAME = "cdc";
-
-    cdc_extension() = default;
-    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {}
-    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {}
-    explicit cdc_extension(const sstring& s) {
-        throw std::logic_error("Cannot create cdc info from string");
-    }
-    bytes serialize() const override {
-        return ser::serialize_to_buffer<bytes>(_cdc_options.to_map());
-    }
-    static std::map<sstring, sstring> deserialize(const bytes_view& buffer) {
-        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
-    }
-    const options& get_options() const {
-        return _cdc_options;
-    }
-};
-
-}
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -1,405 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <boost/type.hpp>
-#include <random>
-#include <unordered_set>
-#include <seastar/core/sleep.hh>
-
-#include "keys.hh"
-#include "schema_builder.hh"
-#include "db/config.hh"
-#include "db/system_keyspace.hh"
-#include "db/system_distributed_keyspace.hh"
-#include "dht/token-sharding.hh"
-#include "locator/token_metadata.hh"
-#include "gms/application_state.hh"
-#include "gms/inet_address.hh"
-#include "gms/gossiper.hh"
-
-#include "cdc/generation.hh"
-
-extern logging::logger cdc_log;
-
-static int get_shard_count(const gms::inet_address& endpoint, const gms::gossiper& g) {
-    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
-    return ep_state ? std::stoi(ep_state->value) : -1;
-}
-
-static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const gms::gossiper& g) {
-    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
-    return ep_state ? std::stoi(ep_state->value) : 0;
-}
-
-namespace cdc {
-
-extern const api::timestamp_clock::duration generation_leeway =
-    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
-
-static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
-    i = net::hton(i);
-    std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset);
-}
-
-stream_id::stream_id(int64_t first, int64_t second)
-    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
-{
-    copy_int_to_bytes(first, 0, _value);
-    copy_int_to_bytes(second, sizeof(int64_t), _value);
-}
-
-stream_id::stream_id(bytes b) : _value(std::move(b)) { }
-
-bool stream_id::is_set() const {
-    return !_value.empty();
-}
-
-bool stream_id::operator==(const stream_id& o) const {
-    return _value == o._value;
-}
-
-bool stream_id::operator<(const stream_id& o) const {
-    return _value < o._value;
-}
-
-static int64_t bytes_to_int64(const bytes& b, size_t offset) {
-    assert(b.size() >= offset + sizeof(int64_t));
-    int64_t res;
-    std::copy_n(b.begin() + offset, sizeof(int64_t), reinterpret_cast<int8_t *>(&res));
-    return net::ntoh(res);
-}
-
-int64_t stream_id::first() const {
-    return bytes_to_int64(_value, 0);
-}
-
-int64_t stream_id::second() const {
-    return bytes_to_int64(_value, sizeof(int64_t));
-}
-
-const bytes& stream_id::to_bytes() const {
-    return _value;
-}
-
-partition_key stream_id::to_partition_key(const schema& log_schema) const {
-    return partition_key::from_single_value(log_schema, _value);
-}
-
-bool token_range_description::operator==(const token_range_description& o) const {
-    return token_range_end == o.token_range_end && streams == o.streams
-        && sharding_ignore_msb == o.sharding_ignore_msb;
-}
-
-topology_description::topology_description(std::vector<token_range_description> entries)
-    : _entries(std::move(entries)) {}
-
-bool topology_description::operator==(const topology_description& o) const {
-    return _entries == o._entries;
-}
-
-const std::vector<token_range_description>& topology_description::entries() const {
-    return _entries;
-}
-
-static stream_id make_random_stream_id() {
-    static thread_local std::mt19937_64 rand_gen(std::random_device().operator()());
-    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
-
-    return {rand_dist(rand_gen), rand_dist(rand_gen)};
-}
-
-/* Given:
- * 1. a set of tokens which split the token ring into token ranges (vnodes),
- * 2. information on how each token range is distributed among its owning node's shards
- * this function tries to generate a set of CDC stream identifiers such that for each
- * shard and vnode pair there exists a stream whose token falls into this
- * vnode and is owned by this shard.
- *
- * It then builds a cdc::topology_description which maps tokens to these
- * found stream identifiers, such that if token T is owned by shard S in vnode V,
- * it gets mapped to the stream identifier generated for (S, V).
- */
-// Run in seastar::async context.
-topology_description generate_topology_description(
-        const db::config& cfg,
-        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& token_metadata,
-        const gms::gossiper& gossiper) {
-    if (bootstrap_tokens.empty()) {
-        throw std::runtime_error(
-                "cdc: bootstrap tokens is empty in generate_topology_description");
-    }
-
-    auto tokens = token_metadata.sorted_tokens();
-    tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
-    std::sort(tokens.begin(), tokens.end());
-    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
-
-    std::vector<token_range_description> entries(tokens.size());
-    int spots_to_fill = 0;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        auto& entry = entries[i];
-        entry.token_range_end = tokens[i];
-
-        if (bootstrap_tokens.count(entry.token_range_end) > 0) {
-            entry.streams.resize(smp::count);
-            entry.sharding_ignore_msb = cfg.murmur3_partitioner_ignore_msb_bits();
-        } else {
-            auto endpoint = token_metadata.get_endpoint(entry.token_range_end);
-            if (!endpoint) {
-                throw std::runtime_error(format("Can't find endpoint for token {}", entry.token_range_end));
-            }
-            auto sc = get_shard_count(*endpoint, gossiper);
-            entry.streams.resize(sc > 0 ? sc : 1);
-            entry.sharding_ignore_msb = get_sharding_ignore_msb(*endpoint, gossiper);
-        }
-
-        spots_to_fill += entry.streams.size();
-    }
-
-    auto schema = schema_builder("fake_ks", "fake_table")
-        .with_column("stream_id", bytes_type, column_kind::partition_key)
-        .build();
-
-    auto quota = std::chrono::seconds(spots_to_fill / 2000 + 1);
-    auto start_time = std::chrono::system_clock::now();
-
-    // For each pair (i, j), 0 <= i < streams.size(), 0 <= j < streams[i].size(),
-    // try to find a stream (stream[i][j]) such that the token of this stream will get mapped to this stream
-    // (refer to the comments above topology_description's definition to understand how it describes the mapping).
-    // We find the streams by randomly generating them and checking into which pairs they get mapped.
-    // NOTE: this algorithm is temporary and will be replaced after per-table-partitioner feature gets merged in.
-    repeat([&] {
-        for (int i = 0; i < 500; ++i) {
-            auto stream_id = make_random_stream_id();
-            auto token = dht::get_token(*schema, stream_id.to_partition_key(*schema));
-
-            // Find the token range into which our stream_id's token landed.
-            auto it = std::lower_bound(tokens.begin(), tokens.end(), token);
-            auto& entry = entries[it != tokens.end() ? std::distance(tokens.begin(), it) : 0];
-
-            auto shard_id = dht::shard_of(entry.streams.size(), entry.sharding_ignore_msb, token);
-            assert(shard_id < entry.streams.size());
-
-            if (!entry.streams[shard_id].is_set()) {
-                --spots_to_fill;
-                entry.streams[shard_id] = stream_id;
-            }
-        }
-
-        if (!spots_to_fill) {
-            return stop_iteration::yes;
-        }
-
-        auto now = std::chrono::system_clock::now();
-        auto passed = std::chrono::duration_cast<std::chrono::seconds>(now - start_time);
-        if (passed > quota) {
-            return stop_iteration::yes;
-        }
-
-        return stop_iteration::no;
-    }).get();
-
-    if (spots_to_fill) {
-        // We were not able to generate stream ids for each (token range, shard) pair.
-
-        // For each range that has a stream, for each shard for this range that doesn't have a stream,
-        // use the stream id of the next shard for this range.
-
-        // For each range that doesn't have any stream,
-        // use streams of the first range to the left which does have a stream.
-
-        cdc_log.warn("Generation of CDC streams failed to create streams for some (vnode, shard) pair."
-                     " This can lead to worse performance.");
-
-        stream_id some_stream;
-        size_t idx = 0;
-        for (; idx < entries.size(); ++idx) {
-            for (auto s: entries[idx].streams) {
-                if (s.is_set()) {
-                    some_stream = s;
-                    break;
-                }
-            }
-            if (some_stream.is_set()) {
-                break;
-            }
-        }
-
-        assert(idx != entries.size() && some_stream.is_set());
-
-        // Iterate over all ranges in the clockwise direction, starting with the one we found a stream for.
-        for (size_t off = 0; off < entries.size(); ++off) {
-            auto& ss = entries[(idx + off) % entries.size()].streams;
-
-            int last_set_stream_idx = ss.size() - 1;
-            while (last_set_stream_idx > -1 && !ss[last_set_stream_idx].is_set()) {
-                --last_set_stream_idx;
-            }
-
-            if (last_set_stream_idx == -1) {
-                cdc_log.warn(
-                        "CDC wasn't able to generate any stream for vnode ({}, {}]. We'll use another vnode's streams"
-                        " instead. This might lead to inconsistencies.",
-                        tokens[(idx + off + entries.size() - 1) % entries.size()], tokens[(idx + off) % entries.size()]);
-
-                ss[0] = some_stream;
-                last_set_stream_idx = 0;
-            }
-
-            some_stream = ss[last_set_stream_idx];
-
-            // Replace 'unset' stream ids with indexes below last_set_stream_idx
-            for (int s_idx = last_set_stream_idx - 1; s_idx > -1; --s_idx) {
-                if (ss[s_idx].is_set()) {
-                    some_stream = ss[s_idx];
-                } else {
-                    ss[s_idx] = some_stream;
-                }
-            }
-            // Replace 'unset' stream ids with indexes above last_set_stream_idx
-            for (int s_idx = ss.size() - 1; s_idx > last_set_stream_idx; --s_idx) {
-                if (ss[s_idx].is_set()) {
-                    some_stream = ss[s_idx];
-                } else {
-                    ss[s_idx] = some_stream;
-                }
-            }
-        }
-    }
-
-    return {std::move(entries)};
-}
-
-bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
-    auto my_host_id = g.get_host_id(me);
-    auto& eps = g.get_endpoint_states();
-    return std::none_of(eps.begin(), eps.end(),
-            [&] (const std::pair<gms::inet_address, gms::endpoint_state>& ep) {
-        return my_host_id < g.get_host_id(ep.first);
-    });
-}
-
-future<db_clock::time_point> get_local_streams_timestamp() {
-    return db::system_keyspace::get_saved_cdc_streams_timestamp().then([] (std::optional<db_clock::time_point> ts) {
-        if (!ts) {
-            auto err = format("get_local_streams_timestamp: tried to retrieve streams timestamp after bootstrapping, but it's not present");
-            cdc_log.error("{}", err);
-            throw std::runtime_error(err);
-        }
-        return *ts;
-    });
-}
-
-// Run inside seastar::async context.
-db_clock::time_point make_new_cdc_generation(
-        const db::config& cfg,
-        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
-        const gms::gossiper& g,
-        db::system_distributed_keyspace& sys_dist_ks,
-        std::chrono::milliseconds ring_delay,
-        bool for_testing) {
-    assert(!bootstrap_tokens.empty());
-
-    auto gen = generate_topology_description(cfg, bootstrap_tokens, tm, g);
-
-    // Begin the race.
-    auto ts = db_clock::now() + (
-            for_testing ? std::chrono::milliseconds(0) : (
-                2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway)));
-    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();
-
-    return ts;
-}
-
-std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
-    auto streams_ts_string = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
-    cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, streams_ts_string);
-
-    if (streams_ts_string.empty()) {
-        return {};
-    }
-
-    return db_clock::time_point(db_clock::duration(std::stoll(streams_ts_string)));
-}
-
-// Run inside seastar::async context.
-static void do_update_streams_description(
-        db_clock::time_point streams_ts,
-        db::system_distributed_keyspace& sys_dist_ks,
-        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
-    }
-
-    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
-
-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
-    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
-    }
-
-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
-    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
-}
-
-void update_streams_description(
-        db_clock::time_point streams_ts,
-        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
-        noncopyable_function<unsigned()> get_num_token_owners,
-        abort_source& abort_src) {
-    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
-    } catch(...) {
-        cdc_log.warn(
-            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
-            streams_ts, std::current_exception());
-
-        // It is safe to discard this future: we keep system distributed keyspace alive.
-        (void)seastar::async([
-            streams_ts, sys_dist_ks, get_num_token_owners = std::move(get_num_token_owners), &abort_src
-        ] {
-            while (true) {
-                sleep_abortable(std::chrono::seconds(60), abort_src).get();
-                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
-                    return;
-                } catch (...) {
-                    cdc_log.warn(
-                        "Could not update CDC description table with generation {}: {}. Will try again.",
-                        streams_ts, std::current_exception());
-                }
-            }
-        });
-    }
-}
-
-} // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -1,176 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-/* This module contains classes and functions used to manage CDC generations:
- * sets of CDC stream identifiers used by the cluster to choose partition keys for CDC log writes.
- * Each CDC generation begins operating at a specific time point, called the generation's timestamp
- * (`cdc_streams_timestamp` or `streams_timestamp` in the code).
- * The generation is used by all nodes in the cluster to pick CDC streams until superseded by a new generation.
- *
- * Functions from this module are used by the node joining procedure to introduce new CDC generations to the cluster
- * (which is necessary due to new tokens being inserted into the token ring), or during rolling upgrade
- * if CDC is enabled for the first time.
- */
-
-#pragma once
-
-#include <vector>
-#include <unordered_set>
-#include <seastar/util/noncopyable_function.hh>
-
-#include "database_fwd.hh"
-#include "db_clock.hh"
-#include "dht/token.hh"
-
-namespace seastar {
-    class abort_source;
-} // namespace seastar
-
-namespace db {
-    class config;
-    class system_distributed_keyspace;
-} // namespace db
-
-namespace gms {
-    class inet_address;
-    class gossiper;
-} // namespace gms
-
-namespace locator {
-    class token_metadata;
-} // namespace locator
-
-namespace cdc {
-
-class stream_id final {
-    bytes _value;
-public:
-    stream_id() = default;
-    stream_id(int64_t, int64_t);
-    stream_id(bytes);
-    bool is_set() const;
-    bool operator==(const stream_id&) const;
-    bool operator<(const stream_id&) const;
-
-    int64_t first() const;
-    int64_t second() const;
-
-    const bytes& to_bytes() const;
-
-    partition_key to_partition_key(const schema& log_schema) const;
-};
-
-/* Describes a mapping of tokens to CDC streams in a token range.
- *
- * The range ends with `token_range_end`. A vector of `token_range_description`s defines the ranges entirely
- * (the end of the `i`th range is the beginning of the `(i+1) % size()`th range). Ranges are left-open, right-closed.
- *
- * Tokens in the range ending with `token_range_end` are mapped to streams in the `streams` vector as follows:
- * token `T` is mapped to `streams[j]` if and only if the used partitioner maps `T` to the `j`th shard,
- * assuming that the partitioner is configured for `streams.size()` shards and (partitioner's) `sharding_ignore_msb`
- * equals to the given `sharding_ignore_msb`.
-*/
-struct token_range_description {
-    dht::token token_range_end;
-    std::vector<stream_id> streams;
-    uint8_t sharding_ignore_msb;
-
-    bool operator==(const token_range_description&) const;
-};
-
-
-/* Describes a mapping of tokens to CDC streams in a whole token ring.
- *
- * Division of the ring to token ranges is defined in terms of `token_range_end`s
- * in the `_entries` vector. See the comment above `token_range_description` for explanation.
- */
-class topology_description {
-    std::vector<token_range_description> _entries;
-public:
-    topology_description(std::vector<token_range_description> entries);
-    bool operator==(const topology_description&) const;
-
-    const std::vector<token_range_description>& entries() const;
-};
-
-/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
- * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
- * that there's a bug, or the user messed with our local tables).
- *
- * It checks whether we should be the node to propose the first generation of CDC streams.
- * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
- * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
- */
-bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
-
-/*
- * Read this node's streams generation timestamp stored in the LOCAL table.
- * Assumes that the node has successfully bootstrapped, and we're not upgrading from a non-CDC version,
- * so the timestamp is present.
- */
-future<db_clock::time_point> get_local_streams_timestamp();
-
-/* Generate a new set of CDC streams and insert it into the distributed cdc_topology_description table.
- * Returns the timestamp of this new generation.
- *
- * Should be called when starting the node for the first time (i.e., joining the ring).
- *
- * Assumes that the system_distributed keyspace is initialized.
- *
- * The caller of this function is expected to insert this timestamp into the gossiper as fast as possible,
- * so that other nodes learn about the generation before their clocks cross the timestamp
- * (not guaranteed in the current implementation, but expected to be the common case;
- *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
- */
-db_clock::time_point make_new_cdc_generation(
-        const db::config& cfg,
-        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
-        const gms::gossiper& g,
-        db::system_distributed_keyspace& sys_dist_ks,
-        std::chrono::milliseconds ring_delay,
-        bool for_testing);
-
-/* Retrieves the CDC streams generation timestamp from the given endpoint's application state (broadcast through gossip).
- * We might be in the middle of a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
- * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
- * which means it will gossip the generation's timestamp.
- */
-std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
-
-/* Inform CDC users about a generation of streams (identified by the given timestamp)
- * by inserting it into the cdc_description table.
- *
- * Assumes that the cdc_topology_description table contains this generation.
- *
- * Returning from this function does not mean that the table update was successful: the function
- * might run an asynchronous task in the background.
- *
- * Run inside seastar::async context.
- */
-void update_streams_description(
-        db_clock::time_point,
-        shared_ptr<db::system_distributed_keyspace>,
-        noncopyable_function<unsigned()> get_num_token_owners,
-        abort_source&);
-
-} // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * This module manages CDC log tables. It contains facilities used to:
- * - perform schema changes to CDC log tables correspondingly when base tables are changed,
- * - perform writes to CDC log tables correspondingly when writes to base tables are made.
- */
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <map>
-#include <string>
-#include <vector>
-
-#include <seastar/core/future.hh>
-#include <seastar/core/lowres_clock.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-
-#include "exceptions/exceptions.hh"
-#include "timestamp.hh"
-#include "tracing/trace_state.hh"
-#include "cdc_options.hh"
-#include "utils/UUID.hh"
-
-class schema;
-using schema_ptr = seastar::lw_shared_ptr<const schema>;
-
-namespace locator {
-
-class token_metadata;
-
-} // namespace locator
-
-namespace service {
-
-class migration_notifier;
-class storage_proxy;
-class query_state;
-
-} // namespace service
-
-class mutation;
-class partition_key;
-
-namespace cdc {
-
-struct operation_result_tracker;
-class db_context;
-class metadata;
-
-/// \brief CDC service, responsible for schema listeners
-///
-/// CDC service will listen for schema changes and, iff CDC is enabled/changed,
-/// create/modify/delete the corresponding log tables etc. as part of the schema change.
-///
-class cdc_service {
-    class impl;
-    std::unique_ptr<impl> _impl;
-public:
-    future<> stop();
-    cdc_service(service::storage_proxy&);
-    cdc_service(db_context);
-    ~cdc_service();
-
-    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
-    // appropriate augments to set the log entries.
-    // Iff post-image is enabled for any of these, a non-empty callback is also
-    // returned to be invoked post the mutation query.
-    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
-        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations,
-        tracing::trace_state_ptr tr_state
-        );
-    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
-};
-
-struct db_context final {
-    service::storage_proxy& _proxy;
-    service::migration_notifier& _migration_notifier;
-    locator::token_metadata& _token_metadata;
-    cdc::metadata& _cdc_metadata;
-
-    class builder final {
-        service::storage_proxy& _proxy;
-        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
-        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
-    public:
-        builder(service::storage_proxy& proxy);
-
-        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(locator::token_metadata& token_metadata);
-        builder& with_cdc_metadata(cdc::metadata&);
-
-        db_context build();
-    };
-};
-
-// cdc log table operation
-enum class operation : int8_t {
-    // note: these values will eventually be read by a third party, probably not privy to this
-    // enum decl, so don't change the constant values (or the datatype).
-    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
-    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
-    post_image = 9,
-};
-
-bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
-seastar::sstring log_name(const seastar::sstring& table_name);
-seastar::sstring log_data_column_name(std::string_view column_name);
-seastar::sstring log_meta_column_name(std::string_view column_name);
-bytes log_data_column_name_bytes(const bytes& column_name);
-bytes log_meta_column_name_bytes(const bytes& column_name);
-
-seastar::sstring log_data_column_deleted_name(std::string_view column_name);
-bytes log_data_column_deleted_name_bytes(const bytes& column_name);
-
-seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
-bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);
-
-utils::UUID generate_timeuuid(api::timestamp_type t);
-
-} // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -1,200 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "dht/token-sharding.hh"
-#include "utils/exceptions.hh"
-#include "exceptions/exceptions.hh"
-
-#include "cdc/generation.hh"
-#include "cdc/metadata.hh"
-
-extern logging::logger cdc_log;
-
-namespace cdc {
-    extern const api::timestamp_clock::duration generation_leeway;
-} // namespace cdc
-
-static api::timestamp_type to_ts(db_clock::time_point tp) {
-    // This assumes that timestamp_clock and db_clock have the same epochs.
-    return std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch()).count();
-}
-
-static cdc::stream_id get_stream(
-        const cdc::token_range_description& entry,
-        dht::token tok) {
-    // The ith stream is the stream for the ith shard.
-    auto shard_cnt = entry.streams.size();
-    auto shard_id = dht::shard_of(shard_cnt, entry.sharding_ignore_msb, tok);
-
-    if (shard_id >= shard_cnt) {
-        on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
-    }
-
-    return entry.streams[shard_id];
-}
-
-static cdc::stream_id get_stream(
-        const std::vector<cdc::token_range_description>& entries,
-        dht::token tok) {
-    if (entries.empty()) {
-        on_internal_error(cdc_log, "get_stream: entries empty");
-    }
-
-    auto it = std::lower_bound(entries.begin(), entries.end(), tok,
-            [] (const cdc::token_range_description& e, dht::token t) { return e.token_range_end < t; });
-    if (it == entries.end()) {
-        it = entries.begin();
-    }
-
-    return get_stream(*it, tok);
-}
-
-cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
-    auto it = _gens.upper_bound(ts);
-    if (it == _gens.begin()) {
-        // All known generations have higher timestamps than `ts`.
-        return _gens.end();
-    }
-
-    return std::prev(it);
-}
-
-cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
-    auto now = api::new_timestamp();
-    if (ts > now + generation_leeway.count()) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
-                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
-                " know what streams will be used at that time.\n"
-                "We *do* allow sending writes into the near future, but our ability to do that is limited."
-                " If you really must use your own timestamps, then make sure your clocks are well-synchronized"
-               "  with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
-        // Note that we might still send a write to a wrong generation, if we learn about the current
-        // generation too late (we might think that an earlier generation is the current one).
-        // Nothing protects us from that until we start using transactions for generation switching.
-    }
-
-    auto it = gen_used_at(now);
-    if (it == _gens.end()) {
-        throw std::runtime_error(format(
-                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
-                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
-    }
-
-    // Garbage-collect generations that will no longer be used.
-    it = _gens.erase(_gens.begin(), it);
-
-    if (it->first > ts) {
-        throw exceptions::invalid_request_exception(format(
-                "cdc: attempted to get a stream from an earlier generation than the currently used one."
-                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
-                " consistency properties (write timestamp: {}, current generation started at: {})",
-                format_timestamp(ts), format_timestamp(it->first)));
-    }
-
-    // With `generation_leeway` we allow sending writes to the near future. It might happen
-    // that `ts` doesn't belong to the current generation ("current" according to our clock),
-    // but to the next generation. Adjust for this case:
-    {
-        auto next_it = std::next(it);
-        while (next_it != _gens.end() && next_it->first <= ts) {
-            it = next_it++;
-        }
-    }
-    // Note: if there is a next generation that `ts` belongs to, but we don't know about it,
-    // then too bad. This is no different from the situation in which we didn't manage to learn
-    // about the current generation in time. We won't be able to prevent it until we introduce transactions.
-
-    if (!it->second) {
-        throw std::runtime_error(format(
-                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
-                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
-                " this generation's data are alive and reachable from this node.", format_timestamp(it->first), format_timestamp(ts)));
-    }
-
-    auto& gen = *it->second;
-    auto ret = ::get_stream(gen.entries(), tok);
-    _last_stream_timestamp = ts;
-    return ret;
-}
-
-bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
-    auto ts = to_ts(tp);
-    auto it = _gens.lower_bound(ts);
-
-    if (it == _gens.end()) {
-        // No known generations with timestamp >= ts.
-        return false;
-    }
-
-    if (it->first == ts) {
-        if (it->second) {
-            // We already inserted this particular generation.
-            return true;
-        }
-        ++it;
-    }
-
-    // Check if some new generation has already superseded this one.
-    return it != _gens.end() && it->first <= api::new_timestamp();
-}
-
-bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
-    if (known_or_obsolete(tp)) {
-        return false;
-    }
-
-    auto now = api::new_timestamp();
-    auto it = gen_used_at(now);
-
-    if (it != _gens.end()) {
-        // Garbage-collect generations that will no longer be used.
-        it = _gens.erase(_gens.begin(), it);
-
-    }
-
-    _gens.insert_or_assign(to_ts(tp), std::move(gen));
-    return true;
-}
-
-bool cdc::metadata::prepare(db_clock::time_point tp) {
-    if (known_or_obsolete(tp)) {
-        return false;
-    }
-
-    auto ts = to_ts(tp);
-    auto emplaced = _gens.emplace(to_ts(tp), std::nullopt).second;
-
-    if (_last_stream_timestamp != api::missing_timestamp) {
-        auto last_correct_gen = gen_used_at(_last_stream_timestamp);
-        if (emplaced && last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
-            cdc_log.error(
-                "just learned about a CDC generation newer than the one used the last time"
-                " streams were retrieved. This generation, or some newer one, should have"
-                " been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
-                " The new generation probably arrived too late due to a network partition"
-                " and we've made a write using the wrong set streams.",
-                format_timestamp(ts), format_timestamp(_last_stream_timestamp));
-        }
-    }
-
-    return emplaced;
-}
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -1,92 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <map>
-
-#include "db_clock.hh"
-#include "timestamp.hh"
-
-namespace dht {
-    class token;
-}
-
-namespace cdc {
-
-class stream_id;
-class topology_description;
-
-/* Represents the node's knowledge about CDC generations used in the cluster.
- * Used during writes to pick streams to which CDC log writes should be sent
- * (i.e., to pick partition keys for these writes).
- */
-class metadata final {
-    // Note: we use db_clock (1ms resolution) for generation timestamps
-    // (because we need to insert them into tables using columns of timestamp types,
-    //  and the native type of our columns' timestamp_type is db_clock::time_point).
-    // On the other hand, timestamp_clock (1us resolution) is used for mutation timestamps,
-    // and api::timestamp_type represents the number of ticks of a timestamp_clock::time_point since epoch.
-
-    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
-    container_t _gens;
-
-    /* The timestamp used in the last successful `get_stream` call. */
-    api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
-
-    container_t::const_iterator gen_used_at(api::timestamp_type ts) const;
-public:
-    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
-    bool known_or_obsolete(db_clock::time_point) const;
-
-    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
-     * according to the generation used at time `ts` (i.e., the latest generation whose timestamp is less than or equal to `ts`).
-     *
-     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
-     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
-     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
-     * by the `cdc::generation_leeway` constant.
-     */
-    stream_id get_stream(api::timestamp_type ts, dht::token tok);
-
-    /* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
-     * if the generation is not already known or older than the currently known ones.
-     *
-     * Returns true if the generation was inserted,
-     * meaning that `get_stream` might return a stream from this generation (at some time points).
-     */
-    bool insert(db_clock::time_point ts, topology_description&& gen);
-
-    /* Prepare for inserting a new generation whose timestamp is `ts`.
-     * This method is not required to be called before `insert`, but it's here
-     * to increase safety of `get_stream` calls in some situations. Use it if you:
-     * 1. know that there is a new generation, but
-     * 2. you didn't yet retrieve the generation's topology_description.
-     *
-     * After preparing a generation, if `get_stream` is supposed to return a stream from this generation
-     * but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
-     *
-     * Returns true iff this generation is not obsolete and wasn't previously prepared nor inserted.
-     */
-    bool prepare(db_clock::time_point ts);
-};
-
-} // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -1,463 +0,0 @@
-/*
- * Copyright (C) 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "mutation.hh"
-#include "schema.hh"
-
-#include "split.hh"
-#include "log.hh"
-
-struct atomic_column_update {
-    column_id id;
-    atomic_cell cell;
-};
-
-// see the comment inside `clustered_row_insert` for motivation for separating
-// nonatomic deletions from nonatomic updates
-struct nonatomic_column_deletion {
-    column_id id;
-    tombstone t;
-};
-
-struct nonatomic_column_update {
-    column_id id;
-    utils::chunked_vector<std::pair<bytes, atomic_cell>> cells;
-};
-
-struct static_row_update {
-    gc_clock::duration ttl;
-    std::vector<atomic_column_update> atomic_entries;
-    std::vector<nonatomic_column_deletion> nonatomic_deletions;
-    std::vector<nonatomic_column_update> nonatomic_updates;
-};
-
-struct clustered_row_insert {
-    gc_clock::duration ttl;
-    clustering_key key;
-    row_marker marker;
-    std::vector<atomic_column_update> atomic_entries;
-    std::vector<nonatomic_column_deletion> nonatomic_deletions;
-    // INSERTs can't express updates of individual cells inside a non-atomic column
-    // (without deleting the entire field first), so there is no `nonatomic_updates` field:
-    // overwriting a nonatomic column inside an INSERT will be split into two changes,
-    // one with a nonatomic deletion, and one with a nonatomic update.
-};
-
-struct clustered_row_update {
-    gc_clock::duration ttl;
-    clustering_key key;
-    std::vector<atomic_column_update> atomic_entries;
-    std::vector<nonatomic_column_deletion> nonatomic_deletions;
-    std::vector<nonatomic_column_update> nonatomic_updates;
-};
-
-struct clustered_row_deletion {
-    clustering_key key;
-    tombstone t;
-};
-
-struct clustered_range_deletion {
-    range_tombstone rt;
-};
-
-struct partition_deletion {
-    tombstone t;
-};
-
-struct batch {
-    std::vector<static_row_update> static_updates;
-    std::vector<clustered_row_insert> clustered_inserts;
-    std::vector<clustered_row_update> clustered_updates;
-    std::vector<clustered_row_deletion> clustered_row_deletions;
-    std::vector<clustered_range_deletion> clustered_range_deletions;
-    std::optional<partition_deletion> partition_deletions;
-};
-
-using set_of_changes = std::map<api::timestamp_type, batch>;
-
-struct row_update {
-    std::vector<atomic_column_update> atomic_entries;
-    std::vector<nonatomic_column_deletion> nonatomic_deletions;
-    std::vector<nonatomic_column_update> nonatomic_updates;
-};
-
-static
-std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
-extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
-    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> result;
-    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        auto& cdef = schema.column_at(ckind, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            auto timestamp_and_ttl = std::pair(
-                    view.timestamp(),
-                    view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0)
-                );
-            result[timestamp_and_ttl].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
-            return;
-        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                auto timestamp_and_ttl = std::pair(
-                        v.timestamp(),
-                        v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0)
-                    );
-                auto& updates = result[timestamp_and_ttl].nonatomic_updates;
-                if (updates.empty() || updates.back().id != id) {
-                    updates.push_back({id, {}});
-                }
-                updates.back().cells.push_back({std::move(k), std::move(v)});
-            }
-
-            if (desc.tomb) {
-                auto timestamp_and_ttl = std::pair(desc.tomb.timestamp, gc_clock::duration(0));
-                result[timestamp_and_ttl].nonatomic_deletions.push_back({id, desc.tomb});
-            }
-        });
-    });
-    return result;
-};
-
-set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
-    set_of_changes res;
-    auto& p = base_mutation.partition();
-
-    auto sr_updates = extract_row_updates(p.static_row().get(), column_kind::static_column, base_schema);
-    for (auto& [k, up]: sr_updates) {
-        auto [timestamp, ttl] = k;
-        res[timestamp].static_updates.push_back({
-                ttl,
-                std::move(up.atomic_entries),
-                std::move(up.nonatomic_deletions),
-                std::move(up.nonatomic_updates)
-            });
-    }
-
-    for (const rows_entry& cr : p.clustered_rows()) {
-        auto cr_updates = extract_row_updates(cr.row().cells(), column_kind::regular_column, base_schema);
-
-        const auto& marker = cr.row().marker();
-        auto marker_timestamp = marker.timestamp();
-        auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
-        if (marker.is_live()) {
-            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
-            (void)cr_updates[std::pair(marker_timestamp, marker_ttl)];
-        }
-
-        auto is_insert = [&] (api::timestamp_type timestamp, gc_clock::duration ttl) {
-            if (!marker.is_live()) {
-                return false;
-            }
-
-            return timestamp == marker_timestamp && ttl == marker_ttl;
-        };
-
-        for (auto& [k, up]: cr_updates) {
-            auto [timestamp, ttl] = k;
-
-            if (is_insert(timestamp, ttl)) {
-                res[timestamp].clustered_inserts.push_back({
-                        ttl,
-                        cr.key(),
-                        marker,
-                        std::move(up.atomic_entries),
-                        std::move(up.nonatomic_deletions)
-                    });
-                if (!up.nonatomic_updates.empty()) {
-                    // nonatomic updates cannot be expressed with an INSERT.
-                    res[timestamp].clustered_updates.push_back({
-                            ttl,
-                            cr.key(),
-                            {},
-                            {},
-                            std::move(up.nonatomic_updates)
-                        });
-                }
-            } else {
-                res[timestamp].clustered_updates.push_back({
-                        ttl,
-                        cr.key(),
-                        std::move(up.atomic_entries),
-                        std::move(up.nonatomic_deletions),
-                        std::move(up.nonatomic_updates)
-                    });
-            }
-        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            res[row_tomb.timestamp].clustered_row_deletions.push_back({cr.key(), row_tomb});
-        }
-    }
-
-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb.timestamp != api::missing_timestamp) {
-            res[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
-        }
-    }
-
-    auto partition_tomb_timestamp = p.partition_tombstone().timestamp;
-    if (partition_tomb_timestamp != api::missing_timestamp) {
-        res[partition_tomb_timestamp].partition_deletions = {p.partition_tombstone()};
-    }
-
-    return res;
-}
-
-namespace cdc {
-
-bool should_split(const mutation& base_mutation, const schema& base_schema) {
-    auto& p = base_mutation.partition();
-
-    api::timestamp_type found_ts = api::missing_timestamp;
-    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
-
-    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
-        if (found_ts != api::missing_timestamp && found_ts != ts) {
-            return true;
-        }
-        found_ts = ts;
-
-        if (found_ttl && *found_ttl != ttl) {
-            return true;
-        }
-        found_ttl = ttl;
-
-        return false;
-    };
-
-    bool had_static_row = false;
-
-    bool should_split = false;
-    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        had_static_row = true;
-
-        auto& cdef = base_schema.column_at(column_kind::static_column, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                should_split = true;
-            }
-            return;
-        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-
-            if (desc.tomb) {
-                if (check_or_set(desc.tomb.timestamp, gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-        });
-    });
-
-    if (should_split) {
-        return true;
-    }
-
-    bool had_clustered_row = false;
-
-    if (!p.clustered_rows().empty() && had_static_row) {
-        return true;
-    }
-    for (const rows_entry& cr : p.clustered_rows()) {
-        had_clustered_row = true;
-
-        const auto& marker = cr.row().marker();
-        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
-            return true;
-        }
-
-        bool is_insert = marker.is_live();
-
-        bool had_cells = false;
-        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-            had_cells = true;
-
-            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
-            if (cdef.is_atomic()) {
-                auto view = cell.as_atomic_cell(cdef);
-                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                }
-                return;
-            }
-
-            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-                for (auto& [k, v]: mview.cells) {
-                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-
-                    if (is_insert) {
-                        // nonatomic updates cannot be expressed with an INSERT.
-                        should_split = true;
-                        return;
-                    }
-                }
-
-                if (mview.tomb) {
-                    if (check_or_set(mview.tomb.timestamp, gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-                }
-            });
-        });
-
-        if (should_split) {
-            return true;
-        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            if (had_cells) {
-                return true;
-            }
-
-            // there were no cells, so no ttl
-            assert(!found_ttl);
-            if (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = row_tomb.timestamp;
-        }
-    }
-
-    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb) {
-            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = rt.tomb.timestamp;
-        }
-    }
-
-    if (p.partition_tombstone().timestamp != api::missing_timestamp
-            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    // A mutation with no timestamp will be split into 0 mutations
-    return found_ts == api::missing_timestamp;
-}
-
-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
-    auto changes = extract_changes(base_mutation, *base_schema);
-    auto pk = base_mutation.key();
-
-    for (auto& [change_ts, btch] : changes) {
-        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
-        int batch_no = 0;
-
-        for (auto& sr_update : btch.static_updates) {
-            mutation m(base_schema, pk);
-            for (auto& atomic_update : sr_update.atomic_entries) {
-                auto& cdef = base_schema->column_at(column_kind::static_column, atomic_update.id);
-                m.set_static_cell(cdef, std::move(atomic_update.cell));
-            }
-            for (auto& nonatomic_delete : sr_update.nonatomic_deletions) {
-                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_delete.id);
-                m.set_static_cell(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
-            }
-            for (auto& nonatomic_update : sr_update.nonatomic_updates) {
-                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_update.id);
-                m.set_static_cell(cdef, collection_mutation_description{{}, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
-            }
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-
-        for (auto& cr_insert : btch.clustered_inserts) {
-            mutation m(base_schema, pk);
-
-            auto& row = m.partition().clustered_row(*base_schema, cr_insert.key);
-            for (auto& atomic_update : cr_insert.atomic_entries) {
-                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
-                row.cells().apply(cdef, std::move(atomic_update.cell));
-            }
-            for (auto& nonatomic_delete : cr_insert.nonatomic_deletions) {
-                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_delete.id);
-                row.cells().apply(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
-            }
-            row.apply(cr_insert.marker);
-
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-
-        for (auto& cr_update : btch.clustered_updates) {
-            mutation m(base_schema, pk);
-
-            auto& row = m.partition().clustered_row(*base_schema, cr_update.key).cells();
-            for (auto& atomic_update : cr_update.atomic_entries) {
-                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
-                row.apply(cdef, std::move(atomic_update.cell));
-            }
-            for (auto& nonatomic_delete : cr_update.nonatomic_deletions) {
-                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_delete.id);
-                row.apply(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
-            }
-            for (auto& nonatomic_update : cr_update.nonatomic_updates) {
-                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_update.id);
-                row.apply(cdef, collection_mutation_description{{}, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
-            }
-
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-
-        for (auto& cr_delete : btch.clustered_row_deletions) {
-            mutation m(base_schema, pk);
-            m.partition().apply_delete(*base_schema, cr_delete.key, cr_delete.t);
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-
-        for (auto& crange_delete : btch.clustered_range_deletions) {
-            mutation m(base_schema, pk);
-            m.partition().apply_delete(*base_schema, crange_delete.rt);
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-
-        if (btch.partition_deletions) {
-            mutation m(base_schema, pk);
-            m.partition().apply(btch.partition_deletions->t);
-            f(std::move(m), change_ts, tuuid, batch_no);
-        }
-    }
-}
-
-} // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <vector>
-#include "schema_fwd.hh"
-#include "timestamp.hh"
-#include "bytes.hh"
-#include <seastar/util/noncopyable_function.hh>
-
-class mutation;
-
-namespace cdc {
-
-bool should_split(const mutation& base_mutation, const schema& base_schema);
-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>);
-
-}
--- a/cdc/stats.hh
+++ b/cdc/stats.hh
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <array>
-#include <cstdint>
-#include <string>
-#include <seastar/core/metrics_registration.hh>
-#include "enum_set.hh"
-#include "utils/histogram.hh"
-#include "utils/estimated_histogram.hh"
-
-namespace cdc {
-
-class stats final {
-    seastar::metrics::metric_groups _metrics;
-
-public:
-    enum class part_type {
-        STATIC_ROW,
-        CLUSTERING_ROW,
-        MAP,
-        SET,
-        LIST,
-        UDT,
-        RANGE_TOMBSTONE,
-        PARTITION_DELETE,
-        ROW_DELETE,
-
-        MAX
-    };
-
-    using part_type_set = enum_set<super_enum<part_type,
-        part_type::STATIC_ROW,
-        part_type::CLUSTERING_ROW,
-        part_type::MAP,
-        part_type::SET,
-        part_type::LIST,
-        part_type::UDT,
-        part_type::RANGE_TOMBSTONE,
-        part_type::PARTITION_DELETE,
-        part_type::ROW_DELETE
-    >>;
-
-    struct parts_touched_stats final {
-        std::array<uint64_t, (size_t)part_type::MAX> count = {};
-
-        inline void apply(part_type_set parts_set) {
-            for (part_type idx : parts_set) {
-                count[(size_t)idx]++;
-            }
-        }
-
-        void register_metrics(seastar::metrics::metric_groups& metrics, std::string_view suffix);
-    };
-
-    struct counters final {
-        uint64_t unsplit_count = 0;
-        uint64_t split_count = 0;
-        uint64_t preimage_selects = 0;
-        uint64_t with_preimage_count = 0;
-        uint64_t with_postimage_count = 0;
-
-        parts_touched_stats touches;
-    };
-
-    counters counters_total;
-    counters counters_failed;
-
-    stats();
-};
-
-// Contains the details on what happened during a CDC operation.
-struct operation_details final {
-    stats::part_type_set touched_parts;
-    bool was_split = false;
-    bool had_preimage = false;
-    bool had_postimage = false;
-};
-
-// This object tracks the lifetime of write handlers related to one CDC operation. After all
-// write handlers for the operation finish, CDC metrics are updated.
-class operation_result_tracker final {
-    stats& _stats;
-    operation_details _details;
-    bool _failed;
-
-public:
-    operation_result_tracker(stats& stats, operation_details details)
-        : _stats(stats)
-        , _details(details)
-        , _failed(false)
-    {}
-    ~operation_result_tracker();
-
-    void on_mutation_failed() {
-        _failed = true;
-    }
-};
-
-}
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -22,10 +22,7 @@
 #pragma once

 #include "seastar/core/file.hh"
-#include "seastar/core/reactor.hh"
-#include "utils/disk-error-handler.hh"
-
-#include "seastarx.hh"
+#include "disk-error-handler.hh"

 class checked_file_impl : public file_impl {
 public:
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -19,23 +19,6 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include <seastar/core/print.hh>
-
-#include "db_clock.hh"
-#include "timestamp.hh"
-
 #include "clocks-impl.hh"

 std::atomic<int64_t> clocks_offset;
-
-std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
-    auto t = db_clock::to_time_t(tp);
-    ::tm t_buf;
-    return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
-}
-
-std::string format_timestamp(api::timestamp_type ts) {
-    auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
-    ::tm t_buf;
-    return format("{}", std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T"));
-}
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -24,7 +24,7 @@

 #include <functional>
 #include "keys.hh"
-#include "schema_fwd.hh"
+#include "schema.hh"
 #include "range.hh"

 /**
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -1,134 +0,0 @@
-/*
- * Copyright (C) 2020 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "schema_fwd.hh"
-#include "position_in_partition.hh"
-#include <boost/icl/interval_set.hpp>
-
-// Represents a non-contiguous subset of clustering_key domain of a particular schema.
-// Can be treated like an ordered and non-overlapping sequence of position_range:s.
-class clustering_interval_set {
-    // Needed to make position_in_partition comparable, required by boost::icl::interval_set.
-    class position_in_partition_with_schema {
-        schema_ptr _schema;
-        position_in_partition _pos;
-    public:
-        position_in_partition_with_schema()
-            : _pos(position_in_partition::for_static_row())
-        { }
-        position_in_partition_with_schema(schema_ptr s, position_in_partition pos)
-            : _schema(std::move(s))
-            , _pos(std::move(pos))
-        { }
-        bool operator<(const position_in_partition_with_schema& other) const {
-            return position_in_partition::less_compare(*_schema)(_pos, other._pos);
-        }
-        bool operator==(const position_in_partition_with_schema& other) const {
-            return position_in_partition::equal_compare(*_schema)(_pos, other._pos);
-        }
-        const position_in_partition& position() const { return _pos; }
-    };
-private:
-    // We want to represent intervals of clustering keys, not position_in_partitions,
-    // but clustering_key domain is not enough to represent all kinds of clustering ranges.
-    // All intervals in this set are of the form [x, y).
-    using set_type = boost::icl::interval_set<position_in_partition_with_schema>;
-    using interval = boost::icl::interval<position_in_partition_with_schema>;
-    set_type _set;
-public:
-    clustering_interval_set() = default;
-    // Constructs from legacy clustering_row_ranges
-    clustering_interval_set(const schema& s, const query::clustering_row_ranges& ranges) {
-        for (auto&& r : ranges) {
-            add(s, position_range::from_range(r));
-        }
-    }
-    query::clustering_row_ranges to_clustering_row_ranges() const {
-        query::clustering_row_ranges result;
-        for (position_range r : *this) {
-            result.push_back(query::clustering_range::make(
-                {r.start().key(), r.start()._bound_weight != bound_weight::after_all_prefixed},
-                {r.end().key(), r.end()._bound_weight == bound_weight::after_all_prefixed}));
-        }
-        return result;
-    }
-    class position_range_iterator : public std::iterator<std::input_iterator_tag, const position_range> {
-        set_type::iterator _i;
-    public:
-        position_range_iterator(set_type::iterator i) : _i(i) {}
-        position_range operator*() const {
-            // FIXME: Produce position_range view. Not performance critical yet.
-            const interval::interval_type& iv = *_i;
-            return position_range{iv.lower().position(), iv.upper().position()};
-        }
-        bool operator==(const position_range_iterator& other) const { return _i == other._i; }
-        bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
-        position_range_iterator& operator++() {
-            ++_i;
-            return *this;
-        }
-        position_range_iterator operator++(int) {
-            auto tmp = *this;
-            ++_i;
-            return tmp;
-        }
-    };
-    static interval::type make_interval(const schema& s, const position_range& r) {
-        assert(r.start().has_clustering_key());
-        assert(r.end().has_clustering_key());
-        return interval::right_open(
-            position_in_partition_with_schema(s.shared_from_this(), r.start()),
-            position_in_partition_with_schema(s.shared_from_this(), r.end()));
-    }
-public:
-    bool equals(const schema& s, const clustering_interval_set& other) const {
-        return boost::equal(_set, other._set);
-    }
-    bool contains(const schema& s, position_in_partition_view pos) const {
-        // FIXME: Avoid copy
-        return _set.find(position_in_partition_with_schema(s.shared_from_this(), position_in_partition(pos))) != _set.end();
-    }
-    // Returns true iff this set is fully contained in the other set.
-    bool contained_in(clustering_interval_set& other) const {
-        return boost::icl::within(_set, other._set);
-    }
-    bool overlaps(const schema& s, const position_range& range) const {
-        // FIXME: Avoid copy
-        auto r = _set.equal_range(make_interval(s, range));
-        return r.first != r.second;
-    }
-    // Adds given clustering range to this interval set.
-    // The range may overlap with this set.
-    void add(const schema& s, const position_range& r) {
-        _set += make_interval(s, r);
-    }
-    void add(const schema& s, const clustering_interval_set& other) {
-        for (auto&& r : other) {
-            add(s, r);
-        }
-    }
-    position_range_iterator begin() const { return {_set.begin()}; }
-    position_range_iterator end() const { return {_set.end()}; }
-    friend std::ostream& operator<<(std::ostream&, const clustering_interval_set&);
-};
-
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -23,7 +23,7 @@

 #pragma once

-#include "schema_fwd.hh"
+#include "schema.hh"
 #include "query-request.hh"

 namespace query {
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -21,8 +21,6 @@

 #pragma once

-#include <json/json.h>
-
 #include "bytes.hh"

 class schema;
--- a/combine.hh
+++ b/combine.hh
@@ -21,8 +21,6 @@

 #pragma once

-#include <algorithm>
-
 // combine two sorted uniqued sequences into a single sorted sequence
 // unique elements are copied, duplicate elements are merged with a
 // binary function.
--- a/compaction_garbage_collector.hh
+++ b/compaction_garbage_collector.hh
@@ -21,6 +21,7 @@

 #pragma once

+#include "schema.hh"
 #include "collection_mutation.hh"

 class atomic_cell;
--- a/Show More
+++ b/Show More