release: prepare for 4.0.11

Update seastar submodule
* seastar 065a40b34a...748428930a (1): > append_challenged_posix_file_impl: allow destructing file with no queued work Fixes #7285.
2020-10-26 18:12:47 +02:00 · 2020-10-19 15:06:24 +03:00 · 2020-10-19 15:05:13 +03:00 · 2020-10-18 15:03:04 +03:00 · 2020-10-06 17:12:28 +03:00 · 2020-10-04 18:05:00 +03:00
3672 changed files with 43781 additions and 17727 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,4 @@
 .git
 build
 seastar/build
+testlog
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -15,3 +15,6 @@
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,13 +5,25 @@
 cmake_minimum_required(VERSION 3.7)
 project(scylla)

+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "Dev" "Sanitize")
+endif()
+
+if(CMAKE_BUILD_TYPE)
+    string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE)
+else()
+    set(BUILD_TYPE "release")
+endif()
+
 if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
 endif()

-# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-set(SEASTAR_INCLUDE_DIRS "seastar")
-
 # These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
 # Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
 set(SEASTAR_DPDK_INCLUDE_DIRS
@@ -22,9 +34,14 @@ set(SEASTAR_DPDK_INCLUDE_DIRS

 find_package(PkgConfig REQUIRED)

-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
 pkg_check_modules(SEASTAR seastar)

+if(NOT SEASTAR_INCLUDE_DIRS)
+    # Fallback value, used only when the `pkg-config` lookup above did not find `seastar.pc` and left the list empty.
+    set(SEASTAR_INCLUDE_DIRS "seastar/include")
+endif()
+
 find_package(Boost COMPONENTS filesystem program_options system thread)

 ##
@@ -70,7 +87,7 @@ scan_scylla_source_directories(
          seastar/json
          seastar/net
          seastar/rpc
-          seastar/tests
+          seastar/testing
          seastar/util)

 scan_scylla_source_directories(
@@ -106,7 +123,7 @@ scan_scylla_source_directories(
 scan_scylla_source_directories(
        VAR SCYLLA_GEN_SOURCE_FILES
        RECURSIVE
-        PATHS build/release/gen)
+        PATHS build/${BUILD_TYPE}/gen)

 set(SCYLLA_SOURCE_FILES
        ${SCYLLA_ROOT_SOURCE_FILES}
@@ -139,4 +156,4 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
-        build/release/gen)
+        build/${BUILD_TYPE}/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -141,7 +141,7 @@ In v3:
 "Tests: unit ({mode}), dtest ({smp})"
 ```

-The usual is "Tests: unit (release)", although running debug tests is encouraged.
+The usual is "Tests: unit (dev)", although running debug tests is encouraged.

 5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.

--- a/README.md
+++ b/README.md
@@ -38,6 +38,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev
 ./build/release/scylla --help
 ```

+## Testing
+
+See [test.py manual](docs/testing.md).
+
 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
 Thrift. There is also experimental support for the API of Amazon DynamoDB,
@@ -56,31 +60,12 @@ both.
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

-## Building Fedora RPM
+## Training

-As a pre-requisite, you need to install [Mock](https://fedoraproject.org/wiki/Mock) on your machine:
-
-```
-# Install mock:
-sudo yum install mock
-
-# Add user to the "mock" group:
-usermod -a -G mock $USER && newgrp mock
-```
-
-Then, to build an RPM, run:
-
-```
-./dist/redhat/build_rpm.sh
-```
-
-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
-For example, on Fedora 21 mock reports the following:
-
-```
-INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
-```
+Training material and online courses can be found at [Scylla University](https://university.scylladb.com/).
+The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling,
+administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions,
+multi-datacenters and how Scylla integrates with third-party applications.

 ## Building Fedora-based Docker image

--- a/10
+++ b/10
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.0.11

 if test -f version
 then
@@ -19,6 +19,14 @@ else
 	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

+if [ -f build/SCYLLA-RELEASE-FILE ]; then
+	RELEASE_FILE=$(cat build/SCYLLA-RELEASE-FILE)
+	GIT_COMMIT_FILE=$(cat build/SCYLLA-RELEASE-FILE |cut -d . -f 3)
+	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
+		exit 0
+	fi
+fi
+
 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p build
 echo "$SCYLLA_VERSION" > build/SCYLLA-VERSION-FILE
--- a/1
+++ b/1
--- a/alternator-test/test_item.py
+++ b/alternator-test/test_item.py
@@ -1,402 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the CRUD item operations: PutItem, GetItem, UpdateItem, DeleteItem
-
-import pytest
-from botocore.exceptions import ClientError
-from decimal import Decimal
-from util import random_string, random_bytes
-
-# Basic test for creating a new item with a random name, and reading it back
-# with strong consistency.
-# Only the string type is used for keys and attributes. None of the various
-# optional PutItem features (Expected, ReturnValues, ReturnConsumedCapacity,
-# ReturnItemCollectionMetrics, ConditionalOperator, ConditionExpression,
-# ExpressionAttributeNames, ExpressionAttributeValues) are used, and
-# for GetItem strong consistency is requested as well as all attributes,
-# but no other optional features (AttributesToGet, ReturnConsumedCapacity,
-# ProjectionExpression, ExpressionAttributeNames)
-def test_basic_string_put_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': val, 'another': val2})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Similar to test_basic_string_put_and_get, just uses UpdateItem instead of
-# PutItem. Because the item does not yet exist, it should work the same.
-def test_basic_string_update_and_get(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    val2 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'PUT'}, 'another': {'Value': val2, 'Action': 'PUT'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['attribute'] == val
-    assert item['another'] == val2
-
-# Test put_item and get_item of various types for the *attributes*,
-# including both scalars as well as nested documents, lists and sets.
-# The full list of types tested here:
-#    number, boolean, bytes, null, list, map, string set, number set,
-#    binary set.
-# The keys are still strings.
-# Note that only top-level attributes are written and read in this test -
-# this test does not attempt to modify *nested* attributes.
-# See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/dynamodb.html
-# on how to pass these various types to Boto3's put_item().
-def test_put_and_get_attribute_types(test_table):
-    key = {'p': random_string(), 'c': random_string()}
-    test_items = [
-        Decimal("12.345"),
-        42,
-        True,
-        False,
-        b'xyz',
-        None,
-        ['hello', 'world', 42],
-        {'hello': 'world', 'life': 42},
-        {'hello': {'test': 'hi', 'hello': True, 'list': [1, 2, 'hi']}},
-        set(['hello', 'world', 'hi']),
-        set([1, 42, Decimal("3.14")]),
-        set([b'xyz', b'hi']),
-    ]
-    item = { str(i) : test_items[i] for i in range(len(test_items)) }
-    item.update(key)
-    test_table.put_item(Item=item)
-    got_item = test_table.get_item(Key=key, ConsistentRead=True)['Item']
-    assert item == got_item
-
-# The test_empty_* tests below verify support for empty items, with no
-# attributes except the key. This is a difficult case for Scylla, because
-# for an empty row to exist, Scylla needs to add a "CQL row marker".
-# There are several ways to create empty items - via PutItem, UpdateItem
-# and deleting attributes from non-empty items, and we need to check them
-# all, in several test_empty_* tests:
-def test_empty_put(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_put_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'hello': 'world'})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-def test_empty_update_delete(test_table):
-    p = random_string()
-    c = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Value': 'world', 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'hello': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item == {'p': p, 'c': c}
-
-# Test error handling of UpdateItem passed a bad "Action" field.
-def test_update_bad_action(test_table):
-    p = random_string()
-    c = random_string()
-    val = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'attribute': {'Value': val, 'Action': 'NONEXISTENT'}})
-
-# A more elaborate UpdateItem test, updating different attributes at different
-# times. Includes PUT and DELETE operations.
-def test_basic_string_more_update(test_table):
-    p = random_string()
-    c = random_string()
-    val1 = random_string()
-    val2 = random_string()
-    val3 = random_string()
-    val4 = random_string()
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val1, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a2': {'Value': val2, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a1': {'Value': val3, 'Action': 'PUT'}})
-    test_table.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'a3': {'Action': 'DELETE'}})
-    item = test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
-    assert item['p'] == p
-    assert item['c'] == c
-    assert item['a1'] == val3
-    assert item['a2'] == val2
-    assert not 'a3' in item
-
-# Test that item operations on a non-existant table name fail with correct
-# error code.
-def test_item_operations_nonexistent_table(dynamodb):
-    with pytest.raises(ClientError, match='ResourceNotFoundException'):
-        dynamodb.meta.client.put_item(TableName='non_existent_table',
-            Item={'a':{'S':'b'}})
-
-# Fetching a non-existant item. According to the DynamoDB doc, "If there is no
-# matching item, GetItem does not return any data and there will be no Item
-# element in the response."
-def test_get_item_missing_item(test_table):
-    p = random_string()
-    c = random_string()
-    assert not "Item" in test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)
-
-# Test that if we have a table with string hash and sort keys, we can't read
-# or write items with other key types to it.
-def test_put_item_wrong_key_type(test_table):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.put_item(Item={'p': s, 'c': s})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.put_item(Item={'p': s})
-def test_update_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.update_item(Key={'p': s, 'c': s}, AttributeUpdates={})
-    assert test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)['Item'] == {'p': s, 'c': s}
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': b, 'c': s}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': n, 'c': s}, AttributeUpdates={})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': b}, AttributeUpdates={})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s, 'c': n}, AttributeUpdates={})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'c': s}, AttributeUpdates={})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.update_item(Key={'p': s}, AttributeUpdates={})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_get_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types) but have empty result
-    assert not "Item" in test_table.get_item(Key={'p': s, 'c': s}, ConsistentRead=True)
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.get_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.get_item(Key={'p': s, 'c': s})
-def test_delete_item_wrong_key_type(test_table, test_table_s):
-    b = random_bytes()
-    s = random_string()
-    n = Decimal("3.14")
-    # Should succeed (correct key types)
-    test_table.delete_item(Key={'p': s, 'c': s})
-    # Should fail (incorrect hash key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': b, 'c': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': n, 'c': s})
-    # Should fail (incorrect sort key types)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': b})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': n})
-    # Should fail (missing hash key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'c': s})
-    # Should fail (missing sort key)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s})
-    # Should fail (spurious key columns)
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table.delete_item(Key={'p': s, 'c': s, 'spurious': s})
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': s, 'c': s})
-
-# Most of the tests here arbitrarily used a table with both hash and sort keys
-# (both strings). Let's check that a table with *only* a hash key works ok
-# too, for PutItem, GetItem, and UpdateItem.
-def test_only_hash_key(test_table_s):
-    s = random_string()
-    test_table_s.put_item(Item={'p': s, 'hello': 'world'})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world'}
-    test_table_s.update_item(Key={'p': s}, AttributeUpdates={'hi': {'Value': 'there', 'Action': 'PUT'}})
-    assert test_table_s.get_item(Key={'p': s}, ConsistentRead=True)['Item'] == {'p': s, 'hello': 'world', 'hi': 'there'}
-
-# Tests for item operations in tables with non-string hash or sort keys.
-# These tests focus only on the type of the key - everything else is as
-# simple as we can (string attributes, no special options for GetItem
-# and PutItem). These tests also focus on individual items only, and
-# not about the sort order of sort keys - this should be verified in
-# test_query.py, for example.
-def test_bytes_hash_key(test_table_b):
-    # Bytes values are passed using base64 encoding, which has weird cases
-    # depending on len%3 and len%4. So let's try various lengths.
-    for len in range(10,18):
-        p = random_bytes(len)
-        val = random_string()
-        test_table_b.put_item(Item={'p': p, 'attribute': val})
-        assert test_table_b.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'attribute': val}
-def test_bytes_sort_key(test_table_sb):
-    p = random_string()
-    c = random_bytes()
-    val = random_string()
-    test_table_sb.put_item(Item={'p': p, 'c': c, 'attribute': val})
-    assert test_table_sb.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': val}
-
-# Tests for using a large binary blob as hash key, sort key, or attribute.
-# DynamoDB strictly limits the size of the binary hash key to 2048 bytes,
-# and binary sort key to 1024 bytes, and refuses anything larger. The total
-# size of an item is limited to 400KB, which also limits the size of the
-# largest attributes. For more details on these limits, see
-# https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html
-# Alternator currently does *not* have these limitations, and can accept much
-# larger keys and attributes, but what we do in the following tests is to verify
-# that items up to DynamoDB's maximum sizes also work well in Alternator.
-def test_large_blob_hash_key(test_table_b):
-    b = random_bytes(2048)
-    test_table_b.put_item(Item={'p': b})
-    assert test_table_b.get_item(Key={'p': b}, ConsistentRead=True)['Item'] == {'p': b}
-def test_large_blob_sort_key(test_table_sb):
-    s = random_string()
-    b = random_bytes(1024)
-    test_table_sb.put_item(Item={'p': s, 'c': b})
-    assert test_table_sb.get_item(Key={'p': s, 'c': b}, ConsistentRead=True)['Item'] == {'p': s, 'c': b}
-def test_large_blob_attribute(test_table):
-    p = random_string()
-    c = random_string()
-    b = random_bytes(409500)  # a bit less than 400KB
-    test_table.put_item(Item={'p': p, 'c': c, 'attribute': b })
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'attribute': b}
-
-# Checks what it is not allowed to use in a single UpdateItem request both
-# old-style AttributeUpdates and new-style UpdateExpression.
-def test_update_item_two_update_methods(test_table_s):
-    p = random_string()
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p},
-            AttributeUpdates={'a': {'Value': 3, 'Action': 'PUT'}},
-            UpdateExpression='SET b = :val1',
-            ExpressionAttributeValues={':val1': 4})
-
-# Verify that having neither AttributeUpdates nor UpdateExpression is
-# allowed, and results in creation of an empty item.
-def test_update_item_no_update_method(test_table_s):
-    p = random_string()
-    assert not "Item" in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.update_item(Key={'p': p})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p}
-
-# Test GetItem with the AttributesToGet parameter. Result should include the
-# selected attributes only - if one wants the key attributes as well, one
-# needs to select them explicitly. When no key attributes are selected,
-# some items may have *none* of the selected attributes. Those items are
-# returned too, as empty items - they are not outright missing.
-def test_getitem_attributes_to_get(dynamodb, test_table):
-    p = random_string()
-    c = random_string()
-    item = {'p': p, 'c': c, 'a': 'hello', 'b': 'hi'}
-    test_table.put_item(Item=item)
-    for wanted in [ ['a'],             # only non-key attribute
-                    ['c', 'a'],        # a key attribute (sort key) and non-key
-                    ['p', 'c'],        # entire key
-                    ['nonexistent']    # Our item doesn't have this
-                   ]:
-        got_item = test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=wanted, ConsistentRead=True)['Item']
-        expected_item = {k: item[k] for k in wanted if k in item}
-        assert expected_item == got_item
-
-# Basic test for DeleteItem, with hash key only
-def test_delete_item_hash(test_table_s):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p})
-    assert 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-    test_table_s.delete_item(Key={'p': p})
-    assert not 'Item' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
-
-# Basic test for DeleteItem, with hash and sort key
-def test_delete_item_sort(test_table):
-    p = random_string()
-    c = random_string()
-    key = {'p': p, 'c': c}
-    test_table.put_item(Item=key)
-    assert 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-    test_table.delete_item(Key=key)
-    assert not 'Item' in test_table.get_item(Key=key, ConsistentRead=True)
-
-# Test that PutItem completely replaces an existing item. It shouldn't merge
-# it with a previously existing value, as UpdateItem does!
-# We test for a table with just hash key, and for a table with both hash and
-# sort keys.
-def test_put_item_replace(test_table_s, test_table):
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'a': 'hi'}
-    test_table_s.put_item(Item={'p': p, 'b': 'hello'})
-    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == {'p': p, 'b': 'hello'}
-    c = random_string()
-    test_table.put_item(Item={'p': p, 'c': c, 'a': 'hi'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'a': 'hi'}
-    test_table.put_item(Item={'p': p, 'c': c, 'b': 'hello'})
-    assert test_table.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item'] == {'p': p, 'c': c, 'b': 'hello'}
--- a/alternator-test/test_returnvalues.py
+++ b/alternator-test/test_returnvalues.py
@@ -1,226 +0,0 @@
-# Copyright 2019 ScyllaDB
-#
-# This file is part of Scylla.
-#
-# Scylla is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Scylla is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
-
-# Tests for the ReturnValues parameter for the different update operations
-# (PutItem, UpdateItem, DeleteItem).
-
-import pytest
-from botocore.exceptions import ClientError
-from util import random_string
-
-# Test trivial support for the ReturnValues parameter in PutItem, UpdateItem
-# and DeleteItem - test that "NONE" works (and changes nothing), while a
-# completely unsupported value gives an error.
-# This test is useful to check that before the ReturnValues parameter is fully
-# implemented, it returns an error when a still-unsupported ReturnValues
-# option is attempted in the request - instead of simply being ignored.
-def test_trivial_returnvalues(test_table_s):
-    # PutItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
-    # UpdateItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
-    # DeleteItem:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
-
-# Test the ReturnValues parameter on a PutItem operation. Only two settings
-# are supported for this parameter for this operation: NONE (the default)
-# and ALL_OLD.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_put_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'})
-    assert not 'Attributes' in ret
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    # With ReturnValues=ALL_OLD, the old value of the item is returned
-    # in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_OLD')
-    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
-    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
-    # are supported by other operations but not by PutItem:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_OLD')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='ALL_NEW')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='UPDATED_NEW')
-    # Also, obviously, a non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='DOG')
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.put_item(Item={'p': p, 'a': 'hello'}, ReturnValues='none')
-
-# Test the ReturnValues parameter on a DeleteItem operation. Only two settings
-# are supported for this parameter for this operation: NONE (the default)
-# and ALL_OLD.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_delete_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p})
-    assert not 'Attributes' in ret
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='NONE')
-    assert not 'Attributes' in ret
-    # With ReturnValues=ALL_OLD, the old value of the item is returned
-    # in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi'})
-    ret=test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_OLD')
-    assert ret['Attributes'] == {'p': p, 'a': 'hi'}
-    # Other ReturnValue options - UPDATED_OLD, ALL_NEW, UPDATED_NEW,
-    # are supported by other operations but not by PutItem:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_OLD')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_NEW')
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_NEW')
-    # Also, obviously, a non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.delete_item(Key={'p': p}, ReturnValues='none')
-
-# Test the ReturnValues parameter on a UpdateItem operation. All five
-# settings are supported for this parameter for this operation: NONE
-# (the default), ALL_OLD, UPDATED_OLD, ALL_NEW and UPDATED_NEW.
-@pytest.mark.xfail(reason="ReturnValues not supported")
-def test_update_item_returnvalues(test_table_s):
-    # By default, the previous value of an item is not returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p},
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-
-    # Using ReturnValues=NONE is the same:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='NONE',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert not 'Attributes' in ret
-
-    # With ReturnValues=ALL_OLD, the entire old value of the item (even
-    # attributes we did not modify) is returned in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_OLD',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'dog'}
-
-    # With ReturnValues=UPDATED_OLD, only the overwritten attributes of the
-    # old item are returned in an "Attributes" attribute:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='SET b = :val, c = :val2',
-        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
-    assert ret['Attributes'] == {'b': 'dog'}
-    # Even if an update overwrites an attribute by the same value again,
-    # this is considered an update, and the old value (identical to the
-    # new one) is returned:
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'b': 'cat'}
-    # Deleting an attribute also counts as overwriting it, of course:
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_OLD',
-        UpdateExpression='REMOVE b')
-    assert ret['Attributes'] == {'b': 'cat'}
-
-    # With ReturnValues=ALL_NEW, the entire new value of the item (including
-    # old attributes we did not modify) is returned:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
-        UpdateExpression='SET b = :val',
-        ExpressionAttributeValues={':val': 'cat'})
-    assert ret['Attributes'] == {'p': p, 'a': 'hi', 'b': 'cat'}
-
-    # With ReturnValues=UPDATED_NEW, only the new value of the updated
-    # attributes are returned. Note that "updated attributes" means
-    # the newly set attributes - it doesn't require that these attributes
-    # have any previous values
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 'hi', 'b': 'dog'})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='SET b = :val, c = :val2',
-        ExpressionAttributeValues={':val': 'cat', ':val2': 'hello'})
-    assert ret['Attributes'] == {'b': 'cat', 'c': 'hello'}
-    # Deleting an attribute also counts as overwriting it, but the delete
-    # column is not returned in the response - so it's empty in this case.
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='REMOVE b')
-    assert not 'Attributes' in ret
-    # In the above examples, UPDATED_NEW is not useful because it just
-    # returns the new values we already know from the request... UPDATED_NEW
-    # becomes more useful in read-modify-write operations:
-    p = random_string()
-    test_table_s.put_item(Item={'p': p, 'a': 1})
-    ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
-        UpdateExpression='SET a = a + :val',
-        ExpressionAttributeValues={':val': 1})
-    assert ret['Attributes'] == {'a': 2}
-
-    # A non-supported setting "DOG" also returns in error:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='DOG',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
-    # The ReturnValues value is case sensitive, so while "NONE" is supported
-    # (and tested above), "none" isn't:
-    with pytest.raises(ClientError, match='ValidationException'):
-        test_table_s.update_item(Key={'p': p}, ReturnValues='none',
-            UpdateExpression='SET a = a + :val',
-            ExpressionAttributeValues={':val': 1})
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
    time_t time_point_repr = db_clock::to_time_t(tp);
    std::string time_point_str;
    time_point_str.resize(17);
+    ::tm time_buf;
    // strftime prints the terminating null character as well
-    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
+    std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
    time_point_str.resize(16);
    return time_point_str;
 }
@@ -129,7 +130,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us

    auto cl = auth::password_authenticator::consistency_for_user(username);
    auto timeout = auth::internal_distributed_timeout_config();
-    return qp.process(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
+    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -30,6 +30,11 @@
 #include "serialization.hh"
 #include "base64.hh"
 #include <stdexcept>
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include "utils/overloaded_functor.hh"
+
+#include "expressions_eval.hh"

 namespace alternator {

@@ -71,7 +76,7 @@ static ::shared_ptr<cql3::restrictions::single_column_restriction::contains> mak
 }

 static ::shared_ptr<cql3::restrictions::single_column_restriction::EQ> make_key_eq_restriction(const column_definition& cdef, const rjson::value& value) {
-    bytes raw_value = get_key_from_typed_value(value, cdef, type_to_string(cdef.type));
+    bytes raw_value = get_key_from_typed_value(value, cdef);
    auto restriction_value = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(std::move(raw_value)));
    return make_shared<cql3::restrictions::single_column_restriction::EQ>(cdef, std::move(restriction_value));
 }
@@ -225,16 +230,12 @@ static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
    }
 }

-static std::string_view to_string_view(const rjson::value& v) {
-    return std::string_view(v.GetString(), v.GetStringLength());
-}
-
 static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
    return (type2 == "S" && type1 == "SS") || (type2 == "N" && type1 == "NS") || (type2 == "B" && type1 == "BS");
 }

 // Check if two JSON-encoded values match with the CONTAINS relation
-static bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
    if (!v1) {
        return false;
    }
@@ -246,7 +247,7 @@ static bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
                               "got {} instead", kv2.name));
    }
    if (kv1.name == "S" && kv2.name == "S") {
-        return to_string_view(kv1.value).find(to_string_view(kv2.value)) != std::string_view::npos;
+        return rjson::to_string_view(kv1.value).find(rjson::to_string_view(kv2.value)) != std::string_view::npos;
    } else if (kv1.name == "B" && kv2.name == "B") {
        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
    } else if (is_set_of(kv1.name, kv2.name)) {
@@ -306,6 +307,19 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    return have_match;
 }

+// Another variant of check_IN, this one for ConditionExpression. It needs to
+// check whether the first element in the given vector is equal to any of the
+// others.
+static bool check_IN(const std::vector<rjson::value>& array) {
+    const rjson::value* first = &array[0];
+    for (unsigned i = 1; i < array.size(); i++) {
+        if (check_EQ(first, array[i])) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static bool check_NULL(const rjson::value* val) {
    return val == nullptr;
 }
@@ -351,31 +365,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara

 struct cmp_lt {
    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a dedicated overload for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
    static constexpr const char* diagnostic = "LT operator";
 };

 struct cmp_le {
-    // bytes only has <, so we cannot use <=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
    static constexpr const char* diagnostic = "LE operator";
 };

 struct cmp_ge {
-    // bytes only has <, so we cannot use >=.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
    static constexpr const char* diagnostic = "GE operator";
 };

 struct cmp_gt {
-    // bytes only has <, so we cannot use >.
-    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
    static constexpr const char* diagnostic = "GT operator";
 };

 // True if v is between lb and ub, inclusive.  Throws if lb > ub.
 template <typename T>
 bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
-    if (ub < lb) {
+    if (cmp_lt()(ub, lb)) {
        throw api_error("ValidationException",
                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
    }
@@ -505,16 +523,15 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
    }
 }

-// Verify that the existing values of the item (previous_item) match the
+// Check if the existing values of the item (previous_item) match the
 // conditions given by the Expected and ConditionalOperator parameters
 // (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
-// This function will throw a ConditionalCheckFailedException API error
-// if the values do not match the condition, or ValidationException if there
+// This function can throw a ValidationException API error if there
 // are errors in the format of the condition itself.
-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
    const rjson::value* expected = rjson::find(req, "Expected");
    if (!expected) {
-        return;
+        return true;
    }
    if (!expected->IsObject()) {
        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
@@ -543,22 +560,123 @@ void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value
    for (auto it = expected->MemberBegin(); it != expected->MemberEnd(); ++it) {
        const rjson::value* got = nullptr;
        if (previous_item && previous_item->IsObject() && previous_item->HasMember("Item")) {
-            got = rjson::find((*previous_item)["Item"], rjson::string_ref_type(it->name.GetString()));
+            got = rjson::find((*previous_item)["Item"], rjson::to_string_view(it->name));
        }
        bool success = verify_expected_one(it->value, got);
        if (success && !require_all) {
            // When !require_all, one success is enough!
-            return;
+            return true;
        } else if (!success && require_all) {
            // When require_all, one failure is enough!
-            throw api_error("ConditionalCheckFailedException", "Failed condition.");
+            return false;
        }
    }
    // If we got here and require_all, none of the checks failed, so succeed.
    // If we got here and !require_all, all of the checks failed, so fail.
-    if (!require_all) {
-        throw api_error("ConditionalCheckFailedException", "None of ORed Expect conditions were successful.");
+    return require_all;
+}
+
+bool calculate_primitive_condition(const parsed::primitive_condition& cond,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    std::vector<rjson::value> calculated_values;
+    calculated_values.reserve(cond._values.size());
+    for (const parsed::value& v : cond._values) {
+        calculated_values.push_back(calculate_value(v,
+                cond._op == parsed::primitive_condition::type::VALUE ?
+                        calculate_value_caller::ConditionExpressionAlone :
+                        calculate_value_caller::ConditionExpression,
+                rjson::find(req, "ExpressionAttributeValues"),
+                used_attribute_names, used_attribute_values,
+                req, schema, previous_item));
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::BETWEEN:
+        if (calculated_values.size() != 3) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
+        }
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+    case parsed::primitive_condition::type::IN:
+        return check_IN(calculated_values);
+    case parsed::primitive_condition::type::VALUE:
+        if (calculated_values.size() != 1) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Unexpected values in primitive_condition", cond._values.size()));
+        }
+        // Unwrap the boolean wrapped as the value (if it is a boolean)
+        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
+            auto it = calculated_values[0].MemberBegin();
+            if (it->name == "BOOL" && it->value.IsBool()) {
+                return it->value.GetBool();
+            }
+        }
+        throw api_error("ValidationException",
+                format("ConditionExpression: condition results in a non-boolean value: {}",
+                        calculated_values[0]));
+    default:
+        // All the rest of the operators have exactly two parameters (and, unless
+        // we have a bug in the parser, that's what we have in the parsed object):
+        if (calculated_values.size() != 2) {
+            throw std::logic_error(format("Wrong number of values {} in primitive_condition object", cond._values.size()));
+        }
+    }
+    switch (cond._op) {
+    case parsed::primitive_condition::type::EQ:
+        return check_EQ(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::NE:
+        return check_NE(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::GT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+    case parsed::primitive_condition::type::GE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+    case parsed::primitive_condition::type::LT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+    case parsed::primitive_condition::type::LE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+    default:
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
    }
 }

+// Check if the existing values of the item (previous_item) match the
+// conditions given by the given parsed ConditionExpression.
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    if (condition_expression.empty()) {
+        return true;
+    }
+    bool ret = std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) -> bool {
+            return calculate_primitive_condition(cond, used_attribute_values,
+                    used_attribute_names, req, schema, previous_item);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) -> bool {
+            auto verify_condition = [&] (const parsed::condition_expression& e) {
+                return verify_condition_expression(e, used_attribute_values,
+                        used_attribute_names, req, schema, previous_item);
+            };
+            switch (list.op) {
+            case '&':
+                return boost::algorithm::all_of(list.conditions, verify_condition);
+            case '|':
+                return boost::algorithm::any_of(list.conditions, verify_condition);
+            default:
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("bad operator in condition_list");
+            }
+        }
+    }, condition_expression._expression);
+    return condition_expression._negated ? !ret : ret;
+}
+
 }
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -44,6 +44,6 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_

 ::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter);

-void verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -25,47 +25,58 @@
 #include <seastar/http/httpd.hh>
 #include "seastarx.hh"
 #include <seastar/json/json_elements.hh>
+#include <seastar/core/sharded.hh>

 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"

+#include "alternator/error.hh"
 #include "stats.hh"
+#include "rjson.hh"

 namespace alternator {

-class executor {
+class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
+    // An smp_service_group to be used for limiting the concurrency when
+    // forwarding Alternator requests between shards - if necessary for LWT.
+    smp_service_group _ssg;

 public:
    using client_state = service::client_state;
+    using request_return_type = std::variant<json::json_return_type, api_error>;
    stats _stats;
    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
-    static constexpr auto KEYSPACE_NAME = "alternator";
+    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm) : _proxy(proxy), _mm(mm) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _ssg(ssg) {}

-    future<json::json_return_type> create_table(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_table(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_table(client_state& client_state, std::string content);
-    future<json::json_return_type> put_item(client_state& client_state, std::string content);
-    future<json::json_return_type> get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> delete_item(client_state& client_state, std::string content);
-    future<json::json_return_type> update_item(client_state& client_state, std::string content);
-    future<json::json_return_type> list_tables(client_state& client_state, std::string content);
-    future<json::json_return_type> scan(client_state& client_state, std::string content);
-    future<json::json_return_type> describe_endpoints(client_state& client_state, std::string content, std::string host_header);
-    future<json::json_return_type> batch_write_item(client_state& client_state, std::string content);
-    future<json::json_return_type> batch_get_item(client_state& client_state, std::string content);
-    future<json::json_return_type> query(client_state& client_state, std::string content);
+    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tables(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header);
+    future<request_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);

    future<> start();
    future<> stop() { return make_ready_future<>(); }

-    future<> maybe_create_keyspace();
+    future<> create_keyspace(std::string_view keyspace_name);

-    static void maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
 };

 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -22,6 +22,7 @@
 #include "expressions.hh"
 #include "alternator/expressionsLexer.hpp"
 #include "alternator/expressionsParser.hpp"
+#include "utils/overloaded_functor.hh"

 #include <seastarx.hh>

@@ -65,13 +66,19 @@ parse_projection_expression(std::string query) {
    }
 }

-template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; };
-template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;
+parsed::condition_expression
+parse_condition_expression(std::string query) {
+    try { // Run the grammar's condition_expression rule over the query text
+        return do_with_parser(query,  std::mem_fn(&expressionsParser::condition_expression));
+    } catch (...) { // Re-throw any failure as a syntax error quoting the offending query
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
+}

 namespace parsed {

 void update_expression::add(update_expression::action a) {
-    std::visit(overloaded {
+    std::visit(overloaded_functor {
        [&] (action::set&)    { seen_set = true; },
        [&] (action::remove&) { seen_remove = true; },
        [&] (action::add&)    { seen_add = true; },
@@ -94,5 +101,27 @@ void update_expression::append(update_expression other) {
    seen_del |= other.seen_del;
 }

+void condition_expression::append(condition_expression&& a, char op) { // Adds 'a' to this condition list, joined by 'op'
+    std::visit(overloaded_functor {
+        [&] (condition_list& x) {
+            // If 'a' holds a single condition we could insert that condition
+            // directly (negated if a._negated) instead of inserting 'a' itself.
+            // But since these expressions are not evaluated many times, this
+            // optimization is not worth the extra code complexity.
+            if (!x.conditions.empty() && x.op != op) {
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("condition_expression::append called with mixed operators");
+            }
+            x.conditions.push_back(std::move(a));
+            x.op = op; // Only takes effect on the first append; later ones must match (checked above)
+        },
+        [&] (primitive_condition& x) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error("condition_expression::append called on primitive_condition");
+        }
+    }, _expression);
+}
+
+
 } // namespace parsed
 } // namespace alternator
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -145,6 +145,12 @@ REMOVE: R E M O V E;
 ADD: A D D;
 DELETE: D E L E T E;

+AND: A N D;
+OR: O R;
+NOT: N O T;
+BETWEEN: B E T W E E N;
+IN: I N;
+
 fragment ALPHA: 'A'..'Z' | 'a'..'z';
 fragment DIGIT: '0'..'9';
 fragment ALNUM: ALPHA | DIGIT | '_';
@@ -165,19 +171,19 @@ path returns [parsed::path p]:
      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
    )*;

-update_expression_set_value returns [parsed::value v]:
-      VALREF                             { $v.set_valref($VALREF.text); }
-    | path                               { $v.set_path($path.p); }
-    | NAME                               { $v.set_func_name($NAME.text); }
-     '(' x=update_expression_set_value   { $v.add_func_parameter($x.v); }
-     (',' x=update_expression_set_value  { $v.add_func_parameter($x.v); })*
+value returns [parsed::value v]:
+      VALREF       { $v.set_valref($VALREF.text); }
+    | path         { $v.set_path($path.p); }
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
     ')'
    ;

 update_expression_set_rhs returns [parsed::set_rhs rhs]:
-    v=update_expression_set_value  { $rhs.set_value(std::move($v.v)); }
-    (   '+' v=update_expression_set_value  { $rhs.set_plus(std::move($v.v)); }
-      | '-' v=update_expression_set_value  { $rhs.set_minus(std::move($v.v)); }
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
    )?
    ;

@@ -212,3 +218,48 @@ update_expression returns [parsed::update_expression e]:
 projection_expression returns [std::vector<parsed::path> v]:
    p=path      { $v.push_back(std::move($p.p)); }
    (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
+
+
+primitive_condition returns [parsed::primitive_condition c]:
+      v=value         { $c.add_value(std::move($v.v));
+                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
+      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
+          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
+          | '<'       { $c.set_operator(parsed::primitive_condition::type::LT); }
+          | '<' '='   { $c.set_operator(parsed::primitive_condition::type::LE); }
+          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
+          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
+         )
+         v=value      { $c.add_value(std::move($v.v)); }
+       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         AND
+         v=value      { $c.add_value(std::move($v.v)); }
+       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
+         ')'
+      )?
+    ;
+
+// The following rules for parsing boolean expressions are verbose and
+// somewhat strange because of Antlr 3's limitations on recursive rules,
+// common rule prefixes, and (lack of) support for operator precedence.
+// These rules could have been written more clearly using a more powerful
+// parser generator - such as Yacc.
+boolean_expression returns [parsed::condition_expression e]:
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
+	;
+boolean_expression_1 returns [parsed::condition_expression e]:
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
+	;
+boolean_expression_2 returns [parsed::condition_expression e]:
+	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
+    ;
+
+condition_expression returns [parsed::condition_expression e]:
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -36,6 +36,6 @@ public:

 parsed::update_expression parse_update_expression(std::string query);
 std::vector<parsed::path> parse_projection_expression(std::string query);
-
+parsed::condition_expression parse_condition_expression(std::string query);

 } /* namespace alternator */
--- a/alternator/expressions_eval.hh
+++ b/alternator/expressions_eval.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+
+#include "rjson.hh"
+#include "schema_fwd.hh"
+
+#include "expressions_types.hh"
+
+namespace alternator {
+
+// calculate_value() behaves slightly differently (in particular, it supports
+// different functions) when used in different types of expressions, as
+// enumerated in this enum:
+enum class calculate_value_caller {
+    UpdateExpression, ConditionExpression, ConditionExpressionAlone
+};
+
+inline std::ostream& operator<<(std::ostream& out, calculate_value_caller caller) { // Prints the expression type's name
+    switch (caller) {
+        case calculate_value_caller::UpdateExpression:
+            out << "UpdateExpression";
+            break;
+        case calculate_value_caller::ConditionExpression:
+            out << "ConditionExpression";
+            break;
+        case calculate_value_caller::ConditionExpressionAlone:
+            out << "ConditionExpression"; // Same text as the ConditionExpression case above
+            break;
+        default: // A value outside the enumerators listed above
+            out << "unknown type of expression";
+            break;
+    }
+    return out;
+}
+
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+
+rjson::value calculate_value(const parsed::value& v,
+        calculate_value_caller caller,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values,
+        const rjson::value& update_info,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+
+} /* namespace alternator */
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -88,6 +88,15 @@ struct value {
    void add_func_parameter(value v) {
        std::get<function_call>(_value)._parameters.emplace_back(std::move(v));
    }
+    bool is_valref() const {
+        return std::holds_alternative<std::string>(_value);
+    }
+    bool is_path() const {
+        return std::holds_alternative<path>(_value);
+    }
+    bool is_func() const {
+        return std::holds_alternative<function_call>(_value);
+    }
 };

 // The right-hand-side of a SET in an update expression can be either a
@@ -162,5 +171,58 @@ public:
    }
 };

+// A primitive_condition is a condition expression involving one condition,
+// while the full condition_expression below adds boolean logic over these
+// primitive conditions.
+// The supported primitive conditions are:
+// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
+//    v1 and v2 are values - from the item (an attribute path), the query
+//    (a ":val" reference), or a function of the above (only the size()
+//    function is supported).
+// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
+// 3. N-ary operator - v1 IN ( v2, v3, ... )
+// 4. A single function call (attribute_exists etc.). The parser actually
+//    accepts a more general "value" here but later stages reject a value
+//    which is not a function call (because DynamoDB does it too).
+class primitive_condition {
+public:
+    enum class type { // VALUE is what the parser sets when a lone value has no operator after it
+        UNDEFINED, VALUE, EQ, NE, LT, LE, GT, GE, BETWEEN, IN
+    };
+    type _op = type::UNDEFINED; // Stays UNDEFINED until set_operator() is called
+    std::vector<value> _values; // Operands, in the order add_value() was called
+    void set_operator(type op) {
+        _op = op;
+    }
+    void add_value(value&& v) {
+        _values.push_back(std::move(v));
+    }
+    bool empty() const { // True as long as no operator has been set
+        return _op == type::UNDEFINED;
+    }
+};
+
+class condition_expression {
+public:
+    bool _negated = false; // If true, the entire condition is negated
+    struct condition_list {
+        char op = '|'; // '&' or '|'
+        std::vector<condition_expression> conditions; // Sub-expressions, all joined by 'op'
+    };
+    std::variant<primitive_condition, condition_list> _expression = condition_list(); // Starts as an empty list
+
+    void set_primitive(primitive_condition&& p) { // Replaces whatever was held with a single primitive condition
+        _expression = std::move(p);
+    }
+    void append(condition_expression&& c, char op);
+    void apply_not() { // Toggles negation, so applying it twice cancels out
+        _negated = !_negated;
+    }
+    bool empty() const { // True only for a condition list with no entries; a primitive is never empty
+        return std::holds_alternative<condition_list>(_expression) &&
+               std::get<condition_list>(_expression).conditions.empty();
+    }
+};
+
 } // namespace parsed
 } // namespace alternator
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -22,14 +22,108 @@
 #include "rjson.hh"
 #include "error.hh"
 #include <seastar/core/print.hh>
+#include <seastar/core/thread.hh>

 namespace rjson {

 static allocator the_allocator;

+/*
+ * This wrapper class adds nested level checks to rapidjson's handlers.
+ * Each rapidjson handler implements functions for accepting JSON values,
+ * which includes strings, numbers, objects, arrays, etc.
+ * Parsing objects and arrays needs to be performed carefully with regard
+ * to stack overflow - each object/array layer adds another stack frame
+ * to parsing, printing and destroying the parent JSON document.
+ * To prevent stack overflow, a rapidjson handler can be wrapped with
+ * guarded_yieldable_json_handler, which accepts an additional max_nested_level parameter.
+ * If the max nested level is exceeded, a proper rjson::error will be thrown.
+ */
+template<typename Handler, bool EnableYield>
+struct guarded_yieldable_json_handler : public Handler {
+    size_t _nested_level = 0; // Number of objects/arrays currently open
+    size_t _max_nested_level; // Depth limit enforced by check_nested_level()
+public:
+    using handler_base = Handler;
+
+    explicit guarded_yieldable_json_handler(size_t max_nested_level) : _max_nested_level(max_nested_level) {}
+    guarded_yieldable_json_handler(string_buffer& buf, size_t max_nested_level)
+            : handler_base(buf), _max_nested_level(max_nested_level) {}
+
+    void Parse(const char* str, size_t length) { // Throws rjson::error on a parse error or on too-deep nesting
+        rapidjson::MemoryStream ms(static_cast<const char*>(str), length * sizeof(typename encoding::Ch));
+        rapidjson::EncodedInputStream<encoding, rapidjson::MemoryStream> is(ms);
+        rapidjson::GenericReader<encoding, encoding, allocator> reader(&the_allocator);
+        reader.Parse(is, *this); // The reader feeds its events to this wrapper, so the guards below run
+        if (reader.HasParseError()) {
+            throw rjson::error(format("Parsing JSON failed: {}", rapidjson::GetParseError_En(reader.GetParseErrorCode())));
+        }
+        //NOTICE: The handler has parsed the string, but in case of rapidjson::GenericDocument
+        // the data now resides in an internal stack_ variable, which is private instead of
+        // protected... which means we cannot simply access its data. Fortunately, another
+        // function for populating documents from SAX events can be abused to extract the data
+        // from the stack via gadget-oriented programming - we use an empty event generator
+        // which does nothing, and use it to call Populate(), which assumes that the generator
+        // will fill the stack with something. It won't, but our stack is already filled with
+        // data we want to steal, so once Populate() ends, our document will be properly parsed.
+        // A proper solution could be programmed once rapidjson declares this stack_ variable
+        // as protected instead of private, so that this class can access it.
+        auto dummy_generator = [](handler_base&){return true;};
+        handler_base::Populate(dummy_generator);
+    }
+
+    bool StartObject() {
+        ++_nested_level; // Count the new level first, then validate it
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartObject();
+    }
+
+    bool EndObject(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndObject(elements_count);
+    }
+
+    bool StartArray() {
+        ++_nested_level; // Arrays and objects share the same depth counter
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartArray();
+    }
+
+    bool EndArray(rapidjson::SizeType elements_count = 0) {
+        --_nested_level;
+        return handler_base::EndArray(elements_count);
+    }
+
+    bool Null()                 { maybe_yield(); return handler_base::Null(); }
+    bool Bool(bool b)           { maybe_yield(); return handler_base::Bool(b); }
+    bool Int(int i)             { maybe_yield(); return handler_base::Int(i); }
+    bool Uint(unsigned u)       { maybe_yield(); return handler_base::Uint(u); }
+    bool Int64(int64_t i64)     { maybe_yield(); return handler_base::Int64(i64); }
+    bool Uint64(uint64_t u64)   { maybe_yield(); return handler_base::Uint64(u64); }
+    bool Double(double d)       { maybe_yield(); return handler_base::Double(d); }
+    bool String(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::String(str, length, copy); }
+    bool Key(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::Key(str, length, copy); }
+
+
+protected:
+    static void maybe_yield() { // Does nothing when EnableYield is false
+        if constexpr (EnableYield) {
+            thread::maybe_yield();
+        }
+    }
+
+    void check_nested_level() const { // Throws rjson::error once the depth exceeds the limit
+        if (RAPIDJSON_UNLIKELY(_nested_level > _max_nested_level)) {
+            throw rjson::error(format("Max nested level reached: {}", _max_nested_level));
+        }
+    }
+};
+
 std::string print(const rjson::value& value) {
    string_buffer buffer;
-    writer writer(buffer);
+    guarded_yieldable_json_handler<writer, false> writer(buffer, 39);
    value.Accept(writer);
    return std::string(buffer.GetString());
 }
@@ -38,13 +132,9 @@ rjson::value copy(const rjson::value& value) {
    return rjson::value(value, the_allocator);
 }

-rjson::value parse(const std::string& str) {
-    return parse_raw(str.c_str(), str.size());
-}
-
-rjson::value parse_raw(const char* c_str, size_t size) {
-    rjson::document d;
-    d.Parse(c_str, size);
+rjson::value parse(std::string_view str) {
+    guarded_yieldable_json_handler<document, false> d(39);
+    d.Parse(str.data(), str.size());
    if (d.HasParseError()) {
        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
    }
@@ -52,8 +142,22 @@ rjson::value parse_raw(const char* c_str, size_t size) {
    return std::move(v);
 }

-rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value parse_yieldable(std::string_view str) { // Like parse(), but with EnableYield set to true
+    guarded_yieldable_json_handler<document, true> d(39); // 39 is the maximum nesting depth accepted
+    d.Parse(str.data(), str.size());
+    if (d.HasParseError()) { // NOTE(review): the wrapper's Parse() already throws on reader errors - this looks redundant, confirm
+        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
+    }
+    rjson::value& v = d;
+    return std::move(v); // 'v' is a reference, so the explicit move is what moves the value out of 'd'
+}
+
+rjson::value& get(rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -61,8 +165,8 @@ rjson::value& get(rjson::value& value, rjson::string_ref_type name) {
    }
 }

-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name) {
-    auto member_it = value.FindMember(name);
+const rjson::value& get(const rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    if (member_it != value.MemberEnd())
        return member_it->value;
    else {
@@ -82,24 +186,48 @@ rjson::value from_string(const char* str, size_t size) {
    return rjson::value(str, size, the_allocator);
 }

-const rjson::value* find(const rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value from_string(std::string_view view) {
+    return rjson::value(view.data(), view.size(), the_allocator);
+}
+
+const rjson::value* find(const rjson::value& value, std::string_view name) {
+    // Although FindMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

-rjson::value* find(rjson::value& value, string_ref_type name) {
-    auto member_it = value.FindMember(name);
+rjson::value* find(rjson::value& value, std::string_view name) {
+    auto member_it = value.FindMember(rjson::value(name.data(), name.size()));
    return member_it != value.MemberEnd() ? &member_it->value : nullptr;
 }

+bool remove_member(rjson::value& value, std::string_view name) {
+    // Although RemoveMember() has a variant taking a StringRef, it ignores
+    // the given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    return value.RemoveMember(rjson::value(name.data(), name.size()));
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), std::move(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), std::move(member), the_allocator);
+}
+
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member) {
    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), rjson::value(member), the_allocator);
 }

+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), rjson::value(member), the_allocator);
+}
+
 void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member) {
    base.AddMember(name, std::move(member), the_allocator);
 }
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -104,38 +104,49 @@ inline rjson::value empty_string() {
 // The representation is dense - without any redundant indentation.
 std::string print(const rjson::value& value);

+// Returns a string_view to the string held in a JSON value (which is
+// assumed to hold a string, i.e., v.IsString() == true). This is a view
+// to the existing data - no copying is done.
+inline std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength());
+}
+
 // Copies given JSON value - involves allocation
 rjson::value copy(const rjson::value& value);

 // Parses a JSON value from given string or raw character array.
 // The string/char array liveness does not need to be persisted,
-// as both parse() and parse_raw() will allocate member names and values.
+// as parse() will allocate member names and values.
 // Throws rjson::error if parsing failed.
-rjson::value parse(const std::string& str);
-rjson::value parse_raw(const char* c_str, size_t size);
+rjson::value parse(std::string_view str);
+// Needs to be run in thread context
+rjson::value parse_yieldable(std::string_view str);

 // Creates a JSON value (of JSON string type) out of internal string representations.
 // The string value is copied, so str's liveness does not need to be persisted.
 rjson::value from_string(const std::string& str);
 rjson::value from_string(const sstring& str);
 rjson::value from_string(const char* str, size_t size);
+rjson::value from_string(std::string_view view);

 // Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, rjson::string_ref_type name);
-const rjson::value* find(const rjson::value& value, rjson::string_ref_type name);
+rjson::value* find(rjson::value& value, std::string_view name);
+const rjson::value* find(const rjson::value& value, std::string_view name);

 // Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, rjson::string_ref_type name);
-const rjson::value& get(const rjson::value& value, rjson::string_ref_type name);
+rjson::value& get(rjson::value& value, std::string_view name);
+const rjson::value& get(const rjson::value& value, std::string_view name);

 // Sets a member in given JSON object by moving the member - allocates the name.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);

 // Sets a string member in given JSON object by assigning its reference - allocates the name.
 // NOTICE: member string liveness must be ensured to be at least as long as base's.
 // Throws if base is not a JSON object.
 void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);

 // Sets a member in given JSON object by moving the member.
 // NOTICE: name liveness must be ensured to be at least as long as base's.
@@ -152,6 +163,9 @@ void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type
 // Throws if base_array is not a JSON array.
 void push_back(rjson::value& base_array, rjson::value&& item);

+// Remove a member from a JSON object. Throws if value isn't an object.
+bool remove_member(rjson::value& value, std::string_view name);
+
 struct single_value_comp {
    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
 };
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastarx.hh>
+#include <service/storage_proxy.hh>
+#include <service/storage_proxy.hh>
+#include "rjson.hh"
+#include "executor.hh"
+
+namespace alternator {
+
+// An rmw_operation encapsulates the common logic of all the item update
+// operations which may involve a read of the item before the write
+// (so-called Read-Modify-Write operations). These operations include PutItem,
+// UpdateItem and DeleteItem: All of these may be conditional operations (the
+// "Expected" parameter) which require a read before the write, and UpdateItem
+// may also have an update expression which refers to the item's old value.
+//
+// The code below supports running the read and the write together as one
+// transaction using LWT (this is why rmw_operation is a subclass of
+// cas_request, as required by storage_proxy::cas()), but also has optional
+// modes not using LWT.
+class rmw_operation : public service::cas_request, public enable_shared_from_this<rmw_operation> {
+public:
+    // The following options choose which mechanism to use for isolating
+    // parallel write operations:
+    // * The FORBID_RMW option forbids RMW (read-modify-write) operations
+    //   such as conditional updates. For the remaining write-only
+    //   operations, ordinary quorum writes are isolated enough.
+    // * The LWT_ALWAYS option always uses LWT (lightweight transactions)
+    //   for any write operation - whether or not it also has a read.
+    // * The LWT_RMW_ONLY option uses LWT only for RMW operations, and uses
+    //   ordinary quorum writes for write-only operations.
+    //   This option is not safe if the user may send both RMW and write-only
+    //   operations on the same item.
+    // * The UNSAFE_RMW option does read-modify-write operations as separate
+    //   read and write. It is unsafe - concurrent RMW operations are not
+    //   isolated at all. This option will likely be removed in the future.
+    enum class write_isolation {
+        FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
+    };
+    static constexpr auto WRITE_ISOLATION_TAG_KEY = "system:write_isolation";
+
+    static write_isolation get_write_isolation_for_schema(schema_ptr schema);
+
+protected:
+    // The full request JSON
+    rjson::value _request;
+    // All RMW operations involve a single item with a specific partition
+    // and optional clustering key, in a single table, so the following
+    // information is common to all of them:
+    schema_ptr _schema;
+    partition_key _pk = partition_key::make_empty();
+    clustering_key _ck = clustering_key::make_empty();
+    write_isolation _write_isolation;
+
+    // All RMW operations can have a ReturnValues parameter from the following
+    // choices. But note that only UpdateItem actually supports all of them:
+    enum class returnvalues {
+        NONE, ALL_OLD, UPDATED_OLD, ALL_NEW, UPDATED_NEW
+    } _returnvalues;
+    static returnvalues parse_returnvalues(const rjson::value& request);
+    // When _returnvalues != NONE, apply() should store here, in JSON form,
+    // the values which are to be returned in the "Attributes" field.
+    // The default null JSON means do not return an Attributes field at all.
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
+public:
+    // The constructor of a rmw_operation subclass should parse the request
+    // and try to discover as many input errors as it can before really
+    // attempting the read or write operations.
+    rmw_operation(service::storage_proxy& proxy, rjson::value&& request);
+    // rmw_operation subclasses (update_item_operation, put_item_operation
+    // and delete_item_operation) shall implement an apply() function which
+    // takes the previous value of the item (if it was read) and creates the
+    // write mutation. If the previous value of item does not pass the needed
+    // conditional expression, apply() should return an empty optional.
+    // apply() may throw if it encounters input errors not discovered during
+    // the constructor.
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
+    // Convert the above apply() into the signature needed by cas_request:
+    virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
+    virtual ~rmw_operation() = default;
+    schema_ptr schema() const { return _schema; }
+    const rjson::value& request() const { return _request; }
+    rjson::value&& move_request() && { return std::move(_request); }
+    future<executor::request_return_type> execute(service::storage_proxy& proxy,
+            service::client_state& client_state,
+            tracing::trace_state_ptr trace_state,
+            service_permit permit,
+            bool needs_read_before_write,
+            stats& stats);
+    std::optional<shard_id> shard_for_execute(bool needs_read_before_write);
+};
+
+} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -136,7 +136,7 @@ rjson::value deserialize_item(bytes_view bv) {

    if (atype == alternator_type::NOT_SUPPORTED_YET) {
        slogger.trace("Non-optimal deserialization of alternator type {}", int8_t(atype));
-        return rjson::parse_raw(reinterpret_cast<const char *>(bv.data()), bv.size());
+        return rjson::parse(std::string_view(reinterpret_cast<const char *>(bv.data()), bv.size()));
    }
    type_representation type_representation = represent_type(atype);
    visit(*type_representation.dtype, to_json_visitor{deserialized, type_representation.ident, bv});
@@ -160,27 +160,34 @@ std::string type_to_string(data_type type) {

 bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
    std::string column_name = column.name_as_text();
-    std::string expected_type = type_to_string(column.type);
-
-    const rjson::value& key_typed_value = rjson::get(item, rjson::value::StringRefType(column_name.c_str()));
-    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                format("Missing or invalid value object for key column {}: {}", column_name, item));
+    const rjson::value* key_typed_value = rjson::find(item, column_name);
+    if (!key_typed_value) {
+        throw api_error("ValidationException", format("Key column {} not found", column_name));
    }
-    return get_key_from_typed_value(key_typed_value, column, expected_type);
+    return get_key_from_typed_value(*key_typed_value, column);
 }

-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type) {
+// Parses the JSON encoding for a key value, which is a map with a single
+// entry, whose key is the type (expected to match the key column's type)
+// and the value is the encoded value.
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
+    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
+            !key_typed_value.MemberBegin()->value.IsString()) {
+        throw api_error("ValidationException",
+                format("Malformed value object for key column {}: {}",
+                        column.name_as_text(), key_typed_value));
+    }
+
    auto it = key_typed_value.MemberBegin();
-    if (it->name.GetString() != expected_type) {
+    if (it->name != type_to_string(column.type)) {
        throw api_error("ValidationException",
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        expected_type, column.name_as_text(), it->name.GetString()));
+                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
    }
    if (column.type == bytes_type) {
        return base64_decode(it->value);
    } else {
-        return column.type->from_string(it->value.GetString());
+        return column.type->from_string(rjson::to_string_view(it->value));
    }

 }
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -24,7 +24,7 @@
 #include <string>
 #include <string_view>
 #include "types.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "keys.hh"
 #include "rjson.hh"
 #include "utils/big_decimal.hh"
@@ -54,7 +54,7 @@ rjson::value deserialize_item(bytes_view bv);
 std::string type_to_string(data_type type);

 bytes get_key_column_value(const rjson::value& item, const column_definition& column);
-bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column, const std::string& expected_type);
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column);
 rjson::value json_key_column_value(bytes_view cell, const column_definition& column);

 partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -29,6 +29,8 @@
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
+#include "service/storage_service.hh"
+#include "utils/overloaded_functor.hh"

 static logging::logger slogger("alternator-server");

@@ -65,9 +67,9 @@ inline std::vector<std::string_view> split(std::string_view text, char separator
 // Internal Server Error.
 class api_handler : public handler_base {
 public:
-    api_handler(const future_json_function& _handle) : _f_handle(
-         [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
-         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([rep = std::move(rep)](future<json::json_return_type> resf) mutable {
+    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
+         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+         return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
             if (resf.failed()) {
                 // Exceptions of type api_error are wrapped as JSON and
                 // returned to the client as expected. Other types of
@@ -86,20 +88,24 @@ public:
                             format("Internal server error: {}", std::current_exception()),
                             reply::status_type::internal_server_error);
                 }
-                 // FIXME: what is this version number?
-                 rep->_content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + ret._type + "\"," +
-                         "\"message\":\"" + ret._msg + "\"}";
-                 rep->_status = ret._http_code;
-                 slogger.trace("api_handler error case: {}", rep->_content);
+                 generate_error_reply(*rep, ret);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
-             slogger.trace("api_handler success case");
             auto res = resf.get0();
-             if (res._body_writer) {
-                 rep->write_body("json", std::move(res._body_writer));
-             } else {
-                 rep->_content += res._res;
-             }
+             std::visit(overloaded_functor {
+                 [&] (const json::json_return_type& json_return_value) {
+                     slogger.trace("api_handler success case");
+                     if (json_return_value._body_writer) {
+                         rep->write_body("json", std::move(json_return_value._body_writer));
+                     } else {
+                         rep->_content += json_return_value._res;
+                     }
+                 },
+                 [&] (const api_error& err) {
+                     generate_error_reply(*rep, err);
+                 }
+             }, res);
+
             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
    }), _type("json") { }
@@ -115,18 +121,66 @@ public:
    }

 protected:
+    void generate_error_reply(reply& rep, const api_error& err) {
+        rep._content += "{\"__type\":\"com.amazonaws.dynamodb.v20120810#" + err._type + "\"," +
+                "\"message\":\"" + err._msg + "\"}";
+        rep._status = err._http_code;
+        slogger.trace("api_handler error case: {}", rep._content);
+    }
+
    future_handler_function _f_handle;
    sstring _type;
 };

-class health_handler : public handler_base {
-    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+class gated_handler : public handler_base {
+    seastar::gate& _gate;
+public:
+    gated_handler(seastar::gate& gate) : _gate(gate) {}
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0;
+    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) final override {
+        return with_gate(_gate, [this, &path, req = std::move(req), rep = std::move(rep)] () mutable {
+            return do_handle(path, std::move(req), std::move(rep));
+        });
+    }
+};
+
+class health_handler : public gated_handler {
+public:
+    health_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        rep->set_status(reply::status_type::ok);
        rep->write_body("txt", format("healthy: {}", req->get_header("Host")));
        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
    }
 };

+class local_nodelist_handler : public gated_handler {
+public:
+    local_nodelist_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        rjson::value results = rjson::empty_array();
+        // It's very easy to get a list of all live nodes on the cluster,
+        // using gms::get_local_gossiper().get_live_members(). But getting
+        // just the list of live nodes in this DC needs more elaborate code:
+        sstring local_dc = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(
+                utils::fb_utilities::get_broadcast_address());
+        std::unordered_set<gms::inet_address> local_dc_nodes =
+                service::get_local_storage_service().get_token_metadata().
+                get_topology().get_datacenter_endpoints().at(local_dc);
+        for (auto& ip : local_dc_nodes) {
+            if (gms::get_local_gossiper().is_alive(ip)) {
+                rjson::push_back(results, rjson::from_string(ip.to_sstring()));
+            }
+        }
+        rep->set_status(reply::status_type::ok);
+        rep->set_content_type("json");
+        rep->_content = rjson::print(results);
+        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+    }
+};
+
 future<> server::verify_signature(const request& req) {
    if (!_enforce_authorization) {
        slogger.debug("Skipping authorization");
@@ -137,7 +191,7 @@ future<> server::verify_signature(const request& req) {
        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
-    if (host_it == req._headers.end()) {
+    if (authorization_it == req._headers.end()) {
        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
@@ -214,7 +268,8 @@ future<> server::verify_signature(const request& req) {
    });
 }

-future<json::json_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request>&& req) {
+    _executor._stats.total_operations++;
    sstring target = req->get_header(TARGET);
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
@@ -223,17 +278,32 @@ future<json::json_return_type> server::handle_api_request(std::unique_ptr<reques
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
-            _executor.local()._stats.unsupported_operations++;
+            _executor._stats.unsupported_operations++;
            throw api_error("UnknownOperationException",
                    format("Unsupported operation {}", op));
        }
-        //FIXME: Client state can provide more context, e.g. client's endpoint address
-        // We use unique_ptr because client_state cannot be moved or copied
-        return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()), [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
-            client_state->set_raw_keyspace(executor::KEYSPACE_NAME);
-            executor::maybe_trace_query(*client_state, op, req->content);
-            tracing::trace(client_state->get_trace_state(), op);
-            return callback_it->second(_executor.local(), *client_state, std::move(req));
+        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
+            //FIXME: Client state can provide more context, e.g. client's endpoint address
+            // We use unique_ptr because client_state cannot be moved or copied
+            return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()),
+                    [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
+                tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+                tracing::trace(trace_state, op);
+                // JSON parsing can allocate up to roughly 2x the size of the raw document, + a couple of bytes for maintenance.
+                // FIXME: by this time, the whole HTTP request was already read, so some memory is already occupied.
+                // Once HTTP allows working on streams, we should grab the permit *before* reading the HTTP payload.
+                size_t mem_estimate = req->content.size() * 3 + 8000;
+                auto units_fut = get_units(*_memory_limiter, mem_estimate);
+                if (_memory_limiter->waiters()) {
+                    ++_executor._stats.requests_blocked_memory;
+                }
+                return units_fut.then([this, callback_it = std::move(callback_it), &client_state, trace_state, req = std::move(req)] (semaphore_units<> units) mutable {
+                    return _json_parser.parse(req->content).then([this, callback_it = std::move(callback_it), &client_state, trace_state,
+                            units = std::move(units), req = std::move(req)] (rjson::value json_request) mutable {
+                        return callback_it->second(_executor, *client_state, trace_state, make_service_permit(std::move(units)), std::move(json_request), std::move(req)).finally([trace_state] {});
+                    });
+                });
+            });
        });
    });
 }
@@ -243,35 +313,88 @@ void server::set_routes(routes& r) {
        return handle_api_request(std::move(req));
    });

-    r.add(operation_type::POST, url("/"), req_handler);
-    r.add(operation_type::GET, url("/"), new health_handler);
+    r.put(operation_type::POST, "/", req_handler);
+    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
+    // The "/localnodes" request is a new Alternator feature, not supported by
+    // DynamoDB and not required for DynamoDB compatibility. It allows a
+    // client to fetch - using a trivial HTTP request without requiring
+    // authentication - the list of all live nodes in the same data center of
+    // the Alternator cluster. The client can use this list to balance its
+    // request load to all the nodes in the same geographical region.
+    // Note that this API exposes - openly without authentication - the
+    // information on the cluster's members inside one data center. We do not
+    // consider this to be a security risk, because an attacker can already
+    // scan an entire subnet for nodes responding to the health request,
+    // or even just scan for open ports.
+    r.put(operation_type::GET, "/localnodes", new local_nodelist_handler(_pending_requests));
 }

 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(seastar::sharded<executor>& e)
-        : _executor(e), _key_cache(1024, 1min, slogger), _enforce_authorization(false)
+server::server(executor& exec)
+        : _http_server("http-alternator")
+        , _https_server("https-alternator")
+        , _executor(exec)
+        , _key_cache(1024, 1min, slogger)
+        , _enforce_authorization(false)
+        , _enabled_servers{}
+        , _pending_requests{}
      , _callbacks{
-        {"CreateTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) {
-            return e.maybe_create_keyspace().then([&e, &client_state, req = std::move(req)] { return e.create_table(client_state, req->content); }); }
-        },
-        {"DescribeTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_table(client_state, req->content); }},
-        {"DeleteTable", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_table(client_state, req->content); }},
-        {"PutItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.put_item(client_state, req->content); }},
-        {"UpdateItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.update_item(client_state, req->content); }},
-        {"GetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.get_item(client_state, req->content); }},
-        {"DeleteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.delete_item(client_state, req->content); }},
-        {"ListTables", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.list_tables(client_state, req->content); }},
-        {"Scan", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.scan(client_state, req->content); }},
-        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.describe_endpoints(client_state, req->content, req->get_header("Host")); }},
-        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_write_item(client_state, req->content); }},
-        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.batch_get_item(client_state, req->content); }},
-        {"Query", [] (executor& e, executor::client_state& client_state, std::unique_ptr<request> req) { return e.query(client_state, req->content); }},
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tables(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.scan(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_endpoints(client_state, std::move(permit), std::move(json_request), req->get_header("Host"));
+        }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_write_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.query(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"TagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.tag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"UntagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.untag_resource(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
+        }},
    } {
 }

-future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization) {
+future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+        bool enforce_authorization, semaphore* memory_limiter) {
+    _memory_limiter = memory_limiter;
    _enforce_authorization = enforce_authorization;
    if (!port && !https_port) {
        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
@@ -279,24 +402,21 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    }
    return seastar::async([this, addr, port, https_port, creds] {
        try {
-            _executor.invoke_on_all([] (executor& e) {
-                return e.start();
-            }).get();
+            _executor.start().get();

            if (port) {
-                _control.start().get();
-                _control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _control.listen(socket_address{addr, *port}).get();
+                set_routes(_http_server._routes);
+                _http_server.set_content_length_limit(server::content_length_limit);
+                _http_server.listen(socket_address{addr, *port}).get();
+                _enabled_servers.push_back(std::ref(_http_server));
                slogger.info("Alternator HTTP server listening on {} port {}", addr, *port);
            }
            if (https_port) {
-                _https_control.start().get();
-                _https_control.set_routes(std::bind(&server::set_routes, this, std::placeholders::_1)).get();
-                _https_control.server().invoke_on_all([creds] (http_server& serv) {
-                    return serv.set_tls_credentials(creds->build_server_credentials());
-                }).get();
-
-                _https_control.listen(socket_address{addr, *https_port}).get();
+                set_routes(_https_server._routes);
+                _https_server.set_content_length_limit(server::content_length_limit);
+                _https_server.set_tls_credentials(creds->build_server_credentials());
+                _https_server.listen(socket_address{addr, *https_port}).get();
+                _enabled_servers.push_back(std::ref(_https_server));
                slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
            }
        } catch (...) {
@@ -309,5 +429,55 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
    });
 }

+future<> server::stop() {
+    return parallel_for_each(_enabled_servers, [] (http_server& server) {
+        return server.stop();
+    }).then([this] {
+        return _pending_requests.close();
+    }).then([this] {
+        return _json_parser.stop();
+    });
+}
+
+server::json_parser::json_parser() : _run_parse_json_thread(async([this] {
+        while (true) {
+            _document_waiting.wait().get();
+            if (_as.abort_requested()) {
+                return;
+            }
+            try {
+                _parsed_document = rjson::parse_yieldable(_raw_document);
+                _current_exception = nullptr;
+            } catch (...) {
+                _current_exception = std::current_exception();
+            }
+            _document_parsed.signal();
+        }
+    })) {
+}
+
+future<rjson::value> server::json_parser::parse(std::string_view content) {
+    if (content.size() < yieldable_parsing_threshold) {
+        return make_ready_future<rjson::value>(rjson::parse(content));
+    }
+    return with_semaphore(_parsing_sem, 1, [this, content] {
+        _raw_document = content;
+        _document_waiting.signal();
+        return _document_parsed.wait().then([this] {
+            if (_current_exception) {
+                return make_exception_future<rjson::value>(_current_exception);
+            }
+            return make_ready_future<rjson::value>(std::move(_parsed_document));
+        });
+    });
+}
+
+future<> server::json_parser::stop() {
+    _as.request_abort();
+    _document_waiting.signal();
+    _document_parsed.broken();
+    return std::move(_run_parse_json_thread);
+}
+
 }

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -27,27 +27,56 @@
 #include <seastar/net/tls.hh>
 #include <optional>
 #include <alternator/auth.hh>
+#include <utils/small_vector.hh>
+#include <seastar/core/units.hh>

 namespace alternator {

 class server {
-    using alternator_callback = std::function<future<json::json_return_type>(executor&, executor::client_state&, std::unique_ptr<request>)>;
+    static constexpr size_t content_length_limit = 16*MB;
+    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
+            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>;

-    seastar::httpd::http_server_control _control;
-    seastar::httpd::http_server_control _https_control;
-    seastar::sharded<executor>& _executor;
+    http_server _http_server;
+    http_server _https_server;
+    executor& _executor;
+
    key_cache _key_cache;
    bool _enforce_authorization;
+    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers;
+    gate _pending_requests;
    alternator_callbacks_map _callbacks;
-public:
-    server(seastar::sharded<executor>& executor);

-    seastar::future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds, bool enforce_authorization);
+    semaphore* _memory_limiter;
+
+    class json_parser {
+        static constexpr size_t yieldable_parsing_threshold = 16*KB;
+        std::string_view _raw_document;
+        rjson::value _parsed_document;
+        std::exception_ptr _current_exception;
+        semaphore _parsing_sem{1};
+        condition_variable _document_waiting;
+        condition_variable _document_parsed;
+        abort_source _as;
+        future<> _run_parse_json_thread;
+    public:
+        json_parser();
+        future<rjson::value> parse(std::string_view content);
+        future<> stop();
+    };
+    json_parser _json_parser;
+
+public:
+    server(executor& executor);
+
+    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+            bool enforce_authorization, semaphore* memory_limiter);
+    future<> stop();
 private:
    void set_routes(seastar::httpd::routes& r);
    future<> verify_signature(const seastar::httpd::request& r);
-    future<json::json_return_type> handle_api_request(std::unique_ptr<request>&& req);
+    future<executor::request_return_type> handle_api_request(std::unique_ptr<request>&& req);
 };

 }
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -85,6 +85,12 @@ stats::stats() : api_operations{} {
                    seastar::metrics::description("number of total operations via Alternator API")),
            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
                    seastar::metrics::description("number of performed read-before-write operations")),
+            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
+                    seastar::metrics::description("number of writes that used LWT")),
+            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
+                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements")),
+            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
+                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure.")),
            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
                    seastar::metrics::description("number of rows read during filtering operations")),
            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -84,6 +84,9 @@ public:
    uint64_t total_operations = 0;
    uint64_t unsupported_operations = 0;
    uint64_t reads_before_write = 0;
+    uint64_t write_using_lwt = 0;
+    uint64_t shard_bounce_for_lwt = 0;
+    uint64_t requests_blocked_memory = 0;
    // CQL-derived stats
    cql3::cql_stats cql_stats;
 private:
--- a/alternator/tags_extension.hh
+++ b/alternator/tags_extension.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "schema.hh"
+#include "db/extensions.hh"
+
+namespace alternator {
+
+class tags_extension : public schema_extension { // schema extension holding a string-to-string tag map
+public:
+    static constexpr auto NAME = "scylla_tags";
+
+    tags_extension() = default;
+    explicit tags_extension(const std::map<sstring, sstring>& tags) : _tags(tags) {} // plain copy: std::move on a const reference would copy anyway
+    explicit tags_extension(bytes b) : _tags(tags_extension::deserialize(b)) {} // rebuilds the map from its serialized form
+    explicit tags_extension(const sstring& s) { // always throws: construction from a string is not supported
+        throw std::logic_error("Cannot create tags from string");
+    }
+    bytes serialize() const override { // counterpart of deserialize()
+        return ser::serialize_to_buffer<bytes>(_tags);
+    }
+    static std::map<sstring, sstring> deserialize(bytes_view buffer) {
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const std::map<sstring, sstring>& tags() const {
+        return _tags;
+    }
+private:
+    std::map<sstring, sstring> _tags;
+};
+
+}
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -70,7 +70,7 @@
            {
               "method":"POST",
               "summary":"Force a major compaction of this column family",
-               "type":"string",
+               "type":"void",
               "nickname":"force_major_compaction",
               "produces":[
                  "application/json"
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -0,0 +1,90 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/error_injection",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/v2/error_injection/injection/{injection}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Activate an injection that triggers an error in code",
+               "type":"void",
+               "nickname":"enable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name, should correspond to an injection added in code",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"one_shot",
+                     "description":"boolean flag indicating whether the injection should be enabled to trigger only once",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate an injection previously activated by the API",
+               "type":"void",
+               "nickname":"disable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/v2/error_injection/injection",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"List all enabled injections on all shards, i.e. injections that will trigger an error in the code",
+               "type":"array",
+               "items":{
+                  "type":"string"
+               },
+               "nickname":"get_enabled_injections_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate all injections previously activated on all shards by the API",
+               "type":"void",
+               "nickname":"disable_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -641,6 +641,21 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/cas_write/failed_read_round_optimization",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get cas write metrics",
+          "type": "long",
+          "nickname": "get_cas_write_metrics_failed_read_round_optimization",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/cas_read/unfinished_commit",
      "operations": [
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -582,7 +582,15 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Comma-separated keyspace names whose snapshots will be deleted",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"an optional table name whose snapshot will be deleted",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
--- a/api/api.cc
+++ b/api/api.cc
@@ -36,6 +36,7 @@
 #include "endpoint_snitch.hh"
 #include "compaction_manager.hh"
 #include "hinted_handoff.hh"
+#include "error_injection.hh"
 #include <seastar/http/exception.hh>
 #include "stream_manager.hh"
 #include "system.hh"
@@ -68,13 +69,19 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        set_config(rb02, ctx, r);
        rb->register_function(r, "system",
                "The system related API");
        set_system(ctx, r);
    });
 }

+future<> set_server_config(http_context& ctx) { // runs set_config on the HTTP server's routes
+    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2"); // registry builder rooted at "/v2"
+    return ctx.http_server.set_routes([&ctx, rb02](routes& r) {
+        set_config(rb02, ctx, r); // this patch moves the call here from set_server_init
+    });
+}
+
 static future<> register_api(http_context& ctx, const sstring& api_name,
        const sstring api_desc,
        std::function<void(http_context& ctx, routes& r)> f) {
@@ -90,6 +97,10 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

+future<> set_server_snapshot(http_context& ctx) { // runs set_snapshot on the HTTP server's routes
+    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); }); // ctx is captured by reference
+}
+
 future<> set_server_snitch(http_context& ctx) {
    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
 }
@@ -153,6 +164,9 @@ future<> set_server_done(http_context& ctx) {
        rb->register_function(r, "collectd",
                "The collectd API");
        set_collectd(ctx, r);
+        rb->register_function(r, "error_injection",
+                "The error injection API");
+        set_error_injection(ctx, r);
    });
 }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -24,6 +24,7 @@
 #include <seastar/http/httpd.hh>

 namespace service { class load_meter; }
+namespace locator { class token_metadata; }

 namespace api {

@@ -34,16 +35,20 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
+    sharded<locator::token_metadata>& token_metadata;
+
    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm)
-            : db(_db), sp(_sp), lmeter(_lm) {
+            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
+            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
    }
 };

 future<> set_server_init(http_context& ctx);
+future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx);
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -64,7 +64,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

-        auto id = make_shared<scollectd::type_instance_id>(req->param["pluginid"],
+        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -994,5 +994,15 @@ void set_column_family(http_context& ctx, routes& r) {
        });
    });

+    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        if (req->get_query_param("split_output") != "") {
+            fail(unimplemented::cause::API);
+        }
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            return cf.compact_all_sstables();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
 }
 }
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "api/api-doc/error_injection.json.hh"
+#include "api/api.hh"
+
+#include <seastar/http/exception.hh>
+#include "log.hh"
+#include "utils/error_injection.hh"
+#include "seastar/core/future-util.hh"
+
+namespace api {
+
+namespace hf = httpd::error_injection_json;
+
+void set_error_injection(http_context& ctx, routes& r) { // binds the four error_injection handlers to r; ctx is not used here
+
+    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+        bool one_shot = req->get_query_param("one_shot") == "True"; // exact, case-sensitive match; any other value yields false
+        auto& errinj = utils::get_local_injector();
+        errinj.enable_on_all(injection, one_shot);
+        return make_ready_future<json::json_return_type>(json::json_void()); // NOTE(review): any result of enable_on_all is discarded; confirm it completes synchronously
+    });
+
+    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        auto ret = errinj.enabled_injections_on_all();
+        return make_ready_future<json::json_return_type>(ret); // reply with whatever enabled_injections_on_all returned
+    });
+
+    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+
+        auto& errinj = utils::get_local_injector();
+        errinj.disable_on_all(injection); // disables only the named injection
+        return make_ready_future<json::json_return_type>(json::json_void());
+    });
+
+    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        errinj.disable_on_all(); // no name given: disables every injection
+        return make_ready_future<json::json_return_type>(json::json_void());
+    });
+
+}
+
+} // namespace api
--- a/api/error_injection.hh
+++ b/api/error_injection.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "api.hh"
+
+namespace api {
+
+void set_error_injection(http_context& ctx, routes& r);
+
+}
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -27,6 +27,7 @@
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "database.hh"
+#include "seastar/core/scheduling_specific.hh"

 namespace api {

@@ -34,12 +35,70 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

-static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
-    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(),
-            std::plus<utils::rate_moving_average>());
+
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param mapper -  the internal mapper that is used to map the internal
+ * stat class into a value of type `V`.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename InnerMapper>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        InnerMapper mapper, Reducer reducer, V initial_value) {
+    return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) { // outer level: invoked per storage_proxy instance
+        return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>( // inner level: per-scheduling-group stats objects
+                mapper, reducer, initial_value, sp.get_stats_key());
+    }, initial_value, reducer); // the same reducer and initial value are reused for the outer aggregation
+}

-static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is a distributed storage_proxy class and the
+ * second level is the stats per scheduling group class.
+ * @param d -  a reference to the storage_proxy distributed class.
+ * @param f - a field pointer which is the implicit internal mapper.
+ * @param reducer - the reducer that is used in both outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename F>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        V F::*f, Reducer reducer, V initial_value) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) { // delegates to the mapper overload with a lambda that reads field f
+        return stats.*f;
+    }, reducer, initial_value);
+}
+
+/**
+ * A variant of sum_stats for the storage proxy case, where
+ * the get stats function doesn't return a stats object with
+ * fields but a per-scheduling-group stats object. It has a
+ * different name because partial specialization of function
+ * templates is not supported in C++.
+ *
+ */
+template<typename V, typename F>
+future<json::json_return_type>  sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) { // adds up field f, starting from V(0)
+        return make_ready_future<json::json_return_type>(val); // wrap the total as the JSON reply
+    });
+}
+
+
+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate(); // map each stats object to the rate() of the selected member
+    }, std::plus<utils::rate_moving_average>(), utils::rate_moving_average()); // add the rates, starting from a default-constructed value
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        httpd::utils_json::rate_moving_average m;
        m = val;
@@ -51,29 +110,72 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
    return timer_to_json(utils::rate_moving_average_and_histogram());
 }

-static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        return make_ready_future<json::json_return_type>(val.count);
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, utils::estimated_histogram(),
-            utils::estimated_histogram_merge).then([](const utils::estimated_histogram& val) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
+            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
-            std::plus<double>()).then([](double val) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
+            return (stats.*f).hist.mean * (stats.*f).hist.count;
+        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

+/**
+ * A variant of sum_histogram_stats for the storage
+ * proxy case, where the get stats function doesn't
+ * return a stats object with fields but a
+ * per-scheduling-group stats object. It has a
+ * different name because partial specialization of
+ * function templates is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_histogram_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) { // lambda names the concrete stats type, so F is presumably always that type
+        return (stats.*f).hist; // only the histogram part of the member is aggregated
+    }, std::plus<utils::ihistogram>(), utils::ihistogram()).
+            then([](const utils::ihistogram& val) {
+        return make_ready_future<json::json_return_type>(to_json(val));
+    });
+}
+
+/**
+ * A variant of sum_timer_stats for the storage proxy
+ * case, where the get stats function doesn't return a
+ * stats object with fields but a per-scheduling-group
+ * stats object. It has a different name because
+ * partial specialization of function templates is not
+ * supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_timer_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) { // lambda names the concrete stats type, so F is presumably always that type
+        return (stats.*f).rate(); // map each stats object to the rate() of the selected member
+    }, std::plus<utils::rate_moving_average_and_histogram>(),
+            utils::rate_moving_average_and_histogram()).then([](const utils::rate_moving_average_and_histogram& val) {
+        return make_ready_future<json::json_return_type>(timer_to_json(val));
+    });
+}
+
 void set_storage_proxy(http_context& ctx, routes& r) {
    sp::get_total_hints.set(r, [](std::unique_ptr<request> req)  {
        //TBD
@@ -223,15 +325,15 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_attempts);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_blocking);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_background);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

    sp::get_schema_versions.set(r, [](std::unique_ptr<request> req)  {
@@ -275,6 +377,10 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
+    });
+
    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });
@@ -284,71 +390,71 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::write);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });
    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
@@ -367,30 +473,30 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_read);
    });

    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::read);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_write);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_write);
    });

    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::write);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::range);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -42,8 +42,6 @@
 #include "database.hh"
 #include "db/extensions.hh"

-sstables::sstable::version_types get_highest_supported_format();
-
 namespace api {

 namespace ss = httpd::storage_service_json;
@@ -56,57 +54,53 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r;
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (auto det : d._endpoint_details) {
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") {
+            ed.rack = det._rack;
        }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
    }
-    return res;
+    return r;
+}
+
+using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
+
+static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
+    return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto column_families = split_cf(req->get_query_param("cf"));
+        if (column_families.empty()) {
+            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+        }
+        return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
+    };
 }

 void set_storage_service(http_context& ctx, routes& r) {
-    using ks_cf_func = std::function<future<json::json_return_type>(std::unique_ptr<request>, sstring, std::vector<sstring>)>;
-
-    auto wrap_ks_cf = [&ctx](ks_cf_func f) {
-        return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
-            auto keyspace = validate_keyspace(ctx, req->param);
-            auto column_families = split_cf(req->get_query_param("cf"));
-            if (column_families.empty()) {
-                column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
-            }
-            return f(std::move(req), std::move(keyspace), std::move(column_families));
-        };
-    };
-
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
            return make_ready_future<json::json_return_type>(id.to_sstring());
        });
    });

-    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

-    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -124,8 +118,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        }));
    });

-    ss::get_leaving_nodes.set(r, [](const_req req) {
-        return container_to_vec(service::get_local_storage_service().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -133,8 +127,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -177,19 +171,18 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

-    ss::get_host_id_map.set(r, [](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(service::get_local_storage_service().
-                get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -222,64 +215,6 @@ void set_storage_service(http_context& ctx, routes& r) {
                req.get_query_param("key")));
    });

-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_snapshot_details().then([] (auto result) {
-            std::vector<ss::snapshots> res;
-            for (auto& map: result) {
-                ss::snapshots all_snapshots;
-                all_snapshots.key = map.first;
-
-                std::vector<ss::snapshot> snapshot;
-                for (auto& cf: map.second) {
-                    ss::snapshot s;
-                    s.ks = cf.ks;
-                    s.cf = cf.cf;
-                    s.live = cf.live;
-                    s.total = cf.total;
-                    snapshot.push_back(std::move(s));
-                }
-                all_snapshots.value = std::move(snapshot);
-                res.push_back(std::move(all_snapshots));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-
-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_family.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
-        } else {
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
-        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
-            return make_ready_future<json::json_return_type>(size);
-        });
-    });
-
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = split_cf(req->get_query_param("cf"));
@@ -317,8 +252,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -326,32 +261,7 @@ void set_storage_service(http_context& ctx, routes& r) {
        });
    });

-    ss::scrub.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
-        // TODO: respect this
-        auto skip_corrupted = req->get_query_param("skip_corrupted");
-
-        auto f = make_ready_future<>();
-        if (!req_param<bool>(*req, "disable_snapshot", false)) {
-            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
-            });
-        }
-
-        return f.then([&ctx, keyspace, column_families] {
-            return ctx.db.invoke_on_all([=] (database& db) {
-                return do_for_each(column_families, [=, &db](sstring cfname) {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(&cf);
-                });
-            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
-        });
-    }));
-
-    ss::upgrade_sstables.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

        return ctx.db.invoke_on_all([=] (database& db) {
@@ -1037,4 +947,125 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

+/// Registers the snapshot REST handlers (list, take, delete, size) and the scrub handler on \p r.
+///
+/// \param ctx API context; the scrub handler reaches the sharded database through it.
+/// \param r   route table the handlers are installed into.
+void set_snapshot(http_context& ctx, routes& r) {
+    // Writes the snapshot listing to the response stream as a JSON array, one element per
+    // entry of the map returned by get_snapshot_details().
+    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
+        using snapshot_map = std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>;
+        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
+            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first) {
+                return s.write("[").then([&s, &first] {
+                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (snapshot_map&& result) {
+                        return do_with(std::move(result), [&s, &first] (const snapshot_map& result) {
+                            // Take each entry by reference: `result` is kept alive by the enclosing
+                            // do_with, so no per-entry copy of the key and details vector is needed.
+                            return do_for_each(result, [&s, &first] (const snapshot_map::value_type& entry) {
+                                return do_with(ss::snapshots(), [&s, &first, &entry] (ss::snapshots& all_snapshots) {
+                                    all_snapshots.key = entry.first;
+                                    // Every element but the first is preceded by a separator.
+                                    future<> f = first ? make_ready_future<>() : s.write(", ");
+                                    first = false;
+                                    std::vector<ss::snapshot> snapshot;
+                                    for (auto& cf: entry.second) {
+                                        ss::snapshot snp;
+                                        snp.ks = cf.ks;
+                                        snp.cf = cf.cf;
+                                        snp.live = cf.live;
+                                        snp.total = cf.total;
+                                        snapshot.push_back(std::move(snp));
+                                    }
+                                    all_snapshots.value = std::move(snapshot);
+                                    return f.then([&s, &all_snapshots] {
+                                        return all_snapshots.write(s);
+                                    });
+                                });
+                            });
+                        });
+                    });
+                }).then([&s] {
+                    return s.write("]");
+                }).finally([&s] {
+                    // Close the stream whether or not the steps above succeeded; it must not be
+                    // destroyed while still open. A failure from above is still propagated.
+                    return s.close();
+                });
+            });
+        };
+        return make_ready_future<json::json_return_type>(std::move(f));
+    });
+
+    // Takes a snapshot tagged "tag". With an empty "cf" the comma-separated "kn" list is passed to
+    // take_snapshot(); otherwise "kn" must name exactly one keyspace and only table "cf" is snapshotted.
+    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+
+        auto resp = make_ready_future<>();
+        if (column_family.empty()) {
+            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+        } else {
+            if (keynames.empty()) {
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
+            if (keynames.size() > 1) {
+                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+            }
+            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
+        }
+        return resp.then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    // Clears snapshots: "tag", the comma-separated "kn" list and "cf" are handed to clear_snapshot() as given.
+    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    // Replies with the value produced by true_snapshots_size().
+    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
+        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+            return make_ready_future<json::json_return_type>(size);
+        });
+    });
+
+    // Scrubs the selected tables on every shard. Unless "disable_snapshot" is true, each table is
+    // first snapshotted under a "pre-scrub-<db_clock count>" tag.
+    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);
+
+        auto f = make_ready_future<>();
+        if (!req_param<bool>(*req, "disable_snapshot", false)) {
+            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
+            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
+                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            });
+        }
+
+        return f.then([&ctx, keyspace, column_families, skip_corrupted] {
+            return ctx.db.invoke_on_all([=] (database& db) {
+                return do_for_each(column_families, [=, &db](sstring cfname) {
+                    auto& cm = db.get_compaction_manager();
+                    auto& cf = db.find_column_family(keyspace, cfname);
+                    return cm.perform_sstable_scrub(&cf, skip_corrupted);
+                });
+            });
+        }).then([]{
+            return make_ready_future<json::json_return_type>(0);
+        });
+    }));
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -26,5 +26,6 @@
 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r);

 }
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -21,6 +21,7 @@

 #include "atomic_cell.hh"
 #include "atomic_cell_or_collection.hh"
+#include "counters.hh"
 #include "types.hh"

 /// LSA mirator for cells with irrelevant type
@@ -218,7 +219,9 @@ std::ostream&
 operator<<(std::ostream& os, const atomic_cell_view& acv) {
    if (acv.is_live()) {
        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
-            to_hex(acv.value().linearize()),
+            acv.is_counter_update()
+                    ? "counter_update_value=" + to_sstring(acv.counter_update_value())
+                    : to_hex(acv.value().linearize()),
            acv.timestamp(),
            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
@@ -238,8 +241,21 @@ operator<<(std::ostream& os, const atomic_cell_view::printer& acvp) {
    auto& type = acvp._type;
    auto& acv = acvp._cell;
    if (acv.is_live()) {
+        std::ostringstream cell_value_string_builder;
+        if (type.is_counter()) {
+            if (acv.is_counter_update()) {
+                cell_value_string_builder << "counter_update_value=" << acv.counter_update_value();
+            } else {
+                cell_value_string_builder << "shards: ";
+                counter_cell_view::with_linearized(acv, [&cell_value_string_builder] (counter_cell_view& ccv) {
+                    cell_value_string_builder << ::join(", ", ccv.shards());
+                });
+            }
+        } else {
+            cell_value_string_builder << type.to_string(acv.value().linearize());
+        }
        return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
-            type.to_string(acv.value().linearize()),
+            cell_value_string_builder.str(),
            acv.timestamp(),
            acv.is_live_and_has_ttl() ? acv.expiry().time_since_epoch().count() : -1,
            acv.is_live_and_has_ttl() ? acv.ttl().count() : 0);
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -52,7 +52,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authenticator_name();
    }

--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -49,7 +49,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authorizer_name();
    }

--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -96,7 +96,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    virtual bool require_authentication() const = 0;

--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -100,7 +100,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    ///
    /// Query for the permissions granted directly to a role for a particular \ref resource (and not any of its
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -59,7 +59,7 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-future<> create_metadata_table_if_missing(
+static future<> create_metadata_table_if_missing_impl(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
@@ -85,7 +85,14 @@ future<> create_metadata_table_if_missing(
    return ignore_existing([&mm, table = std::move(table)] () {
        return mm.announce_new_column_family(table, false);
    });
+}

+// Non-throwing entry point: futurize_apply reports an exception thrown by the impl as a failed future.
+future<> create_metadata_table_if_missing(
+        std::string_view table_name, cql3::query_processor& qp,
+        std::string_view cql, ::service::migration_manager& mm) noexcept {
+    auto& impl = create_metadata_table_if_missing_impl;
+    return futurize_apply(impl, table_name, qp, cql, mm);
 }

 future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -79,7 +79,7 @@ future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor&,
        std::string_view cql,
-        ::service::migration_manager&);
+        ::service::migration_manager&) noexcept;

 future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -101,7 +101,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 future<bool> default_authorizer::any_granted() const {
    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::AUTH_KS, PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -115,7 +115,7 @@ future<> default_authorizer::migrate_legacy_metadata() const {
    alogger.info("Starting migration of legacy permissions metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -195,7 +195,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
            ROLE_NAME,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -224,7 +224,7 @@ default_authorizer::modify(
                    ROLE_NAME,
                    RESOURCE_NAME),
            [this, &role_name, set, &resource](const auto& query) {
-        return _qp.process(
+        return _qp.execute_internal(
                query,
                db::consistency_level::ONE,
                internal_distributed_timeout_config(),
@@ -249,7 +249,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            meta::AUTH_KS,
            PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -276,7 +276,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) const {
            PERMISSIONS_CF,
            ROLE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -296,7 +296,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
            PERMISSIONS_CF,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -313,7 +313,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
                        ROLE_NAME,
                        RESOURCE_NAME);

-                return _qp.process(
+                return _qp.execute_internal(
                        query,
                        db::consistency_level::LOCAL_ONE,
                        infinite_timeout_config,
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return default_authorizer_name();
    }

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -96,10 +96,13 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }

-static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-        meta::roles_table::qualified_name(),
-        SALTED_HASH,
-        meta::roles_table::role_col_name);
+static const sstring& update_row_query() {
+    // Function-local static: the statement text is formatted on the first call, not at static-init time.
+    static const sstring query = format(
+            "UPDATE {} SET {} = ? WHERE {} = ?",
+            meta::roles_table::qualified_name(), SALTED_HASH, meta::roles_table::role_col_name);
+    return query;
+}

 static const sstring legacy_table_name{"credentials"};

@@ -111,7 +114,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -119,8 +122,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);

-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    consistency_for_user(username),
                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
@@ -136,8 +139,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 future<> password_authenticator::create_default_if_missing() const {
    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
@@ -194,7 +197,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
    return db::consistency_level::LOCAL_ONE;
 }

-const sstring& password_authenticator::qualified_java_name() const {
+std::string_view password_authenticator::qualified_java_name() const {
    return password_authenticator_name();
 }

@@ -233,7 +236,7 @@ future<authenticated_user> password_authenticator::authenticate(
                meta::roles_table::qualified_name(),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_user(username),
                internal_distributed_timeout_config(),
@@ -267,8 +270,8 @@ future<> password_authenticator::create(std::string_view role_name, const authen
        return make_ready_future<>();
    }

-    return _qp.process(
-            update_row_query,
+    return _qp.execute_internal(
+            update_row_query(),
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
@@ -284,7 +287,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
            SALTED_HASH,
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
@@ -297,7 +300,7 @@ future<> password_authenticator::drop(std::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query, consistency_for_user(name),
            internal_distributed_timeout_config(),
            {sstring(name)}).discard_result();
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override;
+    virtual std::string_view qualified_java_name() const override;

    virtual bool require_authentication() const override;

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception {
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception;
 };

 class role_already_exists : public roles_argument_exception {
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -68,14 +68,14 @@ future<bool> default_role_row_satisfies(
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
                infinite_timeout_config,
                {meta::DEFAULT_SUPERUSER_NAME},
                true).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
-                return qp.process(
+                return qp.execute_internal(
                        query,
                        db::consistency_level::QUORUM,
                        internal_distributed_timeout_config(),
@@ -100,7 +100,7 @@ future<bool> any_nondefault_role_row_satisfies(
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -193,9 +193,12 @@ future<> service::start(::service::migration_manager& mm) {
 future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
-    _mnotifier.unregister_listener(_migration_listener.get());
-
-    return _permissions_cache->stop().then([this] {
+    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
+        if (_permissions_cache) {
+            return _permissions_cache->stop();
+        }
+        return make_ready_future<>();
+    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
 }
@@ -217,7 +220,7 @@ future<bool> service::has_existing_legacy_users() const {
    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
    // can potentially avoid doing a range query with a high consistency level.

-    return _qp.process(
+    return _qp.execute_internal(
            default_user_query,
            db::consistency_level::ONE,
            infinite_timeout_config,
@@ -227,7 +230,7 @@ future<bool> service::has_existing_legacy_users() const {
            return make_ready_future<bool>(true);
        }

-        return _qp.process(
+        return _qp.execute_internal(
                default_user_query,
                db::consistency_level::QUORUM,
                infinite_timeout_config,
@@ -237,7 +240,7 @@ future<bool> service::has_existing_legacy_users() const {
                return make_ready_future<bool>(true);
            }

-            return _qp.process(
+            return _qp.execute_internal(
                    all_users_query,
                    db::consistency_level::QUORUM,
                    infinite_timeout_config).then([](auto results) {
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -35,6 +35,7 @@
 #include "auth/common.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
@@ -86,7 +87,7 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return qp.process(
+    return qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -170,7 +171,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
@@ -197,7 +198,7 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -258,7 +259,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -298,7 +299,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
            return make_ready_future<>();
        }

-        return _qp.process(
+        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
                        meta::roles_table::qualified_name(),
                        build_column_assignments(u),
@@ -320,7 +321,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
                    meta::role_members_table::qualified_name());

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -359,7 +360,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -386,7 +387,7 @@ standard_role_manager::modify_membership(
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_timeout_config(),
@@ -396,7 +397,7 @@ standard_role_manager::modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] {
        switch (ch) {
            case membership_change::add:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -404,7 +405,7 @@ standard_role_manager::modify_membership(
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -508,7 +509,7 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -82,7 +82,7 @@ public:
        return _authenticator->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authenticator_name();
    }

@@ -201,7 +201,7 @@ public:
        return _authorizer->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authorizer_name();
    }

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -23,7 +23,11 @@
 #include <seastar/core/scheduling.hh>
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/file.hh>
 #include <chrono>
+#include <cmath>
+
+#include "seastarx.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
--- a/build_id.cc
+++ b/build_id.cc
@@ -7,6 +7,7 @@
 #include <link.h>
 #include <seastar/core/align.hh>
 #include <sstream>
+#include <cassert>

 using namespace seastar;

--- a/bytes.cc
+++ b/bytes.cc
@@ -64,7 +64,7 @@ bytes from_hex(sstring_view s) {

 sstring to_hex(bytes_view b) {
    static char digits[] = "0123456789abcdef";
-    sstring out(sstring::initialized_later(), b.size() * 2);
+    sstring out = uninitialized_string(b.size() * 2);
    unsigned end = b.size();
    for (unsigned i = 0; i != end; ++i) {
        uint8_t x = b[i];
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -92,7 +92,7 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
 }

 static sstring bytes_to_text(bytes_view bv) {
-    sstring ret(sstring::initialized_later(), bv.size());
+    sstring ret = uninitialized_string(bv.size());
    std::copy_n(reinterpret_cast<const char*>(bv.data()), bv.size(), ret.data());
    return ret;
 }
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -22,7 +22,7 @@
 #pragma once

 #include "bytes.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -22,6 +22,9 @@

 #pragma once

+#include <vector>
+#include <sys/types.h>
+
 // Single-pass range over cartesian product of vectors.

 // Note:
--- a/cdc/cdc.cc
+++ b/cdc/cdc.cc
@@ -1,818 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <utility>
-#include <algorithm>
-
-#include <boost/range/irange.hpp>
-#include <seastar/util/defer.hh>
-#include <seastar/core/thread.hh>
-
-#include "cdc/cdc.hh"
-#include "bytes.hh"
-#include "database.hh"
-#include "db/config.hh"
-#include "dht/murmur3_partitioner.hh"
-#include "partition_slice_builder.hh"
-#include "schema.hh"
-#include "schema_builder.hh"
-#include "service/migration_listener.hh"
-#include "service/storage_service.hh"
-#include "types/tuple.hh"
-#include "cql3/statements/select_statement.hh"
-#include "cql3/multi_column_relation.hh"
-#include "cql3/tuples.hh"
-#include "log.hh"
-#include "json.hh"
-
-using locator::snitch_ptr;
-using locator::token_metadata;
-using locator::topology;
-using seastar::sstring;
-using service::migration_notifier;
-using service::storage_proxy;
-
-namespace std {
-
-template<> struct hash<std::pair<net::inet_address, unsigned int>> {
-    std::size_t operator()(const std::pair<net::inet_address, unsigned int> &p) const {
-        return std::hash<net::inet_address>{}(p.first) ^ std::hash<int>{}(p.second);
-    }
-};
-
-}
-
-using namespace std::chrono_literals;
-
-static logging::logger cdc_log("cdc");
-
-namespace cdc {
-static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
-static schema_ptr create_stream_description_table_schema(const schema&, std::optional<utils::UUID> = {});
-static future<> populate_desc(db_context ctx, const schema& s);
-}
-
-class cdc::cdc_service::impl : service::migration_listener::empty_listener {
-    friend cdc_service;
-    db_context _ctxt;
-public:
-    impl(db_context ctxt)
-        : _ctxt(std::move(ctxt))
-    {
-        _ctxt._migration_notifier.register_listener(this);
-    }
-    ~impl() {
-        _ctxt._migration_notifier.unregister_listener(this);
-    }
-
-    void on_before_create_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        if (schema.cdc_options().enabled()) {
-            auto& db = _ctxt._proxy.get_db().local();
-            auto logname = log_name(schema.cf_name());
-            if (!db.has_schema(schema.ks_name(), logname)) {
-                // in seastar thread
-                auto log_schema = create_log_schema(schema);
-                auto stream_desc_schema = create_stream_description_table_schema(schema);
-                auto& keyspace = db.find_keyspace(schema.ks_name());
-
-                auto log_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), log_schema, timestamp);
-                auto stream_mut = db::schema_tables::make_create_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-            }
-        }
-    }
-
-    void on_before_update_column_family(const schema& new_schema, const schema& old_schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        bool is_cdc = new_schema.cdc_options().enabled();
-        bool was_cdc = old_schema.cdc_options().enabled();
-
-        // we need to create or modify the log & stream schemas iff either we changed cdc status (was != is)
-        // or if cdc is on now unconditionally, since then any actual base schema changes will affect the column 
-        // etc.
-        if (was_cdc || is_cdc) {
-            auto logname = log_name(old_schema.cf_name());
-            auto descname = desc_name(old_schema.cf_name());
-            auto& db = _ctxt._proxy.get_db().local();
-            auto& keyspace = db.find_keyspace(old_schema.ks_name());
-            auto log_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), logname).schema() : nullptr;
-            auto stream_desc_schema = was_cdc ? db.find_column_family(old_schema.ks_name(), descname).schema() : nullptr;
-
-            if (!is_cdc) {
-                auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
-                auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-                mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-                mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-                return;
-            }
-
-            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
-            auto new_stream_desc_schema = create_stream_description_table_schema(new_schema, stream_desc_schema ? std::make_optional(stream_desc_schema->id()) : std::nullopt);
-
-            auto log_mut = log_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
-                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_log_schema, timestamp)
-                ;
-            auto stream_mut = stream_desc_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), stream_desc_schema, new_stream_desc_schema, timestamp, false)
-                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_stream_desc_schema, timestamp)
-                ;
-
-            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-        }
-    }
-
-    void on_before_drop_column_family(const schema& schema, std::vector<mutation>& mutations, api::timestamp_type timestamp) override {
-        if (schema.cdc_options().enabled()) {
-            auto logname = log_name(schema.cf_name());
-            auto descname = desc_name(schema.cf_name());
-            auto& db = _ctxt._proxy.get_db().local();
-            auto& keyspace = db.find_keyspace(schema.ks_name());
-            auto log_schema = db.find_column_family(schema.ks_name(), logname).schema();
-            auto stream_desc_schema = db.find_column_family(schema.ks_name(), descname).schema();
-
-            auto log_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), log_schema, timestamp);
-            auto stream_mut = db::schema_tables::make_drop_table_mutations(keyspace.metadata(), stream_desc_schema, timestamp);
-
-            mutations.insert(mutations.end(), std::make_move_iterator(log_mut.begin()), std::make_move_iterator(log_mut.end()));
-            mutations.insert(mutations.end(), std::make_move_iterator(stream_mut.begin()), std::make_move_iterator(stream_mut.end()));
-        }
-    }
-
-    void on_create_column_family(const sstring& ks_name, const sstring& cf_name) override {
-        // This callback is done on all shards. Only do the work once. 
-        if (engine().cpu_id() != 0) {
-            return; 
-        }
-        auto& db = _ctxt._proxy.get_db().local();
-        auto& cf = db.find_column_family(ks_name, cf_name);
-        auto schema = cf.schema();
-        if (schema->cdc_options().enabled()) {
-            populate_desc(_ctxt, *schema).get();
-        }
-    }
-
-    void on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed) override {
-        on_create_column_family(ks_name, cf_name);
-    }
-
-    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {}
-
-    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
-        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations
-    );
-
-    template<typename Iter>
-    future<> append_mutations(Iter i, Iter e, schema_ptr s, lowres_clock::time_point, std::vector<mutation>&);
-};
-
-cdc::cdc_service::cdc_service(service::storage_proxy& proxy)
-    : cdc_service(db_context::builder(proxy).build())
-{}
-
-cdc::cdc_service::cdc_service(db_context ctxt)
-    : _impl(std::make_unique<impl>(std::move(ctxt)))
-{
-    _impl->_ctxt._proxy.set_cdc_service(this);
-}
-
-cdc::cdc_service::~cdc_service() = default;
-
-cdc::options::options(const std::map<sstring, sstring>& map) {
-    if (map.find("enabled") == std::end(map)) {
-        return;
-    }
-
-    for (auto& p : map) {
-        if (p.first == "enabled") {
-            _enabled = p.second == "true";
-        } else if (p.first == "preimage") {
-            _preimage = p.second == "true";
-        } else if (p.first == "postimage") {
-            _postimage = p.second == "true";
-        } else if (p.first == "ttl") {
-            _ttl = std::stoi(p.second);
-        } else {
-            throw exceptions::configuration_exception("Invalid CDC option: " + p.first);
-        }
-    }
-}
-
-std::map<sstring, sstring> cdc::options::to_map() const {
-    if (!_enabled) {
-        return {};
-    }
-    return {
-        { "enabled", _enabled ? "true" : "false" },
-        { "preimage", _preimage ? "true" : "false" },
-        { "postimage", _postimage ? "true" : "false" },
-        { "ttl", std::to_string(_ttl) },
-    };
-}
-
-sstring cdc::options::to_sstring() const {
-    return json::to_json(to_map());
-}
-
-bool cdc::options::operator==(const options& o) const {
-    return _enabled == o._enabled && _preimage == o._preimage && _postimage == o._postimage && _ttl == o._ttl;
-}
-bool cdc::options::operator!=(const options& o) const {
-    return !(*this == o);
-}
-
-namespace cdc {
-
-using operation_native_type = std::underlying_type_t<operation>;
-using column_op_native_type = std::underlying_type_t<column_op>;
-
-sstring log_name(const sstring& table_name) {
-    static constexpr auto cdc_log_suffix = "_scylla_cdc_log";
-    return table_name + cdc_log_suffix;
-}
-
-sstring desc_name(const sstring& table_name) {
-    static constexpr auto cdc_desc_suffix = "_scylla_cdc_desc";
-    return table_name + cdc_desc_suffix;
-}
-
-static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
-    schema_builder b(s.ks_name(), log_name(s.cf_name()));
-    b.set_default_time_to_live(gc_clock::duration{s.cdc_options().ttl()});
-    b.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("stream_id", uuid_type, column_kind::partition_key);
-    b.with_column("time", timeuuid_type, column_kind::clustering_key);
-    b.with_column("batch_seq_no", int32_type, column_kind::clustering_key);
-    b.with_column("operation", data_type_for<operation_native_type>());
-    b.with_column("ttl", long_type);
-    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
-        for (const auto& column : columns) {
-            auto type = column.type;
-            if (is_data_col) {
-                type = tuple_type_impl::get_instance({ /* op */ data_type_for<column_op_native_type>(), /* value */ type, /* ttl */long_type});
-            }
-            b.with_column("_" + column.name(), type);
-        }
-    };
-    add_columns(s.partition_key_columns());
-    add_columns(s.clustering_key_columns());
-    add_columns(s.static_columns(), true);
-    add_columns(s.regular_columns(), true);
-
-    if (uuid) {
-        b.set_uuid(*uuid);
-    }
-    
-    return b.build();
-}
-
-static schema_ptr create_stream_description_table_schema(const schema& s, std::optional<utils::UUID> uuid) {
-    schema_builder b(s.ks_name(), desc_name(s.cf_name()));
-    b.set_comment(sprint("CDC description for %s.%s", s.ks_name(), s.cf_name()));
-    b.with_column("node_ip", inet_addr_type, column_kind::partition_key);
-    b.with_column("shard_id", int32_type, column_kind::partition_key);
-    b.with_column("created_at", timestamp_type, column_kind::clustering_key);
-    b.with_column("stream_id", uuid_type);
-
-    if (uuid) {
-        b.set_uuid(*uuid);
-    }
-
-    return b.build();
-}
-
-// This function assumes setup_stream_description_table was called on |s| before the call to this
-// function.
-static future<> populate_desc(db_context ctx, const schema& s) {
-    auto& db = ctx._proxy.get_db().local();
-    auto desc_schema =
-        db.find_schema(s.ks_name(), desc_name(s.cf_name()));
-    auto log_schema =
-        db.find_schema(s.ks_name(), log_name(s.cf_name()));
-    auto belongs_to = [&](const gms::inet_address& endpoint,
-                          const unsigned int shard_id,
-                          const int shard_count,
-                          const unsigned int ignore_msb_bits,
-                          const utils::UUID& stream_id) {
-        const auto log_pk = partition_key::from_singular(*log_schema,
-                                                         data_value(stream_id));
-        const auto token = ctx._partitioner.decorate_key(*log_schema, log_pk).token();
-        if (ctx._token_metadata.get_endpoint(ctx._token_metadata.first_token(token)) != endpoint) {
-            return false;
-        }
-        const auto owning_shard_id = dht::murmur3_partitioner(shard_count, ignore_msb_bits).shard_of(token);
-        return owning_shard_id == shard_id;
-    };
-
-    std::vector<mutation> mutations;
-    const auto ts = api::new_timestamp();
-    const auto ck = clustering_key::from_single_value(
-            *desc_schema, timestamp_type->decompose(ts));
-    auto cdef = desc_schema->get_column_definition(to_bytes("stream_id"));
-
-    for (const auto& dc : ctx._token_metadata.get_topology().get_datacenter_endpoints()) {
-        for (const auto& endpoint : dc.second) {
-            const auto decomposed_ip = inet_addr_type->decompose(endpoint.addr());
-            const unsigned int shard_count = ctx._snitch->get_shard_count(endpoint);
-            const unsigned int ignore_msb_bits = ctx._snitch->get_ignore_msb_bits(endpoint);
-            for (unsigned int shard_id = 0; shard_id < shard_count; ++shard_id) {
-                const auto pk = partition_key::from_exploded(
-                        *desc_schema, { decomposed_ip, int32_type->decompose(static_cast<int>(shard_id)) });
-                mutations.emplace_back(desc_schema, pk);
-
-                auto stream_id = utils::make_random_uuid();
-                while (!belongs_to(endpoint, shard_id, shard_count, ignore_msb_bits, stream_id)) {
-                    stream_id = utils::make_random_uuid();
-                }
-                auto value = atomic_cell::make_live(*uuid_type,
-                                                    ts,
-                                                    uuid_type->decompose(stream_id));
-                mutations.back().set_cell(ck, *cdef, std::move(value));
-            }
-        }
-    }
-    return ctx._proxy.mutate(std::move(mutations),
-                             db::consistency_level::QUORUM,
-                             db::no_timeout,
-                             nullptr,
-                             empty_service_permit());
-}
-
-db_context::builder::builder(service::storage_proxy& proxy) 
-    : _proxy(proxy) 
-{}
-
-db_context::builder& db_context::builder::with_migration_notifier(service::migration_notifier& migration_notifier) {
-    _migration_notifier = migration_notifier;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_token_metadata(locator::token_metadata& token_metadata) {
-    _token_metadata = token_metadata;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_snitch(locator::snitch_ptr& snitch) {
-    _snitch = snitch;
-    return *this;
-}
-
-db_context::builder& db_context::builder::with_partitioner(dht::i_partitioner& partitioner) {
-    _partitioner = partitioner;
-    return *this;
-}
-
-db_context db_context::builder::build() {
-    return db_context{
-        _proxy,
-        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
-        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
-        _snitch ? _snitch->get() : locator::i_endpoint_snitch::get_local_snitch_ptr(),
-        _partitioner ? _partitioner->get() : dht::global_partitioner()
-    };
-}
-
-class transformer final {
-public:
-    using streams_type = std::unordered_map<std::pair<net::inet_address, unsigned int>, utils::UUID>;
-private:
-    db_context _ctx;
-    schema_ptr _schema;
-    schema_ptr _log_schema;
-    utils::UUID _time;
-    bytes _decomposed_time;
-    ::shared_ptr<const transformer::streams_type> _streams;
-    const column_definition& _op_col;
-
-    clustering_key set_pk_columns(const partition_key& pk, int batch_no, mutation& m) const {
-        const auto log_ck = clustering_key::from_exploded(
-                *m.schema(), { _decomposed_time, int32_type->decompose(batch_no) });
-        auto pk_value = pk.explode(*_schema);
-        size_t pos = 0;
-        for (const auto& column : _schema->partition_key_columns()) {
-            assert (pos < pk_value.size());
-            auto cdef = m.schema()->get_column_definition(to_bytes("_" + column.name()));
-            auto value = atomic_cell::make_live(*column.type,
-                                                _time.timestamp(),
-                                                bytes_view(pk_value[pos]));
-            m.set_cell(log_ck, *cdef, std::move(value));
-            ++pos;
-        }
-        return log_ck;
-    }
-
-    void set_operation(const clustering_key& ck, operation op, mutation& m) const {
-        m.set_cell(ck, _op_col, atomic_cell::make_live(*_op_col.type, _time.timestamp(), _op_col.type->decompose(operation_native_type(op))));
-    }
-
-    partition_key stream_id(const net::inet_address& ip, unsigned int shard_id) const {
-        auto it = _streams->find(std::make_pair(ip, shard_id));
-        if (it == std::end(*_streams)) {
-                throw std::runtime_error(format("No stream found for node {} and shard {}", ip, shard_id));
-        }
-        return partition_key::from_exploded(*_log_schema, { uuid_type->decompose(it->second) });
-    }
-public:
-    transformer(db_context ctx, schema_ptr s, ::shared_ptr<const transformer::streams_type> streams)
-        : _ctx(ctx)
-        , _schema(std::move(s))
-        , _log_schema(ctx._proxy.get_db().local().find_schema(_schema->ks_name(), log_name(_schema->cf_name())))
-        , _time(utils::UUID_gen::get_time_UUID())
-        , _decomposed_time(timeuuid_type->decompose(_time))
-        , _streams(std::move(streams))
-        , _op_col(*_log_schema->get_column_definition(to_bytes("operation")))
-    {}
-
-    // TODO: is pre-image data based on query enough. We only have actual column data. Do we need
-    // more details like tombstones/ttl? Probably not but keep in mind.
-    mutation transform(const mutation& m, const cql3::untyped_result_set* rs = nullptr) const {
-        auto& t = m.token();
-        auto&& ep = _ctx._token_metadata.get_endpoint(
-                _ctx._token_metadata.first_token(t));
-        if (!ep) {
-            throw std::runtime_error(format("No owner found for key {}", m.decorated_key()));
-        }
-        auto shard_id = dht::murmur3_partitioner(_ctx._snitch->get_shard_count(*ep), _ctx._snitch->get_ignore_msb_bits(*ep)).shard_of(t);
-        mutation res(_log_schema, stream_id(ep->addr(), shard_id));
-        auto& p = m.partition();
-        if (p.partition_tombstone()) {
-            // Partition deletion
-            auto log_ck = set_pk_columns(m.key(), 0, res);
-            set_operation(log_ck, operation::partition_delete, res);
-        } else if (!p.row_tombstones().empty()) {
-            // range deletion
-            int batch_no = 0;
-            for (auto& rt : p.row_tombstones()) {
-                auto set_bound = [&] (const clustering_key& log_ck, const clustering_key_prefix& ckp) {
-                    auto exploded = ckp.explode(*_schema);
-                    size_t pos = 0;
-                    for (const auto& column : _schema->clustering_key_columns()) {
-                        if (pos >= exploded.size()) {
-                            break;
-                        }
-                        auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                        auto value = atomic_cell::make_live(*column.type,
-                                                            _time.timestamp(),
-                                                            bytes_view(exploded[pos]));
-                        res.set_cell(log_ck, *cdef, std::move(value));
-                        ++pos;
-                    }
-                };
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.start);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_start, res);
-                    ++batch_no;
-                }
-                {
-                    auto log_ck = set_pk_columns(m.key(), batch_no, res);
-                    set_bound(log_ck, rt.end);
-                    // TODO: separate inclusive/exclusive range
-                    set_operation(log_ck, operation::range_delete_end, res);
-                    ++batch_no;
-                }
-            }
-        } else {
-            // should be update or deletion
-            int batch_no = 0;
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto ck_value = r.key().explode(*_schema);
-
-                std::optional<clustering_key> pikey;
-                const cql3::untyped_result_set_row * pirow = nullptr;
-
-                if (rs) {
-                    for (auto& utr : *rs) {
-                        bool match = true;
-                        for (auto& c : _schema->clustering_key_columns()) {
-                            auto rv = utr.get_view(c.name_as_text());
-                            auto cv = r.key().get_component(*_schema, c.component_index());
-                            if (rv != cv) {
-                                match = false;
-                                break;
-                            }
-                        }
-                        if (match) {
-                            pikey = set_pk_columns(m.key(), batch_no, res);
-                            set_operation(*pikey, operation::pre_image, res);
-                            pirow = &utr;
-                            ++batch_no;
-                            break;
-                        }
-                    }
-                }
-
-                auto log_ck = set_pk_columns(m.key(), batch_no, res);
-
-                size_t pos = 0;
-                for (const auto& column : _schema->clustering_key_columns()) {
-                    assert (pos < ck_value.size());
-                    auto cdef = _log_schema->get_column_definition(to_bytes("_" + column.name()));
-                    res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-
-                    if (pirow) {
-                        assert(pirow->has(column.name_as_text()));
-                        res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, _time.timestamp(), bytes_view(ck_value[pos])));
-                    }
-
-                    ++pos;
-                }
-
-                std::vector<bytes_opt> values(3);
-
-                auto process_cells = [&](const row& r, column_kind ckind) {
-                    r.for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
-                        auto& cdef = _schema->column_at(ckind, id);
-                        auto* dst = _log_schema->get_column_definition(to_bytes("_" + cdef.name()));
-                        // todo: collections.
-                        if (cdef.is_atomic()) {
-                            column_op op;
-
-                            values[1] = values[2] = std::nullopt;
-                            auto view = cell.as_atomic_cell(cdef);
-                            if (view.is_live()) {
-                                op = column_op::set;
-                                values[1] = view.value().linearize();
-                                if (view.is_live_and_has_ttl()) {
-                                    values[2] = long_type->decompose(data_value(view.ttl().count()));
-                                }
-                            } else {
-                                op = column_op::del;
-                            }
-
-                            values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(op)));
-                            res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-
-                            if (pirow && pirow->has(cdef.name_as_text())) {
-                                values[0] = data_type_for<column_op_native_type>()->decompose(data_value(static_cast<column_op_native_type>(column_op::set)));
-                                values[1] = pirow->get_blob(cdef.name_as_text());
-                                values[2] = std::nullopt;
-
-                                assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
-                                assert(pikey->explode() != log_ck.explode());
-                                res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, _time.timestamp(), tuple_type_impl::build_value(values)));
-                            }
-                        } else {
-                            cdc_log.warn("Non-atomic cell ignored {}.{}:{}", _schema->ks_name(), _schema->cf_name(), cdef.name_as_text());
-                        }
-                    });
-                };
-
-                process_cells(r.row().cells(), column_kind::regular_column);
-                process_cells(p.static_row().get(), column_kind::static_column);
-
-                set_operation(log_ck, operation::update, res);
-                ++batch_no;
-            }
-        }
-
-        return res;
-    }
-
-    static db::timeout_clock::time_point default_timeout() {
-        return db::timeout_clock::now() + 10s;
-    }
-
-    future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
-            service::client_state& client_state,
-            db::consistency_level cl,
-            const mutation& m)
-    {
-        auto& p = m.partition();
-        if (p.partition_tombstone() || !p.row_tombstones().empty() || p.clustered_rows().empty()) {
-            return make_ready_future<lw_shared_ptr<cql3::untyped_result_set>>();
-        }
-
-        dht::partition_range_vector partition_ranges{dht::partition_range(m.decorated_key())};
-
-        auto&& pc = _schema->partition_key_columns();
-        auto&& cc = _schema->clustering_key_columns();
-
-        std::vector<query::clustering_range> bounds;
-        if (cc.empty()) {
-            bounds.push_back(query::clustering_range::make_open_ended_both_sides());
-        } else {
-            for (const rows_entry& r : p.clustered_rows()) {
-                auto& ck = r.key();
-                bounds.push_back(query::clustering_range::make_singular(ck));
-            }
-        }
-
-        std::vector<const column_definition*> columns;
-        columns.reserve(_schema->all_columns().size());
-
-        std::transform(pc.begin(), pc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-        std::transform(cc.begin(), cc.end(), std::back_inserter(columns), [](auto& c) { return &c; });
-
-        query::column_id_vector static_columns, regular_columns;
-
-        auto sk = column_kind::static_column;
-        auto rk = column_kind::regular_column;
-        // TODO: this assumes all mutations touch the same set of columns. This might not be true, and we may need to do more horrible set operation here.
-        for (auto& [r, cids, kind] : { std::tie(p.static_row().get(), static_columns, sk), std::tie(p.clustered_rows().begin()->row().cells(), regular_columns, rk) }) {
-            r.for_each_cell([&](column_id id, const atomic_cell_or_collection&) {
-                auto& cdef =_schema->column_at(kind, id);
-                cids.emplace_back(id);
-                columns.emplace_back(&cdef);
-            });
-        }
-
-        auto selection = cql3::selection::selection::for_columns(_schema, std::move(columns));
-        auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), selection->get_query_options());
-        auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_partitions);
-
-        return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
-                [s = _schema, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
-                    cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
-                    query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *s, *selection));
-                    auto result_set = builder.build();
-                    if (!result_set || result_set->empty()) {
-                        return {};
-                    }
-                    return make_lw_shared<cql3::untyped_result_set>(*result_set);
-        });
-    }
-};
-
-// This class is used to build a mapping from <node ip, shard id> to stream_id
-// It is used as a consumer for rows returned by the query to CDC Description Table
-class streams_builder {
-    const schema& _schema;
-    transformer::streams_type _streams;
-    net::inet_address _node_ip = net::inet_address();
-    unsigned int _shard_id = 0;
-    api::timestamp_type _latest_row_timestamp = api::min_timestamp;
-    utils::UUID _latest_row_stream_id = utils::UUID();
-public:
-    streams_builder(const schema& s) : _schema(s) {}
-
-    void accept_new_partition(const partition_key& key, uint32_t row_count) {
-        auto exploded = key.explode(_schema);
-        _node_ip = value_cast<net::inet_address>(inet_addr_type->deserialize(exploded[0]));
-        _shard_id = static_cast<unsigned int>(value_cast<int>(int32_type->deserialize(exploded[1])));
-        _latest_row_timestamp = api::min_timestamp;
-        _latest_row_stream_id = utils::UUID();
-    }
-
-    void accept_new_partition(uint32_t row_count) {
-        assert(false);
-    }
-
-    void accept_new_row(
-            const clustering_key& key,
-            const query::result_row_view& static_row,
-            const query::result_row_view& row) {
-        auto row_iterator = row.iterator();
-        api::timestamp_type timestamp = value_cast<db_clock::time_point>(
-                timestamp_type->deserialize(key.explode(_schema)[0])).time_since_epoch().count();
-        if (timestamp <= _latest_row_timestamp) {
-            return;
-        }
-        _latest_row_timestamp = timestamp;
-        for (auto&& cdef : _schema.regular_columns()) {
-            if (cdef.name_as_text() != "stream_id") {
-                row_iterator.skip(cdef);
-                continue;
-            }
-            auto val_opt = row_iterator.next_atomic_cell();
-            assert(val_opt);
-            val_opt->value().with_linearized([&] (bytes_view bv) {
-                _latest_row_stream_id = value_cast<utils::UUID>(uuid_type->deserialize(bv));
-            });
-        }
-    }
-
-    void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
-        assert(false);
-    }
-
-    void accept_partition_end(const query::result_row_view& static_row) {
-        _streams.emplace(std::make_pair(_node_ip, _shard_id), _latest_row_stream_id);
-    }
-
-    transformer::streams_type build() {
-        return std::move(_streams);
-    }
-};
-
-static future<::shared_ptr<transformer::streams_type>> get_streams(
-        db_context ctx,
-        const sstring& ks_name,
-        const sstring& cf_name,
-        lowres_clock::time_point timeout,
-        service::query_state& qs) {
-    auto s =
-        ctx._proxy.get_db().local().find_schema(ks_name, desc_name(cf_name));
-    query::read_command cmd(
-            s->id(),
-            s->version(),
-            partition_slice_builder(*s).with_no_static_columns().build());
-    return ctx._proxy.query(
-            s,
-            make_lw_shared(std::move(cmd)),
-            {dht::partition_range::make_open_ended_both_sides()},
-            db::consistency_level::QUORUM,
-            {timeout, qs.get_permit(), qs.get_client_state()}).then([s = std::move(s)] (auto qr) mutable {
-        return query::result_view::do_with(*qr.query_result,
-                [s = std::move(s)] (query::result_view v) {
-            auto slice = partition_slice_builder(*s)
-                    .with_no_static_columns()
-                    .build();
-            streams_builder builder{ *s };
-            v.consume(slice, builder);
-            return ::make_shared<transformer::streams_type>(builder.build());
-        });
-    });
-}
-
-template <typename Func>
-future<std::vector<mutation>>
-transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_size, Func&& f) {
-    return parallel_for_each(
-            boost::irange(static_cast<decltype(muts.size())>(0), muts.size(), batch_size),
-            std::move(f))
-        .then([&muts] () mutable { return std::move(muts); });
-}
-
-} // namespace cdc
-
-future<std::tuple<std::vector<mutation>, cdc::result_callback>>
-cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
-    // we do all this because in the case of batches, we can have mixed schemas.
-    auto e = mutations.end();
-    auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
-        return m.schema()->cdc_options().enabled();
-    });
-
-    if (i == e) {
-        return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
-    }
-
-    mutations.reserve(2 * mutations.size());
-
-    return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), [this, timeout, i](std::vector<mutation>& mutations, service::query_state& qs) {
-        return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs] (int idx) {
-            auto& m = mutations[idx];
-            auto s = m.schema();
-
-            if (!s->cdc_options().enabled()) {
-                return make_ready_future<>();
-            }
-            // for batches/multiple mutations this is super inefficient. either partition the mutation set by schema
-            // and re-use streams, or probably better: add a cache so this lookup is a noop on second mutation
-            return get_streams(_ctxt, s->ks_name(), s->cf_name(), timeout, qs).then([this, s = std::move(s), &qs, &mutations, idx](::shared_ptr<transformer::streams_type> streams) mutable {
-                auto& m = mutations[idx]; // should not really need because of reserve, but lets be conservative
-                transformer trans(_ctxt, s, streams);
-
-                if (!s->cdc_options().preimage()) {
-                    mutations.emplace_back(trans.transform(m));
-                    return make_ready_future<>();
-                }
-
-                // Note: further improvement here would be to coalesce the pre-image selects into one
-                // iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
-                // so this is premature.
-                auto f = trans.pre_image_select(qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m);
-                return f.then([trans = std::move(trans), &mutations, idx] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
-                    mutations.push_back(trans.transform(mutations[idx], rs.get()));
-                });
-            });
-        }).then([](std::vector<mutation> mutations) {
-            return make_ready_future<std::tuple<std::vector<mutation>, cdc::result_callback>>(std::make_tuple(std::move(mutations), result_callback{}));
-        });
-    });
-}
-
-bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutations) const {
-    return std::any_of(mutations.begin(), mutations.end(), [](const mutation& m) {
-        return m.schema()->cdc_options().enabled();
-    });
-}
-
-future<std::tuple<std::vector<mutation>, cdc::result_callback>>
-cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations) {
-    return _impl->augment_mutation_call(timeout, std::move(mutations));
-}
--- a/cdc/cdc.hh
+++ b/cdc/cdc.hh
@@ -1,141 +0,0 @@
-/*
- * Copyright (C) 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <map>
-#include <string>
-#include <vector>
-
-#include <seastar/core/future.hh>
-#include <seastar/core/lowres_clock.hh>
-#include <seastar/core/shared_ptr.hh>
-#include <seastar/core/sstring.hh>
-
-#include "exceptions/exceptions.hh"
-#include "timestamp.hh"
-#include "cdc_options.hh"
-
-class schema;
-using schema_ptr = seastar::lw_shared_ptr<const schema>;
-
-namespace locator {
-
-class snitch_ptr;
-class token_metadata;
-
-} // namespace locator
-
-namespace service {
-
-class migration_notifier;
-class storage_proxy;
-class query_state;
-
-} // namespace service
-
-namespace dht {
-
-class i_partitioner;
-
-} // namespace dht
-
-class mutation;
-class partition_key;
-
-namespace cdc {
-
-class db_context;
-
-// Callback to be invoked on mutation finish to fix
-// the whole bit about post-image.
-// TODO: decide on what the parameters are to be for this.
-using result_callback = std::function<future<>()>;
-
-/// \brief CDC service, responsible for schema listeners
-///
-/// CDC service will listen for schema changes and iff CDC is enabled/changed
-/// create/modify/delete corresponding log tables etc as part of the schema change. 
-///
-class cdc_service {
-    class impl;
-    std::unique_ptr<impl> _impl;
-public:
-    cdc_service(service::storage_proxy&);
-    cdc_service(db_context);
-    ~cdc_service();
-
-    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
-    // appropriate augments to set the log entries.
-    // Iff post-image is enabled for any of these, a non-empty callback is also
-    // returned to be invoked post the mutation query.
-    future<std::tuple<std::vector<mutation>, result_callback>> augment_mutation_call(
-        lowres_clock::time_point timeout,
-        std::vector<mutation>&& mutations
-        );
-    bool needs_cdc_augmentation(const std::vector<mutation>&) const;
-};
-
-struct db_context final {
-    service::storage_proxy& _proxy;
-    service::migration_notifier& _migration_notifier;
-    locator::token_metadata& _token_metadata;
-    locator::snitch_ptr& _snitch;
-    dht::i_partitioner& _partitioner;
-
-    class builder final {
-        service::storage_proxy& _proxy;
-        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
-        std::optional<std::reference_wrapper<locator::snitch_ptr>> _snitch;
-        std::optional<std::reference_wrapper<dht::i_partitioner>> _partitioner;
-    public:
-        builder(service::storage_proxy& proxy);
-
-        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(locator::token_metadata& token_metadata);
-        builder& with_snitch(locator::snitch_ptr& snitch);
-        builder& with_partitioner(dht::i_partitioner& partitioner);
-
-        db_context build();
-    };
-};
-
-// cdc log table operation
-enum class operation : int8_t {
-    // note: these values will eventually be read by a third party, probably not privvy to this
-    // enum decl, so don't change the constant values (or the datatype).
-    pre_image = 0, update = 1, row_delete = 2, range_delete_start = 3, range_delete_end = 4, partition_delete = 5
-};
-
-// cdc log data column operation
-enum class column_op : int8_t {
-    // same as "operation". Do not edit values or type/type unless you _really_ want to.
-    set = 0, del = 1, add = 2,
-};
-
-seastar::sstring log_name(const seastar::sstring& table_name);
-
-seastar::sstring desc_name(const seastar::sstring& table_name);
-
-} // namespace cdc
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "db/extensions.hh"
+#include "cdc/cdc_options.hh"
+#include "schema.hh"
+
+namespace cdc {
+
+class cdc_extension : public schema_extension { // schema extension that carries a table's cdc::options
+    cdc::options _cdc_options;
+public:
+    static constexpr auto NAME = "cdc"; // presumably the key this extension is registered under -- confirm at the registration site
+
+    cdc_extension() = default;
+    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {} // builds the options from a string->string map
+    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {} // builds the options from a buffer decoded by deserialize()
+    explicit cdc_extension(const sstring& s) { // construction from a string is not supported: always throws std::logic_error
+        throw std::logic_error("Cannot create cdc info from string");
+    }
+    bytes serialize() const override { // serialized form is the options' map representation (to_map())
+        return ser::serialize_to_buffer<bytes>(_cdc_options.to_map());
+    }
+    static std::map<sstring, sstring> deserialize(const bytes_view& buffer) { // decodes a buffer into a string->string map
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const options& get_options() const { // read-only access to the stored options
+        return _cdc_options;
+    }
+};
+
+}
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -0,0 +1,405 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/type.hpp>
+#include <random>
+#include <unordered_set>
+#include <seastar/core/sleep.hh>
+
+#include "keys.hh"
+#include "schema_builder.hh"
+#include "db/config.hh"
+#include "db/system_keyspace.hh"
+#include "db/system_distributed_keyspace.hh"
+#include "dht/token-sharding.hh"
+#include "locator/token_metadata.hh"
+#include "gms/application_state.hh"
+#include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
+
+#include "cdc/generation.hh"
+
+extern logging::logger cdc_log;
+
+static int get_shard_count(const gms::inet_address& endpoint, const gms::gossiper& g) { // SHARD_COUNT application state of `endpoint` as an int, or -1 if absent
+    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
+    return ep_state ? std::stoi(ep_state->value) : -1; // -1 means "no SHARD_COUNT state"; std::stoi throws if the value is not numeric
+}
+
+static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const gms::gossiper& g) { // IGNORE_MSB_BITS application state of `endpoint`, or 0 if absent
+    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
+    return ep_state ? std::stoi(ep_state->value) : 0; // the int from std::stoi is converted to unsigned on return; std::stoi throws on a non-numeric value
+}
+
+namespace cdc {
+
+extern const api::timestamp_clock::duration generation_leeway = // constant with external linkage: 5 seconds expressed as an api::timestamp_clock duration
+    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) { // writes the 8 bytes of `i` into `b` starting at `offset`, after net::hton conversion
+    i = net::hton(i); // convert the local copy from host to network byte order before the raw byte copy
+    std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset); // no bounds check here: `b` must already hold at least offset + 8 bytes
+}
+
+stream_id::stream_id(int64_t first, int64_t second) // builds a 16-byte id: `first` encoded in bytes 0-7, `second` in bytes 8-15
+    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
+{
+    copy_int_to_bytes(first, 0, _value);
+    copy_int_to_bytes(second, sizeof(int64_t), _value);
+}
+
+stream_id::stream_id(bytes b) : _value(std::move(b)) { } // adopts `b` as the raw id value; its length is not checked here
+
+bool stream_id::is_set() const { // an id counts as "set" when its byte value is non-empty
+    return !_value.empty();
+}
+
+bool stream_id::operator==(const stream_id& o) const { // equality of the underlying byte values
+    return _value == o._value;
+}
+
+bool stream_id::operator<(const stream_id& o) const { // ordering delegates to operator< of the underlying byte values
+    return _value < o._value;
+}
+
+static int64_t bytes_to_int64(const bytes& b, size_t offset) { // reads 8 bytes of `b` at `offset` and converts them back with net::ntoh
+    assert(b.size() >= offset + sizeof(int64_t)); // the 8-byte window must lie entirely inside `b`
+    int64_t res;
+    std::copy_n(b.begin() + offset, sizeof(int64_t), reinterpret_cast<int8_t *>(&res)); // raw byte copy into `res`; byte order is fixed up on return
+    return net::ntoh(res);
+}
+
+int64_t stream_id::first() const { // decodes the integer stored in bytes 0-7; asserts (via bytes_to_int64) that the value is long enough
+    return bytes_to_int64(_value, 0);
+}
+
+int64_t stream_id::second() const { // decodes the integer stored in bytes 8-15; asserts (via bytes_to_int64) that the value is long enough
+    return bytes_to_int64(_value, sizeof(int64_t));
+}
+
+const bytes& stream_id::to_bytes() const { // returns a reference to the raw id bytes (no copy)
+    return _value;
+}
+
+partition_key stream_id::to_partition_key(const schema& log_schema) const { // builds a single-value partition key for `log_schema` from the raw id bytes
+    return partition_key::from_single_value(log_schema, _value);
+}
+
+bool token_range_description::operator==(const token_range_description& o) const { // equal when range end, streams and sharding_ignore_msb all match
+    return token_range_end == o.token_range_end && streams == o.streams
+        && sharding_ignore_msb == o.sharding_ignore_msb;
+}
+
+topology_description::topology_description(std::vector<token_range_description> entries) // takes ownership of `entries` (moved into _entries)
+    : _entries(std::move(entries)) {}
+
+bool topology_description::operator==(const topology_description& o) const { // element-wise comparison of the entry vectors
+    return _entries == o._entries;
+}
+
+const std::vector<token_range_description>& topology_description::entries() const { // read-only view of the stored entries
+    return _entries;
+}
+
+static stream_id make_random_stream_id() { // returns a stream_id built from two independently drawn random 64-bit integers
+    static thread_local std::mt19937_64 rand_gen(std::random_device().operator()()); // one engine per thread, seeded once from std::random_device
+    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min()); // only the lower bound is given; the upper bound defaults to int64_t max
+
+    return {rand_dist(rand_gen), rand_dist(rand_gen)};
+}
+
+/* Given:
+ * 1. a set of tokens which split the token ring into token ranges (vnodes),
+ * 2. information on how each token range is distributed among its owning node's shards
+ * this function tries to generate a set of CDC stream identifiers such that for each
+ * shard and vnode pair there exists a stream whose token falls into this
+ * vnode and is owned by this shard.
+ *
+ * It then builds a cdc::topology_description which maps tokens to these
+ * found stream identifiers, such that if token T is owned by shard S in vnode V,
+ * it gets mapped to the stream identifier generated for (S, V).
+ */
+// Must run in a seastar::async context: the function waits on a future with .get(). Throws std::runtime_error on empty bootstrap_tokens or an ownerless token.
+topology_description generate_topology_description(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& token_metadata,
+        const gms::gossiper& gossiper) {
+    if (bootstrap_tokens.empty()) {
+        throw std::runtime_error(
+                "cdc: bootstrap tokens is empty in generate_topology_description");
+    }
+
+    auto tokens = token_metadata.sorted_tokens(); // start from the tokens already known to token_metadata
+    tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end()); // add the bootstrap tokens
+    std::sort(tokens.begin(), tokens.end()); // re-sort after the append so lower_bound below is valid
+    tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end()); // drop duplicates
+
+    std::vector<token_range_description> entries(tokens.size()); // entries[i] describes the range ending at tokens[i]
+    int spots_to_fill = 0; // number of (range, shard) slots that still lack a stream id
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        auto& entry = entries[i];
+        entry.token_range_end = tokens[i];
+
+        if (bootstrap_tokens.count(entry.token_range_end) > 0) {
+            entry.streams.resize(smp::count); // bootstrap token: sized by smp::count
+            entry.sharding_ignore_msb = cfg.murmur3_partitioner_ignore_msb_bits(); // ... and the ignore-msb setting from `cfg`
+        } else {
+            auto endpoint = token_metadata.get_endpoint(entry.token_range_end);
+            if (!endpoint) {
+                throw std::runtime_error(format("Can't find endpoint for token {}", entry.token_range_end));
+            }
+            auto sc = get_shard_count(*endpoint, gossiper);
+            entry.streams.resize(sc > 0 ? sc : 1); // shard count missing (-1) or non-positive: fall back to a single stream
+            entry.sharding_ignore_msb = get_sharding_ignore_msb(*endpoint, gossiper);
+        }
+
+        spots_to_fill += entry.streams.size();
+    }
+
+    auto schema = schema_builder("fake_ks", "fake_table") // throwaway schema, used below only to compute tokens of candidate stream ids
+        .with_column("stream_id", bytes_type, column_kind::partition_key)
+        .build();
+
+    auto quota = std::chrono::seconds(spots_to_fill / 2000 + 1); // time budget: 1 second plus 1 more per 2000 slots
+    auto start_time = std::chrono::system_clock::now();
+
+    // For each pair (i, j), 0 <= i < entries.size(), 0 <= j < entries[i].streams.size(),
+    // try to find a stream (entries[i].streams[j]) such that the token of this stream will get mapped to this stream
+    // (refer to the comments above topology_description's definition to understand how it describes the mapping).
+    // We find the streams by randomly generating them and checking into which pairs they get mapped.
+    // NOTE: this algorithm is temporary and will be replaced after per-table-partitioner feature gets merged in.
+    repeat([&] {
+        for (int i = 0; i < 500; ++i) { // 500 random candidates per repeat() iteration
+            auto stream_id = make_random_stream_id();
+            auto token = dht::get_token(*schema, stream_id.to_partition_key(*schema));
+
+            // Find the token range into which our stream_id's token landed.
+            auto it = std::lower_bound(tokens.begin(), tokens.end(), token);
+            auto& entry = entries[it != tokens.end() ? std::distance(tokens.begin(), it) : 0]; // a token above every range end wraps around to entry 0
+
+            auto shard_id = dht::shard_of(entry.streams.size(), entry.sharding_ignore_msb, token);
+            assert(shard_id < entry.streams.size());
+
+            if (!entry.streams[shard_id].is_set()) { // keep only the first candidate that lands in a slot
+                --spots_to_fill;
+                entry.streams[shard_id] = stream_id;
+            }
+        }
+
+        if (!spots_to_fill) {
+            return stop_iteration::yes; // every slot has a stream
+        }
+
+        auto now = std::chrono::system_clock::now();
+        auto passed = std::chrono::duration_cast<std::chrono::seconds>(now - start_time);
+        if (passed > quota) {
+            return stop_iteration::yes; // out of time: the fallback below fills the remaining slots
+        }
+
+        return stop_iteration::no;
+    }).get();
+
+    if (spots_to_fill) {
+        // We were not able to generate stream ids for each (token range, shard) pair.
+
+        // For each range that has a stream, for each shard for this range that doesn't have a stream,
+        // use the stream id of the next shard for this range.
+
+        // For each range that doesn't have any stream,
+        // use streams of the first range to the left which does have a stream.
+
+        cdc_log.warn("Generation of CDC streams failed to create streams for some (vnode, shard) pair."
+                     " This can lead to worse performance.");
+
+        stream_id some_stream; // most recently seen set stream; used to fill unset slots
+        size_t idx = 0; // becomes the index of the first range that has at least one set stream
+        for (; idx < entries.size(); ++idx) {
+            for (auto s: entries[idx].streams) {
+                if (s.is_set()) {
+                    some_stream = s;
+                    break;
+                }
+            }
+            if (some_stream.is_set()) {
+                break;
+            }
+        }
+
+        assert(idx != entries.size() && some_stream.is_set()); // at least one stream must have been generated overall
+
+        // Iterate over all ranges in the clockwise direction, starting with the one we found a stream for.
+        for (size_t off = 0; off < entries.size(); ++off) {
+            auto& ss = entries[(idx + off) % entries.size()].streams;
+
+            int last_set_stream_idx = ss.size() - 1; // scan down from the highest shard to find the highest set slot
+            while (last_set_stream_idx > -1 && !ss[last_set_stream_idx].is_set()) {
+                --last_set_stream_idx;
+            }
+
+            if (last_set_stream_idx == -1) { // this range has no stream at all
+                cdc_log.warn(
+                        "CDC wasn't able to generate any stream for vnode ({}, {}]. We'll use another vnode's streams"
+                        " instead. This might lead to inconsistencies.",
+                        tokens[(idx + off + entries.size() - 1) % entries.size()], tokens[(idx + off) % entries.size()]);
+
+                ss[0] = some_stream; // borrow the stream carried over from the previously processed range
+                last_set_stream_idx = 0;
+            }
+
+            some_stream = ss[last_set_stream_idx];
+
+            // Replace 'unset' stream ids with indexes below last_set_stream_idx
+            for (int s_idx = last_set_stream_idx - 1; s_idx > -1; --s_idx) {
+                if (ss[s_idx].is_set()) {
+                    some_stream = ss[s_idx];
+                } else {
+                    ss[s_idx] = some_stream; // copy the nearest set stream from a higher shard index
+                }
+            }
+            // Replace 'unset' stream ids with indexes above last_set_stream_idx
+            for (int s_idx = ss.size() - 1; s_idx > last_set_stream_idx; --s_idx) {
+                if (ss[s_idx].is_set()) {
+                    some_stream = ss[s_idx];
+                } else {
+                    ss[s_idx] = some_stream; // here some_stream is the lowest-index set stream of this range (left by the loop above)
+                }
+            }
+        }
+    }
+
+    return {std::move(entries)};
+}
+
+/* Decide whether this node (`me`) should propose the first CDC generation.
+ *
+ * Returns true iff none of the endpoints returned by `g.get_endpoint_states()`
+ * has a host ID greater than the host ID of `me`.
+ */
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
+    auto my_host_id = g.get_host_id(me);
+    auto& eps = g.get_endpoint_states();
+    // Take each entry as `const auto&`. Spelling the element type by hand is fragile: when the
+    // stored pair has a const key (as in the standard associative containers), a parameter of type
+    // `const std::pair<gms::inet_address, gms::endpoint_state>&` cannot bind directly and every
+    // entry is first copied into a temporary pair.
+    return std::none_of(eps.begin(), eps.end(), [&] (const auto& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
+}
+
+/* Fetch the CDC streams timestamp saved by this node.
+ *
+ * The continuation logs an error and throws std::runtime_error when no timestamp was saved.
+ */
+future<db_clock::time_point> get_local_streams_timestamp() {
+    return db::system_keyspace::get_saved_cdc_streams_timestamp().then(
+            [] (std::optional<db_clock::time_point> saved_ts) {
+        if (saved_ts) {
+            return *saved_ts;
+        }
+
+        // Nothing was saved: log the problem, then report it to the caller.
+        auto msg = format("get_local_streams_timestamp: tried to retrieve streams timestamp after bootstrapping, but it's not present");
+        cdc_log.error("{}", msg);
+        throw std::runtime_error(msg);
+    });
+}
+
+/* Generate a topology description for a new CDC generation and insert it through `sys_dist_ks`.
+ *
+ * The generation's timestamp is the current db_clock time shifted into the future by
+ * `2 * ring_delay + generation_leeway`; with `for_testing` set, no shift is applied.
+ * Returns that timestamp.
+ *
+ * Run inside seastar::async context.
+ */
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing) {
+    assert(!bootstrap_tokens.empty());
+
+    auto topology = generate_topology_description(cfg, bootstrap_tokens, tm, g);
+
+    // How far into the future the new generation starts operating.
+    std::chrono::milliseconds start_delay(0);
+    if (!for_testing) {
+        start_delay = 2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway);
+    }
+
+    // Begin the race.
+    auto generation_ts = db_clock::now() + start_delay;
+    sys_dist_ks.insert_cdc_topology_description(
+            generation_ts, std::move(topology), { tm.count_normal_token_owners() }).get();
+
+    return generation_ts;
+}
+
+/* Read the CDC streams timestamp from `endpoint`'s CDC_STREAMS_TIMESTAMP application state.
+ *
+ * Returns an empty optional when the stored value is empty; otherwise the value is parsed
+ * with std::stoll and interpreted as a count of db_clock ticks.
+ */
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto raw_ts = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
+    cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, raw_ts);
+
+    if (!raw_ts.empty()) {
+        db_clock::duration ticks(std::stoll(raw_ts));
+        return db_clock::time_point(ticks);
+    }
+
+    return {};
+}
+
+/* Insert the description of the generation identified by `streams_ts` into the CDC description table,
+ * unless it is already there.
+ *
+ * Throws std::runtime_error when the generation's topology description cannot be read.
+ *
+ * Run inside seastar::async context.
+ */
+static void do_update_streams_description(
+        db_clock::time_point streams_ts,
+        db::system_distributed_keyspace& sys_dist_ks,
+        db::system_distributed_keyspace::context ctx) {
+    const bool already_inserted = sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0();
+    if (already_inserted) {
+        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
+        return;
+    }
+
+    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
+
+    auto topology = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    if (!topology) {
+        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+    }
+
+    // Gather the streams of all token ranges; the set removes duplicates and orders them.
+    std::set<cdc::stream_id> unique_streams;
+    for (const auto& range_desc: topology->entries()) {
+        for (const auto& sid: range_desc.streams) {
+            unique_streams.insert(sid);
+        }
+    }
+
+    std::vector<cdc::stream_id> streams_vec(unique_streams.begin(), unique_streams.end());
+
+    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
+}
+
+void update_streams_description( // Makes one attempt in the caller's context; if it throws, keeps retrying in the background.
+        db_clock::time_point streams_ts, // timestamp identifying the generation to describe
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners, // invoked anew on every attempt to build the context argument
+        abort_source& abort_src) { // passed to the abortable sleep between background retries
+    try {
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+    } catch(...) {
+        cdc_log.warn(
+            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
+            streams_ts, std::current_exception());
+
+        // It is safe to discard this future: the lambda holds its own copy of `sys_dist_ks`, which keeps the system distributed keyspace alive.
+        (void)seastar::async([
+            streams_ts, sys_dist_ks, get_num_token_owners = std::move(get_num_token_owners), &abort_src // NOTE(review): `abort_src` is captured by reference and must outlive this task - confirm with callers.
+        ] {
+            while (true) {
+                sleep_abortable(std::chrono::seconds(60), abort_src).get(); // Wait 60 seconds before each retry. NOTE(review): presumably throws once `abort_src` fires, which would end the loop - confirm.
+                try {
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    return; // Success: stop retrying.
+                } catch (...) {
+                    cdc_log.warn(
+                        "Could not update CDC description table with generation {}: {}. Will try again.",
+                        streams_ts, std::current_exception());
+                }
+            }
+        });
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* This module contains classes and functions used to manage CDC generations:
+ * sets of CDC stream identifiers used by the cluster to choose partition keys for CDC log writes.
+ * Each CDC generation begins operating at a specific time point, called the generation's timestamp
+ * (`cdc_streams_timestamp` or `streams_timestamp` in the code).
+ * The generation is used by all nodes in the cluster to pick CDC streams until superseded by a new generation.
+ *
+ * Functions from this module are used by the node joining procedure to introduce new CDC generations to the cluster
+ * (which is necessary due to new tokens being inserted into the token ring), or during rolling upgrade
+ * if CDC is enabled for the first time.
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_set>
+#include <seastar/util/noncopyable_function.hh>
+
+#include "database_fwd.hh"
+#include "db_clock.hh"
+#include "dht/token.hh"
+
+namespace seastar {
+    class abort_source;
+} // namespace seastar
+
+namespace db {
+    class config;
+    class system_distributed_keyspace;
+} // namespace db
+
+namespace gms {
+    class inet_address;
+    class gossiper;
+} // namespace gms
+
+namespace locator {
+    class token_metadata;
+} // namespace locator
+
+namespace cdc {
+
+class stream_id final { // Identifier of a single CDC stream, held as a `bytes` value.
+    bytes _value;
+public:
+    stream_id() = default; // NOTE(review): presumably produces an id for which is_set() is false - confirm in generation.cc.
+    stream_id(int64_t, int64_t); // NOTE(review): presumably the two halves later returned by first() and second() - confirm.
+    stream_id(bytes);
+    bool is_set() const;
+    bool operator==(const stream_id&) const;
+    bool operator<(const stream_id&) const; // Ordering; allows stream ids to be stored in ordered containers such as std::set.
+
+    int64_t first() const;
+    int64_t second() const;
+
+    const bytes& to_bytes() const;
+
+    partition_key to_partition_key(const schema& log_schema) const; // NOTE(review): presumably the partition key of this stream in the log table described by `log_schema` - confirm.
+};
+
+/* Describes a mapping of tokens to CDC streams in a token range.
+ *
+ * The range ends with `token_range_end`. A vector of `token_range_description`s defines the ranges entirely
+ * (the end of the `i`th range is the beginning of the `(i+1) % size()`th range). Ranges are left-open, right-closed.
+ *
+ * Tokens in the range ending with `token_range_end` are mapped to streams in the `streams` vector as follows:
+ * token `T` is mapped to `streams[j]` if and only if the used partitioner maps `T` to the `j`th shard,
+ * assuming that the partitioner is configured for `streams.size()` shards and (partitioner's) `sharding_ignore_msb`
+ * equals the given `sharding_ignore_msb`.
+ */
+struct token_range_description {
+    dht::token token_range_end; // inclusive right end of the described range
+    std::vector<stream_id> streams; // indexed by shard number; `streams.size()` is the shard count
+    uint8_t sharding_ignore_msb;
+
+    bool operator==(const token_range_description&) const;
+};
+
+
+/* Describes a mapping of tokens to CDC streams in a whole token ring.
+ *
+ * Division of the ring into token ranges is defined in terms of `token_range_end`s
+ * in the `_entries` vector. See the comment above `token_range_description` for explanation.
+ *
+ * `get_stream` in cdc/metadata.cc binary-searches `entries()` by `token_range_end`,
+ * so the entries are expected to be ordered by that field.
+ */
+class topology_description {
+    std::vector<token_range_description> _entries;
+public:
+    topology_description(std::vector<token_range_description> entries);
+    bool operator==(const topology_description&) const;
+
+    const std::vector<token_range_description>& entries() const; // read-only access to the stored range descriptions
+};
+
+/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
+ * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
+ * that there's a bug, or the user messed with our local tables).
+ *
+ * It checks whether we should be the node to propose the first generation of CDC streams.
+ * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
+ * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
+ */
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
+
+/*
+ * Read this node's streams generation timestamp stored in the LOCAL table.
+ * Assumes that the node has successfully bootstrapped, and we're not upgrading from a non-CDC version,
+ * so the timestamp is present.
+ */
+future<db_clock::time_point> get_local_streams_timestamp();
+
+/* Generate a new set of CDC streams and insert it into the distributed cdc_topology_description table.
+ * Returns the timestamp of this new generation.
+ *
+ * Should be called when starting the node for the first time (i.e., joining the ring).
+ *
+ * Assumes that the system_distributed keyspace is initialized.
+ *
+ * The caller of this function is expected to insert this timestamp into the gossiper as fast as possible,
+ * so that other nodes learn about the generation before their clocks cross the timestamp
+ * (not guaranteed in the current implementation, but expected to be the common case;
+ *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
+ */
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing);
+
+/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
+ * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
+ * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
+ * which means it will gossip the generation's timestamp.
+ */
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
+
+/* Inform CDC users about a generation of streams (identified by the given timestamp)
+ * by inserting it into the cdc_description table.
+ *
+ * Assumes that the cdc_topology_description table contains this generation.
+ *
+ * Returning from this function does not mean that the table update was successful: the function
+ * might run an asynchronous task in the background.
+ *
+ * Run inside seastar::async context.
+ */
+void update_streams_description(
+        db_clock::time_point,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
+} // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This module manages CDC log tables. It contains facilities used to:
+ * - perform schema changes to CDC log tables correspondingly when base tables are changed,
+ * - perform writes to CDC log tables correspondingly when writes to base tables are made.
+ */
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+
+#include "exceptions/exceptions.hh"
+#include "timestamp.hh"
+#include "tracing/trace_state.hh"
+#include "cdc_options.hh"
+#include "utils/UUID.hh"
+
+class schema;
+using schema_ptr = seastar::lw_shared_ptr<const schema>;
+
+namespace locator {
+
+class token_metadata;
+
+} // namespace locator
+
+namespace service {
+
+class migration_notifier;
+class storage_proxy;
+class query_state;
+
+} // namespace service
+
+class mutation;
+class partition_key;
+
+namespace cdc {
+
+struct operation_result_tracker;
+class db_context;
+class metadata;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// CDC service will listen for schema changes and, iff CDC is enabled/changed,
+/// create/modify/delete the corresponding log tables etc. as part of the schema change.
+///
+class cdc_service {
+    class impl; // only declared here; the definition is not part of this header
+    std::unique_ptr<impl> _impl;
+public:
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();
+
+    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
+    // appropriate augments to set the log entries.
+    // Iff post-image is enabled for any of these, a non-empty callback is also
+    // returned to be invoked post the mutation query. NOTE(review): the declared result carries an `operation_result_tracker` pointer rather than a callback, so this sentence may be outdated - confirm.
+    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations,
+        tracing::trace_state_ptr tr_state
+        );
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const; // NOTE(review): presumably tells whether augment_mutation_call has anything to do for these mutations - confirm.
+};
+
+struct db_context final { // Bundle of references accepted by the `cdc_service(db_context)` constructor.
+    service::storage_proxy& _proxy;
+    service::migration_notifier& _migration_notifier;
+    locator::token_metadata& _token_metadata;
+    cdc::metadata& _cdc_metadata;
+
+    class builder final { // Assembles a db_context step by step; only the storage proxy is required at construction.
+        service::storage_proxy& _proxy;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier; // optional: may be left unset
+        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata; // optional: may be left unset
+        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata; // optional: may be left unset
+    public:
+        builder(service::storage_proxy& proxy);
+
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier); // the with_* methods return a builder reference, so calls can be chained
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_cdc_metadata(cdc::metadata&);
+
+        db_context build(); // NOTE(review): how members that were never set get their values is decided in the definition, which is not visible here.
+    };
+};
+
+// Kind of operation recorded in a CDC log table row.
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
+    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
+    post_image = 9,
+};
+
+bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
+seastar::sstring log_name(const seastar::sstring& table_name);
+seastar::sstring log_data_column_name(std::string_view column_name);
+seastar::sstring log_meta_column_name(std::string_view column_name);
+bytes log_data_column_name_bytes(const bytes& column_name);
+bytes log_meta_column_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_name(std::string_view column_name);
+bytes log_data_column_deleted_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
+bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);
+
+utils::UUID generate_timeuuid(api::timestamp_type t);
+
+} // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dht/token-sharding.hh"
+#include "utils/exceptions.hh"
+#include "exceptions/exceptions.hh"
+
+#include "cdc/generation.hh"
+#include "cdc/metadata.hh"
+
+extern logging::logger cdc_log;
+
+namespace cdc {
+    extern const api::timestamp_clock::duration generation_leeway;
+} // namespace cdc
+
+// Converts a db_clock time point to a count of api::timestamp_clock ticks.
+static api::timestamp_type to_ts(db_clock::time_point tp) {
+    // This assumes that timestamp_clock and db_clock have the same epochs.
+    const auto since_epoch = std::chrono::duration_cast<api::timestamp_clock::duration>(tp.time_since_epoch());
+    return since_epoch.count();
+}
+
+// Picks, within a single token range description, the stream of the shard that owns `tok`.
+static cdc::stream_id get_stream(
+        const cdc::token_range_description& entry,
+        dht::token tok) {
+    const auto& streams = entry.streams;
+
+    // The ith stream is the stream for the ith shard.
+    const auto shard_id = dht::shard_of(streams.size(), entry.sharding_ignore_msb, tok);
+    if (!(shard_id < streams.size())) {
+        on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
+    }
+
+    return streams[shard_id];
+}
+
+// Finds the token range description covering `tok` and picks the stream for `tok` from it.
+static cdc::stream_id get_stream(
+        const std::vector<cdc::token_range_description>& entries,
+        dht::token tok) {
+    if (entries.empty()) {
+        on_internal_error(cdc_log, "get_stream: entries empty");
+    }
+
+    const auto ends_before = [] (const cdc::token_range_description& e, dht::token t) {
+        return e.token_range_end < t;
+    };
+    // First range whose end is not less than `tok`.
+    const auto match = std::lower_bound(entries.begin(), entries.end(), tok, ends_before);
+
+    // A token greater than every range end wraps around to the first entry.
+    const cdc::token_range_description& entry = (match != entries.end()) ? *match : *entries.begin();
+    return get_stream(entry, tok);
+}
+
+// Returns the entry of the latest generation whose timestamp is not greater than `ts`,
+// or `_gens.end()` when there is no such generation.
+cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
+    // First generation with a timestamp strictly greater than `ts`.
+    const auto first_later = _gens.upper_bound(ts);
+
+    // If that is the very first entry, all known generations have higher timestamps than `ts`.
+    return first_later == _gens.begin() ? _gens.end() : std::prev(first_later);
+}
+
+// Returns the stream to which a log write with timestamp `ts` for token `tok` should go.
+//
+// Throws exceptions::invalid_request_exception when `ts` is more than `generation_leeway` ahead of
+// the current time or precedes the generation currently in use, and std::runtime_error when no
+// generation is in use yet or the chosen generation's description has not been retrieved.
+// Generations older than the one currently in use are erased as a side effect.
+cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
+    const auto now = api::new_timestamp();
+    const auto latest_accepted_ts = now + generation_leeway.count();
+    if (ts > latest_accepted_ts) {
+        // Note that we might still send a write to a wrong generation, if we learn about the current
+        // generation too late (we might think that an earlier generation is the current one).
+        // Nothing protects us from that until we start using transactions for generation switching.
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
+                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
+                " know what streams will be used at that time.\n"
+                "We *do* allow sending writes into the near future, but our ability to do that is limited."
+                " If you really must use your own timestamps, then make sure your clocks are well-synchronized"
+                "  with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+    }
+
+    auto gen_it = gen_used_at(now);
+    if (gen_it == _gens.end()) {
+        throw std::runtime_error(format(
+                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+    }
+
+    // Garbage-collect generations that will no longer be used.
+    gen_it = _gens.erase(_gens.begin(), gen_it);
+
+    if (ts < gen_it->first) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream from an earlier generation than the currently used one."
+                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                " consistency properties (write timestamp: {}, current generation started at: {})",
+                format_timestamp(ts), format_timestamp(gen_it->first)));
+    }
+
+    // With `generation_leeway` we allow sending writes to the near future. It might happen
+    // that `ts` doesn't belong to the current generation ("current" according to our clock),
+    // but to the next generation. Adjust for this case:
+    for (auto next = std::next(gen_it); next != _gens.end() && next->first <= ts; ++next) {
+        gen_it = next;
+    }
+    // Note: if there is a next generation that `ts` belongs to, but we don't know about it,
+    // then too bad. This is no different from the situation in which we didn't manage to learn
+    // about the current generation in time. We won't be able to prevent it until we introduce transactions.
+
+    if (!gen_it->second.has_value()) {
+        throw std::runtime_error(format(
+                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
+                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
+                " this generation's data are alive and reachable from this node.", format_timestamp(gen_it->first), format_timestamp(ts)));
+    }
+
+    auto picked = ::get_stream(gen_it->second->entries(), tok);
+    _last_stream_timestamp = ts;
+    return picked;
+}
+
+// Tells whether the generation with timestamp `tp` has already been inserted,
+// or has been superseded by a later generation whose timestamp has already passed.
+bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
+    const auto ts = to_ts(tp);
+    auto candidate = _gens.lower_bound(ts);
+
+    if (candidate == _gens.end()) {
+        // No known generations with timestamp >= ts.
+        return false;
+    }
+
+    const bool same_timestamp = candidate->first == ts;
+    if (same_timestamp && candidate->second) {
+        // We already inserted this particular generation.
+        return true;
+    }
+    if (same_timestamp) {
+        // There is an entry for this generation without a description: look at the next one.
+        ++candidate;
+    }
+
+    // Check if some new generation has already superseded this one.
+    if (candidate == _gens.end()) {
+        return false;
+    }
+    return candidate->first <= api::new_timestamp();
+}
+
+// Stores `gen` as the generation with timestamp `tp`, unless that generation is already known or obsolete.
+// Generations older than the one in use at the current time are erased first.
+// Returns true iff the generation was stored.
+bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    const auto current_gen = gen_used_at(api::new_timestamp());
+    if (current_gen != _gens.end()) {
+        // Garbage-collect generations that will no longer be used.
+        _gens.erase(_gens.begin(), current_gen);
+    }
+
+    _gens.insert_or_assign(to_ts(tp), std::move(gen));
+    return true;
+}
+
+// Records that a generation with timestamp `tp` exists without storing its description yet.
+// Logs an error when this newly learned generation is the one that the last `get_stream` call
+// should have used. Returns true iff a new entry was created.
+bool cdc::metadata::prepare(db_clock::time_point tp) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    const auto ts = to_ts(tp);
+    const bool emplaced = _gens.emplace(ts, std::nullopt).second;
+
+    if (emplaced && _last_stream_timestamp != api::missing_timestamp) {
+        // Which generation would the last served write timestamp map to now?
+        const auto last_correct_gen = gen_used_at(_last_stream_timestamp);
+        if (last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
+            cdc_log.error(
+                "just learned about a CDC generation newer than the one used the last time"
+                " streams were retrieved. This generation, or some newer one, should have"
+                " been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
+                " The new generation probably arrived too late due to a network partition"
+                " and we've made a write using the wrong set streams.",
+                format_timestamp(ts), format_timestamp(_last_stream_timestamp));
+        }
+    }
+
+    return emplaced;
+}
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
+namespace dht {
+    class token;
+}
+
+namespace cdc {
+
+class stream_id;
+class topology_description;
+
+/* Represents the node's knowledge about CDC generations used in the cluster.
+ * Used during writes to pick streams to which CDC log writes should be sent
+ * (i.e., to pick partition keys for these writes).
+ */
+class metadata final {
+    // Note: we use db_clock (1ms resolution) for generation timestamps
+    // (because we need to insert them into tables using columns of timestamp types,
+    //  and the native type of our columns' timestamp_type is db_clock::time_point).
+    // On the other hand, timestamp_clock (1us resolution) is used for mutation timestamps,
+    // and api::timestamp_type represents the number of ticks of a timestamp_clock::time_point since epoch.
+
+    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
+    container_t _gens; // keyed by generation timestamp; a std::nullopt value marks a generation that was prepared (see `prepare`) but whose description is not stored yet
+
+    /* The timestamp used in the last successful `get_stream` call. */
+    api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
+
+    container_t::const_iterator gen_used_at(api::timestamp_type ts) const; // latest generation with timestamp <= `ts`, or `_gens.end()` if there is none
+public:
+    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    bool known_or_obsolete(db_clock::time_point) const;
+
+    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
+     * according to the generation used at time `ts` (i.e., the latest generation whose timestamp is less or equal to `ts`).
+     *
+     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
+     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
+     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
+     * by the `cdc::generation_leeway` constant.
+     */
+    stream_id get_stream(api::timestamp_type ts, dht::token tok);
+
+    /* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
+     * if the generation is not already known or older than the currently known ones.
+     *
+     * Returns true if the generation was inserted,
+     * meaning that `get_stream` might return a stream from this generation (at some time points).
+     */
+    bool insert(db_clock::time_point ts, topology_description&& gen);
+
+    /* Prepare for inserting a new generation whose timestamp is `ts`.
+     * This method is not required to be called before `insert`, but it's here
+     * to increase safety of `get_stream` calls in some situations. Use it if you:
+     * 1. know that there is a new generation, but
+     * 2. you didn't yet retrieve the generation's topology_description.
+     *
+     * After preparing a generation, if `get_stream` is supposed to return a stream from this generation
+     * but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
+     *
+     * Returns true iff this generation is not obsolete and wasn't previously prepared nor inserted.
+     */
+    bool prepare(db_clock::time_point ts);
+};
+
+} // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mutation.hh"
+#include "schema.hh"
+
+#include "split.hh"
+#include "log.hh"
+
+struct atomic_column_update { // A write to a single atomic column, extracted from a base mutation.
+    column_id id; // id of the written column; users resolve it with schema::column_at
+    atomic_cell cell; // the cell written to that column
+};
+
+// see the comment inside `clustered_row_insert` for motivation for separating
+// nonatomic deletions from nonatomic updates
+struct nonatomic_column_deletion { // Deletion of the whole value of one non-atomic column.
+    column_id id; // id of the affected column
+    tombstone t; // the tombstone taken from the column's collection mutation
+};
+
+struct nonatomic_column_update { // Writes to individual cells inside one non-atomic column.
+    column_id id; // id of the affected column
+    utils::chunked_vector<std::pair<bytes, atomic_cell>> cells; // (key bytes, cell) pairs taken from the materialized collection mutation
+};
+
+struct static_row_update { // Static-row writes of one mutation that share a timestamp and a ttl.
+    gc_clock::duration ttl; // shared ttl; a zero duration stands for "no ttl"
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_insert { // Writes to one clustering row that share the timestamp and ttl of its live row marker.
+    gc_clock::duration ttl; // ttl shared with the marker; a zero duration stands for "no ttl"
+    clustering_key key;
+    row_marker marker;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    // INSERTs can't express updates of individual cells inside a non-atomic column
+    // (without deleting the entire field first), so there is no `nonatomic_updates` field here.
+    // Overwriting a nonatomic column inside an INSERT will be split into two changes:
+    // one with a nonatomic deletion, and one with a nonatomic update.
+};
+
+struct clustered_row_update { // Writes to one clustering row that share a timestamp and a ttl and are not an INSERT.
+    gc_clock::duration ttl; // shared ttl; a zero duration stands for "no ttl"
+    clustering_key key;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_deletion { // Deletion of a single clustering row.
+    clustering_key key; // key of the deleted row
+    tombstone t; // the regular part of the row's tombstone
+};
+
+struct clustered_range_deletion { // Deletion of a range of clustering rows.
+    range_tombstone rt; // copied from the partition's row_tombstones()
+};
+
+struct partition_deletion { // Deletion of the whole partition.
+    tombstone t; // the partition tombstone
+};
+
+struct batch { // All changes of one base mutation that carry the same write timestamp, grouped by kind.
+    std::vector<static_row_update> static_updates;
+    std::vector<clustered_row_insert> clustered_inserts;
+    std::vector<clustered_row_update> clustered_updates;
+    std::vector<clustered_row_deletion> clustered_row_deletions;
+    std::vector<clustered_range_deletion> clustered_range_deletions;
+    std::optional<partition_deletion> partition_deletions; // empty unless the partition tombstone has this timestamp
+};
+
+using set_of_changes = std::map<api::timestamp_type, batch>; // write timestamp -> changes made at that timestamp
+
+struct row_update { // Accumulator used by extract_row_updates: the writes of one row sharing a (timestamp, ttl) pair.
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+static // Groups the cells of `r` by their (write timestamp, ttl) pair; a zero ttl stands for "no ttl".
+std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
+extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
+    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> result;
+    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        auto& cdef = schema.column_at(ckind, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            auto timestamp_and_ttl = std::pair(
+                    view.timestamp(),
+                    view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0)
+                );
+            result[timestamp_and_ttl].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
+            return;
+        }
+        // Non-atomic column: each cell of the collection is grouped by its own timestamp and ttl.
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                auto timestamp_and_ttl = std::pair(
+                        v.timestamp(),
+                        v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0)
+                    );
+                auto& updates = result[timestamp_and_ttl].nonatomic_updates;
+                if (updates.empty() || updates.back().id != id) { // first cell of this column in this group
+                    updates.push_back({id, {}});
+                }
+                updates.back().cells.push_back({std::move(k), std::move(v)});
+            }
+            // The collection tombstone, if set, is recorded as a deletion of the whole column with no ttl.
+            if (desc.tomb) {
+                auto timestamp_and_ttl = std::pair(desc.tomb.timestamp, gc_clock::duration(0));
+                result[timestamp_and_ttl].nonatomic_deletions.push_back({id, desc.tomb});
+            }
+        });
+    });
+    return result;
+};
+
+set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) { // Sorts every change in `base_mutation` into per-timestamp batches.
+    set_of_changes res;
+    auto& p = base_mutation.partition();
+    // Static row: one static_row_update per distinct (timestamp, ttl) pair.
+    auto sr_updates = extract_row_updates(p.static_row().get(), column_kind::static_column, base_schema);
+    for (auto& [k, up]: sr_updates) {
+        auto [timestamp, ttl] = k;
+        res[timestamp].static_updates.push_back({
+                ttl,
+                std::move(up.atomic_entries),
+                std::move(up.nonatomic_deletions),
+                std::move(up.nonatomic_updates)
+            });
+    }
+
+    for (const rows_entry& cr : p.clustered_rows()) {
+        auto cr_updates = extract_row_updates(cr.row().cells(), column_kind::regular_column, base_schema);
+
+        const auto& marker = cr.row().marker();
+        auto marker_timestamp = marker.timestamp();
+        auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
+        if (marker.is_live()) {
+            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
+            (void)cr_updates[std::pair(marker_timestamp, marker_ttl)];
+        }
+        // A group counts as an INSERT only if the marker is live and the group has exactly the marker's timestamp and ttl.
+        auto is_insert = [&] (api::timestamp_type timestamp, gc_clock::duration ttl) {
+            if (!marker.is_live()) {
+                return false;
+            }
+
+            return timestamp == marker_timestamp && ttl == marker_ttl;
+        };
+
+        for (auto& [k, up]: cr_updates) {
+            auto [timestamp, ttl] = k;
+
+            if (is_insert(timestamp, ttl)) {
+                res[timestamp].clustered_inserts.push_back({
+                        ttl,
+                        cr.key(),
+                        marker,
+                        std::move(up.atomic_entries),
+                        std::move(up.nonatomic_deletions)
+                    });
+                if (!up.nonatomic_updates.empty()) {
+                    // nonatomic updates cannot be expressed with an INSERT.
+                    res[timestamp].clustered_updates.push_back({
+                            ttl,
+                            cr.key(),
+                            {},
+                            {},
+                            std::move(up.nonatomic_updates)
+                        });
+                }
+            } else {
+                res[timestamp].clustered_updates.push_back({
+                        ttl,
+                        cr.key(),
+                        std::move(up.atomic_entries),
+                        std::move(up.nonatomic_deletions),
+                        std::move(up.nonatomic_updates)
+                    });
+            }
+        }
+        // Only the regular part of the row tombstone is turned into a row deletion.
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            res[row_tomb.timestamp].clustered_row_deletions.push_back({cr.key(), row_tomb});
+        }
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb.timestamp != api::missing_timestamp) {
+            res[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
+        }
+    }
+
+    auto partition_tomb_timestamp = p.partition_tombstone().timestamp;
+    if (partition_tomb_timestamp != api::missing_timestamp) {
+        res[partition_tomb_timestamp].partition_deletions = {p.partition_tombstone()};
+    }
+
+    return res;
+}
+
+namespace cdc {
+
+bool should_split(const mutation& base_mutation, const schema& base_schema) {
+    auto& p = base_mutation.partition();
+    // Returns true when the mutation mixes timestamps, ttls or kinds of change, or carries no timestamp at all.
+    api::timestamp_type found_ts = api::missing_timestamp;
+    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+    // Remembers the first (ts, ttl) it is given; returns true as soon as a later call disagrees with it.
+    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
+        if (found_ts != api::missing_timestamp && found_ts != ts) {
+            return true;
+        }
+        found_ts = ts;
+
+        if (found_ttl && *found_ttl != ttl) {
+            return true;
+        }
+        found_ttl = ttl;
+
+        return false;
+    };
+
+    bool had_static_row = false;
+
+    bool should_split = false;
+    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        had_static_row = true;
+
+        auto& cdef = base_schema.column_at(column_kind::static_column, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                should_split = true;
+            }
+            return;
+        }
+
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+
+            if (desc.tomb) {
+                if (check_or_set(desc.tomb.timestamp, gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+        });
+    });
+
+    if (should_split) {
+        return true;
+    }
+
+    bool had_clustered_row = false;
+    // A static row together with any clustered row always requires a split.
+    if (!p.clustered_rows().empty() && had_static_row) {
+        return true;
+    }
+    for (const rows_entry& cr : p.clustered_rows()) {
+        had_clustered_row = true;
+
+        const auto& marker = cr.row().marker();
+        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
+            return true;
+        }
+
+        bool is_insert = marker.is_live();
+
+        bool had_cells = false;
+        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+            had_cells = true;
+
+            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
+            if (cdef.is_atomic()) {
+                auto view = cell.as_atomic_cell(cdef);
+                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                }
+                return;
+            }
+
+            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+                for (auto& [k, v]: mview.cells) {
+                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+
+                    if (is_insert) {
+                        // nonatomic updates cannot be expressed with an INSERT.
+                        should_split = true;
+                        return;
+                    }
+                }
+
+                if (mview.tomb) {
+                    if (check_or_set(mview.tomb.timestamp, gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+                }
+            });
+        });
+
+        if (should_split) {
+            return true;
+        }
+
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            if (had_cells) {
+                return true;
+            }
+            // This row has no cells, but `found_ttl` may still have been set by this row's live marker or by
+            // an earlier clustered row. A deletion mixed with such a write must be split. This used to be an
+            // assert(!found_ttl), which aborted on e.g. a batch with an INSERT of one row and a DELETE of another.
+            if (found_ttl || (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp)) {
+                return true;
+            }
+
+            found_ts = row_tomb.timestamp;
+        }
+    }
+
+    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb) {
+            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
+                return true;
+            }
+
+            found_ts = rt.tomb.timestamp;
+        }
+    }
+
+    if (p.partition_tombstone().timestamp != api::missing_timestamp
+            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    // A mutation with no timestamp will be split into 0 mutations
+    return found_ts == api::missing_timestamp;
+}
+
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) { // Calls `f` once per change extracted from `base_mutation`.
+    auto changes = extract_changes(base_mutation, *base_schema);
+    auto pk = base_mutation.key();
+    // `changes` is a std::map keyed by timestamp, so batches are visited in ascending timestamp order.
+    for (auto& [change_ts, btch] : changes) {
+        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
+        int batch_no = 0;
+        // `batch_no` starts at 0 for each timestamp and is handed to `f` by reference; this function never changes it itself.
+        for (auto& sr_update : btch.static_updates) {
+            mutation m(base_schema, pk);
+            for (auto& atomic_update : sr_update.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::static_column, atomic_update.id);
+                m.set_static_cell(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_delete : sr_update.nonatomic_deletions) {
+                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_delete.id);
+                m.set_static_cell(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
+            }
+            for (auto& nonatomic_update : sr_update.nonatomic_updates) {
+                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_update.id);
+                m.set_static_cell(cdef, collection_mutation_description{{}, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
+            }
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_insert : btch.clustered_inserts) {
+            mutation m(base_schema, pk);
+
+            auto& row = m.partition().clustered_row(*base_schema, cr_insert.key);
+            for (auto& atomic_update : cr_insert.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
+                row.cells().apply(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_delete : cr_insert.nonatomic_deletions) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_delete.id);
+                row.cells().apply(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
+            }
+            row.apply(cr_insert.marker);
+
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_update : btch.clustered_updates) {
+            mutation m(base_schema, pk);
+            // Unlike the insert case above, no row marker is applied here.
+            auto& row = m.partition().clustered_row(*base_schema, cr_update.key).cells();
+            for (auto& atomic_update : cr_update.atomic_entries) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, atomic_update.id);
+                row.apply(cdef, std::move(atomic_update.cell));
+            }
+            for (auto& nonatomic_delete : cr_update.nonatomic_deletions) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_delete.id);
+                row.apply(cdef, collection_mutation_description{nonatomic_delete.t, {}}.serialize(*cdef.type));
+            }
+            for (auto& nonatomic_update : cr_update.nonatomic_updates) {
+                auto& cdef = base_schema->column_at(column_kind::regular_column, nonatomic_update.id);
+                row.apply(cdef, collection_mutation_description{{}, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
+            }
+
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& cr_delete : btch.clustered_row_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(*base_schema, cr_delete.key, cr_delete.t);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        for (auto& crange_delete : btch.clustered_range_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply_delete(*base_schema, crange_delete.rt);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+
+        if (btch.partition_deletions) {
+            mutation m(base_schema, pk);
+            m.partition().apply(btch.partition_deletions->t);
+            f(std::move(m), change_ts, tuuid, batch_no);
+        }
+    }
+}
+
+} // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include "schema_fwd.hh"
+#include "timestamp.hh"
+#include "bytes.hh"
+#include <seastar/util/noncopyable_function.hh>
+
+class mutation;
+
+namespace cdc {
+
+bool should_split(const mutation& base_mutation, const schema& base_schema); // true if the mutation mixes timestamps, ttls or kinds of change, or has no timestamp
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema, // calls the function once per extracted change, in ascending timestamp order
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>); // args: change mutation, its timestamp, serialized timeuuid, per-timestamp counter
+
+}
--- a/cdc/stats.hh
+++ b/cdc/stats.hh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <seastar/core/metrics_registration.hh>
+#include "enum_set.hh"
+#include "utils/histogram.hh"
+#include "utils/estimated_histogram.hh"
+
+namespace cdc {
+
+class stats final { // CDC counters plus the metric_groups object `_metrics`.
+    seastar::metrics::metric_groups _metrics;
+
+public:
+    enum class part_type {
+        STATIC_ROW,
+        CLUSTERING_ROW,
+        MAP,
+        SET,
+        LIST,
+        UDT,
+        RANGE_TOMBSTONE,
+        PARTITION_DELETE,
+        ROW_DELETE,
+        // Not a real part: its value sizes parts_touched_stats::count, so it must stay last.
+        MAX
+    };
+    // Set of part_type values; MAX is not listed and so cannot be a member.
+    using part_type_set = enum_set<super_enum<part_type,
+        part_type::STATIC_ROW,
+        part_type::CLUSTERING_ROW,
+        part_type::MAP,
+        part_type::SET,
+        part_type::LIST,
+        part_type::UDT,
+        part_type::RANGE_TOMBSTONE,
+        part_type::PARTITION_DELETE,
+        part_type::ROW_DELETE
+    >>;
+
+    struct parts_touched_stats final {
+        std::array<uint64_t, (size_t)part_type::MAX> count = {}; // one counter per part_type, indexed by its numeric value
+        // Increments the counter of every part present in `parts_set`.
+        inline void apply(part_type_set parts_set) {
+            for (part_type idx : parts_set) {
+                count[(size_t)idx]++;
+            }
+        }
+
+        void register_metrics(seastar::metrics::metric_groups& metrics, std::string_view suffix);
+    };
+
+    struct counters final {
+        uint64_t unsplit_count = 0;
+        uint64_t split_count = 0;
+        uint64_t preimage_selects = 0;
+        uint64_t with_preimage_count = 0;
+        uint64_t with_postimage_count = 0;
+
+        parts_touched_stats touches;
+    };
+    // NOTE(review): presumably all operations vs. failed operations; the code that updates them is not in this header.
+    counters counters_total;
+    counters counters_failed;
+
+    stats();
+};
+
+// Contains the details on what happened during a CDC operation.
+struct operation_details final {
+    stats::part_type_set touched_parts; // empty by default
+    bool was_split = false; // NOTE(review): presumably mirrors the result of cdc::should_split; confirm at the call site
+    bool had_preimage = false;
+    bool had_postimage = false;
+};
+
+// This object tracks the lifetime of write handlers related to one CDC operation. After all
+// write handlers for the operation finish, CDC metrics are updated.
+class operation_result_tracker final {
+    stats& _stats; // referenced, not owned: the stats object must outlive this tracker
+    operation_details _details;
+    bool _failed; // set by on_mutation_failed(); never reset
+
+public:
+    operation_result_tracker(stats& stats, operation_details details)
+        : _stats(stats)
+        , _details(details)
+        , _failed(false)
+    {}
+    ~operation_result_tracker(); // defined out of line
+
+    void on_mutation_failed() { // marks the whole operation as failed
+        _failed = true;
+    }
+};
+
+}
--- a/checked-file-impl.hh
+++ b/checked-file-impl.hh
@@ -22,7 +22,10 @@
 #pragma once

 #include "seastar/core/file.hh"
-#include "disk-error-handler.hh"
+#include "seastar/core/reactor.hh"
+#include "utils/disk-error-handler.hh"
+
+#include "seastarx.hh"

 class checked_file_impl : public file_impl {
 public:
--- a/clocks-impl.cc
+++ b/clocks-impl.cc
@@ -19,6 +19,23 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <seastar/core/print.hh>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
 #include "clocks-impl.hh"

 std::atomic<int64_t> clocks_offset;
+
+std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) { // prints `tp` as "%Y/%m/%d %T" using gmtime_r
+    ::tm utc_fields;
+    const auto unix_secs = db_clock::to_time_t(tp);
+    return os << std::put_time(::gmtime_r(&unix_secs, &utc_fields), "%Y/%m/%d %T");
+}
+
+std::string format_timestamp(api::timestamp_type ts) { // renders `ts`, truncated to whole seconds, as "%Y/%m/%d %T" using gmtime_r
+    ::tm utc_fields;
+    const auto unix_secs = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
+    return format("{}", std::put_time(::gmtime_r(&unix_secs, &utc_fields), "%Y/%m/%d %T"));
+}
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -24,7 +24,7 @@

 #include <functional>
 #include "keys.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "range.hh"

 /**
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "schema_fwd.hh"
+#include "position_in_partition.hh"
+#include <boost/icl/interval_set.hpp>
+
+// Represents a non-contiguous subset of clustering_key domain of a particular schema.
+// Can be treated like an ordered and non-overlapping sequence of position_range:s.
+class clustering_interval_set {
+    // Needed to make position_in_partition comparable, required by boost::icl::interval_set.
+    class position_in_partition_with_schema {
+        schema_ptr _schema; // schema whose comparators operator< and operator== use
+        position_in_partition _pos;
+    public:
+        position_in_partition_with_schema() // NOTE(review): leaves _schema default-constructed; comparing such an object dereferences it
+            : _pos(position_in_partition::for_static_row())
+        { }
+        position_in_partition_with_schema(schema_ptr s, position_in_partition pos)
+            : _schema(std::move(s))
+            , _pos(std::move(pos))
+        { }
+        bool operator<(const position_in_partition_with_schema& other) const { // uses this object's schema only
+            return position_in_partition::less_compare(*_schema)(_pos, other._pos);
+        }
+        bool operator==(const position_in_partition_with_schema& other) const { // uses this object's schema only
+            return position_in_partition::equal_compare(*_schema)(_pos, other._pos);
+        }
+        const position_in_partition& position() const { return _pos; }
+    };
+private:
+    // We want to represent intervals of clustering keys, not position_in_partitions,
+    // but clustering_key domain is not enough to represent all kinds of clustering ranges.
+    // All intervals in this set are of the form [x, y).
+    using set_type = boost::icl::interval_set<position_in_partition_with_schema>;
+    using interval = boost::icl::interval<position_in_partition_with_schema>;
+    set_type _set;
+public:
+    clustering_interval_set() = default;
+    // Constructs from legacy clustering_row_ranges
+    clustering_interval_set(const schema& s, const query::clustering_row_ranges& ranges) {
+        for (auto&& r : ranges) {
+            add(s, position_range::from_range(r));
+        }
+    }
+    query::clustering_row_ranges to_clustering_row_ranges() const { // converts every stored interval back to a clustering_range
+        query::clustering_row_ranges result;
+        for (position_range r : *this) {
+            result.push_back(query::clustering_range::make(
+                {r.start().key(), r.start()._bound_weight != bound_weight::after_all_prefixed},
+                {r.end().key(), r.end()._bound_weight == bound_weight::after_all_prefixed}));
+        }
+        return result;
+    }
+    class position_range_iterator : public std::iterator<std::input_iterator_tag, const position_range> {
+        set_type::iterator _i;
+    public:
+        position_range_iterator(set_type::iterator i) : _i(i) {}
+        position_range operator*() const { // returns by value: a fresh position_range is built on every dereference
+            // FIXME: Produce position_range view. Not performance critical yet.
+            const interval::interval_type& iv = *_i;
+            return position_range{iv.lower().position(), iv.upper().position()};
+        }
+        bool operator==(const position_range_iterator& other) const { return _i == other._i; }
+        bool operator!=(const position_range_iterator& other) const { return _i != other._i; }
+        position_range_iterator& operator++() {
+            ++_i;
+            return *this;
+        }
+        position_range_iterator operator++(int) {
+            auto tmp = *this;
+            ++_i;
+            return tmp;
+        }
+    };
+    static interval::type make_interval(const schema& s, const position_range& r) { // builds the right-open interval [start, end)
+        assert(r.start().has_clustering_key());
+        assert(r.end().has_clustering_key());
+        return interval::right_open(
+            position_in_partition_with_schema(s.shared_from_this(), r.start()),
+            position_in_partition_with_schema(s.shared_from_this(), r.end()));
+    }
+public:
+    bool equals(const schema& s, const clustering_interval_set& other) const { // `s` is unused here
+        return boost::equal(_set, other._set);
+    }
+    bool contains(const schema& s, position_in_partition_view pos) const {
+        // FIXME: Avoid copy
+        return _set.find(position_in_partition_with_schema(s.shared_from_this(), position_in_partition(pos))) != _set.end();
+    }
+    // Returns true iff this set is fully contained in the other set.
+    bool contained_in(const clustering_interval_set& other) const { // `other` is only read, so it is taken by const reference
+        return boost::icl::within(_set, other._set);
+    }
+    bool overlaps(const schema& s, const position_range& range) const {
+        // FIXME: Avoid copy
+        auto r = _set.equal_range(make_interval(s, range));
+        return r.first != r.second;
+    }
+    // Adds given clustering range to this interval set.
+    // The range may overlap with this set.
+    void add(const schema& s, const position_range& r) {
+        _set += make_interval(s, r);
+    }
+    void add(const schema& s, const clustering_interval_set& other) { // adds every interval of `other`
+        for (auto&& r : other) {
+            add(s, r);
+        }
+    }
+    position_range_iterator begin() const { return {_set.begin()}; }
+    position_range_iterator end() const { return {_set.end()}; }
+    friend std::ostream& operator<<(std::ostream&, const clustering_interval_set&);
+};
+
--- a/clustering_key_filter.hh
+++ b/clustering_key_filter.hh
@@ -23,7 +23,7 @@

 #pragma once

-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "query-request.hh"

 namespace query {
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -21,6 +21,8 @@

 #pragma once

+#include <json/json.h>
+
 #include "bytes.hh"

 class schema;
--- a/combine.hh
+++ b/combine.hh
@@ -21,6 +21,8 @@

 #pragma once

+#include <algorithm>
+
 // combine two sorted uniqued sequences into a single sorted sequence
 // unique elements are copied, duplicate elements are merged with a
 // binary function.
--- a/compaction_garbage_collector.hh
+++ b/compaction_garbage_collector.hh
@@ -21,7 +21,6 @@

 #pragma once

-#include "schema.hh"
 #include "collection_mutation.hh"

 class atomic_cell;
--- a/compress.cc
+++ b/compress.cc
@@ -81,7 +81,7 @@ shared_ptr<compressor> compressor::create(const sstring& name, const opt_getter&
    qualified_name qn(namespace_prefix, name);

    for (auto& c : { lz4, snappy, deflate }) {
-        if (c->name() == qn) {
+        if (c->name() == static_cast<const sstring&>(qn)) {
            return c;
        }
    }
@@ -103,9 +103,9 @@ shared_ptr<compressor> compressor::create(const std::map<sstring, sstring>& opti
    return {};
 }

-thread_local const shared_ptr<compressor> compressor::lz4 = make_shared<lz4_processor>(namespace_prefix + "LZ4Compressor");
-thread_local const shared_ptr<compressor> compressor::snappy = make_shared<snappy_processor>(namespace_prefix + "SnappyCompressor");
-thread_local const shared_ptr<compressor> compressor::deflate = make_shared<deflate_processor>(namespace_prefix + "DeflateCompressor");
+thread_local const shared_ptr<compressor> compressor::lz4 = ::make_shared<lz4_processor>(namespace_prefix + "LZ4Compressor");
+thread_local const shared_ptr<compressor> compressor::snappy = ::make_shared<snappy_processor>(namespace_prefix + "SnappyCompressor");
+thread_local const shared_ptr<compressor> compressor::deflate = ::make_shared<deflate_processor>(namespace_prefix + "DeflateCompressor");

 const sstring compression_parameters::SSTABLE_COMPRESSION = "sstable_compression";
 const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_in_kb";
--- a/concrete_types.hh
+++ b/concrete_types.hh
@@ -135,7 +135,7 @@ struct timeuuid_type_impl final : public concrete_type<utils::UUID> {
    static utils::UUID from_sstring(sstring_view s);
 };

-struct varint_type_impl final : public concrete_type<boost::multiprecision::cpp_int> {
+struct varint_type_impl final : public concrete_type<utils::multiprecision_int> {
    varint_type_impl();
 };

--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -288,9 +288,7 @@ batch_size_fail_threshold_in_kb: 50
 # reloading all data, so when upgrading you should set this to the
 # same partitioner you were already using.
 #
-# Besides Murmur3Partitioner, partitioners included for backwards
-# compatibility include RandomPartitioner, ByteOrderedPartitioner, and
-# OrderPreservingPartitioner.
+# Murmur3Partitioner is currently the only supported partitioner.
 #
 partitioner: org.apache.cassandra.dht.Murmur3Partitioner

--- a/configure.py
+++ b/configure.py
@@ -142,16 +142,21 @@ def flag_supported(flag, compiler):
    return try_compile(flags=['-Werror'] + split, compiler=compiler)


-def gold_supported(compiler):
+def linker_flags(compiler):
    src_main = 'int main(int argc, char **argv) { return 0; }'
+    link_flags = ['-fuse-ld=lld']
+    if try_compile_and_link(source=src_main, flags=link_flags, compiler=compiler):
+        print('Note: using the lld linker')
+        return ' '.join(link_flags)
    link_flags = ['-fuse-ld=gold']
    if try_compile_and_link(source=src_main, flags=link_flags, compiler=compiler):
+        print('Note: using the gold linker')
        threads_flag = '-Wl,--threads'
        if try_compile_and_link(source=src_main, flags=link_flags + [threads_flag], compiler=compiler):
            link_flags.append(threads_flag)
        return ' '.join(link_flags)
    else:
-        print('Note: gold not found; using default system linker')
+        print('Note: neither lld nor gold found; using default system linker')
        return ''


@@ -243,24 +248,24 @@ def find_headers(repodir, excluded_dirs):

 modes = {
    'debug': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER',
-        'cxx_ld_flags': '',
+        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '-Wstack-usage=%s' % (1024*40),
    },
    'release': {
        'cxxflags': '',
-        'cxx_ld_flags': '-O3',
+        'cxx_ld_flags': '-O3 -Wstack-usage=%s' % (1024*29),
    },
    'dev': {
-        'cxxflags': '',
-        'cxx_ld_flags': '-O1',
+        'cxxflags': '-DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '-O1 -Wstack-usage=%s' % (1024*29),
    },
    'sanitize': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER',
-        'cxx_ld_flags': '-Os',
+        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '-Os -Wstack-usage=%s' % (1024*50),
    }
 }

-scylla_tests = [
+scylla_tests = set([
    'test/boost/UUID_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
@@ -291,6 +296,10 @@ scylla_tests = [
    'test/boost/cql_auth_query_test',
    'test/boost/cql_auth_syntax_test',
    'test/boost/cql_query_test',
+    'test/boost/cql_query_large_test',
+    'test/boost/cql_query_like_test',
+    'test/boost/cql_query_group_test',
+    'test/boost/cql_functions_test',
    'test/boost/crc_test',
    'test/boost/data_listeners_test',
    'test/boost/database_test',
@@ -299,6 +308,7 @@ scylla_tests = [
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
    'test/boost/extensions_test',
+    'test/boost/error_injection_test',
    'test/boost/filtering_test',
    'test/boost/flat_mutation_reader_test',
    'test/boost/flush_queue_test',
@@ -326,6 +336,7 @@ scylla_tests = [
    'test/boost/mutation_fragment_test',
    'test/boost/mutation_query_test',
    'test/boost/mutation_reader_test',
+    'test/boost/multishard_combining_reader_as_mutation_source_test',
    'test/boost/mutation_test',
    'test/boost/mutation_writer_test',
    'test/boost/mvcc_test',
@@ -343,6 +354,7 @@ scylla_tests = [
    'test/boost/schema_change_test',
    'test/boost/schema_registry_test',
    'test/boost/secondary_index_test',
+    'test/boost/index_with_paging_test',
    'test/boost/serialization_test',
    'test/boost/serialized_action_test',
    'test/boost/small_vector_test',
@@ -350,6 +362,8 @@ scylla_tests = [
    'test/boost/sstable_3_x_test',
    'test/boost/sstable_datafile_test',
    'test/boost/sstable_mutation_test',
+    'test/boost/schema_changes_test',
+    'test/boost/sstable_conforms_to_mutation_source_test',
    'test/boost/sstable_resharding_test',
    'test/boost/sstable_test',
    'test/boost/storage_proxy_test',
@@ -363,8 +377,11 @@ scylla_tests = [
    'test/boost/view_build_test',
    'test/boost/view_complex_test',
    'test/boost/view_schema_test',
+    'test/boost/view_schema_pkey_test',
+    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/stall_free_test',
    'test/manual/ec2_snitch_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
@@ -375,6 +392,8 @@ scylla_tests = [
    'test/manual/partition_data_test',
    'test/manual/row_locker_test',
    'test/manual/streaming_histogram_test',
+    'test/manual/sstable_scan_footprint_test',
+    'test/perf/memory_footprint_test',
    'test/perf/perf_cache_eviction',
    'test/perf/perf_cql_parser',
    'test/perf/perf_fast_forward',
@@ -383,33 +402,32 @@ scylla_tests = [
    'test/perf/perf_row_cache_update',
    'test/perf/perf_simple_query',
    'test/perf/perf_sstable',
-    'test/tools/cql_repl',
    'test/unit/lsa_async_eviction_test',
    'test/unit/lsa_sync_eviction_test',
-    'test/unit/memory_footprint_test',
    'test/unit/row_cache_alloc_stress_test',
    'test/unit/row_cache_stress_test',
-]
+])

-perf_tests = [
+perf_tests = set([
    'test/perf/perf_mutation_readers',
    'test/perf/perf_checksum',
    'test/perf/perf_mutation_fragment',
    'test/perf/perf_idl',
    'test/perf/perf_vint',
-]
+])

-apps = [
+apps = set([
    'scylla',
-]
+    'test/tools/cql_repl',
+])

-tests = scylla_tests + perf_tests
+tests = scylla_tests | perf_tests

-other = [
+other = set([
    'iotune',
-]
+])

-all_artifacts = apps + tests + other
+all_artifacts = apps | tests | other

 arg_parser = argparse.ArgumentParser('Configure scylla')
 arg_parser.add_argument('--static', dest='static', action='store_const', default='',
@@ -463,6 +481,7 @@ arg_parser.add_argument('--with-antlr3', dest='antlr3_exec', action='store', def
                        help='path to antlr3 executable')
 arg_parser.add_argument('--with-ragel', dest='ragel_exec', action='store', default='ragel',
        help='path to ragel executable')
+add_tristate(arg_parser, name='stack-guards', dest='stack_guards', help='Use stack guards')
 args = arg_parser.parse_args()

 defines = ['XXH_PRIVATE_API',
@@ -483,6 +502,7 @@ scylla_core = (['database.cc',
                'frozen_schema.cc',
                'schema_registry.cc',
                'bytes.cc',
+                'timeout_config.cc',
                'mutation.cc',
                'mutation_fragment.cc',
                'partition_version.cc',
@@ -501,6 +521,7 @@ scylla_core = (['database.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
+                'converting_mutation_partition_applier.cc',
                'mutation_reader.cc',
                'flat_mutation_reader.cc',
                'mutation_query.cc',
@@ -528,7 +549,10 @@ scylla_core = (['database.cc',
                'transport/event_notifier.cc',
                'transport/server.cc',
                'transport/messages/result_message.cc',
-                'cdc/cdc.cc',
+                'cdc/log.cc',
+                'cdc/split.cc',
+                'cdc/generation.cc',
+                'cdc/metadata.cc',
                'cql3/type_json.cc',
                'cql3/abstract_marker.cc',
                'cql3/attributes.cc',
@@ -564,7 +588,7 @@ scylla_core = (['database.cc',
                'cql3/statements/function_statement.cc',
                'cql3/statements/modification_statement.cc',
                'cql3/statements/cas_request.cc',
-                'cql3/statements/parsed_statement.cc',
+                'cql3/statements/raw/parsed_statement.cc',
                'cql3/statements/property_definitions.cc',
                'cql3/statements/update_statement.cc',
                'cql3/statements/delete_statement.cc',
@@ -660,10 +684,12 @@ scylla_core = (['database.cc',
                'utils/managed_bytes.cc',
                'utils/exceptions.cc',
                'utils/config_file.cc',
+                'utils/multiprecision_int.cc',
                'utils/gz/crc_combine.cc',
                'gms/version_generator.cc',
                'gms/versioned_value.cc',
                'gms/gossiper.cc',
+                'gms/feature_service.cc',
                'gms/failure_detector.cc',
                'gms/gossip_digest_syn.cc',
                'gms/gossip_digest_ack.cc',
@@ -672,9 +698,8 @@ scylla_core = (['database.cc',
                'gms/application_state.cc',
                'gms/inet_address.cc',
                'dht/i_partitioner.cc',
+                'dht/token.cc',
                'dht/murmur3_partitioner.cc',
-                'dht/byte_ordered_partitioner.cc',
-                'dht/random_partitioner.cc',
                'dht/boot_strapper.cc',
                'dht/range_streamer.cc',
                'unimplemented.cc',
@@ -747,7 +772,7 @@ scylla_core = (['database.cc',
                'table_helper.cc',
                'range_tombstone.cc',
                'range_tombstone_list.cc',
-                'disk-error-handler.cc',
+                'utils/disk-error-handler.cc',
                'duration.cc',
                'vint-serialization.cc',
                'utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc',
@@ -760,6 +785,7 @@ scylla_core = (['database.cc',
                'utils/utf8.cc',
                'utils/ascii.cc',
                'utils/like_matcher.cc',
+                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
@@ -799,6 +825,8 @@ api = ['api/api.cc',
       'api/system.cc',
       'api/config.cc',
       'api/api-doc/config.json',
+        'api/error_injection.cc',
+        'api/api-doc/error_injection.json',
       ]

 alternator = [
@@ -827,6 +855,7 @@ redis = [
        'redis/abstract_command.cc',
        'redis/command_factory.cc',
        'redis/commands.cc',
+        'redis/lolwut.cc',
        ]

 idls = ['idl/gossip_digest.idl.hh',
@@ -862,6 +891,7 @@ headers = find_headers('.', excluded_dirs=['idl', 'build', 'seastar', '.git'])
 scylla_tests_generic_dependencies = [
    'test/lib/cql_test_env.cc',
    'test/lib/test_services.cc',
+    'test/lib/log.cc',
 ]

 scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependencies + [
@@ -875,6 +905,7 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci

 deps = {
    'scylla': idls + ['main.cc', 'release.cc', 'build_id.cc'] + scylla_core + api + alternator + redis,
+    'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
 }

 pure_boost_tests = set([
@@ -916,6 +947,7 @@ tests_not_using_seastar_test_framework = set([
    'test/boost/small_vector_test',
    'test/manual/gossip',
    'test/manual/message',
+    'test/perf/memory_footprint_test',
    'test/perf/perf_cache_eviction',
    'test/perf/perf_cql_parser',
    'test/perf/perf_hash',
@@ -924,9 +956,9 @@ tests_not_using_seastar_test_framework = set([
    'test/perf/perf_sstable',
    'test/unit/lsa_async_eviction_test',
    'test/unit/lsa_sync_eviction_test',
-    'test/unit/memory_footprint_test',
    'test/unit/row_cache_alloc_stress_test',
    'test/unit/row_cache_stress_test',
+    'test/manual/sstable_scan_footprint_test',
 ]) | pure_boost_tests

 for t in tests_not_using_seastar_test_framework:
@@ -949,9 +981,19 @@ for t in perf_tests:

 deps['test/boost/sstable_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
 deps['test/boost/sstable_datafile_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
-deps['test/boost/mutation_reader_test'] += ['test/lib/sstable_utils.cc']
+deps['test/boost/sstable_resharding_test'] += ['test/lib/sstable_utils.cc' ]
+deps['test/boost/mutation_reader_test'] += ['test/lib/sstable_utils.cc', 'test/lib/dummy_partitioner.cc' ]
+deps['test/boost/multishard_combining_reader_as_mutation_source_test'] += ['test/lib/sstable_utils.cc', 'test/lib/dummy_partitioner.cc' ]
+deps['test/boost/sstable_mutation_test'] += ['test/lib/sstable_utils.cc']
+deps['test/boost/sstable_conforms_to_mutation_source_test'] += ['test/lib/sstable_utils.cc']

-deps['test/boost/bytes_ostream_test'] = ['test/boost/bytes_ostream_test.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/bytes_ostream_test'] = [
+    "test/boost/bytes_ostream_test.cc",
+    "utils/managed_bytes.cc",
+    "utils/logalloc.cc",
+    "utils/dynamic_bitset.cc",
+    "test/lib/log.cc",
+]
 deps['test/boost/input_stream_test'] = ['test/boost/input_stream_test.cc']
 deps['test/boost/UUID_test'] = ['utils/UUID_gen.cc', 'test/boost/UUID_test.cc', 'utils/uuid.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc', 'hashers.cc']
 deps['test/boost/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'test/boost/murmur_hash_test.cc']
@@ -962,12 +1004,18 @@ deps['test/perf/perf_fast_forward'] += ['release.cc']
 deps['test/perf/perf_simple_query'] += ['release.cc']
 deps['test/boost/meta_test'] = ['test/boost/meta_test.cc']
 deps['test/manual/imr_test'] = ['test/manual/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
-deps['test/boost/reusable_buffer_test'] = ['test/boost/reusable_buffer_test.cc']
+deps['test/boost/reusable_buffer_test'] = [
+    "test/boost/reusable_buffer_test.cc",
+    "test/lib/log.cc",
+]
 deps['test/boost/utf8_test'] = ['utils/utf8.cc', 'test/boost/utf8_test.cc']
 deps['test/boost/small_vector_test'] = ['test/boost/small_vector_test.cc']
 deps['test/boost/multishard_mutation_query_test'] += ['test/boost/test_table.cc']
 deps['test/boost/vint_serialization_test'] = ['test/boost/vint_serialization_test.cc', 'vint-serialization.cc', 'bytes.cc']
-deps['test/boost/linearizing_input_stream_test'] = ['test/boost/linearizing_input_stream_test.cc']
+deps['test/boost/linearizing_input_stream_test'] = [
+    "test/boost/linearizing_input_stream_test.cc",
+    "test/lib/log.cc",
+]

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']

@@ -1010,7 +1058,7 @@ optimization_flags = [o
                      if flag_supported(flag=o, compiler=args.cxx)]
 modes['release']['cxx_ld_flags'] += ' ' + ' '.join(optimization_flags)

-gold_linker_flag = gold_supported(compiler=args.cxx)
+linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
 tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
@@ -1132,8 +1180,24 @@ extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\
 for m in ['debug', 'release', 'sanitize']:
    modes[m]['cxxflags'] += ' ' + dbgflag

+get_dynamic_linker_output = subprocess.check_output(['./reloc/get-dynamic-linker.sh'], shell=True)
+dynamic_linker = get_dynamic_linker_output.decode('utf-8').strip()
+
+forced_ldflags = '-Wl,'
+
+# The default build-id used by lld is xxhash, which is 8 bytes long, but RPM
+# requires build-ids to be at least 16 bytes long
+# (https://github.com/rpm-software-management/rpm/issues/950), so let's
+# explicitly ask for SHA1 build-ids.
+forced_ldflags += '--build-id=sha1,'
+
+forced_ldflags += f'--dynamic-linker={dynamic_linker}'
+
+args.user_ldflags = forced_ldflags + ' ' + args.user_ldflags
+
+args.user_cflags += ' -Wno-error=stack-usage='
+
 seastar_cflags = args.user_cflags
-seastar_cflags += ' -Wno-error'
 if args.target != '':
    seastar_cflags += ' -march=' + args.target
 seastar_ldflags = args.user_ldflags
@@ -1150,12 +1214,18 @@ def configure_seastar(build_dir, mode):
        '-DCMAKE_BUILD_TYPE={}'.format(MODE_TO_CMAKE_BUILD_TYPE[mode]),
        '-DCMAKE_C_COMPILER={}'.format(args.cc),
        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
+        '-DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON',
        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']).replace(' ', ';')),
        '-DSeastar_LD_FLAGS={}'.format(seastar_ldflags),
        '-DSeastar_CXX_DIALECT=gnu++17',
        '-DSeastar_STD_OPTIONAL_VARIANT_STRINGVIEW=ON',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
    ]
+
+    if args.stack_guards is not None:
+        stack_guards = 'ON' if args.stack_guards else 'OFF'
+        seastar_cmake_args += ['-DSeastar_STACK_GUARDS={}'.format(stack_guards)]
+
    if args.dpdk:
        seastar_cmake_args += ['-DSeastar_DPDK=ON', '-DSeastar_DPDK_MACHINE=wsm']
    if args.gcc6_concepts:
@@ -1196,9 +1266,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
    return cflags, libs

 for mode in build_modes:
-    seastar_cflags, seastar_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
-    modes[mode]['seastar_cflags'] = seastar_cflags
-    modes[mode]['seastar_libs'] = seastar_libs
+    seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
+    modes[mode]['seastar_cflags'] = seastar_pc_cflags
+    modes[mode]['seastar_libs'] = seastar_pc_libs

 # We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
 # which are available only when the library is linked statically.
@@ -1219,6 +1289,46 @@ def configure_zstd(build_dir, mode):
    os.makedirs(zstd_build_dir, exist_ok=True)
    subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)

+def configure_abseil(build_dir, mode):
+    abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
+
+    abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
+    cmake_mode = MODE_TO_CMAKE_BUILD_TYPE[mode]
+    abseil_cmake_args = [
+        '-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
+        '-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
+        '-DCMAKE_C_COMPILER={}'.format(args.cc),
+        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
+        '-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
+    ]
+
+    abseil_cmd = ['cmake', '-G', 'Ninja', os.path.relpath('abseil', abseil_build_dir)] + abseil_cmake_args
+
+    os.makedirs(abseil_build_dir, exist_ok=True)
+    subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
+
+abseil_libs = ['absl/' + lib for lib in [
+    'container/libabsl_hashtablez_sampler.a',
+    'container/libabsl_raw_hash_set.a',
+    'synchronization/libabsl_synchronization.a',
+    'synchronization/libabsl_graphcycles_internal.a',
+    'debugging/libabsl_stacktrace.a',
+    'debugging/libabsl_symbolize.a',
+    'debugging/libabsl_debugging_internal.a',
+    'debugging/libabsl_demangle_internal.a',
+    'time/libabsl_time.a',
+    'time/libabsl_time_zone.a',
+    'numeric/libabsl_int128.a',
+    'hash/libabsl_city.a',
+    'hash/libabsl_hash.a',
+    'base/libabsl_malloc_internal.a',
+    'base/libabsl_spinlock_wait.a',
+    'base/libabsl_base.a',
+    'base/libabsl_dynamic_annotations.a',
+    'base/libabsl_raw_logging_internal.a',
+    'base/libabsl_exponential_biased.a',
+    'base/libabsl_throw_delegate.a']]
+
 args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
 args.user_cflags += ' -march=' + args.target
 libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
@@ -1247,6 +1357,7 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
 for pkg in pkgs:
    args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
    libs += ' ' + pkg_config(pkg, '--libs')
+args.user_cflags += ' -I abseil'
 user_cflags = args.user_cflags + ' -fvisibility=hidden'
 user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
 if args.staticcxx:
@@ -1277,6 +1388,9 @@ else:
 for mode in build_modes:
    configure_zstd(outdir, mode)

+for mode in build_modes:
+    configure_abseil(outdir, mode)
+
 # configure.py may run automatically from an already-existing build.ninja.
 # If the user interrupts configure.py in the middle, we need build.ninja
 # to remain in a valid state.  So we write our output to a temporary
@@ -1289,8 +1403,8 @@ with open(buildfile_tmp, 'w') as f:
        builddir = {outdir}
        cxx = {cxx}
        cxxflags = {user_cflags} {warnings} {defines}
-        ldflags = {gold_linker_flag} {user_ldflags}
-        ldflags_build = {gold_linker_flag}
+        ldflags = {linker_flags} {user_ldflags}
+        ldflags_build = {linker_flags}
        libs = {libs}
        pool link_pool
            depth = {link_pool_depth}
@@ -1360,7 +1474,11 @@ with open(buildfile_tmp, 'w') as f:
                # name, we also add a global typedef to avoid compilation errors.
                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
                     && {antlr3_exec} $builddir/{mode}/gen/$in $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.hpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.cpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Parser.hpp $
                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
+                        -e '/^.*On :.*$$/d' $
                        -e '1i using ExceptionBaseType = int;' $
                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
                            s/ExceptionBaseType\* ex = new/ex = new/; $
@@ -1407,9 +1525,11 @@ with open(buildfile_tmp, 'w') as f:
                objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
                    'libdeflate/libdeflate.a',
                    'zstd/lib/libzstd.a',
+                ] + [
+                    'abseil/' + x for x in abseil_libs
                ]])
                objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
-                if binary.startswith('test/'):
+                if binary in tests:
                    local_libs = '$seastar_libs_{} $libs'.format(mode)
                    if binary in pure_boost_tests:
                        local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
@@ -1463,6 +1583,12 @@ with open(buildfile_tmp, 'w') as f:
                objs=' '.join(compiles)
            )
        )
+        f.write(
+            'build {mode}-headers: phony {header_objs}\n'.format(
+                mode=mode,
+                header_objs=' '.join(["$builddir/{mode}/{hh}.o".format(mode=mode, hh=hh) for hh in headers])
+            )
+        )


        gen_headers = []
@@ -1517,7 +1643,7 @@ with open(buildfile_tmp, 'w') as f:
        f.write('  pool = submodule_pool\n')
        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
        f.write('  target = seastar\n'.format(**locals()))
-        f.write('build build/{mode}/seastar/libseastar_testing.a: ninja\n'
+        f.write('build build/{mode}/seastar/libseastar_testing.a: ninja | always\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
@@ -1542,6 +1668,12 @@ with open(buildfile_tmp, 'w') as f:
        f.write('  subdir = build/{mode}/zstd\n'.format(**locals()))
        f.write('  target = libzstd.a\n'.format(**locals()))

+        for lib in abseil_libs:
+            f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
+            f.write('  pool = submodule_pool\n')
+            f.write('  subdir = build/{mode}/abseil\n'.format(**locals()))
+            f.write('  target = {lib}\n'.format(**locals()))
+
    mode = 'dev' if 'dev' in modes else modes[0]
    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))

@@ -1549,7 +1681,7 @@ with open(buildfile_tmp, 'w') as f:
        rule configure
          command = {python} configure.py $configure_args
          generator = 1
-        build build.ninja: configure | configure.py SCYLLA-VERSION-GEN
+        build build.ninja: configure | configure.py SCYLLA-VERSION-GEN seastar/CMakeLists.txt
        rule cscope
            command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
            description = CSCOPE
--- a/converting_mutation_partition_applier.cc
+++ b/converting_mutation_partition_applier.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "converting_mutation_partition_applier.hh"
+#include "concrete_types.hh"
+
+#include "mutation_partition_view.hh"
+#include "mutation_partition.hh"
+#include "schema.hh"
+
+bool
+converting_mutation_partition_applier::is_compatible(const column_definition& new_def, const abstract_type& old_type, column_kind kind) {
+    return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(old_type);
+}
+
+atomic_cell
+converting_mutation_partition_applier::upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                atomic_cell::collection_member cm) {
+    if (cell.is_live() && !old_type.is_counter()) {
+        if (cell.is_live_and_has_ttl()) {
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+        }
+        return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+    } else {
+        return atomic_cell(new_type, cell);
+    }
+}
+
+void
+converting_mutation_partition_applier::accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, atomic_cell_view cell) {
+    if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
+        return;
+    }
+    dst.apply(new_def, upgrade_cell(*new_def.type, old_type, cell));
+}
+
+void
+converting_mutation_partition_applier::accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, collection_mutation_view cell) {
+    if (!is_compatible(new_def, old_type, kind)) {
+        return;
+    }
+
+  cell.with_deserialized(old_type, [&] (collection_mutation_view_description old_view) {
+    collection_mutation_description new_view;
+    if (old_view.tomb.timestamp > new_def.dropped_at()) {
+        new_view.tomb = old_view.tomb;
+    }
+
+    visit(old_type, make_visitor(
+        [&] (const collection_type_impl& old_ctype) {
+            assert(new_def.type->is_collection()); // because is_compatible
+            auto& new_ctype = static_cast<const collection_type_impl&>(*new_def.type);
+
+            auto& new_value_type = *new_ctype.value_comparator();
+            auto& old_value_type = *old_ctype.value_comparator();
+
+            for (auto& c : old_view.cells) {
+                if (c.second.timestamp() > new_def.dropped_at()) {
+                    new_view.cells.emplace_back(c.first, upgrade_cell(
+                            new_value_type, old_value_type, c.second, atomic_cell::collection_member::yes));
+                }
+            }
+        },
+        [&] (const user_type_impl& old_utype) {
+            assert(new_def.type->is_user_type()); // because is_compatible
+            auto& new_utype = static_cast<const user_type_impl&>(*new_def.type);
+
+            for (auto& c : old_view.cells) {
+                if (c.second.timestamp() > new_def.dropped_at()) {
+                    auto idx = deserialize_field_index(c.first);
+                    assert(idx < new_utype.size() && idx < old_utype.size());
+
+                    new_view.cells.emplace_back(c.first, upgrade_cell(
+                            *new_utype.type(idx), *old_utype.type(idx), c.second, atomic_cell::collection_member::yes));
+                }
+            }
+        },
+        [&] (const abstract_type& o) {
+            throw std::runtime_error(format("not a multi-cell type: {}", o.name()));
+        }
+    ));
+
+    if (new_view.tomb || !new_view.cells.empty()) {
+        dst.apply(new_def, new_view.serialize(*new_def.type));
+    }
+  });
+}
+
+converting_mutation_partition_applier::converting_mutation_partition_applier(
+        const column_mapping& visited_column_mapping,
+        const schema& target_schema,
+        mutation_partition& target)
+    : _p_schema(target_schema)
+    , _p(target)
+    , _visited_column_mapping(visited_column_mapping)
+{ }
+
+
+void
+converting_mutation_partition_applier::accept_partition_tombstone(tombstone t) {
+    _p.apply(t);
+}
+
+void
+converting_mutation_partition_applier::accept_static_cell(column_id id, atomic_cell cell) {
+    return accept_static_cell(id, atomic_cell_view(cell));
+}
+
+void
+converting_mutation_partition_applier::accept_static_cell(column_id id, atomic_cell_view cell) {
+    const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
+    const column_definition* def = _p_schema.get_column_definition(col.name());
+    if (def) {
+        accept_cell(_p._static_row.maybe_create(), column_kind::static_column, *def, *col.type(), cell);
+    }
+}
+
+void
+converting_mutation_partition_applier::accept_static_cell(column_id id, collection_mutation_view collection) {
+    const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
+    const column_definition* def = _p_schema.get_column_definition(col.name());
+    if (def) {
+        accept_cell(_p._static_row.maybe_create(), column_kind::static_column, *def, *col.type(), collection);
+    }
+}
+
+void
+converting_mutation_partition_applier::accept_row_tombstone(const range_tombstone& rt) {
+    _p.apply_row_tombstone(_p_schema, rt);
+}
+
+void
+converting_mutation_partition_applier::accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) {
+    deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous);
+    r.apply(rm);
+    r.apply(deleted_at);
+    _current_row = &r;
+}
+
+void
+converting_mutation_partition_applier::accept_row_cell(column_id id, atomic_cell cell) {
+    return accept_row_cell(id, atomic_cell_view(cell));
+}
+
+void
+converting_mutation_partition_applier::accept_row_cell(column_id id, atomic_cell_view cell) {
+    const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
+    const column_definition* def = _p_schema.get_column_definition(col.name());
+    if (def) {
+        accept_cell(_current_row->cells(), column_kind::regular_column, *def, *col.type(), cell);
+    }
+}
+
+void
+converting_mutation_partition_applier::accept_row_cell(column_id id, collection_mutation_view collection) {
+    const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
+    const column_definition* def = _p_schema.get_column_definition(col.name());
+    if (def) {
+        accept_cell(_current_row->cells(), column_kind::regular_column, *def, *col.type(), collection);
+    }
+}
+
+void
+converting_mutation_partition_applier::append_cell(row& dst, column_kind kind, const column_definition& new_def, const column_definition& old_def, const atomic_cell_or_collection& cell) {
+    if (new_def.is_atomic()) {
+        accept_cell(dst, kind, new_def, *old_def.type, cell.as_atomic_cell(old_def));
+    } else {
+        accept_cell(dst, kind, new_def, *old_def.type, cell.as_collection_mutation());
+    }
+}
--- a/converting_mutation_partition_applier.hh
+++ b/converting_mutation_partition_applier.hh
@@ -21,12 +21,13 @@

 #pragma once

-#include "types/user.hh"
-#include "concrete_types.hh"
+#include "mutation_partition_visitor.hh"

-#include "mutation_partition_view.hh"
-#include "mutation_partition.hh"
-#include "schema.hh"
+class schema;
+class row;
+class mutation_partition;
+class column_mapping;
+class deletable_row;

 // Mutation partition visitor which applies visited data into
 // existing mutation_partition. The visited data may be of a different schema.
@@ -38,148 +39,26 @@ class converting_mutation_partition_applier : public mutation_partition_visitor
    const column_mapping& _visited_column_mapping;
    deletable_row* _current_row;
 private:
-    static bool is_compatible(const column_definition& new_def, const abstract_type& old_type, column_kind kind) {
-        return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(old_type);
-    }
+    static bool is_compatible(const column_definition& new_def, const abstract_type& old_type, column_kind kind);
    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
-                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
-        if (cell.is_live() && !old_type.is_counter()) {
-            if (cell.is_live_and_has_ttl()) {
-                return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
-            }
-            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
-        } else {
-            return atomic_cell(new_type, cell);
-        }
-    }
-    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, atomic_cell_view cell) {
-        if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
-            return;
-        }
-        dst.apply(new_def, upgrade_cell(*new_def.type, old_type, cell));
-    }
-    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, collection_mutation_view cell) {
-        if (!is_compatible(new_def, old_type, kind)) {
-            return;
-        }
-
-      cell.with_deserialized(old_type, [&] (collection_mutation_view_description old_view) {
-        collection_mutation_description new_view;
-        if (old_view.tomb.timestamp > new_def.dropped_at()) {
-            new_view.tomb = old_view.tomb;
-        }
-
-        visit(old_type, make_visitor(
-            [&] (const collection_type_impl& old_ctype) {
-                assert(new_def.type->is_collection()); // because is_compatible
-                auto& new_ctype = static_cast<const collection_type_impl&>(*new_def.type);
-
-                auto& new_value_type = *new_ctype.value_comparator();
-                auto& old_value_type = *old_ctype.value_comparator();
-
-                for (auto& c : old_view.cells) {
-                    if (c.second.timestamp() > new_def.dropped_at()) {
-                        new_view.cells.emplace_back(c.first, upgrade_cell(
-                                new_value_type, old_value_type, c.second, atomic_cell::collection_member::yes));
-                    }
-                }
-            },
-            [&] (const user_type_impl& old_utype) {
-                assert(new_def.type->is_user_type()); // because is_compatible
-                auto& new_utype = static_cast<const user_type_impl&>(*new_def.type);
-
-                for (auto& c : old_view.cells) {
-                    if (c.second.timestamp() > new_def.dropped_at()) {
-                        auto idx = deserialize_field_index(c.first);
-                        assert(idx < new_utype.size() && idx < old_utype.size());
-
-                        new_view.cells.emplace_back(c.first, upgrade_cell(
-                                *new_utype.type(idx), *old_utype.type(idx), c.second, atomic_cell::collection_member::yes));
-                    }
-                }
-            },
-            [&] (const abstract_type& o) {
-                throw std::runtime_error(format("not a multi-cell type: {}", o.name()));
-            }
-        ));
-
-        if (new_view.tomb || !new_view.cells.empty()) {
-            dst.apply(new_def, new_view.serialize(*new_def.type));
-        }
-      });
-    }
-public:
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no);
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, atomic_cell_view cell);
+    static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const abstract_type& old_type, collection_mutation_view cell);public:
    converting_mutation_partition_applier(
            const column_mapping& visited_column_mapping,
            const schema& target_schema,
-            mutation_partition& target)
-        : _p_schema(target_schema)
-        , _p(target)
-        , _visited_column_mapping(visited_column_mapping)
-    { }
-
-    virtual void accept_partition_tombstone(tombstone t) override {
-        _p.apply(t);
-    }
-
-    void accept_static_cell(column_id id, atomic_cell cell) {
-        return accept_static_cell(id, atomic_cell_view(cell));
-    }
-
-    virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
-        const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
-        const column_definition* def = _p_schema.get_column_definition(col.name());
-        if (def) {
-            accept_cell(_p._static_row.maybe_create(), column_kind::static_column, *def, *col.type(), cell);
-        }
-    }
-
-    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
-        const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
-        const column_definition* def = _p_schema.get_column_definition(col.name());
-        if (def) {
-            accept_cell(_p._static_row.maybe_create(), column_kind::static_column, *def, *col.type(), collection);
-        }
-    }
-
-    virtual void accept_row_tombstone(const range_tombstone& rt) override {
-        _p.apply_row_tombstone(_p_schema, rt);
-    }
-
-    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
-        deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous);
-        r.apply(rm);
-        r.apply(deleted_at);
-        _current_row = &r;
-    }
-
-    void accept_row_cell(column_id id, atomic_cell cell) {
-        return accept_row_cell(id, atomic_cell_view(cell));
-    }
-
-    virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
-        const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
-        const column_definition* def = _p_schema.get_column_definition(col.name());
-        if (def) {
-            accept_cell(_current_row->cells(), column_kind::regular_column, *def, *col.type(), cell);
-        }
-    }
-
-    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
-        const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
-        const column_definition* def = _p_schema.get_column_definition(col.name());
-        if (def) {
-            accept_cell(_current_row->cells(), column_kind::regular_column, *def, *col.type(), collection);
-        }
-    }
+            mutation_partition& target);
+    virtual void accept_partition_tombstone(tombstone t) override;
+    void accept_static_cell(column_id id, atomic_cell cell);
+    virtual void accept_static_cell(column_id id, atomic_cell_view cell) override;
+    virtual void accept_static_cell(column_id id, collection_mutation_view collection) override;
+    virtual void accept_row_tombstone(const range_tombstone& rt) override;
+    virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override;
+    void accept_row_cell(column_id id, atomic_cell cell);
+    virtual void accept_row_cell(column_id id, atomic_cell_view cell) override;
+    virtual void accept_row_cell(column_id id, collection_mutation_view collection) override;

    // Appends the cell to dst upgrading it to the new schema.
    // Cells must have monotonic names.
-    static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const column_definition& old_def, const atomic_cell_or_collection& cell) {
-        if (new_def.is_atomic()) {
-            accept_cell(dst, kind, new_def, *old_def.type, cell.as_atomic_cell(old_def));
-        } else {
-            accept_cell(dst, kind, new_def, *old_def.type, cell.as_collection_mutation());
-        }
-    }
+    static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const column_definition& old_def, const atomic_cell_or_collection& cell);
 };
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -408,7 +408,7 @@ selectStatement returns [shared_ptr<raw::select_statement> expr]
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
      ( K_BYPASS K_CACHE { bypass_cache = true; })?
      {
-          auto params = ::make_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
+          auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
          $expr = ::make_shared<raw::select_statement>(std::move(cf), std::move(params),
            std::move(sclause), std::move(wclause), std::move(limit), std::move(per_partition_limit),
            std::move(gbcolumns));
@@ -478,7 +478,7 @@ jsonValue returns [::shared_ptr<cql3::term::raw> value]
 */
 insertStatement returns [::shared_ptr<raw::modification_statement> expr]
    @init {
-        auto attrs = ::make_shared<cql3::attributes::raw>();
+        auto attrs = std::make_unique<cql3::attributes::raw>();
        std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
        std::vector<::shared_ptr<cql3::term::raw>> values;
        bool if_not_exists = false;
@@ -513,11 +513,11 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
        )
    ;

-usingClause[::shared_ptr<cql3::attributes::raw> attrs]
+usingClause[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_USING usingClauseObjective[attrs] ( K_AND usingClauseObjective[attrs] )*
    ;

-usingClauseObjective[::shared_ptr<cql3::attributes::raw> attrs]
+usingClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_TIMESTAMP ts=intValue { attrs->timestamp = ts; }
    | K_TTL t=intValue { attrs->time_to_live = t; }
    ;
@@ -531,7 +531,7 @@ usingClauseObjective[::shared_ptr<cql3::attributes::raw> attrs]
 updateStatement returns [::shared_ptr<raw::update_statement> expr]
    @init {
        bool if_exists = false;
-        auto attrs = ::make_shared<cql3::attributes::raw>();
+        auto attrs = std::make_unique<cql3::attributes::raw>();
        std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>, ::shared_ptr<cql3::operation::raw_update>>> operations;
    }
    : K_UPDATE cf=columnFamilyName
@@ -562,7 +562,7 @@ updateConditions returns [conditions_type conditions]
 */
 deleteStatement returns [::shared_ptr<raw::delete_statement> expr]
    @init {
-        auto attrs = ::make_shared<cql3::attributes::raw>();
+        auto attrs = std::make_unique<cql3::attributes::raw>();
        std::vector<::shared_ptr<cql3::operation::raw_deletion>> column_deletions;
        bool if_exists = false;
    }
@@ -592,7 +592,7 @@ deleteOp returns [::shared_ptr<cql3::operation::raw_deletion> op]
    | c=cident '.' field=ident { $op = ::make_shared<cql3::operation::field_deletion>(std::move(c), std::move(field)); }
    ;

-usingClauseDelete[::shared_ptr<cql3::attributes::raw> attrs]
+usingClauseDelete[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_USING K_TIMESTAMP ts=intValue { attrs->timestamp = ts; }
    ;

@@ -625,7 +625,7 @@ batchStatement returns [shared_ptr<cql3::statements::raw::batch_statement> expr]
        using btype = cql3::statements::raw::batch_statement::type; 
        btype type = btype::LOGGED;
        std::vector<shared_ptr<cql3::statements::raw::modification_statement>> statements;
-        auto attrs = make_shared<cql3::attributes::raw>();
+        auto attrs = std::make_unique<cql3::attributes::raw>();
    }
    : K_BEGIN
      ( K_UNLOGGED { type = btype::UNLOGGED; } | K_COUNTER { type = btype::COUNTER; } )?
@@ -752,7 +752,7 @@ createKeyspaceStatement returns [shared_ptr<cql3::statements::create_keyspace_st
        bool if_not_exists = false;
    }
    : K_CREATE K_KEYSPACE (K_IF K_NOT K_EXISTS { if_not_exists = true; } )? ks=keyspaceName
-      K_WITH properties[attrs] { $expr = make_shared<cql3::statements::create_keyspace_statement>(ks, attrs, if_not_exists); }
+      K_WITH properties[attrs] { $expr = ::make_shared<cql3::statements::create_keyspace_statement>(ks, attrs, if_not_exists); }
    ;

 /**
@@ -914,7 +914,7 @@ alterKeyspaceStatement returns [shared_ptr<cql3::statements::alter_keyspace_stat
        auto attrs = make_shared<cql3::statements::ks_prop_defs>();
    }
    : K_ALTER K_KEYSPACE ks=keyspaceName
-        K_WITH properties[attrs] { $expr = make_shared<cql3::statements::alter_keyspace_statement>(ks, attrs); }
+        K_WITH properties[attrs] { $expr = ::make_shared<cql3::statements::alter_keyspace_statement>(ks, attrs); }
    ;

 /**
@@ -1243,16 +1243,16 @@ roleOption[cql3::role_options& opts]
 // identifiers because the underlying comparator is not necessarily text. See
 // CASSANDRA-8178 for details.
 cident returns [shared_ptr<cql3::column_identifier::raw> id]
-    : t=IDENT              { $id = make_shared<cql3::column_identifier::raw>(sstring{$t.text}, false); }
-    | t=QUOTED_NAME        { $id = make_shared<cql3::column_identifier::raw>(sstring{$t.text}, true); }
-    | k=unreserved_keyword { $id = make_shared<cql3::column_identifier::raw>(k, false); }
+    : t=IDENT              { $id = ::make_shared<cql3::column_identifier::raw>(sstring{$t.text}, false); }
+    | t=QUOTED_NAME        { $id = ::make_shared<cql3::column_identifier::raw>(sstring{$t.text}, true); }
+    | k=unreserved_keyword { $id = ::make_shared<cql3::column_identifier::raw>(k, false); }
    ;

 // Identifiers that do not refer to columns or where the comparator is known to be text
 ident returns [shared_ptr<cql3::column_identifier> id]
-    : t=IDENT              { $id = make_shared<cql3::column_identifier>(sstring{$t.text}, false); }
-    | t=QUOTED_NAME        { $id = make_shared<cql3::column_identifier>(sstring{$t.text}, true); }
-    | k=unreserved_keyword { $id = make_shared<cql3::column_identifier>(k, false); }
+    : t=IDENT              { $id = ::make_shared<cql3::column_identifier>(sstring{$t.text}, false); }
+    | t=QUOTED_NAME        { $id = ::make_shared<cql3::column_identifier>(sstring{$t.text}, true); }
+    | k=unreserved_keyword { $id = ::make_shared<cql3::column_identifier>(k, false); }
    ;

 // Keyspace & Column family names
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -55,8 +55,8 @@ abstract_marker::abstract_marker(int32_t bind_index, ::shared_ptr<column_specifi
    , _receiver{std::move(receiver)}
 { }

-void abstract_marker::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
-    bound_names->add(_bind_index, _receiver);
+void abstract_marker::collect_marker_specification(variable_specifications& bound_names) const {
+    bound_names.add(_bind_index, _receiver);
 }

 bool abstract_marker::contains_bind_marker() const {
@@ -67,7 +67,7 @@ abstract_marker::raw::raw(int32_t bind_index)
    : _bind_index{bind_index}
 { }

-::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver)
+::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const
 {
    if (receiver->type->is_collection()) {
        if (receiver->type->get_kind() == abstract_type::kind::list) {
@@ -87,7 +87,7 @@ abstract_marker::raw::raw(int32_t bind_index)
    return ::make_shared<constants::marker>(_bind_index, receiver);
 }

-assignment_testable::test_result abstract_marker::raw::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) {
+assignment_testable::test_result abstract_marker::raw::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
    return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
 }

@@ -104,7 +104,7 @@ abstract_marker::in_raw::in_raw(int32_t bind_index)
    return ::make_shared<column_specification>(receiver->ks_name, receiver->cf_name, in_name, list_type_impl::get_instance(receiver->type, false));
 }

-::shared_ptr<term> abstract_marker::in_raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) {
+::shared_ptr<term> abstract_marker::in_raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
    return ::make_shared<lists::marker>(_bind_index, make_in_receiver(receiver));
 }

--- a/cql3/abstract_marker.hh
+++ b/cql3/abstract_marker.hh
@@ -57,7 +57,7 @@ protected:
 public:
    abstract_marker(int32_t bind_index, ::shared_ptr<column_specification>&& receiver);

-    virtual void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) override;
+    virtual void collect_marker_specification(variable_specifications& bound_names) const override;

    virtual bool contains_bind_marker() const override;

@@ -70,9 +70,9 @@ public:
    public:
        raw(int32_t bind_index);

-        virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override;
+        virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;

-        virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override;
+        virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;

        virtual sstring to_string() const override;
    };
@@ -89,7 +89,7 @@ public:
    private:
        static ::shared_ptr<column_specification> make_in_receiver(::shared_ptr<column_specification> receiver);
    public:
-        virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) override;
+        virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
    };
 };

--- a/cql3/assignment_testable.hh
+++ b/cql3/assignment_testable.hh
@@ -99,7 +99,7 @@ public:
     * Most caller should just call the isAssignable() method on the result, though functions have a use for
     * testing "strong" equality to decide the most precise overload to pick when multiple could match.
     */
-    virtual test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) = 0;
+    virtual test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const = 0;

    // for error reporting
    virtual sstring assignment_testable_source_context() const = 0;
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -120,7 +120,7 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    return ttl;
 }

-void attributes::collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names) {
+void attributes::collect_marker_specification(variable_specifications& bound_names) const {
    if (_timestamp) {
        _timestamp->collect_marker_specification(bound_names);
    }
@@ -129,17 +129,17 @@ void attributes::collect_marker_specification(lw_shared_ptr<variable_specificati
    }
 }

-std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring& ks_name, const sstring& cf_name) {
+std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring& ks_name, const sstring& cf_name) const {
    auto ts = !timestamp ? ::shared_ptr<term>{} : timestamp->prepare(db, ks_name, timestamp_receiver(ks_name, cf_name));
    auto ttl = !time_to_live ? ::shared_ptr<term>{} : time_to_live->prepare(db, ks_name, time_to_live_receiver(ks_name, cf_name));
    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl)}};
 }

-::shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) {
+::shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
    return ::make_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timestamp]", true), data_type_for<int64_t>());
 }

-::shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) {
+::shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const {
    return ::make_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
 }

--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -69,18 +69,18 @@ public:

    int32_t get_time_to_live(const query_options& options);

-    void collect_marker_specification(lw_shared_ptr<variable_specifications> bound_names);
+    void collect_marker_specification(variable_specifications& bound_names) const;

-    class raw {
+    class raw final {
    public:
        ::shared_ptr<term::raw> timestamp;
        ::shared_ptr<term::raw> time_to_live;

-        std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name);
+        std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name) const;
    private:
-        ::shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name);
+        ::shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;

-        ::shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name);
+        ::shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
    };
 };

--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -47,6 +47,7 @@
 #include <boost/range/algorithm_ext/push_back.hpp>
 #include "types/map.hh"
 #include "types/list.hh"
+#include "utils/like_matcher.hh"

 namespace {

@@ -131,7 +132,7 @@ column_condition::uses_function(const sstring& ks_name, const sstring& function_
    return false;
 }

-void column_condition::collect_marker_specificaton(lw_shared_ptr<variable_specifications> bound_names) {
+void column_condition::collect_marker_specificaton(variable_specifications& bound_names) const {
    if (_collection_element) {
        _collection_element->collect_marker_specification(bound_names);
    }
@@ -152,7 +153,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
    // - a predicate can operate on a column or a collection element, which must always be
    // on the right side: "a = 3" or "collection['key'] IN (1,2,3)"
    // - parameter markers are allowed on the right hand side only
-    // - only <, >, >=, <=, != and IN predicates are supported.
+    // - only <, >, >=, <=, !=, LIKE, and IN predicates are supported.
    // - NULLs and missing values are treated differently from the WHERE clause:
    // a term or cell in IF clause is allowed to be NULL or compared with NULL,
    // and NULL value is treated just like any other value in the domain (there is no
@@ -245,6 +246,26 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        // directly to compare.
        return is_satisfied_by(_op, *cell_value->type(), *column.type, *cell_value, to_bytes(param));
    }
+
+    if (_op == operator_type::LIKE) {
+        if (cell_value == nullptr) {
+            return false;
+        }
+        if (_matcher) {
+            return (*_matcher)(bytes_view(cell_value->serialize_nonnull()));
+        } else {
+            auto param = _value->bind_and_get(options);  // LIKE pattern
+            if (param.is_unset_value()) {
+                throw exceptions::invalid_request_exception("Invalid 'unset' value in LIKE pattern");
+            }
+            if (param.is_null()) {
+                throw exceptions::invalid_request_exception("Invalid NULL value in LIKE pattern");
+            }
+            like_matcher matcher(to_bytes(param));
+            return matcher(bytes_view(cell_value->serialize_nonnull()));
+        }
+    }
+
    assert(_op == operator_type::IN);

    std::vector<bytes_opt> in_values;
@@ -266,12 +287,12 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
            return value.has_value() && is_satisfied_by(operator_type::EQ, *cell_value->type(), *column.type, *cell_value, *value);
        });
    } else {
-        return std::any_of(in_values.begin(), in_values.end(), [] (const bytes_opt& value) { return value.has_value() == false; });
+        return std::any_of(in_values.begin(), in_values.end(), [] (const bytes_opt& value) { return !value.has_value() || value->empty(); });
    }
 }

 ::shared_ptr<column_condition>
-column_condition::raw::prepare(database& db, const sstring& keyspace, const column_definition& receiver) {
+column_condition::raw::prepare(database& db, const sstring& keyspace, const column_definition& receiver) const {
    if (receiver.type->is_counter()) {
        throw exceptions::invalid_request_exception("Conditions on counters are not supported");
    }
@@ -287,12 +308,13 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
        // later be used to validate the parameter type is compatible with receiver type.
        shared_ptr<column_specification> element_spec;
        auto ctype = static_cast<const collection_type_impl*>(receiver.type.get());
+        const column_specification& recv_column_spec = *receiver.column_specification;
        if (ctype->get_kind() == abstract_type::kind::list) {
-            element_spec = lists::index_spec_of(receiver.column_specification);
-            value_spec = lists::value_spec_of(receiver.column_specification);
+            element_spec = lists::index_spec_of(recv_column_spec);
+            value_spec = lists::value_spec_of(recv_column_spec);
        } else if (ctype->get_kind() == abstract_type::kind::map) {
-            element_spec = maps::key_spec_of(*receiver.column_specification);
-            value_spec = maps::value_spec_of(*receiver.column_specification);
+            element_spec = maps::key_spec_of(recv_column_spec);
+            value_spec = maps::value_spec_of(recv_column_spec);
        } else if (ctype->get_kind() == abstract_type::kind::set) {
            throw exceptions::invalid_request_exception(format("Invalid element access syntax for set column {}",
                        receiver.name_as_text()));
@@ -305,8 +327,27 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu

    if (_op.is_compare()) {
        validate_operation_on_durations(*receiver.type, _op);
-        return column_condition::condition(receiver, collection_element_term, _value->prepare(db, keyspace, value_spec), _op);
+        return column_condition::condition(receiver, collection_element_term,
+                _value->prepare(db, keyspace, value_spec), nullptr, _op);
    }
+
+    if (_op == operator_type::LIKE) {
+        auto literal_term = dynamic_pointer_cast<constants::literal>(_value);
+        if (literal_term) {
+            // Pass matcher object
+            const sstring& pattern = literal_term->get_raw_text();
+            return column_condition::condition(receiver, collection_element_term,
+                    _value->prepare(db, keyspace, value_spec),
+                    std::make_unique<like_matcher>(bytes_view(reinterpret_cast<const int8_t*>(pattern.data()), pattern.size())),
+                    _op);
+        } else {
+            // Pass through rhs value, matcher object built on execution
+            // TODO: caller should validate parametrized LIKE pattern
+            return column_condition::condition(receiver, collection_element_term,
+                    _value->prepare(db, keyspace, value_spec), nullptr, _op);
+        }
+    }
+
    if (_op != operator_type::IN) {
        throw exceptions::invalid_request_exception(format("Unsupported operator type {} in a condition ", _op));
    }
--- a/cql3/column_condition.hh
+++ b/cql3/column_condition.hh
@@ -44,6 +44,7 @@
 #include "cql3/term.hh"
 #include "cql3/abstract_marker.hh"
 #include "cql3/operator.hh"
+#include "utils/like_matcher.hh"

 namespace cql3 {

@@ -65,14 +66,17 @@ private:
    ::shared_ptr<term> _value;
    // List of terminals for "a IN (value, value, ...)"
    std::vector<::shared_ptr<term>> _in_values;
+    const std::unique_ptr<like_matcher> _matcher;
    const operator_type& _op;
 public:
    column_condition(const column_definition& column, ::shared_ptr<term> collection_element,
-        ::shared_ptr<term> value, std::vector<::shared_ptr<term>> in_values, const operator_type& op)
+        ::shared_ptr<term> value, std::vector<::shared_ptr<term>> in_values,
+        std::unique_ptr<like_matcher> matcher, const operator_type& op)
            : column(column)
            , _collection_element(std::move(collection_element))
            , _value(std::move(value))
            , _in_values(std::move(in_values))
+            , _matcher(std::move(matcher))
            , _op(op)
    {
        if (op != operator_type::IN) {
@@ -85,7 +89,7 @@ public:
     * @param boundNames the list of column specification where to collect the
     * bind variables of this term in.
     */
-    void collect_marker_specificaton(lw_shared_ptr<variable_specifications> bound_names);
+    void collect_marker_specificaton(variable_specifications& bound_names) const;

    bool uses_function(const sstring& ks_name, const sstring& function_name) const;

@@ -94,18 +98,23 @@ public:
    // and evaluate the condition.
    bool applies_to(const data_value* cell_value, const query_options& options) const;

-    // Helper constructor wrapper for  "IF col['key'] = 'foo'" or "IF col = 'foo'" */
+    /**
+     * Helper constructor wrapper for
+     * "IF col['key'] = 'foo'"
+     * "IF col = 'foo'"
+     * "IF col LIKE <pattern>"
+     */
    static ::shared_ptr<column_condition> condition(const column_definition& def, ::shared_ptr<term> collection_element,
-            ::shared_ptr<term> value, const operator_type& op) {
+            ::shared_ptr<term> value, std::unique_ptr<like_matcher> matcher, const operator_type& op) {
        return ::make_shared<column_condition>(def, std::move(collection_element), std::move(value),
-            std::vector<::shared_ptr<term>>{}, op);
+            std::vector<::shared_ptr<term>>{}, std::move(matcher), op);
    }

    // Helper constructor wrapper for  "IF col IN ... and IF col['key'] IN ... */
    static ::shared_ptr<column_condition> in_condition(const column_definition& def, ::shared_ptr<term> collection_element,
            ::shared_ptr<term> in_marker, std::vector<::shared_ptr<term>> in_values) {
        return ::make_shared<column_condition>(def, std::move(collection_element), std::move(in_marker),
-            std::move(in_values), operator_type::IN);
+            std::move(in_values), nullptr, operator_type::IN);
    }

    class raw final {
@@ -130,7 +139,13 @@ public:
                , _op(op)
        { }

-        /** A condition on a column or collection element. For example: "IF col['key'] = 'foo'" or "IF col = 'foo'" */
+        /**
+         * A condition on a column or collection element.
+         * For example:
+         * "IF col['key'] = 'foo'"
+         * "IF col = 'foo'"
+         * "IF col LIKE 'foo%'"
+         */
        static ::shared_ptr<raw> simple_condition(::shared_ptr<term::raw> value, ::shared_ptr<term::raw> collection_element,
                const operator_type& op) {
            return ::make_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{},
@@ -151,7 +166,7 @@ public:
                    std::move(collection_element), operator_type::IN);
        }

-        ::shared_ptr<column_condition> prepare(database& db, const sstring& keyspace, const column_definition& receiver);
+        ::shared_ptr<column_condition> prepare(database& db, const sstring& keyspace, const column_definition& receiver) const;
    };
 };

--- a/cql3/column_identifier.cc
+++ b/cql3/column_identifier.cc
@@ -23,6 +23,7 @@
 #include "exceptions/exceptions.hh"
 #include "cql3/selection/simple_selector.hh"
 #include "cql3/util.hh"
+#include "cql3/query_options.hh"

 #include <regex>

@@ -79,13 +80,13 @@ column_identifier::raw::raw(sstring raw_text, bool keep_case)
    }
 }

-::shared_ptr<selection::selectable> column_identifier::raw::prepare(schema_ptr s) {
+::shared_ptr<selection::selectable> column_identifier::raw::prepare(const schema& s) const {
    return prepare_column_identifier(s);
 }

 ::shared_ptr<column_identifier>
-column_identifier::raw::prepare_column_identifier(schema_ptr schema) {
-    if (schema->regular_column_name_type() == utf8_type) {
+column_identifier::raw::prepare_column_identifier(const schema& schema) const {
+    if (schema.regular_column_name_type() == utf8_type) {
        return ::make_shared<column_identifier>(_text, true);
    }

@@ -93,12 +94,12 @@ column_identifier::raw::prepare_column_identifier(schema_ptr schema) {
    // to get the correct ByteBuffer representation.  However, this doesn't apply to key aliases, so we need to
    // make a special check for those and treat them normally.  See CASSANDRA-8178.
    auto text_bytes = to_bytes(_text);
-    auto def = schema->get_column_definition(text_bytes);
+    auto def = schema.get_column_definition(text_bytes);
    if (def) {
        return ::make_shared<column_identifier>(std::move(text_bytes), _text);
    }

-    return ::make_shared<column_identifier>(schema->regular_column_name_type()->from_string(_raw_text), _text);
+    return ::make_shared<column_identifier>(schema.regular_column_name_type()->from_string(_raw_text), _text);
 }

 bool column_identifier::raw::processes_selection() const {
@@ -123,7 +124,7 @@ std::ostream& operator<<(std::ostream& out, const column_identifier::raw& id) {

 ::shared_ptr<selection::selector::factory>
 column_identifier::new_selector_factory(database& db, schema_ptr schema, std::vector<const column_definition*>& defs) {
-    auto def = get_column_definition(schema, *this);
+    auto def = get_column_definition(*schema, *this);
    if (!def) {
        throw exceptions::invalid_request_exception(format("Undefined name {} in selection clause", _text));
    }
--- a/cql3/column_identifier.hh
+++ b/cql3/column_identifier.hh
@@ -43,7 +43,7 @@

 #include "cql3/selection/selectable.hh"

-#include "schema.hh"
+#include "schema_fwd.hh"

 #include <algorithm>
 #include <functional>
@@ -112,9 +112,9 @@ private:
 public:
    raw(sstring raw_text, bool keep_case);

-    virtual ::shared_ptr<selectable> prepare(schema_ptr s) override;
+    virtual ::shared_ptr<selectable> prepare(const schema& s) const override;

-    ::shared_ptr<column_identifier> prepare_column_identifier(schema_ptr s);
+    ::shared_ptr<column_identifier> prepare_column_identifier(const schema& s) const;

    virtual bool processes_selection() const override;

@@ -130,8 +130,8 @@ public:
 };

 static inline
-const column_definition* get_column_definition(schema_ptr schema, const column_identifier& id) {
-    return schema->get_column_definition(id.bytes_);
+const column_definition* get_column_definition(const schema& schema, const column_identifier& id) {
+    return schema.get_column_definition(id.bytes_);
 }

 static inline
--- a/Show More
+++ b/Show More