release: prepare for 4.1.11

compaction: compaction_writer: destroy shared_sstable after the sstable_writer
sstable_writer may depend on the sstable throughout its whole lifecycle. If the sstable is freed before the sstable_writer we might hit use-after-free as in the following case: ``` std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator+=(long) at /usr/include/c++/10/bits/stl_deque.h:240 (inlined by) std::operator+(std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*> const&, long) at /usr/include/c++/10/bits/stl_deque.h:378 (inlined by) std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator[](long) const at /usr/include/c++/10/bits/stl_deque.h:252 (inlined by) std::deque<sstables::compression::segmented_offsets::bucket, std::allocator<sstables::compression::segmented_offsets::bucket> >::operator[](unsigned long) at /usr/include/c++/10/bits/stl_deque.h:1327 (inlined by) sstables::compression::segmented_offsets::push_back(unsigned long, sstables::compression::segmented_offsets::state&) at ./sstables/compress.cc:214 sstables::compression::segmented_offsets::writer::push_back(unsigned long) at ./sstables/compress.hh:123 (inlined by) compressed_file_data_sink_impl<crc32_utils, (compressed_checksum_mode)1>::put(seastar::temporary_buffer<char>) at ./sstables/compress.cc:519 seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at table.cc:? (inlined by) seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at ././seastar/include/seastar/core/iostream-impl.hh:432 seastar::output_stream<char>::flush() at table.cc:? seastar::output_stream<char>::close() at table.cc:? sstables::file_writer::close() at sstables.cc:? sstables::mc::writer::~writer() at writer.cc:? 
(inlined by) sstables::mc::writer::~writer() at ./sstables/mx/writer.cc:790 sstables::mc::writer::~writer() at writer.cc:? flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at compaction.cc:? (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_destroy() at /usr/include/c++/10/optional:260 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_reset() at /usr/include/c++/10/optional:280 (inlined by) std::_Optional_payload<sstables::compaction_writer, false, false, false>::~_Optional_payload() at /usr/include/c++/10/optional:401 (inlined by) std::_Optional_base<sstables::compaction_writer, false, false>::~_Optional_base() at /usr/include/c++/10/optional:474 (inlined by) std::optional<sstables::compaction_writer>::~optional() at /usr/include/c++/10/optional:659 (inlined by) sstables::compacting_sstable_writer::~compacting_sstable_writer() at ./sstables/compaction.cc:229 (inlined by) compact_mutation<(emit_only_live_rows)0, (compact_for_sstables)1, sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_mutation() at ././mutation_compactor.hh:468 (inlined by) compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_for_compaction() at ././mutation_compactor.hh:538 (inlined by) std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::operator()(compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>*) const at /usr/include/c++/10/bits/unique_ptr.h:85 (inlined by) std::unique_ptr<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>, std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~unique_ptr() at 
/usr/include/c++/10/bits/unique_ptr.h:361 (inlined by) stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::~stable_flattened_mutations_consumer() at ././mutation_reader.hh:342 (inlined by) flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at ././flat_mutation_reader.hh:201 auto flat_mutation_reader::impl::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:272 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:383 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at 
././flat_mutation_reader.hh:389 (inlined by) seastar::future<void> sstables::compaction::setup<noop_compacted_fragments_consumer>(noop_compacted_fragments_consumer)::{lambda(flat_mutation_reader)#1}::operator()(flat_mutation_reader)::{lambda()#1}::operator()() at ./sstables/compaction.cc:612 ``` What happens here is that: compressed_file_data_sink_impl(output_stream<char> out, sstables::compression* cm, sstables::local_compression lc) : _out(std::move(out)) , _compression_metadata(cm) , _offsets(_compression_metadata->offsets.get_writer()) , _compression(lc) , _full_checksum(ChecksumType::init_checksum()) _compression_metadata points to a buffer held by the sstable object. and _compression_metadata->offsets.get_writer returns a writer that keeps a reference to the segmented_offsets in the sstables::compression that is used in the ~writer -> close path. Fixes #7821 Signed-off-by: Benny Halevy <bhalevy@scylladb.com> Message-Id: <20201227145726.33319-1-bhalevy@scylladb.com> (cherry picked from commit 8a745a0ee0)
2021-01-05 10:13:34 +02:00 · 2021-01-04 15:12:33 +02:00 · 2020-12-24 12:42:42 +02:00 · 2020-12-16 17:20:32 +02:00 · 2020-12-16 11:59:12 +02:00 · 2020-12-15 16:52:38 +02:00
5654 changed files with 109309 additions and 40201 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,4 @@
 .git
 build
 seastar/build
+testlog
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,8 @@ CMakeLists.txt.user
 __pycache__CMakeLists.txt.user
 .gdbinit
 resources
+.pytest_cache
+/expressions.tokens
+tags
+testlog/*
+test/*/*.reject
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,17 +1,17 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
 	url = ../scylla-swagger-ui
 	ignore = dirty
-[submodule "xxHash"]
-	path = xxHash
-	url = ../xxHash
 [submodule "libdeflate"]
 	path = libdeflate
 	url = ../libdeflate
 [submodule "zstd"]
 	path = zstd
 	url = ../zstd
+[submodule "abseil"]
+	path = abseil
+	url = ../abseil-cpp
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,13 +5,25 @@
 cmake_minimum_required(VERSION 3.7)
 project(scylla)

+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "Dev" "Sanitize")
+endif()
+
+if(CMAKE_BUILD_TYPE)
+    string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE)
+else()
+    set(BUILD_TYPE "release")
+endif()
+
 if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
 endif()

-# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-set(SEASTAR_INCLUDE_DIRS "seastar")
-
 # These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
 # Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
 set(SEASTAR_DPDK_INCLUDE_DIRS
@@ -22,9 +34,14 @@ set(SEASTAR_DPDK_INCLUDE_DIRS

 find_package(PkgConfig REQUIRED)

-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
+set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
 pkg_check_modules(SEASTAR seastar)

+if(NOT SEASTAR_INCLUDE_DIRS)
+    # Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
+    set(SEASTAR_INCLUDE_DIRS "seastar/include")
+endif()
+
 find_package(Boost COMPONENTS filesystem program_options system thread)

 ##
@@ -70,7 +87,7 @@ scan_scylla_source_directories(
          seastar/json
          seastar/net
          seastar/rpc
-          seastar/tests
+          seastar/testing
          seastar/util)

 scan_scylla_source_directories(
@@ -97,7 +114,7 @@ scan_scylla_source_directories(
          service
          sstables
          streaming
-          tests
+          test
          thrift
          tracing
          transport
@@ -106,7 +123,7 @@ scan_scylla_source_directories(
 scan_scylla_source_directories(
        VAR SCYLLA_GEN_SOURCE_FILES
        RECURSIVE
-        PATHS build/release/gen)
+        PATHS build/${BUILD_TYPE}/gen)

 set(SCYLLA_SOURCE_FILES
        ${SCYLLA_ROOT_SOURCE_FILES}
@@ -139,4 +156,4 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
-        build/release/gen)
+        build/${BUILD_TYPE}/gen)
--- a/HACKING.md
+++ b/HACKING.md
@@ -56,7 +56,7 @@ $ ./configure.py --help

 The most important option is:

- `--{enable,disable}-dpdk`: [DPDK](http://dpdk.org/) is a set of libraries and drivers for fast packet processing. During development, it's not necessary to enable support even if it is supported by your platform.
+- `--enable-dpdk`: [DPDK](http://dpdk.org/) is a set of libraries and drivers for fast packet processing. During development, it's not necessary to enable support even if it is supported by your platform.

 Source files and build targets are tracked manually in `configure.py`, so the script needs to be updated when new files or targets are added or removed.

@@ -141,7 +141,7 @@ In v3:
 "Tests: unit ({mode}), dtest ({smp})"
 ```

-The usual is "Tests: unit (release)", although running debug tests is encouraged.
+The usual is "Tests: unit (dev)", although running debug tests is encouraged.

 5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.

--- a/31
+++ b/31
@@ -5,8 +5,6 @@ F: Filename, directory, or pattern for the subsystem
 ---

 AUTH
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 R: Vlad Zolotarov <vladz@scylladb.com>
 R: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
@@ -14,22 +12,17 @@ F: auth/*

 CACHE
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 R: Piotr Jastrzebski <piotr@scylladb.com>
 F: row_cache*
 F: *mutation*
 F: tests/mvcc*

 COMMITLOG / BATCHLOGa
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Calle Wilund <calle@scylladb.com>
 F: db/commitlog/*
 F: db/batch*

 COORDINATOR
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Gleb Natapov <gleb@scylladb.com>
 F: service/storage_proxy*

@@ -49,12 +42,10 @@ M: Pekka Enberg <penberg@scylladb.com>
 F: cql3/*

 COUNTERS
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: counters*
 F: tests/counter_test*

 GOSSIP
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: gms/*
@@ -65,14 +56,11 @@ F: dist/docker/*

 LSA
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
 F: utils/logalloc*

 MATERIALIZED VIEWS
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-R: Duarte Nunes <duarte@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: db/view/*
 F: cql3/statements/*view*

@@ -82,14 +70,12 @@ F: dist/*

 REPAIR
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
 F: repair/*

 SCHEMA MANAGEMENT
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Pekka Enberg <penberg@scylladb.com>
 F: db/schema_tables*
 F: db/legacy_schema_migrator*
@@ -98,15 +84,13 @@ F: schema*

 SECONDARY INDEXES
 M: Pekka Enberg <penberg@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 R: Pekka Enberg <penberg@scylladb.com>
 F: db/index/*
 F: cql3/statements/*index*

 SSTABLES
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Raphael S. Carvalho <raphaelsc@scylladb.com>
 R: Glauber Costa <glauber@scylladb.com>
 R: Nadav Har'El <nyh@scylladb.com>
@@ -114,18 +98,17 @@ F: sstables/*

 STREAMING
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 R: Asias He <asias@scylladb.com>
 F: streaming/*
 F: service/storage_service.*

-THRIFT TRANSPORT LAYER
-M: Duarte Nunes <duarte@scylladb.com>
-F: thrift/*
+ALTERNATOR
+M: Nadav Har'El <nyh@scylladb.com>
+F: alternator/*
+F: alternator-test/*

 THE REST
 M: Avi Kivity <avi@scylladb.com>
-M: Paweł Dziepak <pdziepak@scylladb.com>
-M: Duarte Nunes <duarte@scylladb.com>
 M: Tomasz Grabiec <tgrabiec@scylladb.com>
+M: Nadav Har'El <nyh@scylladb.com>
 F: *
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1,5 +1,7 @@
 This project includes code developed by the Apache Software Foundation (http://www.apache.org/),
 especially Apache Cassandra.

-It also includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
+It includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
 These files are located in utils/arch/powerpc/crc32-vpmsum. Their license may be found in licenses/LICENSE-crc32-vpmsum.TXT.
+
+It includes modified code from https://gitbox.apache.org/repos/asf?p=cassandra-dtest.git (owned by The Apache Software Foundation)
--- a/README-DPDK.md
+++ b/README-DPDK.md
@@ -1,29 +0,0 @@
-Seastar and DPDK
-================
-
-Seastar uses the Data Plane Development Kit to drive NIC hardware directly.  This
-provides an enormous performance boost.
-
-To enable DPDK, specify `--enable-dpdk` to `./configure.py`, and `--dpdk-pmd` as a
-run-time parameter.  This will use the DPDK package provided as a git submodule with the
-seastar sources.
-
-To use your own self-compiled DPDK package, follow this procedure:
-
-1. Setup host to compile DPDK:
-   - Ubuntu 
-     `sudo apt-get install -y build-essential linux-image-extra-$(uname -r)` 
-2. Prepare a DPDK SDK:
-   - Download the latest DPDK release: `wget http://dpdk.org/browse/dpdk/snapshot/dpdk-1.8.0.tar.gz`
-   - Untar it.
-   - Edit config/common_linuxapp: set CONFIG_RTE_MBUF_REFCNT and CONFIG_RTE_LIBRTE_KNI to 'n'.
-   - For DPDK 1.7.x: edit config/common_linuxapp: 
-     - Set CONFIG_RTE_LIBRTE_PMD_BOND  to 'n'.
-     - Set CONFIG_RTE_MBUF_SCATTER_GATHER to 'n'.
-     - Set CONFIG_RTE_LIBRTE_IP_FRAG to 'n'.
-   - Start the tools/setup.sh script as root.
-   - Compile a linuxapp target (option 9).
-   - Install IGB_UIO module (option 11).
-   - Bind some physical port to IGB_UIO (option 17).
-   - Configure hugepage mappings (option 14/15).
-3. Run a configure.py: `./configure.py --dpdk-target <Path to untared dpdk-1.8.0 above>/x86_64-native-linuxapp-gcc`.
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev

 ```

-* run Scylla with one CPU and ./tmp as data directory
+* run Scylla with one CPU and ./tmp as work directory

 ```
-./build/release/scylla --datadir tmp --commitlog-directory tmp --smp 1
+./build/release/scylla --workdir tmp --smp 1
 ```

 * For more run options:
@@ -38,31 +38,34 @@ Please see [HACKING.md](HACKING.md) for detailed information on building and dev
 ./build/release/scylla --help
 ```

-## Building Fedora RPM
+## Testing

-As a pre-requisite, you need to install [Mock](https://fedoraproject.org/wiki/Mock) on your machine:
+See [test.py manual](docs/testing.md).

-```
-# Install mock:
-sudo yum install mock
+## Scylla APIs and compatibility
+By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
+Thrift. There is also experimental support for the API of Amazon DynamoDB,
+but being experimental it needs to be explicitly enabled to be used. For more
+information on how to enable the experimental DynamoDB compatibility in Scylla,
+and the current limitations of this feature, see
+[Alternator](docs/alternator/alternator.md) and
+[Getting started with Alternator](docs/alternator/getting-started.md).

-# Add user to the "mock" group:
-usermod -a -G mock $USER && newgrp mock
-```
+## Documentation

-Then, to build an RPM, run:
+Documentation can be found in [./docs](./docs) and on the
+[wiki](https://github.com/scylladb/scylla/wiki). There is currently no clear
+definition of what goes where, so when looking for something be sure to check
+both.
+Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
+User documentation can be found [here](https://docs.scylladb.com/).

-```
-./dist/redhat/build_rpm.sh
-```
+## Training 

-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
-For example, on Fedora 21 mock reports the following:
-
-```
-INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
-```
+Training material and online courses can be found at [Scylla University](https://university.scylladb.com/). 
+The courses are free, self-paced and include hands-on examples. They cover a variety of topics including Scylla data modeling, 
+administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
+multi-datacenters and how Scylla integrates with third-party applications.

 ## Building Fedora-based Docker image

--- a/10
+++ b/10
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.1.11

 if test -f version
 then
@@ -19,6 +19,14 @@ else
 	SCYLLA_RELEASE=$SCYLLA_BUILD.$DATE.$GIT_COMMIT
 fi

+if [ -f build/SCYLLA-RELEASE-FILE ]; then
+	RELEASE_FILE=$(cat build/SCYLLA-RELEASE-FILE)
+	GIT_COMMIT_FILE=$(cat build/SCYLLA-RELEASE-FILE |cut -d . -f 3)
+	if [ "$GIT_COMMIT" = "$GIT_COMMIT_FILE" ]; then
+		exit 0
+	fi
+fi
+
 echo "$SCYLLA_VERSION-$SCYLLA_RELEASE"
 mkdir -p build
 echo "$SCYLLA_VERSION" > build/SCYLLA-VERSION-FILE
--- a/1
+++ b/1
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "alternator/error.hh"
+#include "log.hh"
+#include <string>
+#include <string_view>
+#include <gnutls/crypto.h>
+#include <seastar/util/defer.hh>
+#include "hashers.hh"
+#include "bytes.hh"
+#include "alternator/auth.hh"
+#include <fmt/format.h>
+#include "auth/common.hh"
+#include "auth/password_authenticator.hh"
+#include "auth/roles-metadata.hh"
+#include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
+
+namespace alternator {
+
+static logging::logger alogger("alternator-auth");
+
+// Computes the HMAC-SHA256 of `msg` keyed with `key` by calling
+// gnutls_hmac_fast() and returns the resulting digest.
+// Throws std::runtime_error, carrying the gnutls return code and its
+// gnutls_strerror() text, when gnutls reports a non-zero result.
+static hmac_sha256_digest hmac_sha256(std::string_view key, std::string_view msg) {
+    hmac_sha256_digest mac;
+    const int rc = gnutls_hmac_fast(GNUTLS_MAC_SHA256, key.data(), key.size(), msg.data(), msg.size(), mac.data());
+    if (rc != 0) {
+        throw std::runtime_error(fmt::format("Computing HMAC failed ({}): {}", rc, gnutls_strerror(rc)));
+    }
+    return mac;
+}
+
+// Derives the signing key by chaining hmac_sha256() four times: the key
+// "AWS4" + `key` is applied to `date_stamp`, and each resulting digest then
+// keys the next round over `region_name`, `service_name` and finally the
+// literal "aws4_request". The digest of the last round is returned.
+static hmac_sha256_digest get_signature_key(std::string_view key, std::string_view date_stamp, std::string_view region_name, std::string_view service_name) {
+    // Views a digest as the key material for the next HMAC round.
+    const auto as_key = [] (const hmac_sha256_digest& d) {
+        return std::string_view(d.data(), d.size());
+    };
+    std::string initial_key("AWS4");
+    initial_key += key;
+    const hmac_sha256_digest k_date = hmac_sha256(initial_key, date_stamp);
+    const hmac_sha256_digest k_region = hmac_sha256(as_key(k_date), region_name);
+    const hmac_sha256_digest k_service = hmac_sha256(as_key(k_region), service_name);
+    return hmac_sha256(as_key(k_service), "aws4_request");
+}
+
+// Feeds `msg` to a sha256_hasher and returns to_hex() of the finalized
+// digest.
+static std::string apply_sha256(std::string_view msg) {
+    sha256_hasher sha;
+    sha.update(msg.data(), msg.size());
+    const auto digest = sha.finalize();
+    return to_hex(digest);
+}
+
+// Formats `tp` with the strftime pattern "%Y%m%dT%H%M%SZ" after breaking it
+// down with gmtime_r(), yielding a 16-character string.
+static std::string format_time_point(db_clock::time_point tp) {
+    // The formatted text is 16 characters long; strftime additionally writes
+    // the terminating null character, so the buffer needs one extra byte.
+    constexpr size_t formatted_len = 16;
+    const time_t secs = db_clock::to_time_t(tp);
+    ::tm broken_down;
+    std::string out(formatted_len + 1, '\0');
+    std::strftime(out.data(), out.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&secs, &broken_down));
+    // Drop the byte that held the terminating null character.
+    out.resize(formatted_len);
+    return out;
+}
+
+// Checks that `signature_date` is no more than 15 minutes away from the
+// current db_clock time in either direction. The check is a string
+// comparison against the two bounds as rendered by format_time_point().
+// Throws api_error("InvalidSignatureException") when the date falls outside
+// that window.
+void check_expiry(std::string_view signature_date) {
+    //FIXME: The default 15min can be changed with X-Amz-Expires header - we should honor it
+    const std::string oldest_accepted = format_time_point(db_clock::now() - 15min);
+    const std::string newest_accepted = format_time_point(db_clock::now() + 15min);
+    const bool too_old = signature_date < oldest_accepted;
+    if (too_old) {
+        throw api_error("InvalidSignatureException",
+                fmt::format("Signature expired: {} is now earlier than {} (current time - 15 min.)",
+                signature_date, oldest_accepted));
+    }
+    const bool too_new = signature_date > newest_accepted;
+    if (too_new) {
+        throw api_error("InvalidSignatureException",
+                fmt::format("Signature not yet current: {} is still later than {} (current time + 15 min.)",
+                signature_date, newest_accepted));
+    }
+}
+
+// Computes the hex-encoded AWS4-HMAC-SHA256 signature of a request.
+//
+// Parameters:
+//   access_key_id, host - accepted for the caller's convenience; not read
+//                         by this function.
+//   secret_access_key   - secret from which the signing key is derived.
+//   method              - HTTP method, first line of the canonical request.
+//   orig_datestamp      - date stamp the caller expects; must equal the
+//                         first 8 characters of the X-Amz-Date header.
+//   signed_headers_str  - signed header list as sent by the client.
+//   signed_headers_map  - signed header names mapped to their values; must
+//                         contain "x-amz-date".
+//   body_content        - request body, hashed with SHA-256.
+//   region, service     - components of the credential scope.
+//   query_string        - query string line of the canonical request.
+//
+// Throws api_error("InvalidSignatureException") when the X-Amz-Date header
+// is missing, fails check_expiry(), or disagrees with `orig_datestamp`.
+std::string get_signature(std::string_view access_key_id, std::string_view secret_access_key, std::string_view host, std::string_view method,
+        std::string_view orig_datestamp, std::string_view signed_headers_str, const std::map<std::string_view, std::string_view>& signed_headers_map,
+        std::string_view body_content, std::string_view region, std::string_view service, std::string_view query_string) {
+    const auto amz_date_it = signed_headers_map.find("x-amz-date");
+    if (amz_date_it == signed_headers_map.end()) {
+        throw api_error("InvalidSignatureException", "X-Amz-Date header is mandatory for signature verification");
+    }
+    const std::string_view amz_date = amz_date_it->second;
+    check_expiry(amz_date);
+    // The date stamp is the leading 8 characters of the X-Amz-Date value.
+    const std::string_view datestamp = amz_date.substr(0, 8);
+    if (datestamp != orig_datestamp) {
+        throw api_error("InvalidSignatureException",
+                fmt::format("X-Amz-Date date does not match the provided datestamp. Expected {}, got {}",
+                        orig_datestamp, datestamp));
+    }
+    const std::string_view canonical_uri = "/";
+
+    // One "name:value" line per signed header, in the map's iteration order.
+    std::string canonical_headers;
+    for (const auto& header : signed_headers_map) {
+        canonical_headers += fmt::format("{}:{}\n", header.first, header.second);
+    }
+
+    const std::string payload_hash = apply_sha256(body_content);
+    const std::string canonical_request = fmt::format("{}\n{}\n{}\n{}\n{}\n{}", method, canonical_uri, query_string, canonical_headers, signed_headers_str, payload_hash);
+
+    const std::string_view algorithm = "AWS4-HMAC-SHA256";
+    const std::string credential_scope = fmt::format("{}/{}/{}/aws4_request", datestamp, region, service);
+    const std::string string_to_sign = fmt::format("{}\n{}\n{}\n{}", algorithm, amz_date, credential_scope,  apply_sha256(canonical_request));
+
+    const hmac_sha256_digest signing_key = get_signature_key(secret_access_key, datestamp, region, service);
+    const hmac_sha256_digest signature = hmac_sha256(std::string_view(signing_key.data(), signing_key.size()), string_to_sign);
+
+    return to_hex(bytes_view(reinterpret_cast<const int8_t*>(signature.data()), signature.size()));
+}
+
+// Looks up the `salted_hash` column for `username` in the roles table and
+// resolves to its value.
+// The returned future fails with api_error("UnrecognizedClientException")
+// when the query returns no row or the row has no salted_hash value.
+future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string username) {
+    static const sstring query = format("SELECT salted_hash FROM {} WHERE {} = ?",
+            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
+
+    auto cl = auth::password_authenticator::consistency_for_user(username);
+    auto& timeout = auth::internal_distributed_timeout_config();
+    // `username` is copied into the query values before the continuation's
+    // capture moves it, so the call must stay a single chained expression.
+    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
+        auto rows = f.get0();
+        if (rows->empty()) {
+            throw api_error("UnrecognizedClientException", fmt::format("User not found: {}", username));
+        }
+        std::optional<sstring> salted_hash = rows->one().get_opt<sstring>("salted_hash");
+        if (!salted_hash) {
+            throw api_error("UnrecognizedClientException", fmt::format("No password found for user: {}", username));
+        }
+        return make_ready_future<std::string>(*salted_hash);
+    });
+}
+
+}
--- a/alternator/auth.hh
+++ b/alternator/auth.hh
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+#include <array>
+#include "gc_clock.hh"
+#include "utils/loading_cache.hh"
+
+namespace cql3 {
+class query_processor;
+}
+
+namespace alternator {
+
+using hmac_sha256_digest = std::array<char, 32>;
+
+using key_cache = utils::loading_cache<std::string, std::string>;
+
+std::string get_signature(std::string_view access_key_id, std::string_view secret_access_key, std::string_view host, std::string_view method,
+        std::string_view orig_datestamp, std::string_view signed_headers_str, const std::map<std::string_view, std::string_view>& signed_headers_map,
+        std::string_view body_content, std::string_view region, std::string_view service, std::string_view query_string);
+
+future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string username);
+
+}
--- a/alternator/base64.cc
+++ b/alternator/base64.cc
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// The DynamoAPI dictates that "binary" (a.k.a. "bytes" or "blob") values
+// be encoded in the JSON API as base64-encoded strings. This is code to
+// convert byte arrays to base64-encoded strings, and back.
+
+#include "base64.hh"
+
+#include <ctype.h>
+
+
+// Arrays for quickly converting to and from an integer between 0 and 63,
+// and the character used in base64 encoding to represent it.
+//
+// `to` maps a 6-bit value to its base64 character. `from` maps a byte value
+// back to its 6-bit value; entries for bytes that are not part of the base64
+// alphabet hold an "invalid" marker that reads back as 255 through uint8_t.
+// `from` has one entry for each of the 256 possible unsigned char values, so
+// it can safely be indexed with any input byte, including 0xFF.
+static class base64_chars {
+public:
+    static constexpr const char* to =
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    int8_t from[256];
+    base64_chars() {
+        static_assert(strlen(to) == 64);
+        for (int i = 0; i < 256; i++) {
+            from[i] = 255; // signal invalid character
+        }
+        for (int i = 0; i < 64; i++) {
+            from[(unsigned) to[i]] = i;
+        }
+    }
+} base64_chars;
+
+// Encodes the bytes of `in` as base64 text. Every 3 input bytes become 4
+// output characters; a trailing group of 1 or 2 bytes is padded with '='
+// so the output length is always a multiple of 4.
+std::string base64_encode(bytes_view in) {
+    const char* const alphabet = base64_chars.to;
+    std::string encoded;
+    encoded.reserve(((4 * in.size() / 3) + 3) & ~3);
+    unsigned char group[3]; // input bytes of the group being assembled
+    int filled = 0;
+    // The four 6-bit alphabet indices spanned by a 3-byte group.
+    const auto sextet0 = [&group] { return (group[0] & 0xfc) >> 2; };
+    const auto sextet1 = [&group] { return ((group[0] & 0x03) << 4) + ((group[1] & 0xf0) >> 4); };
+    const auto sextet2 = [&group] { return ((group[1] & 0x0f) << 2) + ((group[2] & 0xc0) >> 6); };
+    const auto sextet3 = [&group] { return group[2] & 0x3f; };
+    for (auto byte : in) {
+        group[filled++] = byte;
+        if (filled < 3) {
+            continue;
+        }
+        encoded += alphabet[sextet0()];
+        encoded += alphabet[sextet1()];
+        encoded += alphabet[sextet2()];
+        encoded += alphabet[sextet3()];
+        filled = 0;
+    }
+    if (filled == 0) {
+        return encoded;
+    }
+    // 1 or 2 bytes are left over: zero-fill the rest of the group, emit the
+    // characters that carry real data and pad the remainder with '='.
+    for (int j = filled; j < 3; j++) {
+        group[j] = '\0';
+    }
+    encoded += alphabet[sextet0()];
+    encoded += alphabet[sextet1()];
+    if (filled == 2) {
+        encoded += alphabet[sextet2()];
+    } else {
+        encoded += '=';
+    }
+    encoded += '=';
+    return encoded;
+}
+
+// Decodes base64 text into bytes. Decoding stops at the first character
+// that is not part of the base64 alphabet, which includes the '=' padding
+// character. A trailing partial group of 2 or 3 characters yields 1 or 2
+// bytes; a single leftover character yields nothing.
+bytes base64_decode(std::string_view in) {
+    int8_t quad[4]; // characters of the current group, each converted to 0..63
+    int filled = 0;
+    std::string decoded;
+    decoded.reserve(in.size() * 3 / 4);
+    // The three output bytes spanned by a 4-character group.
+    const auto byte0 = [&quad] { return (quad[0] << 2) + ((quad[1] & 0x30) >> 4); };
+    const auto byte1 = [&quad] { return ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2); };
+    const auto byte2 = [&quad] { return ((quad[2] & 0x3) << 6) + quad[3]; };
+    for (unsigned char c : in) {
+        const uint8_t value = base64_chars.from[c];
+        if (value == 255) {
+            // Any unexpected character, include the "=" character usually
+            // used for padding, signals the end of the decode.
+            break;
+        }
+        quad[filled++] = value;
+        if (filled != 4) {
+            continue;
+        }
+        decoded += byte0();
+        decoded += byte1();
+        decoded += byte2();
+        filled = 0;
+    }
+    // A leftover of 2 or 3 characters means 1 or 2 more output bytes.
+    if (filled >= 2) {
+        decoded += byte0();
+    }
+    if (filled == 3) {
+        decoded += byte1();
+    }
+    // FIXME: This copy is sad. The problem is we need back "bytes"
+    // but "bytes" doesn't have efficient append and std::string.
+    // To fix this we need to use bytes' "uninitialized" feature.
+    return bytes(decoded.begin(), decoded.end());
+}
--- a/alternator/base64.hh
+++ b/alternator/base64.hh
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string_view>
+#include "bytes.hh"
+#include "rjson.hh"
+
+std::string base64_encode(bytes_view); // Encodes raw bytes as base64 text.
+
+bytes base64_decode(std::string_view); // Decodes base64 text into raw bytes.
+
+inline bytes base64_decode(const rjson::value& v) { // Convenience overload: decodes the string held by a JSON value.
+  return base64_decode(std::string_view(v.GetString(), v.GetStringLength())); // presumably requires v to be a JSON string - verify callers check this
+}
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <list>
+#include <map>
+#include <string_view>
+#include "alternator/conditions.hh"
+#include "alternator/error.hh"
+#include "cql3/constants.hh"
+#include <unordered_map>
+#include "rjson.hh"
+#include "serialization.hh"
+#include "base64.hh"
+#include <stdexcept>
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include "utils/overloaded_functor.hh"
+
+#include "expressions_eval.hh"
+
+namespace alternator {
+
+static logging::logger clogger("alternator-conditions"); // Logger used by the condition-evaluation code in this file.
+
+comparison_operator_type get_comparison_operator(const rjson::value& comparison_operator) {
+    if (!comparison_operator.IsString()) { // only a JSON string can name an operator
+        throw api_error("ValidationException", format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
+    }
+    static std::unordered_map<std::string, comparison_operator_type> known_operators = { // request spelling -> enumerator
+            {"BEGINS_WITH", comparison_operator_type::BEGINS_WITH},
+            {"BETWEEN", comparison_operator_type::BETWEEN},
+            {"CONTAINS", comparison_operator_type::CONTAINS},
+            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
+            {"IN", comparison_operator_type::IN},
+            {"NULL", comparison_operator_type::IS_NULL},
+            {"NOT_NULL", comparison_operator_type::NOT_NULL},
+            {"EQ", comparison_operator_type::EQ},
+            {"NE", comparison_operator_type::NE},
+            {"LT", comparison_operator_type::LT},
+            {"LE", comparison_operator_type::LE},
+            {"GT", comparison_operator_type::GT},
+            {"GE", comparison_operator_type::GE},
+    };
+    std::string requested = comparison_operator.GetString();
+    auto match = known_operators.find(requested);
+    if (match != known_operators.end()) {
+        return match->second;
+    }
+    throw api_error("ValidationException", format("Unsupported comparison operator {}", requested));
+}
+
+static ::shared_ptr<cql3::restrictions::single_column_restriction::contains> make_map_element_restriction(const column_definition& cdef, std::string_view key, const rjson::value& value) {
+    // Builds a `contains` restriction on cdef from a (key, entry) pair: the key
+    // term is `key` converted by utf8_type, the entry term is serialize_item(value).
+    auto key_term = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(utf8_type->from_string(sstring_view(key.data(), key.size()))));
+    auto entry_term = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(serialize_item(value)));
+    return make_shared<cql3::restrictions::single_column_restriction::contains>(cdef, std::move(key_term), std::move(entry_term));
+}
+
+static ::shared_ptr<cql3::restrictions::single_column_restriction::EQ> make_key_eq_restriction(const column_definition& cdef, const rjson::value& value) {
+    // Builds an EQ restriction on cdef whose operand is get_key_from_typed_value(value, cdef).
+    auto key_term = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(get_key_from_typed_value(value, cdef)));
+    return make_shared<cql3::restrictions::single_column_restriction::EQ>(cdef, std::move(key_term));
+}
+
+::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter) {
+    // Translates query_filter (a JSON object mapping a column/attribute name to a condition) into CQL filtering restrictions. Only EQ with exactly one operand is supported; anything else throws ValidationException.
+    clogger.trace("Getting filtering restrictions for: {}", rjson::print(query_filter));
+    auto filtering_restrictions = ::make_shared<cql3::restrictions::statement_restrictions>(schema, true);
+    for (auto it = query_filter.MemberBegin(); it != query_filter.MemberEnd(); ++it) {
+        std::string_view column_name(it->name.GetString(), it->name.GetStringLength());
+        const rjson::value& condition = it->value;
+        const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
+        const rjson::value& attr_list = rjson::get(condition, "AttributeValueList");
+        comparison_operator_type op = get_comparison_operator(comp_definition);
+        if (op != comparison_operator_type::EQ) {
+            throw api_error("ValidationException", "Filtering is currently implemented for EQ operator only");
+        }
+        // Size() is only meaningful for arrays, so a non-array AttributeValueList is rejected with the same error.
+        if (!attr_list.IsArray() || attr_list.Size() != 1) {
+            throw api_error("ValidationException", format("EQ restriction needs exactly 1 attribute value: {}", rjson::print(attr_list)));
+        }
+        // Look the name up using its full length; going through a C string would truncate it at an embedded NUL.
+        if (const column_definition* cdef = schema->get_column_definition(to_bytes(column_name))) {
+            // Primary key restriction
+            filtering_restrictions->add_restriction(make_key_eq_restriction(*cdef, attr_list[0]), false, true);
+        } else {
+            // Regular column restriction
+            filtering_restrictions->add_restriction(make_map_element_restriction(attrs_col, column_name, attr_list[0]), false, true);
+        }
+    }
+    return filtering_restrictions;
+}
+
+namespace {
+
+struct size_check { // Interface for predicates over the number of elements of an AttributeValueList.
+    // True iff size passes this check.
+    virtual bool operator()(rapidjson::SizeType size) const = 0;
+    // Check description, such that format("expected array {}", check.what()) is human-readable.
+    virtual sstring what() const = 0;
+};
+
+class exact_size : public size_check { // Passes only when the size equals the value given at construction.
+    rapidjson::SizeType _expected;
+  public:
+    explicit exact_size(rapidjson::SizeType expected) : _expected(expected) {}
+    bool operator()(rapidjson::SizeType size) const override { return size == _expected; }
+    sstring what() const override { return format("of size {}", _expected); }
+};
+
+struct empty : public size_check { // Passes only for size zero.
+    bool operator()(rapidjson::SizeType size) const override { return size < 1; }
+    sstring what() const override { return "to be empty"; }
+};
+
+struct nonempty : public size_check { // Passes for any size greater than zero.
+    bool operator()(rapidjson::SizeType size) const override { return size > 0; }
+    sstring what() const override { return "to be non-empty"; }
+};
+
+} // anonymous namespace
+
+// Check that array (an operator's AttributeValueList, possibly missing) has the expected number of elements; throws ValidationException otherwise. op is only used in the error message.
+static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
+    if (!array || !array->IsArray()) { // missing although elements are required, or present but not an array
+        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
+    }
+    if (!expected(array->Size())) {
+        throw api_error("ValidationException",
+                        format("{} operator requires AttributeValueList {}, instead found list size {}",
+                               op, expected.what(), array->Size()));
+    }
+}
+
+struct rjson_engaged_ptr_comp { // Orders non-null rjson::value pointers by the values they point to.
+    bool operator()(const rjson::value* lhs, const rjson::value* rhs) const {
+        return rjson::single_value_comp{}(*lhs, *rhs);
+    }
+};
+
+// Set equality cannot be decided by comparing the underlying JSON arrays
+// directly: sets are stored as arrays, but the order of elements must not
+// affect the result. See issue #5021
+static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
+    if (set1.Size() != set2.Size()) {
+        return false;
+    }
+    std::set<const rjson::value*, rjson_engaged_ptr_comp> lhs_members;
+    for (const auto& member : set1.GetArray()) {
+        lhs_members.insert(&member);
+    }
+    for (const auto& candidate : set2.GetArray()) {
+        if (lhs_members.find(&candidate) == lhs_members.end()) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// EQ relation between a possibly-missing value v1 and an operand v2; a missing v1 equals nothing.
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
+    const bool single_typed_values = v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1;
+    if (single_typed_values) {
+        const auto& lhs = *v1->MemberBegin();
+        const auto& rhs = *v2.MemberBegin();
+        const bool same_set_type = (lhs.name == "SS" && rhs.name == "SS") || (lhs.name == "NS" && rhs.name == "NS") || (lhs.name == "BS" && rhs.name == "BS");
+        if (same_set_type) {
+            // Sets need an order-insensitive comparison.
+            return check_EQ_for_sets(lhs.value, rhs.value);
+        }
+    }
+    return v1 && *v1 == v2;
+}
+
+// Check if two JSON-encoded values match with the NE relation, defined as the exact negation of check_EQ.
+static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
+    return !check_EQ(v1, v2); // keeps set comparison order-insensitive; a missing v1 (null) is still unequal to anything
+}
+
+// Check if two JSON-encoded values match with the BEGINS_WITH relation, i.e. whether the value v1 starts with the operand v2.
+static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
+    // BEGINS_WITH requires that its single operand (v2) be a string or
+    // binary - otherwise it's a validation error. However, problems with
+    // the stored attribute (v1) will just return false (no match).
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
+    }
+    auto it2 = v2.MemberBegin(); // the operand's single {type: value} member
+    if (it2->name != "S" && it2->name != "B") {
+        throw api_error("ValidationException", format("BEGINS_WITH operator requires String or Binary in AttributeValue, got {}", it2->name));
+    }
+
+
+    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        return false;
+    }
+    auto it1 = v1->MemberBegin();
+    if (it1->name != it2->name) { // different types never match
+        return false;
+    }
+    if (it2->name == "S") {
+        std::string_view val1(it1->value.GetString(), it1->value.GetStringLength());
+        std::string_view val2(it2->value.GetString(), it2->value.GetStringLength());
+        return val1.substr(0, val2.size()) == val2; // compare the leading val2.size() characters of val1 (fewer if val1 is shorter) with val2
+    } else /* it2->name == "B" */ {
+        // TODO (optimization): Check the begins_with condition directly on
+        // the base64-encoded string, without making a decoded copy.
+        bytes val1 = base64_decode(it1->value);
+        bytes val2 = base64_decode(it2->value);
+        return val1.substr(0, val2.size()) == val2;
+    }
+}
+
+static bool is_set_of(const rjson::value& type1, const rjson::value& type2) {
+    return (type1 == "SS" && type2 == "S") || (type1 == "NS" && type2 == "N") || (type1 == "BS" && type2 == "B"); // true iff type1 names the set type whose element type is type2
+}
+
+// Check if two JSON-encoded values match with the CONTAINS relation: substring for S/B, membership for sets and lists. Throws ValidationException if the operand v2 is malformed or not of type S, N or B.
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) { // missing or malformed v1: no match, and MemberBegin() below stays safe
+        return false;
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) { // the operand comes from the request, so validate before dereferencing MemberBegin()
+        throw api_error("ValidationException", format("CONTAINS operator encountered malformed AttributeValue: {}", v2));
+    }
+    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
+    if (kv2.name != "S" && kv2.name != "N" &&  kv2.name != "B") {
+        throw api_error("ValidationException", format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, got {} instead", kv2.name));
+    }
+    if (kv1.name == "S" && kv2.name == "S") { // substring search
+        return rjson::to_string_view(kv1.value).find(rjson::to_string_view(kv2.value)) != std::string_view::npos;
+    } else if (kv1.name == "B" && kv2.name == "B") { // sub-sequence search on the decoded bytes
+        return base64_decode(kv1.value).find(base64_decode(kv2.value)) != bytes::npos;
+    } else if (is_set_of(kv1.name, kv2.name)) { // set members are stored as bare values of the element type
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (*i == kv2.value) {
+                return true;
+            }
+        }
+    } else if (kv1.name == "L") { // list elements are themselves {type: value} objects
+        for (auto i = kv1.value.Begin(); i != kv1.value.End(); ++i) {
+            if (!i->IsObject() || i->MemberCount() != 1) {
+                clogger.error("check_CONTAINS received a list whose element is malformed");
+                return false;
+            }
+            if (i->MemberBegin()->name == kv2.name && i->MemberBegin()->value == kv2.value) { // same type and same value
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// Check if two JSON-encoded values match with the NOT_CONTAINS relation
+static bool check_NOT_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
+    // A missing value (null v1) never satisfies NOT_CONTAINS, and in that
+    // case check_CONTAINS is not consulted at all. Otherwise the result is
+    // the negation of check_CONTAINS, including anything it may throw.
+    return v1 != nullptr && !check_CONTAINS(v1, v2);
+}
+
+// Check if a JSON-encoded value equals any element of an array, which must have at least one element. Throws ValidationException unless all elements are single-member objects of one common type among S, N and B.
+static bool check_IN(const rjson::value* val, const rjson::value& array) {
+    if (!array[0].IsObject() || array[0].MemberCount() != 1) {
+        throw api_error("ValidationException",
+                        format("IN operator encountered malformed AttributeValue: {}", array[0]));
+    }
+    const auto& type = array[0].MemberBegin()->name; // the first element fixes the type every other element must share
+    if (type != "S" && type != "N" && type != "B") {
+        throw api_error("ValidationException",
+                        "IN operator requires AttributeValueList elements to be of type String, Number, or Binary ");
+    }
+    if (!val) { // a missing value matches nothing; note the remaining elements are not validated in this case
+        return false;
+    }
+    bool have_match = false;
+    for (const auto& elem : array.GetArray()) {
+        if (!elem.IsObject() || elem.MemberCount() != 1 || elem.MemberBegin()->name != type) {
+            throw api_error("ValidationException",
+                            "IN operator requires all AttributeValueList elements to have the same type ");
+        }
+        if (!have_match && *val == elem) {
+            // Can't return yet, must check types of all array elements. <sigh>
+            have_match = true;
+        }
+    }
+    return have_match;
+}
+
+// Variant of check_IN used for ConditionExpression: the vector holds the
+// value being tested followed by the candidates, and the result is whether
+// the first element equals (per check_EQ) any of the remaining ones.
+static bool check_IN(const std::vector<rjson::value>& array) {
+    const rjson::value* needle = array.data();
+    for (size_t candidate = 1; candidate < array.size(); ++candidate) {
+        if (check_EQ(needle, array[candidate])) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static bool check_NULL(const rjson::value* val) {
+    return !val; // NULL operator: satisfied exactly when no value was found
+}
+
+static bool check_NOT_NULL(const rjson::value* val) {
+    return static_cast<bool>(val); // NOT_NULL operator: satisfied exactly when a value was found
+}
+
+// Check if two JSON-encoded values match with cmp. The operand v2 must be a single-member object of type S, N or B, otherwise ValidationException is thrown; a missing, malformed or differently-typed v1 simply yields false.
+template <typename Comparator>
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        throw api_error("ValidationException",
+                        format("{} requires a single AttributeValue of type String, Number, or Binary",
+                               cmp.diagnostic));
+    }
+    const auto& kv2 = *v2.MemberBegin();
+    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
+        throw api_error("ValidationException",
+                        format("{} requires a single AttributeValue of type String, Number, or Binary",
+                               cmp.diagnostic));
+    }
+    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        return false;
+    }
+    const auto& kv1 = *v1->MemberBegin();
+    if (kv1.name != kv2.name) { // values of different types are never ordered relative to each other
+        return false;
+    }
+    if (kv1.name == "N") {
+        return cmp(unwrap_number(*v1, cmp.diagnostic), unwrap_number(v2, cmp.diagnostic));
+    }
+    if (kv1.name == "S") {
+        return cmp(std::string_view(kv1.value.GetString(), kv1.value.GetStringLength()),
+                   std::string_view(kv2.value.GetString(), kv2.value.GetStringLength()));
+    }
+    if (kv1.name == "B") { // binary values are compared after base64 decoding
+        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
+    }
+    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    return false;
+}
+
+struct cmp_lt { // "Less than" comparator for check_compare() and check_BETWEEN().
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
+    // We cannot use the normal comparison operators like "<" on the bytes
+    // type, because they treat individual bytes as signed but we need to
+    // compare them as *unsigned*. So we need a specialization for bytes.
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
+    static constexpr const char* diagnostic = "LT operator"; // operator name used in error messages
+};
+
+struct cmp_le { // "Less than or equal" comparator for check_compare() and check_BETWEEN().
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; } // bytes go through compare_unsigned() instead of operator<=
+    static constexpr const char* diagnostic = "LE operator"; // operator name used in error messages
+};
+
+struct cmp_ge { // "Greater than or equal" comparator for check_compare() and check_BETWEEN().
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; } // bytes go through compare_unsigned() instead of operator>=
+    static constexpr const char* diagnostic = "GE operator"; // operator name used in error messages
+};
+
+struct cmp_gt { // "Greater than" comparator for check_compare().
+    template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
+    bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; } // bytes go through compare_unsigned() instead of operator>
+    static constexpr const char* diagnostic = "GT operator"; // operator name used in error messages
+};
+
+// Inclusive range test: true iff lb <= v <= ub. Throws ValidationException when the bounds are inverted (ub < lb).
+template <typename T>
+bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+    const bool bounds_inverted = cmp_lt{}(ub, lb);
+    if (bounds_inverted) {
+        throw api_error("ValidationException", format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+    }
+    return cmp_ge{}(v, lb) && cmp_le{}(v, ub);
+}
+
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) { // BETWEEN on JSON-encoded values: true iff v has the bounds' type and lies in [lb, ub].
+    if (!v) { // a missing value is never in range; note the bounds are not validated in this case
+        return false;
+    }
+    if (!v->IsObject() || v->MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
+    }
+    if (!lb.IsObject() || lb.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
+    }
+    if (!ub.IsObject() || ub.MemberCount() != 1) {
+        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
+    }
+
+    const auto& kv_v = *v->MemberBegin();
+    const auto& kv_lb = *lb.MemberBegin();
+    const auto& kv_ub = *ub.MemberBegin();
+    if (kv_lb.name != kv_ub.name) {
+        throw api_error(
+                "ValidationException",
+                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
+                       kv_lb.name, kv_ub.name));
+    }
+    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
+        return false;
+    }
+    if (kv_v.name == "N") {
+        const char* diag = "BETWEEN operator";
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+    }
+    if (kv_v.name == "S") {
+        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
+                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+    }
+    if (kv_v.name == "B") { // binary values are compared after base64 decoding
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+    }
+    throw api_error("ValidationException", // reached only when all three values share a type other than N, S or B
+        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+               kv_lb.name));
+}
+
+// Verify one Expect condition on one attribute (whose content is "got",
+// or null if the attribute was not found) for the verify_expected() below.
+// This function returns true or false depending on whether the condition
+// succeeded - it does not throw ConditionalCheckFailedException.
+// However, it may throw ValidationException on input validation errors.
+static bool verify_expected_one(const rjson::value& condition, const rjson::value* got) {
+    const rjson::value* comparison_operator = rjson::find(condition, "ComparisonOperator");
+    const rjson::value* attribute_value_list = rjson::find(condition, "AttributeValueList");
+    const rjson::value* value = rjson::find(condition, "Value");
+    const rjson::value* exists = rjson::find(condition, "Exists");
+    // There are three types of conditions that Expected supports:
+    // A value, not-exists, and a comparison of some kind. Each allows
+    // and requires a different combinations of parameters in the request
+    if (value) { // form 1: "Value" given - an equality check, optionally with Exists=true
+        if (exists && (!exists->IsBool() || exists->GetBool() != true)) {
+            throw api_error("ValidationException", "Cannot combine Value with Exists!=true");
+        }
+        if (comparison_operator) {
+            throw api_error("ValidationException", "Cannot combine Value with ComparisonOperator");
+        }
+        return check_EQ(got, *value);
+    } else if (exists) { // form 2: "Exists" without "Value" - only Exists=false is accepted
+        if (comparison_operator) {
+            throw api_error("ValidationException", "Cannot combine Exists with ComparisonOperator");
+        }
+        if (!exists->IsBool() || exists->GetBool() != false) {
+            throw api_error("ValidationException", "Exists!=false requires Value");
+        }
+        // Remember Exists=false, so we're checking that the attribute does *not* exist:
+        return !got;
+    } else { // form 3: "ComparisonOperator" with its "AttributeValueList"
+        if (!comparison_operator) {
+            throw api_error("ValidationException", "Missing ComparisonOperator, Value or Exists");
+        }
+        comparison_operator_type op = get_comparison_operator(*comparison_operator);
+        switch (op) { // each case validates the operand count first, so indexing the list afterwards is safe
+        case comparison_operator_type::EQ:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_EQ(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::NE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_NE(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::LT:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+        case comparison_operator_type::LE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+        case comparison_operator_type::GT:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+        case comparison_operator_type::GE:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+        case comparison_operator_type::BEGINS_WITH:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::IN:
+            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
+            return check_IN(got, *attribute_value_list);
+        case comparison_operator_type::IS_NULL: // takes no operands; the list may be empty or missing
+            verify_operand_count(attribute_value_list, empty(), *comparison_operator);
+            return check_NULL(got);
+        case comparison_operator_type::NOT_NULL: // takes no operands; the list may be empty or missing
+            verify_operand_count(attribute_value_list, empty(), *comparison_operator);
+            return check_NOT_NULL(got);
+        case comparison_operator_type::BETWEEN: // operands are the lower and the upper bound, in that order
+            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+        case comparison_operator_type::CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_CONTAINS(got, (*attribute_value_list)[0]);
+        case comparison_operator_type::NOT_CONTAINS:
+            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
+            return check_NOT_CONTAINS(got, (*attribute_value_list)[0]);
+        }
+        throw std::logic_error(format("Internal error: corrupted operator enum: {}", int(op))); // reached only if op matches no enumerator
+    }
+}
+
+// Check if the existing values of the item (previous_item) match the
+// conditions given by the Expected and ConditionalOperator parameters
+// (if they exist) in the request (an UpdateItem, PutItem or DeleteItem).
+// This function can throw a ValidationException API error if there
+// are errors in the format of the condition itself.
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item) {
+    const rjson::value* expected = rjson::find(req, "Expected");
+    if (!expected) { // no conditions at all: trivially satisfied
+        return true;
+    }
+    if (!expected->IsObject()) {
+        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
+    }
+    // ConditionalOperator can be "AND" for requiring all conditions, or
+    // "OR" for requiring one condition, and defaults to "AND" if missing.
+    const rjson::value* conditional_operator = rjson::find(req, "ConditionalOperator");
+    bool require_all = true;
+    if (conditional_operator) {
+        if (!conditional_operator->IsString()) {
+            throw api_error("ValidationException", "'ConditionalOperator' parameter, if given, must be a string");
+        }
+        std::string_view s(conditional_operator->GetString(), conditional_operator->GetStringLength());
+        if (s == "AND") {
+            // require_all is already true
+        } else if (s == "OR") {
+            require_all = false;
+        } else {
+            throw api_error("ValidationException", "'ConditionalOperator' parameter must be AND, OR or missing");
+        }
+        if (expected->GetObject().ObjectEmpty()) { // an explicit ConditionalOperator needs at least one condition to combine
+            throw api_error("ValidationException", "'ConditionalOperator' parameter cannot be specified for empty Expression");
+        }
+    }
+
+    for (auto it = expected->MemberBegin(); it != expected->MemberEnd(); ++it) { // one member per attribute: name -> condition
+        const rjson::value* got = nullptr; // stays null when there is no previous item or it lacks this attribute
+        if (previous_item && previous_item->IsObject() && previous_item->HasMember("Item")) {
+            got = rjson::find((*previous_item)["Item"], rjson::to_string_view(it->name));
+        }
+        bool success = verify_expected_one(it->value, got);
+        if (success && !require_all) {
+            // When !require_all, one success is enough!
+            return true;
+        } else if (!success && require_all) {
+            // When require_all, one failure is enough!
+            return false;
+        }
+    }
+    // If we got here and require_all, none of the checks failed, so succeed.
+    // If we got here and !require_all, all of the checks failed, so fail.
+    return require_all;
+}
+
+bool calculate_primitive_condition(const parsed::primitive_condition& cond, // Evaluates one parsed primitive condition to true/false.
+        std::unordered_set<std::string>& used_attribute_values, // passed through to calculate_value()
+        std::unordered_set<std::string>& used_attribute_names, // passed through to calculate_value()
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    std::vector<rjson::value> calculated_values; // the condition's operands, evaluated in order
+    calculated_values.reserve(cond._values.size());
+    for (const parsed::value& v : cond._values) {
+        calculated_values.push_back(calculate_value(v,
+                cond._op == parsed::primitive_condition::type::VALUE ?
+                        calculate_value_caller::ConditionExpressionAlone :
+                        calculate_value_caller::ConditionExpression,
+                rjson::find(req, "ExpressionAttributeValues"),
+                used_attribute_names, used_attribute_values,
+                req, schema, previous_item));
+    }
+    switch (cond._op) { // first pass: operators whose operand count is not two, plus the count check for the rest
+    case parsed::primitive_condition::type::BETWEEN:
+        if (calculated_values.size() != 3) {
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
+        }
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+    case parsed::primitive_condition::type::IN:
+        return check_IN(calculated_values);
+    case parsed::primitive_condition::type::VALUE:
+        if (calculated_values.size() != 1) {
+            // Shouldn't happen unless we have a bug in the parser. The "{}" placeholder is needed for the count to show up in the message.
+            throw std::logic_error(format("Unexpected number of values {} in primitive_condition", cond._values.size()));
+        }
+        // Unwrap the boolean wrapped as the value (if it is a boolean)
+        if (calculated_values[0].IsObject() && calculated_values[0].MemberCount() == 1) {
+            auto it = calculated_values[0].MemberBegin();
+            if (it->name == "BOOL" && it->value.IsBool()) {
+                return it->value.GetBool();
+            }
+        }
+        throw api_error("ValidationException",
+                format("ConditionExpression: condition results in a non-boolean value: {}",
+                        calculated_values[0]));
+    default:
+        // All the rest of the operators have exactly two parameters (and unless
+        // we have a bug in the parser, that's what we have in the parsed object:
+        if (calculated_values.size() != 2) {
+            throw std::logic_error(format("Wrong number of values {} in primitive_condition object", cond._values.size()));
+        }
+    }
+    switch (cond._op) { // second pass: the two-operand comparisons
+    case parsed::primitive_condition::type::EQ:
+        return check_EQ(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::NE:
+        return check_NE(&calculated_values[0], calculated_values[1]);
+    case parsed::primitive_condition::type::GT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+    case parsed::primitive_condition::type::GE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+    case parsed::primitive_condition::type::LT:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+    case parsed::primitive_condition::type::LE:
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+    default:
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
+    }
+}
+
+// Check if the existing values of the item (previous_item) match the
+// conditions given by the given parsed ConditionExpression. An empty expression is always satisfied.
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item) {
+    if (condition_expression.empty()) {
+        return true;
+    }
+    bool ret = std::visit(overloaded_functor { // the expression is either one primitive condition or a list of sub-expressions
+        [&] (const parsed::primitive_condition& cond) -> bool {
+            return calculate_primitive_condition(cond, used_attribute_values,
+                    used_attribute_names, req, schema, previous_item);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) -> bool {
+            auto verify_condition = [&] (const parsed::condition_expression& e) { // recurse into each sub-expression
+                return verify_condition_expression(e, used_attribute_values,
+                        used_attribute_names, req, schema, previous_item);
+            };
+            switch (list.op) {
+            case '&': // conjunction: every sub-expression must hold
+                return boost::algorithm::all_of(list.conditions, verify_condition);
+            case '|': // disjunction: at least one sub-expression must hold
+                return boost::algorithm::any_of(list.conditions, verify_condition);
+            default:
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error("bad operator in condition_list");
+            }
+        }
+    }, condition_expression._expression);
+    return condition_expression._negated ? !ret : ret; // apply the expression's own negation flag last
+}
+
+}
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This file contains definitions and functions related to placing conditions
+ * on Alternator queries (equivalent of CQL's restrictions).
+ *
+ * With conditions, it's possible to add criteria to selection requests (Scan, Query)
+ * and use them for narrowing down the result set, by means of filtering or indexing.
+ *
+ * Ref: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_Condition.html
+ */
+
+#pragma once
+
+#include "cql3/restrictions/statement_restrictions.hh"
+#include "serialization.hh"
+
+namespace alternator {
+
+enum class comparison_operator_type { // the operators that get_comparison_operator() can return
+    EQ, NE, LE, LT, GE, GT, IN, BETWEEN, CONTAINS, NOT_CONTAINS, IS_NULL, NOT_NULL, BEGINS_WITH
+};
+
+comparison_operator_type get_comparison_operator(const rjson::value& comparison_operator);
+
+::shared_ptr<cql3::restrictions::statement_restrictions> get_filtering_restrictions(schema_ptr schema, const column_definition& attrs_col, const rjson::value& query_filter);
+
+bool verify_expected(const rjson::value& req, const std::unique_ptr<rjson::value>& previous_item);
+
+}
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastar/http/httpd.hh>
+#include "seastarx.hh"
+
+namespace alternator {
+
+// DynamoDB's error messages are described in detail in
+// https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html
+// An error message has a "type", e.g., "ResourceNotFoundException", a coarser
+// HTTP code (almost always, 400), and a human readable message. Eventually these
+// will be wrapped into a JSON object returned to the client.
+class api_error : public std::exception {
+public:
+    using status_type = httpd::reply::status_type;
+    status_type _http_code = status_type::bad_request; // has a defined value even after api_error()
+    std::string _type; // error type name, e.g. "ResourceNotFoundException"
+    std::string _msg; // human readable message; also what what() returns
+    api_error(std::string type, std::string msg, status_type http_code = status_type::bad_request)
+        : _http_code(http_code)
+        , _type(std::move(type))
+        , _msg(std::move(msg))
+    { }
+    api_error() = default;
+    virtual const char* what() const noexcept override { return _msg.c_str(); }
+};
+
+}
+
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/http/httpd.hh>
+#include "seastarx.hh"
+#include <seastar/json/json_elements.hh>
+#include <seastar/core/sharded.hh>
+
+#include "service/storage_proxy.hh"
+#include "service/migration_manager.hh"
+#include "service/client_state.hh"
+
+#include "alternator/error.hh"
+#include "stats.hh"
+#include "rjson.hh"
+
+namespace alternator {
+
+class executor : public peering_sharded_service<executor> {
+    service::storage_proxy& _proxy;
+    service::migration_manager& _mm;
+    // An smp_service_group to be used for limiting the concurrency when
+    // forwarding Alternator requests between shards - if necessary for LWT.
+    smp_service_group _ssg;
+
+public:
+    using client_state = service::client_state;
+    using request_return_type = std::variant<json::json_return_type, api_error>; // a handler's result: a JSON reply or an api_error
+    stats _stats;
+    static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
+    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
+    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";
+    // The constructor only stores the given references and service group.
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _ssg(ssg) {}
+    // Request handlers: each takes the request as an rjson::value and resolves to a request_return_type.
+    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tables(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header);
+    future<request_return_type> batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);
+    // start() is defined out of line; stop() has nothing to wait for and returns a ready future.
+    future<> start();
+    future<> stop() { return make_ready_future<>(); }
+
+    future<> create_keyspace(std::string_view keyspace_name);
+
+    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+};
+
+}
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "expressions.hh"
+#include "alternator/expressionsLexer.hpp"
+#include "alternator/expressionsParser.hpp"
+#include "utils/overloaded_functor.hh"
+
+#include <seastarx.hh>
+
+#include <seastar/core/print.hh>
+#include <seastar/util/log.hh>
+
+#include <functional>
+
+namespace alternator {
+
+template <typename Func, typename Result = std::result_of_t<Func(expressionsParser&)>>
+Result do_with_parser(std::string input, Func&& f) {
+    // Lexer -> token stream -> parser, all reading from the local copy of
+    // the input; the parser is then handed to f and f's result is returned.
+    const auto* text = reinterpret_cast<const ANTLR_UINT8*>(input.data());
+    const auto length = static_cast<ANTLR_UINT32>(input.size());
+    expressionsLexer::InputStreamType source{text, ANTLR_ENC_UTF8, length, nullptr};
+    expressionsLexer tokenizer(&source);
+    expressionsParser::TokenStreamType tokens(ANTLR_SIZE_HINT, tokenizer.get_tokSource());
+    expressionsParser parser(&tokens);
+
+    auto parsed_result = f(parser);
+    return parsed_result;
+}
+
+// Parses a complete UpdateExpression; any failure is rethrown as expressions_syntax_error.
+parsed::update_expression parse_update_expression(std::string query) {
+    try {
+        return do_with_parser(query, [] (expressionsParser& p) { return p.update_expression(); });
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing UpdateExpression '{}': {}", query, std::current_exception()));
+    }
+}
+
+// Parses a complete ProjectionExpression; any failure is rethrown as expressions_syntax_error.
+std::vector<parsed::path> parse_projection_expression(std::string query) {
+    try {
+        return do_with_parser(query, [] (expressionsParser& p) { return p.projection_expression(); });
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ProjectionExpression '{}': {}", query, std::current_exception()));
+    }
+}
+
+// Parses a complete ConditionExpression; any failure is rethrown as expressions_syntax_error.
+parsed::condition_expression parse_condition_expression(std::string query) {
+    try {
+        return do_with_parser(query, [] (expressionsParser& p) { return p.condition_expression(); });
+    } catch (...) {
+        throw expressions_syntax_error(format("Failed parsing ConditionExpression '{}': {}", query, std::current_exception()));
+    }
+}
+
+namespace parsed {
+
+void update_expression::add(update_expression::action a) {
+    // Record which kind of clause this action belongs to, then store it.
+    const auto& kind = a._action;
+    seen_set    |= std::holds_alternative<action::set>(kind);
+    seen_remove |= std::holds_alternative<action::remove>(kind);
+    seen_add    |= std::holds_alternative<action::add>(kind);
+    seen_del    |= std::holds_alternative<action::del>(kind);
+    _actions.push_back(std::move(a));
+}
+
+void update_expression::append(update_expression other) {
+    // A clause kind already seen here and seen again in 'other' is a repetition.
+    const bool repeated_clause = (seen_set && other.seen_set) || (seen_remove && other.seen_remove)
+            || (seen_add && other.seen_add) || (seen_del && other.seen_del);
+    if (repeated_clause) {
+        throw expressions_syntax_error("Each of SET, REMOVE, ADD, DELETE may only appear once in UpdateExpression");
+    }
+    for (auto& moved_action : other._actions) { _actions.push_back(std::move(moved_action)); }
+    seen_set = seen_set || other.seen_set;
+    seen_remove = seen_remove || other.seen_remove;
+    seen_add = seen_add || other.seen_add;
+    seen_del = seen_del || other.seen_del;
+}
+
+void condition_expression::append(condition_expression&& a, char op) {
+    auto* list = std::get_if<condition_list>(&_expression);
+    if (list == nullptr) {
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error("condition_expression::append called on primitive_condition");
+    }
+    // A list joins all of its members with one operator; an empty list
+    // takes whichever operator arrives first.
+    const bool mixes_operators = !list->conditions.empty() && list->op != op;
+    if (mixes_operators) {
+        // Shouldn't happen unless we have a bug in the parser
+        throw std::logic_error("condition_expression::append called with mixed operators");
+    }
+    // When 'a' holds a single condition we could insert that condition
+    // itself (negated if a._negated) rather than 'a'. These expressions
+    // are not evaluated often enough for that optimization to justify the
+    // extra code.
+    list->conditions.push_back(std::move(a));
+    list->op = op;
+}
+
+
+} // namespace parsed
+} // namespace alternator
--- a/alternator/expressions.g
+++ b/alternator/expressions.g
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2019 ScyllaDB
+ *
+ * This file is part of Scylla. See the LICENSE.PROPRIETARY file in the
+ * top-level directory for licensing information.
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * The DynamoDB protocol is based on JSON, and most DynamoDB requests
+ * describe the operation and its parameters via JSON objects such as maps
+ * and lists. Nevertheless, in some types of requests an "expression" is
+ * passed as a single string, and we need to parse this string. These
+ * cases include:
+ *  1. Attribute paths, such as "a[3].b.c", are used in projection
+ *     expressions as well as inside other expressions described below.
+ *  2. Condition expressions, such as "(NOT (a=b OR c=d)) AND e=f",
+ *     used in conditional updates, filters, and other places.
+ *  3. Update expressions, such as "SET #a.b = :x, c = :y DELETE d"
+ *
+ * All these expression syntaxes are very simple: Most of them could be
+ * parsed as regular expressions, and the parenthesized condition expression
+ * could be done with a simple hand-written lexical analyzer and recursive-
+ * descent parser. Nevertheless, we decided to specify these parsers in the
+ * ANTLR3 language already used in the Scylla project, hopefully making these
+ * parsers easier to reason about, and easier to change if needed - and
+ * reducing the amount of boiler-plate code.
+ */
+
+grammar expressions;
+
+options {
+    language = Cpp;
+}
+
+@parser::namespace{alternator}
+@lexer::namespace{alternator}
+
+/* TODO: explain what these traits things are. I haven't seen them explained
+ * in any document... Compilation fails without them because a definition
+ * of "expressionsLexerTraits" and "expressionsParserTraits" is needed.
+ */
+@lexer::traits {
+    class expressionsLexer;
+    class expressionsParser;
+    typedef antlr3::Traits<expressionsLexer, expressionsParser> expressionsLexerTraits;
+}
+@parser::traits {
+    typedef expressionsLexerTraits expressionsParserTraits;
+}
+
+@lexer::header {
+	#include "alternator/expressions.hh"
+	// ANTLR generates a bunch of unused variables and functions. Yuck...
+    #pragma GCC diagnostic ignored "-Wunused-variable"
+    #pragma GCC diagnostic ignored "-Wunused-function"
+}
+@parser::header {
+	#include "expressionsLexer.hpp"
+}
+
+/* By default, ANTLR3 composes elaborate syntax-error messages, saying which
+ * token was unexpected, where, and so on, but then dutifully writes these
+ * error messages to the standard error, and returns from the parser as if
+ * everything was fine, with a half-constructed output object! If we define
+ * the "displayRecognitionError" method, it will be called upon to build this
+ * error message, and we can instead throw an exception to stop the parsing
+ * immediately. This is good enough for now, for our simple needs, but if
+ * we ever want to show more information about the syntax error, Cql3.g
+ * contains an elaborate implementation (it would be nice if we could reuse
+ * it, not duplicate it).
+ * Unfortunately, we have to repeat the same definition twice - once for the
+ * parser, and once for the lexer.
+ */
+@parser::context {
+    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
+        throw expressions_syntax_error("syntax error");
+    }
+}
+@lexer::context {
+    void displayRecognitionError(ANTLR_UINT8** token_names, ExceptionBaseType* ex) {
+        throw expressions_syntax_error("syntax error");
+    }
+}
+
+/*
+ * Lexical analysis phase, i.e., splitting the input up to tokens.
+ * Lexical analyzer rules have names starting in capital letters.
+ * "fragment" rules do not generate tokens, and are just aliases used to
+ * make other rules more readable.
+ * Characters *not* listed here, e.g., '=', '(', etc., will be handled
+ * as individual tokens on their own right.
+ * Whitespace spans are skipped, so do not generate tokens.
+ */
+WHITESPACE: (' ' | '\t' | '\n' | '\r')+ { skip(); };
+
+/* shortcuts for case-insensitive keywords */
+fragment A:('a'|'A');
+fragment B:('b'|'B');
+fragment C:('c'|'C');
+fragment D:('d'|'D');
+fragment E:('e'|'E');
+fragment F:('f'|'F');
+fragment G:('g'|'G');
+fragment H:('h'|'H');
+fragment I:('i'|'I');
+fragment J:('j'|'J');
+fragment K:('k'|'K');
+fragment L:('l'|'L');
+fragment M:('m'|'M');
+fragment N:('n'|'N');
+fragment O:('o'|'O');
+fragment P:('p'|'P');
+fragment Q:('q'|'Q');
+fragment R:('r'|'R');
+fragment S:('s'|'S');
+fragment T:('t'|'T');
+fragment U:('u'|'U');
+fragment V:('v'|'V');
+fragment W:('w'|'W');
+fragment X:('x'|'X');
+fragment Y:('y'|'Y');
+fragment Z:('z'|'Z');
+/* These keywords must appear before the generic NAME token below,
+ * because NAME matches too, and the first to match wins.
+ */
+SET: S E T;
+REMOVE: R E M O V E;
+ADD: A D D;
+DELETE: D E L E T E;
+
+AND: A N D;
+OR: O R;
+NOT: N O T;
+BETWEEN: B E T W E E N;
+IN: I N;
+
+fragment ALPHA: 'A'..'Z' | 'a'..'z';
+fragment DIGIT: '0'..'9';
+fragment ALNUM: ALPHA | DIGIT | '_';
+INTEGER: DIGIT+;
+NAME: ALPHA ALNUM*;
+NAMEREF: '#' ALNUM+;
+VALREF: ':' ALNUM+;
+
+/*
+ * Parsing phase - parsing the string of tokens generated by the lexical
+ * analyzer defined above.
+ */
+
+path_component: NAME | NAMEREF;
+path returns [parsed::path p]:
+    root=path_component           { $p.set_root($root.text); }
+    (   '.' name=path_component   { $p.add_dot($name.text); }
+      | '[' INTEGER ']'           { $p.add_index(std::stoi($INTEGER.text)); }
+    )*;
+
+value returns [parsed::value v]:
+      VALREF       { $v.set_valref($VALREF.text); }
+    | path         { $v.set_path($path.p); }
+    | NAME         { $v.set_func_name($NAME.text); }
+     '(' x=value   { $v.add_func_parameter($x.v); }
+     (',' x=value  { $v.add_func_parameter($x.v); })*
+     ')'
+    ;
+
+update_expression_set_rhs returns [parsed::set_rhs rhs]:
+    v=value  { $rhs.set_value(std::move($v.v)); }
+    (   '+' v=value  { $rhs.set_plus(std::move($v.v)); }
+      | '-' v=value  { $rhs.set_minus(std::move($v.v)); }
+    )?
+    ;
+
+update_expression_set_action returns [parsed::update_expression::action a]:
+    path '=' rhs=update_expression_set_rhs { $a.assign_set($path.p, $rhs.rhs); };
+
+update_expression_remove_action returns [parsed::update_expression::action a]:
+    path { $a.assign_remove($path.p); };
+
+update_expression_add_action returns [parsed::update_expression::action a]:
+    path VALREF { $a.assign_add($path.p, $VALREF.text); };
+
+update_expression_delete_action returns [parsed::update_expression::action a]:
+    path VALREF { $a.assign_del($path.p, $VALREF.text); };
+
+update_expression_clause returns [parsed::update_expression e]:
+      SET s=update_expression_set_action { $e.add(s); }
+      (',' s=update_expression_set_action { $e.add(s); })*
+    | REMOVE r=update_expression_remove_action { $e.add(r); }
+      (',' r=update_expression_remove_action { $e.add(r); })*
+    | ADD a=update_expression_add_action { $e.add(a); }
+      (',' a=update_expression_add_action { $e.add(a); })*
+    | DELETE d=update_expression_delete_action { $e.add(d); }
+      (',' d=update_expression_delete_action { $e.add(d); })*
+    ;
+
+// Note the "EOF" token at the end of the update expression. We want the
+// parser to match the entire string given to it - not just its beginning!
+update_expression returns [parsed::update_expression e]:
+    (update_expression_clause { e.append($update_expression_clause.e); })* EOF;
+
+projection_expression returns [std::vector<parsed::path> v]:
+    p=path      { $v.push_back(std::move($p.p)); }
+    (',' p=path { $v.push_back(std::move($p.p)); } )* EOF;
+
+
+primitive_condition returns [parsed::primitive_condition c]:
+      v=value         { $c.add_value(std::move($v.v));
+                        $c.set_operator(parsed::primitive_condition::type::VALUE); }
+      (  (  '='       { $c.set_operator(parsed::primitive_condition::type::EQ); }
+          | '<' '>'   { $c.set_operator(parsed::primitive_condition::type::NE); }
+          | '<'       { $c.set_operator(parsed::primitive_condition::type::LT); }
+          | '<' '='   { $c.set_operator(parsed::primitive_condition::type::LE); }
+          | '>'       { $c.set_operator(parsed::primitive_condition::type::GT); }
+          | '>' '='   { $c.set_operator(parsed::primitive_condition::type::GE); }
+         )
+         v=value      { $c.add_value(std::move($v.v)); }
+       | BETWEEN      { $c.set_operator(parsed::primitive_condition::type::BETWEEN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         AND
+         v=value      { $c.add_value(std::move($v.v)); }
+       | IN '('       { $c.set_operator(parsed::primitive_condition::type::IN); }
+         v=value      { $c.add_value(std::move($v.v)); }
+         (',' v=value { $c.add_value(std::move($v.v)); })*
+         ')'
+      )?
+    ;
+
+// The following rules for parsing boolean expressions are verbose and
+// somewhat strange because of Antlr 3's limitations on recursive rules,
+// common rule prefixes, and (lack of) support for operator precedence.
+// These rules could have been written more clearly using a more powerful
+// parser generator - such as Yacc.
+boolean_expression returns [parsed::condition_expression e]:
+	  b=boolean_expression_1       { $e.append(std::move($b.e), '|'); }
+	  (OR b=boolean_expression_1   { $e.append(std::move($b.e), '|'); } )*
+	;
+boolean_expression_1 returns [parsed::condition_expression e]:
+	  b=boolean_expression_2       { $e.append(std::move($b.e), '&'); }
+	  (AND b=boolean_expression_2  { $e.append(std::move($b.e), '&'); } )*
+	;
+boolean_expression_2 returns [parsed::condition_expression e]:
+	  p=primitive_condition        { $e.set_primitive(std::move($p.c)); }
+	| NOT b=boolean_expression_2   { $e = std::move($b.e); $e.apply_not(); }
+	| '(' b=boolean_expression ')' { $e = std::move($b.e); }
+    ;
+
+condition_expression returns [parsed::condition_expression e]:
+    boolean_expression { e=std::move($boolean_expression.e); } EOF;
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <stdexcept>
+#include <vector>
+
+#include "expressions_types.hh"
+
+namespace alternator {
+
+class expressions_syntax_error : public std::runtime_error { // thrown by the parse_*_expression() functions
+public:
+    using runtime_error::runtime_error; // reuse std::runtime_error's constructors
+};
+
+parsed::update_expression parse_update_expression(std::string query);
+std::vector<parsed::path> parse_projection_expression(std::string query);
+parsed::condition_expression parse_condition_expression(std::string query);
+
+} /* namespace alternator */
--- a/alternator/expressions_eval.hh
+++ b/alternator/expressions_eval.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <unordered_set>
+
+#include "rjson.hh"
+#include "schema_fwd.hh"
+
+#include "expressions_types.hh"
+
+namespace alternator {
+
+// calculate_value() behaves slightly different (especially, different
+// functions supported) when used in different types of expressions, as
+// enumerated in this enum:
+enum class calculate_value_caller {
+    UpdateExpression, ConditionExpression, ConditionExpressionAlone
+};
+
+inline std::ostream& operator<<(std::ostream& out, calculate_value_caller caller) {
+    // Writes the name of the expression kind that calculate_value() is
+    // serving. ConditionExpressionAlone is written exactly like
+    // ConditionExpression.
+    // Text for a value that matches none of the enumerators:
+    const char* label = "unknown type of expression";
+    if (caller == calculate_value_caller::UpdateExpression) {
+        label = "UpdateExpression";
+    } else if (caller == calculate_value_caller::ConditionExpression) {
+        label = "ConditionExpression";
+    } else if (caller == calculate_value_caller::ConditionExpressionAlone) {
+        label = "ConditionExpression";
+    }
+
+    out << label;
+    return out;
+}
+
+bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+// NOTE(review): takes the "used" sets as (names, values) - the reverse of verify_condition_expression() below.
+rjson::value calculate_value(const parsed::value& v,
+        calculate_value_caller caller,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        std::unordered_set<std::string>& used_attribute_values,
+        const rjson::value& update_info,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+// Checks previous_item against a parsed ConditionExpression; takes the "used" sets as (values, names).
+bool verify_condition_expression(
+        const parsed::condition_expression& condition_expression,
+        std::unordered_set<std::string>& used_attribute_values,
+        std::unordered_set<std::string>& used_attribute_names,
+        const rjson::value& req,
+        schema_ptr schema,
+        const std::unique_ptr<rjson::value>& previous_item);
+
+} /* namespace alternator */
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <variant>
+
+/*
+ * Parsed representation of expressions and their components.
+ *
+ * Types in alternator::parsed namespace are used for holding the parse
+ * tree - objects generated by the Antlr rules after parsing an expression.
+ * Because of the way Antlr works, all these objects are default-constructed
+ * first, and then assigned when the rule is completed, so all these types
+ * have only default constructors - but setter functions to set them later.
+ */
+
+namespace alternator {
+namespace parsed {
+
+// "path" is an attribute's path in a document, e.g., a.b[3].c.
+class path {
+    // All paths have a "root", a top-level attribute, and any number of
+    // "dereference operators" - each either an index (e.g., "[2]") or a
+    // dot (e.g., ".xyz").
+    std::string _root;
+    std::vector<std::variant<std::string, unsigned>> _operators;
+public:
+    void set_root(std::string root) { // replaces the root attribute name
+        _root = std::move(root);
+    }
+    void add_index(unsigned i) { // appends an index operator, e.g. "[2]"
+        _operators.emplace_back(i);
+    }
+    void add_dot(std::string name) { // appends a dot operator, e.g. ".xyz"
+        _operators.emplace_back(std::move(name));
+    }
+    const std::string& root() const {
+        return _root;
+    }
+    bool has_operators() const { // true once any index or dot operator was added
+        return !_operators.empty();
+    }
+};
+
+// "value" is a value used in the right hand side of an assignment
+// expression, "SET a = ...". It can be a reference to a value included in
+// the request (":val"), a path to an attribute from the existing item
+// (e.g., "a.b[3].c"), or a function of other such values.
+// Note that the real right-hand-side of an assignment is actually a bit
+// more general - it allows either a value, or a value+value or value-value -
+// see class set_rhs below.
+struct value {
+    struct function_call {
+        std::string _function_name;
+        std::vector<value> _parameters;
+    };
+    // Exactly one of: a value reference (kept as its string), an attribute path, or a function call.
+    std::variant<std::string, path, function_call> _value;
+    // Setters, called from the grammar's actions.
+    void set_valref(std::string s) {
+        _value = std::move(s);
+    }
+    void set_path(path p) {
+        _value = std::move(p);
+    }
+    // Starts a function call that has no parameters yet.
+    void set_func_name(std::string s) {
+        _value = function_call {std::move(s), {}};
+    }
+    // Requires a preceding set_func_name(): std::get throws if _value holds another alternative.
+    void add_func_parameter(value v) {
+        auto& call = std::get<function_call>(_value);
+        call._parameters.emplace_back(std::move(v));
+    }
+    // Queries for the alternative currently held.
+    bool is_valref() const { return std::get_if<std::string>(&_value) != nullptr; }
+    bool is_path() const { return std::get_if<path>(&_value) != nullptr; }
+    bool is_func() const { return std::get_if<function_call>(&_value) != nullptr; }
+};
+
+// The right-hand-side of a SET in an update expression can be either a
+// single value (see above), or value+value, or value-value.
+class set_rhs {
+public:
+    char _op = 'v';  // '+', '-', or 'v' (a single value); initialized so it is never read indeterminate
+    value _v1;  // the only operand for 'v'; the first operand for '+' and '-'
+    value _v2;  // the second operand; assigned only by set_plus() / set_minus()
+    void set_value(value&& v1) {
+        _op = 'v';
+        _v1 = std::move(v1);
+    }
+    void set_plus(value&& v2) {
+        _op = '+';
+        _v2 = std::move(v2);
+    }
+    void set_minus(value&& v2) {
+        _op = '-';
+        _v2 = std::move(v2);
+    }
+};
+
+class update_expression {
+public:
+    struct action { // one action: a target path plus exactly one of set / remove / add / del
+        path _path;
+        struct set {
+            set_rhs _rhs;
+        };
+        struct remove {
+        };
+        struct add {
+            std::string _valref;
+        };
+        struct del {
+            std::string _valref;
+        };
+        std::variant<set, remove, add, del> _action;
+        // Each assign_*() stores the target path and selects the matching alternative of _action.
+        void assign_set(path p, set_rhs rhs) {
+            _path = std::move(p);
+            _action = set { std::move(rhs) };
+        }
+        void assign_remove(path p) {
+            _path = std::move(p);
+            _action = remove { };
+        }
+        void assign_add(path p, std::string v) {
+            _path = std::move(p);
+            _action = add { std::move(v) };
+        }
+        void assign_del(path p, std::string v) {
+            _path = std::move(p);
+            _action = del { std::move(v) };
+        }
+    };
+private:
+    std::vector<action> _actions;
+    bool seen_set = false; // the seen_* flags record which kinds of action were added so far
+    bool seen_remove = false;
+    bool seen_add = false;
+    bool seen_del = false;
+public:
+    void add(action a); // stores a and raises the seen_* flag for its kind
+    void append(update_expression other); // merges other; throws expressions_syntax_error if a kind is seen on both sides
+    bool empty() const {
+        return _actions.empty();
+    }
+    const std::vector<action>& actions() const {
+        return _actions;
+    }
+};
+
+// A primitive_condition is a condition expression involving one condition,
+// while the full condition_expression below adds boolean logic over these
+// primitive conditions.
+// The supported primitive conditions are:
+// 1. Binary operators - v1 OP v2, where OP is =, <>, <, <=, >, or >= and
+//    v1 and v2 are values - from the item (an attribute path), the query
+//    (a ":val" reference), or a function of the above (only the size()
+//    function is supported).
+// 2. Ternary operator - v1 BETWEEN v2 and v3 (means v1 >= v2 AND v1 <= v3).
+// 3. N-ary operator - v1 IN ( v2, v3, ... )
+// 4. A single function call (attribute_exists etc.). The parser actually
+//    accepts a more general "value" here but later stages reject a value
+//    which is not a function call (because DynamoDB does it too).
+class primitive_condition {
+public:
+    enum class type {
+        UNDEFINED, VALUE, EQ, NE, LT, LE, GT, GE, BETWEEN, IN // VALUE: a single value with no operator after it
+    };
+    type _op = type::UNDEFINED; // stays UNDEFINED until set_operator() is called
+    std::vector<value> _values; // operands, in the order add_value() received them
+    void set_operator(type op) {
+        _op = op;
+    }
+    void add_value(value&& v) {
+        _values.push_back(std::move(v));
+    }
+    bool empty() const { // true while no operator has been set
+        return _op == type::UNDEFINED;
+    }
+};
+
+class condition_expression {
+public:
+    bool _negated = false; // When set, the result of the whole condition is inverted
+    struct condition_list {
+        char op = '|'; // '&' or '|': the operator joining all members
+        std::vector<condition_expression> conditions;
+    };
+    std::variant<primitive_condition, condition_list> _expression = condition_list();
+
+    void set_primitive(primitive_condition&& p) {
+        _expression = std::move(p);
+    }
+    void append(condition_expression&& c, char op);
+    void apply_not() {
+        _negated = !_negated;
+    }
+    bool empty() const {
+        const auto* list = std::get_if<condition_list>(&_expression);
+        return list != nullptr && list->conditions.empty();
+    }
+};
+
+} // namespace parsed
+} // namespace alternator
--- a/alternator/rjson.cc
+++ b/alternator/rjson.cc
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "rjson.hh"
+#include "error.hh"
+#include <seastar/core/print.hh>
+#include <seastar/core/thread.hh>
+
+namespace rjson {
+
+static allocator the_allocator;
+
+/*
+ * This wrapper class adds nested level checks to rapidjson's handlers.
+ * Each rapidjson handler implements functions for accepting JSON values,
+ * which includes strings, numbers, objects, arrays, etc.
+ * Parsing objects and arrays needs to be performed carefully with regard
+ * to stack overflow - each object/array layer adds another stack frame
+ * to parsing, printing and destroying the parent JSON document.
+ * To prevent stack overflow, a rapidjson handler can be wrapped with
+ * guarded_yieldable_json_handler, which accepts an additional max_nested_level parameter.
+ * After trying to exceed the max nested level, a proper rjson::error will be thrown.
+ */
+template<typename Handler, bool EnableYield>
+struct guarded_yieldable_json_handler : public Handler {
+    size_t _nested_level = 0; // number of objects/arrays currently open
+    size_t _max_nested_level; // check_nested_level() throws once _nested_level exceeds this
+public:
+    using handler_base = Handler;
+
+    explicit guarded_yieldable_json_handler(size_t max_nested_level) : _max_nested_level(max_nested_level) {}
+    guarded_yieldable_json_handler(string_buffer& buf, size_t max_nested_level)
+            : handler_base(buf), _max_nested_level(max_nested_level) {}
+
+    void Parse(const char* str, size_t length) { // throws rjson::error if the reader reports a parse error
+        rapidjson::MemoryStream ms(static_cast<const char*>(str), length * sizeof(typename encoding::Ch));
+        rapidjson::EncodedInputStream<encoding, rapidjson::MemoryStream> is(ms);
+        rapidjson::GenericReader<encoding, encoding, allocator> reader(&the_allocator);
+        reader.Parse(is, *this); // *this is the SAX handler, so the guarded callbacks below are used
+        if (reader.HasParseError()) {
+            throw rjson::error(format("Parsing JSON failed: {}", rapidjson::GetParseError_En(reader.GetParseErrorCode())));
+        }
+        //NOTICE: The handler has parsed the string, but in case of rapidjson::GenericDocument
+        // the data now resides in an internal stack_ variable, which is private instead of
+        // protected... which means we cannot simply access its data. Fortunately, another
+        // function for populating documents from SAX events can be abused to extract the data
+        // from the stack via gadget-oriented programming - we use an empty event generator
+        // which does nothing, and use it to call Populate(), which assumes that the generator
+        // will fill the stack with something. It won't, but our stack is already filled with
+        // data we want to steal, so once Populate() ends, our document will be properly parsed.
+        // A proper solution could be programmed once rapidjson declares this stack_ variable
+        // as protected instead of private, so that this class can access it.
+        auto dummy_generator = [](handler_base&){return true;};
+        handler_base::Populate(dummy_generator);
+    }
+
+    bool StartObject() {
+        ++_nested_level; // count the new level first, then check it against the limit
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartObject();
+    }
+
+    bool EndObject(rapidjson::SizeType elements_count = 0) {
+        --_nested_level; // leaving the level entered by StartObject()
+        return handler_base::EndObject(elements_count);
+    }
+
+    bool StartArray() {
+        ++_nested_level; // arrays share the depth counter with objects
+        check_nested_level();
+        maybe_yield();
+        return handler_base::StartArray();
+    }
+
+    bool EndArray(rapidjson::SizeType elements_count = 0) {
+        --_nested_level; // leaving the level entered by StartArray()
+        return handler_base::EndArray(elements_count);
+    }
+
+    bool Null()                 { maybe_yield(); return handler_base::Null(); }
+    bool Bool(bool b)           { maybe_yield(); return handler_base::Bool(b); }
+    bool Int(int i)             { maybe_yield(); return handler_base::Int(i); }
+    bool Uint(unsigned u)       { maybe_yield(); return handler_base::Uint(u); }
+    bool Int64(int64_t i64)     { maybe_yield(); return handler_base::Int64(i64); }
+    bool Uint64(uint64_t u64)   { maybe_yield(); return handler_base::Uint64(u64); }
+    bool Double(double d)       { maybe_yield(); return handler_base::Double(d); }
+    bool String(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::String(str, length, copy); }
+    bool Key(const value::Ch* str, size_t length, bool copy = false) { maybe_yield(); return handler_base::Key(str, length, copy); }
+
+
+protected:
+    static void maybe_yield() { // compiles to nothing unless EnableYield is true
+        if constexpr (EnableYield) {
+            thread::maybe_yield();
+        }
+    }
+
+    void check_nested_level() const { // throws rjson::error when the depth limit is exceeded
+        if (RAPIDJSON_UNLIKELY(_nested_level > _max_nested_level)) {
+            throw rjson::error(format("Max nested level reached: {}", _max_nested_level));
+        }
+    }
+};
+
+std::string print(const rjson::value& value) { // serializes value to dense JSON text
+    string_buffer buffer;
+    guarded_yieldable_json_handler<writer, false> writer(buffer, 78); // rjson::error beyond 78 nested levels
+    value.Accept(writer);
+    return std::string(buffer.GetString(), buffer.GetSize()); // size is known: no strlen scan needed
+}
+
+rjson::value copy(const rjson::value& value) {
+    return rjson::value(value, the_allocator); // explicit copy, allocating from the_allocator
+}
+
+rjson::value parse(std::string_view str) {
+    guarded_yieldable_json_handler<document, false> d(78); // non-yielding; at most 78 nested levels
+    d.Parse(str.data(), str.size()); // throws rjson::error on a reader error or too deep nesting
+    if (d.HasParseError()) { // NOTE(review): Parse() above already throws on reader errors - confirm this can still fire
+        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
+    }
+    rjson::value& v = d; // refer to the document through its rjson::value base
+    return std::move(v); // v is a reference, so the move has to be spelled out
+}
+
+rjson::value parse_yieldable(std::string_view str) {
+    guarded_yieldable_json_handler<document, true> d(78); // yielding variant; at most 78 nested levels
+    d.Parse(str.data(), str.size()); // throws rjson::error on a reader error or too deep nesting
+    if (d.HasParseError()) { // NOTE(review): Parse() above already throws on reader errors - confirm this can still fire
+        throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
+    }
+    rjson::value& v = d; // refer to the document through its rjson::value base
+    return std::move(v); // v is a reference, so the move has to be spelled out
+}
+
+rjson::value& get(rjson::value& value, std::string_view name) {
+    // FindMember() is deliberately called with a GenericValue key, not a
+    // StringRef: the StringRef variant ignores the given length
+    // (see https://github.com/Tencent/rapidjson/issues/1649). The GenericValue
+    // variant doesn't share this bug, and a string GenericValue can be
+    // created without copying the string.
+    const auto found = value.FindMember(rjson::value(name.data(), name.size()));
+    if (found == value.MemberEnd()) {
+        throw rjson::error(format("JSON parameter {} not found", name));
+    }
+    return found->value;
+}
+
+const rjson::value& get(const rjson::value& value, std::string_view name) {
+    // Look the member up through a string value that refers to name without copying it.
+    const auto found = value.FindMember(rjson::value(name.data(), name.size()));
+    if (found == value.MemberEnd()) {
+        throw rjson::error(format("JSON parameter {} not found", name));
+    }
+    return found->value;
+}
+
+rjson::value from_string(const std::string& str) {
+    return rjson::value(str.c_str(), str.size(), the_allocator); // allocator overload: the string is copied
+}
+
+rjson::value from_string(const sstring& str) {
+    return rjson::value(str.c_str(), str.size(), the_allocator); // allocator overload: the string is copied
+}
+
+rjson::value from_string(const char* str, size_t size) {
+    return rjson::value(str, size, the_allocator); // copies exactly size chars starting at str
+}
+
+rjson::value from_string(std::string_view view) {
+    return rjson::value(view.data(), view.size(), the_allocator); // copied, so view may go away afterwards
+}
+
+const rjson::value* find(const rjson::value& value, std::string_view name) {
+    // The lookup key is a string GenericValue rather than a StringRef: the
+    // StringRef variant of FindMember() ignores the given length
+    // (see https://github.com/Tencent/rapidjson/issues/1649), and a string
+    // GenericValue can be created without copying the string.
+    const auto found = value.FindMember(rjson::value(name.data(), name.size()));
+    return found == value.MemberEnd() ? nullptr : &found->value;
+}
+
+rjson::value* find(rjson::value& value, std::string_view name) {
+    const auto found = value.FindMember(rjson::value(name.data(), name.size()));
+    return found == value.MemberEnd() ? nullptr : &found->value;
+}
+
+bool remove_member(rjson::value& value, std::string_view name) {
+    // Although RemoveMember() has a variant taking a StringRef, it ignores the
+    // given length (see https://github.com/Tencent/rapidjson/issues/1649).
+    // Luckily, the variant taking a GenericValue doesn't share this bug,
+    // and we can create a string GenericValue without copying the string.
+    return value.RemoveMember(rjson::value(name.data(), name.size())); // forwards RemoveMember()'s result (presumably "member existed" - confirm)
+}
+
+void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member) {
+    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), std::move(member), the_allocator); // name copied, member moved
+}
+
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), std::move(member), the_allocator); // name copied, member moved
+}
+
+void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member) {
+    base.AddMember(rjson::value(name.c_str(), name.size(), the_allocator), rjson::value(member), the_allocator); // name copied; member only referenced
+}
+
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member) {
+    base.AddMember(rjson::value(name.data(), name.size(), the_allocator), rjson::value(member), the_allocator); // name copied; member only referenced
+}
+
+void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member) {
+    base.AddMember(name, std::move(member), the_allocator); // name is referenced, not copied; member is moved
+}
+
+void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type member) {
+    base.AddMember(name, rjson::value(member), the_allocator); // neither name nor member string is copied
+}
+
+void push_back(rjson::value& base_array, rjson::value&& item) {
+    base_array.PushBack(std::move(item), the_allocator); // item is moved to the end of base_array
+
+}
+
+// "Less than" predicate over scalar JSON values. Throws rjson::error for objects,
+// arrays, and for mixed types other than null/false/true.
+bool single_value_comp::operator()(const rjson::value& r1, const rjson::value& r2) const {
+   auto r1_type = r1.GetType();
+   auto r2_type = r2.GetType();
+
+   // null is the smallest type and compares with every other type, nothing is lesser than null
+   if (r1_type == rjson::type::kNullType || r2_type == rjson::type::kNullType) {
+       return r1_type < r2_type;
+   }
+   // only null, true, and false are comparable with each other, other types are not compatible
+   if (r1_type != r2_type && (r1_type > rjson::type::kTrueType || r2_type > rjson::type::kTrueType)) {
+       throw rjson::error(format("Types are not comparable: {} {}", r1, r2));
+   }
+
+   switch (r1_type) {
+   case rjson::type::kNullType:
+   case rjson::type::kFalseType:
+   case rjson::type::kTrueType:
+       return r1_type < r2_type;
+   case rjson::type::kObjectType:
+       throw rjson::error("Object type comparison is not supported");
+   case rjson::type::kArrayType:
+       throw rjson::error("Array type comparison is not supported");
+   case rjson::type::kStringType: {
+       const size_t r1_len = r1.GetStringLength();
+       const size_t r2_len = r2.GetStringLength();
+       size_t len = std::min(r1_len, r2_len);
+       // JSON strings may contain embedded NUL bytes (\u0000), at which strncmp()
+       // would stop early and report distinct strings as equal; compare all bytes.
+       int result = std::memcmp(r1.GetString(), r2.GetString(), len);
+       return result < 0 || (result == 0 && r1_len < r2_len);
+   }
+   case rjson::type::kNumberType: {
+       if (r1.IsInt() && r2.IsInt()) {
+           return r1.GetInt() < r2.GetInt();
+       } else if (r1.IsUint() && r2.IsUint()) {
+           return r1.GetUint() < r2.GetUint();
+       } else if (r1.IsInt64() && r2.IsInt64()) {
+           return r1.GetInt64() < r2.GetInt64();
+       } else if (r1.IsUint64() && r2.IsUint64()) {
+           return r1.GetUint64() < r2.GetUint64();
+       } else {
+           // it's safe to call GetDouble() on any number type
+           return r1.GetDouble() < r2.GetDouble();
+       }
+   }
+   default:
+       return false;
+   }
+}
+
+} // end namespace rjson
+
+std::ostream& std::operator<<(std::ostream& os, const rjson::value& v) {
+    return os << rjson::print(v); // streams the JSON text produced by rjson::print()
+}
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+/*
+ * rjson is a wrapper over rapidjson library, providing fast JSON parsing and generation.
+ *
+ * rapidjson has strict copy elision policies, which, among other things, involves
+ * using provided char arrays without copying them and allows copying objects only explicitly.
+ * As such, one should be careful when passing strings with limited liveness
+ * (e.g. data underneath local std::strings) to rjson functions, because created JSON objects
+ * may end up relying on dangling char pointers. The rjson functions that create JSON values from
+ * strings come in two flavors: one for string_ref_type (more optimal, used when the string is known to live
+ * at least as long as the object, e.g. a static char array) and one for std::strings. The more optimal
+ * variants should be used *only* if the liveness of the string is guaranteed, otherwise it will
+ * result in undefined behaviour.
+ * Also, bear in mind that methods exposed by rjson::value are generic, but some of them
+ * work fine only for specific types. In case the type does not match, an rjson::error will be thrown.
+ * Examples of such mismatched usages is calling MemberCount() on a JSON value not of object type
+ * or calling Size() on a non-array value.
+ */
+
+#include <string>
+#include <stdexcept>
+
+namespace rjson {
+class error : public std::exception {
+    std::string _msg; // owned copy of the message; what() points into it
+public:
+    error() = default; // leaves the message empty
+    error(const std::string& msg) : _msg(msg) {}
+
+    virtual const char* what() const noexcept override { return _msg.c_str(); }
+};
+}
+
+// rapidjson configuration macros
+#define RAPIDJSON_HAS_STDSTRING 1
+// Default rapidjson policy is to use assert() - which is dangerous for two reasons:
+// 1. assert() can be turned off with -DNDEBUG
+// 2. assert() crashes a program
+// Fortunately, the default policy can be overridden, and so rapidjson errors will
+// throw an rjson::error exception instead.
+#define RAPIDJSON_ASSERT(x) do { if (!(x)) throw rjson::error(std::string("JSON error: condition not met: ") + #x); } while (0)
+
+#include <rapidjson/document.h>
+#include <rapidjson/writer.h>
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/error/en.h>
+#include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
+namespace rjson {
+
+using allocator = rapidjson::CrtAllocator;
+using encoding = rapidjson::UTF8<>;
+using document = rapidjson::GenericDocument<encoding, allocator>;
+using value = rapidjson::GenericValue<encoding, allocator>;
+using string_ref_type = value::StringRefType;
+using string_buffer = rapidjson::GenericStringBuffer<encoding>;
+using writer = rapidjson::Writer<string_buffer, encoding>;
+using type = rapidjson::Type;
+
+// Returns a JSON value holding null (a value of kNullType, not a JSON object)
+inline rjson::value null_value() {
+    return rjson::value(rapidjson::kNullType);
+}
+
+// Returns an empty JSON object - {} - to which members can be added with rjson::set()
+inline rjson::value empty_object() {
+    return rjson::value(rapidjson::kObjectType);
+}
+
+// Returns an empty JSON array - [] - to which items can be added with rjson::push_back()
+inline rjson::value empty_array() {
+    return rjson::value(rapidjson::kArrayType);
+}
+
+// Returns an empty JSON string - "" (a value of kStringType)
+inline rjson::value empty_string() {
+    return rjson::value(rapidjson::kStringType);
+}
+
+// Convert the JSON value to a string with JSON syntax, the opposite of parse().
+// The representation is dense - without any redundant indentation.
+std::string print(const rjson::value& value);
+
+// Returns a string_view to the string held in a JSON value (which is
+// assumed to hold a string, i.e., v.IsString() == true). This is a view
+// to the existing data - no copying is done, so it must not outlive that data.
+inline std::string_view to_string_view(const rjson::value& v) {
+    return std::string_view(v.GetString(), v.GetStringLength()); // explicit length keeps embedded NULs
+}
+
+// Copies given JSON value - involves allocation
+rjson::value copy(const rjson::value& value);
+
+// Parses a JSON value from given string or raw character array.
+// The string/char array liveness does not need to be persisted,
+// as parse() will allocate member names and values.
+// Throws rjson::error if parsing failed.
+rjson::value parse(std::string_view str);
+// Needs to be run in thread context
+rjson::value parse_yieldable(std::string_view str);
+
+// Creates a JSON value (of JSON string type) out of internal string representations.
+// The string value is copied, so str's liveness does not need to be persisted.
+rjson::value from_string(const std::string& str);
+rjson::value from_string(const sstring& str);
+rjson::value from_string(const char* str, size_t size);
+rjson::value from_string(std::string_view view);
+
+// Returns a pointer to JSON member if it exists, nullptr otherwise
+rjson::value* find(rjson::value& value, std::string_view name);
+const rjson::value* find(const rjson::value& value, std::string_view name);
+
+// Returns a reference to JSON member if it exists, throws otherwise
+rjson::value& get(rjson::value& value, std::string_view name);
+const rjson::value& get(const rjson::value& value, std::string_view name);
+
+// Sets a member in given JSON object by moving the member - allocates the name.
+// Throws if base is not a JSON object.
+void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);
+
+// Sets a string member in given JSON object by assigning its reference - allocates the name.
+// NOTICE: member string liveness must be ensured to be at least as long as base's.
+// Throws if base is not a JSON object.
+void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
+void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);
+
+// Sets a member in given JSON object by moving the member.
+// NOTICE: name liveness must be ensured to be at least as long as base's.
+// Throws if base is not a JSON object.
+void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member);
+
+// Sets a string member in given JSON object by assigning its reference.
+// NOTICE: name liveness must be ensured to be at least as long as base's.
+// NOTICE: member liveness must be ensured to be at least as long as base's.
+// Throws if base is not a JSON object.
+void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type member);
+
+// Adds a value to a JSON list by moving the item to its end.
+// Throws if base_array is not a JSON array.
+void push_back(rjson::value& base_array, rjson::value&& item);
+
+// Remove a member from a JSON object. Throws if value isn't an object.
+bool remove_member(rjson::value& value, std::string_view name);
+
+struct single_value_comp { // comparator functor over rjson values
+    bool operator()(const rjson::value& r1, const rjson::value& r2) const; // declared here, defined in another file
+};
+
+} // end namespace rjson
+
+namespace std {
+std::ostream& operator<<(std::ostream& os, const rjson::value& v);
+}
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastarx.hh>
+#include <service/storage_proxy.hh>
+#include <service/storage_proxy.hh>
+#include "rjson.hh"
+#include "executor.hh"
+
+namespace alternator {
+
+// An rmw_operation encapsulates the common logic of all the item update
+// operations which may involve a read of the item before the write
+// (so-called Read-Modify-Write operations). These operations include PutItem,
+// UpdateItem and DeleteItem: All of these may be conditional operations (the
+// "Expected" parameter) which require a read before the write, and UpdateItem
+// may also have an update expression which refers to the item's old value.
+//
+// The code below supports running the read and the write together as one
+// transaction using LWT (this is why rmw_operation is a subclass of
+// cas_request, as required by storage_proxy::cas()), but also has optional
+// modes not using LWT.
+class rmw_operation : public service::cas_request, public enable_shared_from_this<rmw_operation> {
+public:
+    // The following options choose which mechanism to use for isolating
+    // parallel write operations:
+    // * The FORBID_RMW option forbids RMW (read-modify-write) operations
+    //   such as conditional updates. For the remaining write-only
+    //   operations, ordinary quorum writes are isolated enough.
+    // * The LWT_ALWAYS option always uses LWT (lightweight transactions)
+    //   for any write operation - whether or not it also has a read.
+    // * The LWT_RMW_ONLY option uses LWT only for RMW operations, and uses
+    //   ordinary quorum writes for write-only operations.
+    //   This option is not safe if the user may send both RMW and write-only
+    //   operations on the same item.
+    // * The UNSAFE_RMW option does read-modify-write operations as separate
+    //   read and write. It is unsafe - concurrent RMW operations are not
+    //   isolated at all. This option will likely be removed in the future.
+    enum class write_isolation {
+        FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
+    };
+    static constexpr auto WRITE_ISOLATION_TAG_KEY = "system:write_isolation"; // presumably the tag read by get_write_isolation_for_schema() - confirm
+
+    static write_isolation get_write_isolation_for_schema(schema_ptr schema);
+
+    static write_isolation default_write_isolation; // changed through set_default_write_isolation()
+public:
+    static void set_default_write_isolation(std::string_view mode);
+
+protected:
+    // The full request JSON
+    rjson::value _request;
+    // All RMW operations involve a single item with a specific partition
+    // and optional clustering key, in a single table, so the following
+    // information is common to all of them:
+    schema_ptr _schema;
+    partition_key _pk = partition_key::make_empty();
+    clustering_key _ck = clustering_key::make_empty();
+    write_isolation _write_isolation;
+
+    // All RMW operations can have a ReturnValues parameter from the following
+    // choices. But note that only UpdateItem actually supports all of them:
+    enum class returnvalues {
+        NONE, ALL_OLD, UPDATED_OLD, ALL_NEW, UPDATED_NEW
+    } _returnvalues; // no in-class default; NOTE(review): confirm the constructor always sets it
+    static returnvalues parse_returnvalues(const rjson::value& request);
+    // When _returnvalues != NONE, apply() should store here, in JSON form,
+    // the values which are to be returned in the "Attributes" field.
+    // The default null JSON means do not return an Attributes field at all.
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
+public:
+    // The constructor of a rmw_operation subclass should parse the request
+    // and try to discover as many input errors as it can before really
+    // attempting the read or write operations.
+    rmw_operation(service::storage_proxy& proxy, rjson::value&& request);
+    // rmw_operation subclasses (update_item_operation, put_item_operation
+    // and delete_item_operation) shall implement an apply() function which
+    // takes the previous value of the item (if it was read) and creates the
+    // write mutation. If the previous value of item does not pass the needed
+    // conditional expression, apply() should return an empty optional.
+    // apply() may throw if it encounters input errors not discovered during
+    // the constructor.
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
+    // Convert the above apply() into the signature needed by cas_request:
+    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
+    virtual ~rmw_operation() = default;
+    schema_ptr schema() const { return _schema; }
+    const rjson::value& request() const { return _request; }
+    rjson::value&& move_request() && { return std::move(_request); } // rvalue-qualified: callable only on an rvalue rmw_operation
+    future<executor::request_return_type> execute(service::storage_proxy& proxy,
+            service::client_state& client_state,
+            tracing::trace_state_ptr trace_state,
+            service_permit permit,
+            bool needs_read_before_write,
+            stats& stats);
+    std::optional<shard_id> shard_for_execute(bool needs_read_before_write); // NOTE(review): meaning of an empty optional is not visible here
+};
+
+} // namespace alternator
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "base64.hh"
+#include "log.hh"
+#include "serialization.hh"
+#include "error.hh"
+#include "rapidjson/writer.h"
+#include "concrete_types.hh"
+#include "cql3/type_json.hh"
+
+static logging::logger slogger("alternator-serialization");
+
+namespace alternator {
+
+// Maps a DynamoDB type tag to its alternator_type and backing Scylla type; unknown tags yield NOT_SUPPORTED_YET.
+type_info type_info_from_string(std::string type) {
+    static thread_local const std::unordered_map<std::string, type_info> known_tags = {
+        {"S", {alternator_type::S, utf8_type}},
+        {"B", {alternator_type::B, bytes_type}},
+        {"BOOL", {alternator_type::BOOL, boolean_type}},
+        {"N", {alternator_type::N, decimal_type}}, //FIXME: Replace with custom Alternator type when implemented
+    };
+    if (const auto found = known_tags.find(type); found != known_tags.end()) {
+        return found->second;
+    }
+    return {alternator_type::NOT_SUPPORTED_YET, utf8_type};
+}
+
+// Inverse of type_info_from_string() for supported tags; throws std::runtime_error for any other alternator_type.
+type_representation represent_type(alternator_type atype) {
+    static thread_local const std::unordered_map<alternator_type, type_representation> by_tag = {
+        {alternator_type::S, {"S", utf8_type}},
+        {alternator_type::B, {"B", bytes_type}},
+        {alternator_type::BOOL, {"BOOL", boolean_type}},
+        {alternator_type::N, {"N", decimal_type}}, //FIXME: Replace with custom Alternator type when implemented
+    };
+    if (const auto found = by_tag.find(atype); found != by_tag.end()) {
+        return found->second;
+    }
+    throw std::runtime_error(format("Unknown alternator type {}", int8_t(atype)));
+}
+
+// Visitor for visit(): encodes the JSON value `v` as the visited Scylla type and appends the bytes to `bo`.
+struct from_json_visitor {
+    const rjson::value& v;
+    bytes_ostream& bo;
+    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), from_json_visitor{v, bo}); }
+    void operator()(const string_type_impl& t) const {
+        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+    }
+    void operator()(const bytes_type_impl& t) const {
+        bo.write(base64_decode(v));
+    }
+    void operator()(const boolean_type_impl& t) const {
+        bo.write(boolean_type->decompose(v.GetBool()));
+    }
+    void operator()(const decimal_type_impl& t) const {
+        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+    }
+    // Every other type: print the JSON value and hand it to from_json_object().
+    void operator()(const abstract_type& t) const {
+        bo.write(from_json_object(t, Json::Value(rjson::print(v)), cql_serialization_format::internal()));
+    }
+};
+
+// Serializes a single-attribute typed value such as {"S": "abc"}: a one-byte alternator_type tag followed by the payload.
+bytes serialize_item(const rjson::value& item) {
+    // Check IsObject() first: MemberCount()/MemberBegin() are only meaningful for JSON objects.
+    if (!item.IsObject() || item.MemberCount() != 1) {
+        throw api_error("ValidationException", format("An item can contain only one attribute definition: {}", item));
+    }
+    auto it = item.MemberBegin();
+    type_info type_info = type_info_from_string(it->name.GetString()); // JSON keys are guaranteed to be strings
+    if (type_info.atype == alternator_type::NOT_SUPPORTED_YET) {
+        // No dedicated encoding for this tag: store the whole item's JSON text after the tag byte.
+        slogger.trace("Non-optimal serialization of type {}", it->name.GetString());
+        return bytes{int8_t(type_info.atype)} + to_bytes(rjson::print(item));
+    }
+    bytes_ostream bo;
+    bo.write(bytes{int8_t(type_info.atype)});
+    visit(*type_info.dtype, from_json_visitor{it->value, bo});
+    return bytes(bo.linearize());
+}
+
+// Visitor for visit(): decodes `bv` as the visited Scylla type and stores it in `deserialized` under key `type_ident`.
+struct to_json_visitor {
+    rjson::value& deserialized;
+    const std::string& type_ident;
+    bytes_view bv;
+    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), to_json_visitor{deserialized, type_ident, bv}); }
+    void operator()(const decimal_type_impl& t) const {
+        auto s = to_json_string(*decimal_type, bytes(bv));
+        //FIXME(sarna): unnecessary copy
+        rjson::set_with_string_name(deserialized, type_ident, rjson::from_string(s));
+    }
+    void operator()(const string_type_impl& t) const {
+        rjson::set_with_string_name(deserialized, type_ident, rjson::from_string(reinterpret_cast<const char *>(bv.data()), bv.size()));
+    }
+    void operator()(const bytes_type_impl& t) const {
+        std::string b64 = base64_encode(bv);
+        rjson::set_with_string_name(deserialized, type_ident, rjson::from_string(b64));
+    }
+    // Every other type: convert with to_json_string() and parse the resulting JSON text.
+    void operator()(const abstract_type& t) const {
+        rjson::set_with_string_name(deserialized, type_ident, rjson::parse(to_json_string(t, bytes(bv))));
+    }
+};
+
+// Decodes bytes produced by serialize_item() back into a typed JSON value:
+// the leading byte is the alternator_type tag, the remainder is the payload.
+rjson::value deserialize_item(bytes_view bv) {
+    if (bv.empty()) {
+        throw api_error("ValidationException", "Serialized value empty");
+    }
+    const auto tag = alternator_type(bv[0]);
+    bv.remove_prefix(1);
+    if (tag == alternator_type::NOT_SUPPORTED_YET) {
+        // Fallback encoding: the payload is the item's JSON text, so just parse it.
+        slogger.trace("Non-optimal deserialization of alternator type {}", int8_t(tag));
+        return rjson::parse(std::string_view(reinterpret_cast<const char *>(bv.data()), bv.size()));
+    }
+    const type_representation repr = represent_type(tag);
+    rjson::value result(rapidjson::kObjectType);
+    visit(*repr.dtype, to_json_visitor{result, repr.ident, bv});
+    return result;
+}
+
+// Returns the DynamoDB type tag ("S", "B", "BOOL" or "N") used to present values of the given Scylla type.
+std::string type_to_string(data_type type) {
+    static thread_local std::unordered_map<data_type, std::string> tags = {
+        {utf8_type, "S"},
+        {bytes_type, "B"},
+        {boolean_type, "BOOL"},
+        {decimal_type, "N"}, // FIXME: use a specialized Alternator number type instead of the general decimal_type
+    };
+    const auto found = tags.find(type);
+    if (found != tags.end()) {
+        return found->second;
+    }
+    // Any other (internal Scylla) type falls back to "S", so it can still be presented in a human-readable way.
+    return "S";
+}
+
+// Looks up key column "column" in "item" and decodes its typed value; throws ValidationException if it is absent.
+bytes get_key_column_value(const rjson::value& item, const column_definition& column) {
+    std::string key_name = column.name_as_text();
+    if (const rjson::value* typed_value = rjson::find(item, key_name)) {
+        return get_key_from_typed_value(*typed_value, column);
+    }
+    throw api_error("ValidationException", format("Key column {} not found", key_name));
+}
+
+// Decodes a key attribute given in DynamoDB's typed form, i.e. a JSON object
+// with exactly one member: the member name is the type tag, which has to
+// agree with the key column's type, and the member value is a string
+// holding the encoded key (base64 for binary keys).
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
+    const bool well_formed = key_typed_value.IsObject() && key_typed_value.MemberCount() == 1
+            && key_typed_value.MemberBegin()->value.IsString();
+    if (!well_formed) {
+        throw api_error("ValidationException",
+                format("Malformed value object for key column {}: {}",
+                        column.name_as_text(), key_typed_value));
+    }
+    const auto member = key_typed_value.MemberBegin();
+    if (member->name != type_to_string(column.type)) {
+        throw api_error("ValidationException",
+                format("Type mismatch: expected type {} for key column {}, got type {}",
+                        type_to_string(column.type), column.name_as_text(), member->name.GetString()));
+    }
+    // Binary keys travel base64-encoded; every other key type is parsed from its string form.
+    if (column.type == bytes_type) {
+        return base64_decode(member->value);
+    }
+    return column.type->from_string(rjson::to_string_view(member->value));
+}
+
+// Renders the serialized key cell of "column" as a JSON string value.
+rjson::value json_key_column_value(bytes_view cell, const column_definition& column) {
+    if (column.type == bytes_type) {
+        // Binary keys are presented base64-encoded.
+        std::string b64 = base64_encode(cell);
+        return rjson::from_string(b64);
+    }
+    if (column.type == utf8_type) {
+        return rjson::from_string(std::string(reinterpret_cast<const char*>(cell.data()), cell.size()));
+    }
+    if (column.type == decimal_type) {
+        // FIXME: use specialized Alternator number type, not the more general "decimal_type".
+        // A dedicated type can be more efficient in storage space and in parsing speed.
+        auto s = to_json_string(*decimal_type, bytes(cell));
+        return rjson::from_string(s);
+    }
+    // Arbitrary key types show up when parsing values of virtual tables, which can use any Scylla type.
+    // To keep the result parsable by alternator clients, such keys are represented simply as strings.
+    return rjson::from_string(column.type->to_string(bytes(cell)));
+}
+
+
+// Builds the partition key of "item" from the values of the schema's partition key columns.
+partition_key pk_from_json(const rjson::value& item, schema_ptr schema) {
+    std::vector<bytes> components;
+    // FIXME: this is a loop, but we really allow only one partition key column.
+    for (const column_definition& column : schema->partition_key_columns()) {
+        components.push_back(get_key_column_value(item, column));
+    }
+    return partition_key::from_exploded(components);
+}
+
+// Builds the clustering key of "item" from the values of the schema's clustering key
+// columns; yields an empty key when the schema has no clustering columns.
+clustering_key ck_from_json(const rjson::value& item, schema_ptr schema) {
+    if (schema->clustering_key_size() == 0) {
+        return clustering_key::make_empty();
+    }
+    std::vector<bytes> components;
+    // FIXME: this is a loop, but we really allow only one clustering key column.
+    for (const column_definition& column : schema->clustering_key_columns()) {
+        components.push_back(get_key_column_value(item, column));
+    }
+    return clustering_key::from_exploded(components);
+}
+
+// Extracts the number from a typed value {"N": ...}; failures throw ValidationException prefixed with "diagnostic".
+big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        throw api_error("ValidationException", format("{}: invalid number object", diagnostic));
+    }
+    const auto member = v.MemberBegin();
+    if (member->name != "N") {
+        throw api_error("ValidationException", format("{}: expected number, found type '{}'", diagnostic, member->name));
+    }
+    const rjson::value& payload = member->value;
+    if (payload.IsString()) {
+        return big_decimal(payload.GetString());
+    }
+    // FIXME(sarna): should use big_decimal constructor with numeric values directly:
+    if (payload.IsNumber()) { return big_decimal(rjson::print(payload)); }
+    throw api_error("ValidationException", format("{}: improperly formatted number constant", diagnostic));
+}
+
+// Recognizes a typed set value ({"SS"|"NS"|"BS": ...}); yields {tag, &contents}, or {"", nullptr} when v is not a set.
+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
+    if (v.IsObject() && v.MemberCount() == 1) {
+        const auto member = v.MemberBegin();
+        const std::string tag = member->name.GetString();
+        if (tag == "SS" || tag == "BS" || tag == "NS") {
+            return std::make_pair(tag, &(member->value));
+        }
+    }
+    return {"", nullptr};
+}
+
+}
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+#include "types.hh"
+#include "schema_fwd.hh"
+#include "keys.hh"
+#include "rjson.hh"
+#include "utils/big_decimal.hh"
+
+namespace alternator {
+
+enum class alternator_type : int8_t { // the underlying value is the first byte written by serialize_item()
+    S, B, BOOL, N, NOT_SUPPORTED_YET // NOT_SUPPORTED_YET: no dedicated encoding, the value is kept as JSON text
+};
+
+struct type_info { // result of type_info_from_string()
+    alternator_type atype; // tag identifying the DynamoDB type
+    data_type dtype; // Scylla type used to encode values carrying that tag
+};
+
+struct type_representation { // result of represent_type()
+    std::string ident; // DynamoDB type tag, e.g. "S"
+    data_type dtype; // Scylla type used to decode values carrying that tag
+};
+
+type_info type_info_from_string(std::string type); // unknown tags yield NOT_SUPPORTED_YET
+type_representation represent_type(alternator_type atype); // throws std::runtime_error for tags without a representation
+
+bytes serialize_item(const rjson::value& item); // one-byte type tag followed by the encoded value
+rjson::value deserialize_item(bytes_view bv); // inverse of serialize_item()
+
+std::string type_to_string(data_type type); // falls back to "S" for types without a DynamoDB tag
+
+bytes get_key_column_value(const rjson::value& item, const column_definition& column);
+bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column);
+rjson::value json_key_column_value(bytes_view cell, const column_definition& column);
+
+partition_key pk_from_json(const rjson::value& item, schema_ptr schema);
+clustering_key ck_from_json(const rjson::value& item, schema_ptr schema);
+
+// If v encodes a number (i.e., it is a {"N": ...}), returns a big_decimal representing it. Otherwise,
+// raises ValidationException with diagnostic.
+big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic);
+
+// Checks if a given JSON object encodes a set (i.e., it is a {"SS": [...]}, or likewise "NS" or "BS")
+// and returns the set's type and a pointer to that set. If the object does not encode a set,
+// the returned value is {"", nullptr}.
+const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v);
+
+}
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -0,0 +1,487 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "alternator/server.hh"
+#include "log.hh"
+#include <seastar/http/function_handlers.hh>
+#include <seastar/json/json_elements.hh>
+#include <seastarx.hh>
+#include "error.hh"
+#include "rjson.hh"
+#include "auth.hh"
+#include <cctype>
+#include "cql3/query_processor.hh"
+#include "service/storage_service.hh"
+#include "utils/overloaded_functor.hh"
+
+static logging::logger slogger("alternator-server");
+
+using namespace httpd;
+
+namespace alternator {
+
+static constexpr auto TARGET = "X-Amz-Target";
+
+// Splits "text" at every occurrence of "separator". The returned tokens are
+// views into the caller's buffer, so they are only valid while it is alive.
+// Empty input yields no tokens at all; otherwise empty fields (e.g. between
+// two adjacent separators, or after a trailing one) are kept.
+inline std::vector<std::string_view> split(std::string_view text, char separator) {
+    std::vector<std::string_view> pieces;
+    if (text.empty()) {
+        return pieces;
+    }
+    auto cut = text.find(separator);
+    while (cut != std::string_view::npos) {
+        pieces.push_back(text.substr(0, cut));
+        text.remove_prefix(cut + 1);
+        cut = text.find(separator);
+    }
+    pieces.push_back(text);
+    return pieces;
+}
+
+// DynamoDB HTTP error responses are structured as follows
+// https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html
+// Our handlers throw an exception to report an error. If the exception
+// is of type alternator::api_error, it is unwrapped and properly reported
+// to the user directly. Other exceptions are unexpected, and reported as
+// Internal Server Error.
+class api_handler : public handler_base {
+public:
+    api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
+         [this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+         return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
+             if (resf.failed()) {
+                 // api_error is reported to the client as is, rjson::error as a
+                 // ValidationException, and anything else as an internal server error:
+                 api_error ret;
+                 try {
+                     resf.get();
+                 } catch (api_error &ae) {
+                     ret = ae;
+                 } catch (rjson::error & re) {
+                     ret = api_error("ValidationException", re.what());
+                 } catch (...) {
+                     ret = api_error("Internal Server Error",
+                             format("Internal server error: {}", std::current_exception()), reply::status_type::internal_server_error);
+                 }
+                 generate_error_reply(*rep, ret);
+                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+             }
+             auto res = resf.get0();
+             std::visit(overloaded_functor {
+                 [&] (const json::json_return_type& json_return_value) {
+                     slogger.trace("api_handler success case");
+                     if (json_return_value._body_writer) {
+                         rep->write_body("json", std::move(json_return_value._body_writer));
+                     } else {
+                         rep->_content += json_return_value._res;
+                     }
+                 },
+                 [&] (const api_error& err) {
+                     generate_error_reply(*rep, err);
+                 }
+             }, res);
+             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+         });
+    }), _type("json") { }
+
+    api_handler(const api_handler&) = default;
+    future<std::unique_ptr<reply>> handle(const sstring& path,
+            std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        return _f_handle(std::move(req), std::move(rep)).then(
+                [this](std::unique_ptr<reply> rep) {
+                    rep->done(_type);
+                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+                });
+    }
+
+protected:
+    void generate_error_reply(reply& rep, const api_error& err) {
+        // Serialize with rjson rather than by string concatenation, so quotes,
+        // backslashes and control characters in the message get escaped.
+        const std::string type_key = "__type";
+        const std::string message_key = "message";
+        rjson::value body(rapidjson::kObjectType);
+        rjson::set_with_string_name(body, type_key, rjson::from_string("com.amazonaws.dynamodb.v20120810#" + err._type));
+        rjson::set_with_string_name(body, message_key, rjson::from_string(err._msg));
+        rep._content += rjson::print(body);
+        rep._status = err._http_code;
+        slogger.trace("api_handler error case: {}", rep._content);
+    }
+    future_handler_function _f_handle;
+    sstring _type;
+};
+
+class gated_handler : public handler_base { // base for handlers whose requests run inside a seastar::gate
+    seastar::gate& _gate;
+public:
+    gated_handler(seastar::gate& gate) : _gate(gate) {}
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0; // the actual request processing, supplied by subclasses
+    virtual future<std::unique_ptr<reply>> handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) final override { // wraps do_handle() in with_gate(_gate, ...)
+        return with_gate(_gate, [this, &path, req = std::move(req), rep = std::move(rep)] () mutable { // NOTE(review): path is captured by reference - presumably fine because with_gate() invokes the callable right away; confirm
+            return do_handle(path, std::move(req), std::move(rep));
+        });
+    }
+};
+
+class health_handler : public gated_handler { // registered for GET "/" by server::set_routes()
+public:
+    health_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        rep->set_status(reply::status_type::ok); // always answers OK; no authentication or other checks are done here
+        rep->write_body("txt", format("healthy: {}", req->get_header("Host"))); // body echoes the request's Host header
+        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+    }
+};
+
+// Serves GET /localnodes: replies with a JSON array of the addresses of the live nodes in this node's data center.
+class local_nodelist_handler : public gated_handler {
+public:
+    local_nodelist_handler(seastar::gate& pending_requests) : gated_handler(pending_requests) {}
+protected:
+    virtual future<std::unique_ptr<reply>> do_handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+        // gms::get_local_gossiper().get_live_members() would easily give all live nodes of the
+        // cluster, but listing just the live nodes of this DC needs the more elaborate code below:
+        const auto self_address = utils::fb_utilities::get_broadcast_address();
+        sstring local_dc = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(self_address);
+        std::unordered_set<gms::inet_address> dc_members =
+                service::get_local_storage_service().get_token_metadata().get_topology().get_datacenter_endpoints().at(local_dc);
+        rjson::value live_nodes = rjson::empty_array();
+        for (auto& node : dc_members) {
+            if (!gms::get_local_gossiper().is_alive(node)) {
+                continue;
+            }
+            rjson::push_back(live_nodes, rjson::from_string(node.to_sstring()));
+        }
+        rep->set_status(reply::status_type::ok);
+        rep->set_content_type("json");
+        rep->_content = rjson::print(live_nodes);
+        return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+    }
+};
+
+// Verifies the AWS4-HMAC-SHA256 "Authorization" header of a request (a no-op unless _enforce_authorization is set).
+// Throws api_error on a missing or malformed header; the returned future fails if the signature does not match.
+future<> server::verify_signature(const request& req) {
+    if (!_enforce_authorization) {
+        slogger.debug("Skipping authorization");
+        return make_ready_future<>();
+    }
+    auto host_it = req._headers.find("Host");
+    if (host_it == req._headers.end()) {
+        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
+    }
+    auto authorization_it = req._headers.find("Authorization");
+    if (authorization_it == req._headers.end()) {
+        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
+    }
+    std::string host = host_it->second;
+    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
+    std::string credential;
+    std::string user_signature;
+    std::string signed_headers_str;
+    std::vector<std::string_view> signed_headers;
+    for (std::string_view entry : credentials_raw) {
+        std::vector<std::string_view> entry_split = split(entry, '=');
+        if (entry_split.size() != 2) {
+            if (entry != "AWS4-HMAC-SHA256") {
+                throw api_error("InvalidSignatureException", format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
+            }
+            continue;
+        }
+        std::string_view auth_value = entry_split[1];
+        // Commas appear as an additional (quite redundant) delimiter. The value may be empty (e.g. "Credential="), so check before back().
+        if (!auth_value.empty() && auth_value.back() == ',') {
+            auth_value.remove_suffix(1);
+        }
+        if (entry_split[0] == "Credential") {
+            credential = std::string(auth_value);
+        } else if (entry_split[0] == "Signature") {
+            user_signature = std::string(auth_value);
+        } else if (entry_split[0] == "SignedHeaders") {
+            signed_headers_str = std::string(auth_value);
+            signed_headers = split(auth_value, ';');
+            std::sort(signed_headers.begin(), signed_headers.end());
+        }
+    }
+    std::vector<std::string_view> credential_split = split(credential, '/');
+    if (credential_split.size() != 5) {
+        throw api_error("ValidationException", format("Incorrect credential information format: {}", credential));
+    }
+    std::string user(credential_split[0]);
+    std::string datestamp(credential_split[1]);
+    std::string region(credential_split[2]);
+    std::string service(credential_split[3]);
+    // Signed header name -> value. Both are views into req's headers, so they stay valid only as long as req does.
+    std::map<std::string_view, std::string_view> signed_headers_map;
+    for (const auto& header : signed_headers) {
+        signed_headers_map.emplace(header, std::string_view());
+    }
+    for (auto& header : req._headers) {
+        std::string header_str(header.first.size(), '\0');
+        // tolower() is undefined for negative char values, hence the detour through unsigned char.
+        std::transform(header.first.begin(), header.first.end(), header_str.begin(), [] (unsigned char c) { return static_cast<char>(::tolower(c)); });
+        auto it = signed_headers_map.find(header_str);
+        if (it != signed_headers_map.end()) {
+            it->second = std::string_view(header.second);
+        }
+    }
+    auto cache_getter = [] (std::string username) {
+        return get_key_from_roles(cql3::get_query_processor().local(), std::move(username));
+    };
+    return _key_cache.get_ptr(user, cache_getter).then([this, &req,
+                                                    user = std::move(user),
+                                                    host = std::move(host),
+                                                    datestamp = std::move(datestamp),
+                                                    signed_headers_str = std::move(signed_headers_str),
+                                                    signed_headers_map = std::move(signed_headers_map),
+                                                    region = std::move(region),
+                                                    service = std::move(service),
+                                                    user_signature = std::move(user_signature)] (key_cache::value_ptr key_ptr) {
+        std::string signature = get_signature(user, *key_ptr, std::string_view(host), req._method,
+                datestamp, signed_headers_str, signed_headers_map, req.content, region, service, "");
+        if (signature != std::string_view(user_signature)) {
+            _key_cache.remove(user);
+            throw api_error("UnrecognizedClientException", "The security token included in the request is invalid.");
+        }
+    });
+}
+
+future<executor::request_return_type> server::handle_api_request(std::unique_ptr<request>&& req) { // invoked by the POST "/" handler installed in set_routes()
+    _executor._stats.total_operations++;
+    sstring target = req->get_header(TARGET);
+    std::vector<std::string_view> split_target = split(target, '.');
+    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
+    std::string op = split_target.empty() ? std::string() : std::string(split_target.back());
+    slogger.trace("Request: {} {}", op, req->content);
+    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable { // NOTE(review): needs verify_signature(*req) to be evaluated before req is moved into this capture - confirm
+        auto callback_it = _callbacks.find(op);
+        if (callback_it == _callbacks.end()) {
+            _executor._stats.unsupported_operations++;
+            throw api_error("UnknownOperationException",
+                    format("Unsupported operation {}", op));
+        }
+        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
+            //FIXME: Client state can provide more context, e.g. client's endpoint address
+            // We use unique_ptr (kept alive by do_with) because client_state cannot be moved or copied
+            return do_with(std::make_unique<executor::client_state>(executor::client_state::internal_tag()),
+                    [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] (std::unique_ptr<executor::client_state>& client_state) mutable {
+                tracing::trace_state_ptr trace_state = executor::maybe_trace_query(*client_state, op, req->content);
+                tracing::trace(trace_state, op);
+                // JSON parsing can allocate up to roughly 2x the size of the raw document, + a couple of bytes for maintenance.
+                // FIXME: by this time, the whole HTTP request was already read, so some memory is already occupied.
+                // Once HTTP allows working on streams, we should grab the permit *before* reading the HTTP payload.
+                size_t mem_estimate = req->content.size() * 3 + 8000;
+                auto units_fut = get_units(*_memory_limiter, mem_estimate);
+                if (_memory_limiter->waiters()) { // count requests issued while someone is waiting on the memory limiter
+                    ++_executor._stats.requests_blocked_memory;
+                }
+                return units_fut.then([this, callback_it = std::move(callback_it), &client_state, trace_state, req = std::move(req)] (semaphore_units<> units) mutable {
+                    return _json_parser.parse(req->content).then([this, callback_it = std::move(callback_it), &client_state, trace_state,
+                            units = std::move(units), req = std::move(req)] (rjson::value json_request) mutable {
+                        return callback_it->second(_executor, *client_state, trace_state, make_service_permit(std::move(units)), std::move(json_request), std::move(req)).finally([trace_state] {}); // the empty finally() presumably just keeps trace_state alive until the operation completes - confirm
+                    });
+                });
+            });
+        });
+    });
+}
+
+// Installs the HTTP routes: POST / for the DynamoDB API, GET / for health checks, GET /localnodes for node discovery.
+void server::set_routes(routes& r) {
+    auto api_entry_point = [this] (std::unique_ptr<request> req) mutable {
+        return handle_api_request(std::move(req));
+    };
+    r.put(operation_type::POST, "/", new api_handler(api_entry_point));
+    r.put(operation_type::GET, "/", new health_handler(_pending_requests));
+    // "/localnodes" is an Alternator extension: DynamoDB has no such request
+    // and DynamoDB compatibility does not require it. With a trivial HTTP
+    // request - no authentication needed - a client can ask for the list of
+    // all live nodes in the same data center of the Alternator cluster, and
+    // then use that list to balance its request load over all the nodes in
+    // the same geographical region.
+    // Note that this openly exposes, without authentication, information
+    // on the cluster's members inside one data center. We do not consider
+    // this to be a security risk, because an attacker can already scan an
+    // entire subnet for nodes responding to the health request, or even
+    // just scan for open ports.
+    r.put(operation_type::GET, "/localnodes", new local_nodelist_handler(_pending_requests));
+}
+
+//FIXME: A way to immediately invalidate the cache should be considered,
+// e.g. when the system table which stores the keys is changed.
+// For now, this propagation may take up to 1 minute.
+server::server(executor& exec) // builds the two named http_server objects and the operation table; listening is started later by init()
+        : _http_server("http-alternator")
+        , _https_server("https-alternator")
+        , _executor(exec)
+        , _key_cache(1024, 1min, slogger) // the 1min argument corresponds to the "up to 1 minute" propagation delay in the FIXME above
+        , _enforce_authorization(false)
+        , _enabled_servers{}
+        , _pending_requests{}
+      , _callbacks{ // operation-name string -> lambda forwarding to the matching executor method
+        {"CreateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.create_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"UpdateItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"GetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DeleteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.delete_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"ListTables", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tables(client_state, std::move(permit), std::move(json_request)); // trace_state is not forwarded to this call
+        }},
+        {"Scan", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.scan(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"DescribeEndpoints", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_endpoints(client_state, std::move(permit), std::move(json_request), req->get_header("Host")); // the only entry that reads the HTTP request itself
+        }},
+        {"BatchWriteItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_write_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"BatchGetItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.batch_get_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"Query", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.query(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
+        {"TagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.tag_resource(client_state, std::move(permit), std::move(json_request)); // trace_state is not forwarded to this call
+        }},
+        {"UntagResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.untag_resource(client_state, std::move(permit), std::move(json_request)); // trace_state is not forwarded to this call
+        }},
+        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request)); // trace_state is not forwarded to this call
+        }},
+    } {
+}
+
+future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+        bool enforce_authorization, semaphore* memory_limiter) { // starts the executor and whichever of the HTTP / HTTPS listeners has a port
+    _memory_limiter = memory_limiter;
+    _enforce_authorization = enforce_authorization;
+    if (!port && !https_port) { // at least one of the two listeners must be requested
+        return make_exception_future<>(std::runtime_error("Either regular port or TLS port"
+                " must be specified in order to init an alternator HTTP server instance"));
+    }
+    return seastar::async([this, addr, port, https_port, creds] { // the blocking .get() calls below run inside this seastar thread
+        try {
+            _executor.start().get();
+
+            if (port) {
+                set_routes(_http_server._routes);
+                _http_server.set_content_length_limit(server::content_length_limit);
+                _http_server.listen(socket_address{addr, *port}).get();
+                _enabled_servers.push_back(std::ref(_http_server)); // recorded only once listen() succeeded, so stop() stops just the started servers
+            }
+            if (https_port) {
+                set_routes(_https_server._routes);
+                _https_server.set_content_length_limit(server::content_length_limit);
+                _https_server.set_tls_credentials(creds->build_reloadable_server_credentials([](const std::unordered_set<sstring>& files, std::exception_ptr ep) { // NOTE(review): creds is dereferenced unchecked — assumes callers always pass credentials together with https_port
+                    if (ep) {
+                        slogger.warn("Exception loading {}: {}", files, ep);
+                    } else {
+                        slogger.info("Reloaded {}", files);
+                    }
+                }).get0());
+                _https_server.listen(socket_address{addr, *https_port}).get();
+                _enabled_servers.push_back(std::ref(_https_server));
+            }
+        } catch (...) { // log the failure, then rethrow it nested inside a runtime_error carrying the same summary
+            slogger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",
+                    addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF", std::current_exception());
+            std::throw_with_nested(std::runtime_error(
+                    format("Failed to set up Alternator HTTP server on {} port {}, TLS port {}",
+                            addr, port ? std::to_string(*port) : "OFF", https_port ? std::to_string(*https_port) : "OFF")));
+        }
+    });
+}
+
+future<> server::stop() {
+    auto stop_listener = [] (http_server& srv) { return srv.stop(); };
+    // Order: stop every enabled listener, then close the pending-requests gate, then stop the JSON parser.
+    return parallel_for_each(_enabled_servers, stop_listener).then([this] {
+        return _pending_requests.close();
+    }).then([this] {
+        return _json_parser.stop();
+    });
+}
+
+server::json_parser::json_parser() : _run_parse_json_thread(async([this] { // worker loop; its future is kept so stop() can hand it back
+        while (true) {
+            _document_waiting.wait().get(); // sleep until parse() or stop() signals
+            if (_as.abort_requested()) { // set by stop(): leave the loop and end the thread
+                return;
+            }
+            try {
+                _parsed_document = rjson::parse_yieldable(_raw_document);
+                _current_exception = nullptr; // clear any error left over from a previous document
+            } catch (...) {
+                _current_exception = std::current_exception(); // handed back to the caller by parse()
+            }
+            _document_parsed.signal(); // wake the parse() call waiting for this result
+        }
+    })) {
+}
+
+future<rjson::value> server::json_parser::parse(std::string_view content) {
+    // Documents at or above the threshold are handed to the parsing thread, one at a time.
+    if (content.size() >= yieldable_parsing_threshold) {
+        return with_semaphore(_parsing_sem, 1, [this, content] {
+            _raw_document = content;
+            _document_waiting.signal();
+            return _document_parsed.wait().then([this] {
+                return _current_exception
+                        ? make_exception_future<rjson::value>(_current_exception)
+                        : make_ready_future<rjson::value>(std::move(_parsed_document));
+            });
+        });
+    }
+    return make_ready_future<rjson::value>(rjson::parse(content));
+}
+
+future<> server::json_parser::stop() { // shuts down the parsing thread started by the constructor
+    _as.request_abort(); // flag the thread checks after every wake-up
+    _document_waiting.signal(); // wake the thread so it observes the abort flag
+    _document_parsed.broken(); // NOTE(review): presumably fails any parse() still waiting for a result — confirm
+    return std::move(_run_parse_json_thread); // the future of the thread function itself
+}
+
+}
+
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "alternator/executor.hh"
+#include <seastar/core/future.hh>
+#include <seastar/http/httpd.hh>
+#include <seastar/net/tls.hh>
+#include <optional>
+#include <alternator/auth.hh>
+#include <utils/small_vector.hh>
+#include <seastar/core/units.hh>
+
+namespace alternator {
+
+class server { // Alternator HTTP front end: owns a plain and a TLS http_server and forwards requests to an executor
+    static constexpr size_t content_length_limit = 16*MB;
+    using alternator_callback = std::function<future<executor::request_return_type>(executor&, executor::client_state&,
+            tracing::trace_state_ptr, service_permit, rjson::value, std::unique_ptr<request>)>;
+    using alternator_callbacks_map = std::unordered_map<std::string_view, alternator_callback>; // keyed by operation name
+
+    http_server _http_server;
+    http_server _https_server;
+    executor& _executor;
+
+    key_cache _key_cache;
+    bool _enforce_authorization;
+    utils::small_vector<std::reference_wrapper<seastar::httpd::http_server>, 2> _enabled_servers; // servers init() started; stop() stops exactly these
+    gate _pending_requests; // closed by stop() once the listeners are stopped
+    alternator_callbacks_map _callbacks;
+
+    semaphore* _memory_limiter = nullptr; // assigned by init(); the constructor does not set it, so default to null rather than garbage
+
+    class json_parser { // parses large request bodies on a dedicated seastar thread, one document at a time
+        static constexpr size_t yieldable_parsing_threshold = 16*KB; // bodies smaller than this are parsed inline by parse()
+        std::string_view _raw_document; // input handed to the thread; a view, so the caller's buffer must stay alive
+        rjson::value _parsed_document;
+        std::exception_ptr _current_exception; // error from the last parse, or null
+        semaphore _parsing_sem{1}; // serializes users of the single-slot fields above
+        condition_variable _document_waiting; // signalled when a document is ready to be parsed
+        condition_variable _document_parsed; // signalled when the thread has finished a document
+        abort_source _as;
+        future<> _run_parse_json_thread; // declared last: the thread lambda uses the members above
+    public:
+        json_parser();
+        future<rjson::value> parse(std::string_view content);
+        future<> stop();
+    };
+    json_parser _json_parser;
+
+public:
+    server(executor& executor);
+
+    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
+            bool enforce_authorization, semaphore* memory_limiter);
+    future<> stop();
+private:
+    void set_routes(seastar::httpd::routes& r);
+    future<> verify_signature(const seastar::httpd::request& r);
+    future<executor::request_return_type> handle_api_request(std::unique_ptr<request>&& req);
+};
+
+}
+
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stats.hh"
+
+#include <seastar/core/metrics.hh>
+
+namespace alternator {
+
+const char* ALTERNATOR_METRICS = "alternator"; // NOTE(review): stats::stats() below passes the literal "alternator" instead of this name — confirm whether other files reference it
+
+stats::stats() : api_operations{} {
+    // Register one "operation" counter per api_operations counter (labelled by op), plus the latency histograms.
+    seastar::metrics::label op("op");
+    _metrics.add_group("alternator", {
+#define OPERATION(name, CamelCaseName) \
+                seastar::metrics::make_total_operations("operation", api_operations.name, \
+                        seastar::metrics::description("number of operations via Alternator API"), {op(CamelCaseName)}),
+#define OPERATION_LATENCY(name, CamelCaseName) \
+                seastar::metrics::make_histogram("op_latency", \
+                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return api_operations.name.get_histogram(1,20);}),
+            OPERATION(batch_get_item, "BatchGetItem")
+            OPERATION(batch_write_item, "BatchWriteItem")
+            OPERATION(create_backup, "CreateBackup")
+            OPERATION(create_global_table, "CreateGlobalTable")
+            OPERATION(create_table, "CreateTable")
+            OPERATION(delete_backup, "DeleteBackup")
+            OPERATION(delete_item, "DeleteItem")
+            OPERATION(delete_table, "DeleteTable")
+            OPERATION(describe_backup, "DescribeBackup")
+            OPERATION(describe_continuous_backups, "DescribeContinuousBackups")
+            OPERATION(describe_endpoints, "DescribeEndpoints")
+            OPERATION(describe_global_table, "DescribeGlobalTable")
+            OPERATION(describe_global_table_settings, "DescribeGlobalTableSettings")
+            OPERATION(describe_limits, "DescribeLimits")
+            OPERATION(describe_table, "DescribeTable")
+            OPERATION(describe_time_to_live, "DescribeTimeToLive")
+            OPERATION(get_item, "GetItem")
+            OPERATION(list_backups, "ListBackups")
+            OPERATION(list_global_tables, "ListGlobalTables")
+            OPERATION(list_tables, "ListTables")
+            OPERATION(list_tags_of_resource, "ListTagsOfResource")
+            OPERATION(put_item, "PutItem")
+            OPERATION(query, "Query")
+            OPERATION(restore_table_from_backup, "RestoreTableFromBackup")
+            OPERATION(restore_table_to_point_in_time, "RestoreTableToPointInTime")
+            OPERATION(scan, "Scan")
+            OPERATION(tag_resource, "TagResource")
+            OPERATION(transact_get_items, "TransactGetItems")
+            OPERATION(transact_write_items, "TransactWriteItems")
+            OPERATION(untag_resource, "UntagResource")
+            OPERATION(update_continuous_backups, "UpdateContinuousBackups")
+            OPERATION(update_global_table, "UpdateGlobalTable")
+            OPERATION(update_global_table_settings, "UpdateGlobalTableSettings")
+            OPERATION(update_item, "UpdateItem")
+            OPERATION(update_table, "UpdateTable")
+            OPERATION(update_time_to_live, "UpdateTimeToLive")
+            OPERATION_LATENCY(put_item_latency, "PutItem")
+            OPERATION_LATENCY(get_item_latency, "GetItem")
+            OPERATION_LATENCY(delete_item_latency, "DeleteItem")
+            OPERATION_LATENCY(update_item_latency, "UpdateItem")
+    });
+    _metrics.add_group("alternator", {
+            seastar::metrics::make_total_operations("unsupported_operations", unsupported_operations,
+                    seastar::metrics::description("number of unsupported operations via Alternator API")),
+            seastar::metrics::make_total_operations("total_operations", total_operations,
+                    seastar::metrics::description("number of total operations via Alternator API")),
+            seastar::metrics::make_total_operations("reads_before_write", reads_before_write,
+                    seastar::metrics::description("number of performed read-before-write operations")),
+            seastar::metrics::make_total_operations("write_using_lwt", write_using_lwt,
+                    seastar::metrics::description("number of writes that used LWT")),
+            seastar::metrics::make_total_operations("shard_bounce_for_lwt", shard_bounce_for_lwt,
+                    seastar::metrics::description("number writes that had to be bounced from this shard because of LWT requirements")),
+            seastar::metrics::make_total_operations("requests_blocked_memory", requests_blocked_memory,
+                    seastar::metrics::description("Counts a number of requests blocked due to memory pressure.")),
+            seastar::metrics::make_total_operations("filtered_rows_read_total", cql_stats.filtered_rows_read_total,
+                    seastar::metrics::description("number of rows read during filtering operations")),
+            seastar::metrics::make_total_operations("filtered_rows_matched_total", cql_stats.filtered_rows_matched_total,
+                    seastar::metrics::description("number of rows read and matched during filtering operations")),
+            seastar::metrics::make_total_operations("filtered_rows_dropped_total", [this] { return cql_stats.filtered_rows_read_total - cql_stats.filtered_rows_matched_total; }, // derived: read minus matched
+                    seastar::metrics::description("number of rows read and dropped during filtering operations")),
+    });
+}
+
+
+}
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <seastar/core/metrics_registration.hh>
+#include "seastarx.hh"
+#include "utils/estimated_histogram.hh"
+#include "cql3/stats.hh"
+
+namespace alternator {
+
+// Object holding per-shard statistics related to Alternator.
+// While this object is alive, these metrics are also registered to be
+// visible by the metrics REST API, with the "alternator" prefix.
+class stats {
+public:
+    stats(); // registers the counters below under the "alternator" metrics group
+    // Count of DynamoDB API operations by types
+    struct {
+        uint64_t batch_get_item = 0;
+        uint64_t batch_write_item = 0;
+        uint64_t create_backup = 0;
+        uint64_t create_global_table = 0;
+        uint64_t create_table = 0;
+        uint64_t delete_backup = 0;
+        uint64_t delete_item = 0;
+        uint64_t delete_table = 0;
+        uint64_t describe_backup = 0;
+        uint64_t describe_continuous_backups = 0;
+        uint64_t describe_endpoints = 0;
+        uint64_t describe_global_table = 0;
+        uint64_t describe_global_table_settings = 0;
+        uint64_t describe_limits = 0;
+        uint64_t describe_table = 0;
+        uint64_t describe_time_to_live = 0;
+        uint64_t get_item = 0;
+        uint64_t list_backups = 0;
+        uint64_t list_global_tables = 0;
+        uint64_t list_tables = 0;
+        uint64_t list_tags_of_resource = 0;
+        uint64_t put_item = 0;
+        uint64_t query = 0;
+        uint64_t restore_table_from_backup = 0;
+        uint64_t restore_table_to_point_in_time = 0;
+        uint64_t scan = 0;
+        uint64_t tag_resource = 0;
+        uint64_t transact_get_items = 0;
+        uint64_t transact_write_items = 0;
+        uint64_t untag_resource = 0;
+        uint64_t update_continuous_backups = 0;
+        uint64_t update_global_table = 0;
+        uint64_t update_global_table_settings = 0;
+        uint64_t update_item = 0;
+        uint64_t update_table = 0;
+        uint64_t update_time_to_live = 0;
+
+        utils::estimated_histogram put_item_latency; // the four histograms are exported as "op_latency" via get_histogram(1,20)
+        utils::estimated_histogram get_item_latency;
+        utils::estimated_histogram delete_item_latency;
+        utils::estimated_histogram update_item_latency;
+    } api_operations;
+    // Miscellaneous event counters
+    uint64_t total_operations = 0;
+    uint64_t unsupported_operations = 0;
+    uint64_t reads_before_write = 0;
+    uint64_t write_using_lwt = 0;
+    uint64_t shard_bounce_for_lwt = 0;
+    uint64_t requests_blocked_memory = 0;
+    // CQL-derived stats
+    cql3::cql_stats cql_stats; // its filtered_rows_* fields back the filtered_rows_* metrics
+private:
+    // The metric_groups object holds this stat object's metrics registered
+    // as long as the stats object is alive.
+    seastar::metrics::metric_groups _metrics;
+};
+
+}
--- a/alternator/tags_extension.hh
+++ b/alternator/tags_extension.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "schema.hh"
+#include "db/extensions.hh"
+
+namespace alternator {
+
+class tags_extension : public schema_extension { // schema extension holding a table's tags as a string -> string map
+public:
+    static constexpr auto NAME = "scylla_tags";
+
+    tags_extension() = default;
+    explicit tags_extension(const std::map<sstring, sstring>& tags) : _tags(tags) {} // copies: a const reference cannot be moved from
+    explicit tags_extension(bytes b) : _tags(tags_extension::deserialize(b)) {} // from the byte form produced by serialize()
+    explicit tags_extension(const sstring& s) { // construction from a string is unsupported and always throws
+        throw std::logic_error("Cannot create tags from string");
+    }
+    bytes serialize() const override {
+        return ser::serialize_to_buffer<bytes>(_tags);
+    }
+    static std::map<sstring, sstring> deserialize(bytes_view buffer) { // inverse of serialize()
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const std::map<sstring, sstring>& tags() const {
+        return _tags;
+    }
+private:
+    std::map<sstring, sstring> _tags;
+};
+
+}
--- a/api/api-doc/cache_service.json
+++ b/api/api-doc/cache_service.json
@@ -13,7 +13,7 @@
            {
               "method":"GET",
               "summary":"get row cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -35,7 +35,7 @@
                     "description":"row cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"get key cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -70,7 +70,7 @@
                     "description":"key cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -83,7 +83,7 @@
            {
               "method":"GET",
               "summary":"get counter cache save period in seconds",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_save_period_in_seconds",
               "produces":[
                  "application/json"
@@ -105,7 +105,7 @@
                     "description":"counter cache save period in seconds",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -118,7 +118,7 @@
            {
               "method":"GET",
               "summary":"get row cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -140,7 +140,7 @@
                     "description":"row cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -153,7 +153,7 @@
            {
               "method":"GET",
               "summary":"get key cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_key_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -175,7 +175,7 @@
                     "description":"key cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -188,7 +188,7 @@
            {
               "method":"GET",
               "summary":"get counter cache keys to save",
-               "type":"int",
+               "type": "long",
               "nickname":"get_counter_cache_keys_to_save",
               "produces":[
                  "application/json"
@@ -210,7 +210,7 @@
                     "description":"counter cache keys to save",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -448,7 +448,7 @@
        {
          "method": "GET",
          "summary": "Get key entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_key_entries",
          "produces": [
            "application/json"
@@ -568,7 +568,7 @@
        {
          "method": "GET",
          "summary": "Get row entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_row_entries",
          "produces": [
            "application/json"
@@ -688,7 +688,7 @@
        {
          "method": "GET",
          "summary": "Get counter entries",
-          "type": "int",
+          "type": "long",
          "nickname": "get_counter_entries",
          "produces": [
            "application/json"
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -70,7 +70,7 @@
            {
               "method":"POST",
               "summary":"Force a major compaction of this column family",
-               "type":"string",
+               "type":"void",
               "nickname":"force_major_compaction",
               "produces":[
                  "application/json"
@@ -121,7 +121,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -172,7 +172,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -223,7 +223,7 @@
                     "description":"The maximum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -231,7 +231,7 @@
                     "description":"The minimum number of sstables in queue before compaction kicks off",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -380,16 +380,54 @@
         "operations":[
            {
               "method":"GET",
-               "summary":"check if the auto compaction disabled",
+               "summary":"check if the auto_compaction property is enabled for a given table",
               "type":"boolean",
-               "nickname":"is_auto_compaction_disabled",
+               "nickname":"get_auto_compaction",
               "produces":[
                  "application/json"
               ],
               "parameters":[
                  {
                     "name":"name",
-                     "description":"The column family name in keyspace:name format",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"POST",
+               "summary":"Enable table auto compaction",
+               "type":"void",
+               "nickname":"enable_auto_compaction",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Disable table auto compaction",
+               "type":"void",
+               "nickname":"disable_auto_compaction",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"name",
+                     "description":"The table name in keyspace:name format",
                     "required":true,
                     "allowMultiple":false,
                     "type":"string",
@@ -544,7 +582,7 @@
               "summary":"sstable count for each level. empty unless leveled compaction is used",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_sstable_count_per_level",
               "produces":[
@@ -636,7 +674,7 @@
                     "description":"Duration (in milliseconds) of monitoring operation",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  },
                  {
@@ -644,7 +682,7 @@
                    "description":"number of the top partitions to list",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 },
                 {
@@ -652,7 +690,7 @@
                    "description":"capacity of stream summary: determines amount of resources used in query processing",
                    "required":false,
                    "allowMultiple":false,
-                    "type":"int",
+                    "type": "long",
                    "paramType":"query"
                 }
              ]
@@ -921,7 +959,7 @@
            {
               "method":"GET",
               "summary":"Get memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -945,7 +983,7 @@
            {
               "method":"GET",
               "summary":"Get all memtable switch count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_memtable_switch_count",
               "produces":[
                  "application/json"
@@ -1082,7 +1120,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1235,7 +1273,7 @@
            {
               "method":"GET",
               "summary":"Get all read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_read_latency",
               "produces":[
                  "application/json"
@@ -1251,7 +1289,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
@@ -1275,7 +1313,7 @@
            {
               "method":"GET",
               "summary":"Get all range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_range_latency",
               "produces":[
                  "application/json"
@@ -1291,7 +1329,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1444,7 +1482,7 @@
            {
               "method":"GET",
               "summary":"Get all write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_write_latency",
               "produces":[
                  "application/json"
@@ -1460,7 +1498,7 @@
            {
               "method":"GET",
               "summary":"Get pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_flushes",
               "produces":[
                  "application/json"
@@ -1484,7 +1522,7 @@
            {
               "method":"GET",
               "summary":"Get all pending flushes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_flushes",
               "produces":[
                  "application/json"
@@ -1500,7 +1538,7 @@
            {
               "method":"GET",
               "summary":"Get pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_pending_compactions",
               "produces":[
                  "application/json"
@@ -1524,7 +1562,7 @@
            {
               "method":"GET",
               "summary":"Get all pending compactions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_pending_compactions",
               "produces":[
                  "application/json"
@@ -1540,7 +1578,7 @@
            {
               "method":"GET",
               "summary":"Get live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1564,7 +1602,7 @@
            {
               "method":"GET",
               "summary":"Get all live ss table count",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_ss_table_count",
               "produces":[
                  "application/json"
@@ -1580,7 +1618,7 @@
            {
               "method":"GET",
               "summary":"Get live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1604,7 +1642,7 @@
            {
               "method":"GET",
               "summary":"Get all live disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_live_disk_space_used",
               "produces":[
                  "application/json"
@@ -1620,7 +1658,7 @@
            {
               "method":"GET",
               "summary":"Get total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -1644,7 +1682,7 @@
            {
               "method":"GET",
               "summary":"Get all total disk space used",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_disk_space_used",
               "produces":[
                  "application/json"
@@ -2100,7 +2138,7 @@
            {
               "method":"GET",
               "summary":"Get speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_speculative_retries",
               "produces":[
                  "application/json"
@@ -2124,7 +2162,7 @@
            {
               "method":"GET",
               "summary":"Get all speculative retries",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_speculative_retries",
               "produces":[
                  "application/json"
@@ -2204,7 +2242,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2228,7 +2266,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit out of range",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit_out_of_range",
               "produces":[
                  "application/json"
@@ -2244,7 +2282,7 @@
            {
               "method":"GET",
               "summary":"Get row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2268,7 +2306,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache hit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_hit",
               "produces":[
                  "application/json"
@@ -2284,7 +2322,7 @@
            {
               "method":"GET",
               "summary":"Get row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2308,7 +2346,7 @@
            {
               "method":"GET",
               "summary":"Get all row cache miss",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_row_cache_miss",
               "produces":[
                  "application/json"
@@ -2324,7 +2362,7 @@
            {
               "method":"GET",
               "summary":"Get cas prepare",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_prepare",
               "produces":[
                  "application/json"
@@ -2348,7 +2386,7 @@
            {
               "method":"GET",
               "summary":"Get cas propose",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_propose",
               "produces":[
                  "application/json"
@@ -2372,7 +2410,7 @@
            {
               "method":"GET",
               "summary":"Get cas commit",
-               "type":"int",
+               "type": "long",
               "nickname":"get_cas_commit",
               "produces":[
                  "application/json"
--- a/api/api-doc/compaction_manager.json
+++ b/api/api-doc/compaction_manager.json
@@ -118,7 +118,7 @@
        {
          "method": "GET",
          "summary": "Get pending tasks",
-          "type": "int",
+          "type": "long",
          "nickname": "get_pending_tasks",
          "produces": [
            "application/json"
@@ -181,7 +181,7 @@
        {
          "method": "GET",
          "summary": "Get bytes compacted",
-          "type": "int",
+          "type": "long",
          "nickname": "get_bytes_compacted",
          "produces": [
            "application/json"
@@ -197,7 +197,7 @@
         "description":"A row merged information",
         "properties":{
            "key":{
-               "type":"int",
+               "type": "long",
               "description":"The number of sstable"
            },
            "value":{
--- a/api/api-doc/error_injection.json
+++ b/api/api-doc/error_injection.json
@@ -0,0 +1,90 @@
+{
+   "apiVersion":"0.0.1",
+   "swaggerVersion":"1.2",
+   "basePath":"{{Protocol}}://{{Host}}",
+   "resourcePath":"/error_injection",
+   "produces":[
+      "application/json"
+   ],
+   "apis":[
+      {
+         "path":"/v2/error_injection/injection/{injection}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Activate an injection that triggers an error in code",
+               "type":"void",
+               "nickname":"enable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name, should correspond to an injection added in code",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  },
+                  {
+                     "name":"one_shot",
+                     "description":"boolean flag indicating whether the injection should be enabled to trigger only once",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"boolean",
+                     "paramType":"query"
+                  }
+               ]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate an injection previously activated by the API",
+               "type":"void",
+               "nickname":"disable_injection",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"injection",
+                     "description":"injection name",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
+      },
+      {
+         "path":"/v2/error_injection/injection",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"List all enabled injections on all shards, i.e. injections that will trigger an error in the code",
+               "type":"array",
+               "items":{
+                  "type":"string"
+               },
+               "nickname":"get_enabled_injections_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            },
+            {
+               "method":"DELETE",
+               "summary":"Deactivate all injections previously activated on all shards by the API",
+               "type":"void",
+               "nickname":"disable_on_all",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      }
+   ]
+}
--- a/api/api-doc/failure_detector.json
+++ b/api/api-doc/failure_detector.json
@@ -110,7 +110,7 @@
            {
               "method":"GET",
               "summary":"Get count down endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_down_endpoint_count",
               "produces":[
                  "application/json"
@@ -126,7 +126,7 @@
            {
               "method":"GET",
               "summary":"Get count up endpoint",
-               "type":"int",
+               "type": "long",
               "nickname":"get_up_endpoint_count",
               "produces":[
                  "application/json"
@@ -180,11 +180,11 @@
                    "description": "The endpoint address"
                },
                "generation": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat generation"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The heart beat version"
                },
                "update_time": {
@@ -209,7 +209,7 @@
           "description": "Holds a version value for an application state",
               "properties": {
                "application_state": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state enum index"
                },
                "value": {
@@ -217,7 +217,7 @@
                    "description": "The version value"
                },
                "version": {
-                    "type": "int",
+                    "type": "long",
                    "description": "The application state version"
                }
            }
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -75,7 +75,7 @@
            {
               "method":"GET",
               "summary":"Returns files which are pending for archival attempt. Does NOT include failed archive attempts",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -99,7 +99,7 @@
            {
               "method":"GET",
               "summary":"Get heart beat version for a node",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_heart_beat_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/hinted_handoff.json
+++ b/api/api-doc/hinted_handoff.json
@@ -99,7 +99,7 @@
        {
          "method": "GET",
          "summary": "Get create hint count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_create_hint_count",
          "produces": [
            "application/json"
@@ -123,7 +123,7 @@
        {
          "method": "GET",
          "summary": "Get not stored hints count",
-          "type": "int",
+          "type": "long",
          "nickname": "get_not_stored_hints_count",
          "produces": [
            "application/json"
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -191,7 +191,7 @@
            {
               "method":"GET",
               "summary":"Get the version number",
-               "type":"int",
+               "type": "long",
               "nickname":"get_version",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -105,7 +105,7 @@
            {
               "method":"GET",
               "summary":"Get the max hint window",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hint_window",
               "produces":[
                  "application/json"
@@ -128,7 +128,7 @@
                     "description":"max hint window in ms",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -141,7 +141,7 @@
            {
               "method":"GET",
               "summary":"Get max hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_max_hints_in_progress",
               "produces":[
                  "application/json"
@@ -164,7 +164,7 @@
                     "description":"max hints in progress",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -177,7 +177,7 @@
            {
               "method":"GET",
               "summary":"get hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_hints_in_progress",
               "produces":[
                  "application/json"
@@ -602,7 +602,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -632,7 +632,7 @@
        {
          "method": "GET",
          "summary": "Get cas write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_write_metrics_condition_not_met",
          "produces": [
            "application/json"
@@ -641,13 +641,28 @@
        }
      ]
    },
+    {
+      "path": "/storage_proxy/metrics/cas_write/failed_read_round_optimization",
+      "operations": [
+        {
+          "method": "GET",
+          "summary": "Get cas write metrics",
+          "type": "long",
+          "nickname": "get_cas_write_metrics_failed_read_round_optimization",
+          "produces": [
+            "application/json"
+          ],
+          "parameters": []
+        }
+      ]
+    },
    {
      "path": "/storage_proxy/metrics/cas_read/unfinished_commit",
      "operations": [
        {
          "method": "GET",
          "summary": "Get cas read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_cas_read_metrics_unfinished_commit",
          "produces": [
            "application/json"
@@ -671,28 +686,13 @@
        }
      ]
    },
-    {
-      "path": "/storage_proxy/metrics/cas_read/condition_not_met",
-      "operations": [
-        {
-          "method": "GET",
-          "summary": "Get cas read metrics",
-          "type": "int",
-          "nickname": "get_cas_read_metrics_condition_not_met",
-          "produces": [
-            "application/json"
-          ],
-          "parameters": []
-        }
-      ]
-    },
    {
      "path": "/storage_proxy/metrics/read/timeouts",
      "operations": [
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_timeouts",
          "produces": [
            "application/json"
@@ -707,7 +707,7 @@
        {
          "method": "GET",
          "summary": "Get read metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_read_metrics_unavailables",
          "produces": [
            "application/json"
@@ -842,7 +842,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_timeouts",
          "produces": [
            "application/json"
@@ -857,7 +857,7 @@
        {
          "method": "GET",
          "summary": "Get range metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_range_metrics_unavailables",
          "produces": [
            "application/json"
@@ -902,7 +902,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_timeouts",
          "produces": [
            "application/json"
@@ -917,7 +917,7 @@
        {
          "method": "GET",
          "summary": "Get write metrics",
-          "type": "int",
+          "type": "long",
          "nickname": "get_write_metrics_unavailables",
          "produces": [
            "application/json"
@@ -1023,7 +1023,7 @@
            {
               "method":"GET",
               "summary":"Get read latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_read_latency",
               "produces":[
                  "application/json"
@@ -1055,7 +1055,7 @@
            {
               "method":"GET",
               "summary":"Get write latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_write_latency",
               "produces":[
                  "application/json"
@@ -1087,7 +1087,7 @@
            {
               "method":"GET",
               "summary":"Get range latency",
-               "type":"int",
+               "type": "long",
               "nickname":"get_range_latency",
               "produces":[
                  "application/json"
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -458,7 +458,7 @@
            {
               "method":"GET",
               "summary":"Return the generation value for this node.",
-               "type":"int",
+               "type": "long",
               "nickname":"get_current_generation_number",
               "produces":[
                  "application/json"
@@ -582,7 +582,15 @@
                  },
                  {
                     "name":"kn",
-                     "description":"Comma seperated keyspaces name to snapshot",
+                     "description":"Comma separated keyspace names whose snapshots will be deleted",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"cf",
+                     "description":"an optional table name whose snapshot will be deleted",
                     "required":false,
                     "allowMultiple":false,
                     "type":"string",
@@ -646,7 +654,7 @@
            {
               "method":"POST",
               "summary":"Trigger a cleanup of keys on a single keyspace",
-               "type":"int",
+               "type": "long",
               "nickname":"force_keyspace_cleanup",
               "produces":[
                  "application/json"
@@ -678,7 +686,7 @@
            {
               "method":"GET",
               "summary":"Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace. If columnFamilies array is empty, all CFs are scrubbed. Scrubbed CFs will be snapshotted first, if disableSnapshot is false",
-               "type":"int",
+               "type": "long",
               "nickname":"scrub",
               "produces":[
                  "application/json"
@@ -726,7 +734,7 @@
            {
               "method":"GET",
               "summary":"Rewrite all sstables to the latest version. Unlike scrub, it doesn't skip bad rows and do not snapshot sstables first.",
-               "type":"int",
+               "type": "long",
               "nickname":"upgrade_sstables",
               "produces":[
                  "application/json"
@@ -800,7 +808,7 @@
               "summary":"Return an array with the ids of the currently active repairs",
               "type":"array",
               "items":{
-                  "type":"int"
+                  "type": "long"
               },
               "nickname":"get_active_repair_async",
               "produces":[
@@ -816,7 +824,7 @@
            {
               "method":"POST",
               "summary":"Invoke repair asynchronously. You can track repair progress by using the get supplying id",
-               "type":"int",
+               "type": "long",
               "nickname":"repair_async",
               "produces":[
                  "application/json"
@@ -947,7 +955,7 @@
                     "description":"The repair ID to check for status",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1277,18 +1285,18 @@
                  },
                  {
                     "name":"dynamic_update_interval",
-                     "description":"integer, in ms (default 100)",
+                     "description":"interval in ms (default 100)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
                     "name":"dynamic_reset_interval",
-                     "description":"integer, in ms (default 600,000)",
+                     "description":"interval in ms (default 600,000)",
                     "required":false,
                     "allowMultiple":false,
-                     "type":"integer",
+                     "type":"long",
                     "paramType":"query"
                  },
                  {
@@ -1493,7 +1501,7 @@
                     "description":"Stream throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1501,7 +1509,7 @@
            {
               "method":"GET",
               "summary":"Get stream throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_stream_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1517,7 +1525,7 @@
            {
               "method":"GET",
               "summary":"get compaction throughput mb per sec",
-               "type":"int",
+               "type": "long",
               "nickname":"get_compaction_throughput_mb_per_sec",
               "produces":[
                  "application/json"
@@ -1539,7 +1547,7 @@
                     "description":"compaction throughput",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1943,7 +1951,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for warning of queries with many tombstones",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_warn_threshold",
               "produces":[
                  "application/json"
@@ -1965,7 +1973,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -1978,7 +1986,7 @@
            {
               "method":"GET",
               "summary":"",
-               "type":"int",
+               "type": "long",
               "nickname":"get_tombstone_failure_threshold",
               "produces":[
                  "application/json"
@@ -2000,7 +2008,7 @@
                     "description":"tombstone debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2013,7 +2021,7 @@
            {
               "method":"GET",
               "summary":"Returns the threshold for rejecting queries due to a large batch size",
-               "type":"int",
+               "type": "long",
               "nickname":"get_batch_size_failure_threshold",
               "produces":[
                  "application/json"
@@ -2035,7 +2043,7 @@
                     "description":"batch size debug threshold",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2059,7 +2067,7 @@
                     "description":"throttle in kb",
                     "required":true,
                     "allowMultiple":false,
-                     "type":"int",
+                     "type": "long",
                     "paramType":"query"
                  }
               ]
@@ -2072,7 +2080,7 @@
            {
               "method":"GET",
               "summary":"Get load",
-               "type":"int",
+               "type": "long",
               "nickname":"get_metrics_load",
               "produces":[
                  "application/json"
@@ -2088,7 +2096,7 @@
            {
               "method":"GET",
               "summary":"Get exceptions",
-               "type":"int",
+               "type": "long",
               "nickname":"get_exceptions",
               "produces":[
                  "application/json"
@@ -2104,7 +2112,7 @@
            {
               "method":"GET",
               "summary":"Get total hints in progress",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints_in_progress",
               "produces":[
                  "application/json"
@@ -2120,7 +2128,7 @@
            {
               "method":"GET",
               "summary":"Get total hints",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_hints",
               "produces":[
                  "application/json"
--- a/api/api-doc/stream_manager.json
+++ b/api/api-doc/stream_manager.json
@@ -32,7 +32,7 @@
            {
               "method":"GET",
               "summary":"Get number of active outbound streams",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_active_streams_outbound",
               "produces":[
                  "application/json"
@@ -48,7 +48,7 @@
            {
               "method":"GET",
               "summary":"Get total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -72,7 +72,7 @@
            {
               "method":"GET",
               "summary":"Get all total incoming bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_incoming_bytes",
               "produces":[
                  "application/json"
@@ -88,7 +88,7 @@
            {
               "method":"GET",
               "summary":"Get total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -112,7 +112,7 @@
            {
               "method":"GET",
               "summary":"Get all total outgoing bytes",
-               "type":"int",
+               "type": "long",
               "nickname":"get_all_total_outgoing_bytes",
               "produces":[
                  "application/json"
@@ -154,7 +154,7 @@
               "description":"The peer"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "connecting":{
@@ -211,7 +211,7 @@
               "description":"The ID"
            },
            "files":{
-               "type":"int",
+               "type": "long",
               "description":"Number of files to transfer. Can be 0 if nothing to transfer for some streaming request."
            },
            "total_size":{
@@ -242,7 +242,7 @@
               "description":"The peer address"
            },
            "session_index":{
-               "type":"int",
+               "type": "long",
               "description":"The session index"
            },
            "file_name":{
--- a/api/api-doc/system.json
+++ b/api/api-doc/system.json
@@ -52,6 +52,21 @@
            }
         ]
      },
+      {
+         "path":"/system/uptime_ms",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Get system uptime, in milliseconds",
+               "type":"long",
+               "nickname":"get_system_uptime",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[]
+            }
+         ]
+      },
      {
         "path":"/system/logger/{name}",
         "operations":[
--- a/api/api.cc
+++ b/api/api.cc
@@ -36,6 +36,7 @@
 #include "endpoint_snitch.hh"
 #include "compaction_manager.hh"
 #include "hinted_handoff.hh"
+#include "error_injection.hh"
 #include <seastar/http/exception.hh>
 #include "stream_manager.hh"
 #include "system.hh"
@@ -68,13 +69,19 @@ future<> set_server_init(http_context& ctx) {
        rb->set_api_doc(r);
        rb02->set_api_doc(r);
        rb02->register_api_file(r, "swagger20_header");
-        set_config(rb02, ctx, r);
        rb->register_function(r, "system",
                "The system related API");
        set_system(ctx, r);
    });
 }

+future<> set_server_config(http_context& ctx) {
+    auto rb02 = std::make_shared < api_registry_builder20 > (ctx.api_doc, "/v2");
+    return ctx.http_server.set_routes([&ctx, rb02](routes& r) {
+        set_config(rb02, ctx, r);
+    });
+}
+
 static future<> register_api(http_context& ctx, const sstring& api_name,
        const sstring api_desc,
        std::function<void(http_context& ctx, routes& r)> f) {
@@ -90,6 +97,10 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

+future<> set_server_snapshot(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
+}
+
 future<> set_server_snitch(http_context& ctx) {
    return register_api(ctx, "endpoint_snitch_info", "The endpoint snitch info API", set_endpoint_snitch);
 }
@@ -153,6 +164,9 @@ future<> set_server_done(http_context& ctx) {
        rb->register_function(r, "collectd",
                "The collectd API");
        set_collectd(ctx, r);
+        rb->register_function(r, "error_injection",
+                "The error injection API");
+        set_error_injection(ctx, r);
    });
 }

--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -23,6 +23,9 @@
 #include "service/storage_proxy.hh"
 #include <seastar/http/httpd.hh>

+namespace service { class load_meter; }
+namespace locator { class token_metadata; }
+
 namespace api {

 struct http_context {
@@ -31,15 +34,21 @@ struct http_context {
    httpd::http_server_control http_server;
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
+    service::load_meter& lmeter;
+    sharded<locator::token_metadata>& token_metadata;
+
    http_context(distributed<database>& _db,
-            distributed<service::storage_proxy>& _sp)
-            : db(_db), sp(_sp) {
+            distributed<service::storage_proxy>& _sp,
+            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
+            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
    }
 };

 future<> set_server_init(http_context& ctx);
+future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
 future<> set_server_messaging_service(http_context& ctx);
--- a/api/cache_service.cc
+++ b/api/cache_service.cc
@@ -208,9 +208,11 @@ void set_cache_service(http_context& ctx, routes& r) {
    });

    cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
-            return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().region().occupancy().used_space();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_row_hits.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -251,15 +253,19 @@ void set_cache_service(http_context& ctx, routes& r) {
    cs::get_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
        // In origin row size is the weighted size.
        // We currently do not support weights, so we use num entries instead
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().partitions();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().partitions();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](const column_family& cf) {
-            return cf.get_row_cache().partitions();
-        }, std::plus<uint64_t>());
+        return ctx.db.map_reduce0([](database& db) -> uint64_t {
+            return db.row_cache_tracker().partitions();
+        }, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
+            return make_ready_future<json::json_return_type>(res);
+        });
    });

    cs::get_counter_capacity.set(r, [] (std::unique_ptr<request> req) {
--- a/api/collectd.cc
+++ b/api/collectd.cc
@@ -64,7 +64,7 @@ static const char* str_to_regex(const sstring& v) {
 void set_collectd(http_context& ctx, routes& r) {
    cd::get_collectd.set(r, [&ctx](std::unique_ptr<request> req) {

-        auto id = make_shared<scollectd::type_instance_id>(req->param["pluginid"],
+        auto id = ::make_shared<scollectd::type_instance_id>(req->param["pluginid"],
                req->get_query_param("instance"), req->get_query_param("type"),
                req->get_query_param("type_instance"));

--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -26,7 +26,7 @@
 #include "sstables/sstables.hh"
 #include "utils/estimated_histogram.hh"
 #include <algorithm>
-
+#include "db/system_keyspace_view_types.hh"
 #include "db/data_listeners.hh"

 extern logging::logger apilog;
@@ -53,8 +53,7 @@ std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name) {
    return std::make_tuple(name.substr(0, pos), name.substr(end));
 }

-const utils::UUID& get_uuid(const sstring& name, const database& db) {
-    auto [ks, cf] = parse_fully_qualified_cf_name(name);
+const utils::UUID& get_uuid(const sstring& ks, const sstring& cf, const database& db) {
    try {
        return db.find_uuid(ks, cf);
    } catch (std::out_of_range& e) {
@@ -62,6 +61,11 @@ const utils::UUID& get_uuid(const sstring& name, const database& db) {
    }
 }

+const utils::UUID& get_uuid(const sstring& name, const database& db) {
+    auto [ks, cf] = parse_fully_qualified_cf_name(name);
+    return get_uuid(ks, cf, db);
+}
+
 future<> foreach_column_family(http_context& ctx, const sstring& name, function<void(column_family&)> f) {
    auto uuid = get_uuid(name, ctx.db.local());

@@ -71,28 +75,28 @@ future<> foreach_column_family(http_context& ctx, const sstring& name, function<
 }

 future<json::json_return_type>  get_cf_stats(http_context& ctx, const sstring& name,
-        int64_t column_family::stats::*f) {
+        int64_t column_family_stats::*f) {
    return map_reduce_cf(ctx, name, int64_t(0), [f](const column_family& cf) {
        return cf.get_stats().*f;
    }, std::plus<int64_t>());
 }

 future<json::json_return_type>  get_cf_stats(http_context& ctx,
-        int64_t column_family::stats::*f) {
+        int64_t column_family_stats::*f) {
    return map_reduce_cf(ctx, int64_t(0), [f](const column_family& cf) {
        return cf.get_stats().*f;
    }, std::plus<int64_t>());
 }

 static future<json::json_return_type>  get_cf_stats_count(http_context& ctx, const sstring& name,
-        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    return map_reduce_cf(ctx, name, int64_t(0), [f](const column_family& cf) {
        return (cf.get_stats().*f).hist.count;
    }, std::plus<int64_t>());
 }

 static future<json::json_return_type>  get_cf_stats_sum(http_context& ctx, const sstring& name,
-        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    auto uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([uuid, f](database& db) {
        // Histograms information is sample of the actual load
@@ -108,14 +112,14 @@ static future<json::json_return_type>  get_cf_stats_sum(http_context& ctx, const


 static future<json::json_return_type>  get_cf_stats_count(http_context& ctx,
-        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    return map_reduce_cf(ctx, int64_t(0), [f](const column_family& cf) {
        return (cf.get_stats().*f).hist.count;
    }, std::plus<int64_t>());
 }

 static future<json::json_return_type>  get_cf_histogram(http_context& ctx, const sstring& name,
-        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    utils::UUID uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([f, uuid](const database& p) {
        return (p.find_column_family(uuid).get_stats().*f).hist;},
@@ -126,7 +130,7 @@ static future<json::json_return_type>  get_cf_histogram(http_context& ctx, const
    });
 }

-static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    std::function<utils::ihistogram(const database&)> fun = [f] (const database& db)  {
        utils::ihistogram res;
        for (auto i : db.get_column_families()) {
@@ -142,7 +146,7 @@ static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils:
 }

 static future<json::json_return_type>  get_cf_rate_and_histogram(http_context& ctx, const sstring& name,
-        utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+        utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    utils::UUID uuid = get_uuid(name, ctx.db.local());
    return ctx.db.map_reduce0([f, uuid](const database& p) {
        return (p.find_column_family(uuid).get_stats().*f).rate();},
@@ -153,7 +157,7 @@ static future<json::json_return_type>  get_cf_rate_and_histogram(http_context& c
    });
 }

-static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family::stats::*f) {
+static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_and_histogram column_family_stats::*f) {
    std::function<utils::rate_moving_average_and_histogram(const database&)> fun = [f] (const database& db)  {
        utils::rate_moving_average_and_histogram res;
        for (auto i : db.get_column_families()) {
@@ -250,12 +254,11 @@ class sum_ratio {
    uint64_t _n = 0;
    T _total = 0;
 public:
-    future<> operator()(T value) {
+    void operator()(T value) {
        if (value > 0) {
            _total += value;
            _n++;
        }
-        return make_ready_future<>();
    }
    // Returns average value of all registered ratios.
    T get() && {
@@ -404,11 +407,11 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::memtable_switch_count);
+        return get_cf_stats(ctx,req->param["name"] ,&column_family_stats::memtable_switch_count);
    });

    cf::get_all_memtable_switch_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::memtable_switch_count);
+        return get_cf_stats(ctx, &column_family_stats::memtable_switch_count);
    });

    // FIXME: this refers to partitions, not rows.
@@ -453,67 +456,67 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_pending_flushes.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx,req->param["name"] ,&column_family::stats::pending_flushes);
+        return get_cf_stats(ctx,req->param["name"] ,&column_family_stats::pending_flushes);
    });

    cf::get_all_pending_flushes.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::pending_flushes);
+        return get_cf_stats(ctx, &column_family_stats::pending_flushes);
    });

    cf::get_read.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_count(ctx,req->param["name"] ,&column_family::stats::reads);
+        return get_cf_stats_count(ctx,req->param["name"] ,&column_family_stats::reads);
    });

    cf::get_all_read.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_count(ctx, &column_family::stats::reads);
+        return get_cf_stats_count(ctx, &column_family_stats::reads);
    });

    cf::get_write.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_count(ctx, req->param["name"] ,&column_family::stats::writes);
+        return get_cf_stats_count(ctx, req->param["name"] ,&column_family_stats::writes);
    });

    cf::get_all_write.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_count(ctx, &column_family::stats::writes);
+        return get_cf_stats_count(ctx, &column_family_stats::writes);
    });

    cf::get_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::reads);
+        return get_cf_histogram(ctx, req->param["name"], &column_family_stats::reads);
    });

    cf::get_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family::stats::reads);
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family_stats::reads);
    });

    cf::get_read_latency.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_sum(ctx,req->param["name"] ,&column_family::stats::reads);
+        return get_cf_stats_sum(ctx,req->param["name"] ,&column_family_stats::reads);
    });

    cf::get_write_latency.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats_sum(ctx, req->param["name"] ,&column_family::stats::writes);
+        return get_cf_stats_sum(ctx, req->param["name"] ,&column_family_stats::writes);
    });

    cf::get_all_read_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, &column_family::stats::writes);
+        return get_cf_histogram(ctx, &column_family_stats::reads);
    });

    cf::get_all_read_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_rate_and_histogram(ctx, &column_family::stats::writes);
+        return get_cf_rate_and_histogram(ctx, &column_family_stats::reads);
    });

    cf::get_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::writes);
+        return get_cf_histogram(ctx, req->param["name"], &column_family_stats::writes);
    });

    cf::get_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family::stats::writes);
+        return get_cf_rate_and_histogram(ctx, req->param["name"], &column_family_stats::writes);
    });

    cf::get_all_write_latency_histogram_depricated.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, &column_family::stats::writes);
+        return get_cf_histogram(ctx, &column_family_stats::writes);
    });

    cf::get_all_write_latency_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_rate_and_histogram(ctx, &column_family::stats::writes);
+        return get_cf_rate_and_histogram(ctx, &column_family_stats::writes);
    });

    cf::get_pending_compactions.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -529,11 +532,11 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, req->param["name"], &column_family::stats::live_sstable_count);
+        return get_cf_stats(ctx, req->param["name"], &column_family_stats::live_sstable_count);
    });

    cf::get_all_live_ss_table_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::live_sstable_count);
+        return get_cf_stats(ctx, &column_family_stats::live_sstable_count);
    });

    cf::get_unleveled_sstables.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -792,25 +795,25 @@ void set_column_family(http_context& ctx, routes& r) {

    });

-    cf::get_cas_prepare.set(r, [] (std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        //auto id = get_uuid(req->param["name"], ctx.db.local());
-        return make_ready_future<json::json_return_type>(0);
+    cf::get_cas_prepare.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+            return cf.get_stats().estimated_cas_prepare;
+        },
+        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });

-    cf::get_cas_propose.set(r, [] (std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        //auto id = get_uuid(req->param["name"], ctx.db.local());
-        return make_ready_future<json::json_return_type>(0);
+    cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+            return cf.get_stats().estimated_cas_accept;
+        },
+        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });

-    cf::get_cas_commit.set(r, [] (std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        //auto id = get_uuid(req->param["name"], ctx.db.local());
-        return make_ready_future<json::json_return_type>(0);
+    cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+            return cf.get_stats().estimated_cas_learn;
+        },
+        utils::estimated_histogram_merge, utils_json::estimated_histogram());
    });

    cf::get_sstables_per_read_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -821,11 +824,11 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_tombstone_scanned_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::tombstone_scanned);
+        return get_cf_histogram(ctx, req->param["name"], &column_family_stats::tombstone_scanned);
    });

    cf::get_live_scanned_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return get_cf_histogram(ctx, req->param["name"], &column_family::stats::live_scanned);
+        return get_cf_histogram(ctx, req->param["name"], &column_family_stats::live_scanned);
    });

    cf::get_col_update_time_delta_histogram.set(r, [] (std::unique_ptr<request> req) {
@@ -836,19 +839,49 @@ void set_column_family(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    cf::is_auto_compaction_disabled.set(r, [] (const_req req) {
-        // FIXME
-        // currently auto compaction is disable
-        // it should be changed when it would have an API
-        return true;
+    cf::get_auto_compaction.set(r, [&ctx] (const_req req) {
+        const utils::UUID& uuid = get_uuid(req.param["name"], ctx.db.local());
+        column_family& cf = ctx.db.local().find_column_family(uuid);
+        return !cf.is_auto_compaction_disabled_by_user();
    });

-    cf::get_built_indexes.set(r, [](const_req) {
-        // FIXME
-        // Currently there are no index support
-        return std::vector<sstring>();
+    cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            cf.enable_auto_compaction();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

+    cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
+            cf.disable_auto_compaction();
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    cf::get_built_indexes.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->param["name"]);
+        return db::system_keyspace::load_view_build_progress().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace::view_build_progress>& vb) mutable {
+            std::set<sstring> vp; // names of views in this keyspace that have a build-progress entry
+            for (const auto& b : vb) { // by reference: avoid copying each progress entry
+                if (b.view.first == ks) {
+                    vp.insert(b.view.second);
+                }
+            }
+            std::vector<sstring> res;
+            auto uuid = get_uuid(ks, cf_name, ctx.db.local());
+            column_family& cf = ctx.db.local().find_column_family(uuid);
+            res.reserve(cf.get_index_manager().list_indexes().size());
+            for (auto&& i : cf.get_index_manager().list_indexes()) {
+                if (vp.find(secondary_index::index_table_name(i.metadata().name())) == vp.end()) { // report only indexes whose backing view has no progress entry
+                    res.emplace_back(i.metadata().name());
+                }
+            }
+            return make_ready_future<json::json_return_type>(res);
+        });
+    });

    cf::get_compression_metadata_off_heap_memory_used.set(r, [](const_req) {
        // FIXME
@@ -976,5 +1009,15 @@ void set_column_family(http_context& ctx, routes& r) {
        });
    });

+    cf::force_major_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
+        if (req->get_query_param("split_output") != "") {
+            fail(unimplemented::cause::API);
+        }
+        return ctx.db.invoke_on_all([uuid = get_uuid(req->param["name"], ctx.db.local())](database& db) {
+            return db.find_column_family(uuid).compact_all_sstables(); // return the future so the request waits for compaction and reports its failures
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
 }
 }
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -109,9 +109,9 @@ future<json::json_return_type> map_reduce_cf(http_context& ctx, I init,
 }

 future<json::json_return_type>  get_cf_stats(http_context& ctx, const sstring& name,
-        int64_t column_family::stats::*f);
+        int64_t column_family_stats::*f);

 future<json::json_return_type>  get_cf_stats(http_context& ctx,
-        int64_t column_family::stats::*f);
+        int64_t column_family_stats::*f);

 }
--- a/api/compaction_manager.cc
+++ b/api/compaction_manager.cc
@@ -74,13 +74,14 @@ void set_compaction_manager(http_context& ctx, routes& r) {

    cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<request> req) {
        return ctx.db.map_reduce0([&ctx](database& db) {
-            std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash> tasks;
-            return do_for_each(db.get_column_families(), [&tasks](const std::pair<utils::UUID, seastar::lw_shared_ptr<table>>& i) {
-                table& cf = *i.second.get();
-                tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.get_compaction_strategy().estimated_pending_compactions(cf);
-                return make_ready_future<>();
-            }).then([&tasks] {
-                return tasks;
+            return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&ctx, &db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
+                return do_for_each(db.get_column_families(), [&tasks](const std::pair<utils::UUID, seastar::lw_shared_ptr<table>>& i) {
+                    table& cf = *i.second.get();
+                    tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.get_compaction_strategy().estimated_pending_compactions(cf);
+                    return make_ready_future<>();
+                }).then([&tasks] {
+                    return std::move(tasks);
+                });
            });
        }, std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), sum_pending_tasks).then(
                [](const std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& task_map) {
--- a/api/error_injection.cc
+++ b/api/error_injection.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "api/api-doc/error_injection.json.hh"
+#include "api/api.hh"
+
+#include <seastar/http/exception.hh>
+#include "log.hh"
+#include "utils/error_injection.hh"
+#include "seastar/core/future-util.hh"
+
+namespace api {
+
+namespace hf = httpd::error_injection_json;
+
+void set_error_injection(http_context& ctx, routes& r) {
+
+    hf::enable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+        bool one_shot = req->get_query_param("one_shot") == "True";
+        auto& errinj = utils::get_local_injector();
+        return errinj.enable_on_all(injection, one_shot).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+    hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        auto ret = errinj.enabled_injections_on_all();
+        return make_ready_future<json::json_return_type>(ret);
+    });
+
+    hf::disable_injection.set(r, [](std::unique_ptr<request> req) {
+        sstring injection = req->param["injection"];
+
+        auto& errinj = utils::get_local_injector();
+        return errinj.disable_on_all(injection).then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+    hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
+        auto& errinj = utils::get_local_injector();
+        return errinj.disable_on_all().then([] {
+            return make_ready_future<json::json_return_type>(json::json_void());
+        });
+    });
+
+}
+
+} // namespace api
--- a/api/error_injection.hh
+++ b/api/error_injection.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "api.hh"
+
+namespace api {
+
+void set_error_injection(http_context& ctx, routes& r);
+
+}
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -27,6 +27,7 @@
 #include "db/config.hh"
 #include "utils/histogram.hh"
 #include "database.hh"
+#include "seastar/core/scheduling_specific.hh"

 namespace api {

@@ -34,12 +35,70 @@ namespace sp = httpd::storage_proxy_json;
 using proxy = service::storage_proxy;
 using namespace json;

-static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
-    return d.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).rate();}, utils::rate_moving_average(),
-            std::plus<utils::rate_moving_average>());
+
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is the distributed storage_proxy class and the
+ * second level is the per scheduling group stats class.
+ * @param d - a reference to the storage_proxy distributed class.
+ * @param mapper - the inner mapper that is used to map the inner
+ * stats class into a value of type `V`.
+ * @param reducer - the reducer that is used in both the outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations.
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename InnerMapper>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        InnerMapper mapper, Reducer reducer, V initial_value) {
+    return d.map_reduce0( [mapper, reducer, initial_value] (const service::storage_proxy& sp) {
+        return map_reduce_scheduling_group_specific<service::storage_proxy_stats::stats>(
+                mapper, reducer, initial_value, sp.get_stats_key());
+    }, initial_value, reducer);
 }

-static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+/**
+ * This function implements a two-dimensional map reduce where
+ * the first level is the distributed storage_proxy class and the
+ * second level is the per scheduling group stats class.
+ * @param d - a reference to the storage_proxy distributed class.
+ * @param f - a field pointer which is the implicit inner mapper.
+ * @param reducer - the reducer that is used in both the outer and inner
+ * aggregations.
+ * @param initial_value - the initial value to use for both aggregations.
+ * @return A future that resolves to the result of the aggregation.
+ */
+template<typename V, typename Reducer, typename F>
+future<V> two_dimensional_map_reduce(distributed<service::storage_proxy>& d,
+        V F::*f, Reducer reducer, V initial_value) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) {
+        return stats.*f;
+    }, reducer, initial_value);
+}
+
+/**
+ * A variant of sum_stats for the storage proxy case, where
+ * the get stats function doesn't return a stats object with
+ * fields but a per scheduling group stats object. It has a
+ * different name because partial specialization of function
+ * templates is not supported in C++.
+ *
+ */
+template<typename V, typename F>
+future<json::json_return_type>  sum_stats_storage_proxy(distributed<proxy>& d, V F::*f) {
+    return two_dimensional_map_reduce(d, [f] (F& stats) { return stats.*f; }, std::plus<V>(), V(0)).then([] (V val) {
+        return make_ready_future<json::json_return_type>(val);
+    });
+}
+
+
+static future<utils::rate_moving_average>  sum_timed_rate(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average>(), utils::rate_moving_average());
+}
+
+static future<json::json_return_type>  sum_timed_rate_as_obj(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        httpd::utils_json::rate_moving_average m;
        m = val;
@@ -51,29 +110,72 @@ httpd::utils_json::rate_moving_average_and_histogram get_empty_moving_average()
    return timer_to_json(utils::rate_moving_average_and_histogram());
 }

-static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average proxy::stats::*f) {
+static future<json::json_return_type>  sum_timed_rate_as_long(distributed<proxy>& d, utils::timed_rate_moving_average service::storage_proxy_stats::stats::*f) {
    return sum_timed_rate(d, f).then([](const utils::rate_moving_average& val) {
        return make_ready_future<json::json_return_type>(val.count);
    });
 }

-static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return p.get_stats().*f;}, utils::estimated_histogram(),
-            utils::estimated_histogram_merge).then([](const utils::estimated_histogram& val) {
+static future<json::json_return_type>  sum_estimated_histogram(http_context& ctx, utils::estimated_histogram service::storage_proxy_stats::stats::*f) {
+
+    return two_dimensional_map_reduce(ctx.sp, f, utils::estimated_histogram_merge,
+            utils::estimated_histogram()).then([](const utils::estimated_histogram& val) {
        utils_json::estimated_histogram res;
        res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

-static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram proxy::stats::*f) {
-    return ctx.sp.map_reduce0([f](const proxy& p) {return (p.get_stats().*f).hist.mean * (p.get_stats().*f).hist.count;}, 0.0,
-            std::plus<double>()).then([](double val) {
+static future<json::json_return_type>  total_latency(http_context& ctx, utils::timed_rate_moving_average_and_histogram service::storage_proxy_stats::stats::*f) {
+    return two_dimensional_map_reduce(ctx.sp, [f] (service::storage_proxy_stats::stats& stats) {
+            return (stats.*f).hist.mean * (stats.*f).hist.count;
+        }, std::plus<double>(), 0.0).then([](double val) {
        int64_t res = val;
        return make_ready_future<json::json_return_type>(res);
    });
 }

+/**
+ * A variant of sum_histogram_stats for the storage
+ * proxy case, where the get stats function doesn't
+ * return a stats object with fields but a per
+ * scheduling group stats object. It has a different
+ * name because partial specialization of function
+ * templates is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_histogram_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).hist;
+    }, std::plus<utils::ihistogram>(), utils::ihistogram()).
+            then([](const utils::ihistogram& val) {
+        return make_ready_future<json::json_return_type>(to_json(val));
+    });
+}
+
+/**
+ * A variant of sum_timer_stats for the storage
+ * proxy case, where the get stats function doesn't
+ * return a stats object with fields but a per
+ * scheduling group stats object. It has a different
+ * name because partial specialization of function
+ * templates is not supported in C++.
+ */
+template<typename F>
+future<json::json_return_type>
+sum_timer_stats_storage_proxy(distributed<proxy>& d,
+        utils::timed_rate_moving_average_and_histogram F::*f) {
+
+    return two_dimensional_map_reduce(d, [f] (service::storage_proxy_stats::stats& stats) {
+        return (stats.*f).rate();
+    }, std::plus<utils::rate_moving_average_and_histogram>(),
+            utils::rate_moving_average_and_histogram()).then([](const utils::rate_moving_average_and_histogram& val) {
+        return make_ready_future<json::json_return_type>(timer_to_json(val));
+    });
+}
+
 void set_storage_proxy(http_context& ctx, routes& r) {
    sp::get_total_hints.set(r, [](std::unique_ptr<request> req)  {
        //TBD
@@ -81,12 +183,9 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(0);
    });

-    sp::get_hinted_handoff_enabled.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        // FIXME
-        // hinted handoff is not supported currently,
-        // so we should return false
-        return make_ready_future<json::json_return_type>(false);
+    sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<request> req)  {
+        auto enabled = ctx.db.local().get_config().hinted_handoff_enabled();
+        return make_ready_future<json::json_return_type>(enabled);
    });

    sp::set_hinted_handoff_enabled.set(r, [](std::unique_ptr<request> req)  {
@@ -226,15 +325,15 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_repair_attempted.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_attempts);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_attempts);
    });

    sp::get_read_repair_repaired_blocking.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_blocking);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_blocking);
    });

    sp::get_read_repair_repaired_background.set(r, [&ctx](std::unique_ptr<request> req)  {
-        return sum_stats(ctx.sp, &proxy::stats::read_repair_repaired_background);
+        return sum_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read_repair_repaired_background);
    });

    sp::get_schema_versions.set(r, [](std::unique_ptr<request> req)  {
@@ -250,151 +349,119 @@ void set_storage_proxy(http_context& ctx, routes& r) {
        });
    });

-    sp::get_cas_read_timeouts.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return 0
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_read_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_timeouts);
    });

-    sp::get_cas_read_unavailables.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return 0
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_read_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_read_unavailables);
    });

-    sp::get_cas_write_timeouts.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return 0
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_timeouts);
    });

-    sp::get_cas_write_unavailables.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return 0
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::cas_write_unavailables);
    });

-    sp::get_cas_write_metrics_unfinished_commit.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_unfinished_commit);
    });

-    sp::get_cas_write_metrics_contention.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_metrics_contention.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_write_contention);
    });

-    sp::get_cas_write_metrics_condition_not_met.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_metrics_condition_not_met.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_write_condition_not_met);
    });

-    sp::get_cas_read_metrics_unfinished_commit.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_write_metrics_failed_read_round_optimization.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_failed_read_round_optimization);
    });

-    sp::get_cas_read_metrics_contention.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_read_metrics_unfinished_commit.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_stats(ctx.sp, &proxy::stats::cas_read_unfinished_commit);
    });

-    sp::get_cas_read_metrics_condition_not_met.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(0);
+    sp::get_cas_read_metrics_contention.set(r, [&ctx](std::unique_ptr<request> req) {
+        return sum_estimated_histogram(ctx, &proxy::stats::cas_read_contention);
    });

    sp::get_read_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_long(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_long(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_read_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_timeouts);
    });

    sp::get_read_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::read_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::read_unavailables);
    });

    sp::get_range_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_timeouts);
    });

    sp::get_range_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::range_slice_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::range_slice_unavailables);
    });

    sp::get_write_metrics_timeouts_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_timeouts);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_timeouts);
    });

    sp::get_write_metrics_unavailables_rates.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timed_rate_as_obj(ctx.sp, &proxy::stats::write_unavailables);
+        return sum_timed_rate_as_obj(ctx.sp, &service::storage_proxy_stats::stats::write_unavailables);
    });

    sp::get_range_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::range);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::write);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });

    sp::get_read_metrics_latency_histogram_depricated.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_histogram_stats(ctx.sp, &proxy::stats::read);
+        return sum_histogram_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_range_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::write);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::write);
    });
    sp::get_cas_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return empty moving average
-
-        return make_ready_future<json::json_return_type>(get_empty_moving_average());
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_write);
    });

    sp::get_cas_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        // FIXME
-        // cas is not supported yet, so just return empty moving average
-
-        return make_ready_future<json::json_return_type>(get_empty_moving_average());
+        return sum_timer_stats(ctx.sp, &proxy::stats::cas_read);
    });

    sp::get_view_write_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -406,30 +473,30 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_read_metrics_latency_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::read);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::read);
    });

    sp::get_read_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_read);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_read);
    });

    sp::get_read_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::read);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::read);
    });
    sp::get_write_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_estimated_histogram(ctx, &proxy::stats::estimated_write);
+        return sum_estimated_histogram(ctx, &service::storage_proxy_stats::stats::estimated_write);
    });

    sp::get_write_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::write);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::write);
    });

    sp::get_range_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return sum_timer_stats(ctx.sp, &proxy::stats::range);
+        return sum_timer_stats_storage_proxy(ctx.sp, &service::storage_proxy_stats::stats::range);
    });

    sp::get_range_latency.set(r, [&ctx](std::unique_ptr<request> req) {
-        return total_latency(ctx, &proxy::stats::range);
+        return total_latency(ctx, &service::storage_proxy_stats::stats::range);
    });
 }

--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -27,6 +27,7 @@
 #include <boost/range/adaptor/map.hpp>
 #include <boost/range/adaptor/filtered.hpp>
 #include "service/storage_service.hh"
+#include "service/load_meter.hh"
 #include "db/commitlog/commitlog.hh"
 #include "gms/gossiper.hh"
 #include "db/system_keyspace.hh"
@@ -41,8 +42,6 @@
 #include "database.hh"
 #include "db/extensions.hh"

-sstables::sstable::version_types get_highest_supported_format();
-
 namespace api {

 namespace ss = httpd::storage_service_json;
@@ -55,57 +54,70 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
    throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
 }

-static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
-    std::vector<ss::token_range> res;
-    for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
-        ss::token_range r;
-        r.start_token = d._start_token;
-        r.end_token = d._end_token;
-        r.endpoints = d._endpoints;
-        r.rpc_endpoints = d._rpc_endpoints;
-        for (auto det : d._endpoint_details) {
-            ss::endpoint_detail ed;
-            ed.host = det._host;
-            ed.datacenter = det._datacenter;
-            if (det._rack != "") {
-                ed.rack = det._rack;
-            }
-            r.endpoint_details.push(ed);
+static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
+    ss::token_range r;
+    r.start_token = d._start_token;
+    r.end_token = d._end_token;
+    r.endpoints = d._endpoints;
+    r.rpc_endpoints = d._rpc_endpoints;
+    for (auto det : d._endpoint_details) {
+        ss::endpoint_detail ed;
+        ed.host = det._host;
+        ed.datacenter = det._datacenter;
+        if (det._rack != "") {
+            ed.rack = det._rack;
        }
-        res.push_back(r);
+        r.endpoint_details.push(ed);
    }
-    return res;
+    return r;
+}
+
+using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
+
+static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
+    return [&ctx, f = std::move(f)](std::unique_ptr<request> req) { // ctx is captured by reference: it must outlive the returned handler
+        auto keyspace = validate_keyspace(ctx, req->param);
+        auto column_families = split_cf(req->get_query_param("cf"));
+        if (column_families.empty()) { // nothing parsed from "cf": fall back to every key of the keyspace's cf_meta_data()
+            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+        }
+        return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
+    };
+}
+
+future<> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
+    if (tables.empty()) { // an empty list selects every key of the keyspace's cf_meta_data()
+        tables = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
+    }
+    return ctx.db.invoke_on_all([keyspace, tables, enabled] (database& db) { // keyspace and tables are copied into the closure
+        return parallel_for_each(tables, [&db, keyspace, enabled](const sstring& table) mutable {
+            column_family& cf = db.find_column_family(keyspace, table);
+            if (enabled) {
+                cf.enable_auto_compaction();
+            } else {
+                cf.disable_auto_compaction();
+            }
+            return make_ready_future<>(); // the toggle itself is synchronous
+        });
+    });
 }

 void set_storage_service(http_context& ctx, routes& r) {
-    using ks_cf_func = std::function<future<json::json_return_type>(std::unique_ptr<request>, sstring, std::vector<sstring>)>;
-
-    auto wrap_ks_cf = [&ctx](ks_cf_func f) {
-        return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
-            auto keyspace = validate_keyspace(ctx, req->param);
-            auto column_families = split_cf(req->get_query_param("cf"));
-            if (column_families.empty()) {
-                column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
-            }
-            return f(std::move(req), std::move(keyspace), std::move(column_families));
-        };
-    };
-
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
            return make_ready_future<json::json_return_type>(id.to_sstring());
        });
    });

-    ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
+    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

-    ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
+    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -123,8 +135,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        }));
    });

-    ss::get_leaving_nodes.set(r, [](const_req req) {
-        return container_to_vec(service::get_local_storage_service().get_token_metadata().get_leaving_endpoints());
+    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
+        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -132,8 +144,8 @@ void set_storage_service(http_context& ctx, routes& r) {
        return container_to_vec(addr);
    });

-    ss::get_joining_nodes.set(r, [](const_req req) {
-        auto points = service::get_local_storage_service().get_token_metadata().get_bootstrap_tokens();
+    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
+        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -176,27 +188,26 @@ void set_storage_service(http_context& ctx, routes& r) {
        return make_ready_future<json::json_return_type>(res);
    });

-    ss::describe_any_ring.set(r, [&ctx](const_req req) {
-        return describe_ring("");
+    ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
    });

-    ss::describe_ring.set(r, [&ctx](const_req req) {
-        auto keyspace = validate_keyspace(ctx, req.param);
-        return describe_ring(keyspace);
+    ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
+        auto keyspace = validate_keyspace(ctx, req->param);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
    });

-    ss::get_host_id_map.set(r, [](const_req req) {
+    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(service::get_local_storage_service().
-                get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::live_disk_space_used);
+        return get_cf_stats(ctx, &column_family_stats::live_disk_space_used);
    });

-    ss::get_load_map.set(r, [] (std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_load_map().then([] (auto&& load_map) {
+    ss::get_load_map.set(r, [&ctx] (std::unique_ptr<request> req) {
+        return ctx.lmeter.get_load_map().then([] (auto&& load_map) {
            std::vector<ss::map_string_double> res;
            for (auto i : load_map) {
                ss::map_string_double val;
@@ -221,64 +232,6 @@ void set_storage_service(http_context& ctx, routes& r) {
                req.get_query_param("key")));
    });

-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().get_snapshot_details().then([] (auto result) {
-            std::vector<ss::snapshots> res;
-            for (auto& map: result) {
-                ss::snapshots all_snapshots;
-                all_snapshots.key = map.first;
-
-                std::vector<ss::snapshot> snapshot;
-                for (auto& cf: map.second) {
-                    ss::snapshot s;
-                    s.ks = cf.ks;
-                    s.cf = cf.cf;
-                    s.live = cf.live;
-                    s.total = cf.total;
-                    snapshot.push_back(std::move(s));
-                }
-                all_snapshots.value = std::move(snapshot);
-                res.push_back(std::move(all_snapshots));
-            }
-            return make_ready_future<json::json_return_type>(std::move(res));
-        });
-    });
-
-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-        auto column_family = req->get_query_param("cf");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-
-        auto resp = make_ready_future<>();
-        if (column_family.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
-        } else {
-            if (keynames.size() > 1) {
-                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
-            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
-        }
-        return resp.then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
-        auto tag = req->get_query_param("tag");
-
-        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
-            return make_ready_future<json::json_return_type>(size);
-        });
-    });
-
    ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
        auto keyspace = validate_keyspace(ctx, req->param);
        auto column_families = split_cf(req->get_query_param("cf"));
@@ -304,46 +257,28 @@ void set_storage_service(http_context& ctx, routes& r) {
        if (column_families.empty()) {
            column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
        }
-        return ctx.db.invoke_on_all([keyspace, column_families] (database& db) {
-            std::vector<column_family*> column_families_vec;
-            auto& cm = db.get_compaction_manager();
-            for (auto cf : column_families) {
-                column_families_vec.push_back(&db.find_column_family(keyspace, cf));
+        return service::get_local_storage_service().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
+                column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
+            if (!is_cleanup_allowed) {
+                return make_exception_future<json::json_return_type>(
+                        std::runtime_error("Can not perform cleanup operation when topology changes"));
            }
-            return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                return cm.perform_cleanup(cf);
+            return ctx.db.invoke_on_all([keyspace, column_families] (database& db) {
+                std::vector<column_family*> column_families_vec;
+                auto& cm = db.get_compaction_manager();
+                for (auto cf : column_families) {
+                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
+                }
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
+                });
+            }).then([]{
+                return make_ready_future<json::json_return_type>(0);
            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
        });
    });

-    ss::scrub.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
-        // TODO: respect this
-        auto skip_corrupted = req->get_query_param("skip_corrupted");
-
-        auto f = make_ready_future<>();
-        if (!req_param<bool>(*req, "disable_snapshot", false)) {
-            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
-            });
-        }
-
-        return f.then([&ctx, keyspace, column_families] {
-            return ctx.db.invoke_on_all([=] (database& db) {
-                return do_for_each(column_families, [=, &db](sstring cfname) {
-                    auto& cm = db.get_compaction_manager();
-                    auto& cf = db.find_column_family(keyspace, cfname);
-                    return cm.perform_sstable_scrub(&cf);
-                });
-            });
-        }).then([]{
-            return make_ready_future<json::json_return_type>(0);
-        });
-    }));
-
-    ss::upgrade_sstables.set(r, wrap_ks_cf([&ctx](std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);

        return ctx.db.invoke_on_all([=] (database& db) {
@@ -598,9 +533,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::join_ring.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().join_ring().then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
+        return make_ready_future<json::json_return_type>(json_void());
    });

    ss::is_joined.set(r, [] (std::unique_ptr<request> req) {
@@ -728,7 +661,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
        auto probability = req->get_query_param("probability");
-        return futurize<json::json_return_type>::apply([probability] {
+        return futurize_invoke([probability] {
            double real_prob = std::stod(probability.c_str());
            return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
                local_tracing.set_trace_probability(real_prob);
@@ -783,19 +716,19 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
-        auto column_family = req->get_query_param("cf");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto tables = split_cf(req->get_query_param("cf"));
+        return set_tables_autocompaction(ctx, keyspace, tables, true).then([]{
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
-        auto column_family = req->get_query_param("cf");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto tables = split_cf(req->get_query_param("cf"));
+        return set_tables_autocompaction(ctx, keyspace, tables, false).then([]{
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::deliver_hints.set(r, [](std::unique_ptr<request> req) {
@@ -860,7 +793,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_metrics_load.set(r, [&ctx](std::unique_ptr<request> req) {
-        return get_cf_stats(ctx, &column_family::stats::live_disk_space_used);
+        return get_cf_stats(ctx, &column_family_stats::live_disk_space_used);
    });

    ss::get_exceptions.set(r, [](const_req req) {
@@ -1031,4 +964,107 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

+// Registers the snapshot-related storage_service REST handlers (list, take, delete, size, scrub) on r.
+void set_snapshot(http_context& ctx, routes& r) {
+    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
+        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) { // body writer; returned below as the handler's result
+            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
+                return s.write("[").then([&s, &first] {
+                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
+                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
+                            return do_for_each(result, [&s, &first](const auto& entry) { // by reference: result is kept alive by do_with, so no per-entry copy is needed
+                                return do_with(ss::snapshots(), [&s, &first, &entry](ss::snapshots& all_snapshots) {
+                                    all_snapshots.key = entry.first;
+                                    future<> f = first ? make_ready_future<>() : s.write(", "); // separator before every element except the first
+                                    first = false;
+                                    std::vector<ss::snapshot> snapshot;
+                                    for (auto& cf: entry.second) {
+                                        ss::snapshot snp;
+                                        snp.ks = cf.ks;
+                                        snp.cf = cf.cf;
+                                        snp.live = cf.live;
+                                        snp.total = cf.total;
+                                        snapshot.push_back(std::move(snp));
+                                    }
+                                    all_snapshots.value = std::move(snapshot);
+                                    return f.then([&s, &all_snapshots] { // entry is not used past this point; only s and all_snapshots are
+                                        return all_snapshots.write(s);
+                                    });
+                                });
+                            });
+                        });
+                    }).then([&s] {
+                        return s.write("]").then([&s] {
+                            return s.close();
+                        });
+                    });
+                });
+            });
+        };
+        return make_ready_future<json::json_return_type>(std::move(f));
+    });
+    // Without "cf": snapshot the keyspaces listed in "kn". With "cf": exactly one keyspace must be given.
+    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+
+        auto resp = make_ready_future<>();
+        if (column_family.empty()) {
+            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+        } else {
+            if (keynames.empty()) { // keynames[0] is read below, so an empty list must be rejected first
+                throw httpd::bad_param_exception("The keyspace of column families must be specified");
+            }
+            if (keynames.size() > 1) {
+                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
+            }
+            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_family, tag);
+        }
+        return resp.then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+    // Forwards "tag", the "kn" keyspace list and "cf" unchanged to storage_service::clear_snapshot().
+    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+        auto tag = req->get_query_param("tag");
+        auto column_family = req->get_query_param("cf");
+
+        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
+        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+    // Returns the value produced by storage_service::true_snapshots_size().
+    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
+        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+            return make_ready_future<json::json_return_type>(size);
+        });
+    });
+    // Unless "disable_snapshot" is set, snapshots each table under a "pre-scrub-<timestamp>" tag, then scrubs via ctx.db.invoke_on_all.
+    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);
+
+        auto f = make_ready_future<>();
+        if (!req_param<bool>(*req, "disable_snapshot", false)) {
+            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
+            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
+                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            });
+        }
+
+        return f.then([&ctx, keyspace, column_families, skip_corrupted] {
+            return ctx.db.invoke_on_all([=] (database& db) {
+                return do_for_each(column_families, [=, &db](sstring cfname) {
+                    auto& cm = db.get_compaction_manager();
+                    auto& cf = db.find_column_family(keyspace, cfname);
+                    return cm.perform_sstable_scrub(&cf, skip_corrupted);
+                });
+            });
+        }).then([]{
+            return make_ready_future<json::json_return_type>(0);
+        });
+    }));
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -26,5 +26,6 @@
 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r);

 }
--- a/api/system.cc
+++ b/api/system.cc
@@ -22,6 +22,7 @@
 #include "api/api-doc/system.json.hh"
 #include "api/api.hh"

+#include <seastar/core/reactor.hh>
 #include <seastar/http/exception.hh>
 #include "log.hh"

@@ -30,6 +31,10 @@ namespace api {
 namespace hs = httpd::system_json;

 void set_system(http_context& ctx, routes& r) {
+    hs::get_system_uptime.set(r, [](const_req req) { // reports engine().uptime() converted to whole milliseconds
+        return std::chrono::duration_cast<std::chrono::milliseconds>(engine().uptime()).count();
+    });
+
    hs::get_all_logger_names.set(r, [](const_req req) {
        return logging::logger_registry().get_all_logger_names();
    });
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -21,8 +21,8 @@

 #include "atomic_cell.hh"
 #include "atomic_cell_or_collection.hh"
+#include "counters.hh"
 #include "types.hh"
-#include "types/collection.hh"

 /// LSA mirator for cells with irrelevant type
 ///
@@ -148,35 +148,6 @@ atomic_cell_or_collection::atomic_cell_or_collection(const abstract_type& type,
 {
 }

-static collection_mutation_view get_collection_mutation_view(const uint8_t* ptr)
-{
-    auto f = data::cell::structure::get_member<data::cell::tags::flags>(ptr);
-    auto ti = data::type_info::make_collection();
-    data::cell::context ctx(f, ti);
-    auto view = data::cell::structure::get_member<data::cell::tags::cell>(ptr).as<data::cell::tags::collection>(ctx);
-    auto dv = data::cell::variable_value::make_view(view, f.get<data::cell::tags::external_data>());
-    return collection_mutation_view { dv };
-}
-
-collection_mutation_view atomic_cell_or_collection::as_collection_mutation() const {
-    return get_collection_mutation_view(_data.get());
-}
-
-collection_mutation::collection_mutation(const collection_type_impl& type, collection_mutation_view v)
-    : _data(imr_object_type::make(data::cell::make_collection(v.data), &type.imr_state().lsa_migrator()))
-{
-}
-
-collection_mutation::collection_mutation(const collection_type_impl& type, bytes_view v)
-    : _data(imr_object_type::make(data::cell::make_collection(v), &type.imr_state().lsa_migrator()))
-{
-}
-
-collection_mutation::operator collection_mutation_view() const
-{
-    return get_collection_mutation_view(_data.get());
-}
-
 bool atomic_cell_or_collection::equals(const abstract_type& type, const atomic_cell_or_collection& other) const
 {
    auto ptr_a = _data.get();
@@ -231,7 +202,7 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
    size_t external_value_size = 0;
    if (flags.get<data::cell::tags::external_data>()) {
        if (flags.get<data::cell::tags::collection>()) {
-            external_value_size = get_collection_mutation_view(_data.get()).data.size_bytes();
+            external_value_size = as_collection_mutation().data.size_bytes();
        } else {
            auto cell_view = data::cell::atomic_cell_view(t.imr_state().type_info(), view);
            external_value_size = cell_view.value_size();
@@ -244,6 +215,61 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
        + imr_object_type::size_overhead + external_value_size;
 }

+// Type-agnostic cell dump: hex value (or counter delta) for live cells, deletion time for dead ones.
+std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv) {
+    if (!acv.is_live()) {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            acv.timestamp(), acv.deletion_time().time_since_epoch().count());
+    }
+    const bool has_ttl = acv.is_live_and_has_ttl();
+    return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
+        acv.is_counter_update()
+                ? "counter_update_value=" + to_sstring(acv.counter_update_value())
+                : to_hex(acv.value().linearize()),
+        acv.timestamp(),
+        has_ttl ? acv.expiry().time_since_epoch().count() : -1,
+        has_ttl ? acv.ttl().count() : 0);
+}
+
+// Prints an owning cell by viewing it and reusing the atomic_cell_view overload.
+std::ostream& operator<<(std::ostream& os, const atomic_cell& ac) {
+    return os << atomic_cell_view(ac);
+}
+
+// Type-aware cell dump: the value is rendered through the cell's type instead of as raw hex.
+std::ostream& operator<<(std::ostream& os, const atomic_cell_view::printer& acvp) {
+    const auto& cell = acvp._cell;
+    // Dead cells carry no value to render; only the timestamps are printed.
+    if (!cell.is_live()) {
+        return fmt_print(os, "atomic_cell{{DEAD,ts={:d},deletion_time={:d}}}",
+            cell.timestamp(), cell.deletion_time().time_since_epoch().count());
+    }
+    const auto& type = acvp._type;
+    std::ostringstream rendered;
+    if (!type.is_counter()) {
+        rendered << type.to_string(cell.value().linearize());
+    } else if (cell.is_counter_update()) {
+        rendered << "counter_update_value=" << cell.counter_update_value();
+    } else {
+        // Counter cell that is not an update: list the shards of the linearized cell.
+        rendered << "shards: ";
+        counter_cell_view::with_linearized(cell, [&rendered] (counter_cell_view& ccv) {
+            rendered << ::join(", ", ccv.shards());
+        });
+    }
+    const bool has_ttl = cell.is_live_and_has_ttl();
+    return fmt_print(os, "atomic_cell{{{},ts={:d},expiry={:d},ttl={:d}}}",
+        rendered.str(),
+        cell.timestamp(),
+        has_ttl ? cell.expiry().time_since_epoch().count() : -1,
+        has_ttl ? cell.ttl().count() : 0);
+}
+
+// Prints through the atomic_cell_view::printer overload by upcasting to the base printer.
+std::ostream& operator<<(std::ostream& os, const atomic_cell::printer& acp) {
+    return operator<<(os, static_cast<const atomic_cell_view::printer&>(acp));
+}
+
 std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::printer& p) {
    if (!p._cell._data.get()) {
        return os << "{ null atomic_cell_or_collection }";
@@ -253,9 +279,9 @@ std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection::prin
    if (dc::structure::get_member<dc::tags::flags>(p._cell._data.get()).get<dc::tags::collection>()) {
        os << "collection ";
        auto cmv = p._cell.as_collection_mutation();
-        os << to_hex(cmv.data.linearize());
+        os << collection_mutation_view::printer(*p._cdef.type, cmv);
    } else {
-        os << p._cell.as_atomic_cell(p._cdef);
+        os << atomic_cell_view::printer(*p._cdef.type, p._cell.as_atomic_cell(p._cdef));
    }
    return os << " }";
 }
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -153,6 +153,14 @@ public:
    }

    friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
+
+    class printer { // pairs a cell view with its type so the friend operator<< can format the value via the type
+        const abstract_type& _type; // held by reference: the printer must not outlive the type
+        const atomic_cell_view& _cell; // held by reference: the printer must not outlive the cell view
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : _type(type), _cell(cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp);
+    };
 };

 class atomic_cell_mutable_view final : public basic_atomic_cell_view<mutable_view::yes> {
@@ -219,30 +227,12 @@ public:
    static atomic_cell make_live_uninitialized(const abstract_type& type, api::timestamp_type timestamp, size_t size);
    friend class atomic_cell_or_collection;
    friend std::ostream& operator<<(std::ostream& os, const atomic_cell& ac);
-};

-class collection_mutation_view;
-
-// Represents a mutation of a collection.  Actual format is determined by collection type,
-// and is:
-//   set:  list of atomic_cell
-//   map:  list of pair<atomic_cell, bytes> (for key/value)
-//   list: tbd, probably ugly
-class collection_mutation {
-public:
-    using imr_object_type =  imr::utils::object<data::cell::structure>;
-    imr_object_type _data;
-
-    collection_mutation() {}
-    collection_mutation(const collection_type_impl&, collection_mutation_view v);
-    collection_mutation(const collection_type_impl&, bytes_view bv);
-    operator collection_mutation_view() const;
-};
-
-
-class collection_mutation_view {
-public:
-    atomic_cell_value_view data;
+    class printer : atomic_cell_view::printer { // privately inherits the view printer; stores the same (type, cell) pair
+    public:
+        printer(const abstract_type& type, const atomic_cell_view& cell) : atomic_cell_view::printer(type, cell) {}
+        friend std::ostream& operator<<(std::ostream& os, const printer& acvp); // friend so the operator can upcast to the private base
+    };
 };

 class column_definition;
--- a/atomic_cell_hash.hh
+++ b/atomic_cell_hash.hh
@@ -34,14 +34,12 @@ template<>
 struct appending_hash<collection_mutation_view> {
    template<typename Hasher>
    void operator()(Hasher& h, collection_mutation_view cell, const column_definition& cdef) const {
-      cell.data.with_linearized([&] (bytes_view cell_bv) {
-        auto ctype = static_pointer_cast<const collection_type_impl>(cdef.type);
-        auto m_view = ctype->deserialize_mutation_form(cell_bv);
-        ::feed_hash(h, m_view.tomb);
-        for (auto&& key_and_value : m_view.cells) {
-            ::feed_hash(h, key_and_value.first);
-            ::feed_hash(h, key_and_value.second, cdef);
-        }
+        cell.with_deserialized(*cdef.type, [&] (collection_mutation_view_description m_view) {
+            ::feed_hash(h, m_view.tomb);
+            for (auto&& key_and_value : m_view.cells) {
+                ::feed_hash(h, key_and_value.first);
+                ::feed_hash(h, key_and_value.second, cdef);
+            }
      });
    }
 };
--- a/atomic_cell_or_collection.hh
+++ b/atomic_cell_or_collection.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include "atomic_cell.hh"
+#include "collection_mutation.hh"
 #include "schema.hh"
 #include "hashing.hh"

--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -52,7 +52,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authenticator_name();
    }

--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -49,7 +49,7 @@ public:
        return make_ready_future<>();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return allow_all_authorizer_name();
    }

--- a/auth/authenticator.hh
+++ b/auth/authenticator.hh
@@ -96,7 +96,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    virtual bool require_authentication() const = 0;

--- a/auth/authorizer.hh
+++ b/auth/authorizer.hh
@@ -100,7 +100,7 @@ public:
    ///
    /// A fully-qualified (class with package) Java-like name for this implementation.
    ///
-    virtual const sstring& qualified_java_name() const = 0;
+    virtual std::string_view qualified_java_name() const = 0;

    ///
    /// Query for the permissions granted directly to a role for a particular \ref resource (and not any of its
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -59,22 +59,22 @@ future<> do_after_system_ready(seastar::abort_source& as, seastar::noncopyable_f
    }).discard_result();
 }

-future<> create_metadata_table_if_missing(
+static future<> create_metadata_table_if_missing_impl(
        std::string_view table_name,
        cql3::query_processor& qp,
        std::string_view cql,
        ::service::migration_manager& mm) {
    static auto ignore_existing = [] (seastar::noncopyable_function<future<>()> func) {
-        return futurize_apply(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
+        return futurize_invoke(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
    };
    auto& db = qp.db();
-    auto parsed_statement = static_pointer_cast<cql3::statements::raw::cf_statement>(
-            cql3::query_processor::parse_statement(cql));
+    auto parsed_statement = cql3::query_processor::parse_statement(cql);
+    auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);

-    parsed_statement->prepare_keyspace(meta::AUTH_KS);
+    parsed_cf_statement.prepare_keyspace(meta::AUTH_KS);

    auto statement = static_pointer_cast<cql3::statements::create_table_statement>(
-            parsed_statement->prepare(db, qp.get_cql_stats())->statement);
+            parsed_cf_statement.prepare(db, qp.get_cql_stats())->statement);

    const auto schema = statement->get_cf_meta_data(qp.db());
    const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
@@ -85,7 +85,14 @@ future<> create_metadata_table_if_missing(
    return ignore_existing([&mm, table = std::move(table)] () {
        return mm.announce_new_column_family(table, false);
    });
+}

+future<> create_metadata_table_if_missing( // noexcept entry point that forwards to create_metadata_table_if_missing_impl
+        std::string_view table_name,
+        cql3::query_processor& qp,
+        std::string_view cql,
+        ::service::migration_manager& mm) noexcept {
+    return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm); // NOTE(review): relies on futurize_invoke turning exceptions thrown by the impl into a failed future - confirm
 }

 future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -27,9 +27,10 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/abort_source.hh>
 #include <seastar/util/noncopyable_function.hh>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>
 #include <seastar/core/resource.hh>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/smp.hh>

 #include "log.hh"
 #include "seastarx.hh"
@@ -61,7 +62,7 @@ extern const sstring AUTH_PACKAGE_NAME;

 template <class Task>
 future<> once_among_shards(Task&& f) {
-    if (engine().cpu_id() == 0u) {
+    if (this_shard_id() == 0u) {
        return f();
    }

@@ -79,7 +80,7 @@ future<> create_metadata_table_if_missing(
        std::string_view table_name,
        cql3::query_processor&,
        std::string_view cql,
-        ::service::migration_manager&);
+        ::service::migration_manager&) noexcept;

 future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -51,7 +51,7 @@ extern "C" {

 #include <boost/algorithm/string/join.hpp>
 #include <boost/range.hpp>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
@@ -101,7 +101,7 @@ bool default_authorizer::legacy_metadata_exists() const {
 future<bool> default_authorizer::any_granted() const {
    static const sstring query = format("SELECT * FROM {}.{} LIMIT 1", meta::AUTH_KS, PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -115,7 +115,7 @@ future<> default_authorizer::migrate_legacy_metadata() const {
    alogger.info("Starting migration of legacy permissions metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -195,7 +195,7 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
            ROLE_NAME,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -224,7 +224,7 @@ default_authorizer::modify(
                    ROLE_NAME,
                    RESOURCE_NAME),
            [this, &role_name, set, &resource](const auto& query) {
-        return _qp.process(
+        return _qp.execute_internal(
                query,
                db::consistency_level::ONE,
                internal_distributed_timeout_config(),
@@ -249,7 +249,7 @@ future<std::vector<permission_details>> default_authorizer::list_all() const {
            meta::AUTH_KS,
            PERMISSIONS_CF);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -276,7 +276,7 @@ future<> default_authorizer::revoke_all(std::string_view role_name) const {
            PERMISSIONS_CF,
            ROLE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::ONE,
            internal_distributed_timeout_config(),
@@ -296,7 +296,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
            PERMISSIONS_CF,
            RESOURCE_NAME);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::LOCAL_ONE,
            infinite_timeout_config,
@@ -313,7 +313,7 @@ future<> default_authorizer::revoke_all(const resource& resource) const {
                        ROLE_NAME,
                        RESOURCE_NAME);

-                return _qp.process(
+                return _qp.execute_internal(
                        query,
                        db::consistency_level::LOCAL_ONE,
                        infinite_timeout_config,
--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return default_authorizer_name();
    }

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -48,7 +48,7 @@
 #include <optional>

 #include <boost/algorithm/cxx11/all_of.hpp>
-#include <seastar/core/reactor.hh>
+#include <seastar/core/seastar.hh>

 #include "auth/authenticated_user.hh"
 #include "auth/common.hh"
@@ -96,10 +96,13 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
    return !row.get_or<sstring>(SALTED_HASH, "").empty();
 }

-static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-        meta::roles_table::qualified_name(),
-        SALTED_HASH,
-        meta::roles_table::role_col_name);
+// UPDATE statement for a role's salted hash; the string is built on the first call, not at static-init time.
+static const sstring& update_row_query() {
+    static const sstring query = format("UPDATE {} SET {} = ? WHERE {} = ?",
+            meta::roles_table::qualified_name(),
+            SALTED_HASH, meta::roles_table::role_col_name);
+    return query;
+}

 static const sstring legacy_table_name{"credentials"};

@@ -111,7 +114,7 @@ future<> password_authenticator::migrate_legacy_metadata() const {
    plogger.info("Starting migration of legacy authentication metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
@@ -119,8 +122,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
            auto username = row.get_as<sstring>("username");
            auto salted_hash = row.get_as<sstring>(SALTED_HASH);

-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    consistency_for_user(username),
                    internal_distributed_timeout_config(),
                    {std::move(salted_hash), username}).discard_result();
@@ -136,8 +139,8 @@ future<> password_authenticator::migrate_legacy_metadata() const {
 future<> password_authenticator::create_default_if_missing() const {
    return default_role_row_satisfies(_qp, &has_salted_hash).then([this](bool exists) {
        if (!exists) {
-            return _qp.process(
-                    update_row_query,
+            return _qp.execute_internal(
+                    update_row_query(),
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
                    {passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
@@ -194,7 +197,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
    return db::consistency_level::LOCAL_ONE;
 }

-const sstring& password_authenticator::qualified_java_name() const {
+std::string_view password_authenticator::qualified_java_name() const { // NOTE(review): assumes password_authenticator_name() returns storage that outlives this call, otherwise the view dangles — confirm.
    return password_authenticator_name();
 }

@@ -227,13 +230,13 @@ future<authenticated_user> password_authenticator::authenticate(
    // obsolete prepared statements pretty quickly.
    // Rely on query processing caching statements instead, and lets assume
    // that a map lookup string->statement is not gonna kill us much.
-    return futurize_apply([this, username, password] {
+    return futurize_invoke([this, username, password] {
        static const sstring query = format("SELECT {} FROM {} WHERE {} = ?",
                SALTED_HASH,
                meta::roles_table::qualified_name(),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_user(username),
                internal_distributed_timeout_config(),
@@ -267,8 +270,8 @@ future<> password_authenticator::create(std::string_view role_name, const authen
        return make_ready_future<>();
    }

-    return _qp.process(
-            update_row_query,
+    return _qp.execute_internal(
+            update_row_query(),
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
            {passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
@@ -284,7 +287,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
            SALTED_HASH,
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_user(role_name),
            internal_distributed_timeout_config(),
@@ -297,7 +300,7 @@ future<> password_authenticator::drop(std::string_view name) const {
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query, consistency_for_user(name),
            internal_distributed_timeout_config(),
            {sstring(name)}).discard_result();
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -71,7 +71,7 @@ public:

    virtual future<> stop() override;

-    virtual const sstring& qualified_java_name() const override;
+    virtual std::string_view qualified_java_name() const override;

    virtual bool require_authentication() const override;

--- a/auth/role_manager.hh
+++ b/auth/role_manager.hh
@@ -33,6 +33,7 @@

 #include "auth/resource.hh"
 #include "seastarx.hh"
+#include "exceptions/exceptions.hh"

 namespace auth {

@@ -52,9 +53,9 @@ struct role_config_update final {
 ///
 /// A logical argument error for a role-management operation.
 ///
-class roles_argument_exception : public std::invalid_argument {
+class roles_argument_exception : public exceptions::invalid_request_exception { // Base changes from std::invalid_argument to exceptions::invalid_request_exception.
 public:
-    using std::invalid_argument::invalid_argument;
+    using exceptions::invalid_request_exception::invalid_request_exception; // Inherit the new base class's constructors.
 };

 class role_already_exists : public roles_argument_exception {
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -68,14 +68,14 @@ future<bool> default_role_row_satisfies(
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::ONE,
                infinite_timeout_config,
                {meta::DEFAULT_SUPERUSER_NAME},
                true).then([&qp, &p](::shared_ptr<cql3::untyped_result_set> results) {
            if (results->empty()) {
-                return qp.process(
+                return qp.execute_internal(
                        query,
                        db::consistency_level::QUORUM,
                        internal_distributed_timeout_config(),
@@ -100,7 +100,7 @@ future<bool> any_nondefault_role_row_satisfies(
    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());

    return do_with(std::move(p), [&qp](const auto& p) {
-        return qp.process(
+        return qp.execute_internal(
                query,
                db::consistency_level::QUORUM,
                internal_distributed_timeout_config()).then([&p](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/service.cc
+++ b/auth/service.cc
@@ -39,7 +39,7 @@
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
-#include "service/migration_listener.hh"
+#include "service/migration_manager.hh"
 #include "utils/class_registrator.hh"
 #include "database.hh"

@@ -77,17 +77,23 @@ private:
    void on_update_view(const sstring& ks_name, const sstring& view_name, bool columns_changed) override {}

    void on_drop_keyspace(const sstring& ks_name) override {
-        _authorizer.revoke_all(
+        // Do it in the background: the future returned by revoke_all() is deliberately discarded.
+        (void)_authorizer.revoke_all(
                auth::make_data_resource(ks_name)).handle_exception_type([](const unsupported_authorization_operation&) {
            // Nothing.
+        }).handle_exception([] (std::exception_ptr e) { // Any other failure is only logged, since the future is discarded.
+            log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
        });
    }

    void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
-        _authorizer.revoke_all(
+        // Do it in the background: the future returned by revoke_all() is deliberately discarded.
+        (void)_authorizer.revoke_all(
                auth::make_data_resource(
                        ks_name, cf_name)).handle_exception_type([](const unsupported_authorization_operation&) {
            // Nothing.
+        }).handle_exception([] (std::exception_ptr e) { // Any other failure is only logged, since the future is discarded.
+            log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
        });
    }

@@ -108,14 +114,14 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
-        ::service::migration_manager& mm,
+        ::service::migration_notifier& mn,
        std::unique_ptr<authorizer> z,
        std::unique_ptr<authenticator> a,
        std::unique_ptr<role_manager> r)
            : _permissions_cache_config(std::move(c))
            , _permissions_cache(nullptr)
            , _qp(qp)
-            , _migration_manager(mm)
+            , _mnotifier(mn)
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
@@ -135,18 +141,19 @@ service::service(
 service::service(
        permissions_cache_config c,
        cql3::query_processor& qp,
+        ::service::migration_notifier& mn,
        ::service::migration_manager& mm,
        const service_config& sc)
            : service(
                      std::move(c),
                      qp,
-                      mm,
+                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, mm),
                      create_object<authenticator>(sc.authenticator_java_name, qp, mm),
                      create_object<role_manager>(sc.role_manager_java_name, qp, mm)) {
 }

-future<> service::create_keyspace_if_missing() const {
+future<> service::create_keyspace_if_missing(::service::migration_manager& mm) const {
    auto& db = _qp.db();

    if (!db.has_keyspace(meta::AUTH_KS)) {
@@ -160,15 +167,15 @@ future<> service::create_keyspace_if_missing() const {

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return _migration_manager.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
    }

    return make_ready_future<>();
 }

-future<> service::start() {
-    return once_among_shards([this] {
-        return create_keyspace_if_missing();
+future<> service::start(::service::migration_manager& mm) {
+    return once_among_shards([this, &mm] {
+        return create_keyspace_if_missing(mm);
    }).then([this] {
        return _role_manager->start().then([this] {
            return when_all_succeed(_authorizer->start(), _authenticator->start());
@@ -177,7 +184,7 @@ future<> service::start() {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
        return once_among_shards([this] {
-            _migration_manager.register_listener(_migration_listener.get());
+            _mnotifier.register_listener(_migration_listener.get());
            return make_ready_future<>();
        });
    });
@@ -186,9 +193,12 @@ future<> service::start() {
 future<> service::stop() {
    // Only one of the shards has the listener registered, but let's try to
    // unregister on each one just to make sure.
-    _migration_manager.unregister_listener(_migration_listener.get());
-
-    return _permissions_cache->stop().then([this] {
+    return _mnotifier.unregister_listener(_migration_listener.get()).then([this] { // Unregistration is now awaited before the rest of shutdown proceeds.
+        if (_permissions_cache) { // The pointer is constructed as nullptr and assigned in start(), so it may still be null here.
+            return _permissions_cache->stop();
+        }
+        return make_ready_future<>();
+    }).then([this] {
        return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
    });
 }
@@ -210,7 +220,7 @@ future<bool> service::has_existing_legacy_users() const {
    // This logic is borrowed directly from Apache Cassandra. By first checking for the presence of the default user, we
    // can potentially avoid doing a range query with a high consistency level.

-    return _qp.process(
+    return _qp.execute_internal(
            default_user_query,
            db::consistency_level::ONE,
            infinite_timeout_config,
@@ -220,7 +230,7 @@ future<bool> service::has_existing_legacy_users() const {
            return make_ready_future<bool>(true);
        }

-        return _qp.process(
+        return _qp.execute_internal(
                default_user_query,
                db::consistency_level::QUORUM,
                infinite_timeout_config,
@@ -230,7 +240,7 @@ future<bool> service::has_existing_legacy_users() const {
                return make_ready_future<bool>(true);
            }

-            return _qp.process(
+            return _qp.execute_internal(
                    all_users_query,
                    db::consistency_level::QUORUM,
                    infinite_timeout_config).then([](auto results) {
@@ -409,7 +419,7 @@ future<> create_role(
            return make_ready_future<>();
        }

-        return futurize_apply(
+        return futurize_invoke(
                &validate_authentication_options_are_supported,
                options,
                ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {
@@ -433,7 +443,7 @@ future<> alter_role(
            return make_ready_future<>();
        }

-        return futurize_apply(
+        return futurize_invoke(
                &validate_authentication_options_are_supported,
                options,
                ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -28,6 +28,7 @@
 #include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/util/bool_class.hh>
+#include <seastar/core/sharded.hh>

 #include "auth/authenticator.hh"
 #include "auth/authorizer.hh"
@@ -42,6 +43,7 @@ class query_processor;

 namespace service {
 class migration_manager;
+class migration_notifier;
 class migration_listener;
 }

@@ -76,13 +78,15 @@ public:
 ///
 /// All state associated with access-control is stored externally to any particular instance of this class.
 ///
-class service final {
+/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
+/// given an object from another shard. Used for bouncing lwt requests to correct shard.
+class service final : public seastar::peering_sharded_service<service> {
    permissions_cache_config _permissions_cache_config;
    std::unique_ptr<permissions_cache> _permissions_cache;

    cql3::query_processor& _qp;

-    ::service::migration_manager& _migration_manager;
+    ::service::migration_notifier& _mnotifier;

    std::unique_ptr<authorizer> _authorizer;

@@ -97,7 +101,7 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
-            ::service::migration_manager&,
+            ::service::migration_notifier&,
            std::unique_ptr<authorizer>,
            std::unique_ptr<authenticator>,
            std::unique_ptr<role_manager>);
@@ -110,10 +114,11 @@ public:
    service(
            permissions_cache_config,
            cql3::query_processor&,
+            ::service::migration_notifier&,
            ::service::migration_manager&,
            const service_config&);

-    future<> start();
+    future<> start(::service::migration_manager&);

    future<> stop();

@@ -159,7 +164,7 @@ public:
 private:
    future<bool> has_existing_legacy_users() const;

-    future<> create_keyspace_if_missing() const;
+    future<> create_keyspace_if_missing(::service::migration_manager& mm) const;
 };

 future<bool> has_superuser(const service&, const authenticated_user&);
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -35,6 +35,7 @@
 #include "auth/common.hh"
 #include "auth/roles-metadata.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
 #include "exceptions/exceptions.hh"
 #include "log.hh"
@@ -86,7 +87,7 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return qp.process(
+    return qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -101,8 +102,8 @@ static future<std::optional<record>> find_record(cql3::query_processor& qp, std:
        return std::make_optional(
                record{
                        row.get_as<sstring>(sstring(meta::roles_table::role_col_name)),
-                        row.get_as<bool>("is_superuser"),
-                        row.get_as<bool>("can_login"),
+                        row.get_or<bool>("is_superuser", false),
+                        row.get_or<bool>("can_login", false),
                        (row.has("member_of")
                                 ? row.get_set<sstring>("member_of")
                                 : role_set())});
@@ -170,7 +171,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    db::consistency_level::QUORUM,
                    internal_distributed_timeout_config(),
@@ -197,13 +198,13 @@ future<> standard_role_manager::migrate_legacy_metadata() const {
    log.info("Starting migration of legacy user metadata.");
    static const sstring query = format("SELECT * FROM {}.{}", meta::AUTH_KS, legacy_table_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([this](::shared_ptr<cql3::untyped_result_set> results) {
        return do_for_each(*results, [this](const cql3::untyped_result_set_row& row) {
            role_config config;
-            config.is_superuser = row.get_as<bool>("super");
+            config.is_superuser = row.get_or<bool>("super", false);
            config.can_login = true;

            return do_with(
@@ -258,7 +259,7 @@ future<> standard_role_manager::create_or_replace(std::string_view role_name, co
            meta::roles_table::qualified_name(),
            meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            consistency_for_role(role_name),
            internal_distributed_timeout_config(),
@@ -298,7 +299,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat
            return make_ready_future<>();
        }

-        return _qp.process(
+        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
                        meta::roles_table::qualified_name(),
                        build_column_assignments(u),
@@ -320,7 +321,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
                    meta::role_members_table::qualified_name());

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -359,7 +360,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
                    meta::roles_table::qualified_name(),
                    meta::roles_table::role_col_name);

-            return _qp.process(
+            return _qp.execute_internal(
                    query,
                    consistency_for_role(role_name),
                    internal_distributed_timeout_config(),
@@ -386,7 +387,7 @@ standard_role_manager::modify_membership(
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

-        return _qp.process(
+        return _qp.execute_internal(
                query,
                consistency_for_role(grantee_name),
                internal_distributed_timeout_config(),
@@ -396,7 +397,7 @@ standard_role_manager::modify_membership(
    const auto modify_role_members = [this, role_name, grantee_name, ch] {
        switch (ch) {
            case membership_change::add:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -404,7 +405,7 @@ standard_role_manager::modify_membership(
                        {sstring(role_name), sstring(grantee_name)}).discard_result();

            case membership_change::remove:
-                return _qp.process(
+                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
                                meta::role_members_table::qualified_name()),
                        consistency_for_role(role_name),
@@ -508,7 +509,7 @@ future<role_set> standard_role_manager::query_all() const {
    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);

-    return _qp.process(
+    return _qp.execute_internal(
            query,
            db::consistency_level::QUORUM,
            internal_distributed_timeout_config()).then([](::shared_ptr<cql3::untyped_result_set> results) {
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -82,7 +82,7 @@ public:
        return _authenticator->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authenticator_name();
    }

@@ -158,7 +158,7 @@ public:
            }

            virtual future<authenticated_user> get_authenticated_user() const {
-                return futurize_apply([this] {
+                return futurize_invoke([this] {
                    return _sasl->get_authenticated_user().handle_exception([](auto ep) {
                        try {
                            std::rethrow_exception(ep);
@@ -201,7 +201,7 @@ public:
        return _authorizer->stop();
    }

-    virtual const sstring& qualified_java_name() const override {
+    virtual std::string_view qualified_java_name() const override {
        return transitional_authorizer_name();
    }

--- a/backlog_controller.hh
+++ b/backlog_controller.hh
@@ -23,7 +23,11 @@
 #include <seastar/core/scheduling.hh>
 #include <seastar/core/timer.hh>
 #include <seastar/core/gate.hh>
+#include <seastar/core/file.hh>
 #include <chrono>
+#include <cmath>
+
+#include "seastarx.hh"

 // Simple proportional controller to adjust shares for processes for which a backlog can be clearly
 // defined.
--- a/build_id.cc
+++ b/build_id.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#include "build_id.hh"
+#include <fmt/printf.h>
+#include <link.h>
+#include <seastar/core/align.hh>
+#include <sstream>
+#include <cassert>
+
+using namespace seastar;
+
+static const Elf64_Nhdr* get_nt_build_id(dl_phdr_info* info) { // Returns the first NT_GNU_BUILD_ID note found in the PT_NOTE segments of the object described by info.
+    auto base = info->dlpi_addr; // Added to each p_vaddr below to form an in-memory address.
+    const auto* h = info->dlpi_phdr;
+    auto num_headers = info->dlpi_phnum;
+    for (int i = 0; i != num_headers; ++i, ++h) {
+        if (h->p_type != PT_NOTE) { // Only PT_NOTE segments are searched.
+            continue;
+        }
+
+        auto* p = reinterpret_cast<const char*>(base) + h->p_vaddr;
+        auto* e = p + h->p_memsz;
+        while (p != e) { // NOTE(review): terminates only if the note records end exactly at e; `p < e` would be more defensive — confirm.
+            const auto* n = reinterpret_cast<const Elf64_Nhdr*>(p);
+            if (n->n_type == NT_GNU_BUILD_ID) {
+                return n;
+            }
+
+            p += sizeof(Elf64_Nhdr); // Step over the fixed-size note header,
+
+            p += n->n_namesz; // then the name,
+            p = align_up(p, 4); // rounded up to a 4-byte boundary,
+
+            p += n->n_descsz; // then the descriptor,
+            p = align_up(p, 4); // rounded up again, which is where the next note is read from.
+        }
+    }
+
+    assert(0 && "no NT_GNU_BUILD_ID note"); // NOTE(review): with NDEBUG this compiles away and control falls off the end of a non-void function.
+}
+
+static int callback(dl_phdr_info* info, size_t size, void* data) { // dl_iterate_phdr callback: stores the build id, as lowercase hex, in the std::string that data points to.
+    std::string& ret = *(std::string*)data;
+    std::ostringstream os;
+
+    // The first DSO is always the main program, which has an empty name.
+    assert(strlen(info->dlpi_name) == 0); // NOTE(review): strlen is declared in <cstring>, which this file does not include directly.
+
+    auto* n = get_nt_build_id(info);
+    auto* p = reinterpret_cast<const char*>(n);
+
+    p += sizeof(Elf64_Nhdr); // Skip the note header
+
+    p += n->n_namesz; // and the name,
+    p = align_up(p, 4); // rounded up to 4 bytes, to reach the descriptor bytes.
+
+    const char* desc = p;
+    for (unsigned i = 0; i < n->n_descsz; ++i) {
+        fmt::fprintf(os, "%02x", (unsigned char)*(desc + i)); // Two hex digits per descriptor byte.
+    }
+    ret = os.str();
+    return 1; // Non-zero return; get_build_id() asserts that dl_iterate_phdr reports this value.
+}
+
+std::string get_build_id() { // Returns the hex build id string that callback() produces from the NT_GNU_BUILD_ID note.
+    std::string ret;
+    int r = dl_iterate_phdr(callback, &ret); // callback fills ret through the data pointer.
+    assert(r == 1); // 1 is the value callback() returns after filling ret.
+    return ret;
+}
--- a/build_id.hh
+++ b/build_id.hh
@@ -0,0 +1,9 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <string>
+
+std::string get_build_id();
--- a/bytes.cc
+++ b/bytes.cc
@@ -64,7 +64,7 @@ bytes from_hex(sstring_view s) {

 sstring to_hex(bytes_view b) {
    static char digits[] = "0123456789abcdef";
-    sstring out(sstring::initialized_later(), b.size() * 2);
+    sstring out = uninitialized_string(b.size() * 2);
    unsigned end = b.size();
    for (unsigned i = 0; i != end; ++i) {
        uint8_t x = b[i];
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -38,6 +38,7 @@ class bytes_ostream {
 public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
+    using fragment_type = bytes_view;
    static constexpr size_type max_chunk_size() { return 128 * 1024; }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
@@ -93,6 +94,29 @@ public:
            return _current != other._current;
        }
    };
+    using const_iterator = fragment_iterator;
+
+    class output_iterator { // Output iterator over a bytes_ostream; obtained from write_begin().
+    public:
+        using iterator_category = std::output_iterator_tag;
+        using difference_type = std::ptrdiff_t;
+        using value_type = bytes_ostream::value_type;
+        using pointer = bytes_ostream::value_type*;
+        using reference = bytes_ostream::value_type&;
+
+        friend class bytes_ostream;
+
+    private:
+        bytes_ostream* _ostream = nullptr;
+
+    private:
+        explicit output_iterator(bytes_ostream& os) : _ostream(&os) { } // Private: only the friend bytes_ostream can construct one.
+
+    public:
+        reference operator*() const { return *_ostream->write_place_holder(1); } // Every dereference calls write_place_holder(1); presumably that reserves one new byte — confirm.
+        output_iterator& operator++() { return *this; } // Increment is a no-op; the iterator keeps no position of its own.
+        output_iterator operator++(int) { return *this; }
+    };
 private:
    inline size_type current_space_left() const {
        if (!_current) {
@@ -289,6 +313,11 @@ public:
        return _size;
    }

+    // For the FragmentRange concept: exposes _size under the name size_bytes().
+    size_type size_bytes() const {
+        return _size;
+    }
+
    bool empty() const {
        return _size == 0;
    }
@@ -326,6 +355,8 @@ public:
    fragment_iterator begin() const { return { _begin.get() }; }
    fragment_iterator end() const { return { nullptr }; }

+    output_iterator write_begin() { return output_iterator(*this); }
+
    boost::iterator_range<fragment_iterator> fragments() const {
        return { begin(), end() };
    }
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -61,6 +61,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
        // - _last_row points at a direct predecessor of the next row which is going to be read.
        //   Used for populating continuity.
        // - _population_range_starts_before_all_rows is set accordingly
+        // - _underlying is engaged and fast-forwarded
        reading_from_underlying,

        end_of_stream
@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    // forward progress is not guaranteed in case iterators are getting constantly invalidated.
    bool _lower_bound_changed = false;

+    // Points to the underlying reader conforming to _schema,
+    // either to *_underlying_holder or _read_context->underlying().underlying().
+    flat_mutation_reader* _underlying = nullptr;
+    std::optional<flat_mutation_reader> _underlying_holder;
+
    future<> do_fill_buffer(db::timeout_clock::time_point);
+    future<> ensure_underlying(db::timeout_clock::time_point);
    void copy_from_cache_to_buffer();
    future<> process_static_row(db::timeout_clock::time_point);
    void move_to_end();
@@ -169,7 +176,7 @@ public:
        return make_ready_future<>();
    }
    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
-        throw std::bad_function_call();
+        return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
    }
 };

@@ -186,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
        return make_ready_future<>();
    } else {
        _read_context->cache().on_row_miss();
-        return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
-            if (sr) {
-                assert(sr->is_static_row());
-                maybe_add_to_cache(sr->as_static_row());
-                push_mutation_fragment(std::move(*sr));
-            }
-            maybe_set_static_row_continuous();
+        return ensure_underlying(timeout).then([this, timeout] {
+            return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
+                if (sr) {
+                    assert(sr->is_static_row());
+                    maybe_add_to_cache(sr->as_static_row());
+                    push_mutation_fragment(std::move(*sr));
+                }
+                maybe_set_static_row_continuous();
+            });
        });
    }
 }

 inline
 void cache_flat_mutation_reader::touch_partition() {
-    if (_snp->at_latest_version()) {
-        rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
-        _snp->tracker()->touch(last_dummy);
-    }
+    _snp->touch(); // Replaces the inline at_latest_version() check and tracker()->touch() call with a single call on the snapshot.
 }

 inline
@@ -232,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
    });
 }

+inline
+future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) { // Sets _underlying to a reader whose schema is _schema; does nothing if it is already set.
+    if (_underlying) {
+        return make_ready_future<>();
+    }
+    return _read_context->ensure_underlying(timeout).then([this, timeout] { // NOTE(review): timeout is captured but not used inside this continuation.
+        flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
+        if (ctx_underlying.schema() != _schema) { // Schemas differ: wrap the context's reader and upgrade the wrapper to _schema.
+            _underlying_holder = make_delegating_reader(ctx_underlying);
+            _underlying_holder->upgrade_schema(_schema);
+            _underlying = &*_underlying_holder;
+        } else { // Same schema: use the context's reader directly.
+            _underlying = &ctx_underlying;
+        }
+    });
+}
+
 inline
 future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    if (_state == state::move_to_underlying) {
+        if (!_underlying) {
+            return ensure_underlying(timeout).then([this, timeout] {
+                return do_fill_buffer(timeout);
+            });
+        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
-        return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
+        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
            return read_from_underlying(timeout);
        });
    }
@@ -280,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin

 inline
 future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
-    return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+    return consume_mutation_fragments_until(*_underlying,
        [this] { return _state != state::reading_from_underlying || is_buffer_full(); },
        [this] (mutation_fragment mf) {
            _read_context->cache().on_row_miss();
--- a/canonical_mutation.cc
+++ b/canonical_mutation.cc
@@ -35,6 +35,7 @@
 #include "idl/uuid.dist.impl.hh"
 #include "idl/keys.dist.impl.hh"
 #include "idl/mutation.dist.impl.hh"
+#include <iostream>

 canonical_mutation::canonical_mutation(bytes data)
        : _data(std::move(data))
@@ -79,7 +80,8 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {

    if (version == m.schema()->version()) {
        auto partition_view = mutation_partition_view::from_view(mv.partition());
-        m.partition().apply(*m.schema(), partition_view, *m.schema());
+        mutation_application_stats app_stats;
+        m.partition().apply(*m.schema(), partition_view, *m.schema(), app_stats);
    } else {
        column_mapping cm = mv.mapping();
        converting_mutation_partition_applier v(cm, *m.schema(), m.partition());
@@ -88,3 +90,81 @@ mutation canonical_mutation::to_mutation(schema_ptr s) const {
    }
    return m;
 }
+
+static sstring bytes_to_text(bytes_view bv) {
+    sstring ret = uninitialized_string(bv.size());
+    std::copy_n(reinterpret_cast<const char*>(bv.data()), bv.size(), ret.data());
+    return ret;
+}
+
+std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm) {
+    auto in = ser::as_input_stream(cm._data);
+    auto mv = ser::deserialize(in, boost::type<ser::canonical_mutation_view>());
+    column_mapping mapping = mv.mapping();
+    auto partition_view = mutation_partition_view::from_view(mv.partition());
+    fmt::print(os, "{{canonical_mutation: ");
+    fmt::print(os, "table_id {} schema_version {} ", mv.table_id(), mv.schema_version());
+    fmt::print(os, "partition_key {} ", mv.key());
+    class printing_visitor : public mutation_partition_view_virtual_visitor {
+        std::ostream& _os;
+        const column_mapping& _cm;
+        bool _first = true;
+        bool _in_row = false;
+    private:
+        void print_separator() {
+            if (!_first) {
+                fmt::print(_os, ", ");
+            }
+            _first = false;
+        }
+    public:
+        printing_visitor(std::ostream& os, const column_mapping& cm) : _os(os), _cm(cm) {}
+        virtual void accept_partition_tombstone(tombstone t) override {
+            print_separator();
+            fmt::print(_os, "partition_tombstone {}", t);
+        }
+        virtual void accept_static_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_static_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.static_column_at(id);
+            fmt::print(_os, "static column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        virtual void accept_row_tombstone(range_tombstone rt) override {
+            print_separator();
+            fmt::print(_os, "row tombstone {}", rt);
+        }
+        virtual void accept_row(position_in_partition_view pipv, row_tombstone rt, row_marker rm, is_dummy, is_continuous) override {
+            if (_in_row) {
+                fmt::print(_os, "}}, ");
+            }
+            fmt::print(_os, "{{row {} tombstone {} marker {}", pipv, rt, rm);
+            _in_row = true;
+            _first = false;
+        }
+        virtual void accept_row_cell(column_id id, atomic_cell ac) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), atomic_cell::printer(*entry.type(), ac));
+        }
+        virtual void accept_row_cell(column_id id, collection_mutation_view cmv) override {
+            print_separator();
+            auto&& entry = _cm.regular_column_at(id);
+            fmt::print(_os, "column {} {}", bytes_to_text(entry.name()), collection_mutation_view::printer(*entry.type(), cmv));
+        }
+        void finalize() {
+            if (_in_row) {
+                fmt::print(_os, "}}");
+            }
+        }
+    };
+    printing_visitor pv(os, mapping);
+    partition_view.accept(mapping, pv);
+    pv.finalize();
+    fmt::print(os, "}}");
+    return os;
+}
+
--- a/canonical_mutation.hh
+++ b/canonical_mutation.hh
@@ -22,10 +22,11 @@
 #pragma once

 #include "bytes.hh"
-#include "schema.hh"
+#include "schema_fwd.hh"
 #include "database_fwd.hh"
 #include "mutation_partition_visitor.hh"
 #include "mutation_partition_serializer.hh"
+#include <iosfwd>

 // Immutable mutation form which can be read using any schema version of the same table.
 // Safe to access from other shards via const&.
@@ -52,4 +53,5 @@ public:

    const bytes& representation() const { return _data; }

+    friend std::ostream& operator<<(std::ostream& os, const canonical_mutation& cm);
 };
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -22,6 +22,9 @@

 #pragma once

+#include <vector>
+#include <sys/types.h>
+
 // Single-pass range over cartesian product of vectors.

 // Note:
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2020 ScyllaDB
+ */
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "serializer.hh"
+#include "db/extensions.hh"
+#include "cdc/cdc_options.hh"
+#include "schema.hh"
+
+namespace cdc {
+
+class cdc_extension : public schema_extension {
+    cdc::options _cdc_options;
+public:
+    static constexpr auto NAME = "cdc";
+
+    cdc_extension() = default;
+    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {}
+    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {}
+    explicit cdc_extension(const sstring& s) {
+        throw std::logic_error("Cannot create cdc info from string");
+    }
+    bytes serialize() const override {
+        return ser::serialize_to_buffer<bytes>(_cdc_options.to_map());
+    }
+    static std::map<sstring, sstring> deserialize(const bytes_view& buffer) {
+        return ser::deserialize_from_buffer(buffer, boost::type<std::map<sstring, sstring>>());
+    }
+    const options& get_options() const {
+        return _cdc_options;
+    }
+};
+
+}
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+#include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
+namespace cdc {
+
+class options final {
+    bool _enabled = false;
+    bool _preimage = false;
+    bool _postimage = false;
+    int _ttl = 86400; // 24h in seconds
+public:
+    options() = default;
+    options(const std::map<sstring, sstring>& map);
+
+    std::map<sstring, sstring> to_map() const;
+    sstring to_sstring() const;
+
+    bool enabled() const { return _enabled; }
+    bool preimage() const { return _preimage; }
+    bool postimage() const { return _postimage; }
+    int ttl() const { return _ttl; }
+
+    bool operator==(const options& o) const;
+    bool operator!=(const options& o) const;
+};
+
+} // namespace cdc
--- a/cdc/cdc_partitioner.cc
+++ b/cdc/cdc_partitioner.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cdc_partitioner.hh"
+#include "dht/token.hh"
+#include "schema.hh"
+#include "sstables/key.hh"
+#include "utils/class_registrator.hh"
+#include "cdc/generation.hh"
+#include "keys.hh"
+
+static const sstring cdc_partitioner_name = "com.scylladb.dht.CDCPartitioner";
+
+namespace cdc {
+
+const sstring cdc_partitioner::name() const {
+    return cdc_partitioner_name;
+}
+
+static dht::token to_token(int64_t value) {
+    return dht::token(dht::token::kind::key, value);
+}
+
+static dht::token to_token(bytes_view key) {
+    // Key should be 16 B long, of which first 8 B are used for token calculation
+    if (key.size() != 2*sizeof(int64_t)) {
+        return dht::minimum_token();
+    }
+    return to_token(stream_id::token_from_bytes(key));
+}
+
+dht::token
+cdc_partitioner::get_token(const sstables::key_view& key) const {
+    return to_token(bytes_view(key));
+}
+
+dht::token
+cdc_partitioner::get_token(const schema& s, partition_key_view key) const {
+    auto exploded_key = key.explode(s);
+    return to_token(exploded_key[0]);
+}
+
+using registry = class_registrator<dht::i_partitioner, cdc_partitioner>;
+static registry registrator(cdc_partitioner_name);
+static registry registrator_short_name("CDCPartitioner");
+
+}
--- a/cdc/cdc_partitioner.hh
+++ b/cdc/cdc_partitioner.hh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
+#include "dht/i_partitioner.hh"
+
+class schema;
+class partition_key_view;
+
+namespace sstables {
+
+class key_view;
+
+}
+
+namespace cdc {
+
+struct cdc_partitioner final : public dht::i_partitioner {
+    cdc_partitioner() = default;
+    virtual const sstring name() const override;
+    virtual dht::token get_token(const schema& s, partition_key_view key) const override;
+    virtual dht::token get_token(const sstables::key_view& key) const override;
+};
+
+
+}
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -0,0 +1,338 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/type.hpp>
+#include <random>
+#include <unordered_set>
+#include <seastar/core/sleep.hh>
+
+#include "keys.hh"
+#include "schema_builder.hh"
+#include "db/config.hh"
+#include "db/system_keyspace.hh"
+#include "db/system_distributed_keyspace.hh"
+#include "dht/token-sharding.hh"
+#include "locator/token_metadata.hh"
+#include "gms/application_state.hh"
+#include "gms/inet_address.hh"
+#include "gms/gossiper.hh"
+
+#include "cdc/generation.hh"
+
+extern logging::logger cdc_log;
+
+static int get_shard_count(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::SHARD_COUNT);
+    return ep_state ? std::stoi(ep_state->value) : -1;
+}
+
+static unsigned get_sharding_ignore_msb(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto ep_state = g.get_application_state_ptr(endpoint, gms::application_state::IGNORE_MSB_BITS);
+    return ep_state ? std::stoi(ep_state->value) : 0;
+}
+
+namespace cdc {
+
+extern const api::timestamp_clock::duration generation_leeway =
+    std::chrono::duration_cast<api::timestamp_clock::duration>(std::chrono::seconds(5));
+
+static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
+    i = net::hton(i);
+    std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset);
+}
+
+stream_id::stream_id(int64_t first, int64_t second)
+    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
+{
+    copy_int_to_bytes(first, 0, _value);
+    copy_int_to_bytes(second, sizeof(int64_t), _value);
+}
+
+stream_id::stream_id(bytes b) : _value(std::move(b)) { }
+
+bool stream_id::is_set() const {
+    return !_value.empty();
+}
+
+bool stream_id::operator==(const stream_id& o) const {
+    return _value == o._value;
+}
+
+bool stream_id::operator<(const stream_id& o) const {
+    return _value < o._value;
+}
+
+static int64_t bytes_to_int64(bytes_view b, size_t offset) {
+    assert(b.size() >= offset + sizeof(int64_t));
+    int64_t res;
+    std::copy_n(b.begin() + offset, sizeof(int64_t), reinterpret_cast<int8_t *>(&res));
+    return net::ntoh(res);
+}
+
+int64_t stream_id::first() const {
+    return token_from_bytes(_value);
+}
+
+int64_t stream_id::second() const {
+    return bytes_to_int64(_value, sizeof(int64_t));
+}
+
+int64_t stream_id::token_from_bytes(bytes_view b) {
+    return bytes_to_int64(b, 0);
+}
+
+const bytes& stream_id::to_bytes() const {
+    return _value;
+}
+
+partition_key stream_id::to_partition_key(const schema& log_schema) const {
+    return partition_key::from_single_value(log_schema, _value);
+}
+
+bool token_range_description::operator==(const token_range_description& o) const {
+    return token_range_end == o.token_range_end && streams == o.streams
+        && sharding_ignore_msb == o.sharding_ignore_msb;
+}
+
+topology_description::topology_description(std::vector<token_range_description> entries)
+    : _entries(std::move(entries)) {}
+
+bool topology_description::operator==(const topology_description& o) const {
+    return _entries == o._entries;
+}
+
+const std::vector<token_range_description>& topology_description::entries() const {
+    return _entries;
+}
+
+static stream_id create_stream_id(dht::token t) {
+    static thread_local std::mt19937_64 rand_gen(std::random_device().operator()());
+    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
+
+    return {dht::token::to_int64(t), rand_dist(rand_gen)};
+}
+
+class topology_description_generator final {
+    const db::config& _cfg;
+    const std::unordered_set<dht::token>& _bootstrap_tokens;
+    const locator::token_metadata& _token_metadata;
+    const gms::gossiper& _gossiper;
+
+    // Compute a set of tokens that split the token ring into vnodes
+    auto get_tokens() const {
+        auto tokens = _token_metadata.sorted_tokens();
+        auto it = tokens.insert(
+                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
+        std::sort(it, tokens.end());
+        std::inplace_merge(tokens.begin(), it, tokens.end());
+        tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
+        return tokens;
+    }
+
+    // Fetch sharding parameters for the node that owns the vnode ending with token `end`.
+    // Returns <shard_count, ignore_msb> pair.
+    std::pair<size_t, uint8_t> get_sharding_info(dht::token end) const {
+        if (_bootstrap_tokens.count(end) > 0) {
+            return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
+        } else {
+            auto endpoint = _token_metadata.get_endpoint(end);
+            if (!endpoint) {
+                throw std::runtime_error(
+                        format("Can't find endpoint for token {}", end));
+            }
+            auto sc = get_shard_count(*endpoint, _gossiper);
+            return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
+        }
+    }
+
+    token_range_description create_description(dht::token start, dht::token end) const {
+        token_range_description desc;
+
+        desc.token_range_end = end;
+
+        auto [shard_count, ignore_msb] = get_sharding_info(end);
+        desc.streams.reserve(shard_count);
+        desc.sharding_ignore_msb = ignore_msb;
+
+        dht::sharder sharder(shard_count, ignore_msb);
+        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+            desc.streams.push_back(create_stream_id(t));
+        }
+
+        return desc;
+    }
+public:
+    topology_description_generator(
+            const db::config& cfg,
+            const std::unordered_set<dht::token>& bootstrap_tokens,
+            const locator::token_metadata& token_metadata,
+            const gms::gossiper& gossiper)
+        : _cfg(cfg)
+        , _bootstrap_tokens(bootstrap_tokens)
+        , _token_metadata(token_metadata)
+        , _gossiper(gossiper)
+    {
+        if (_bootstrap_tokens.empty()) {
+            throw std::runtime_error(
+                    "cdc: bootstrap tokens is empty in generate_topology_description");
+        }
+    }
+
+    /*
+     * Generate a set of CDC stream identifiers such that for each shard
+     * and vnode pair there exists a stream whose token falls into this vnode
+     * and is owned by this shard. It is sometimes not possible to generate
+     * a CDC stream identifier for some (vnode, shard) pair because not all
+     * shards have to own tokens in a vnode. A small vnode can be entirely owned
+     * by a single shard. In such a case, a stream identifier that maps to
+     * the end of the vnode is generated.
+     *
+     * Then build a cdc::topology_description which maps tokens to generated
+     * stream identifiers, such that if token T is owned by shard S in vnode V,
+     * it gets mapped to the stream identifier generated for (S, V).
+     */
+    // Run in seastar::async context.
+    topology_description generate() const {
+        const auto tokens = get_tokens();
+
+        std::vector<token_range_description> vnode_descriptions;
+        vnode_descriptions.reserve(tokens.size());
+
+        vnode_descriptions.push_back(
+                create_description(tokens.back(), tokens.front()));
+        for (size_t idx = 1; idx < tokens.size(); ++idx) {
+            vnode_descriptions.push_back(
+                    create_description(tokens[idx - 1], tokens[idx]));
+        }
+
+        return {std::move(vnode_descriptions)};
+    }
+};
+
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
+    auto my_host_id = g.get_host_id(me);
+    auto& eps = g.get_endpoint_states();
+    return std::none_of(eps.begin(), eps.end(),
+            [&] (const std::pair<gms::inet_address, gms::endpoint_state>& ep) {
+        return my_host_id < g.get_host_id(ep.first);
+    });
+}
+
+future<db_clock::time_point> get_local_streams_timestamp() {
+    return db::system_keyspace::get_saved_cdc_streams_timestamp().then([] (std::optional<db_clock::time_point> ts) {
+        if (!ts) {
+            auto err = format("get_local_streams_timestamp: tried to retrieve streams timestamp after bootstrapping, but it's not present");
+            cdc_log.error("{}", err);
+            throw std::runtime_error(err);
+        }
+        return *ts;
+    });
+}
+
+// Run inside seastar::async context.
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing) {
+    assert(!bootstrap_tokens.empty());
+
+    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();
+
+    // Begin the race.
+    auto ts = db_clock::now() + (
+            for_testing ? std::chrono::milliseconds(0) : (
+                2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway)));
+    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();
+
+    return ts;
+}
+
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
+    auto streams_ts_string = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
+    cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, streams_ts_string);
+    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
+}
+
+// Run inside seastar::async context.
+static void do_update_streams_description(
+        db_clock::time_point streams_ts,
+        db::system_distributed_keyspace& sys_dist_ks,
+        db::system_distributed_keyspace::context ctx) {
+    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
+        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
+        return;
+    }
+
+    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.
+
+    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    if (!topo) {
+        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+    }
+
+    std::set<cdc::stream_id> streams_set;
+    for (auto& entry: topo->entries()) {
+        streams_set.insert(entry.streams.begin(), entry.streams.end());
+    }
+
+    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
+
+    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
+}
+
+void update_streams_description(
+        db_clock::time_point streams_ts,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    try {
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+    } catch(...) {
+        cdc_log.warn(
+            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
+            streams_ts, std::current_exception());
+
+        // It is safe to discard this future: we keep system distributed keyspace alive.
+        (void)seastar::async([
+            streams_ts, sys_dist_ks, get_num_token_owners = std::move(get_num_token_owners), &abort_src
+        ] {
+            while (true) {
+                sleep_abortable(std::chrono::seconds(60), abort_src).get();
+                try {
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    return;
+                } catch (...) {
+                    cdc_log.warn(
+                        "Could not update CDC description table with generation {}: {}. Will try again.",
+                        streams_ts, std::current_exception());
+                }
+            }
+        });
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* This module contains classes and functions used to manage CDC generations:
+ * sets of CDC stream identifiers used by the cluster to choose partition keys for CDC log writes.
+ * Each CDC generation begins operating at a specific time point, called the generation's timestamp
+ * (`cdc_streams_timestamp` or `streams_timestamp` in the code).
+ * The generation is used by all nodes in the cluster to pick CDC streams until superseded by a new generation.
+ *
+ * Functions from this module are used by the node joining procedure to introduce new CDC generations to the cluster
+ * (which is necessary due to new tokens being inserted into the token ring), or during rolling upgrade
+ * if CDC is enabled for the first time.
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_set>
+#include <seastar/util/noncopyable_function.hh>
+
+#include "database_fwd.hh"
+#include "db_clock.hh"
+#include "dht/token.hh"
+
+namespace seastar {
+    class abort_source;
+} // namespace seastar
+
+namespace db {
+    class config;
+    class system_distributed_keyspace;
+} // namespace db
+
+namespace gms {
+    class inet_address;
+    class gossiper;
+} // namespace gms
+
+namespace locator {
+    class token_metadata;
+} // namespace locator
+
+namespace cdc {
+
+class stream_id final {
+    bytes _value;
+public:
+    stream_id() = default;
+    stream_id(int64_t, int64_t);
+    stream_id(bytes);
+    bool is_set() const;
+    bool operator==(const stream_id&) const;
+    bool operator<(const stream_id&) const;
+
+    int64_t first() const;
+    int64_t second() const;
+
+    const bytes& to_bytes() const;
+
+    partition_key to_partition_key(const schema& log_schema) const;
+    static int64_t token_from_bytes(bytes_view);
+};
+
+/* Describes a mapping of tokens to CDC streams in a token range.
+ *
+ * The range ends with `token_range_end`. A vector of `token_range_description`s defines the ranges entirely
+ * (the end of the `i`th range is the beginning of the `(i+1) % size()`th range). Ranges are left-open, right-closed.
+ *
+ * Tokens in the range ending with `token_range_end` are mapped to streams in the `streams` vector as follows:
+ * token `T` is mapped to `streams[j]` if and only if the used partitioner maps `T` to the `j`th shard,
+ * assuming that the partitioner is configured for `streams.size()` shards and (partitioner's) `sharding_ignore_msb`
+ * is equal to the given `sharding_ignore_msb`.
+*/
+struct token_range_description {
+    dht::token token_range_end;
+    std::vector<stream_id> streams;
+    uint8_t sharding_ignore_msb;
+
+    bool operator==(const token_range_description&) const;
+};
+
+
+/* Describes a mapping of tokens to CDC streams in a whole token ring.
+ *
+ * Division of the ring to token ranges is defined in terms of `token_range_end`s
+ * in the `_entries` vector. See the comment above `token_range_description` for explanation.
+ */
+class topology_description {
+    std::vector<token_range_description> _entries;
+public:
+    topology_description(std::vector<token_range_description> entries);
+    bool operator==(const topology_description&) const;
+
+    const std::vector<token_range_description>& entries() const;
+};
+
+/* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
+ * which means that we're probably upgrading from a non-CDC/old CDC version (another reason could be
+ * that there's a bug, or the user messed with our local tables).
+ *
+ * It checks whether we should be the node to propose the first generation of CDC streams.
+ * The chosen condition is arbitrary, it only tries to make sure that no two nodes propose a generation of streams
+ * when upgrading, and nothing bad happens if they for some reason do (it's mostly an optimization).
+ */
+bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper&);
+
+/*
+ * Read this node's streams generation timestamp stored in the LOCAL table.
+ * Assumes that the node has successfully bootstrapped, and we're not upgrading from a non-CDC version,
+ * so the timestamp is present.
+ */
+future<db_clock::time_point> get_local_streams_timestamp();
+
+/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
+ * Returns the timestamp of this new generation.
+ *
+ * Should be called when starting the node for the first time (i.e., joining the ring).
+ *
+ * Assumes that the system_distributed keyspace is initialized.
+ *
+ * The caller of this function is expected to insert this timestamp into the gossiper as fast as possible,
+ * so that other nodes learn about the generation before their clocks cross the timestamp
+ * (not guaranteed in the current implementation, but expected to be the common case;
+ *  we assume that `ring_delay` is enough for other nodes to learn about the new generation).
+ */
+db_clock::time_point make_new_cdc_generation(
+        const db::config& cfg,
+        const std::unordered_set<dht::token>& bootstrap_tokens,
+        const locator::token_metadata& tm,
+        const gms::gossiper& g,
+        db::system_distributed_keyspace& sys_dist_ks,
+        std::chrono::milliseconds ring_delay,
+        bool for_testing);
+
+/* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
+ * We might be in the middle of a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
+ * but if the cluster already supports CDC, then every newly joining node will propose a new CDC generation,
+ * which means it will gossip the generation's timestamp.
+ */
+std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
+
+/* Inform CDC users about a generation of streams (identified by the given timestamp)
+ * by inserting it into the cdc_streams table.
+ *
+ * Assumes that the cdc_generations table contains this generation.
+ *
+ * Returning from this function does not mean that the table update was successful: the function
+ * might run an asynchronous task in the background.
+ *
+ * Run inside seastar::async context.
+ */
+void update_streams_description(
+        db_clock::time_point,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
+} // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This module manages CDC log tables. It contains facilities used to:
+ * - perform schema changes to CDC log tables correspondingly when base tables are changed,
+ * - perform writes to CDC log tables correspondingly when writes to base tables are made.
+ */
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+
+#include "exceptions/exceptions.hh"
+#include "timestamp.hh"
+#include "tracing/trace_state.hh"
+#include "cdc_options.hh"
+#include "utils/UUID.hh"
+
+class schema;
+using schema_ptr = seastar::lw_shared_ptr<const schema>;
+
+namespace locator {
+
+class token_metadata;
+
+} // namespace locator
+
+namespace service {
+
+class migration_notifier;
+class storage_proxy;
+class query_state;
+
+} // namespace service
+
+class mutation;
+class partition_key;
+
+namespace cdc {
+
+struct operation_result_tracker;
+class db_context;
+class metadata;
+
+/// \brief CDC service, responsible for schema listeners
+///
+/// The CDC service listens for schema changes and, iff CDC is enabled/changed,
+/// creates/modifies/deletes the corresponding log tables etc. as part of the schema change.
+///
+class cdc_service {
+    class impl; // only forward-declared in this header; the service holds it through `_impl`
+    std::unique_ptr<impl> _impl;
+public:
+    future<> stop();
+    cdc_service(service::storage_proxy&);
+    cdc_service(db_context);
+    ~cdc_service();
+
+    // If any of the mutations are cdc enabled, optionally selects preimage, and adds the
+    // appropriate augments to set the log entries. Resolves to a tuple of mutations and an
+    // operation_result_tracker pointer. NOTE(review): this comment used to promise a non-empty
+    // callback, to be invoked post the mutation query, iff post-image is enabled - confirm vs. the tracker.
+    future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
+        lowres_clock::time_point timeout,
+        std::vector<mutation>&& mutations,
+        tracing::trace_state_ptr tr_state,
+        db::consistency_level write_cl
+        );
+    bool needs_cdc_augmentation(const std::vector<mutation>&) const; // presumably: does any mutation need augment_mutation_call? - confirm in log.cc
+};
+
+struct db_context final { // bundle of references to the services handed to cdc_service(db_context)
+    service::storage_proxy& _proxy;
+    service::migration_notifier& _migration_notifier;
+    locator::token_metadata& _token_metadata;
+    cdc::metadata& _cdc_metadata;
+
+    class builder final { // assembles a db_context; only the storage_proxy is required up front
+        service::storage_proxy& _proxy;
+        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier; // presumably set by with_migration_notifier()
+        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata; // presumably set by with_token_metadata()
+        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata; // presumably set by with_cdc_metadata()
+    public:
+        builder(service::storage_proxy& proxy);
+
+        builder& with_migration_notifier(service::migration_notifier& migration_notifier); // the with_* setters return builder&, so calls can be chained
+        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_cdc_metadata(cdc::metadata&);
+
+        db_context build(); // NOTE(review): what is used for references that were never supplied is decided in the definition - confirm there
+    };
+};
+
+// cdc log table operation
+enum class operation : int8_t {
+    // note: these values will eventually be read by a third party, probably not privy to this
+    // enum decl, so don't change the constant values (or the datatype).
+    pre_image = 0, update = 1, insert = 2, row_delete = 3, partition_delete = 4,
+    range_delete_start_inclusive = 5, range_delete_start_exclusive = 6, range_delete_end_inclusive = 7, range_delete_end_exclusive = 8,
+    post_image = 9,
+};
+
+bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
+seastar::sstring log_name(const seastar::sstring& table_name);
+seastar::sstring log_data_column_name(std::string_view column_name);
+seastar::sstring log_meta_column_name(std::string_view column_name);
+bytes log_data_column_name_bytes(const bytes& column_name);
+bytes log_meta_column_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_name(std::string_view column_name);
+bytes log_data_column_deleted_name_bytes(const bytes& column_name);
+
+seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
+bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);
+
+utils::UUID generate_timeuuid(api::timestamp_type t);
+
+} // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dht/token-sharding.hh"
+#include "utils/exceptions.hh"
+#include "exceptions/exceptions.hh"
+
+#include "cdc/generation.hh"
+#include "cdc/metadata.hh"
+
+extern logging::logger cdc_log;
+
+namespace cdc {
+    extern const api::timestamp_clock::duration generation_leeway;
+} // namespace cdc
+
+static api::timestamp_type to_ts(db_clock::time_point tp) { // valid only because timestamp_clock and db_clock are assumed to share an epoch
+    const auto since_epoch = tp.time_since_epoch();
+    return std::chrono::duration_cast<api::timestamp_clock::duration>(since_epoch).count();
+}
+
+static cdc::stream_id get_stream(
+        const cdc::token_range_description& entry,
+        dht::token tok) {
+    // Streams are indexed by shard: the i-th stream is the stream for the i-th shard.
+    const auto& streams = entry.streams;
+    const auto idx = dht::shard_of(streams.size(), entry.sharding_ignore_msb, tok);
+    // shard_of must yield an index into `streams`; anything else is reported as an internal error.
+    if (!(idx < streams.size())) {
+        on_internal_error(cdc_log, "get_stream: shard_id out of bounds");
+    }
+
+    return streams[idx];
+}
+
+static cdc::stream_id get_stream(
+        const std::vector<cdc::token_range_description>& entries,
+        dht::token tok) {
+    if (entries.empty()) {
+        on_internal_error(cdc_log, "get_stream: entries empty");
+    }
+    // Binary-search for the first range whose end is not below `tok`
+    // (std::lower_bound requires `entries` to be ordered by token_range_end).
+    const auto ends_before = [] (const cdc::token_range_description& e, dht::token t) { return e.token_range_end < t; };
+    const auto found = std::lower_bound(entries.begin(), entries.end(), tok, ends_before);
+    // A token above every range end wraps around to the first range.
+    const auto& chosen = (found == entries.end()) ? entries.front() : *found;
+
+    return get_stream(chosen, tok);
+}
+
+cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::timestamp_type ts) const {
+    // Yields the generation with the greatest timestamp <= `ts`, or end() when there is none.
+    const auto first_later = _gens.upper_bound(ts);
+    if (first_later != _gens.begin()) {
+        // The entry right before the first strictly-later generation is the one in use at `ts`.
+        return std::prev(first_later);
+    }
+    // All known generations have higher timestamps than `ts`.
+    return _gens.end();
+}
+
+cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
+    // Picks the stream for `tok` from the generation operating at write timestamp `ts`;
+    // rejects timestamps that lie too far in the future or before the generation in use now.
+    const auto now = api::new_timestamp();
+    if (ts > now + generation_leeway.count()) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream \"from the future\" ({}; current server time: {})."
+                " With CDC you cannot send writes with timestamps arbitrarily into the future, because we don't"
+                " know what streams will be used at that time.\n"
+                "We *do* allow sending writes into the near future, but our ability to do that is limited."
+                " If you really must use your own timestamps, then make sure your clocks are well-synchronized"
+               "  with the database's clocks.", format_timestamp(ts), format_timestamp(now)));
+        // Note that we might still send a write to a wrong generation, if we learn about the current
+        // generation too late (we might think that an earlier generation is the current one).
+        // Nothing protects us from that until we start using transactions for generation switching.
+    }
+
+    auto gen_it = gen_used_at(now);
+    if (gen_it == _gens.end()) {
+        throw std::runtime_error(format(
+                "cdc::metadata::get_stream: could not find any CDC stream (current time: {})."
+                " Are we in the middle of a cluster upgrade?", format_timestamp(now)));
+    }
+
+    // Garbage-collect generations that will no longer be used.
+    gen_it = _gens.erase(_gens.begin(), gen_it);
+
+    if (gen_it->first > ts) {
+        throw exceptions::invalid_request_exception(format(
+                "cdc: attempted to get a stream from an earlier generation than the currently used one."
+                " With CDC you cannot send writes with timestamps too far into the past, because that would break"
+                " consistency properties (write timestamp: {}, current generation started at: {})",
+                format_timestamp(ts), format_timestamp(gen_it->first)));
+    }
+
+    // With `generation_leeway` we allow sending writes to the near future. It might happen
+    // that `ts` doesn't belong to the current generation ("current" according to our clock),
+    // but to the next generation. Adjust for this case:
+    for (auto succ = std::next(gen_it); succ != _gens.end() && succ->first <= ts; ++succ) {
+        gen_it = succ;
+    }
+    // Note: if there is a next generation that `ts` belongs to, but we don't know about it,
+    // then too bad. This is no different from the situation in which we didn't manage to learn
+    // about the current generation in time. We won't be able to prevent it until we introduce transactions.
+
+    if (!gen_it->second) {
+        throw std::runtime_error(format(
+                "cdc: attempted to get a stream from a generation that we know about, but weren't able to retrieve"
+                " (generation timestamp: {}, write timestamp: {}). Make sure that the replicas which contain"
+                " this generation's data are alive and reachable from this node.", format_timestamp(gen_it->first), format_timestamp(ts)));
+    }
+
+    // The generation's data is present: map the token onto one of its streams.
+    auto stream = ::get_stream(gen_it->second->entries(), tok);
+    // Only a successful lookup updates the remembered timestamp (it is read back by prepare()).
+    _last_stream_timestamp = ts;
+    return stream;
+}
+
+bool cdc::metadata::known_or_obsolete(db_clock::time_point tp) const {
+    const auto ts = to_ts(tp);
+    auto candidate = _gens.lower_bound(ts);
+    if (candidate == _gens.end()) {
+        // No known generations with timestamp >= ts.
+        return false;
+    }
+
+    if (candidate->first == ts) {
+        if (candidate->second.has_value()) {
+            // We already inserted this particular generation.
+            return true;
+        }
+        // The entry exists but holds no data yet: judge by the generation that follows it.
+        ++candidate;
+    }
+    // Obsolete iff some later generation exists and its timestamp has already been reached.
+    const bool has_later = candidate != _gens.end();
+    return has_later && candidate->first <= api::new_timestamp();
+}
+
+bool cdc::metadata::insert(db_clock::time_point tp, topology_description&& gen) {
+    if (known_or_obsolete(tp)) { // already inserted, or superseded by a newer generation
+        return false;
+    }
+
+    const auto now = api::new_timestamp();
+    const auto current = gen_used_at(now);
+    if (current != _gens.end()) {
+        // Garbage-collect generations that will no longer be used: everything that precedes
+        // the generation in use at `now`. erase()'s return value is not needed afterwards.
+        _gens.erase(_gens.begin(), current);
+    }
+
+    // Overwrites an entry created earlier by prepare() (which stores std::nullopt), if any.
+    _gens.insert_or_assign(to_ts(tp), std::move(gen));
+    return true;
+}
+
+bool cdc::metadata::prepare(db_clock::time_point tp) {
+    if (known_or_obsolete(tp)) {
+        return false;
+    }
+
+    const auto ts = to_ts(tp);
+    // A std::nullopt entry marks the generation as known, but with its description not yet inserted.
+    const bool emplaced = _gens.emplace(ts, std::nullopt).second;
+    if (emplaced && _last_stream_timestamp != api::missing_timestamp) {
+        const auto last_correct_gen = gen_used_at(_last_stream_timestamp); // generation the last served write should have used
+        if (last_correct_gen != _gens.end() && last_correct_gen->first == ts) {
+            cdc_log.error(
+                "just learned about a CDC generation newer than the one used the last time"
+                " streams were retrieved. This generation, or some newer one, should have"
+                " been used instead (new generation's timestamp: {}, last time streams were retrieved: {})."
+                " The new generation probably arrived too late due to a network partition"
+                " and we've made a write using the wrong set of streams.",
+                format_timestamp(ts), format_timestamp(_last_stream_timestamp));
+        }
+    }
+
+    return emplaced;
+}
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <map>
+
+#include "db_clock.hh"
+#include "timestamp.hh"
+
+namespace dht {
+    class token;
+}
+
+namespace cdc {
+
+class stream_id;
+class topology_description;
+
+/* Represents the node's knowledge about CDC generations used in the cluster.
+ * Used during writes to pick the streams to which CDC log writes should be sent
+ * (i.e., to pick partition keys for these writes).
+ */
+class metadata final {
+    // Note: we use db_clock (1ms resolution) for generation timestamps
+    // (because we need to insert them into tables using columns of timestamp types,
+    //  and the native type of our columns' timestamp_type is db_clock::time_point).
+    // On the other hand, timestamp_clock (1us resolution) is used for mutation timestamps,
+    // and api::timestamp_type represents the number of ticks of a timestamp_clock::time_point since epoch.
+
+    using container_t = std::map<api::timestamp_type, std::optional<topology_description>>;
+    container_t _gens; // keyed by generation timestamp; std::nullopt = prepared, description not inserted yet
+
+    /* The timestamp used in the last successful `get_stream` call. */
+    api::timestamp_type _last_stream_timestamp = api::missing_timestamp;
+
+    container_t::const_iterator gen_used_at(api::timestamp_type ts) const; // generation with the greatest timestamp <= ts, or _gens.end()
+public:
+    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
+    bool known_or_obsolete(db_clock::time_point) const;
+
+    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
+     * according to the generation used at time `ts` (i.e, the latest generation whose timestamp is less or equal to `ts`).
+     *
+     * If the provided timestamp is too far away "into the future" (where "now" is defined according to our local clock),
+     * we reject the get_stream query. This is because the resulting stream might belong to a generation which we don't
+     * yet know about. The amount of leeway (how much "into the future" we allow `ts` to be) is defined
+     * by the `cdc::generation_leeway` constant.
+     */
+    stream_id get_stream(api::timestamp_type ts, dht::token tok);
+
+    /* Insert the generation given by `gen` with timestamp `ts` to be used by the `get_stream` function,
+     * unless the generation is already known or has been superseded by a newer one (see `known_or_obsolete`).
+     *
+     * Returns true if the generation was inserted,
+     * meaning that `get_stream` might return a stream from this generation (at some time points).
+     */
+    bool insert(db_clock::time_point ts, topology_description&& gen);
+
+    /* Prepare for inserting a new generation whose timestamp is `ts`.
+     * This method is not required to be called before `insert`, but it's here
+     * to increase safety of `get_stream` calls in some situations. Use it if you:
+     * 1. know that there is a new generation, but
+     * 2. you didn't yet retrieve the generation's topology_description.
+     *
+     * After preparing a generation, if `get_stream` is supposed to return a stream from this generation
+     * but we don't yet have the generation's data, it will reject the query to maintain consistency of streams.
+     *
+     * Returns true iff this generation is not obsolete and wasn't previously prepared nor inserted.
+     */
+    bool prepare(db_clock::time_point ts);
+};
+
+} // namespace cdc
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mutation.hh"
+#include "schema.hh"
+
+#include "split.hh"
+#include "log.hh"
+
+struct atomic_column_update { // one cell written to an atomic column
+    column_id id;
+    atomic_cell cell;
+};
+
+// see the comment inside `clustered_row_insert` for motivation for separating
+// nonatomic deletions from nonatomic updates
+struct nonatomic_column_deletion {
+    column_id id;
+    tombstone t; // the tombstone of the non-atomic column's collection mutation
+};
+
+struct nonatomic_column_update {
+    column_id id;
+    utils::chunked_vector<std::pair<bytes, atomic_cell>> cells; // (key, cell) pairs taken from the materialized collection mutation
+};
+
+struct static_row_update { // the static-row changes sharing one (timestamp, ttl)
+    gc_clock::duration ttl; // zero means "no ttl"
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_insert { // the part of a clustered-row change that matches the row's live marker
+    gc_clock::duration ttl; // zero means "no ttl"
+    clustering_key key;
+    row_marker marker;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    // INSERTs can't express updates of individual cells inside a non-atomic column
+    // (without deleting the entire field first), so there is no `nonatomic_updates` field:
+    // overwriting a nonatomic column inside an INSERT will be split into two changes,
+    // one with a nonatomic deletion, and one with a nonatomic update
+};
+
+struct clustered_row_update { // clustered-row changes sharing one (timestamp, ttl) that are not part of an INSERT
+    gc_clock::duration ttl; // zero means "no ttl"
+    clustering_key key;
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+struct clustered_row_deletion { // a row tombstone applied to the row identified by `key`
+    clustering_key key;
+    tombstone t;
+};
+
+struct clustered_range_deletion { // one range tombstone of the partition
+    range_tombstone rt;
+};
+
+struct partition_deletion { // the partition tombstone
+    tombstone t;
+};
+
+struct batch { // every change of one mutation that carries the same timestamp (see `set_of_changes`)
+    std::vector<static_row_update> static_updates;
+    std::vector<clustered_row_insert> clustered_inserts;
+    std::vector<clustered_row_update> clustered_updates;
+    std::vector<clustered_row_deletion> clustered_row_deletions;
+    std::vector<clustered_range_deletion> clustered_range_deletions;
+    std::optional<partition_deletion> partition_deletions; // at most one per timestamp
+};
+
+using set_of_changes = std::map<api::timestamp_type, batch>; // timestamp -> changes made at that timestamp, in ascending timestamp order
+
+struct row_update { // per-(timestamp, ttl) intermediate result of extract_row_updates
+    std::vector<atomic_column_update> atomic_entries;
+    std::vector<nonatomic_column_deletion> nonatomic_deletions;
+    std::vector<nonatomic_column_update> nonatomic_updates;
+};
+
+static
+std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
+extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
+    // Groups the contents of `r` by (timestamp, ttl); a zero ttl stands for "no ttl".
+    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> grouped;
+    auto ttl_or_zero = [] (auto&& c) { return c.is_live_and_has_ttl() ? c.ttl() : gc_clock::duration(0); };
+
+    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        auto& cdef = schema.column_at(ckind, id);
+        if (!cdef.is_atomic()) {
+            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+                auto desc = mview.materialize(*cdef.type);
+                for (auto& [elem_key, elem_cell]: desc.cells) {
+                    auto group_key = std::pair(elem_cell.timestamp(), ttl_or_zero(elem_cell));
+                    auto& col_updates = grouped[group_key].nonatomic_updates;
+                    // Reuse the trailing entry when it already belongs to this column, otherwise open a new one.
+                    if (col_updates.empty() || col_updates.back().id != id) {
+                        col_updates.push_back({id, {}});
+                    }
+                    col_updates.back().cells.push_back({std::move(elem_key), std::move(elem_cell)});
+                }
+
+                // A collection tombstone becomes a deletion keyed by its own timestamp, with no ttl.
+                if (desc.tomb) {
+                    auto group_key = std::pair(desc.tomb.timestamp, gc_clock::duration(0));
+                    grouped[group_key].nonatomic_deletions.push_back({id, desc.tomb});
+                }
+            });
+            return;
+        }
+
+        // Atomic column: the cell is copied into the group for its (timestamp, ttl).
+        auto view = cell.as_atomic_cell(cdef);
+        auto group_key = std::pair(view.timestamp(), ttl_or_zero(view));
+        grouped[group_key].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
+    });
+    return grouped;
+};
+
+set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
+    // Buckets every elementary change found in `base_mutation` by its timestamp.
+    set_of_changes changes;
+    const auto& part = base_mutation.partition();
+
+    // Static row: one static_row_update per distinct (timestamp, ttl).
+    auto static_updates = extract_row_updates(part.static_row().get(), column_kind::static_column, base_schema);
+    for (auto& [ts_and_ttl, upd]: static_updates) {
+        changes[ts_and_ttl.first].static_updates.push_back({
+                ts_and_ttl.second,
+                std::move(upd.atomic_entries),
+                std::move(upd.nonatomic_deletions),
+                std::move(upd.nonatomic_updates)
+            });
+    }
+
+    // Clustered rows: cell groups become INSERTs or UPDATEs, row tombstones become row deletions.
+    for (const rows_entry& entry : part.clustered_rows()) {
+        auto row_updates = extract_row_updates(entry.row().cells(), column_kind::regular_column, base_schema);
+
+        const auto& marker = entry.row().marker();
+        const bool marker_live = marker.is_live();
+        const auto marker_timestamp = marker.timestamp();
+        const auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
+        if (marker_live) {
+            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map,
+            // so that the INSERT is emitted below even when no cell shares them
+            (void)row_updates[std::pair(marker_timestamp, marker_ttl)];
+        }
+
+        for (auto& [ts_and_ttl, upd]: row_updates) {
+            const auto timestamp = ts_and_ttl.first;
+            const auto ttl = ts_and_ttl.second;
+            // Only the group matching a live row marker is an INSERT; every other group is an UPDATE.
+            const bool as_insert = marker_live && timestamp == marker_timestamp && ttl == marker_ttl;
+            if (!as_insert) {
+                changes[timestamp].clustered_updates.push_back({
+                        ttl,
+                        entry.key(),
+                        std::move(upd.atomic_entries),
+                        std::move(upd.nonatomic_deletions),
+                        std::move(upd.nonatomic_updates)
+                    });
+                continue;
+            }
+
+            changes[timestamp].clustered_inserts.push_back({
+                    ttl,
+                    entry.key(),
+                    marker,
+                    std::move(upd.atomic_entries),
+                    std::move(upd.nonatomic_deletions)
+                });
+            if (!upd.nonatomic_updates.empty()) {
+                // nonatomic updates cannot be expressed with an INSERT.
+                changes[timestamp].clustered_updates.push_back({
+                        ttl,
+                        entry.key(),
+                        {},
+                        {},
+                        std::move(upd.nonatomic_updates)
+                    });
+            }
+        }
+
+        auto row_tomb = entry.row().deleted_at().regular();
+        if (row_tomb) {
+            changes[row_tomb.timestamp].clustered_row_deletions.push_back({entry.key(), row_tomb});
+        }
+    }
+
+    for (const auto& rt: part.row_tombstones()) {
+        if (rt.tomb.timestamp != api::missing_timestamp) {
+            changes[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
+        }
+    }
+
+    auto partition_tomb_timestamp = part.partition_tombstone().timestamp;
+    if (partition_tomb_timestamp != api::missing_timestamp) {
+        changes[partition_tomb_timestamp].partition_deletions = {part.partition_tombstone()};
+    }
+
+    return changes;
+}
+
+namespace cdc {
+
+bool should_split(const mutation& base_mutation, const schema& base_schema) {
+    // Returns true when the mutation cannot be logged as one change: its parts differ in timestamp
+    // or ttl, or it mixes kinds of changes (cells, row/range/partition deletions).
+    auto& p = base_mutation.partition();
+    api::timestamp_type found_ts = api::missing_timestamp;
+    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+
+    // Records the first timestamp/ttl seen; returns true as soon as a later one disagrees.
+    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
+        if (found_ts != api::missing_timestamp && found_ts != ts) {
+            return true;
+        }
+        found_ts = ts;
+
+        if (found_ttl && *found_ttl != ttl) {
+            return true;
+        }
+        found_ttl = ttl;
+
+        return false;
+    };
+
+    bool had_static_row = false;
+    bool should_split = false;
+    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+        had_static_row = true;
+
+        auto& cdef = base_schema.column_at(column_kind::static_column, id);
+        if (cdef.is_atomic()) {
+            auto view = cell.as_atomic_cell(cdef);
+            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                should_split = true;
+            }
+            return;
+        }
+
+        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+            auto desc = mview.materialize(*cdef.type);
+            for (auto& [k, v]: desc.cells) {
+                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+
+            if (desc.tomb) {
+                if (check_or_set(desc.tomb.timestamp, gc_clock::duration(0))) {
+                    should_split = true;
+                    return;
+                }
+            }
+        });
+    });
+
+    if (should_split) {
+        return true;
+    }
+
+    bool had_clustered_row = false;
+    if (!p.clustered_rows().empty() && had_static_row) { // static-row and clustered-row changes together always require a split
+        return true;
+    }
+    for (const rows_entry& cr : p.clustered_rows()) {
+        had_clustered_row = true;
+
+        const auto& marker = cr.row().marker();
+        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
+            return true;
+        }
+
+        bool is_insert = marker.is_live(); // a live row marker makes this row's change an INSERT
+
+        bool had_cells = false;
+        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+            had_cells = true;
+
+            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
+            if (cdef.is_atomic()) {
+                auto view = cell.as_atomic_cell(cdef);
+                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
+                    should_split = true;
+                }
+                return;
+            }
+
+            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
+                for (auto& [k, v]: mview.cells) {
+                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+
+                    if (is_insert) {
+                        // nonatomic updates cannot be expressed with an INSERT.
+                        should_split = true;
+                        return;
+                    }
+                }
+
+                if (mview.tomb) {
+                    if (check_or_set(mview.tomb.timestamp, gc_clock::duration(0))) {
+                        should_split = true;
+                        return;
+                    }
+                }
+            });
+        });
+
+        if (should_split) {
+            return true;
+        }
+
+        auto row_tomb = cr.row().deleted_at().regular();
+        if (row_tomb) {
+            // A row tombstone next to cells requires a split. A recorded ttl means a live marker
+            // or cells were already seen (in this row or an earlier one); this used to be an
+            // assert(!found_ttl), which such mutations could trip, so request a split instead.
+            if (had_cells || found_ttl) {
+                return true;
+            }
+            if (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp) {
+                return true;
+            }
+
+            found_ts = row_tomb.timestamp;
+        }
+    }
+
+    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) { // range deletions mixed with row changes
+        return true;
+    }
+
+    for (const auto& rt: p.row_tombstones()) {
+        if (rt.tomb) {
+            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
+                return true;
+            }
+
+            found_ts = rt.tomb.timestamp;
+        }
+    }
+
+    if (p.partition_tombstone().timestamp != api::missing_timestamp
+            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
+        return true;
+    }
+
+    // A mutation with no timestamp will be split into 0 mutations
+    return found_ts == api::missing_timestamp;
+}
+
+// Breaks `base_mutation` apart into smaller mutations and passes each one to `f`.
+//
+// The pieces come from extract_changes(), which yields (timestamp, batch)
+// pairs. Within one batch the mutations are emitted in a fixed order:
+// static row updates, clustered row inserts, clustered row updates,
+// clustered row deletions, clustered range deletions and finally the
+// partition deletion (when the batch has one). Every emitted mutation is
+// built on `base_schema` and carries the partition key of `base_mutation`.
+//
+// `f` is called with: the emitted mutation, the batch timestamp, the
+// timeuuid generated from that timestamp (decomposed through timeuuid_type),
+// and a reference to an int counter that is reset to 0 for every timestamp.
+// The counter is never modified here; it is only handed to `f` by reference.
+void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
+        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
+    const auto& s = *base_schema;
+
+    // Column definition lookups for the two column kinds handled below.
+    const auto static_column = [&base_schema] (auto id) -> decltype(auto) {
+        return base_schema->column_at(column_kind::static_column, id);
+    };
+    const auto regular_column = [&base_schema] (auto id) -> decltype(auto) {
+        return base_schema->column_at(column_kind::regular_column, id);
+    };
+
+    auto changes = extract_changes(base_mutation, s);
+    auto base_key = base_mutation.key();
+
+    for (auto& [ts, batch] : changes) {
+        auto time_id = timeuuid_type->decompose(generate_timeuuid(ts));
+        int batch_no = 0;
+
+        // Static row: atomic cells first, then collection tombstones, then collection cells.
+        for (auto& update : batch.static_updates) {
+            mutation out(base_schema, base_key);
+            for (auto& entry : update.atomic_entries) {
+                out.set_static_cell(static_column(entry.id), std::move(entry.cell));
+            }
+            for (auto& deletion : update.nonatomic_deletions) {
+                auto& col = static_column(deletion.id);
+                out.set_static_cell(col, collection_mutation_description{deletion.t, {}}.serialize(*col.type));
+            }
+            for (auto& addition : update.nonatomic_updates) {
+                auto& col = static_column(addition.id);
+                out.set_static_cell(col, collection_mutation_description{{}, std::move(addition.cells)}.serialize(*col.type));
+            }
+            f(std::move(out), ts, time_id, batch_no);
+        }
+
+        // Clustered row inserts: cells, collection tombstones, and the row marker last.
+        for (auto& insert : batch.clustered_inserts) {
+            mutation out(base_schema, base_key);
+            auto& target = out.partition().clustered_row(s, insert.key);
+
+            for (auto& entry : insert.atomic_entries) {
+                target.cells().apply(regular_column(entry.id), std::move(entry.cell));
+            }
+            for (auto& deletion : insert.nonatomic_deletions) {
+                auto& col = regular_column(deletion.id);
+                target.cells().apply(col, collection_mutation_description{deletion.t, {}}.serialize(*col.type));
+            }
+            target.apply(insert.marker);
+
+            f(std::move(out), ts, time_id, batch_no);
+        }
+
+        // Clustered row updates: same cell handling as above, but no row marker.
+        for (auto& update : batch.clustered_updates) {
+            mutation out(base_schema, base_key);
+            auto& cells = out.partition().clustered_row(s, update.key).cells();
+
+            for (auto& entry : update.atomic_entries) {
+                cells.apply(regular_column(entry.id), std::move(entry.cell));
+            }
+            for (auto& deletion : update.nonatomic_deletions) {
+                auto& col = regular_column(deletion.id);
+                cells.apply(col, collection_mutation_description{deletion.t, {}}.serialize(*col.type));
+            }
+            for (auto& addition : update.nonatomic_updates) {
+                auto& col = regular_column(addition.id);
+                cells.apply(col, collection_mutation_description{{}, std::move(addition.cells)}.serialize(*col.type));
+            }
+
+            f(std::move(out), ts, time_id, batch_no);
+        }
+
+        // One mutation per deleted clustered row.
+        for (auto& row_deletion : batch.clustered_row_deletions) {
+            mutation out(base_schema, base_key);
+            out.partition().apply_delete(s, row_deletion.key, row_deletion.t);
+            f(std::move(out), ts, time_id, batch_no);
+        }
+
+        // One mutation per range tombstone.
+        for (auto& range_deletion : batch.clustered_range_deletions) {
+            mutation out(base_schema, base_key);
+            out.partition().apply_delete(s, range_deletion.rt);
+            f(std::move(out), ts, time_id, batch_no);
+        }
+
+        // The partition tombstone, if this batch carries one, goes last.
+        if (batch.partition_deletions) {
+            mutation out(base_schema, base_key);
+            out.partition().apply(batch.partition_deletions->t);
+            f(std::move(out), ts, time_id, batch_no);
+        }
+    }
+}
+
+} // namespace cdc
--- a/Show More
+++ b/Show More