scylla_ntp_setup: support 'pool' directive on ntp.conf

Currently, scylla_ntp_setup only supports 'server' directive, we should support 'pool' too. Fixes #9393 Closes #9397 (cherry picked from commit 61469d62b8)
scylla_cpuscaling_setup: add --force option
2021-10-10 19:42:14 +03:00 · 2021-10-05 16:20:30 +03:00 · 2021-10-03 14:09:37 +03:00 · 2021-10-03 13:11:30 +03:00 · 2021-09-23 15:18:22 +03:00 · 2021-09-12 16:04:11 +03:00
4730 changed files with 41507 additions and 21956 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -0,0 +1,81 @@
+# AUTH
+auth/* @elcallio @vladzcloudius
+
+# CACHE
+row_cache* @tgrabiec @haaawk
+*mutation* @tgrabiec @haaawk
+tests/mvcc* @tgrabiec @haaawk
+
+# CDC
+cdc/* @haaawk @kbr- @elcallio @piodul @jul-stas
+test/cql/cdc_* @haaawk @kbr- @elcallio @piodul @jul-stas
+test/boost/cdc_* @haaawk @kbr- @elcallio @piodul @jul-stas
+
+# COMMITLOG / BATCHLOG
+db/commitlog/* @elcallio
+db/batch* @elcallio
+
+# COORDINATOR
+service/storage_proxy* @gleb-cloudius
+
+# COMPACTION
+sstables/compaction* @raphaelsc @nyh
+
+# CQL TRANSPORT LAYER
+transport/* @penberg
+
+# CQL QUERY LANGUAGE
+cql3/* @tgrabiec @penberg @psarna
+
+# COUNTERS
+counters* @haaawk @jul-stas
+tests/counter_test* @haaawk @jul-stas
+
+# GOSSIP
+gms/* @tgrabiec @asias
+
+# DOCKER
+dist/docker/* @penberg
+
+# LSA
+utils/logalloc* @tgrabiec
+
+# MATERIALIZED VIEWS
+db/view/* @nyh @psarna
+cql3/statements/*view* @nyh @psarna
+test/boost/view_* @nyh @psarna
+
+# PACKAGING
+dist/* @syuu1228
+
+# REPAIR
+repair/* @tgrabiec @asias @nyh
+
+# SCHEMA MANAGEMENT
+db/schema_tables* @tgrabiec @nyh
+db/legacy_schema_migrator* @tgrabiec @nyh
+service/migration* @tgrabiec @nyh
+schema* @tgrabiec @nyh
+
+# SECONDARY INDEXES
+db/index/* @nyh @penberg @psarna
+cql3/statements/*index* @nyh @penberg @psarna
+test/boost/*index* @nyh @penberg @psarna
+
+# SSTABLES
+sstables/* @tgrabiec @raphaelsc @nyh
+
+# STREAMING
+streaming/* @tgrabiec @asias
+service/storage_service.* @tgrabiec @asias
+
+# ALTERNATOR
+alternator/* @nyh @psarna
+test/alternator/* @nyh @psarna
+
+# HINTED HANDOFF
+db/hints/* @haaawk @piodul @vladzcloudius
+
+# REDIS
+redis/* @nyh @syuu1228
+redis-test/* @nyh @syuu1228
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,6 @@ resources
 .pytest_cache
 /expressions.tokens
 tags
-testlog/*
+testlog
 test/*/*.reject
+.vscode
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,8 +13,11 @@
 	path = abseil
 	url = ../abseil-cpp
 [submodule "scylla-jmx"]
-	path = scylla-jmx
+	path = tools/jmx
 	url = ../scylla-jmx
 [submodule "scylla-tools"]
-	path = scylla-tools
+	path = tools/java
 	url = ../scylla-tools-java
+[submodule "scylla-python3"]
+	path = tools/python3
+	url = ../scylla-python3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,7 @@ scan_scylla_source_directories(
          io
          locator
          message
+          raft
          repair
          service
          sstables
@@ -152,4 +153,5 @@ target_include_directories(scylla PUBLIC
        ${Boost_INCLUDE_DIRS}
        xxhash
        libdeflate
+        abseil
        build/${BUILD_TYPE}/gen)
--- a/114
+++ b/114
@@ -1,114 +0,0 @@
-M: Maintainer with commit access
-R: Reviewer with subsystem expertise
-F: Filename, directory, or pattern for the subsystem
-
---
-
-AUTH
-R: Calle Wilund <calle@scylladb.com>
-R: Vlad Zolotarov <vladz@scylladb.com>
-R: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
-F: auth/*
-
-CACHE
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Piotr Jastrzebski <piotr@scylladb.com>
-F: row_cache*
-F: *mutation*
-F: tests/mvcc*
-
-COMMITLOG / BATCHLOGa
-R: Calle Wilund <calle@scylladb.com>
-F: db/commitlog/*
-F: db/batch*
-
-COORDINATOR
-R: Gleb Natapov <gleb@scylladb.com>
-F: service/storage_proxy*
-
-COMPACTION
-R: Raphael S. Carvalho <raphaelsc@scylladb.com>
-R: Glauber Costa <glauber@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: sstables/compaction*
-
-CQL TRANSPORT LAYER
-M: Pekka Enberg <penberg@scylladb.com>
-F: transport/*
-
-CQL QUERY LANGUAGE
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Pekka Enberg <penberg@scylladb.com>
-F: cql3/*
-
-COUNTERS
-F: counters*
-F: tests/counter_test*
-
-GOSSIP
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-F: gms/*
-
-DOCKER
-M: Pekka Enberg <penberg@scylladb.com>
-F: dist/docker/*
-
-LSA
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-F: utils/logalloc*
-
-MATERIALIZED VIEWS
-M: Pekka Enberg <penberg@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-F: db/view/*
-F: cql3/statements/*view*
-
-PACKAGING
-R: Takuya ASADA <syuu@scylladb.com>
-F: dist/*
-
-REPAIR
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: repair/*
-
-SCHEMA MANAGEMENT
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Pekka Enberg <penberg@scylladb.com>
-F: db/schema_tables*
-F: db/legacy_schema_migrator*
-F: service/migration*
-F: schema*
-
-SECONDARY INDEXES
-M: Pekka Enberg <penberg@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-R: Pekka Enberg <penberg@scylladb.com>
-F: db/index/*
-F: cql3/statements/*index*
-
-SSTABLES
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Raphael S. Carvalho <raphaelsc@scylladb.com>
-R: Glauber Costa <glauber@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: sstables/*
-
-STREAMING
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-F: streaming/*
-F: service/storage_service.*
-
-ALTERNATOR
-M: Nadav Har'El <nyh@scylladb.com>
-F: alternator/*
-F: alternator-test/*
-
-THE REST
-M: Avi Kivity <avi@scylladb.com>
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-F: *
--- a/README.md
+++ b/README.md
@@ -1,43 +1,66 @@
 # Scylla

-## Quick-start
+[![Slack](https://img.shields.io/badge/slack-scylla-brightgreen.svg?logo=slack)](http://slack.scylladb.com)
+[![Twitter](https://img.shields.io/twitter/follow/ScyllaDB.svg?style=social&label=Follow)](https://twitter.com/intent/follow?screen_name=ScyllaDB)
+
+## What is Scylla?
+
+Scylla is the real-time big data database that is API-compatible with Apache Cassandra and Amazon DynamoDB.
+Scylla embraces a shared-nothing approach that increases throughput and storage capacity to realize order-of-magnitude performance improvements and reduce hardware costs.
+
+For more information, please see the [ScyllaDB web site].
+
+[ScyllaDB web site]: https://www.scylladb.com
+
+## Build Prerequisites

 Scylla is fairly fussy about its build environment, requiring very recent
 versions of the C++20 compiler and of many libraries to build. The document
 [HACKING.md](HACKING.md) includes detailed information on building and
 developing Scylla, but to get Scylla building quickly on (almost) any build
-machine, Scylla offers offers a [frozen toolchain](tools/toolchain/README.md),
+machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md).
 This is a pre-configured Docker image which includes recent versions of all
 the required compilers, libraries and build tools. Using the frozen toolchain
 allows you to avoid changing anything in your build machine to meet Scylla's
 requirements - you just need to meet the frozen toolchain's prerequisites
 (mostly, Docker or Podman being available).

-Building and running Scylla with the frozen toolchain is as easy as:
+## Building Scylla
+
+Building Scylla with the frozen toolchain `dbuild` is as easy as:

 ```bash
-$ ./tools/toolchain/dbuild ./configure.py
-$ ./tools/toolchain/dbuild ninja build/release/scylla
-$ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
+$ git submodule update --init --force --recursive
+$ ./tools/toolchain/dbuild ./configure.py
+$ ./tools/toolchain/dbuild ninja build/release/scylla
 ```

+For further information, please see:
+
+* [Developer documentation] for more information on building Scylla.
+* [Build documentation] on how to build Scylla binaries, tests, and packages.
+* [Docker image build documentation] for information on how to build Docker images.
+
+[developer documentation]: HACKING.md
+[build documentation]: docs/building.md
+[docker image build documentation]: dist/docker/redhat/README.md
+
 ## Running Scylla

-* Run Scylla
-```
-./build/release/scylla
+To start Scylla server, run:

+```bash
+$ ./tools/toolchain/dbuild ./build/release/scylla --workdir tmp --smp 1 --developer-mode 1
 ```

-* run Scylla with one CPU and ./tmp as work directory
+This will start a Scylla node with one CPU core allocated to it and data files stored in the `tmp` directory.
+The `--developer-mode` option is needed to disable the various checks Scylla performs at startup to ensure the machine is configured for maximum performance (not relevant on development workstations).
+Please note that you need to run Scylla with `dbuild` if you built it with the frozen toolchain.

-```
-./build/release/scylla --workdir tmp --smp 1
-```
+For more run options, run:

-* For more run options:
-```
-./build/release/scylla --help
+```bash
+$ ./tools/toolchain/dbuild ./build/release/scylla --help
 ```

 ## Testing
@@ -46,10 +69,10 @@ See [test.py manual](docs/testing.md).

 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
-Thrift. There is also experimental support for the API of Amazon DynamoDB,
-but being experimental it needs to be explicitly enabled to be used. For more
-information on how to enable the experimental DynamoDB compatibility in Scylla,
-and the current limitations of this feature, see
+Thrift. There is also support for the API of Amazon DynamoDB™,
+which needs to be enabled and configured in order to be used. For more
+information on how to enable the DynamoDB™ API in Scylla,
+and the current compatibility of this feature as well as Scylla-specific extensions, see
 [Alternator](docs/alternator/alternator.md) and
 [Getting started with Alternator](docs/alternator/getting-started.md).

@@ -69,27 +92,22 @@ The courses are free, self-paced and include hands-on examples. They cover a var
 administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
 multi-datacenters and how Scylla integrates with third-party applications.

-## Building a CentOS-based Docker image
-
-Build a Docker image with:
-
-```
-cd dist/docker/redhat
-docker build -t <image-name> .
-```
-
-This build is based on executables downloaded from downloads.scylladb.com,
-**not** on the executables built in this source directory. See further
-instructions in dist/docker/redhat/README.md to build a docker image from
-your own executables.
-
-Run the image with:
-
-```
-docker run -p $(hostname -i):9042:9042 -i -t <image name>
-```
-
 ## Contributing to Scylla

-[Hacking howto](HACKING.md)
-[Guidelines for contributing](CONTRIBUTING.md)
+If you want to report a bug or submit a pull request or a patch, please read the [contribution guidelines].
+
+If you are a developer working on Scylla, please read the [developer guidelines].
+
+[contribution guidelines]: CONTRIBUTING.md
+[developer guidelines]: HACKING.md
+
+## Contact
+
+* The [users mailing list] and [Slack channel] are for users to discuss configuration, management, and operations of the ScyllaDB open source.
+* The [developers mailing list] is for developers and people interested in following the development of ScyllaDB to discuss technical topics.
+
+[Users mailing list]: https://groups.google.com/forum/#!forum/scylladb-users
+
+[Slack channel]: http://slack.scylladb.com/
+
+[Developers mailing list]: https://groups.google.com/forum/#!forum/scylladb-dev
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.2.4
+VERSION=4.3.7

 if test -f version
 then
--- a/2
+++ b/2
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -78,12 +78,12 @@ void check_expiry(std::string_view signature_date) {
    std::string expiration_str = format_time_point(db_clock::now() - 15min);
    std::string validity_str = format_time_point(db_clock::now() + 15min);
    if (signature_date < expiration_str) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                fmt::format("Signature expired: {} is now earlier than {} (current time - 15 min.)",
                signature_date, expiration_str));
    }
    if (signature_date > validity_str) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                fmt::format("Signature not yet current: {} is still later than {} (current time + 15 min.)",
                signature_date, validity_str));
    }
@@ -94,13 +94,13 @@ std::string get_signature(std::string_view access_key_id, std::string_view secre
        std::string_view body_content, std::string_view region, std::string_view service, std::string_view query_string) {
    auto amz_date_it = signed_headers_map.find("x-amz-date");
    if (amz_date_it == signed_headers_map.end()) {
-        throw api_error("InvalidSignatureException", "X-Amz-Date header is mandatory for signature verification");
+        throw api_error::invalid_signature("X-Amz-Date header is mandatory for signature verification");
    }
    std::string_view amz_date = amz_date_it->second;
    check_expiry(amz_date);
    std::string_view datestamp = amz_date.substr(0, 8);
    if (datestamp != orig_datestamp) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                format("X-Amz-Date date does not match the provided datestamp. Expected {}, got {}",
                        orig_datestamp, datestamp));
    }
@@ -126,7 +126,7 @@ std::string get_signature(std::string_view access_key_id, std::string_view secre

 future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string username) {
    static const sstring query = format("SELECT salted_hash FROM {} WHERE {} = ?",
-            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
+            auth::meta::roles_table::qualified_name, auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
    auto& timeout = auth::internal_distributed_timeout_config();
@@ -134,11 +134,11 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
-            throw api_error("UnrecognizedClientException", fmt::format("User not found: {}", username));
+            throw api_error::unrecognized_client(fmt::format("User not found: {}", username));
        }
        salted_hash = res->one().get_opt<sstring>("salted_hash");
        if (!salted_hash) {
-            throw api_error("UnrecognizedClientException", fmt::format("No password found for user: {}", username));
+            throw api_error::unrecognized_client(fmt::format("No password found for user: {}", username));
        }
        return make_ready_future<std::string>(*salted_hash);
    });
--- a/alternator/base64.cc
+++ b/alternator/base64.cc
@@ -32,13 +32,13 @@
 // and the character used in base64 encoding to represent it.
 static class base64_chars {
 public:
-    static constexpr const char* to =
+    static constexpr const char to[] =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    int8_t from[255];
    base64_chars() {
-        static_assert(strlen(to) == 64);
+        static_assert(sizeof(to) == 64 + 1);
        for (int i = 0; i < 255; i++) {
-            from[i] = 255; // signal invalid character
+            from[i] = -1; // signal invalid character
        }
        for (int i = 0; i < 64; i++) {
            from[(unsigned) to[i]] = i;
--- a/alternator/base64.hh
+++ b/alternator/base64.hh
@@ -23,7 +23,7 @@

 #include <string_view>
 #include "bytes.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"

 std::string base64_encode(bytes_view);

--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -26,7 +26,7 @@
 #include "alternator/error.hh"
 #include "cql3/constants.hh"
 #include <unordered_map>
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "serialization.hh"
 #include "base64.hh"
 #include <stdexcept>
@@ -57,12 +57,12 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
    };
    if (!comparison_operator.IsString()) {
-        throw api_error("ValidationException", format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
+        throw api_error::validation(format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
    std::string op = comparison_operator.GetString();
    auto it = ops.find(op);
    if (it == ops.end()) {
-        throw api_error("ValidationException", format("Unsupported comparison operator {}", op));
+        throw api_error::validation(format("Unsupported comparison operator {}", op));
    }
    return it->second;
 }
@@ -104,10 +104,10 @@ static void verify_operand_count(const rjson::value* array, const size_check& ex
        return;
    }
    if (!array || !array->IsArray()) {
-        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
+        throw api_error::validation("With ComparisonOperator, AttributeValueList must be given and an array");
    }
    if (!expected(array->Size())) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        format("{} operator requires AttributeValueList {}, instead found list size {}",
                               op, expected.what(), array->Size()));
    }
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -131,7 +131,40 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
        set1_raw.insert(&*it);
    }
    for (const auto& a : set2.GetArray()) {
-        if (set1_raw.count(&a) == 0) {
+        if (!set1_raw.contains(&a)) {
+            return false;
+        }
+    }
+    return true;
+}
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
+        return false;
+    }
+    auto it1 = list1.Begin();
+    auto it2 = list2.Begin();
+    while (it1 != list1.End()) {
+        // Note: Alternator limits an item's depth (rjson::parse() limits
+        // it to around 37 levels), so this recursion is safe.
+        if (!check_EQ(&*it1, *it2)) {
+            return false;
+        }
+        ++it1;
+        ++it2;
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
            return false;
        }
    }
@@ -140,22 +173,34 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
@@ -164,26 +209,26 @@ bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
        if (v1_from_query) {
-            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+            throw api_error::validation("begins_with() encountered malformed argument");
        } else {
            bad = true;
        }
    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
        if (v1_from_query) {
-            throw api_error("ValidationException", format("begins_with supports only string or binary type, got: {}", *v1));
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
        } else {
            bad = true;
        }
    }
    if (!v2.IsObject() || v2.MemberCount() != 1) {
        if (v2_from_query) {
-            throw api_error("ValidationException", "begins_with() encountered malformed argument");
+            throw api_error::validation("begins_with() encountered malformed argument");
        } else {
            bad = true;
        }
    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
        if (v2_from_query) {
-            throw api_error("ValidationException", format("begins_with() supports only string or binary type, got: {}", v2));
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
        } else {
            bad = true;
        }
@@ -250,12 +295,12 @@ static bool check_NOT_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
 // Check if a JSON-encoded value equals any element of an array, which must have at least one element.
 static bool check_IN(const rjson::value* val, const rjson::value& array) {
    if (!array[0].IsObject() || array[0].MemberCount() != 1) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        format("IN operator encountered malformed AttributeValue: {}", array[0]));
    }
    const auto& type = array[0].MemberBegin()->name;
    if (type != "S" && type != "N" && type != "B") {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        "IN operator requires AttributeValueList elements to be of type String, Number, or Binary ");
    }
    if (!val) {
@@ -264,7 +309,7 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    bool have_match = false;
    for (const auto& elem : array.GetArray()) {
        if (!elem.IsObject() || elem.MemberCount() != 1 || elem.MemberBegin()->name != type) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                            "IN operator requires all AttributeValueList elements to have the same type ");
        }
        if (!have_match && *val == elem) {
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {

 // Only types S, N or B (string, number or bytes) may be compared by the
 // various comparion operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v->IsNull()), this
+// check returns false.
 static bool check_comparable_type(const rjson::value& v) {
    if (!v.IsObject() || v.MemberCount() != 1) {
        return false;
@@ -313,13 +360,13 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    bool bad = false;
    if (!v1 || !check_comparable_type(*v1)) {
        if (v1_from_query) {
-            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
        }
        bad = true;
    }
    if (!check_comparable_type(v2)) {
        if (v2_from_query) {
-            throw api_error("ValidationException", format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
        }
        bad = true;
    }
@@ -379,7 +426,7 @@ template <typename T>
 static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
        if (bounds_from_query) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
        } else {
            return false;
@@ -393,7 +440,7 @@ static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const r
    if ((v && v_from_query && !check_comparable_type(*v)) ||
        (lb_from_query && !check_comparable_type(lb)) ||
        (ub_from_query && !check_comparable_type(ub))) {
-        throw api_error("ValidationException", "between allow only the types String, Number, or Binary");
+        throw api_error::validation("between allow only the types String, Number, or Binary");

    }
    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
@@ -408,7 +455,7 @@ static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const r
    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
        if (bounds_from_query) {
-           throw api_error("ValidationException",
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
        } else {
@@ -432,7 +479,7 @@ static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const r
        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
    if (v_from_query) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
    } else {
@@ -455,24 +502,24 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
    // and requires a different combinations of parameters in the request
    if (value) {
        if (exists && (!exists->IsBool() || exists->GetBool() != true)) {
-            throw api_error("ValidationException", "Cannot combine Value with Exists!=true");
+            throw api_error::validation("Cannot combine Value with Exists!=true");
        }
        if (comparison_operator) {
-            throw api_error("ValidationException", "Cannot combine Value with ComparisonOperator");
+            throw api_error::validation("Cannot combine Value with ComparisonOperator");
        }
        return check_EQ(got, *value);
    } else if (exists) {
        if (comparison_operator) {
-            throw api_error("ValidationException", "Cannot combine Exists with ComparisonOperator");
+            throw api_error::validation("Cannot combine Exists with ComparisonOperator");
        }
        if (!exists->IsBool() || exists->GetBool() != false) {
-            throw api_error("ValidationException", "Exists!=false requires Value");
+            throw api_error::validation("Exists!=false requires Value");
        }
        // Remember Exists=false, so we're checking that the attribute does *not* exist:
        return !got;
    } else {
        if (!comparison_operator) {
-            throw api_error("ValidationException", "Missing ComparisonOperator, Value or Exists");
+            throw api_error::validation("Missing ComparisonOperator, Value or Exists");
        }
        comparison_operator_type op = get_comparison_operator(*comparison_operator);
        switch (op) {
@@ -518,7 +565,7 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
                const rjson::value& arg = (*attribute_value_list)[0];
                const auto& argtype = (*arg.MemberBegin()).name;
                if (argtype != "S" && argtype != "N" && argtype != "B") {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
                                    "got {} instead", argtype));
                }
@@ -532,7 +579,7 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
                const rjson::value& arg = (*attribute_value_list)[0];
                const auto& argtype = (*arg.MemberBegin()).name;
                if (argtype != "S" && argtype != "N" && argtype != "B") {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
                                    "got {} instead", argtype));
                }
@@ -549,7 +596,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
        return conditional_operator_type::MISSING;
    }
    if (!conditional_operator->IsString()) {
-        throw api_error("ValidationException", "'ConditionalOperator' parameter, if given, must be a string");
+        throw api_error::validation("'ConditionalOperator' parameter, if given, must be a string");
    }
    auto s = rjson::to_string_view(*conditional_operator);
    if (s == "AND") {
@@ -557,7 +604,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
    } else if (s == "OR") {
        return conditional_operator_type::OR;
    } else {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("'ConditionalOperator' parameter must be AND, OR or missing. Found {}.", s));
    }
 }
@@ -572,13 +619,13 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
    auto conditional_operator = get_conditional_operator(req);
    if (conditional_operator != conditional_operator_type::MISSING &&
        (!expected || (expected->IsObject() && expected->GetObject().ObjectEmpty()))) {
-            throw api_error("ValidationException", "'ConditionalOperator' parameter cannot be specified for missing or empty Expression");
+            throw api_error::validation("'ConditionalOperator' parameter cannot be specified for missing or empty Expression");
    }
    if (!expected) {
        return true;
    }
    if (!expected->IsObject()) {
-        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
+        throw api_error::validation("'Expected' parameter, if given, must be an object");
    }
    bool require_all = conditional_operator != conditional_operator_type::OR;
    return verify_condition(*expected, require_all, previous_item);
@@ -637,7 +684,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
                return it->value.GetBool();
            }
        }
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("ConditionExpression: condition results in a non-boolean value: {}",
                        calculated_values[0]));
    default:
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -26,12 +26,15 @@

 namespace alternator {

-// DynamoDB's error messages are described in detail in
+// api_error contains a DynamoDB error message to be returned to the user.
+// It can be returned by value (see executor::request_return_type) or thrown.
+// DynamoDB's error messages are described in detail in
 // https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html
-// Ah An error message has a "type", e.g., "ResourceNotFoundException", a coarser
-// HTTP code (almost always, 400), and a human readable message. Eventually these
-// will be wrapped into a JSON object returned to the client.
-class api_error : public std::exception {
+// An error message has an HTTP code (almost always 400), a type, e.g.,
+// "ResourceNotFoundException", and a human readable message.
+// Eventually alternator::api_handler will convert a returned or thrown
+// api_error into a JSON object, and that is returned to the user.
+class api_error final {
 public:
    using status_type = httpd::reply::status_type;
    status_type _http_code;
@@ -42,8 +45,41 @@ public:
        , _type(std::move(type))
        , _msg(std::move(msg))
    { }
-    api_error() = default;
-    virtual const char* what() const noexcept override { return _msg.c_str(); }
+
+    // Factory functions for some common types of DynamoDB API errors
+    static api_error validation(std::string msg) {
+        return api_error("ValidationException", std::move(msg));
+    }
+    static api_error resource_not_found(std::string msg) {
+        return api_error("ResourceNotFoundException", std::move(msg));
+    }
+    static api_error resource_in_use(std::string msg) {
+        return api_error("ResourceInUseException", std::move(msg));
+    }
+    static api_error invalid_signature(std::string msg) {
+        return api_error("InvalidSignatureException", std::move(msg));
+    }
+    static api_error unrecognized_client(std::string msg) {
+        return api_error("UnrecognizedClientException", std::move(msg));
+    }
+    static api_error unknown_operation(std::string msg) {
+        return api_error("UnknownOperationException", std::move(msg));
+    }
+    static api_error access_denied(std::string msg) {
+        return api_error("AccessDeniedException", std::move(msg));
+    }
+    static api_error conditional_check_failed(std::string msg) {
+        return api_error("ConditionalCheckFailedException", std::move(msg));
+    }
+    static api_error expired_iterator(std::string msg) {
+        return api_error("ExpiredIteratorException", std::move(msg));
+    }
+    static api_error trimmed_data_access_exception(std::string msg) {
+        return api_error("TrimmedDataAccessException", std::move(msg));
+    }
+    static api_error internal(std::string msg) { // "InternalServerError": the one factory here that passes an explicit HTTP status
+        return api_error("InternalServerError", std::move(msg), status_type::internal_server_error); // use the class's own status_type alias rather than an unqualified `reply`
+    }
 };

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -30,16 +30,51 @@
 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"
+#include "db/timeout_clock.hh"

 #include "alternator/error.hh"
 #include "stats.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
+
+namespace db {
+    class system_distributed_keyspace;
+}
+
+namespace query {
+class partition_slice;
+class result;
+}
+
+namespace cql3::selection {
+    class selection;
+}
+
+namespace service {
+    class storage_service;
+}

 namespace alternator {

+class rmw_operation;
+
+struct make_jsonable : public json::jsonable { // json::jsonable implementation that carries an rjson::value
+    rjson::value _value; // the wrapped JSON value
+public: // redundant in a struct (members are public by default)
+    explicit make_jsonable(rjson::value&& value); // accepts the value as an rvalue; declaration only, body is not in this header
+    std::string to_json() const override; // overrides the inherited to_json(); declaration only, body is not in this header
+};
+struct json_string : public json::jsonable { // json::jsonable implementation that carries a std::string
+    std::string _value; // the wrapped string
+public: // redundant in a struct (members are public by default)
+    explicit json_string(std::string&& value); // accepts the string as an rvalue; declaration only, body is not in this header
+    std::string to_json() const override; // overrides the inherited to_json(); declaration only, body is not in this header
+};
+
 class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
+    db::system_distributed_keyspace& _sdks;
+    service::storage_service& _ss;
    // An smp_service_group to be used for limiting the concurrency when
    // forwarding Alternator request between shards - if necessary for LWT.
    smp_service_group _ssg;
@@ -52,12 +87,13 @@ public:
    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
-        : _proxy(proxy), _mm(mm), _ssg(ssg) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, db::system_distributed_keyspace& sdks, service::storage_service& ss, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _sdks(sdks), _ss(ss), _ssg(ssg) {}

    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
@@ -71,6 +107,10 @@ public:
    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_streams(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_stream(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_shard_iterator(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_records(client_state& client_state, tracing::trace_state_ptr, service_permit permit, rjson::value request);

    future<> start();
    future<> stop() { return make_ready_future<>(); }
@@ -78,6 +118,37 @@ public:
    future<> create_keyspace(std::string_view keyspace_name);

    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+
+    static sstring table_name(const schema&);
+    static db::timeout_clock::time_point default_timeout();
+    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);
+
+private:
+    friend class rmw_operation;
+
+    static bool is_alternator_keyspace(const sstring& ks_name);
+    static sstring make_keyspace_name(const sstring& table_name);
+    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
+    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
+    
+public:    
+    static std::optional<rjson::value> describe_single_item(schema_ptr,
+        const query::partition_slice&,
+        const cql3::selection::selection&,
+        const query::result&,
+        const std::unordered_set<std::string>&);
+
+    static void describe_single_item(const cql3::selection::selection&,
+        const std::vector<bytes_opt>&,
+        const std::unordered_set<std::string>&,
+        rjson::value&,
+        bool = false);
+
+
+
+    void add_stream_options(const rjson::value& stream_spec, schema_builder&) const;
+    void supplement_table_info(rjson::value& descr, const schema& schema) const;
+    void supplement_table_stream_info(rjson::value& descr, const schema& schema) const;
 };

 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -157,12 +157,12 @@ static void resolve_path(parsed::path& p,
    const std::string& column_name = p.root();
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                    format("ExpressionAttributeNames missing, entry '{}' required by expression", column_name));
        }
        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
        if (!value || !value->IsString()) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
@@ -176,16 +176,16 @@ static void resolve_constant(parsed::constant& c,
    std::visit(overloaded_functor {
        [&] (const std::string& valref) {
            if (!expression_attribute_values) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues missing, entry '{}' required by expression", valref));
            }
            const rjson::value* value = rjson::find(*expression_attribute_values, valref);
            if (!value) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues missing entry '{}' required by expression", valref));
            }
            if (value->IsNull()) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues null value for entry '{}' required by expression", valref));
            }
            validate_value(*value, "ExpressionAttributeValues");
@@ -392,7 +392,7 @@ static rjson::value list_concatenate(const rjson::value& v1, const rjson::value&
    const rjson::value* list1 = unwrap_list(v1);
    const rjson::value* list2 = unwrap_list(v2);
    if (!list1 || !list2) {
-        throw api_error("ValidationException", "UpdateExpression: list_append() given a non-list");
+        throw api_error::validation("UpdateExpression: list_append() given a non-list");
    }
    rjson::value cat = rjson::copy(*list1);
    for (const auto& a : list2->GetArray()) {
@@ -413,28 +413,28 @@ static rjson::value calculate_size(const rjson::value& v) {
    // must come from the request itself, not from the database, so it makes
    // sense to throw a ValidationException if we see such a problem.
    if (!v.IsObject() || v.MemberCount() != 1) {
-        throw api_error("ValidationException", format("invalid object: {}", v));
+        throw api_error::validation(format("invalid object: {}", v));
    }
    auto it = v.MemberBegin();
    int ret;
    if (it->name == "S") {
        if (!it->value.IsString()) {
-            throw api_error("ValidationException", format("invalid string: {}", v));
+            throw api_error::validation(format("invalid string: {}", v));
        }
        ret = it->value.GetStringLength();
    } else if (it->name == "NS" || it->name == "SS" || it->name == "BS" || it->name == "L") {
        if (!it->value.IsArray()) {
-            throw api_error("ValidationException", format("invalid set: {}", v));
+            throw api_error::validation(format("invalid set: {}", v));
        }
        ret = it->value.Size();
    } else if (it->name == "M") {
        if (!it->value.IsObject()) {
-            throw api_error("ValidationException", format("invalid map: {}", v));
+            throw api_error::validation(format("invalid map: {}", v));
        }
        ret = it->value.MemberCount();
    } else if (it->name == "B") {
        if (!it->value.IsString()) {
-            throw api_error("ValidationException", format("invalid byte string: {}", v));
+            throw api_error::validation(format("invalid byte string: {}", v));
        }
        ret = base64_decoded_len(rjson::to_string_view(it->value));
    } else {
@@ -478,11 +478,11 @@ static const
 std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    {"list_append", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::UpdateExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: list_append() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: list_append() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -492,15 +492,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"if_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::UpdateExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() must include path as its first argument", caller));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -510,11 +510,11 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"size", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: size() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -523,15 +523,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists()'s parameter must be a path", caller));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -540,15 +540,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists()'s parameter must be a path", caller));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -557,18 +557,18 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_type", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            // There is no real reason for the following check (not
            // allowing the type to come from a document attribute), but
            // DynamoDB does this check, so we do too...
            if (!f._parameters[1].is_constant()) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_types()'s first parameter must be an expression attribute", caller));
            }
            rjson::value v0 = calculate_value(f._parameters[0], caller, previous_item);
@@ -577,7 +577,7 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                // If the type parameter is not one of the legal types
                // we should generate an error, not a failed condition:
                if (!known_type(rjson::to_string_view(v1.MemberBegin()->value))) {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("{}: attribute_types()'s second parameter, {}, is not a known type",
                                    caller, v1.MemberBegin()->value));
                }
@@ -587,18 +587,18 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                    return to_bool_json(false);
                }
            } else {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() second parameter must refer to a string, got {}", caller, v1));
            }
        }
    },
    {"begins_with", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: begins_with() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: begins_with() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -609,11 +609,11 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: contains() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: contains() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -639,7 +639,7 @@ rjson::value calculate_value(const parsed::value& v,
        [&] (const parsed::value::function_call& f) -> rjson::value {
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
            }
            return function_it->second(caller, previous_item, f);
@@ -651,7 +651,7 @@ rjson::value calculate_value(const parsed::value& v,
            std::string update_path = p.root();
            if (p.has_operators()) {
                // FIXME: support this
-                throw api_error("ValidationException", "Reading attribute paths not yet implemented");
+                throw api_error::validation("Reading attribute paths not yet implemented");
            }
            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
@@ -663,7 +663,7 @@ rjson::value calculate_value(const parsed::value& v,
 // either a single value, or v1+v2 or v1-v2.
 rjson::value calculate_value(const parsed::set_rhs& rhs,
        const rjson::value* previous_item) {
-    switch(rhs._op) {
+    switch (rhs._op) {
    case 'v':
        return calculate_value(rhs._v1, calculate_value_caller::UpdateExpression, previous_item);
    case '+': {
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -30,7 +30,7 @@
 #include <seastar/util/noncopyable_function.hh>

 #include "expressions_types.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"

 namespace alternator {

--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -27,7 +27,7 @@

 #include <seastar/core/shared_ptr.hh>

-#include "rjson.hh"
+#include "utils/rjson.hh"

 /*
 * Parsed representation of expressions and their components.
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -1,177 +0,0 @@
-/*
- * Copyright 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-/*
- * rjson is a wrapper over rapidjson library, providing fast JSON parsing and generation.
- *
- * rapidjson has strict copy elision policies, which, among other things, involves
- * using provided char arrays without copying them and allows copying objects only explicitly.
- * As such, one should be careful when passing strings with limited liveness
- * (e.g. data underneath local std::strings) to rjson functions, because created JSON objects
- * may end up relying on dangling char pointers. All rjson functions that create JSONs from strings
- * by rjson have both APIs for string_ref_type (more optimal, used when the string is known to live
- * at least as long as the object, e.g. a static char array) and for std::strings. The more optimal
- * variants should be used *only* if the liveness of the string is guaranteed, otherwise it will
- * result in undefined behaviour.
- * Also, bear in mind that methods exposed by rjson::value are generic, but some of them
- * work fine only for specific types. In case the type does not match, an rjson::error will be thrown.
- * Examples of such mismatched usages is calling MemberCount() on a JSON value not of object type
- * or calling Size() on a non-array value.
- */
-
-#include <string>
-#include <stdexcept>
-
-namespace rjson {
-class error : public std::exception {
-    std::string _msg;
-public:
-    error() = default;
-    error(const std::string& msg) : _msg(msg) {}
-
-    virtual const char* what() const noexcept override { return _msg.c_str(); }
-};
-}
-
-// rapidjson configuration macros
-#define RAPIDJSON_HAS_STDSTRING 1
-// Default rjson policy is to use assert() - which is dangerous for two reasons:
-// 1. assert() can be turned off with -DNDEBUG
-// 2. assert() crashes a program
-// Fortunately, the default policy can be overridden, and so rapidjson errors will
-// throw an rjson::error exception instead.
-#define RAPIDJSON_ASSERT(x) do { if (!(x)) throw rjson::error(std::string("JSON error: condition not met: ") + #x); } while (0)
-
-#include <rapidjson/document.h>
-#include <rapidjson/writer.h>
-#include <rapidjson/stringbuffer.h>
-#include <rapidjson/error/en.h>
-#include <seastar/core/sstring.hh>
-#include "seastarx.hh"
-
-namespace rjson {
-
-using allocator = rapidjson::CrtAllocator;
-using encoding = rapidjson::UTF8<>;
-using document = rapidjson::GenericDocument<encoding, allocator>;
-using value = rapidjson::GenericValue<encoding, allocator>;
-using string_ref_type = value::StringRefType;
-using string_buffer = rapidjson::GenericStringBuffer<encoding>;
-using writer = rapidjson::Writer<string_buffer, encoding>;
-using type = rapidjson::Type;
-
-// Returns an object representing JSON's null
-inline rjson::value null_value() {
-    return rjson::value(rapidjson::kNullType);
-}
-
-// Returns an empty JSON object - {}
-inline rjson::value empty_object() {
-    return rjson::value(rapidjson::kObjectType);
-}
-
-// Returns an empty JSON array - []
-inline rjson::value empty_array() {
-    return rjson::value(rapidjson::kArrayType);
-}
-
-// Returns an empty JSON string - ""
-inline rjson::value empty_string() {
-    return rjson::value(rapidjson::kStringType);
-}
-
-// Convert the JSON value to a string with JSON syntax, the opposite of parse().
-// The representation is dense - without any redundant indentation.
-std::string print(const rjson::value& value);
-
-// Returns a string_view to the string held in a JSON value (which is
-// assumed to hold a string, i.e., v.IsString() == true). This is a view
-// to the existing data - no copying is done.
-inline std::string_view to_string_view(const rjson::value& v) {
-    return std::string_view(v.GetString(), v.GetStringLength());
-}
-
-// Copies given JSON value - involves allocation
-rjson::value copy(const rjson::value& value);
-
-// Parses a JSON value from given string or raw character array.
-// The string/char array liveness does not need to be persisted,
-// as parse() will allocate member names and values.
-// Throws rjson::error if parsing failed.
-rjson::value parse(std::string_view str);
-// Needs to be run in thread context
-rjson::value parse_yieldable(std::string_view str);
-
-// Creates a JSON value (of JSON string type) out of internal string representations.
-// The string value is copied, so str's liveness does not need to be persisted.
-rjson::value from_string(const std::string& str);
-rjson::value from_string(const sstring& str);
-rjson::value from_string(const char* str, size_t size);
-rjson::value from_string(std::string_view view);
-
-// Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, std::string_view name);
-const rjson::value* find(const rjson::value& value, std::string_view name);
-
-// Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, std::string_view name);
-const rjson::value& get(const rjson::value& value, std::string_view name);
-
-// Sets a member in given JSON object by moving the member - allocates the name.
-// Throws if base is not a JSON object.
-void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);
-
-// Sets a string member in given JSON object by assigning its reference - allocates the name.
-// NOTICE: member string liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);
-
-// Sets a member in given JSON object by moving the member.
-// NOTICE: name liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member);
-
-// Sets a string member in given JSON object by assigning its reference.
-// NOTICE: name liveness must be ensured to be at least as long as base's.
-// NOTICE: member liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type member);
-
-// Adds a value to a JSON list by moving the item to its end.
-// Throws if base_array is not a JSON array.
-void push_back(rjson::value& base_array, rjson::value&& item);
-
-// Remove a member from a JSON object. Throws if value isn't an object.
-bool remove_member(rjson::value& value, std::string_view name);
-
-struct single_value_comp {
-    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
-};
-
-} // end namespace rjson
-
-namespace std {
-std::ostream& operator<<(std::ostream& os, const rjson::value& v);
-}
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -24,7 +24,7 @@
 #include "seastarx.hh"
 #include "service/storage_proxy.hh"
 #include "service/storage_proxy.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "executor.hh"

 namespace alternator {
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -65,7 +65,7 @@ struct from_json_visitor {

    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), from_json_visitor{v, bo}); };
    void operator()(const string_type_impl& t) {
-        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+        bo.write(t.from_string(rjson::to_string_view(v)));
    }
    void operator()(const bytes_type_impl& t) const {
        bo.write(base64_decode(v));
@@ -74,23 +74,27 @@ struct from_json_visitor {
        bo.write(boolean_type->decompose(v.GetBool()));
    }
    void operator()(const decimal_type_impl& t) const {
-        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+        try {
+            bo.write(t.from_string(rjson::to_string_view(v)));
+        } catch (const marshal_exception& e) {
+            throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", v));
+        }
    }
    // default
    void operator()(const abstract_type& t) const {
-        bo.write(from_json_object(t, Json::Value(rjson::print(v)), cql_serialization_format::internal()));
+        bo.write(from_json_object(t, v, cql_serialization_format::internal()));
    }
 };

 bytes serialize_item(const rjson::value& item) {
    if (item.IsNull() || item.MemberCount() != 1) {
-        throw api_error("ValidationException", format("An item can contain only one attribute definition: {}", item));
+        throw api_error::validation(format("An item can contain only one attribute definition: {}", item));
    }
    auto it = item.MemberBegin();
    type_info type_info = type_info_from_string(rjson::to_string_view(it->name)); // JSON keys are guaranteed to be strings

    if (type_info.atype == alternator_type::NOT_SUPPORTED_YET) {
-        slogger.trace("Non-optimal serialization of type {}", it->name.GetString());
+        slogger.trace("Non-optimal serialization of type {}", it->name);
        return bytes{int8_t(type_info.atype)} + to_bytes(rjson::print(item));
    }

@@ -128,7 +132,7 @@ struct to_json_visitor {
 rjson::value deserialize_item(bytes_view bv) {
    rjson::value deserialized(rapidjson::kObjectType);
    if (bv.empty()) {
-        throw api_error("ValidationException", "Serialized value empty");
+        throw api_error::validation("Serialized value empty");
    }

    alternator_type atype = alternator_type(bv[0]);
@@ -164,7 +168,7 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
    std::string column_name = column.name_as_text();
    const rjson::value* key_typed_value = rjson::find(item, column_name);
    if (!key_typed_value) {
-        throw api_error("ValidationException", format("Key column {} not found", column_name));
+        throw api_error::validation(format("Key column {} not found", column_name));
    }
    return get_key_from_typed_value(*key_typed_value, column);
 }
@@ -175,20 +179,20 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
 bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
            !key_typed_value.MemberBegin()->value.IsString()) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("Malformed value object for key column {}: {}",
                        column.name_as_text(), key_typed_value));
    }

    auto it = key_typed_value.MemberBegin();
    if (it->name != type_to_string(column.type)) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
+                        type_to_string(column.type), column.name_as_text(), it->name));
    }
    std::string_view value_view = rjson::to_string_view(it->value);
    if (value_view.empty()) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("The AttributeValue for a key attribute cannot contain an empty string value. Key: {}", column.name_as_text()));
    }
    if (column.type == bytes_type) {
@@ -247,20 +251,24 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema) {

 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    if (!v.IsObject() || v.MemberCount() != 1) {
-        throw api_error("ValidationException", format("{}: invalid number object", diagnostic));
+        throw api_error::validation(format("{}: invalid number object", diagnostic));
    }
    auto it = v.MemberBegin();
    if (it->name != "N") {
-        throw api_error("ValidationException", format("{}: expected number, found type '{}'", diagnostic, it->name));
+        throw api_error::validation(format("{}: expected number, found type '{}'", diagnostic, it->name));
    }
-    if (it->value.IsNumber()) {
-         // FIXME(sarna): should use big_decimal constructor with numeric values directly:
-        return big_decimal(rjson::print(it->value));
+    try {
+        if (it->value.IsNumber()) {
+             // FIXME(sarna): should use big_decimal constructor with numeric values directly:
+            return big_decimal(rjson::print(it->value));
+        }
+        if (!it->value.IsString()) {
+            throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
+        }
+        return big_decimal(rjson::to_string_view(it->value));
+    } catch (const marshal_exception& e) {
+        throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", it->value));
    }
-    if (!it->value.IsString()) {
-        throw api_error("ValidationException", format("{}: improperly formatted number constant", diagnostic));
-    }
-    return big_decimal(it->value.GetString());
 }

 const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
@@ -312,10 +320,10 @@ rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+        throw api_error::validation(format("Mismatched set types: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
-        throw api_error("ValidationException", "UpdateExpression: ADD operation for sets must be given sets as arguments");
+        throw api_error::validation("UpdateExpression: ADD operation for sets must be given sets as arguments");
    }
    rjson::value sum = rjson::copy(*set1);
    std::set<rjson::value, rjson::single_value_comp> set1_raw;
@@ -323,7 +331,7 @@ rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
        set1_raw.insert(rjson::copy(*it));
    }
    for (const auto& a : set2->GetArray()) {
-        if (set1_raw.count(a) == 0) {
+        if (!set1_raw.contains(a)) {
            rjson::push_back(sum, rjson::copy(a));
        }
    }
@@ -340,10 +348,10 @@ std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value&
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+        throw api_error::validation(format("Mismatched set types: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
-        throw api_error("ValidationException", "UpdateExpression: DELETE operation can only be performed on a set");
+        throw api_error::validation("UpdateExpression: DELETE operation can only be performed on a set");
    }
    std::set<rjson::value, rjson::single_value_comp> set1_raw;
    for (auto it = set1->Begin(); it != set1->End(); ++it) {
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -26,7 +26,7 @@
 #include "types.hh"
 #include "schema_fwd.hh"
 #include "keys.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "utils/big_decimal.hh"

 namespace alternator {
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -25,7 +25,7 @@
 #include <seastar/json/json_elements.hh>
 #include "seastarx.hh"
 #include "error.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
@@ -75,20 +75,17 @@ public:
                 // returned to the client as expected. Other types of
                 // exceptions are unexpected, and returned to the user
                 // as an internal server error:
-                 api_error ret;
                 try {
                     resf.get();
                 } catch (api_error &ae) {
-                     ret = ae;
+                     generate_error_reply(*rep, ae);
                 } catch (rjson::error & re) {
-                     ret = api_error("ValidationException", re.what());
+                     generate_error_reply(*rep,
+                             api_error::validation(re.what()));
                 } catch (...) {
-                     ret = api_error(
-                             "Internal Server Error",
-                             format("Internal server error: {}", std::current_exception()),
-                             reply::status_type::internal_server_error);
+                     generate_error_reply(*rep,
+                             api_error::internal(format("Internal server error: {}", std::current_exception())));
                 }
-                 generate_error_reply(*rep, ret);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
             auto res = resf.get0();
@@ -188,11 +185,11 @@ future<> server::verify_signature(const request& req) {
    }
    auto host_it = req._headers.find("Host");
    if (host_it == req._headers.end()) {
-        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
+        throw api_error::invalid_signature("Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
    if (authorization_it == req._headers.end()) {
-        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
+        throw api_error::invalid_signature("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
@@ -204,7 +201,7 @@ future<> server::verify_signature(const request& req) {
        std::vector<std::string_view> entry_split = split(entry, '=');
        if (entry_split.size() != 2) {
            if (entry != "AWS4-HMAC-SHA256") {
-                throw api_error("InvalidSignatureException", format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
+                throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
            }
            continue;
        }
@@ -225,7 +222,7 @@ future<> server::verify_signature(const request& req) {
    }
    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
-        throw api_error("ValidationException", format("Incorrect credential information format: {}", credential));
+        throw api_error::validation(format("Incorrect credential information format: {}", credential));
    }
    std::string user(credential_split[0]);
    std::string datestamp(credential_split[1]);
@@ -263,7 +260,7 @@ future<> server::verify_signature(const request& req) {

        if (signature != std::string_view(user_signature)) {
            _key_cache.remove(user);
-            throw api_error("UnrecognizedClientException", "The security token included in the request is invalid.");
+            throw api_error::unrecognized_client("The security token included in the request is invalid.");
        }
    });
 }
@@ -274,13 +271,12 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
    std::string op = split_target.empty() ? std::string() : std::string(split_target.back());
-    slogger.trace("Request: {} {}", op, req->content);
+    slogger.trace("Request: {} {} {}", op, req->content, req->_headers);
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
            _executor._stats.unsupported_operations++;
-            throw api_error("UnknownOperationException",
-                    format("Unsupported operation {}", op));
+            throw api_error::unknown_operation(format("Unsupported operation {}", op));
        }
        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
            //FIXME: Client state can provide more context, e.g. client's endpoint address
@@ -350,6 +346,9 @@ server::server(executor& exec)
        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
        }},
+        {"UpdateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
        }},
@@ -389,6 +388,18 @@ server::server(executor& exec)
        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
        }},
+        {"ListStreams", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_streams(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"DescribeStream", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_stream(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"GetShardIterator", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_shard_iterator(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"GetRecords", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_records(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
    } {
 }

--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -20,7 +20,7 @@
 */

 #include "stats.hh"
-
+#include "utils/histogram_metrics_helper.hh"
 #include <seastar/core/metrics.hh>

 namespace alternator {
@@ -37,7 +37,7 @@ stats::stats() : api_operations{} {
                        seastar::metrics::description("number of operations via Alternator API"), {op(CamelCaseName)}),
 #define OPERATION_LATENCY(name, CamelCaseName) \
                seastar::metrics::make_histogram("op_latency", \
-                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return api_operations.name.get_histogram(1,20);}),
+                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
@@ -77,6 +77,11 @@ stats::stats() : api_operations{} {
            OPERATION_LATENCY(get_item_latency, "GetItem")
            OPERATION_LATENCY(delete_item_latency, "DeleteItem")
            OPERATION_LATENCY(update_item_latency, "UpdateItem")
+            OPERATION(list_streams, "ListStreams")
+            OPERATION(describe_stream, "DescribeStream")
+            OPERATION(get_shard_iterator, "GetShardIterator")
+            OPERATION(get_records, "GetRecords")
+            OPERATION_LATENCY(get_records_latency, "GetRecords")
    });
    _metrics.add_group("alternator", {
            seastar::metrics::make_total_operations("unsupported_operations", unsupported_operations,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -74,11 +74,16 @@ public:
        uint64_t update_item = 0;
        uint64_t update_table = 0;
        uint64_t update_time_to_live = 0;
+        uint64_t list_streams = 0;
+        uint64_t describe_stream = 0;
+        uint64_t get_shard_iterator = 0;
+        uint64_t get_records = 0;

-        utils::estimated_histogram put_item_latency;
-        utils::estimated_histogram get_item_latency;
-        utils::estimated_histogram delete_item_latency;
-        utils::estimated_histogram update_item_latency;
+        utils::time_estimated_histogram put_item_latency;
+        utils::time_estimated_histogram get_item_latency;
+        utils::time_estimated_histogram delete_item_latency;
+        utils::time_estimated_histogram update_item_latency;
+        utils::time_estimated_histogram get_records_latency;
    } api_operations;
    // Miscellaneous event counters
    uint64_t total_operations = 0;
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -249,7 +249,7 @@
                 "MIGRATION_REQUEST",
                 "PREPARE_MESSAGE",
                 "PREPARE_DONE_MESSAGE",
-                 "STREAM_MUTATION",
+                 "UNUSED__STREAM_MUTATION",
                 "STREAM_MUTATION_DONE",
                 "COMPLETE_MESSAGE",
                 "REPAIR_CHECKSUM_RANGE",
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -833,6 +833,43 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/repair_status/",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Query the repair status and return when the repair is finished or timeout",
+               "type":"string",
+               "enum":[
+                  "RUNNING",
+                  "SUCCESSFUL",
+                  "FAILED"
+               ],
+               "nickname":"repair_await_completion",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"id",
+                     "description":"The repair ID to check for status",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type": "long",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Seconds to wait before the query returns even if the repair is not finished. The value -1 or not providing this parameter means no timeout",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type": "long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/repair_async/{keyspace}",
         "operations":[
@@ -2431,7 +2468,7 @@
            "version":{
               "type":"string",
               "enum":[
-                  "ka", "la", "mc"
+                  "ka", "la", "mc", "md"
               ],
               "description":"SSTable version"
            },
--- a/api/api.cc
+++ b/api/api.cc
@@ -113,8 +113,20 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

-future<> set_server_snapshot(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
+future<> set_server_repair(http_context& ctx, sharded<netw::messaging_service>& ms) {
+    return ctx.http_server.set_routes([&ctx, &ms] (routes& r) { set_repair(ctx, r, ms); });
+}
+
+future<> unset_server_repair(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_repair(ctx, r); });
+}
+
+future<> set_server_snapshot(http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl) {
+    return ctx.http_server.set_routes([&ctx, &snap_ctl] (routes& r) { set_snapshot(ctx, r, snap_ctl); });
+}
+
+future<> unset_server_snapshot(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_snapshot(ctx, r); });
 }

 future<> set_server_snitch(http_context& ctx) {
@@ -131,9 +143,14 @@ future<> set_server_load_sstable(http_context& ctx) {
                "The column family API", set_column_family);
 }

-future<> set_server_messaging_service(http_context& ctx) {
+future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms) {
    return register_api(ctx, "messaging_service",
-                "The messaging service API", set_messaging_service);
+                "The messaging service API", [&ms] (http_context& ctx, routes& r) {
+                    set_messaging_service(ctx, r, ms);
+                });
+}
+future<> unset_server_messaging_service(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_messaging_service(ctx, r); });
 }

 future<> set_server_storage_proxy(http_context& ctx) {
--- a/api/api.hh
+++ b/api/api.hh
@@ -256,4 +256,6 @@ public:
    operator T() const { return value; }
 };

+utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimated_histogram& val);
+
 }
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -27,6 +27,8 @@ namespace service { class load_meter; }
 namespace locator { class token_metadata; }
 namespace cql_transport { class controller; }
 class thrift_controller;
+namespace db { class snapshot_ctl; }
+namespace netw { class messaging_service; }

 namespace api {

@@ -37,11 +39,11 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
-    sharded<locator::token_metadata>& token_metadata;
+    const sharded<locator::token_metadata>& token_metadata;

    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
+            service::load_meter& _lm, const sharded<locator::token_metadata>& _tm)
            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
    }
 };
@@ -50,14 +52,18 @@ future<> set_server_init(http_context& ctx);
 future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_server_repair(http_context& ctx, sharded<netw::messaging_service>& ms);
+future<> unset_server_repair(http_context& ctx);
 future<> set_transport_controller(http_context& ctx, cql_transport::controller& ctl);
 future<> unset_transport_controller(http_context& ctx);
 future<> set_rpc_controller(http_context& ctx, thrift_controller& ctl);
 future<> unset_rpc_controller(http_context& ctx);
-future<> set_server_snapshot(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl);
+future<> unset_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
-future<> set_server_messaging_service(http_context& ctx);
+future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
+future<> unset_server_messaging_service(http_context& ctx);
 future<> set_server_storage_proxy(http_context& ctx);
 future<> set_server_stream_manager(http_context& ctx);
 future<> set_server_gossip_settle(http_context& ctx);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -249,6 +249,12 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, bool total)
    });
 }

+future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, const sstring& name, std::function<utils::time_estimated_histogram(const column_family&)> f) {
+    return map_reduce_cf_raw(ctx, name, utils::time_estimated_histogram(), f, utils::time_estimated_histogram_merge).then([](const utils::time_estimated_histogram& res) {
+        return make_ready_future<json::json_return_type>(time_to_json_histogram(res));
+    });
+}
+
 template <typename T>
 class sum_ratio {
    uint64_t _n = 0;
@@ -325,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
@@ -796,24 +802,21 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_cas_prepare.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_prepare;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_accept;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_learn;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_sstables_per_read_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -862,7 +865,9 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_built_indexes.set(r, [&ctx](std::unique_ptr<request> req) {
-        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->param["name"]);
+        auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
+        auto&& ks = std::get<0>(ks_cf);
+        auto&& cf_name = std::get<1>(ks_cf);
        return db::system_keyspace::load_view_build_progress().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace::view_build_progress>& vb) mutable {
            std::set<sstring> vp;
            for (auto b : vb) {
@@ -875,7 +880,7 @@ void set_column_family(http_context& ctx, routes& r) {
            column_family& cf = ctx.db.local().find_column_family(uuid);
            res.reserve(cf.get_index_manager().list_indexes().size());
            for (auto&& i : cf.get_index_manager().list_indexes()) {
-                if (vp.find(secondary_index::index_table_name(i.metadata().name())) == vp.end()) {
+                if (!vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
                    res.emplace_back(i.metadata().name());
                }
            }
@@ -909,17 +914,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_read_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_read;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_write_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_write;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<request> req) {
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -68,6 +68,8 @@ future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& n
    });
 }

+future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, const sstring& name, std::function<utils::time_estimated_histogram(const column_family&)> f);
+
 struct map_reduce_column_families_locally {
    std::any init;
    std::function<std::unique_ptr<std::any>(column_family&)> mapper;
--- a/api/messaging_service.cc
+++ b/api/messaging_service.cc
@@ -53,8 +53,8 @@ std::vector<message_counter> map_to_message_counters(
 * according to a function that it gets as a parameter.
 *
 */
-future_json_function get_client_getter(std::function<uint64_t(const shard_info&)> f) {
-    return [f](std::unique_ptr<request> req) {
+future_json_function get_client_getter(sharded<netw::messaging_service>& ms, std::function<uint64_t(const shard_info&)> f) {
+    return [&ms, f](std::unique_ptr<request> req) {
        using map_type = std::unordered_map<gms::inet_address, uint64_t>;
        auto get_shard_map = [f](messaging_service& ms) {
            std::unordered_map<gms::inet_address, unsigned long> map;
@@ -63,15 +63,15 @@ future_json_function get_client_getter(std::function<uint64_t(const shard_info&)
            });
            return map;
        };
-        return  get_messaging_service().map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
+        return ms.map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
                then([](map_type&& map) {
            return make_ready_future<json::json_return_type>(map_to_message_counters(map));
        });
    };
 }

-future_json_function get_server_getter(std::function<uint64_t(const rpc::stats&)> f) {
-    return [f](std::unique_ptr<request> req) {
+future_json_function get_server_getter(sharded<netw::messaging_service>& ms, std::function<uint64_t(const rpc::stats&)> f) {
+    return [&ms, f](std::unique_ptr<request> req) {
        using map_type = std::unordered_map<gms::inet_address, uint64_t>;
        auto get_shard_map = [f](messaging_service& ms) {
            std::unordered_map<gms::inet_address, unsigned long> map;
@@ -80,53 +80,53 @@ future_json_function get_server_getter(std::function<uint64_t(const rpc::stats&)
            });
            return map;
        };
-        return  get_messaging_service().map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
+        return ms.map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
                then([](map_type&& map) {
            return make_ready_future<json::json_return_type>(map_to_message_counters(map));
        });
    };
 }

-void set_messaging_service(http_context& ctx, routes& r) {
-    get_timeout_messages.set(r, get_client_getter([](const shard_info& c) {
+void set_messaging_service(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms) {
+    get_timeout_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().timeout;
    }));

-    get_sent_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_sent_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().sent_messages;
    }));

-    get_dropped_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_dropped_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        // We don't have the same drop message mechanism
        // as origin has.
        // hence we can always return 0
        return 0;
    }));

-    get_exception_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_exception_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().exception_received;
    }));

-    get_pending_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_pending_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().pending;
    }));

-    get_respond_pending_messages.set(r, get_server_getter([](const rpc::stats& c) {
+    get_respond_pending_messages.set(r, get_server_getter(ms, [](const rpc::stats& c) {
        return c.pending;
    }));

-    get_respond_completed_messages.set(r, get_server_getter([](const rpc::stats& c) {
+    get_respond_completed_messages.set(r, get_server_getter(ms, [](const rpc::stats& c) {
        return c.sent_messages;
    }));

-    get_version.set(r, [](const_req req) {
-        return netw::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
+    get_version.set(r, [&ms](const_req req) {
+        return ms.local().get_raw_version(req.get_query_param("addr"));
    });

-    get_dropped_messages_by_ver.set(r, [](std::unique_ptr<request> req) {
+    get_dropped_messages_by_ver.set(r, [&ms](std::unique_ptr<request> req) {
        shared_ptr<std::vector<uint64_t>> map = make_shared<std::vector<uint64_t>>(num_verb);

-        return netw::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
+        return ms.map_reduce([map](const uint64_t* local_map) mutable {
            for (auto i = 0; i < num_verb; i++) {
                (*map)[i]+= local_map[i];
            }
@@ -151,5 +151,18 @@ void set_messaging_service(http_context& ctx, routes& r) {
        });
    });
 }
+
+void unset_messaging_service(http_context& ctx, routes& r) {
+    get_timeout_messages.unset(r);
+    get_sent_messages.unset(r);
+    get_dropped_messages.unset(r);
+    get_exception_messages.unset(r);
+    get_pending_messages.unset(r);
+    get_respond_pending_messages.unset(r);
+    get_respond_completed_messages.unset(r);
+    get_version.unset(r);
+    get_dropped_messages_by_ver.unset(r);
+}
+
 }

--- a/api/messaging_service.hh
+++ b/api/messaging_service.hh
@@ -23,8 +23,11 @@

 #include "api.hh"

+namespace netw { class messaging_service; }
+
 namespace api {

-void set_messaging_service(http_context& ctx, routes& r);
+void set_messaging_service(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms);
+void unset_messaging_service(http_context& ctx, routes& r);

 }
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -41,6 +41,7 @@
 #include "sstables/sstables.hh"
 #include "database.hh"
 #include "db/extensions.hh"
+#include "db/snapshot-ctl.hh"
 #include "transport/controller.hh"
 #include "thrift/controller.hh"

@@ -149,6 +150,104 @@ void unset_rpc_controller(http_context& ctx, routes& r) {
    ss::is_rpc_server_running.unset(r);
 }

+void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms) {
+    ss::repair_async.set(r, [&ctx, &ms](std::unique_ptr<request> req) {
+        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
+                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace",
+                "startToken", "endToken" };
+        std::unordered_map<sstring, sstring> options_map;
+        for (auto o : options) {
+            auto s = req->get_query_param(o);
+            if (s != "") {
+                options_map[o] = s;
+            }
+        }
+
+        // The repair process is asynchronous: repair_start only starts it and
+        // returns immediately, not waiting for the repair to finish. The user
+        // then has other mechanisms to track the ongoing repair's progress,
+        // or stop it.
+        return repair_start(ctx.db, ms, validate_keyspace(ctx, req->param),
+                options_map).then([] (int i) {
+                    return make_ready_future<json::json_return_type>(i);
+                });
+    });
+
+    ss::get_active_repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
+        return get_active_repairs(ctx.db).then([] (std::vector<int> res){
+            return make_ready_future<json::json_return_type>(res);
+        });
+    });
+
+    ss::repair_async_status.set(r, [&ctx](std::unique_ptr<request> req) {
+        return repair_get_status(ctx.db, boost::lexical_cast<int>( req->get_query_param("id")))
+                .then_wrapped([] (future<repair_status>&& fut) {
+            ss::ns_repair_async_status::return_type_wrapper res;
+            try {
+                res = fut.get0();
+            } catch(std::runtime_error& e) {
+                throw httpd::bad_param_exception(e.what());
+            }
+            return make_ready_future<json::json_return_type>(json::json_return_type(res));
+        });
+    });
+
+    ss::repair_await_completion.set(r, [&ctx](std::unique_ptr<request> req) {
+        int id;
+        using clock = std::chrono::steady_clock;
+        clock::time_point expire;
+        try {
+            id = boost::lexical_cast<int>(req->get_query_param("id"));
+            // If timeout is not provided, it means no timeout.
+            sstring s = req->get_query_param("timeout");
+            int64_t timeout = s.empty() ? int64_t(-1) : boost::lexical_cast<int64_t>(s);
+            if (timeout < 0 && timeout != -1) {
+                return make_exception_future<json::json_return_type>(
+                        httpd::bad_param_exception("timeout can only be -1 (means no timeout) or non negative integer"));
+            }
+            if (timeout < 0) {
+                expire = clock::time_point::max();
+            } else {
+                expire = clock::now() + std::chrono::seconds(timeout);
+            }
+        } catch (std::exception& e) {
+            return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
+        }
+        return repair_await_completion(ctx.db, id, expire)
+                .then_wrapped([] (future<repair_status>&& fut) {
+            ss::ns_repair_async_status::return_type_wrapper res;
+            try {
+                res = fut.get0();
+            } catch (std::exception& e) {
+                return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
+            }
+            return make_ready_future<json::json_return_type>(json::json_return_type(res));
+        });
+    });
+
+    ss::force_terminate_all_repair_sessions.set(r, [](std::unique_ptr<request> req) {
+        return repair_abort_all(service::get_local_storage_service().db()).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::force_terminate_all_repair_sessions_new.set(r, [](std::unique_ptr<request> req) {
+        return repair_abort_all(service::get_local_storage_service().db()).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+}
+
+void unset_repair(http_context& ctx, routes& r) {
+    ss::repair_async.unset(r);
+    ss::get_active_repair_async.unset(r);
+    ss::repair_async_status.unset(r);
+    ss::repair_await_completion.unset(r);
+    ss::force_terminate_all_repair_sessions.unset(r);
+    ss::force_terminate_all_repair_sessions_new.unset(r);
+}
+
 void set_storage_service(http_context& ctx, routes& r) {
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
@@ -220,11 +319,26 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
        std::vector<ss::maplist_mapper> res;
-        return make_ready_future<json::json_return_type>(res);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_range_to_address_map(keyspace),
+                [](const std::pair<dht::token_range, std::vector<gms::inet_address>>& entry){
+            ss::maplist_mapper m;
+            if (entry.first.start()) {
+                m.key.push(entry.first.start().value().value().to_sstring());
+            } else {
+                m.key.push("");
+            }
+            if (entry.first.end()) {
+                m.key.push(entry.first.end().value().value().to_sstring());
+            } else {
+                m.key.push("");
+            }
+            for (const gms::inet_address& address : entry.second) {
+                m.value.push(address.to_sstring());
+            }
+            return m;
+        }));
    });

    ss::get_pending_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -338,7 +452,7 @@ void set_storage_service(http_context& ctx, routes& r) {
            return do_for_each(column_families, [=, &db](sstring cfname) {
                auto& cm = db.get_compaction_manager();
                auto& cf = db.find_column_family(keyspace, cfname);
-                return cm.perform_sstable_upgrade(&cf, exclude_current_version);
+                return cm.perform_sstable_upgrade(db, &cf, exclude_current_version);
            });
        }).then([]{
            return make_ready_future<json::json_return_type>(0);
@@ -361,59 +475,6 @@ void set_storage_service(http_context& ctx, routes& r) {
    });


-    ss::repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
-        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
-                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace",
-                "startToken", "endToken" };
-        std::unordered_map<sstring, sstring> options_map;
-        for (auto o : options) {
-            auto s = req->get_query_param(o);
-            if (s != "") {
-                options_map[o] = s;
-            }
-        }
-
-        // The repair process is asynchronous: repair_start only starts it and
-        // returns immediately, not waiting for the repair to finish. The user
-        // then has other mechanisms to track the ongoing repair's progress,
-        // or stop it.
-        return repair_start(ctx.db, validate_keyspace(ctx, req->param),
-                options_map).then([] (int i) {
-                    return make_ready_future<json::json_return_type>(i);
-                });
-    });
-
-    ss::get_active_repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
-        return get_active_repairs(ctx.db).then([] (std::vector<int> res){
-            return make_ready_future<json::json_return_type>(res);
-        });
-    });
-
-    ss::repair_async_status.set(r, [&ctx](std::unique_ptr<request> req) {
-        return repair_get_status(ctx.db, boost::lexical_cast<int>( req->get_query_param("id")))
-                .then_wrapped([] (future<repair_status>&& fut) {
-            ss::ns_repair_async_status::return_type_wrapper res;
-            try {
-                res = fut.get0();
-            } catch(std::runtime_error& e) {
-                throw httpd::bad_param_exception(e.what());
-            }
-            return make_ready_future<json::json_return_type>(json::json_return_type(res));
-        });
-    });
-
-    ss::force_terminate_all_repair_sessions.set(r, [](std::unique_ptr<request> req) {
-        return repair_abort_all(service::get_local_storage_service().db()).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::force_terminate_all_repair_sessions_new.set(r, [](std::unique_ptr<request> req) {
-        return repair_abort_all(service::get_local_storage_service().db()).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
    ss::decommission.set(r, [](std::unique_ptr<request> req) {
        return service::get_local_storage_service().decommission().then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -921,7 +982,7 @@ void set_storage_service(http_context& ctx, routes& r) {
                                    e.value = p.second;
                                    nm.attributes.push(std::move(e));
                                }
-                                if (!cp->options().count(compression_parameters::SSTABLE_COMPRESSION)) {
+                                if (!cp->options().contains(compression_parameters::SSTABLE_COMPRESSION)) {
                                    ss::mapper e;
                                    e.key = compression_parameters::SSTABLE_COMPRESSION;
                                    e.value = cp->name();
@@ -979,31 +1040,29 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

-void set_snapshot(http_context& ctx, routes& r) {
-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
-            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
-                return s.write("[").then([&s, &first] {
-                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
-                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
-                            return do_for_each(result, [&s, &result,&first](std::tuple<sstring, std::vector<service::storage_service::snapshot_details>>&& map){
-                                return do_with(ss::snapshots(), [&s, &first, &result, &map](ss::snapshots& all_snapshots) {
-                                    all_snapshots.key = std::get<0>(map);
-                                    future<> f = first ? make_ready_future<>() : s.write(", ");
-                                    first = false;
-                                    std::vector<ss::snapshot> snapshot;
-                                    for (auto& cf: std::get<1>(map)) {
-                                        ss::snapshot snp;
-                                        snp.ks = cf.ks;
-                                        snp.cf = cf.cf;
-                                        snp.live = cf.live;
-                                        snp.total = cf.total;
-                                        snapshot.push_back(std::move(snp));
-                                    }
-                                    all_snapshots.value = std::move(snapshot);
-                                    return f.then([&s, &all_snapshots] {
-                                        return all_snapshots.write(s);
-                                    });
+void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_ctl) {
+    ss::get_snapshot_details.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+        return snap_ctl.local().get_snapshot_details().then([] (std::unordered_map<sstring, std::vector<db::snapshot_ctl::snapshot_details>>&& result) {
+            std::function<future<>(output_stream<char>&&)> f = [result = std::move(result)](output_stream<char>&& s) {
+                return do_with(output_stream<char>(std::move(s)), true, [&result] (output_stream<char>& s, bool& first){
+                    return s.write("[").then([&s, &first, &result] {
+                        return do_for_each(result, [&s, &first](std::tuple<sstring, std::vector<db::snapshot_ctl::snapshot_details>>&& map){
+                            return do_with(ss::snapshots(), [&s, &first, &map](ss::snapshots& all_snapshots) {
+                                all_snapshots.key = std::get<0>(map);
+                                future<> f = first ? make_ready_future<>() : s.write(", ");
+                                first = false;
+                                std::vector<ss::snapshot> snapshot;
+                                for (auto& cf: std::get<1>(map)) {
+                                    ss::snapshot snp;
+                                    snp.ks = cf.ks;
+                                    snp.cf = cf.cf;
+                                    snp.live = cf.live;
+                                    snp.total = cf.total;
+                                    snapshot.push_back(std::move(snp));
+                                }
+                                all_snapshots.value = std::move(snapshot);
+                                return f.then([&s, &all_snapshots] {
+                                    return all_snapshots.write(s);
                                });
                            });
                        });
@@ -1013,12 +1072,13 @@ void set_snapshot(http_context& ctx, routes& r) {
                        });
                    });
                });
-            });
-        };
-        return make_ready_future<json::json_return_type>(std::move(f));
+            };
+
+            return make_ready_future<json::json_return_type>(std::move(f));
+        });
    });

-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");

@@ -1026,7 +1086,7 @@ void set_snapshot(http_context& ctx, routes& r) {

        auto resp = make_ready_future<>();
        if (column_families.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+            resp = snap_ctl.local().take_snapshot(tag, keynames);
        } else {
            if (keynames.empty()) {
                throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -1034,37 +1094,37 @@ void set_snapshot(http_context& ctx, routes& r) {
            if (keynames.size() > 1) {
                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_families, tag);
+            resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag);
        }
        return resp.then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
        auto tag = req->get_query_param("tag");
        auto column_family = req->get_query_param("cf");

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+        return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+    ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+        return snap_ctl.local().true_snapshots_size().then([] (int64_t size) {
            return make_ready_future<json::json_return_type>(size);
        });
    });

-    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::scrub.set(r, wrap_ks_cf(ctx, [&snap_ctl] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);

        auto f = make_ready_future<>();
        if (!req_param<bool>(*req, "disable_snapshot", false)) {
            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
+                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
            });
        }

@@ -1082,4 +1142,12 @@ void set_snapshot(http_context& ctx, routes& r) {
    }));
 }

+void unset_snapshot(http_context& ctx, routes& r) {
+    ss::get_snapshot_details.unset(r);
+    ss::take_snapshot.unset(r);
+    ss::del_snapshot.unset(r);
+    ss::true_snapshots_size.unset(r);
+    ss::scrub.unset(r);
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -21,18 +21,24 @@

 #pragma once

+#include <seastar/core/sharded.hh>
 #include "api.hh"

 namespace cql_transport { class controller; }
 class thrift_controller;
+namespace db { class snapshot_ctl; }
+namespace netw { class messaging_service; }

 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms);
+void unset_repair(http_context& ctx, routes& r);
 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl);
 void unset_transport_controller(http_context& ctx, routes& r);
 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl);
 void unset_rpc_controller(http_context& ctx, routes& r);
-void set_snapshot(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_ctl);
+void unset_snapshot(http_context& ctx, routes& r);

 }
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -208,7 +208,7 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
            external_value_size = cell_view.value_size();
        }
        // Add overhead of chunk headers. The last one is a special case.
-        external_value_size += (external_value_size - 1) / data::cell::maximum_external_chunk_length * data::cell::external_chunk_overhead;
+        external_value_size += (external_value_size - 1) / data::cell::effective_external_chunk_length * data::cell::external_chunk_overhead;
        external_value_size += data::cell::external_last_chunk_overhead;
    }
    return data::cell::structure::serialized_object_size(_data.get(), ctx)
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -38,6 +38,7 @@

 class abstract_type;
 class collection_type_impl;
+class atomic_cell_or_collection;

 using atomic_cell_value_view = data::value_view;
 using atomic_cell_value_mutable_view = data::value_mutable_view;
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -26,10 +26,7 @@

 namespace auth {

-const sstring& allow_all_authenticator_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthenticator";
-    return name;
-}
+constexpr std::string_view allow_all_authenticator_name("org.apache.cassandra.auth.AllowAllAuthenticator");

 // To ensure correct initialization order, we unfortunately need to use a string literal.
 static const class_registrator<
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -37,7 +37,7 @@ class migration_manager;

 namespace auth {

-const sstring& allow_all_authenticator_name();
+extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
@@ -53,7 +53,7 @@ public:
    }

    virtual std::string_view qualified_java_name() const override {
-        return allow_all_authenticator_name();
+        return allow_all_authenticator_name;
    }

    virtual bool require_authentication() const override {
--- a/auth/allow_all_authorizer.cc
+++ b/auth/allow_all_authorizer.cc
@@ -26,10 +26,7 @@

 namespace auth {

-const sstring& allow_all_authorizer_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthorizer";
-    return name;
-}
+constexpr std::string_view allow_all_authorizer_name("org.apache.cassandra.auth.AllowAllAuthorizer");

 // To ensure correct initialization order, we unfortunately need to use a string literal.
 static const class_registrator<
--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -34,7 +34,7 @@ class migration_manager;

 namespace auth {

-const sstring& allow_all_authorizer_name();
+extern const std::string_view allow_all_authorizer_name;

 class allow_all_authorizer final  : public authorizer {
 public:
@@ -50,7 +50,7 @@ public:
    }

    virtual std::string_view qualified_java_name() const override {
-        return allow_all_authorizer_name();
+        return allow_all_authorizer_name;
    }

    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -34,10 +34,9 @@ namespace auth {

 namespace meta {

-const sstring DEFAULT_SUPERUSER_NAME("cassandra");
-const sstring AUTH_KS("system_auth");
-const sstring USERS_CF("users");
-const sstring AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");
+constexpr std::string_view AUTH_KS("system_auth");
+constexpr std::string_view USERS_CF("users");
+constexpr std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");

 }

@@ -110,7 +109,12 @@ future<> wait_for_schema_agreement(::service::migration_manager& mm, const datab
 }

 const timeout_config& internal_distributed_timeout_config() noexcept {
+#ifdef DEBUG
+    // Give the much slower debug tests more headroom for completing auth queries.
+    static const auto t = 30s;
+#else
    static const auto t = 5s;
+#endif
    static const timeout_config tc{t, t, t, t, t, t, t};
    return tc;
 }
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -53,10 +53,10 @@ namespace auth {

 namespace meta {

-extern const sstring DEFAULT_SUPERUSER_NAME;
-extern const sstring AUTH_KS;
-extern const sstring USERS_CF;
-extern const sstring AUTH_PACKAGE_NAME;
+constexpr std::string_view DEFAULT_SUPERUSER_NAME("cassandra");
+extern const std::string_view AUTH_KS;
+extern const std::string_view USERS_CF;
+extern const std::string_view AUTH_PACKAGE_NAME;

 }

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -65,15 +65,14 @@ extern "C" {

 namespace auth {

-const sstring& default_authorizer_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "CassandraAuthorizer";
-    return name;
+std::string_view default_authorizer::qualified_java_name() const {
+    return "org.apache.cassandra.auth.CassandraAuthorizer";
 }

-static const sstring ROLE_NAME = "role";
-static const sstring RESOURCE_NAME = "resource";
-static const sstring PERMISSIONS_NAME = "permissions";
-static const sstring PERMISSIONS_CF = "role_permissions";
+static constexpr std::string_view ROLE_NAME = "role";
+static constexpr std::string_view RESOURCE_NAME = "resource";
+static constexpr std::string_view PERMISSIONS_NAME = "permissions";
+static constexpr std::string_view PERMISSIONS_CF = "role_permissions";

 static logging::logger alogger("default_authorizer");

--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -51,8 +51,6 @@

 namespace auth {

-const sstring& default_authorizer_name();
-
 class default_authorizer : public authorizer {
    cql3::query_processor& _qp;

@@ -71,9 +69,7 @@ public:

    virtual future<> stop() override;

-    virtual std::string_view qualified_java_name() const override {
-        return default_authorizer_name();
-    }
+    virtual std::string_view qualified_java_name() const override;

    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -62,15 +62,12 @@

 namespace auth {

-const sstring& password_authenticator_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "PasswordAuthenticator";
-    return name;
-}
+constexpr std::string_view password_authenticator_name("org.apache.cassandra.auth.PasswordAuthenticator");

 // name of the hash column.
-static const sstring SALTED_HASH = "salted_hash";
-static const sstring DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
-static const sstring DEFAULT_USER_PASSWORD = meta::DEFAULT_SUPERUSER_NAME;
+static constexpr std::string_view SALTED_HASH = "salted_hash";
+static constexpr std::string_view DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
+static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);

 static logging::logger plogger("password_authenticator");

@@ -98,7 +95,7 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {

 static const sstring& update_row_query() {
    static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            SALTED_HASH,
            meta::roles_table::role_col_name);
    return update_row_query;
@@ -198,7 +195,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
 }

 std::string_view password_authenticator::qualified_java_name() const {
-    return password_authenticator_name();
+    return password_authenticator_name;
 }

 bool password_authenticator::require_authentication() const {
@@ -215,10 +212,10 @@ authentication_option_set password_authenticator::alterable_options() const {

 future<authenticated_user> password_authenticator::authenticate(
                const credentials_map& credentials) const {
-    if (!credentials.count(USERNAME_KEY)) {
+    if (!credentials.contains(USERNAME_KEY)) {
        throw exceptions::authentication_exception(format("Required key '{}' is missing", USERNAME_KEY));
    }
-    if (!credentials.count(PASSWORD_KEY)) {
+    if (!credentials.contains(PASSWORD_KEY)) {
        throw exceptions::authentication_exception(format("Required key '{}' is missing", PASSWORD_KEY));
    }

@@ -233,7 +230,7 @@ future<authenticated_user> password_authenticator::authenticate(
    return futurize_invoke([this, username, password] {
        static const sstring query = format("SELECT {} FROM {} WHERE {} = ?",
                SALTED_HASH,
-                meta::roles_table::qualified_name(),
+                meta::roles_table::qualified_name,
                meta::roles_table::role_col_name);

        return _qp.execute_internal(
@@ -283,7 +280,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
    }

    static const sstring query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            SALTED_HASH,
            meta::roles_table::role_col_name);

@@ -297,7 +294,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
 future<> password_authenticator::drop(std::string_view name) const {
    static const sstring query = format("DELETE {} FROM {} WHERE {} = ?",
            SALTED_HASH,
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return _qp.execute_internal(
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -52,7 +52,7 @@ class migration_manager;

 namespace auth {

-const sstring& password_authenticator_name();
+extern const std::string_view password_authenticator_name;

 class password_authenticator : public authenticator {
    cql3::query_processor& _qp;
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -45,16 +45,13 @@ std::string_view creation_query() {
            "  member_of set<text>,"
            "  salted_hash text"
            ")",
-            qualified_name(),
+            qualified_name,
            role_col_name);

    return instance;
 }

-std::string_view qualified_name() noexcept {
-    static const sstring instance = AUTH_KS + "." + sstring(name);
-    return instance;
-}
+constexpr std::string_view qualified_name("system_auth.roles");

 }

@@ -64,7 +61,7 @@ future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
@@ -97,7 +94,7 @@ future<bool> default_role_row_satisfies(
 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p) {
-    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());
+    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name);

    return do_with(std::move(p), [&qp](const auto& p) {
        return qp.execute_internal(
--- a/auth/roles-metadata.hh
+++ b/auth/roles-metadata.hh
@@ -43,7 +43,7 @@ std::string_view creation_query();

 constexpr std::string_view name{"roles", 5};

-std::string_view qualified_name() noexcept;
+extern const std::string_view qualified_name;

 constexpr std::string_view role_col_name{"role", 4};

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -31,9 +31,7 @@
 #include "auth/allow_all_authenticator.hh"
 #include "auth/allow_all_authorizer.hh"
 #include "auth/common.hh"
-#include "auth/password_authenticator.hh"
 #include "auth/role_or_anonymous.hh"
-#include "auth/standard_role_manager.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
@@ -125,18 +123,7 @@ service::service(
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
-            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer)) {
-    // The password authenticator requires that the `standard_role_manager` is running so that the roles metadata table
-    // it manages is created and updated. This cross-module dependency is rather gross, but we have to maintain it for
-    // the sake of compatibility with Apache Cassandra and its choice of auth. schema.
-    if ((_authenticator->qualified_java_name() == password_authenticator_name())
-            && (_role_manager->qualified_java_name() != standard_role_manager_name())) {
-        throw incompatible_module_combination(
-                format("The {} authenticator must be loaded alongside the {} role-manager.",
-                        password_authenticator_name(),
-                        standard_role_manager_name()));
-    }
-}
+            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer)) {}

 service::service(
        permissions_cache_config c,
@@ -376,25 +363,25 @@ future<permission_set> get_permissions(const service& ser, const authenticated_u
 }

 bool is_enforcing(const service& ser)  {
-    const bool enforcing_authorizer = ser.underlying_authorizer().qualified_java_name() != allow_all_authorizer_name();
+    const bool enforcing_authorizer = ser.underlying_authorizer().qualified_java_name() != allow_all_authorizer_name; // any authorizer other than the allow-all one counts as enforcing

     const bool enforcing_authenticator = ser.underlying_authenticator().qualified_java_name()
-            != allow_all_authenticator_name();
+            != allow_all_authenticator_name; // same test for the authenticator; either module alone makes the service enforcing

     return enforcing_authorizer || enforcing_authenticator;
 }

 bool is_protected(const service& ser, const resource& r) noexcept {
-    return ser.underlying_role_manager().protected_resources().count(r)
-            || ser.underlying_authenticator().protected_resources().count(r)
-            || ser.underlying_authorizer().protected_resources().count(r);
+    return ser.underlying_role_manager().protected_resources().contains(r) // r is protected if any of the three auth modules lists it
+            || ser.underlying_authenticator().protected_resources().contains(r)
+            || ser.underlying_authorizer().protected_resources().contains(r);
 }

 static void validate_authentication_options_are_supported(
        const authentication_options& options,
        const authentication_option_set& supported) {
    const auto check = [&supported](authentication_option k) {
-        if (supported.count(k) == 0) {
+        if (!supported.contains(k)) {
            throw unsupported_authentication_option(k);
        }
    };
@@ -474,7 +461,7 @@ future<bool> has_role(const service& ser, std::string_view grantee, std::string_
    return when_all_succeed(
            validate_role_exists(ser, name),
            ser.get_roles(grantee)).then_unpack([name](role_set all_roles) {
-        return make_ready_future<bool>(all_roles.count(sstring(name)) != 0);
+        return make_ready_future<bool>(all_roles.contains(sstring(name)));
    });
 }
 future<bool> has_role(const service& ser, const authenticated_user& u, std::string_view name) {
@@ -531,14 +518,9 @@ future<std::vector<permission_details>> list_filtered_permissions(
                    ? auth::expand_resource_family(r)
                    : auth::resource_set{r};

-            all_details.erase(
-                    std::remove_if(
-                            all_details.begin(),
-                            all_details.end(),
-                            [&resources](const permission_details& pd) {
-                        return resources.count(pd.resource) == 0;
-                    }),
-                    all_details.end());
+            std::erase_if(all_details, [&resources](const permission_details& pd) {
+                return !resources.contains(pd.resource);
+            });
        }

        std::transform(
@@ -551,11 +533,9 @@ future<std::vector<permission_details>> list_filtered_permissions(
                });

        // Eliminate rows with an empty permission set.
-        all_details.erase(
-                std::remove_if(all_details.begin(), all_details.end(), [](const permission_details& pd) {
-                    return pd.permissions.mask() == 0;
-                }),
-                all_details.end());
+        std::erase_if(all_details, [](const permission_details& pd) {
+            return pd.permissions.mask() == 0;
+        });

        if (!role_name) {
            return make_ready_future<std::vector<permission_details>>(std::move(all_details));
@@ -567,14 +547,9 @@ future<std::vector<permission_details>> list_filtered_permissions(

        return do_with(std::move(all_details), [&ser, role_name](auto& all_details) {
            return ser.get_roles(*role_name).then([&all_details](role_set all_roles) {
-                all_details.erase(
-                        std::remove_if(
-                                all_details.begin(),
-                                all_details.end(),
-                                [&all_roles](const permission_details& pd) {
-                            return all_roles.count(pd.role_name) == 0;
-                        }),
-                        all_details.end());
+                std::erase_if(all_details, [&all_roles](const permission_details& pd) {
+                    return !all_roles.contains(pd.role_name);
+                });

                return make_ready_future<std::vector<permission_details>>(std::move(all_details));
            });
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -49,11 +49,7 @@ namespace meta {
 namespace role_members_table {

 constexpr std::string_view name{"role_members" , 12};
-
-static std::string_view qualified_name() noexcept {
-    static const sstring instance = AUTH_KS + "." + sstring(name);
-    return instance;
-}
+constexpr std::string_view qualified_name("system_auth.role_members");

 }

@@ -84,7 +80,7 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no

 static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return qp.execute_internal(
@@ -124,13 +120,8 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
 }

-std::string_view standard_role_manager_name() noexcept {
-    static const sstring instance = meta::AUTH_PACKAGE_NAME + "CassandraRoleManager";
-    return instance;
-}
-
 std::string_view standard_role_manager::qualified_java_name() const noexcept {
-    return standard_role_manager_name();
+    return "org.apache.cassandra.auth.CassandraRoleManager"; // name inlined as a literal; the standard_role_manager_name() helper is removed by this patch
 }

 const resource_set& standard_role_manager::protected_resources() const {
@@ -148,7 +139,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
            "  member text,"
            "  PRIMARY KEY (role, member)"
            ")",
-            meta::role_members_table::qualified_name());
+            meta::role_members_table::qualified_name);


    return when_all_succeed(
@@ -168,7 +159,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
    return default_role_row_satisfies(_qp, &has_can_login).then([this](bool exists) {
        if (!exists) {
            static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, true, true)",
-                    meta::roles_table::qualified_name(),
+                    meta::roles_table::qualified_name,
                    meta::roles_table::role_col_name);

            return _qp.execute_internal(
@@ -256,7 +247,7 @@ future<> standard_role_manager::stop() {

 future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c) const {
    static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return _qp.execute_internal(
@@ -301,7 +292,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat

        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
-                        meta::roles_table::qualified_name(),
+                        meta::roles_table::qualified_name,
                        build_column_assignments(u),
                        meta::roles_table::role_col_name),
                consistency_for_role(role_name),
@@ -319,7 +310,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
        // First, revoke this role from all roles that are members of it.
        const auto revoke_from_members = [this, role_name] {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
-                    meta::role_members_table::qualified_name());
+                    meta::role_members_table::qualified_name);

            return _qp.execute_internal(
                    query,
@@ -357,7 +348,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
        // Finally, delete the role itself.
        auto delete_role = [this, role_name] {
            static const sstring query = format("DELETE FROM {} WHERE {} = ?",
-                    meta::roles_table::qualified_name(),
+                    meta::roles_table::qualified_name,
                    meta::roles_table::role_col_name);

            return _qp.execute_internal(
@@ -383,7 +374,7 @@ standard_role_manager::modify_membership(
    const auto modify_roles = [this, role_name, grantee_name, ch] {
        const auto query = format(
                "UPDATE {} SET member_of = member_of {} ? WHERE {} = ?",
-                meta::roles_table::qualified_name(),
+                meta::roles_table::qualified_name,
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

@@ -399,7 +390,7 @@ standard_role_manager::modify_membership(
            case membership_change::add:
                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
-                                meta::role_members_table::qualified_name()),
+                                meta::role_members_table::qualified_name),
                        consistency_for_role(role_name),
                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();
@@ -407,7 +398,7 @@ standard_role_manager::modify_membership(
            case membership_change::remove:
                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
-                                meta::role_members_table::qualified_name()),
+                                meta::role_members_table::qualified_name),
                        consistency_for_role(role_name),
                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();
@@ -416,7 +407,7 @@ standard_role_manager::modify_membership(
        return make_ready_future<>();
    };

-    return when_all_succeed(modify_roles(), modify_role_members()).discard_result();
+    return when_all_succeed(modify_roles(), modify_role_members).discard_result();
 }

 future<>
@@ -425,7 +416,7 @@ standard_role_manager::grant(std::string_view grantee_name, std::string_view rol
        return this->query_granted(
                grantee_name,
                recursive_role_query::yes).then([role_name, grantee_name](role_set roles) {
-            if (roles.count(sstring(role_name)) != 0) {
+            if (roles.contains(sstring(role_name))) {
                throw role_already_included(grantee_name, role_name);
            }

@@ -437,7 +428,7 @@ standard_role_manager::grant(std::string_view grantee_name, std::string_view rol
        return this->query_granted(
                role_name,
                recursive_role_query::yes).then([role_name, grantee_name](role_set roles) {
-            if (roles.count(sstring(grantee_name)) != 0) {
+            if (roles.contains(sstring(grantee_name))) {
                throw role_already_included(role_name, grantee_name);
            }

@@ -460,7 +451,7 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
        return this->query_granted(
                revokee_name,
                recursive_role_query::no).then([revokee_name, role_name](role_set roles) {
-            if (roles.count(sstring(role_name)) == 0) {
+            if (!roles.contains(sstring(role_name))) {
                throw revoke_ungranted_role(revokee_name, role_name);
            }

@@ -504,7 +495,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
 future<role_set> standard_role_manager::query_all() const {
    static const sstring query = format("SELECT {} FROM {}",
            meta::roles_table::role_col_name,
-            meta::roles_table::qualified_name());
+            meta::roles_table::qualified_name);

    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -42,8 +42,6 @@ class migration_manager;

 namespace auth {

-std::string_view standard_role_manager_name() noexcept;
-
 class standard_role_manager final : public role_manager {
    cql3::query_processor& _qp;
    ::service::migration_manager& _migration_manager;
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -101,7 +101,7 @@ public:
    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override {
        auto i = credentials.find(authenticator::USERNAME_KEY);
        if ((i == credentials.end() || i->second.empty())
-                && (!credentials.count(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+                && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
            // return anon user
            return make_ready_future<authenticated_user>(anonymous_user());
        }
--- a/bytes.cc
+++ b/bytes.cc
@@ -100,3 +100,7 @@ std::ostream& operator<<(std::ostream& os, const bytes_view& b) {
 }

 }
+
+std::ostream& operator<<(std::ostream& os, const fmt_hex& b) { // prints the view wrapped by `b` as the string produced by to_hex()
+    return os << to_hex(b.v); // hands back `os` so further insertions can be chained
+}
--- a/bytes.hh
+++ b/bytes.hh
@@ -39,6 +39,10 @@ inline sstring_view to_sstring_view(bytes_view view) {
    return {reinterpret_cast<const char*>(view.data()), view.size()};
 }

+inline bytes_view to_bytes_view(sstring_view view) { // counterpart of to_sstring_view(): same memory, viewed as int8_t instead of char
+    return {reinterpret_cast<const int8_t*>(view.data()), view.size()}; // no copy is made: only the pointer type changes, the length is kept
+}
+
 namespace std {

 template <>
@@ -50,6 +54,13 @@ struct hash<bytes_view> {

 }

+struct fmt_hex { // stream adaptor: `os << fmt_hex(v)` prints the bytes of v as hex text via to_hex()
+    bytes_view v; // held by value: the former `bytes_view&` member rejected const and temporary views and referenced the caller's view object
+    fmt_hex(bytes_view v) noexcept : v(v) {} // by-value parameter still accepts every argument the old `bytes_view&` signature accepted
+};
+
+std::ostream& operator<<(std::ostream& os, const fmt_hex& hex);
+
 bytes from_hex(sstring_view s);
 sstring to_hex(bytes_view b);
 sstring to_hex(const bytes& b);
@@ -84,9 +95,12 @@ struct appending_hash<bytes_view> {
 };

 inline int32_t compare_unsigned(bytes_view v1, bytes_view v2) {
-    auto n = memcmp(v1.begin(), v2.begin(), std::min(v1.size(), v2.size()));
+  auto size = std::min(v1.size(), v2.size()); // number of bytes both views have in common
+  if (size) { // memcmp is skipped when either view is empty - presumably because an empty view may carry a null data pointer (TODO confirm)
+    auto n = memcmp(v1.begin(), v2.begin(), size); // a non-zero result decides the ordering; equal prefixes fall through to the length comparison
     if (n) {
         return n;
     }
+  }
     return (int32_t) (v1.size() - v2.size());
 }
 }
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -28,7 +28,6 @@
 #include "partition_version.hh"
 #include "utils/logalloc.hh"
 #include "query-request.hh"
-#include "partition_snapshot_reader.hh"
 #include "partition_snapshot_row_cursor.hh"
 #include "read_context.hh"
 #include "flat_mutation_reader.hh"
@@ -134,7 +133,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    void maybe_add_to_cache(const static_row& sr);
    void maybe_set_static_row_continuous();
    void finish_reader() {
-        push_mutation_fragment(partition_end());
+        push_mutation_fragment(*_schema, _permit, partition_end()); // emit the closing partition_end fragment before the stream is flagged as finished
        _end_of_stream = true;
        _state = state::end_of_stream;
    }
@@ -146,7 +145,7 @@ public:
                               lw_shared_ptr<read_context> ctx,
                               partition_snapshot_ptr snp,
                               row_cache& cache)
-        : flat_mutation_reader::impl(std::move(s))
+        : flat_mutation_reader::impl(std::move(s), ctx->permit())
        , _snp(std::move(snp))
        , _position_cmp(*_schema)
        , _ck_ranges(std::move(crr))
@@ -158,8 +157,8 @@ public:
        , _read_context(std::move(ctx))
        , _next_row(*_schema, *_snp)
    {
-        clogger.trace("csm {}: table={}.{}", this, _schema->ks_name(), _schema->cf_name());
-        push_mutation_fragment(partition_start(std::move(dk), _snp->partition_tombstone()));
+        clogger.trace("csm {}: table={}.{}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name());
+        push_mutation_fragment(*_schema, _permit, partition_start(std::move(dk), _snp->partition_tombstone()));
    }
    cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
    cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
@@ -188,7 +187,7 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
            return _snp->static_row(_read_context->digest_requested());
        });
        if (!sr.empty()) {
-            push_mutation_fragment(mutation_fragment(std::move(sr)));
+            push_mutation_fragment(mutation_fragment(*_schema, _permit, std::move(sr)));
        }
        return make_ready_future<>();
    } else {
@@ -232,7 +231,7 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
            return after_static_row();
        }
    }
-    clogger.trace("csm {}: fill_buffer(), range={}, lb={}", this, *_ck_ranges_curr, _lower_bound);
+    clogger.trace("csm {}: fill_buffer(), range={}, lb={}", fmt::ptr(this), *_ck_ranges_curr, _lower_bound);
    return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this, timeout] {
        return do_fill_buffer(timeout);
    });
@@ -277,7 +276,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
    // assert(_state == state::reading_from_cache)
    return _lsa_manager.run_in_read_section([this] {
        auto next_valid = _next_row.iterators_valid();
-        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}", this, _lower_bound,
+        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}", fmt::ptr(this), _lower_bound,
            _upper_bound, _next_row.position(), next_valid);
        // We assume that if there was eviction, and thus the range may
        // no longer be continuous, the cursor was invalidated.
@@ -291,7 +290,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
            }
        }
        _next_row.maybe_refresh();
-        clogger.trace("csm {}: next={}, cont={}", this, _next_row.position(), _next_row.continuous());
+        clogger.trace("csm {}: next={}, cont={}", fmt::ptr(this), _next_row.position(), _next_row.continuous());
        _lower_bound_changed = false;
        while (_state == state::reading_from_cache) {
            copy_from_cache_to_buffer();
@@ -357,7 +356,7 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
                                    e.release();
                                    auto next = std::next(it);
                                    it->set_continuous(next->continuous());
-                                    clogger.trace("csm {}: inserted dummy at {}, cont={}", this, it->position(), it->continuous());
+                                    clogger.trace("csm {}: inserted dummy at {}, cont={}", fmt::ptr(this), it->position(), it->continuous());
                                }
                            });
                        } else if (ensure_population_lower_bound()) {
@@ -368,11 +367,11 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
                                auto insert_result = rows.insert_check(_next_row.get_iterator_in_latest_version(), *e, less);
                                auto inserted = insert_result.second;
                                if (inserted) {
-                                    clogger.trace("csm {}: inserted dummy at {}", this, _upper_bound);
+                                    clogger.trace("csm {}: inserted dummy at {}", fmt::ptr(this), _upper_bound);
                                    _snp->tracker()->insert(*e);
                                    e.release();
                                } else {
-                                    clogger.trace("csm {}: mark {} as continuous", this, insert_result.first->position());
+                                    clogger.trace("csm {}: mark {} as continuous", fmt::ptr(this), insert_result.first->position());
                                    insert_result.first->set_continuous(true);
                                }
                            });
@@ -413,7 +412,7 @@ bool cache_flat_mutation_reader::ensure_population_lower_bound() {
            auto insert_result = rows.insert_check(rows.end(), *e, less);
            auto inserted = insert_result.second;
            if (inserted) {
-                clogger.trace("csm {}: inserted lower bound dummy at {}", this, e->position());
+                clogger.trace("csm {}: inserted lower bound dummy at {}", fmt::ptr(this), e->position());
                _snp->tracker()->insert(*e);
                e.release();
            }
@@ -453,7 +452,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
        _read_context->cache().on_mispopulate();
        return;
    }
-    clogger.trace("csm {}: populate({})", this, clustering_row::printer(*_schema, cr));
+    clogger.trace("csm {}: populate({})", fmt::ptr(this), clustering_row::printer(*_schema, cr));
    _lsa_manager.run_in_update_section_with_allocator([this, &cr] {
        mutation_partition& mp = _snp->version()->partition();
        rows_entry::compare less(*_schema);
@@ -475,7 +474,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {

        rows_entry& e = *it;
        if (ensure_population_lower_bound()) {
-            clogger.trace("csm {}: set_continuous({})", this, e.position());
+            clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
            e.set_continuous(true);
        } else {
            _read_context->cache().on_mispopulate();
@@ -494,14 +493,14 @@ bool cache_flat_mutation_reader::after_current_range(position_in_partition_view

 inline
 void cache_flat_mutation_reader::start_reading_from_underlying() {
-    clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", this, _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound);
+    clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", fmt::ptr(this), _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound); // traced range ends at the next row's position when that row is in range, otherwise at _upper_bound
    _state = state::move_to_underlying;
    _next_row.touch();
 }

 inline
 void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
-    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", this, _next_row.position(), _next_row_in_range);
+    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", fmt::ptr(this), _next_row.position(), _next_row_in_range);
    _next_row.touch();
    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
    for (auto &&rts : _snp->range_tombstones(_lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
@@ -517,7 +516,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
                return;
            }
        }
-        push_mutation_fragment(std::move(rts));
+        push_mutation_fragment(*_schema, _permit, std::move(rts));
    }
    // We add the row to the buffer even when it's full.
    // This simplifies the code. For more info see #3139.
@@ -533,7 +532,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
 inline
 void cache_flat_mutation_reader::move_to_end() {
    finish_reader();
-    clogger.trace("csm {}: eos", this);
+    clogger.trace("csm {}: eos", fmt::ptr(this)); // logged after finish_reader() has already put the reader into its end-of-stream state
 }

 inline
@@ -558,7 +557,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
    _ck_ranges_curr = next_it;
    auto adjacent = _next_row.advance_to(_lower_bound);
    _next_row_in_range = !after_current_range(_next_row.position());
-    clogger.trace("csm {}: move_to_range(), range={}, lb={}, ub={}, next={}", this, *_ck_ranges_curr, _lower_bound, _upper_bound, _next_row.position());
+    clogger.trace("csm {}: move_to_range(), range={}, lb={}, ub={}, next={}", fmt::ptr(this), *_ck_ranges_curr, _lower_bound, _upper_bound, _next_row.position());
    if (!adjacent && !_next_row.continuous()) {
        // FIXME: We don't insert a dummy for singular range to avoid allocating 3 entries
        // for a hit (before, at and after). If we supported the concept of an incomplete row,
@@ -568,7 +567,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
            // Insert dummy for lower bound
            if (can_populate()) {
                // FIXME: _lower_bound could be adjacent to the previous row, in which case we could skip this
-                clogger.trace("csm {}: insert dummy at {}", this, _lower_bound);
+                clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
                    auto& rows = _snp->version()->partition().clustered_rows();
                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
@@ -587,7 +586,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
 // _next_row must be inside the range.
 inline
 void cache_flat_mutation_reader::move_to_next_entry() {
-    clogger.trace("csm {}: move_to_next_entry(), curr={}", this, _next_row.position());
+    clogger.trace("csm {}: move_to_next_entry(), curr={}", fmt::ptr(this), _next_row.position());
    if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
        move_to_next_range();
    } else {
@@ -596,7 +595,7 @@ void cache_flat_mutation_reader::move_to_next_entry() {
            return;
        }
        _next_row_in_range = !after_current_range(_next_row.position());
-        clogger.trace("csm {}: next={}, cont={}, in_range={}", this, _next_row.position(), _next_row.continuous(), _next_row_in_range);
+        clogger.trace("csm {}: next={}, cont={}, in_range={}", fmt::ptr(this), _next_row.position(), _next_row.continuous(), _next_row_in_range);
        if (!_next_row.continuous()) {
            start_reading_from_underlying();
        }
@@ -605,7 +604,7 @@ void cache_flat_mutation_reader::move_to_next_entry() {

 inline
 void cache_flat_mutation_reader::add_to_buffer(mutation_fragment&& mf) {
-    clogger.trace("csm {}: add_to_buffer({})", this, mutation_fragment::printer(*_schema, mf));
+    clogger.trace("csm {}: add_to_buffer({})", fmt::ptr(this), mutation_fragment::printer(*_schema, mf));
    if (mf.is_clustering_row()) {
        add_clustering_row_to_buffer(std::move(mf));
    } else {
@@ -618,7 +617,7 @@ inline
 void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_cursor& row) {
    if (!row.dummy()) {
        _read_context->cache().on_row_hit();
-        add_clustering_row_to_buffer(row.row(_read_context->digest_requested()));
+        add_clustering_row_to_buffer(mutation_fragment(*_schema, _permit, row.row(_read_context->digest_requested())));
    }
 }

@@ -627,7 +626,7 @@ void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_curs
 //   (2) If _lower_bound > mf.position(), mf was emitted
 inline
 void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&& mf) {
-    clogger.trace("csm {}: add_clustering_row_to_buffer({})", this, mutation_fragment::printer(*_schema, mf));
+    clogger.trace("csm {}: add_clustering_row_to_buffer({})", fmt::ptr(this), mutation_fragment::printer(*_schema, mf));
    auto& row = mf.as_clustering_row();
    auto new_lower_bound = position_in_partition::after_key(row.key());
    push_mutation_fragment(std::move(mf));
@@ -637,7 +636,7 @@ void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&

 inline
 void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
-    clogger.trace("csm {}: add_to_buffer({})", this, rt);
+    clogger.trace("csm {}: add_to_buffer({})", fmt::ptr(this), rt);
    // This guarantees that rt starts after any emitted clustering_row
    // and not before any emitted range tombstone.
    position_in_partition::less_compare less(*_schema);
@@ -650,13 +649,13 @@ void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
        _lower_bound = position_in_partition(rt.position());
        _lower_bound_changed = true;
    }
-    push_mutation_fragment(std::move(rt));
+    push_mutation_fragment(*_schema, _permit, std::move(rt));
 }

 inline
 void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
    if (can_populate()) {
-        clogger.trace("csm {}: maybe_add_to_cache({})", this, rt);
+        clogger.trace("csm {}: maybe_add_to_cache({})", fmt::ptr(this), rt);
        _lsa_manager.run_in_update_section_with_allocator([&] {
            _snp->version()->partition().row_tombstones().apply_monotonically(*_schema, rt);
        });
@@ -668,7 +667,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
 inline
 void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
    if (can_populate()) {
-        clogger.trace("csm {}: populate({})", this, static_row::printer(*_schema, sr));
+        clogger.trace("csm {}: populate({})", fmt::ptr(this), static_row::printer(*_schema, sr));
        _read_context->cache().on_static_row_insert();
        _lsa_manager.run_in_update_section_with_allocator([&] {
            if (_read_context->digest_requested()) {
@@ -684,7 +683,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
 inline
 void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
    if (can_populate()) {
-        clogger.trace("csm {}: set static row continuous", this);
+        clogger.trace("csm {}: set static row continuous", fmt::ptr(this));
        _snp->version()->partition().set_static_row_continuous(true);
    } else {
        _read_context->cache().on_mispopulate();
--- a/caching_options.hh
+++ b/caching_options.hh
@@ -23,7 +23,7 @@
 #include <seastar/core/sstring.hh>
 #include <boost/lexical_cast.hpp>
 #include "exceptions/exceptions.hh"
-#include "json.hh"
+#include "utils/rjson.hh"
 #include "seastarx.hh"

 class schema;
@@ -76,7 +76,7 @@ public:
    }

    sstring to_sstring() const {
-        return json::to_json(to_map());
+        return rjson::print(rjson::from_string_map(to_map()));
    }

    static caching_options get_disabled_caching_options() {
@@ -97,13 +97,14 @@ public:
            } else if (p.first == "enabled") {
                e = p.second == "true";
            } else {
-                throw exceptions::configuration_exception("Invalid caching option: " + p.first);
+                throw exceptions::configuration_exception(format("Invalid caching option: {}", p.first));
            }
        }
        return caching_options(k, r, e);
    }
+
    static caching_options from_sstring(const sstring& str) {
-        return from_map(json::to_map(str));
+        return from_map(rjson::parse_to_map<std::map<sstring, sstring>>(str));
    }

    bool operator==(const caching_options& other) const {
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -20,10 +20,16 @@

 #pragma once

+#include <map>
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
 #include "serializer.hh"
 #include "db/extensions.hh"
 #include "cdc/cdc_options.hh"
 #include "schema.hh"
+#include "serializer_impl.hh"

 namespace cdc {

@@ -33,6 +39,7 @@ public:
    static constexpr auto NAME = "cdc";

    cdc_extension() = default;
+    cdc_extension(const options& opts) : _cdc_options(opts) {}
    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {}
    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {}
    explicit cdc_extension(const sstring& s) {
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -27,10 +27,32 @@

 namespace cdc {

+enum class delta_mode : uint8_t {
+    keys,
+    full,
+};
+
+/**
+ * (for now only pre-) image collection mode.
+ * Stating how much info to record.
+ * off == none
+ * on == changed columns
+ * full == all (changed and unmodified columns)
+ */
+enum class image_mode : uint8_t {
+    off, 
+    on,
+    full,
+};
+
+std::ostream& operator<<(std::ostream& os, delta_mode);
+std::ostream& operator<<(std::ostream& os, image_mode);
+
 class options final {
    bool _enabled = false;
-    bool _preimage = false;
+    image_mode _preimage = image_mode::off;
    bool _postimage = false;
+    delta_mode _delta_mode = delta_mode::full;
    int _ttl = 86400; // 24h in seconds
 public:
    options() = default;
@@ -40,10 +62,19 @@ public:
    sstring to_sstring() const;

    bool enabled() const { return _enabled; }
-    bool preimage() const { return _preimage; }
+    bool preimage() const { return _preimage != image_mode::off; }
+    bool full_preimage() const { return _preimage == image_mode::full; }
    bool postimage() const { return _postimage; }
+    delta_mode get_delta_mode() const { return _delta_mode; }
+    void set_delta_mode(delta_mode m) { _delta_mode = m; }
    int ttl() const { return _ttl; }

+    void enabled(bool b) { _enabled = b; }
+    void preimage(bool b) { preimage(b ? image_mode::on : image_mode::off); }
+    void preimage(image_mode m) { _preimage = m; }
+    void postimage(bool b) { _postimage = b; }
+    void ttl(int v) { _ttl = v; }
+
    bool operator==(const options& o) const;
    bool operator!=(const options& o) const;
 };
--- a/cdc/change_visitor.hh
+++ b/cdc/change_visitor.hh
@@ -0,0 +1,283 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mutation.hh"
+
+/*
+ * This file contains a general abstraction for walking over mutations,
+ * deconstructing them into ``atomic'' pieces, and consuming these pieces.
+ *
+ * The pieces considered atomic are:
+ * - atomic_cells, either in collections or in atomic columns
+ *   (see `live_collection_cell`, `dead_collection_cell`, `live_atomic_cell`, `dead_atomic_cell`),
+ * - collection tombstones (see `collection_tombstone`)
+ * - row markers (see `marker`)
+ * - row tombstones (see `clustered_row_delete`),
+ * - range tombstones (see `range_delete`),
+ * - partition tombstones (see `partition_delete`).
+ * We use the term ``changes'' to refer to these atomic pieces, hence the name ``ChangeVisitor''.
+ *
+ * IMPORTANT: this doesn't understand all possible states that a mutation can have, e.g. it doesn't understand
+ * the concept of ``continuity''. However, it is sufficient for analyzing mutations created by a write coordinator,
+ * e.g. obtained by parsing a CQL statement.
+ *
+ * To analyze a mutation, create a visitor (described by the `ChangeVisitor` concept below) and pass it
+ * together with the mutation to `inspect_mutation`.
+ *
+ * To analyze certain fragments of the mutation, the inspecting code requires further visitors to be passed.
+ * For example, when it encounters a clustered row update, it calls `clustered_row_cells` on the visitor,
+ * passing it the row's key and the callback. The visitor can then decide:
+ * - if it's not interested in the row's cells, it can simply not call the callback,
+ * - otherwise, it can call the callback with a value of a type that satisfies the ``RowCellsVisitor'' concept.
+ * If the callback is called, the inspector walks over the row and passes the changes into the ``row cells visitor''.
+ * In either case, it will then proceed to analyze further parts of the mutation, if any.
+ *
+ * Note that the type passed to the callbacks provided by the inspector (such as in the example above)
+ * can be decided at runtime. This can be especially useful with the callback passed to `collection_column`
+ * in RowCellsVisitor, if different collection types require different logic to handle.
+ *
+ * The dummy visitors below are there only to define the concepts.
+ * For example, in the RowCellsVisitor concept I wanted to express that `visit_collection` in RowCellsVisitor
+ * is a function that handles *any* type which satisfies CollectionVisitor. I didn't find a way to do that
+ * other than providing a ``most generic'' concrete type which satisfies the interface (`dummy_collection_visitor`).
+ * Unfortunately C++ is still not Haskell.
+ *
+ * The inspector calls `finished()` after visiting each change, and sometimes before (e.g. when it starts
+ * visiting a static row, but before it visits any of its cells). If it returns true, the inspector
+ * will stop the visitation. Thus, if at any point during the walk the visitor decides it's not interested
+ * in any more changes, it can inform the inspector by returning `true` from `finished()`.
+ *
+ * IMPORTANT: if the visitor returns `true` from `finished()`, it should keep returning `true`. This is because
+ * the inspector may call `finished()` multiple times when exiting some nested loops.
+ *
+ * The order of visitation is as follows:
+ * - First the static row is visited, if it has any cells.
+ *   Within the row, its columns are visited in order of increasing column IDs.
+ *
+ * - Then, for each clustering key, if a change (row marker, cell, or tombstone) exists for this key:
+ *   - The row marker is visited, if there is one.
+ *   - Columns are visited in order of increasing column IDs.
+ *   - The row tombstone is visited, if there is one.
+ *
+ * For both the static row and a clustering row, for each column:
+ * - If the column is atomic, a corresponding atomic_cell is visited (if there is one).
+ * - Otherwise (the column is non-atomic):
+ *   - The collection tombstone is visited first.
+ *   - Cells are visited in order of increasing keys
+ *     (assuming that the mutation was correctly constructed, i.e. it stores cells in key order).
+ *
+ * WARNING: visited collection tombstone and cells
+ * are guaranteed to live only for the duration of `collection_column` call.
+ *
+ * - Then range tombstones are visited. The order is unspecified
+ *   (more accurately: if it's specified, I don't know what it is)
+ *
+ * - Finally, the partition tombstone is visited, if it exists.
+ */
+
+namespace cdc {
+
+template <typename V>
+concept CollectionVisitor = requires(V v,
+        const tombstone& t,
+        bytes_view key,
+        const atomic_cell_view& cell) {
+
+    { v.collection_tombstone(t) }         -> std::same_as<void>;
+    { v.live_collection_cell(key, cell) } -> std::same_as<void>;
+    { v.dead_collection_cell(key, cell) } -> std::same_as<void>;
+    { v.finished() } -> std::same_as<bool>;
+};
+
+struct dummy_collection_visitor {
+    void collection_tombstone(const tombstone&) {}
+    void live_collection_cell(bytes_view, const atomic_cell_view&) {}
+    void dead_collection_cell(bytes_view, const atomic_cell_view&) {}
+    bool finished() { return false; }
+};
+
+template <typename V>
+concept RowCellsVisitor = requires(V v,
+        const column_definition& cdef,
+        const atomic_cell_view& cell,
+        noncopyable_function<void(dummy_collection_visitor&)> visit_collection) {
+
+    { v.live_atomic_cell(cdef, cell) }                         -> std::same_as<void>;
+    { v.dead_atomic_cell(cdef, cell) }                         -> std::same_as<void>;
+    { v.collection_column(cdef, std::move(visit_collection)) } -> std::same_as<void>;
+    { v.finished() }                                           -> std::same_as<bool>;
+};
+
+struct dummy_row_cells_visitor {
+    void live_atomic_cell(const column_definition&, const atomic_cell_view&) {}
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view&) {}
+    void collection_column(const column_definition&, auto&& visit_collection) {
+        dummy_collection_visitor v;
+        visit_collection(v);
+    }
+    bool finished() { return false; }
+};
+
+template <typename V>
+concept ClusteredRowCellsVisitor = requires(V v,
+        const row_marker& rm) {
+    requires RowCellsVisitor<V>;
+    { v.marker(rm) } -> std::same_as<void>;
+};
+
+struct dummy_clustered_row_cells_visitor : public dummy_row_cells_visitor {
+    void marker(const row_marker&) {}
+};
+
+template <typename V>
+concept ChangeVisitor = requires(V v,
+        api::timestamp_type ts,
+        const clustering_key& ckey,
+        const range_tombstone& rt,
+        const tombstone& t,
+        noncopyable_function<void(dummy_clustered_row_cells_visitor&)> visit_clustered_row_cells,
+        noncopyable_function<void(dummy_row_cells_visitor&)> visit_row_cells) {
+
+    { v.static_row_cells(std::move(visit_row_cells)) }                    -> std::same_as<void>;
+    { v.clustered_row_cells(ckey, std::move(visit_clustered_row_cells)) } -> std::same_as<void>;
+    { v.clustered_row_delete(ckey, t) }                                   -> std::same_as<void>;
+    { v.range_delete(rt) }                                                -> std::same_as<void>;
+    { v.partition_delete(t) }                                             -> std::same_as<void>;
+    { v.finished() }                                                      -> std::same_as<bool>;
+};
+
+template <RowCellsVisitor V>
+void inspect_row_cells(const schema& s, column_kind ckind, const row& r, V& v) {
+    r.for_each_cell_until([&s, ckind, &v] (column_id id, const atomic_cell_or_collection& acoc) {
+        auto& cdef = s.column_at(ckind, id);
+
+        if (cdef.is_atomic()) {
+            auto cell = acoc.as_atomic_cell(cdef);
+            if (cell.is_live()) {
+                v.live_atomic_cell(cdef, cell);
+            } else {
+                v.dead_atomic_cell(cdef, cell);
+            }
+
+            return stop_iteration(v.finished());
+        }
+
+        acoc.as_collection_mutation().with_deserialized(*cdef.type, [&v, &cdef] (collection_mutation_view_description view) {
+            v.collection_column(cdef, [&view] (CollectionVisitor auto& cv) {
+                if (cv.finished()) {
+                    return;
+                }
+
+                if (view.tomb) {
+                    cv.collection_tombstone(view.tomb);
+                    if (cv.finished()) {
+                        return;
+                    }
+                }
+
+                for (auto& [key, cell]: view.cells) {
+                    if (cell.is_live()) {
+                        cv.live_collection_cell(key, cell);
+                    } else {
+                        cv.dead_collection_cell(key, cell);
+                    }
+
+                    if (cv.finished()) {
+                        return;
+                    }
+                }
+            });
+        });
+
+        return stop_iteration(v.finished());
+    });
+}
+
+template <ChangeVisitor V>
+void inspect_mutation(const mutation& m, V& v) {
+    auto& p = m.partition();
+    auto& s = *m.schema();
+
+    if (!p.static_row().empty()) {
+        v.static_row_cells([&s, &p] (RowCellsVisitor auto& srv) {
+            if (srv.finished()) {
+                return;
+            }
+            inspect_row_cells(s, column_kind::static_column, p.static_row().get(), srv);
+        });
+
+        if (v.finished()) {
+            return;
+        }
+    }
+
+    for (auto& cr: p.clustered_rows()) {
+        auto& r = cr.row();
+
+        if (r.marker().is_live() || !r.cells().empty()) {
+            v.clustered_row_cells(cr.key(), [&s, &r] (ClusteredRowCellsVisitor auto& crv) {
+                if (crv.finished()) {
+                    return;
+                }
+
+                auto& rm = r.marker();
+                if (rm.is_live()) {
+                    crv.marker(rm);
+
+                    if (crv.finished()) {
+                        return;
+                    }
+                }
+
+                inspect_row_cells(s, column_kind::regular_column, r.cells(), crv);
+            });
+
+            if (v.finished()) {
+                return;
+            }
+        }
+
+        if (r.deleted_at()) {
+            auto t = r.deleted_at().tomb();
+            assert(t.timestamp != api::missing_timestamp);
+            v.clustered_row_delete(cr.key(), t);
+            if (v.finished()) {
+                return;
+            }
+        }
+    }
+
+    for (auto& rt: p.row_tombstones()) {
+        assert(rt.tomb.timestamp != api::missing_timestamp);
+        v.range_delete(rt);
+        if (v.finished()) {
+            return;
+        }
+    }
+
+    if (p.partition_tombstone()) {
+        v.partition_delete(p.partition_tombstone());
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -23,6 +23,7 @@
 #include <random>
 #include <unordered_set>
 #include <seastar/core/sleep.hh>
+#include <algorithm>

 #include "keys.hh"
 #include "schema_builder.hh"
@@ -59,14 +60,57 @@ static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset);
 }

-stream_id::stream_id(int64_t first, int64_t second)
+static constexpr auto stream_id_version_bits = 4;
+static constexpr auto stream_id_random_bits = 38;
+static constexpr auto stream_id_index_bits = sizeof(uint64_t)*8 - stream_id_version_bits - stream_id_random_bits;
+
+static constexpr auto stream_id_version_shift = 0;
+static constexpr auto stream_id_index_shift = stream_id_version_shift + stream_id_version_bits;
+static constexpr auto stream_id_random_shift = stream_id_index_shift + stream_id_index_bits;
+
+/**
+ * Responsibility for encoding stream_id moved from factory method to
+ * this constructor, to keep knowledge of composition in a single place.
+ * Note this is private and friended to topology_description_generator,
+ * because it is the one that defines the "order" in which we view vnodes etc.
+ */
+stream_id::stream_id(dht::token token, size_t vnode_index)
    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
 {
-    copy_int_to_bytes(first, 0, _value);
-    copy_int_to_bytes(second, sizeof(int64_t), _value);
+    static thread_local std::mt19937_64 rand_gen(std::random_device{}());
+    static thread_local std::uniform_int_distribution<uint64_t> rand_dist;
+
+    auto rand = rand_dist(rand_gen);
+    auto mask_shift = [](uint64_t val, size_t bits, size_t shift) {
+        return (val & ((1ull << bits) - 1u)) << shift;
+    };
+    /**
+     *  Low qword:
+     * 0-3: version
+     * 4-25: vnode index as when created (see generation below). This excludes shards
+     * 26-63: random value (maybe to be replaced with timestamp)
+     */
+    auto low_qword = mask_shift(version_1, stream_id_version_bits, stream_id_version_shift)
+        | mask_shift(vnode_index, stream_id_index_bits, stream_id_index_shift)
+        | mask_shift(rand, stream_id_random_bits, stream_id_random_shift)
+        ;
+
+    copy_int_to_bytes(dht::token::to_int64(token), 0, _value);
+    copy_int_to_bytes(low_qword, sizeof(int64_t), _value);
+    // not a hot code path. make sure we did not mess up the shifts and masks.
+    assert(version() == version_1);
+    assert(index() == vnode_index);
 }

-stream_id::stream_id(bytes b) : _value(std::move(b)) { }
+stream_id::stream_id(bytes b)
+    : _value(std::move(b))
+{
+    // This is not a very solid check: ids created before GA (i.e. before
+    // versioned ids) have fully random bits in the low qword, so it could go either way...
+    if (version() > version_1) {
+        throw std::invalid_argument("Unknown CDC stream id version");
+    }
+}

 bool stream_id::is_set() const {
    return !_value.empty();
@@ -76,6 +120,10 @@ bool stream_id::operator==(const stream_id& o) const {
    return _value == o._value;
 }

+bool stream_id::operator!=(const stream_id& o) const {
+    return !(*this == o);
+}
+
 bool stream_id::operator<(const stream_id& o) const {
    return _value < o._value;
 }
@@ -87,18 +135,26 @@ static int64_t bytes_to_int64(bytes_view b, size_t offset) {
    return net::ntoh(res);
 }

-int64_t stream_id::first() const {
-    return token_from_bytes(_value);
-}
-
-int64_t stream_id::second() const {
-    return bytes_to_int64(_value, sizeof(int64_t));
+dht::token stream_id::token() const {
+    return dht::token::from_int64(token_from_bytes(_value));
 }

 int64_t stream_id::token_from_bytes(bytes_view b) {
    return bytes_to_int64(b, 0);
 }

+static uint64_t unpack_value(bytes_view b, size_t off, size_t shift, size_t bits) {
+    return (uint64_t(bytes_to_int64(b, off)) >> shift) & ((1ull << bits) - 1u);
+}
+
+uint8_t stream_id::version() const {
+    return unpack_value(_value, sizeof(int64_t), stream_id_version_shift, stream_id_version_bits);
+}
+
+size_t stream_id::index() const {
+    return unpack_value(_value, sizeof(int64_t), stream_id_index_shift, stream_id_index_bits);
+}
+
 const bytes& stream_id::to_bytes() const {
    return _value;
 }
@@ -119,15 +175,27 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-static stream_id create_stream_id(dht::token t) {
-    static thread_local std::mt19937_64 rand_gen(std::random_device().operator()());
-    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
+std::vector<token_range_description>&& topology_description::entries() && {
+    return std::move(_entries);
+}

-    return {dht::token::to_int64(t), rand_dist(rand_gen)};
+static std::vector<stream_id> create_stream_ids(
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    std::vector<stream_id> result;
+    result.reserve(shard_count);
+    dht::sharder sharder(shard_count, ignore_msb);
+    for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+        auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+        // compose the id from token and the "index" of the range end owning vnode
+        // as defined by token sort order. Basically grouping within this
+        // shard set.
+        result.emplace_back(stream_id(t, index));
+    }
+    return result;
 }

 class topology_description_generator final {
@@ -150,7 +218,7 @@ class topology_description_generator final {
    // Fetch sharding parameters for a node that owns vnode ending with this.end
    // Returns <shard_count, ignore_msb> pair.
    std::pair<size_t, uint8_t> get_sharding_info(dht::token end) const {
-        if (_bootstrap_tokens.count(end) > 0) {
+        if (_bootstrap_tokens.contains(end)) {
            return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
        } else {
            auto endpoint = _token_metadata.get_endpoint(end);
@@ -163,21 +231,15 @@ class topology_description_generator final {
        }
    }

-    token_range_description create_description(dht::token start, dht::token end) const {
+    token_range_description create_description(size_t index, dht::token start, dht::token end) const {
        token_range_description desc;

        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            desc.streams.push_back(create_stream_id(t));
-        }
-
        return desc;
    }
 public:
@@ -213,10 +275,10 @@ public:
        vnode_descriptions.reserve(tokens.size());

        vnode_descriptions.push_back(
-                create_description(tokens.back(), tokens.front()));
+                create_description(0, tokens.back(), tokens.front()));
        for (size_t idx = 1; idx < tokens.size(); ++idx) {
            vnode_descriptions.push_back(
-                    create_description(tokens[idx - 1], tokens[idx]));
+                    create_description(idx, tokens[idx - 1], tokens[idx]));
        }

        return {std::move(vnode_descriptions)};
@@ -243,6 +305,38 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// non-static for testing
+size_t limit_of_streams_in_topology_description() {
+    // Each stream takes 16B and we don't want to exceed 4MB so we can have
+    // at most 262144 streams but not less than 1 per vnode.
+    return 4 * 1024 * 1024 / 16;
+}
+
+// non-static for testing
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
+    int64_t streams_count = 0;
+    for (auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+
+    size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
+    if (limit >= size_t(streams_count)) {
+        return std::move(desc);
+    }
+    size_t streams_per_vnode_limit = limit / desc.entries().size();
+    auto entries = std::move(desc).entries();
+    auto start = entries.back().token_range_end;
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) {
+            entries[idx].streams =
+                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end;
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
@@ -252,12 +346,25 @@ db_clock::time_point make_new_cdc_generation(
        db::system_distributed_keyspace& sys_dist_ks,
        std::chrono::milliseconds ring_delay,
        bool for_testing) {
+    using namespace std::chrono;
    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();

+    // If the cluster is large we may end up with a generation that contains
+    // a large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with a large number of
+    // streams this will lead to a row that can be as big as 32MB. This is much
+    // more than the limit imposed by commitlog_segment_size_in_mb. If the size
+    // of the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid this problem we make sure that the row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));
+
    // Begin the race.
    auto ts = db_clock::now() + (
-            for_testing ? std::chrono::milliseconds(0) : (
-                2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway)));
+            (for_testing || ring_delay == milliseconds(0)) ? milliseconds(0) : (
+                2 * ring_delay + duration_cast<milliseconds>(generation_leeway)));
    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();

    return ts;
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -64,17 +64,21 @@ namespace cdc {
 class stream_id final {
    bytes _value;
 public:
+    static constexpr uint8_t version_1 = 1;
+
    stream_id() = default;
-    stream_id(int64_t, int64_t);
    stream_id(bytes);
+    stream_id(dht::token, size_t);
+
    bool is_set() const;
    bool operator==(const stream_id&) const;
+    bool operator!=(const stream_id&) const;
    bool operator<(const stream_id&) const;

-    int64_t first() const;
-    int64_t second() const;
-
+    uint8_t version() const;
+    size_t index() const;
    const bytes& to_bytes() const;
+    dht::token token() const;

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
@@ -110,7 +114,25 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
+};
+
+/**
+ * The set of streams for a single topology version/generation
+ * I.e. the stream ids at a given time. 
+ */ 
+class streams_version {
+public:
+    std::vector<stream_id> streams;
+    db_clock::time_point timestamp;
+    std::optional<db_clock::time_point> expired;
+
+    streams_version(std::vector<stream_id> s, db_clock::time_point ts, std::optional<db_clock::time_point> exp)
+        : streams(std::move(s))
+        , timestamp(ts)
+        , expired(std::move(exp))
+    {}
 };

 /* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
@@ -130,8 +152,8 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
 */
 future<db_clock::time_point> get_local_streams_timestamp();

-/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
- * Returns the timestamp of this new generation.
+/* Generate a new set of CDC streams and insert it into the distributed cdc_generation_descriptions table.
+ * Returns the timestamp of this new generation.
 *
 * Should be called when starting the node for the first time (i.e., joining the ring).
 *
@@ -161,7 +183,7 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
 /* Inform CDC users about a generation of streams (identified by the given timestamp)
 * by inserting it into the cdc_streams table.
 *
- * Assumes that the cdc_generations table contains this generation.
+ * Assumes that the cdc_generation_descriptions table contains this generation.
 *
 * Returning from this function does not mean that the table update was successful: the function
 * might run an asynchronous task in the background.
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -41,7 +41,6 @@
 #include "exceptions/exceptions.hh"
 #include "timestamp.hh"
 #include "tracing/trace_state.hh"
-#include "cdc_options.hh"
 #include "utils/UUID.hh"

 class schema;
@@ -63,6 +62,7 @@ class query_state;

 class mutation;
 class partition_key;
+class database;

 namespace cdc {

@@ -100,19 +100,19 @@ public:
 struct db_context final {
    service::storage_proxy& _proxy;
    service::migration_notifier& _migration_notifier;
-    locator::token_metadata& _token_metadata;
+    const locator::token_metadata& _token_metadata;
    cdc::metadata& _cdc_metadata;

    class builder final {
        service::storage_proxy& _proxy;
        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
+        std::optional<std::reference_wrapper<const locator::token_metadata>> _token_metadata;
        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
    public:
        builder(service::storage_proxy& proxy);

        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(locator::token_metadata& token_metadata);
+        builder& with_token_metadata(const locator::token_metadata& token_metadata);
        builder& with_cdc_metadata(cdc::metadata&);

        db_context build();
@@ -129,7 +129,12 @@ enum class operation : int8_t {
 };

 bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
-seastar::sstring log_name(const seastar::sstring& table_name);
+
+schema_ptr get_base_table(const database&, const schema&);
+schema_ptr get_base_table(const database&, sstring_view, std::string_view);
+
+seastar::sstring base_name(std::string_view log_name);
+seastar::sstring log_name(std::string_view table_name);
 seastar::sstring log_data_column_name(std::string_view column_name);
 seastar::sstring log_meta_column_name(std::string_view column_name);
 bytes log_data_column_name_bytes(const bytes& column_name);
@@ -141,6 +146,8 @@ bytes log_data_column_deleted_name_bytes(const bytes& column_name);
 seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
 bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);

+bool is_cdc_metacolumn_name(const sstring& name);
+
 utils::UUID generate_timeuuid(api::timestamp_type t);

 } // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
@@ -77,6 +78,12 @@ cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::times
    return std::prev(it);
 }

+bool cdc::metadata::streams_available() const {
+    // True when gen_used_at() finds a generation for the current timestamp.
+    const auto current_gen = gen_used_at(api::new_timestamp());
+    return current_gen != _gens.end();
+}
+
 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
    if (ts > now + generation_leeway.count()) {
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -57,6 +57,10 @@ public:
    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
    bool known_or_obsolete(db_clock::time_point) const;

+    /* Are there streams available, i.e. valid for time == now? If this is false, any writes to
+     * CDC logs will fail fast.
+     */
+    bool streams_available() const;
    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
     * according to the generation used at time `ts` (i.e, the latest generation whose timestamp is less or equal to `ts`).
     *
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -22,8 +22,14 @@
 #include "mutation.hh"
 #include "schema.hh"

+#include "concrete_types.hh"
+#include "types/user.hh"
+
 #include "split.hh"
 #include "log.hh"
+#include "change_visitor.hh"
+
+#include <type_traits>

 struct atomic_column_update {
    column_id id;
@@ -70,6 +76,37 @@ struct partition_deletion {
    tombstone t;
 };

+using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
+
+template<typename Container>
+concept EntryContainer = requires(Container& container) { // requires `atomic_entries` and `nonatomic_entries` members of exactly these vector types
+    // Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
+    { (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
+    { (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
+};
+
+template<EntryContainer Container>
+static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
+    // Flag in `cset` the column id of every atomic and then every non-atomic entry held by `cont`.
+    const auto flag_all = [&cset] (const auto& entries) {
+        for (const auto& e : entries) { cset.set(e.id); }
+    };
+    flag_all(cont.atomic_entries);
+    flag_all(cont.nonatomic_entries);
+}
+
+/* Given a mutation with multiple timestamps/ttl/types of changes, we split it into multiple mutations
+ * before passing it into `process_change` (see comment above `should_split_visitor` for more details).
+ *
+ * The first step of the splitting is to walk over the mutation and put each change into an appropriate bucket
+ * (see `batch`). The buckets are sorted by timestamps (see `set_of_changes`), and within each bucket,
+ * the changes are split according to their types (`static_updates`, `clustered_inserts`, and so on).
+ * Within each type, the changes are sorted w.r.t TTLs. Changes without a TTL are treated as if they had TTL = 0.
+ *
+ * The function that puts changes into buckets is called `extract_changes`. Underneath, it uses
+ * `extract_changes_visitor`, `extract_collection_visitor` and `extract_row_visitor`.
+ */
+
 struct batch {
    std::vector<static_row_update> static_updates;
    std::vector<clustered_row_insert> clustered_inserts;
@@ -77,6 +114,40 @@ struct batch {
    std::vector<clustered_row_deletion> clustered_row_deletions;
    std::vector<clustered_range_deletion> clustered_range_deletions;
    std::optional<partition_deletion> partition_deletions;
+
+    clustered_column_set get_affected_clustered_columns_per_row(const schema& s) const { // maps each clustering key touched by this batch to the regular columns affected in that row
+        clustered_column_set ret{clustering_key::less_compare(s)};
+
+        if (!clustered_row_deletions.empty()) {
+            // When deleting a row, all columns are affected
+            cdc::one_kind_column_set all_columns{s.regular_columns_count()};
+            all_columns.set(0, s.regular_columns_count(), true);
+            for (const auto& change : clustered_row_deletions) {
+                ret.insert(std::make_pair(change.key, all_columns));
+            }
+        }
+        // Inserts and updates only add the columns named by their own entries to the row's set.
+        auto process_change_type = [&] (const auto& changes) {
+            for (const auto& change : changes) {
+                auto& cset = ret[change.key];
+                cset.resize(s.regular_columns_count());
+                add_columns_affected_by_entries(cset, change);
+            }
+        };
+
+        process_change_type(clustered_inserts);
+        process_change_type(clustered_updates);
+
+        return ret;
+    }
+
+    cdc::one_kind_column_set get_affected_static_columns(const schema& s) const {
+        // Start from a set sized to the schema's static column count, then let every
+        // static update in this batch mark the columns its entries refer to.
+        cdc::one_kind_column_set affected{s.static_columns_count()};
+        for (const auto& upd : static_updates) { add_columns_affected_by_entries(affected, upd); }
+        return affected;
+    }
 };

 using set_of_changes = std::map<api::timestamp_type, batch>;
@@ -86,100 +157,179 @@ struct row_update {
    std::vector<nonatomic_column_update> nonatomic_entries;
 };

-static
-std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
-extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
-    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> result;
-    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        auto& cdef = schema.column_at(ckind, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            auto timestamp_and_ttl = std::pair(
-                    view.timestamp(),
-                    view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0)
-                );
-            result[timestamp_and_ttl].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
-            return;
+static gc_clock::duration get_ttl(const atomic_cell_view& acv) { // TTL of a cell; 0 unless is_live_and_has_ttl() holds
+    return acv.is_live_and_has_ttl() ? acv.ttl() : gc_clock::duration(0);
+}
+// TTL of a row marker; 0 unless the marker is expiring.
+static gc_clock::duration get_ttl(const row_marker& rm) {
+    return rm.is_expiring() ? rm.ttl() : gc_clock::duration(0);
+}
+
+using change_key_t = std::pair<api::timestamp_type, gc_clock::duration>;
+
+/* Visits the cells and tombstone of a collection, putting the encountered changes into buckets
+ * sorted by timestamp first and ttl second (see `_updates`).
+ */
+template <typename V>
+struct extract_collection_visitor {
+private:
+    const column_id _id;
+    std::map<change_key_t, row_update>& _updates;
+
+    nonatomic_column_update& get_or_append_entry(api::timestamp_type ts, gc_clock::duration ttl) {
+        auto& updates = this->_updates[std::pair(ts, ttl)].nonatomic_entries;
+        if (updates.empty() || updates.back().id != _id) {
+            updates.push_back({_id});
        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                auto timestamp_and_ttl = std::pair(
-                        v.timestamp(),
-                        v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0)
-                    );
-                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
-                if (updates.empty() || updates.back().id != id) {
-                    updates.push_back({id, {}});
-                }
-                updates.back().cells.push_back({std::move(k), std::move(v)});
-            }
-
-            if (desc.tomb) {
-                auto timestamp_and_ttl = std::pair(desc.tomb.timestamp + 1, gc_clock::duration(0));
-                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
-                if (updates.empty() || updates.back().id != id) {
-                    updates.push_back({id, {}});
-                }
-                updates.back().t = std::move(desc.tomb);
-            }
-        });
-    });
-    return result;
-};
-
-set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
-    set_of_changes res;
-    auto& p = base_mutation.partition();
-
-    auto sr_updates = extract_row_updates(p.static_row().get(), column_kind::static_column, base_schema);
-    for (auto& [k, up]: sr_updates) {
-        auto [timestamp, ttl] = k;
-        res[timestamp].static_updates.push_back({
-                ttl,
-                std::move(up.atomic_entries),
-                std::move(up.nonatomic_entries)
-            });
+        return updates.back();
    }

-    for (const rows_entry& cr : p.clustered_rows()) {
-        auto cr_updates = extract_row_updates(cr.row().cells(), column_kind::regular_column, base_schema);
+    /* To copy a value from a collection/non-frozen UDT (in order to put it into a bucket) we need to know the value's type.
+     * The method of obtaining the type depends on the collection type; in particular, for non-frozen UDT, each value
+     * might have a different type, thus in general we need a method that, given a key (identifying the value in the collection),
+     * returns the value's type.
+     *
+     * We use the `Curiously Recurring Template Pattern' to avoid performing a dynamic dispatch on the collection's type for each visited cell.
+     * Instead we perform a single dynamic dispatch at the beginning, when encountering the collection column;
+     * the dispatch provides us with a correct `get_value_type` method.
+     * See `extract_row_visitor::collection_column` where the dispatch is done.

-        const auto& marker = cr.row().marker();
-        auto marker_timestamp = marker.timestamp();
-        auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
-        if (marker.is_live()) {
-            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
-            (void)cr_updates[std::pair(marker_timestamp, marker_ttl)];
+    data_type get_value_type(bytes_view);
+    */
+
+    void cell(bytes_view key, const atomic_cell_view& c) {
+        auto& entry = get_or_append_entry(c.timestamp(), get_ttl(c));
+        entry.cells.emplace_back(to_bytes(key), atomic_cell(*static_cast<V&>(*this).get_value_type(key), c));
+    }
+
+public:
+    extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
+        : _id(id), _updates(updates) {}
+
+    void collection_tombstone(const tombstone& t) {
+        auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0));
+        entry.t = t;
+    }
+
+    void live_collection_cell(bytes_view key, const atomic_cell_view& c) {
+        cell(key, c);
+    }
+
+    void dead_collection_cell(bytes_view key, const atomic_cell_view& c) {
+        cell(key, c);
+    }
+
+    constexpr bool finished() const { return false; }
+};
+
+/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
+ * sorted by timestamp first and ttl second (see `_updates`).
+ */
+struct extract_row_visitor {
+    std::map<change_key_t, row_update> _updates;
+
+    void cell(const column_definition& cdef, const atomic_cell_view& cell) {
+        _updates[std::pair(cell.timestamp(), get_ttl(cell))].atomic_entries.push_back({cdef.id, atomic_cell(*cdef.type, cell)});
+    }
+
+    void live_atomic_cell(const column_definition& cdef, const atomic_cell_view& c) {
+        cell(cdef, c);
+    }
+
+    void dead_atomic_cell(const column_definition& cdef, const atomic_cell_view& c) {
+        cell(cdef, c);
+    }
+
+    void collection_column(const column_definition& cdef, auto&& visit_collection) { // buckets the cells/tombstone of one non-atomic column into _updates
+        visit(*cdef.type, make_visitor( // dispatch once on the column's type; the chosen visitor then handles every cell
+        [&] (const collection_type_impl& ctype) { // collection: every value has the type ctype.value_comparator()
+            struct collection_visitor : public extract_collection_visitor<collection_visitor> {
+                data_type _value_type;
+
+                collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
+                    : extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
+
+                data_type get_value_type(bytes_view) {
+                    return _value_type;
+                }
+            } v(cdef.id, _updates, ctype);
+
+            visit_collection(v);
+        },
+        [&] (const user_type_impl& utype) { // user type: the value type is looked up per field index decoded from the key
+            struct udt_visitor : public extract_collection_visitor<udt_visitor> {
+                const user_type_impl& _utype;
+
+                udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
+                    : extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
+
+                data_type get_value_type(bytes_view key) {
+                    return _utype.type(deserialize_field_index(key));
+                }
+            } v(cdef.id, _updates, utype);
+
+            visit_collection(v);
+        },
+        [&] (const abstract_type& o) { // any other type is rejected; the message now includes the type name
+            throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
        }
+        ));
+    }

-        auto is_insert = [&] (api::timestamp_type timestamp, gc_clock::duration ttl) {
-            if (!marker.is_live()) {
-                return false;
+    constexpr bool finished() const { return false; }
+};
+
+struct extract_changes_visitor {
+    set_of_changes _result;
+
+    void static_row_cells(auto&& visit_row_cells) {
+        extract_row_visitor v;
+        visit_row_cells(v);
+
+        for (auto& [ts_ttl, row_update]: v._updates) {
+            _result[ts_ttl.first].static_updates.push_back({
+                ts_ttl.second,
+                std::move(row_update.atomic_entries),
+                std::move(row_update.nonatomic_entries)
+            });
+        }
+    }
+
+    void clustered_row_cells(const clustering_key& ckey, auto&& visit_row_cells) {
+        struct clustered_cells_visitor : public extract_row_visitor {
+            api::timestamp_type _marker_ts;
+            gc_clock::duration _marker_ttl;
+            std::optional<row_marker> _marker;
+
+            void marker(const row_marker& rm) {
+                _marker_ts = rm.timestamp();
+                _marker_ttl = get_ttl(rm);
+                _marker = rm;
+
+                // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
+                (void)_updates[std::pair(_marker_ts, _marker_ttl)];
            }
+        } v;
+        visit_row_cells(v);

-            return timestamp == marker_timestamp && ttl == marker_ttl;
-        };
-
-        for (auto& [k, up]: cr_updates) {
+        for (auto& [ts_ttl, row_update]: v._updates) {
            // It is important that changes in the resulting `set_of_changes` are listed
            // in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
            // search for "#6070".
-            auto [timestamp, ttl] = k;
+            auto [ts, ttl] = ts_ttl;

-            if (is_insert(timestamp, ttl)) {
-                res[timestamp].clustered_inserts.push_back({
+            if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
+                _result[ts].clustered_inserts.push_back({
                        ttl,
-                        cr.key(),
-                        marker,
-                        std::move(up.atomic_entries),
+                        ckey,
+                        *v._marker,
+                        std::move(row_update.atomic_entries),
                        {}
                    });

-                auto& cr_insert = res[timestamp].clustered_inserts.back();
+                auto& cr_insert = _result[ts].clustered_inserts.back();
                bool clustered_update_exists = false;
-                for (auto& nonatomic_up: up.nonatomic_entries) {
+                for (auto& nonatomic_up: row_update.nonatomic_entries) {
                    // Updating a collection column with an INSERT statement implies inserting a tombstone.
                    //
                    // For example, suppose that we have:
@@ -205,9 +355,9 @@ set_of_changes extract_changes(const mutation& base_mutation, const schema& base
                        cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
                    } else {
                        if (!clustered_update_exists) {
-                            res[timestamp].clustered_updates.push_back({
+                            _result[ts].clustered_updates.push_back({
                                ttl,
-                                cr.key(),
+                                ckey,
                                {},
                                {}
                            });
@@ -228,201 +378,239 @@ set_of_changes extract_changes(const mutation& base_mutation, const schema& base
                            clustered_update_exists = true;
                        }

-                        auto& cr_update = res[timestamp].clustered_updates.back();
+                        auto& cr_update = _result[ts].clustered_updates.back();
                        cr_update.nonatomic_entries.push_back(std::move(nonatomic_up));
                    }
                }
            } else {
-                res[timestamp].clustered_updates.push_back({
+                _result[ts].clustered_updates.push_back({
                        ttl,
-                        cr.key(),
-                        std::move(up.atomic_entries),
-                        std::move(up.nonatomic_entries)
+                        ckey,
+                        std::move(row_update.atomic_entries),
+                        std::move(row_update.nonatomic_entries)
                    });
            }
        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            res[row_tomb.timestamp].clustered_row_deletions.push_back({cr.key(), row_tomb});
-        }
    }

-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb.timestamp != api::missing_timestamp) {
-            res[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
-        }
+    void clustered_row_delete(const clustering_key& ckey, const tombstone& t) {
+        _result[t.timestamp].clustered_row_deletions.push_back({ckey, t});
    }

-    auto partition_tomb_timestamp = p.partition_tombstone().timestamp;
-    if (partition_tomb_timestamp != api::missing_timestamp) {
-        res[partition_tomb_timestamp].partition_deletions = {p.partition_tombstone()};
+    void range_delete(const range_tombstone& rt) {
+        _result[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
    }

-    return res;
+    void partition_delete(const tombstone& t) {
+        _result[t.timestamp].partition_deletions = {t};
+    }
+
+    constexpr bool finished() const { return false; }
+};
+
+set_of_changes extract_changes(const mutation& m) {
+    extract_changes_visitor v;
+    cdc::inspect_mutation(m, v);
+    return std::move(v._result);
 }

 namespace cdc {

-bool should_split(const mutation& base_mutation, const schema& base_schema) {
-    auto& p = base_mutation.partition();
+struct find_timestamp_visitor {
+    api::timestamp_type _ts = api::missing_timestamp;

-    api::timestamp_type found_ts = api::missing_timestamp;
-    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+    bool finished() const { return _ts != api::missing_timestamp; }

-    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
-        if (found_ts != api::missing_timestamp && found_ts != ts) {
-            return true;
-        }
-        found_ts = ts;
+    void visit(api::timestamp_type ts) { _ts = ts; }
+    void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }

-        if (found_ttl && *found_ttl != ttl) {
-            return true;
-        }
-        found_ttl = ttl;
+    void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void collection_tombstone(const tombstone& t) {
+        // A collection tombstone with timestamp T can be created with:
+        // UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
+        // (where X is a collection column).
+        // This is, among others, the reason why we show it in the CDC log
+        // with cdc$time using timestamp T + 1 instead of T.
+        visit(t.timestamp + 1);
+    }
+    void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
+    void marker(const row_marker& rm) { visit(rm.timestamp()); }
+    void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
+    void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
+    void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
+    void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
+    void partition_delete(const tombstone& t) { visit(t.timestamp); }
+};

-        return false;
-    };
+/* Find some timestamp inside the given mutation.
+ *
+ * If this mutation was created using a single insert/update/delete statement, then it will have a single,
+ * well-defined timestamp (even if this timestamp occurs multiple times, e.g. in a cell and row_marker).
+ *
+ * This function shouldn't be used for mutations that have multiple different timestamps: the function
+ * would only find one of them. When dealing with such mutations, the caller should first split the mutation
+ * into multiple ones, each with a single timestamp.
+ */
+api::timestamp_type find_timestamp(const mutation& m) {
+    find_timestamp_visitor v;

-    bool had_static_row = false;
+    cdc::inspect_mutation(m, v);

-    bool should_split = false;
-    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        had_static_row = true;
-
-        auto& cdef = base_schema.column_at(column_kind::static_column, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                should_split = true;
-            }
-            return;
-        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-
-            if (desc.tomb) {
-                if (check_or_set(desc.tomb.timestamp + 1, gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-        });
-    });
-
-    if (should_split) {
-        return true;
+    if (v._ts == api::missing_timestamp) {
+        throw std::runtime_error("cdc: could not find timestamp of mutation");
    }

-    bool had_clustered_row = false;
-
-    if (!p.clustered_rows().empty() && had_static_row) {
-        return true;
-    }
-    for (const rows_entry& cr : p.clustered_rows()) {
-        had_clustered_row = true;
-
-        const auto& marker = cr.row().marker();
-        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
-            return true;
-        }
-
-        bool is_insert = marker.is_live();
-
-        bool had_cells = false;
-        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-            had_cells = true;
-
-            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
-            if (cdef.is_atomic()) {
-                auto view = cell.as_atomic_cell(cdef);
-                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                }
-                return;
-            }
-
-            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-                for (auto& [k, v]: mview.cells) {
-                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-
-                    if (is_insert) {
-                        // nonatomic updates cannot be expressed with an INSERT.
-                        should_split = true;
-                        return;
-                    }
-                }
-
-                if (mview.tomb) {
-                    if (check_or_set(mview.tomb.timestamp + 1, gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-                }
-            });
-        });
-
-        if (should_split) {
-            return true;
-        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            if (had_cells) {
-                return true;
-            }
-
-            // there were no cells, so no ttl
-            assert(!found_ttl);
-            if (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = row_tomb.timestamp;
-        }
-    }
-
-    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb) {
-            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = rt.tomb.timestamp;
-        }
-    }
-
-    if (p.partition_tombstone().timestamp != api::missing_timestamp
-            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    // A mutation with no timestamp will be split into 0 mutations
-    return found_ts == api::missing_timestamp;
+    return v._ts;
 }

-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
-    auto changes = extract_changes(base_mutation, *base_schema);
+/* If a mutation contains multiple timestamps, multiple ttls, or multiple types of changes
+ * (e.g. it was created from a batch that both updated a clustered row and deleted a clustered row),
+ * we split it into multiple mutations, each with exactly one timestamp, at most one ttl, and a single type of change.
+ * We also split if we find both a change with no ttl (e.g. a cell tombstone) and a change with ttl (e.g. a ttled cell update).
+ *
+ * The `should_split` function checks whether the mutation requires such splitting, using `should_split_visitor`.
+ * The visitor uses the order in which the mutation is being visited (see the documentation of ChangeVisitor),
+ * remembers a bunch of state based on whatever was visited until now (e.g. was there a static row update?
+ * Was there a clustered row update? Was there a clustered row delete? Was there a TTL?)
+ * and tells the caller to stop on the first occurrence of a second timestamp/ttl/type of change.
+ */
+struct should_split_visitor {
+    bool _had_static_row = false;
+    bool _had_clustered_row = false;
+    bool _had_upsert = false;
+    bool _had_row_marker = false;
+    bool _had_range_delete = false;
+
+    bool _result = false;
+
+    // This becomes a valid (non-missing) timestamp after visiting the first change.
+    // Then, if we encounter any different timestamp, it means that we should split.
+    api::timestamp_type _ts = api::missing_timestamp;
+
+    // This becomes non-null after visiting the first change.
+    // If the change did not have a ttl (e.g. a non-ttled cell, or a tombstone), we store gc_clock::duration(0) there,
+    // because specifying ttl = 0 is equivalent to not specifying a TTL.
+    // Otherwise we store the change's ttl.
+    std::optional<gc_clock::duration> _ttl = std::nullopt;
+
+    inline bool finished() const { return _result; }
+    inline void stop() { _result = true; }
+
+    void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
+        if (_ts != api::missing_timestamp && _ts != ts) {
+            return stop();
+        }
+        _ts = ts;
+
+        if (_ttl && *_ttl != ttl) {
+            return stop();
+        }
+        _ttl = { ttl };
+    }
+
+    void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
+
+    void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+
+    void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
+
+    void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
+        if (_had_row_marker) {
+            // nonatomic updates cannot be expressed with an INSERT.
+            return stop();
+        }
+        visit(cell);
+    }
+    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
+
+    void marker(const row_marker& rm) {
+        _had_row_marker = true;
+        visit(rm.timestamp(), get_ttl(rm));
+    }
+
+    void static_row_cells(auto&& visit_row_cells) {
+        _had_static_row = true;
+        visit_row_cells(*this);
+    }
+
+    void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
+        if (_had_static_row) {
+            return stop();
+        }
+        _had_clustered_row = _had_upsert = true;
+        visit_row_cells(*this);
+    }
+
+    void clustered_row_delete(const clustering_key&, const tombstone& t) {
+        if (_had_static_row || _had_upsert) {
+            return stop();
+        }
+        _had_clustered_row = true;
+        visit(t.timestamp);
+    }
+
+    void range_delete(const range_tombstone& t) {
+        if (_had_static_row || _had_clustered_row) {
+            return stop();
+        }
+        _had_range_delete = true;
+        visit(t.tomb.timestamp);
+    }
+
+    void partition_delete(const tombstone&) {
+        if (_had_range_delete || _had_static_row || _had_clustered_row) {
+            return stop();
+        }
+    }
+};
+
+bool should_split(const mutation& m) {
+    should_split_visitor v;
+
+    cdc::inspect_mutation(m, v);
+
+    return v._result
+    // A mutation with no timestamp will be split into 0 mutations:
+        || v._ts == api::missing_timestamp;
+}
+
+void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage) {
+    const auto base_schema = base_mutation.schema();
+    auto changes = extract_changes(base_mutation);
    auto pk = base_mutation.key();

+    if (changes.empty()) {
+        return;
+    }
+
+    const auto last_timestamp = changes.rbegin()->first;
+
    for (auto& [change_ts, btch] : changes) {
-        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
-        int batch_no = 0;
+        const bool is_last = change_ts == last_timestamp;
+        processor.begin_timestamp(change_ts, is_last);
+
+        clustered_column_set affected_clustered_columns_per_row{clustering_key::less_compare(*base_schema)};
+        one_kind_column_set affected_static_columns{base_schema->static_columns_count()};
+
+        if (enable_preimage || enable_postimage) {
+            affected_static_columns = btch.get_affected_static_columns(*base_schema);
+            affected_clustered_columns_per_row = btch.get_affected_clustered_columns_per_row(*base_mutation.schema());
+        }
+
+        if (enable_preimage) {
+            if (affected_static_columns.count() > 0) {
+                processor.produce_preimage(nullptr, affected_static_columns);
+            }
+            for (const auto& [ck, affected_row_cells] : affected_clustered_columns_per_row) {
+                processor.produce_preimage(&ck, affected_row_cells);
+            }
+        }

        for (auto& sr_update : btch.static_updates) {
            mutation m(base_schema, pk);
@@ -434,7 +622,7 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_update.id);
                m.set_static_cell(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
            }
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_insert : btch.clustered_inserts) {
@@ -451,7 +639,7 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
            }
            row.apply(cr_insert.marker);

-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_update : btch.clustered_updates) {
@@ -467,27 +655,86 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
                row.apply(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
            }

-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_delete : btch.clustered_row_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply_delete(*base_schema, cr_delete.key, cr_delete.t);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& crange_delete : btch.clustered_range_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply_delete(*base_schema, crange_delete.rt);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        if (btch.partition_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply(btch.partition_deletions->t);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }
+
+        if (enable_postimage) {
+            if (affected_static_columns.count() > 0) {
+                processor.produce_postimage(nullptr);
+            }
+            for (const auto& [ck, crow] : affected_clustered_columns_per_row) {
+                processor.produce_postimage(&ck);
+            }
+        }
+
+        processor.end_record();
    }
 }

+void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage) {
+    auto ts = find_timestamp(base_mutation);
+    processor.begin_timestamp(ts, true);
+
+    const auto base_schema = base_mutation.schema();
+
+    if (enable_preimage) {
+        const auto& p = base_mutation.partition();
+
+        one_kind_column_set columns{base_schema->static_columns_count()};
+        if (!p.static_row().empty()) {
+            p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+                columns.set(id);
+            });
+            processor.produce_preimage(nullptr, columns);
+        }
+
+        columns.resize(base_schema->regular_columns_count());
+        for (const rows_entry& cr : p.clustered_rows()) {
+            columns.reset();
+            if (cr.row().deleted_at().regular()) {
+                // Row deleted - include all columns in preimage
+                columns.set(0, base_schema->regular_columns_count(), true);
+            } else {
+                cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+                    columns.set(id);
+                });
+            }
+            processor.produce_preimage(&cr.key(), columns);
+        }
+    }
+
+    processor.process_change(base_mutation);
+
+    if (enable_postimage) {
+        const auto& p = base_mutation.partition();
+        if (!p.static_row().empty()) {
+            processor.produce_postimage(nullptr);
+        }
+        for (const rows_entry& cr : p.clustered_rows()) {
+            processor.produce_postimage(&cr.key());
+        }
+    }
+
+    processor.end_record();
+}
+
 } // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <vector>
+#include <boost/dynamic_bitset.hpp>
 #include "schema_fwd.hh"
 #include "timestamp.hh"
 #include "bytes.hh"
@@ -31,8 +32,61 @@ class mutation;

 namespace cdc {

-bool should_split(const mutation& base_mutation, const schema& base_schema);
-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>);
+// Represents a set of column ids of one kind (partition key, clustering key, regular row or static row).
+// There already exists a column_set type, but it keeps ordinal_column_ids, not column_ids (ordinal column ids
+// are unique across the whole table, while kind-specific ids are unique only within one column kind).
+// To avoid converting back and forth between ordinal and kind-specific ids, one_kind_column_set is used instead.
+using one_kind_column_set = boost::dynamic_bitset<uint64_t>;
+
+// An object that processes changes from a single, big mutation.
+// It is intended to be used with process_changes_xxx_splitting. Those functions define the order and layout in which
+// changes should appear in CDC log, and change_processor is responsible for producing CDC log rows from changes given
+// by those two functions.
+//
+// The flow of calling its methods should go as follows:
+//   -> begin_timestamp #1
+//     -> produce_preimage (one call for each preimage row to be generated)
+//     -> process_change (one call for each part generated by the splitting function)
+//     -> produce_postimage (one call for each postimage row to be generated)
+//   -> begin_timestamp #2
+//   ...
+class change_processor {
+protected:
+    ~change_processor() {};
+public:
+    // Tells the processor that changes that follow from now on will be of given timestamp.
+    // This method must be called in increasing timestamp order.
+    // begin_timestamp can be called only once for a given timestamp and change_processor object.
+    //   ts - timestamp of mutation parts
+    //   is_last - determines if this will be the last timestamp to be processed by this change_processor instance.
+    virtual void begin_timestamp(api::timestamp_type ts, bool is_last) = 0;
+
+    // Tells the processor to produce a preimage for a given clustering/static row.
+    //   ck - clustering key of the row for which to produce a preimage; if nullptr, static row preimage is requested
+    //   columns_to_include - include information about the current state of those columns only, leave others as null
+    virtual void produce_preimage(const clustering_key* ck, const one_kind_column_set& columns_to_include) = 0;
+
+    // Tells the processor to produce a postimage for a given clustering/static row.
+    // Contrary to preimage, this requires data from all columns to be present.
+    //   ck - clustering key of the row for which to produce a postimage; if nullptr, static row postimage is requested
+    virtual void produce_postimage(const clustering_key* ck) = 0;
+
+    // Processes a smaller mutation which is a subset of the big mutation.
+    // The mutation provided to process_change should be simple enough for it to be possible to convert it
+    // into CDC log rows - for example, it cannot represent a write to two columns of the same row, where
+    // both columns have different timestamp or TTL set.
+    //   m - the small mutation to be converted into CDC log rows.
+    virtual void process_change(const mutation& m) = 0;
+
+    // Tells the processor that we have reached the end of a record - the last part
+    // of a given timestamp batch.
+    virtual void end_record() = 0;
+};
+
+bool should_split(const mutation& base_mutation);
+void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage);
+void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage);

 }
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -21,8 +21,7 @@

 #pragma once

-#include <json/json.h>
-
+#include "utils/rjson.hh"
 #include "bytes.hh"

 class schema;
@@ -47,7 +46,7 @@ public:
    virtual ~column_computation() = default;

    static column_computation_ptr deserialize(bytes_view raw);
-    static column_computation_ptr deserialize(const Json::Value& json);
+    static column_computation_ptr deserialize(const rjson::value& json);

    virtual column_computation_ptr clone() const = 0;

--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -148,8 +148,8 @@ public:
                _type.begin(k1), _type.end(k1),
                _type.begin(k2), _type.end(k2),
                [] (const bytes_view& c1, const bytes_view& c2) -> int {
-                    if (c1.size() != c2.size()) {
-                        return c1.size() < c2.size() ? -1 : 1;
+                    if (c1.size() != c2.size() || !c1.size()) {
+                        return c1.size() < c2.size() ? -1 : c1.size() ? 1 : 0;
                    }
                    return memcmp(c1.begin(), c2.begin(), c1.size());
                });
--- a/compress.cc
+++ b/compress.cc
@@ -205,7 +205,7 @@ void compression_parameters::validate_options(const std::map<sstring, sstring>&
        ckw = _compressor->option_names();
    }
    for (auto&& opt : options) {
-        if (!keywords.count(opt.first) && !ckw.count(opt.first)) {
+        if (!keywords.contains(opt.first) && !ckw.contains(opt.first)) {
            throw exceptions::configuration_exception(format("Unknown compression option '{}'.", opt.first));
        }
    }
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -100,8 +100,13 @@ listen_address: localhost

 # port for the CQL native transport to listen for clients on
 # For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+# To disable the CQL native transport, set this option to 0.
 native_transport_port: 9042

+# Like native_transport_port, but clients are forwarded to specific shards, based on the
+# client-side port numbers.
+native_shard_aware_transport_port: 19042
+
 # Enabling native transport encryption in client_encryption_options allows you to either use
 # encryption for the standard port or to use a dedicated, additional port along with the unencrypted
 # standard native_transport_port.
@@ -111,6 +116,10 @@ native_transport_port: 9042
 # keeping native_transport_port unencrypted.
 #native_transport_port_ssl: 9142

+# Like native_transport_port_ssl, but clients are forwarded to specific shards, based on the
+# client-side port numbers.
+#native_shard_aware_transport_port_ssl: 19142
+
 # How long the coordinator should wait for read operations to complete
 read_request_timeout_in_ms: 5000

--- a/configure.py
+++ b/configure.py
@@ -34,7 +34,9 @@ from distutils.spawn import find_executable

 curdir = os.getcwd()

-tempfile.tempdir = "./build/tmp"
+outdir = 'build'
+
+tempfile.tempdir = f"{outdir}/tmp"

 configure_args = str.join(' ', [shlex.quote(x) for x in sys.argv[1:]])

@@ -56,6 +58,7 @@ i18n_xlat = {
    },
 }

+python3_dependencies = subprocess.run('./install-dependencies.sh --print-python3-runtime-packages', shell=True, capture_output=True, encoding='utf-8').stdout.strip()

 def pkgname(name):
    if name in i18n_xlat:
@@ -249,25 +252,30 @@ def find_headers(repodir, excluded_dirs):

 modes = {
    'debug': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-Wstack-usage=%s' % (1024*40),
+        'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
+        'stack-usage-threshold': 1024*40,
    },
    'release': {
        'cxxflags': '',
-        'cxx_ld_flags': '-O3 -Wstack-usage=%s' % (1024*13),
+        'cxx_ld_flags': '-O3 -ffunction-sections -fdata-sections -Wl,--gc-sections',
+        'stack-usage-threshold': 1024*13,
    },
    'dev': {
        'cxxflags': '-DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-O1 -Wstack-usage=%s' % (1024*21),
+        'cxx_ld_flags': '-O1',
+        'stack-usage-threshold': 1024*21,
    },
    'sanitize': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-Os -Wstack-usage=%s' % (1024*50),
+        'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '-Os',
+        'stack-usage-threshold': 1024*50,
    }
 }

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
    'test/boost/alternator_base64_test',
@@ -290,6 +298,7 @@ scylla_tests = set([
    'test/boost/checksum_utils_test',
    'test/boost/chunked_vector_test',
    'test/boost/clustering_ranges_walker_test',
+    'test/boost/column_mapping_test',
    'test/boost/commitlog_test',
    'test/boost/compound_test',
    'test/boost/compress_test',
@@ -323,7 +332,9 @@ scylla_tests = set([
    'test/boost/idl_test',
    'test/boost/input_stream_test',
    'test/boost/json_cql_query_test',
+    'test/boost/json_test',
    'test/boost/keys_test',
+    'test/boost/large_paging_state_test',
    'test/boost/like_matcher_test',
    'test/boost/limiting_data_source_test',
    'test/boost/linearizing_input_stream_test',
@@ -332,6 +343,7 @@ scylla_tests = set([
    'test/boost/estimated_histogram_test',
    'test/boost/logalloc_test',
    'test/boost/managed_vector_test',
+    'test/boost/intrusive_array_test',
    'test/boost/map_difference_test',
    'test/boost/memtable_test',
    'test/boost/meta_test',
@@ -353,6 +365,7 @@ scylla_tests = set([
    'test/boost/range_test',
    'test/boost/range_tombstone_list_test',
    'test/boost/reusable_buffer_test',
+    'test/boost/restrictions_test',
    'test/boost/role_manager_test',
    'test/boost/row_cache_test',
    'test/boost/schema_change_test',
@@ -374,7 +387,6 @@ scylla_tests = set([
    'test/boost/storage_proxy_test',
    'test/boost/top_k_test',
    'test/boost/transport_test',
-    'test/boost/truncation_migration_test',
    'test/boost/types_test',
    'test/boost/user_function_test',
    'test/boost/user_types_test',
@@ -386,13 +398,15 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/bptree_test',
+    'test/boost/double_decker_test',
    'test/boost/stall_free_test',
+    'test/boost/imr_test',
    'test/manual/ec2_snitch_test',
+    'test/manual/enormous_table_scan_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
    'test/manual/hint_test',
-    'test/manual/imr_test',
-    'test/manual/json_test',
    'test/manual/message',
    'test/manual/partition_data_test',
    'test/manual/row_locker_test',
@@ -404,6 +418,7 @@ scylla_tests = set([
    'test/perf/perf_fast_forward',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
+    'test/perf/perf_bptree',
    'test/perf/perf_row_cache_update',
    'test/perf/perf_simple_query',
    'test/perf/perf_sstable',
@@ -411,6 +426,8 @@ scylla_tests = set([
    'test/unit/lsa_sync_eviction_test',
    'test/unit/row_cache_alloc_stress_test',
    'test/unit/row_cache_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
 ])

 perf_tests = set([
@@ -422,13 +439,18 @@ perf_tests = set([
    'test/perf/perf_big_decimal',
 ])

+raft_tests = set([
+    'test/raft/replication_test',
+    'test/boost/raft_fsm_test',
+])
+
 apps = set([
    'scylla',
    'test/tools/cql_repl',
    'tools/scylla-types',
 ])

-tests = scylla_tests | perf_tests
+tests = scylla_tests | perf_tests | raft_tests

 other = set([
    'iotune',
@@ -447,6 +469,8 @@ arg_parser.add_argument('--so', dest='so', action='store_true',
 arg_parser.add_argument('--mode', action='append', choices=list(modes.keys()), dest='selected_modes')
 arg_parser.add_argument('--with', dest='artifacts', action='append', choices=all_artifacts, default=[])
 arg_parser.add_argument('--with-seastar', action='store', dest='seastar_path', default='seastar', help='Path to Seastar sources')
+add_tristate(arg_parser, name='dist', dest='enable_dist',
+                        help='scylla-tools-java, scylla-jmx and packages')
 arg_parser.add_argument('--cflags', action='store', dest='user_cflags', default='',
                        help='Extra flags for the C++ compiler')
 arg_parser.add_argument('--ldflags', action='store', dest='user_ldflags', default='',
@@ -457,8 +481,6 @@ arg_parser.add_argument('--compiler', action='store', dest='cxx', default='g++',
                        help='C++ compiler path')
 arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='gcc',
                        help='C compiler path')
-arg_parser.add_argument('--with-osv', action='store', dest='with_osv', default='',
-                        help='Shortcut for compile for OSv')
 add_tristate(arg_parser, name='dpdk', dest='dpdk',
                        help='Use dpdk (from seastar dpdk sources) (default=True for release builds)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -481,21 +503,43 @@ arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true'
                        help='use of split dwarf (https://gcc.gnu.org/wiki/DebugFission) to speed up linking')
 arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_injector', action='store_true', default=False,
                        help='enable allocation failure injection')
+arg_parser.add_argument('--enable-seastar-debug-allocations', dest='seastar_debug_allocations', action='store_true', default=False,
+                        help='enable seastar debug allocations')
 arg_parser.add_argument('--with-antlr3', dest='antlr3_exec', action='store', default=None,
                        help='path to antlr3 executable')
 arg_parser.add_argument('--with-ragel', dest='ragel_exec', action='store', default='ragel',
        help='path to ragel executable')
+arg_parser.add_argument('--build-raft', dest='build_raft', action='store_true', default=False,
+                        help='build raft code')
 add_tristate(arg_parser, name='stack-guards', dest='stack_guards', help='Use stack guards')
+arg_parser.add_argument('--verbose', dest='verbose', action='store_true',
+                        help='Make configure.py output more verbose (useful for debugging the build process itself)')
+arg_parser.add_argument('--test-repeat', dest='test_repeat', action='store', type=str, default='1',
+                         help='Set number of times to repeat each unittest.')
+arg_parser.add_argument('--test-timeout', dest='test_timeout', action='store', type=str, default='7200')
 args = arg_parser.parse_args()

+coroutines_test_src = '''
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION < 100201
+    #error "Coroutines support requires at leat gcc 10.2.1"
+#endif
+'''
+compiler_supports_coroutines = try_compile(compiler=args.cxx, source=coroutines_test_src)
+
+if args.build_raft and not compiler_supports_coroutines:
+    raise Exception("--build-raft is requested, while the used compiler does not support coroutines")
+
+if not args.build_raft:
+    all_artifacts.difference_update(raft_tests)
+    tests.difference_update(raft_tests)
+
 defines = ['XXH_PRIVATE_API',
           'SEASTAR_TESTING_MAIN',
 ]

 extra_cxxflags = {}

-cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
-
 scylla_core = (['database.cc',
                'absl-flat_hash_map.cc',
                'table.cc',
@@ -516,6 +560,7 @@ scylla_core = (['database.cc',
                'frozen_mutation.cc',
                'memtable.cc',
                'schema_mutations.cc',
+                'utils/array-search.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
                'utils/buffer_input_stream.cc',
@@ -523,6 +568,8 @@ scylla_core = (['database.cc',
                'utils/updateable_value.cc',
                'utils/directories.cc',
                'utils/generation-number.cc',
+                'utils/rjson.cc',
+                'utils/human_readable.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
@@ -530,7 +577,6 @@ scylla_core = (['database.cc',
                'mutation_reader.cc',
                'flat_mutation_reader.cc',
                'mutation_query.cc',
-                'json.cc',
                'keys.cc',
                'counters.cc',
                'compress.cc',
@@ -538,7 +584,8 @@ scylla_core = (['database.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
-                'sstables/mc/writer.cc',
+                'sstables/mx/writer.cc',
+                'sstables/kl/writer.cc',
                'sstables/sstable_version.cc',
                'sstables/compress.cc',
                'sstables/partition.cc',
@@ -552,6 +599,10 @@ scylla_core = (['database.cc',
                'sstables/prepended_input_stream.cc',
                'sstables/m_format_read_helpers.cc',
                'sstables/sstable_directory.cc',
+                'sstables/random_access_reader.cc',
+                'sstables/metadata_collector.cc',
+                'sstables/writer.cc',
+                'transport/cql_protocol_extension.cc',
                'transport/event.cc',
                'transport/event_notifier.cc',
                'transport/server.cc',
@@ -574,6 +625,8 @@ scylla_core = (['database.cc',
                'cql3/sets.cc',
                'cql3/tuples.cc',
                'cql3/maps.cc',
+                'cql3/values.cc',
+                'cql3/expr/expression.cc',
                'cql3/functions/user_function.cc',
                'cql3/functions/functions.cc',
                'cql3/functions/aggregate_fcts.cc',
@@ -621,6 +674,7 @@ scylla_core = (['database.cc',
                'cql3/statements/alter_keyspace_statement.cc',
                'cql3/statements/role-management-statements.cc',
                'cql3/update_parameters.cc',
+                'cql3/util.cc',
                'cql3/ut_name.cc',
                'cql3/role_name.cc',
                'thrift/handler.cc',
@@ -640,7 +694,6 @@ scylla_core = (['database.cc',
                'service/paxos/prepare_response.cc',
                'service/paxos/paxos_state.cc',
                'service/paxos/prepare_summary.cc',
-                'cql3/operator.cc',
                'cql3/relation.cc',
                'cql3/column_identifier.cc',
                'cql3/column_specification.cc',
@@ -684,6 +737,7 @@ scylla_core = (['database.cc',
                'db/view/view_update_generator.cc',
                'db/view/row_locking.cc',
                'db/sstables-format-selector.cc',
+                'db/snapshot-ctl.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
                'utils/UUID_gen.cc',
@@ -751,6 +805,7 @@ scylla_core = (['database.cc',
                'streaming/stream_manager.cc',
                'streaming/stream_result_future.cc',
                'streaming/stream_session_state.cc',
+                'streaming/stream_reason.cc',
                'clocks-impl.cc',
                'partition_slice_builder.cc',
                'init.cc',
@@ -800,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
@@ -851,8 +907,8 @@ alternator = [
       'alternator/expressions.cc',
       Antlr3Grammar('alternator/expressions.g'),
       'alternator/conditions.cc',
-       'alternator/rjson.cc',
       'alternator/auth.cc',
+       'alternator/streams.cc',
 ]

 redis = [
@@ -907,6 +963,8 @@ scylla_tests_generic_dependencies = [
    'test/lib/log.cc',
    'test/lib/reader_permit.cc',
    'test/lib/test_utils.cc',
+    'test/lib/tmpdir.cc',
+    'test/lib/sstable_run_based_compaction_strategy_for_tests.cc',
 ]

 scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependencies + [
@@ -919,8 +977,17 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci
    'test/lib/random_schema.cc',
 ]

+scylla_raft_dependencies = [
+    'raft/raft.cc',
+    'raft/server.cc',
+    'raft/fsm.cc',
+    'raft/progress.cc',
+    'raft/log.cc',
+    'utils/uuid.cc'
+]
+
 deps = {
-    'scylla': idls + ['main.cc', 'release.cc', 'build_id.cc'] + scylla_core + api + alternator + redis,
+    'scylla': idls + ['main.cc', 'release.cc', 'utils/build_id.cc'] + scylla_core + api + alternator + redis,
    'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
    #FIXME: we don't need all of scylla_core here, only the types module, need to modularize scylla_core.
    'tools/scylla-types': idls + ['tools/scylla-types.cc'] + scylla_core,
@@ -944,6 +1011,7 @@ pure_boost_tests = set([
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
    'test/boost/idl_test',
+    'test/boost/json_test',
    'test/boost/keys_test',
    'test/boost/like_matcher_test',
    'test/boost/linearizing_input_stream_test',
@@ -957,7 +1025,7 @@ pure_boost_tests = set([
    'test/boost/small_vector_test',
    'test/boost/top_k_test',
    'test/boost/vint_serialization_test',
-    'test/manual/json_test',
+    'test/boost/bptree_test',
    'test/manual/streaming_histogram_test',
 ])

@@ -971,10 +1039,13 @@ tests_not_using_seastar_test_framework = set([
    'test/perf/perf_cql_parser',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
+    'test/perf/perf_bptree',
    'test/perf/perf_row_cache_update',
    'test/unit/lsa_async_eviction_test',
    'test/unit/lsa_sync_eviction_test',
    'test/unit/row_cache_alloc_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
    'test/manual/sstable_scan_footprint_test',
 ]) | pure_boost_tests

@@ -1018,7 +1089,7 @@ deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']
 deps['test/perf/perf_fast_forward'] += ['release.cc']
 deps['test/perf/perf_simple_query'] += ['release.cc']
 deps['test/boost/meta_test'] = ['test/boost/meta_test.cc']
-deps['test/manual/imr_test'] = ['test/manual/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/imr_test'] = ['test/boost/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/reusable_buffer_test'] = [
    "test/boost/reusable_buffer_test.cc",
    "test/lib/log.cc",
@@ -1035,8 +1106,12 @@ deps['test/boost/linearizing_input_stream_test'] = [
 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
 deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']

+deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
+deps['test/boost/raft_fsm_test'] =  ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
+
 deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']

+
 warnings = [
    '-Wall',
    '-Werror',
@@ -1058,6 +1133,27 @@ warnings = [
    '-Wno-ignored-attributes',
    '-Wno-overloaded-virtual',
    '-Wno-stringop-overflow',
+    '-Wno-unused-command-line-argument',
+    '-Wno-inconsistent-missing-override',
+    '-Wno-defaulted-function-deleted',
+    '-Wno-redeclared-class-member',
+    '-Wno-pessimizing-move',
+    '-Wno-redundant-move',
+    '-Wno-gnu-designator',
+    '-Wno-instantiation-after-specialization',
+    '-Wno-unused-private-field',
+    '-Wno-unsupported-friend',
+    '-Wno-unused-variable',
+    '-Wno-return-std-move',
+    '-Wno-delete-non-abstract-non-virtual-dtor',
+    '-Wno-unknown-attributes',
+    '-Wno-braced-scalar-init',
+    '-Wno-unused-value',
+    '-Wno-range-loop-construct',
+    '-Wno-unused-function',
+    '-Wno-implicit-int-float-conversion',
+    '-Wno-delete-abstract-non-virtual-dtor',
+    '-Wno-uninitialized-const-reference',
 ]

 warnings = [w
@@ -1067,13 +1163,18 @@ warnings = [w
 warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])

 optimization_flags = [
-    '--param inline-unit-growth=300',
+    '--param inline-unit-growth=300', # gcc
+    '-mllvm -inline-threshold=2500',  # clang
 ]
 optimization_flags = [o
                      for o in optimization_flags
                      if flag_supported(flag=o, compiler=args.cxx)]
 modes['release']['cxx_ld_flags'] += ' ' + ' '.join(optimization_flags)

+if flag_supported(flag='-Wstack-usage=4096', compiler=args.cxx):
+    for mode in modes:
+        modes[mode]['cxx_ld_flags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='
+
 linker_flags = linker_flags(compiler=args.cxx)

 dbgflag = '-g -gz' if args.debuginfo else ''
@@ -1107,9 +1208,20 @@ pkgs.append('libsystemd')


 compiler_test_src = '''
-#if __GNUC__ < 8
+
+// clang pretends to be gcc (defined __GNUC__), so we
+// must check it first
+#ifdef __clang__
+
+#if __clang_major__ < 10
    #error "MAJOR"
-#elif __GNUC__ == 8
+#endif
+
+#elif defined(__GNUC__)
+
+#if __GNUC__ < 10
+    #error "MAJOR"
+#elif __GNUC__ == 10
    #if __GNUC_MINOR__ < 1
        #error "MINOR"
    #elif __GNUC_MINOR__ == 1
@@ -1119,10 +1231,16 @@ compiler_test_src = '''
    #endif
 #endif

+#else
+
+#error "Unrecognized compiler"
+
+#endif
+
 int main() { return 0; }
 '''
 if not try_compile_and_link(compiler=args.cxx, source=compiler_test_src):
-    print('Wrong GCC version. Scylla needs GCC >= 8.1.1 to compile.')
+    print('Wrong GCC version. Scylla needs GCC >= 10.1.1 to compile.')
    sys.exit(1)

 if not try_compile(compiler=args.cxx, source='#include <boost/version.hpp>'):
@@ -1166,9 +1284,9 @@ if status != 0:
    print('Version file generation failed')
    sys.exit(1)

-file = open('build/SCYLLA-VERSION-FILE', 'r')
+file = open(f'{outdir}/SCYLLA-VERSION-FILE', 'r')
 scylla_version = file.read().strip()
-file = open('build/SCYLLA-RELEASE-FILE', 'r')
+file = open(f'{outdir}/SCYLLA-RELEASE-FILE', 'r')
 scylla_release = file.read().strip()

 extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""
@@ -1207,11 +1325,13 @@ forced_ldflags += f'--dynamic-linker={dynamic_linker}'

 args.user_ldflags = forced_ldflags + ' ' + args.user_ldflags

-args.user_cflags += ' -Wno-error=stack-usage='
-
-args.user_cflags += f"-ffile-prefix-map={curdir}=."
+args.user_cflags += f" -ffile-prefix-map={curdir}=."

 seastar_cflags = args.user_cflags
+
+if build_raft:
+    seastar_cflags += ' -fcoroutines'
+
 if args.target != '':
    seastar_cflags += ' -march=' + args.target
 seastar_ldflags = args.user_ldflags
@@ -1231,7 +1351,7 @@ def configure_seastar(build_dir, mode):
        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']).replace(' ', ';')),
        '-DSeastar_LD_FLAGS={}'.format(seastar_ldflags),
        '-DSeastar_CXX_DIALECT=gnu++20',
-        '-DSeastar_API_LEVEL=4',
+        '-DSeastar_API_LEVEL=6',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
    ]

@@ -1241,13 +1361,15 @@ def configure_seastar(build_dir, mode):

    dpdk = args.dpdk
    if dpdk is None:
-        dpdk = mode == 'release'
+        dpdk = platform.machine() == 'x86_64' and mode == 'release'
    if dpdk:
        seastar_cmake_args += ['-DSeastar_DPDK=ON', '-DSeastar_DPDK_MACHINE=wsm']
    if args.split_dwarf:
        seastar_cmake_args += ['-DSeastar_SPLIT_DWARF=ON']
    if args.alloc_failure_injector:
        seastar_cmake_args += ['-DSeastar_ALLOC_FAILURE_INJECTION=ON']
+    if args.seastar_debug_allocations:
+        seastar_cmake_args += ['-DSeastar_DEBUG_ALLOCATIONS=ON']

    seastar_cmd = ['cmake', '-G', 'Ninja', os.path.relpath(args.seastar_path, seastar_build_dir)] + seastar_cmake_args
    cmake_dir = seastar_build_dir
@@ -1257,14 +1379,15 @@ def configure_seastar(build_dir, mode):
        relative_seastar_build_dir = os.path.join('..', seastar_build_dir)  # relative to seastar/
        seastar_cmd = ['./cooking.sh', '-i', 'dpdk', '-d', relative_seastar_build_dir, '--'] + seastar_cmd[4:]

-    print(seastar_cmd)
+    if args.verbose:
+        print(" \\\n  ".join(seastar_cmd))
    os.makedirs(seastar_build_dir, exist_ok=True)
    subprocess.check_call(seastar_cmd, shell=False, cwd=cmake_dir)

 for mode in build_modes:
-    configure_seastar('build', mode)
+    configure_seastar(outdir, mode)

-pc = {mode: 'build/{}/seastar/seastar.pc'.format(mode) for mode in build_modes}
+pc = {mode: f'{outdir}/{mode}/seastar/seastar.pc' for mode in build_modes}
 ninja = find_executable('ninja') or find_executable('ninja-build')
 if not ninja:
    print('Ninja executable (ninja or ninja-build) not found on PATH\n')
@@ -1319,7 +1442,6 @@ abseil_libs = ['absl/' + lib for lib in [
    'base/libabsl_malloc_internal.a',
    'base/libabsl_spinlock_wait.a',
    'base/libabsl_base.a',
-    'base/libabsl_dynamic_annotations.a',
    'base/libabsl_raw_logging_internal.a',
    'base/libabsl_exponential_biased.a',
    'base/libabsl_throw_delegate.a']]
@@ -1331,18 +1453,15 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
                 # Must link with static version of libzstd, since
                 # experimental APIs that we use are only present there.
                 maybe_static(True, '-lzstd'),
-                 maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc'), ])
-
-pkgconfig_libs = [
-    'libxxhash',
-]
-
-args.user_cflags += ' ' + ' '.join([pkg_config(lib, '--cflags') for lib in pkgconfig_libs])
-libs += ' ' + ' '.join([pkg_config(lib, '--libs') for lib in pkgconfig_libs])
+                 maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc -licui18n'),
+                 '-lxxhash'])

 if not args.staticboost:
    args.user_cflags += ' -DBOOST_TEST_DYN_LINK'

+if build_raft:
+    args.user_cflags += ' -DENABLE_SCYLLA_RAFT -fcoroutines'
+
 # thrift version detection, see #4538
 proc_res = subprocess.run(["thrift", "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 proc_res_output = proc_res.stdout.decode("utf-8")
@@ -1367,13 +1486,9 @@ if args.staticthrift:
 else:
    thrift_libs = "-lthrift"

-outdir = 'build'
 buildfile = 'build.ninja'

 os.makedirs(outdir, exist_ok=True)
-do_sanitize = True
-if args.static:
-    do_sanitize = False

 if args.antlr3_exec:
    antlr3_exec = args.antlr3_exec
@@ -1399,7 +1514,7 @@ with open(buildfile_tmp, 'w') as f:
        configure_args = {configure_args}
        builddir = {outdir}
        cxx = {cxx}
-        cxxflags = {user_cflags} {warnings} {defines}
+        cxxflags = --std=gnu++20 {user_cflags} {warnings} {defines}
        ldflags = {linker_flags} {user_ldflags}
        ldflags_build = {linker_flags}
        libs = {libs}
@@ -1429,7 +1544,7 @@ with open(buildfile_tmp, 'w') as f:
            command = $in > $out
            description = GEN $out
        rule copy
-            command = cp $in $out
+            command = cp --reflink=auto $in $out
            description = COPY $out
        rule package
            command = scripts/create-relocatable-package.py --mode $mode $out
@@ -1437,6 +1552,8 @@ with open(buildfile_tmp, 'w') as f:
            command = reloc/build_rpm.sh --reloc-pkg $in --builddir $out
        rule debbuild
            command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
+        rule unified
+            command = unified/build_unified.sh --mode $mode --unified-pkg $out
        ''').format(**globals()))
    for mode in build_modes:
        modeval = modes[mode]
@@ -1469,45 +1586,49 @@ with open(buildfile_tmp, 'w') as f:
            rule thrift.{mode}
                command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
                description = THRIFT $in
+                restat = 1
            rule antlr3.{mode}
                # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
                # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
                # name, we also add a global typedef to avoid compilation errors.
                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
                     && {antlr3_exec} $builddir/{mode}/gen/$in $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Lexer.hpp $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Lexer.cpp $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Parser.hpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.hpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.cpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Parser.hpp $
                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
                        -e '/^.*On :.*$$/d' $
                        -e '1i using ExceptionBaseType = int;' $
                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
                            s/ExceptionBaseType\* ex = new/ex = new/; $
                            s/exceptions::syntax_exception e/exceptions::syntax_exception\& e/' $
-                        build/{mode}/gen/${{stem}}Parser.cpp
+                        $builddir/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            rule checkhh.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out build/{mode}/gen/empty.cc
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc
              description = CHECKHH $in
              depfile = $out.d
            rule test.{mode}
-              command = ./test.py --mode={mode}
+              command = ./test.py --mode={mode} --repeat={test_repeat} --timeout={test_timeout}
+              pool = console
              description = TEST {mode}
-            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, **modeval))
+            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, test_repeat=test_repeat, test_timeout=test_timeout, **modeval))
        f.write(
-            'build {mode}: phony {artifacts}\n'.format(
+            'build {mode}-build: phony {artifacts}\n'.format(
                mode=mode,
                artifacts=str.join(' ', ('$builddir/' + mode + '/' + x for x in build_artifacts))
            )
        )
+        include_dist_target = f'dist-{mode}' if args.enable_dist is None or args.enable_dist else ''
+        f.write(f'build {mode}: phony {mode}-build {include_dist_target}\n')
        compiles = {}
        swaggers = set()
        serializers = {}
        thrifts = set()
        ragels = {}
        antlr3_grammars = set()
-        seastar_dep = 'build/{}/seastar/libseastar.a'.format(mode)
-        seastar_testing_dep = 'build/{}/seastar/libseastar_testing.a'.format(mode)
+        seastar_dep = '$builddir/{}/seastar/libseastar.a'.format(mode)
+        seastar_testing_dep = '$builddir/{}/seastar/libseastar_testing.a'.format(mode)
        for binary in build_artifacts:
            if binary in other:
                continue
@@ -1595,7 +1716,7 @@ with open(buildfile_tmp, 'w') as f:
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/test/tools/cql_repl\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/test/tools/cql_repl $builddir/{mode}/scylla\n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in tests]),
            )
@@ -1655,106 +1776,142 @@ with open(buildfile_tmp, 'w') as f:
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
-        f.write(f'build build/{mode}/gen/empty.cc: gen\n')
+        f.write(f'build $builddir/{mode}/gen/empty.cc: gen\n')
        for hh in headers:
-            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | build/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
+            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | $builddir/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
                    mode=mode, hh=hh, gen_headers_dep=gen_headers_dep))

-        f.write('build build/{mode}/seastar/libseastar.a: ninja | always\n'
+        f.write('build $builddir/{mode}/seastar/libseastar.a: ninja | always\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = seastar\n'.format(**locals()))
-        f.write('build build/{mode}/seastar/libseastar_testing.a: ninja | always\n'
+        f.write('build $builddir/{mode}/seastar/libseastar_testing.a: ninja | always\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = seastar_testing\n'.format(**locals()))
-        f.write('build build/{mode}/seastar/apps/iotune/iotune: ninja\n'
+        f.write('build $builddir/{mode}/seastar/apps/iotune/iotune: ninja\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = iotune\n'.format(**locals()))
        f.write(textwrap.dedent('''\
-            build build/{mode}/iotune: copy build/{mode}/seastar/apps/iotune/iotune
+            build $builddir/{mode}/iotune: copy $builddir/{mode}/seastar/apps/iotune/iotune
            ''').format(**locals()))
-        f.write('build build/{mode}/scylla-package.tar.gz: package build/{mode}/scylla build/{mode}/iotune build/SCYLLA-RELEASE-FILE build/SCYLLA-VERSION-FILE build/debian/debian | always\n'.format(**locals()))
+        f.write('build $builddir/{mode}/dist/tar/scylla-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian | always\n'.format(**locals()))
        f.write('  pool = submodule_pool\n')
        f.write('  mode = {mode}\n'.format(**locals()))
-        f.write(f'build build/dist/{mode}/redhat: rpmbuild build/{mode}/scylla-package.tar.gz\n')
+        f.write(f'build $builddir/{mode}/scylla-package.tar.gz: copy $builddir/{mode}/dist/tar/scylla-package.tar.gz\n')
+        f.write(f'build $builddir/dist/{mode}/redhat: rpmbuild $builddir/{mode}/scylla-package.tar.gz\n')
        f.write(f'  pool = submodule_pool\n')
        f.write(f'  mode = {mode}\n')
-        f.write(f'build build/dist/{mode}/debian: debbuild build/{mode}/scylla-package.tar.gz\n')
+        f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/scylla-package.tar.gz\n')
+        f.write(f'  pool = submodule_pool\n')
+        f.write(f'  mode = {mode}\n')
+        f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian\n')
+        f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz dist-jmx-rpm dist-jmx-deb\n')
+        f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz dist-tools-rpm dist-tools-deb\n')
+        f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb compat-python3-rpm compat-python3-deb\n')
+        f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz\n')
+        f.write(f'build $builddir/{mode}/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/scylla-unified-package.tar.gz\n')
+        f.write(f'build $builddir/{mode}/dist/tar/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz: unified $builddir/{mode}/dist/tar/scylla-package.tar.gz $builddir/{mode}/dist/tar/scylla-python3-package.tar.gz $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz | always\n')
        f.write(f'  pool = submodule_pool\n')
        f.write(f'  mode = {mode}\n')
-        f.write(f'build dist-server-{mode}: phony build/dist/{mode}/redhat build/dist/{mode}/debian\n')
        f.write('rule libdeflate.{mode}\n'.format(**locals()))
-        f.write('  command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../build/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
-        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+        f.write('  command = make -C libdeflate BUILD_DIR=../$builddir/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../$builddir/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
+        f.write('build $builddir/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
        f.write('  pool = submodule_pool\n')

        for lib in abseil_libs:
-            f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
+            f.write('build $builddir/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
            f.write('  pool = submodule_pool\n')
-            f.write('  subdir = build/{mode}/abseil\n'.format(**locals()))
+            f.write('  subdir = $builddir/{mode}/abseil\n'.format(**locals()))
            f.write('  target = {lib}\n'.format(**locals()))

-    mode = 'dev' if 'dev' in modes else modes[0]
-    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))
+    checkheaders_mode = 'dev' if 'dev' in modes else modes[0]
+    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(checkheaders_mode, hh) for hh in headers])))

    f.write(
-            'build test: phony {}\n'.format(' '.join(['{mode}-test'.format(mode=mode) for mode in modes]))
+            'build build: phony {}\n'.format(' '.join([f'{mode}-build' for mode in build_modes]))
    )
    f.write(
-            'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in modes]))
+            'build test: phony {}\n'.format(' '.join(['{mode}-test'.format(mode=mode) for mode in build_modes]))
+    )
+    f.write(
+            'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in build_modes]))
    )

    f.write(textwrap.dedent(f'''\
-        build dist-server-deb: phony {' '.join(['build/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
-        build dist-server-rpm: phony {' '.join(['build/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
-        build dist-server: phony dist-server-rpm dist-server-deb
+        build dist-unified-tar: phony {' '.join(['$builddir/{mode}/scylla-unified-package-$scylla_version.$scylla_release.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-unified: phony dist-unified-tar
+
+        build dist-server-deb: phony {' '.join(['$builddir/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
+        build dist-server-rpm: phony {' '.join(['$builddir/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
+        build dist-server-tar: phony {' '.join(['$builddir/{mode}/scylla-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-server: phony dist-server-tar dist-server-rpm dist-server-deb

        rule build-submodule-reloc
-          command = cd $reloc_dir && ./reloc/build_reloc.sh
+          command = cd $reloc_dir && ./reloc/build_reloc.sh --version $$(<../../build/SCYLLA-PRODUCT-FILE)-$$(<../../build/SCYLLA-VERSION-FILE)-$$(<../../build/SCYLLA-RELEASE-FILE) --nodeps $args
        rule build-submodule-rpm
          command = cd $dir && ./reloc/build_rpm.sh --reloc-pkg $artifact
        rule build-submodule-deb
          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact

-        build scylla-jmx/build/scylla-jmx-package.tar.gz: build-submodule-reloc
-          reloc_dir = scylla-jmx
-        build dist-jmx-rpm: build-submodule-rpm scylla-jmx/build/scylla-jmx-package.tar.gz
-          dir = scylla-jmx
-          artifact = build/scylla-jmx-package.tar.gz
-        build dist-jmx-deb: build-submodule-deb scylla-jmx/build/scylla-jmx-package.tar.gz
-          dir = scylla-jmx
-          artifact = build/scylla-jmx-package.tar.gz
-        build dist-jmx: phony dist-jmx-rpm dist-jmx-deb
+        build tools/jmx/build/scylla-jmx-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/jmx
+        build dist-jmx-rpm: build-submodule-rpm tools/jmx/build/scylla-jmx-package.tar.gz
+          dir = tools/jmx
+          artifact = $builddir/scylla-jmx-package.tar.gz
+        build dist-jmx-deb: build-submodule-deb tools/jmx/build/scylla-jmx-package.tar.gz
+          dir = tools/jmx
+          artifact = $builddir/scylla-jmx-package.tar.gz
+        build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-jmx: phony dist-jmx-tar dist-jmx-rpm dist-jmx-deb

-        build scylla-tools/build/scylla-tools-package.tar.gz: build-submodule-reloc
-          reloc_dir = scylla-tools
-        build dist-tools-rpm: build-submodule-rpm scylla-tools/build/scylla-tools-package.tar.gz
-          dir = scylla-tools
-          artifact = build/scylla-tools-package.tar.gz
-        build dist-tools-deb: build-submodule-deb scylla-tools/build/scylla-tools-package.tar.gz
-          dir = scylla-tools
-          artifact = build/scylla-tools-package.tar.gz
-        build dist-tools: phony dist-tools-rpm dist-tools-deb
+        build tools/java/build/scylla-tools-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/java
+        build dist-tools-rpm: build-submodule-rpm tools/java/build/scylla-tools-package.tar.gz
+          dir = tools/java
+          artifact = $builddir/scylla-tools-package.tar.gz
+        build dist-tools-deb: build-submodule-deb tools/java/build/scylla-tools-package.tar.gz
+          dir = tools/java
+          artifact = $builddir/scylla-tools-package.tar.gz
+        build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-tools-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-tools: phony dist-tools-tar dist-tools-rpm dist-tools-deb

-        rule build-python-reloc
-          command = ./reloc/python3/build_reloc.sh
-        rule build-python-rpm
-          command = ./reloc/python3/build_rpm.sh
-        rule build-python-deb
-          command = ./reloc/python3/build_deb.sh
+        rule compat-python3-reloc
+          command = mkdir -p $builddir/release && ln -f $dir/$artifact $builddir/release/
+        rule compat-python3-rpm
+          command = cd $dir && ./reloc/build_rpm.sh --reloc-pkg $artifact --builddir ../../build/redhat
+        rule compat-python3-deb
+          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact --builddir ../../build/debian
+        build $builddir/release/scylla-python3-package.tar.gz: compat-python3-reloc tools/python3/build/scylla-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/scylla-python3-package.tar.gz
+        build compat-python3-rpm: compat-python3-rpm tools/python3/build/scylla-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/scylla-python3-package.tar.gz
+        build compat-python3-deb: compat-python3-deb tools/python3/build/scylla-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/scylla-python3-package.tar.gz

-        build build/release/scylla-python3-package.tar.gz: build-python-reloc
-        build dist-python-rpm: build-python-rpm build/release/scylla-python3-package.tar.gz
-        build dist-python-deb: build-python-deb build/release/scylla-python3-package.tar.gz
-        build dist-python: phony dist-python-rpm dist-python-deb
-        build dist-deb: phony dist-server-deb dist-python-deb dist-jmx-deb dist-tools-deb
-        build dist-rpm: phony dist-server-rpm dist-python-rpm dist-jmx-rpm dist-tools-rpm
-        build dist: phony dist-server dist-python dist-jmx dist-tools
+        build tools/python3/build/scylla-python3-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/python3
+          args = --packages "{python3_dependencies}"
+        build dist-python3-rpm: build-submodule-rpm tools/python3/build/scylla-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/scylla-python3-package.tar.gz
+        build dist-python3-deb: build-submodule-deb tools/python3/build/scylla-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/scylla-python3-package.tar.gz
+        build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-python3-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb $builddir/release/scylla-python3-package.tar.gz compat-python3-rpm compat-python3-deb
+        build dist-deb: phony dist-server-deb dist-python3-deb dist-jmx-deb dist-tools-deb
+        build dist-rpm: phony dist-server-rpm dist-python3-rpm dist-jmx-rpm dist-tools-rpm
+        build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-jmx-tar dist-tools-tar
+
+        build dist: phony dist-unified dist-server dist-python3 dist-jmx dist-tools
        '''))

    f.write(textwrap.dedent(f'''\
@@ -1764,6 +1921,11 @@ with open(buildfile_tmp, 'w') as f:
        '''))
    for mode in build_modes:
        f.write(textwrap.dedent(f'''\
+        build $builddir/{mode}/dist/tar/scylla-python3-package.tar.gz: copy tools/python3/build/scylla-python3-package.tar.gz
+        build $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz: copy tools/java/build/scylla-tools-package.tar.gz
+        build $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz: copy tools/jmx/build/scylla-jmx-package.tar.gz
+
+        build dist-{mode}: phony dist-server-{mode} dist-python3-{mode} dist-tools-{mode} dist-jmx-{mode} dist-unified-{mode}
        build dist-check-{mode}: dist-check
          mode = {mode}
            '''))
@@ -1791,10 +1953,10 @@ with open(buildfile_tmp, 'w') as f:
        build always: phony
        rule scylla_version_gen
            command = ./SCYLLA-VERSION-GEN
-        build build/SCYLLA-RELEASE-FILE build/SCYLLA-VERSION-FILE: scylla_version_gen
+        build $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE: scylla_version_gen
        rule debian_files_gen
            command = ./dist/debian/debian_files_gen.py
-        build build/debian/debian: debian_files_gen | always
+        build $builddir/debian/debian: debian_files_gen | always
        ''').format(modes_list=' '.join(build_modes), **globals()))

 os.rename(buildfile_tmp, buildfile)
--- a/counters.cc
+++ b/counters.cc
@@ -29,15 +29,6 @@ counter_id counter_id::local()
    return counter_id(service::get_local_storage_service().get_local_id());
 }

-bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
-{
-    if (a._most_significant != b._most_significant) {
-        return a._most_significant < b._most_significant;
-    } else {
-        return a._least_significant < b._least_significant;
-    }
-}
-
 std::ostream& operator<<(std::ostream& os, const counter_id& id) {
    return os << id.to_uuid();
 }
@@ -68,16 +59,6 @@ void counter_cell_builder::do_sort_and_remove_duplicates()
    _sorted = true;
 }

-std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
-{
-    auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
-    counter_id::less_compare_1_7_4 cmp;
-    boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
-        return cmp(a.id(), b.id());
-    });
-    return sorted_shards;
-}
-
 static bool apply_in_place(const column_definition& cdef, atomic_cell_mutable_view dst, atomic_cell_mutable_view src)
 {
    auto dst_ccmv = counter_cell_mutable_view(dst);
--- a/counters.hh
+++ b/counters.hh
@@ -60,11 +60,6 @@ public:
    bool operator!=(const counter_id& other) const {
        return !(*this == other);
    }
-public:
-    // (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
-    struct less_compare_1_7_4 {
-        bool operator()(const counter_id& a, const counter_id& b) const;
-    };
 public:
    static counter_id local();

@@ -186,7 +181,7 @@ public:
    int64_t logical_clock() const { return _logical_clock; }

    counter_shard& update(int64_t value_delta, int64_t clock_increment) noexcept {
-        _value += value_delta;
+        _value = uint64_t(_value) + uint64_t(value_delta); // signed int overflow is undefined hence the cast
        _logical_clock += clock_increment;
        return *this;
    }
@@ -291,7 +286,7 @@ public:
            return *this;
        }
        inserter_iterator& operator=(const counter_shard_view& csv) {
-            return operator=(counter_shard(csv));
+            return this->operator=(counter_shard(csv));
        }
        inserter_iterator& operator++() { return *this; }
        inserter_iterator& operator++(int) { return *this; }
@@ -417,9 +412,6 @@ struct counter_cell_view : basic_counter_cell_view<mutable_view::no> {
        });
    }

-    // Returns counter shards in an order that is compatible with Scylla 1.7.4.
-    std::vector<counter_shard> shards_compatible_with_1_7_4() const;
-
    // Reversibly applies two counter cells, at least one of them must be live.
    static void apply(const column_definition& cdef, atomic_cell_or_collection& dst, atomic_cell_or_collection& src);

--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -93,6 +93,7 @@ options {
 #include "cql3/ut_name.hh"
 #include "cql3/functions/function_name.hh"
 #include "cql3/functions/function_call.hh"
+#include "cql3/expr/expression.hh"
 #include <seastar/core/sstring.hh>
 #include "CqlLexer.hpp"

@@ -1332,7 +1333,7 @@ setOrMapLiteral[shared_ptr<cql3::term::raw> t] returns [shared_ptr<cql3::term::r
      { $value = ::make_shared<cql3::maps::literal>(std::move(m)); }
    | { s.push_back(t); }
          ( ',' tn=term { s.push_back(tn); } )*
-      { $value = make_shared(cql3::sets::literal(std::move(s))); }
+      { $value = ::make_shared<cql3::sets::literal>(std::move(s)); }
    ;

 collectionLiteral returns [shared_ptr<cql3::term::raw> value]
@@ -1343,7 +1344,7 @@ collectionLiteral returns [shared_ptr<cql3::term::raw> value]
    | '{' t=term v=setOrMapLiteral[t] { $value = v; } '}'
    // Note that we have an ambiguity between maps and set for "{}". So we force it to a set literal,
    // and deal with it later based on the type of the column (SetLiteral.java).
-    | '{' '}' { $value = make_shared(cql3::sets::literal({})); }
+    | '{' '}' { $value = ::make_shared<cql3::sets::literal>(std::vector<shared_ptr<cql3::term::raw>>()); }
    ;

 usertypeLiteral returns [shared_ptr<cql3::user_types::literal> ut]
@@ -1474,13 +1475,13 @@ udtColumnOperation[operations_type& operations,
 columnCondition[conditions_type& conditions]
    // Note: we'll reject duplicates later
    : key=cident
-        ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, {}, *op)); }
+        ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, {}, op)); }
        | K_IN
            ( values=singleColumnInValues { conditions.emplace_back(key, cql3::column_condition::raw::in_condition({}, {}, values)); }
            | marker=inMarker { conditions.emplace_back(key, cql3::column_condition::raw::in_condition({}, marker, {})); }
            )
        | '[' element=term ']'
-            ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, element, *op)); }
+            ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, element, op)); }
            | K_IN
                ( values=singleColumnInValues { conditions.emplace_back(key, cql3::column_condition::raw::in_condition(element, {}, values)); }
                | marker=inMarker { conditions.emplace_back(key, cql3::column_condition::raw::in_condition(element, marker, {})); }
@@ -1503,31 +1504,31 @@ propertyValue returns [sstring str]
    | u=unreserved_keyword { $str = u; }
    ;

-relationType returns [const cql3::operator_type* op = nullptr]
-    : '='  { $op = &cql3::operator_type::EQ; }
-    | '<'  { $op = &cql3::operator_type::LT; }
-    | '<=' { $op = &cql3::operator_type::LTE; }
-    | '>'  { $op = &cql3::operator_type::GT; }
-    | '>=' { $op = &cql3::operator_type::GTE; }
-    | '!=' { $op = &cql3::operator_type::NEQ; }
-    | K_LIKE { $op = &cql3::operator_type::LIKE; }
+relationType returns [cql3::expr::oper_t op]
+    : '='  { $op = cql3::expr::oper_t::EQ; }
+    | '<'  { $op = cql3::expr::oper_t::LT; }
+    | '<=' { $op = cql3::expr::oper_t::LTE; }
+    | '>'  { $op = cql3::expr::oper_t::GT; }
+    | '>=' { $op = cql3::expr::oper_t::GTE; }
+    | '!=' { $op = cql3::expr::oper_t::NEQ; }
+    | K_LIKE { $op = cql3::expr::oper_t::LIKE; }
    ;

 relation[std::vector<cql3::relation_ptr>& clauses]
-    @init{ const cql3::operator_type* rt = nullptr; }
-    : name=cident type=relationType t=term { $clauses.emplace_back(::make_shared<cql3::single_column_relation>(std::move(name), *type, std::move(t))); }
+    @init{ cql3::expr::oper_t rt; }
+    : name=cident type=relationType t=term { $clauses.emplace_back(::make_shared<cql3::single_column_relation>(std::move(name), type, std::move(t))); }

    | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
-        { $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), *type, std::move(t))); }
+        { $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), type, std::move(t))); }
    | name=cident K_IS K_NOT K_NULL {
-          $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IS_NOT, cql3::constants::NULL_LITERAL)); }
+          $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::expr::oper_t::IS_NOT, cql3::constants::NULL_LITERAL)); }
    | name=cident K_IN marker=inMarker
-        { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IN, std::move(marker))); }
+        { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::expr::oper_t::IN, std::move(marker))); }
    | name=cident K_IN in_values=singleColumnInValues
        { $clauses.emplace_back(cql3::single_column_relation::create_in_relation(std::move(name), std::move(in_values))); }
-    | name=cident K_CONTAINS { rt = &cql3::operator_type::CONTAINS; } (K_KEY { rt = &cql3::operator_type::CONTAINS_KEY; })?
-        t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), *rt, std::move(t))); }
-    | name=cident '[' key=term ']' type=relationType t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), std::move(key), *type, std::move(t))); }
+    | name=cident K_CONTAINS { rt = cql3::expr::oper_t::CONTAINS; } (K_KEY { rt = cql3::expr::oper_t::CONTAINS_KEY; })?
+        t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), rt, std::move(t))); }
+    | name=cident '[' key=term ']' type=relationType t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), std::move(key), type, std::move(t))); }
    | ids=tupleOfIdentifiers
      ( K_IN
          ( '(' ')'
@@ -1543,10 +1544,10 @@ relation[std::vector<cql3::relation_ptr>& clauses]
          )
      | type=relationType literal=tupleLiteral /* (a, b, c) > (1, 2, 3) or (a, b, c) > (?, ?, ?) */
          {
-              $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, *type, literal));
+              $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, type, literal));
          }
      | type=relationType tupleMarker=markerForTuple /* (a, b, c) >= ? */
-          { $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, *type, tupleMarker)); }
+          { $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, type, tupleMarker)); }
      )
    | '(' relation[$clauses] ')'
    ;
@@ -1694,7 +1695,7 @@ username returns [sstring str]
 // Basically the same as cident, but we need to exlude existing CQL3 types
 // (which for some reason are not reserved otherwise)
 non_type_ident returns [shared_ptr<cql3::column_identifier> id]
-    : t=IDENT                    { if (_reserved_type_names().count($t.text)) { add_recognition_error("Invalid (reserved) user type name " + $t.text); } $id = ::make_shared<cql3::column_identifier>($t.text, false); }
+    : t=IDENT                    { if (_reserved_type_names().contains($t.text)) { add_recognition_error("Invalid (reserved) user type name " + $t.text); } $id = ::make_shared<cql3::column_identifier>($t.text, false); }
    | t=QUOTED_NAME              { $id = ::make_shared<cql3::column_identifier>($t.text, true); }
    | k=basic_unreserved_keyword { $id = ::make_shared<cql3::column_identifier>(k, false); }
    | kk=K_KEY                   { $id = ::make_shared<cql3::column_identifier>($kk.text, false); }
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -52,11 +52,6 @@ attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time
    , _time_to_live{std::move(time_to_live)}
 { }

-bool attributes::uses_function(const sstring& ks_name, const sstring& function_name) const {
-    return (_timestamp && _timestamp->uses_function(ks_name, function_name))
-        || (_time_to_live && _time_to_live->uses_function(ks_name, function_name));
-}
-
 bool attributes::is_timestamp_set() const {
    return bool(_timestamp);
 }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -59,8 +59,6 @@ public:
 private:
    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live);
 public:
-    bool uses_function(const sstring& ks_name, const sstring& function_name) const;
-
    bool is_timestamp_set() const;

    bool is_time_to_live_set() const;
--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -48,13 +48,14 @@
 #include "types/map.hh"
 #include "types/list.hh"
 #include "utils/like_matcher.hh"
+#include "expr/expression.hh"

 namespace {

-void validate_operation_on_durations(const abstract_type& type, const cql3::operator_type& op) {
+void validate_operation_on_durations(const abstract_type& type, cql3::expr::oper_t op) {
    using cql3::statements::request_validations::check_false;

-    if (op.is_slice() && type.references_duration()) {
+    if (is_slice(op) && type.references_duration()) {
        check_false(type.is_collection(), "Slice conditions are not supported on collections containing durations");
        check_false(type.is_tuple(), "Slice conditions are not supported on tuples containing durations");
        check_false(type.is_user_type(), "Slice conditions are not supported on UDTs containing durations");
@@ -64,7 +65,7 @@ void validate_operation_on_durations(const abstract_type& type, const cql3::oper
    }
 }

-int is_satisfied_by(const cql3::operator_type &op, const abstract_type& cell_type,
+int is_satisfied_by(cql3::expr::oper_t op, const abstract_type& cell_type,
        const abstract_type& param_type, const data_value& cell_value, const bytes& param) {

        int rc;
@@ -82,21 +83,24 @@ int is_satisfied_by(const cql3::operator_type &op, const abstract_type& cell_typ
        } else {
            rc = cell_type.compare(cell_type.decompose(cell_value), param);
        }
-        if (op == cql3::operator_type::EQ) {
+        switch (op) {
+            using cql3::expr::oper_t;
+        case oper_t::EQ:
            return rc == 0;
-        } else if (op == cql3::operator_type::NEQ) {
+        case oper_t::NEQ:
            return rc != 0;
-        } else if (op == cql3::operator_type::GTE) {
+        case oper_t::GTE:
            return rc >= 0;
-        } else if (op == cql3::operator_type::LTE) {
+        case oper_t::LTE:
            return rc <= 0;
-        } else if (op == cql3::operator_type::GT) {
+        case oper_t::GT:
            return rc > 0;
-        } else if (op == cql3::operator_type::LT) {
+        case oper_t::LT:
            return rc < 0;
+        default:
+            assert(false);
+            return false;
        }
-        assert(false);
-        return false;
 }

 // Read the list index from key and check that list index is not
@@ -114,24 +118,6 @@ uint32_t read_and_check_list_index(const cql3::raw_value_view& key) {

 namespace cql3 {

-bool
-column_condition::uses_function(const sstring& ks_name, const sstring& function_name) const {
-    if (bool(_collection_element) && _collection_element->uses_function(ks_name, function_name)) {
-        return true;
-    }
-    if (bool(_value) && _value->uses_function(ks_name, function_name)) {
-        return true;
-    }
-    if (!_in_values.empty()) {
-        for (auto&& value : _in_values) {
-            if (bool(value) && value->uses_function(ks_name, function_name)) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
 void column_condition::collect_marker_specificaton(variable_specifications& bound_names) const {
    if (_collection_element) {
        _collection_element->collect_marker_specification(bound_names);
@@ -223,7 +209,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        }
    }

-    if (_op.is_compare()) {
+    if (is_compare(_op)) {
        // <, >, >=, <=, !=
        cql3::raw_value_view param = _value->bind_and_get(options);

@@ -231,23 +217,23 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
            throw exceptions::invalid_request_exception("Invalid 'unset' value in condition");
        }
        if (param.is_null()) {
-            if (_op == operator_type::EQ) {
+            if (_op == expr::oper_t::EQ) {
                return cell_value == nullptr;
-            } else if (_op == operator_type::NEQ) {
+            } else if (_op == expr::oper_t::NEQ) {
                return cell_value != nullptr;
            } else {
                throw exceptions::invalid_request_exception(format("Invalid comparison with null for operator \"{}\"", _op));
            }
        } else if (cell_value == nullptr) {
            // The condition parameter is not null, so only NEQ can return true
-            return _op == operator_type::NEQ;
+            return _op == expr::oper_t::NEQ;
        }
        // type::validate() is called by bind_and_get(), so it's safe to pass to_bytes() result
        // directly to compare.
        return is_satisfied_by(_op, *cell_value->type(), *column.type, *cell_value, to_bytes(param));
    }

-    if (_op == operator_type::LIKE) {
+    if (_op == expr::oper_t::LIKE) {
        if (cell_value == nullptr) {
            return false;
        }
@@ -266,7 +252,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        }
    }

-    assert(_op == operator_type::IN);
+    assert(_op == expr::oper_t::IN);

    std::vector<bytes_opt> in_values;

@@ -284,7 +270,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
    // If cell value is NULL, IN list must contain NULL or an empty set/list. Otherwise it must contain cell value.
    if (cell_value) {
        return std::any_of(in_values.begin(), in_values.end(), [this, cell_value] (const bytes_opt& value) {
-            return value.has_value() && is_satisfied_by(operator_type::EQ, *cell_value->type(), *column.type, *cell_value, *value);
+            return value.has_value() && is_satisfied_by(expr::oper_t::EQ, *cell_value->type(), *column.type, *cell_value, *value);
        });
    } else {
        return std::any_of(in_values.begin(), in_values.end(), [] (const bytes_opt& value) { return !value.has_value() || value->empty(); });
@@ -325,13 +311,13 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
        collection_element_term = _collection_element->prepare(db, keyspace, element_spec);
    }

-    if (_op.is_compare()) {
+    if (is_compare(_op)) {
        validate_operation_on_durations(*receiver.type, _op);
        return column_condition::condition(receiver, collection_element_term,
                _value->prepare(db, keyspace, value_spec), nullptr, _op);
    }

-    if (_op == operator_type::LIKE) {
+    if (_op == expr::oper_t::LIKE) {
        auto literal_term = dynamic_pointer_cast<constants::literal>(_value);
        if (literal_term) {
            // Pass matcher object
@@ -348,7 +334,7 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
        }
    }

-    if (_op != operator_type::IN) {
+    if (_op != expr::oper_t::IN) {
        throw exceptions::invalid_request_exception(format("Unsupported operator type {} in a condition ", _op));
    }

--- a/cql3/column_condition.hh
+++ b/cql3/column_condition.hh
@@ -43,7 +43,7 @@

 #include "cql3/term.hh"
 #include "cql3/abstract_marker.hh"
-#include "cql3/operator.hh"
+#include "cql3/expr/expression.hh"
 #include "utils/like_matcher.hh"

 namespace cql3 {
@@ -67,11 +67,11 @@ private:
    // List of terminals for "a IN (value, value, ...)"
    std::vector<::shared_ptr<term>> _in_values;
    const std::unique_ptr<like_matcher> _matcher;
-    const operator_type& _op;
+    expr::oper_t _op;
 public:
    column_condition(const column_definition& column, ::shared_ptr<term> collection_element,
        ::shared_ptr<term> value, std::vector<::shared_ptr<term>> in_values,
-        std::unique_ptr<like_matcher> matcher, const operator_type& op)
+        std::unique_ptr<like_matcher> matcher, expr::oper_t op)
            : column(column)
            , _collection_element(std::move(collection_element))
            , _value(std::move(value))
@@ -79,7 +79,7 @@ public:
            , _matcher(std::move(matcher))
            , _op(op)
    {
-        if (op != operator_type::IN) {
+        if (op != expr::oper_t::IN) {
            assert(_in_values.empty());
        }
    }
@@ -91,8 +91,6 @@ public:
     */
    void collect_marker_specificaton(variable_specifications& bound_names) const;

-    bool uses_function(const sstring& ks_name, const sstring& function_name) const;
-
    // Retrieve parameter marker values, if any, find the appropriate collection
    // element if the cell is a collection and an element access is used in the expression,
    // and evaluate the condition.
@@ -105,7 +103,7 @@ public:
     * "IF col LIKE <pattern>"
     */
    static lw_shared_ptr<column_condition> condition(const column_definition& def, ::shared_ptr<term> collection_element,
-            ::shared_ptr<term> value, std::unique_ptr<like_matcher> matcher, const operator_type& op) {
+            ::shared_ptr<term> value, std::unique_ptr<like_matcher> matcher, expr::oper_t op) {
        return make_lw_shared<column_condition>(def, std::move(collection_element), std::move(value),
            std::vector<::shared_ptr<term>>{}, std::move(matcher), op);
    }
@@ -114,7 +112,7 @@ public:
    static lw_shared_ptr<column_condition> in_condition(const column_definition& def, ::shared_ptr<term> collection_element,
            ::shared_ptr<term> in_marker, std::vector<::shared_ptr<term>> in_values) {
        return make_lw_shared<column_condition>(def, std::move(collection_element), std::move(in_marker),
-            std::move(in_values), nullptr, operator_type::IN);
+            std::move(in_values), nullptr, expr::oper_t::IN);
    }

    class raw final {
@@ -125,13 +123,13 @@ public:

        // Can be nullptr, used with the syntax "IF m[e] = ..." (in which case it's 'e')
        ::shared_ptr<term::raw> _collection_element;
-        const operator_type& _op;
+        expr::oper_t _op;
    public:
        raw(::shared_ptr<term::raw> value,
            std::vector<::shared_ptr<term::raw>> in_values,
            ::shared_ptr<abstract_marker::in_raw> in_marker,
            ::shared_ptr<term::raw> collection_element,
-            const operator_type& op)
+            expr::oper_t op)
                : _value(std::move(value))
                , _in_values(std::move(in_values))
                , _in_marker(std::move(in_marker))
@@ -147,7 +145,7 @@ public:
         * "IF col LIKE 'foo%'"
         */
        static lw_shared_ptr<raw> simple_condition(::shared_ptr<term::raw> value, ::shared_ptr<term::raw> collection_element,
-                const operator_type& op) {
+                expr::oper_t op) {
            return make_lw_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{},
                    ::shared_ptr<abstract_marker::in_raw>{}, std::move(collection_element), op);
        }
@@ -163,7 +161,7 @@ public:
        static lw_shared_ptr<raw> in_condition(::shared_ptr<term::raw> collection_element,
                ::shared_ptr<abstract_marker::in_raw> in_marker, std::vector<::shared_ptr<term::raw>> in_values) {
            return make_lw_shared<raw>(::shared_ptr<term::raw>{}, std::move(in_values), std::move(in_marker),
-                    std::move(collection_element), operator_type::IN);
+                    std::move(collection_element), expr::oper_t::IN);
        }

        lw_shared_ptr<column_condition> prepare(database& db, const sstring& keyspace, const column_definition& receiver) const;
--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -46,6 +46,7 @@
 #include "cql3/operation.hh"
 #include "cql3/values.hh"
 #include "cql3/term.hh"
+#include "mutation.hh"
 #include <seastar/core/shared_ptr.hh>

 namespace cql3 {
@@ -184,15 +185,19 @@ public:
                }
                return value;
            } catch (const marshal_exception& e) {
-                throw exceptions::invalid_request_exception(e.what());
+                throw exceptions::invalid_request_exception(
+                        format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
            }
        }

        virtual ::shared_ptr<terminal> bind(const query_options& options) override {
            auto bytes = bind_and_get(options);
-            if (!bytes) {
+            if (bytes.is_null()) {
                return ::shared_ptr<terminal>{};
            }
+            if (bytes.is_unset_value()) {
+                return UNSET_VALUE;
+            }
            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };
--- a/cql3/cql3_type.cc
+++ b/cql3/cql3_type.cc
@@ -348,22 +348,22 @@ cql3_type::raw::user_type(ut_name name) {

 shared_ptr<cql3_type::raw>
 cql3_type::raw::map(shared_ptr<raw> t1, shared_ptr<raw> t2) {
-    return make_shared(raw_collection(abstract_type::kind::map, std::move(t1), std::move(t2)));
+    return ::make_shared<raw_collection>(abstract_type::kind::map, std::move(t1), std::move(t2));
 }

 shared_ptr<cql3_type::raw>
 cql3_type::raw::list(shared_ptr<raw> t) {
-    return make_shared(raw_collection(abstract_type::kind::list, {}, std::move(t)));
+    return ::make_shared<raw_collection>(abstract_type::kind::list, nullptr, std::move(t));
 }

 shared_ptr<cql3_type::raw>
 cql3_type::raw::set(shared_ptr<raw> t) {
-    return make_shared(raw_collection(abstract_type::kind::set, {}, std::move(t)));
+    return ::make_shared<raw_collection>(abstract_type::kind::set, nullptr, std::move(t));
 }

 shared_ptr<cql3_type::raw>
 cql3_type::raw::tuple(std::vector<shared_ptr<raw>> ts) {
-    return make_shared(raw_tuple(std::move(ts)));
+    return ::make_shared<raw_tuple>(std::move(ts));
 }

 shared_ptr<cql3_type::raw>
--- a/cql3/cql_statement.hh
+++ b/cql3/cql_statement.hh
@@ -101,13 +101,15 @@ public:
    virtual future<::shared_ptr<cql_transport::messages::result_message>>
        execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const = 0;

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;
-
    virtual bool depends_on_keyspace(const sstring& ks_name) const = 0;

    virtual bool depends_on_column_family(const sstring& cf_name) const = 0;

    virtual shared_ptr<const metadata> get_result_metadata() const = 0;
+
+    virtual bool is_conditional() const {
+        return false;
+    }
 };

 class cql_statement_no_metadata : public cql_statement {
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -0,0 +1,933 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "expression.hh"
+
+#include <boost/algorithm/cxx11/all_of.hpp>
+#include <boost/algorithm/cxx11/any_of.hpp>
+#include <boost/range/adaptors.hpp>
+#include <fmt/ostream.h>
+#include <unordered_map>
+
+#include "cql3/constants.hh"
+#include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
+#include "cql3/tuples.hh"
+#include "index/secondary_index_manager.hh"
+#include "types/list.hh"
+#include "types/map.hh"
+#include "types/set.hh"
+#include "utils/like_matcher.hh"
+
+namespace cql3 {
+namespace expr {
+
+using boost::adaptors::filtered;
+using boost::adaptors::transformed;
+
+namespace {
+
+std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
+        const column_definition& cdef,
+        const partition_key& key,
+        const clustering_key_prefix& ckey,
+        const row& cells,
+        gc_clock::time_point now) { // Returns cdef's value for the given keys and cells, or nullopt if there is none.
+    switch (cdef.kind) { // Key columns are read from the keys; every other kind is looked up in cells.
+        case column_kind::partition_key:
+            return atomic_cell_value_view(key.get_component(schema, cdef.component_index()));
+        case column_kind::clustering_key:
+            return atomic_cell_value_view(ckey.get_component(schema, cdef.component_index()));
+        default:
+            auto cell = cells.find_cell(cdef.id);
+            if (!cell) {
+                return std::nullopt; // The row holds no cell for this column.
+            }
+            assert(cdef.is_atomic());
+            auto c = cell->as_atomic_cell(cdef);
+            return c.is_dead(now) ? std::nullopt : std::optional<atomic_cell_value_view>(c.value()); // Dead at `now` means no value.
+    }
+}
+
+using children_t = std::vector<expression>; // conjunction's children.
+
+children_t explode_conjunction(expression e) { // Returns e's children if e is a conjunction, else a one-element vector holding e.
+    return std::visit(overloaded_functor{
+            [] (const conjunction& c) { return c.children; }, // c is const: this is a copy, and std::move() could not make it a move.
+            [&] (const auto&) { return children_t{std::move(e)}; }, // Any non-conjunction is its own sole child.
+        }, e);
+}
+
+using cql3::selection::selection;
+
+/// Serialized values for all types of cells, plus selection (to find a column's index).  The options
+/// used for a subscript term's value are not stored here; they travel in column_value_eval_bag.
+struct row_data_from_partition_slice {
+    const std::vector<bytes>& partition_key;
+    const std::vector<bytes>& clustering_key;
+    const std::vector<bytes_opt>& other_columns;
+    const selection& sel;
+};
+
+/// Data used to derive cell values from a mutation.
+struct row_data_from_mutation {
+    // Trailing underscores keep these member names from clashing with the identically named types.
+    const partition_key& partition_key_;
+    const clustering_key_prefix& clustering_key_;
+    const row& other_columns;
+    const schema& schema_;
+    gc_clock::time_point now; // Passed to do_get_value(), which reports cells dead at this time as having no value.
+};
+
+/// Everything needed to compute column values during restriction evaluation.
+struct column_value_eval_bag {
+    const query_options& options; // For evaluating subscript terms.
+    std::variant<row_data_from_partition_slice, row_data_from_mutation> row_data; // Where the row's cell values are read from.
+};
+
+/// Returns col's value from queried data.  Subscripting a map column whose value is null yields null.
+bytes_opt get_value_from_partition_slice(const column_value& col, row_data_from_partition_slice data, const query_options& options) {
+    auto cdef = col.col;
+    if (col.sub) {
+        auto col_type = static_pointer_cast<const collection_type_impl>(cdef->type);
+        if (!col_type->is_map()) {
+            throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
+        }
+        const bytes_opt& serialized = data.other_columns[data.sel.index_of(*cdef)];
+        if (!serialized) { // A null map has no elements; dereferencing the empty optional would be undefined behaviour.
+            return std::nullopt;
+        }
+        const auto deserialized = cdef->type->deserialize(*serialized);
+        const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
+        const auto key = col.sub->bind_and_get(options);
+        const auto found = with_linearized(*key, [&, key_type = col_type->name_comparator()] (bytes_view key_bv) {
+            return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const auto& element) {
+                return key_type->compare(element.first.serialize_nonnull(), key_bv) == 0;
+            });
+        });
+        return found == data_map.cend() ? bytes_opt() : bytes_opt(found->second.serialize_nonnull());
+    }
+    switch (cdef->kind) {
+    case column_kind::partition_key:
+        return data.partition_key[cdef->id];
+    case column_kind::clustering_key:
+        return data.clustering_key[cdef->id];
+    case column_kind::static_column:
+    case column_kind::regular_column:
+        return data.other_columns[data.sel.index_of(*cdef)];
+    default:
+        throw exceptions::unsupported_operation_exception("Unknown column kind");
+    }
+}
+
+/// Returns col's value from a mutation, or null if do_get_value() finds none.  Note that col.sub is not consulted here.
+bytes_opt get_value_from_mutation(const column_value& col, row_data_from_mutation data) {
+    const auto v = do_get_value(
+            data.schema_, *col.col, data.partition_key_, data.clustering_key_, data.other_columns, data.now);
+    return v ? v->linearize() : bytes_opt();
+}
+
+/// Returns col's value from the fetched data, dispatching on which kind of row data the bag holds.
+bytes_opt get_value(const column_value& col, const column_value_eval_bag& bag) {
+    using std::placeholders::_1;
+    return std::visit(overloaded_functor{ // _1 stands for whichever row_data alternative the bag currently holds.
+            std::bind(get_value_from_mutation, col, _1),
+            std::bind(get_value_from_partition_slice, col, _1, bag.options),
+        }, bag.row_data);
+}
+
+/// Type for comparing results of get_value(): cdef's type, or its underlying type if cdef's type is reversed.
+const abstract_type* get_value_comparator(const column_definition* cdef) {
+    return cdef->type->is_reversed() ? cdef->type->underlying_type().get() : cdef->type.get();
+}
+
+/// Type for comparing results of get_value(): the collection's value comparator if cv is subscripted.
+const abstract_type* get_value_comparator(const column_value& cv) {
+    return cv.sub ? static_pointer_cast<const collection_type_impl>(cv.col->type)->value_comparator().get()
+            : get_value_comparator(cv.col);
+}
+
+/// If t binds (under opts) to a tuple value, returns that value.  Otherwise, null.
+///
+/// Useful for checking binary_operator::rhs, which packs multiple values into a single term when lhs is itself
+/// a tuple.  NOT useful for the IN operator, whose rhs is either a list or tuples::in_value.
+::shared_ptr<tuples::value> get_tuple(term& t, const query_options& opts) {
+    return dynamic_pointer_cast<tuples::value>(t.bind(opts));
+}
+
+/// True iff lhs's value equals rhs.  A null on either side never compares equal.
+bool equal(const bytes_opt& rhs, const column_value& lhs, const column_value_eval_bag& bag) {
+    if (!rhs) {
+        return false;
+    }
+    const auto value = get_value(lhs, bag);
+    if (!value) {
+        return false;
+    }
+    return get_value_comparator(lhs)->equal(*value, *rhs);
+}
+
+/// Convenience overload for term: binds rhs using bag.options, then defers to the bytes_opt overload.
+bool equal(term& rhs, const column_value& lhs, const column_value_eval_bag& bag) {
+    return equal(to_bytes_opt(rhs.bind_and_get(bag.options)), lhs, bag);
+}
+
+/// True iff columns' values equal t.  Throws invalid_request_exception unless t binds to a tuple of columns.size() elements.
+bool equal(term& t, const std::vector<column_value>& columns, const column_value_eval_bag& bag) {
+    const auto tup = get_tuple(t, bag.options);
+    if (!tup) {
+        throw exceptions::invalid_request_exception("multi-column equality has right-hand side that isn't a tuple");
+    }
+    const auto& rhs = tup->get_elements();
+    if (rhs.size() != columns.size()) {
+        throw exceptions::invalid_request_exception(
+                format("tuple equality size mismatch: {} elements on left-hand side, {} on right",
+                       columns.size(), rhs.size()));
+    }
+    return boost::equal(rhs, columns, [&] (const bytes_opt& b, const column_value& lhs) { // Element-wise, via the bytes_opt overload.
+        return equal(b, lhs, bag);
+    });
+}
+
+/// True iff lhs is limited by rhs in the manner prescribed by op.  Throws std::logic_error unless op is LT, LTE, GT, GTE, EQ or NEQ.
+bool limits(bytes_view lhs, oper_t op, bytes_view rhs, const abstract_type& type) {
+    const auto cmp = type.compare(lhs, rhs);
+    switch (op) {
+    case oper_t::LT:
+        return cmp < 0;
+    case oper_t::LTE:
+        return cmp <= 0;
+    case oper_t::GT:
+        return cmp > 0;
+    case oper_t::GTE:
+        return cmp >= 0;
+    case oper_t::EQ:
+        return cmp == 0;
+    case oper_t::NEQ:
+        return cmp != 0;
+    default:
+        throw std::logic_error(format("limits() called on non-compare op {}", op));
+    }
+}
+
+/// True iff the column value is limited by rhs in the manner prescribed by op.  False if either side is null.
+bool limits(const column_value& col, oper_t op, term& rhs, const column_value_eval_bag& bag) {
+    if (!is_slice(op)) { // For EQ or NEQ, use equal().
+        throw std::logic_error("limits() called on non-slice op");
+    }
+    auto lhs = get_value(col, bag);
+    if (!lhs) {
+        return false;
+    }
+    const auto b = to_bytes_opt(rhs.bind_and_get(bag.options));
+    return b ? limits(*lhs, op, *b, *get_value_comparator(col)) : false;
+}
+
+/// True iff the column values are limited by t in the manner prescribed by op.
+///
+/// The comparison is lexicographical: components are compared left to right and the first pair
+/// that differs decides the result.  If all components are equal, only LTE and GTE are satisfied.
+///
+/// Throws std::logic_error if \p op is not a slice operator, and invalid_request_exception if
+/// \p t does not bind to a tuple or the tuple's arity differs from the number of columns.
+/// If any compared component is null on either side, the restriction is not satisfied.
+bool limits(const std::vector<column_value>& columns, const oper_t op, term& t,
+            const column_value_eval_bag& bag) {
+    if (!is_slice(op)) { // For EQ or NEQ, use equal().
+        throw std::logic_error("limits() called on non-slice op");
+    }
+    const auto tup = get_tuple(t, bag.options);
+    if (!tup) {
+        throw exceptions::invalid_request_exception(
+                "multi-column comparison has right-hand side that isn't a tuple");
+    }
+    const auto& rhs = tup->get_elements();
+    if (rhs.size() != columns.size()) {
+        throw exceptions::invalid_request_exception(
+                format("tuple comparison size mismatch: {} elements on left-hand side, {} on right",
+                       columns.size(), rhs.size()));
+    }
+    for (size_t i = 0; i < rhs.size(); ++i) {
+        const auto lhs = get_value(columns[i], bag);
+        if (!lhs || !rhs[i]) {
+            // CQL dictates that columns[i] is a clustering column and non-null, but dereferencing
+            // an empty optional is undefined behaviour, so don't rely on upstream checks.  A
+            // comparison against NULL always fails, as in the single-column overload.
+            return false;
+        }
+        const auto cmp = get_value_comparator(columns[i])->compare(*lhs, *rhs[i]);
+        // If the components aren't equal, then we just learned the LHS/RHS order.
+        if (cmp < 0) {
+            if (op == oper_t::LT || op == oper_t::LTE) {
+                return true;
+            } else if (op == oper_t::GT || op == oper_t::GTE) {
+                return false;
+            } else {
+                throw std::logic_error("Unknown slice operator");
+            }
+        } else if (cmp > 0) {
+            if (op == oper_t::LT || op == oper_t::LTE) {
+                return false;
+            } else if (op == oper_t::GT || op == oper_t::GTE) {
+                return true;
+            } else {
+                throw std::logic_error("Unknown slice operator");
+            }
+        }
+        // Otherwise, we don't know the LHS/RHS order, so check the next component.
+    }
+    // Getting here means LHS == RHS.
+    return op == oper_t::LTE || op == oper_t::GTE;
+}
+
+/// True iff collection (list, set, or map) contains value.
+///
+/// For a set the elements are compared with the collection type's name comparator; for a list or
+/// a map, with its value comparator.  For a map, only the mapped values (not the keys) are
+/// searched.  A null \p value matches any collection.  Throws std::logic_error if the collection
+/// type is none of list, set, or map.
+bool contains(const data_value& collection, const raw_value_view& value) {
+    if (!value) {
+        return true; // Compatible with old code, which skips null terms in value comparisons.
+    }
+    auto col_type = static_pointer_cast<const collection_type_impl>(collection.type());
+    auto&& element_type = col_type->is_set() ? col_type->name_comparator() : col_type->value_comparator();
+    return with_linearized(*value, [&] (bytes_view val) {
+        // Linear search: serializes each element of range and compares it against val.
+        auto exists_in = [&](auto&& range) {
+            auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
+                return element_type->compare(element.serialize_nonnull(), val) == 0;
+            });
+            return found != range.end();
+        };
+        if (col_type->is_list()) {
+            return exists_in(value_cast<list_type_impl::native_type>(collection));
+        } else if (col_type->is_set()) {
+            return exists_in(value_cast<set_type_impl::native_type>(collection));
+        } else if (col_type->is_map()) {
+            auto data_map = value_cast<map_type_impl::native_type>(collection);
+            using entry = std::pair<data_value, data_value>;
+            // Project each map entry onto its mapped value before searching.
+            return exists_in(data_map | transformed([] (const entry& e) { return e.second; }));
+        } else {
+            throw std::logic_error("unsupported collection type in a CONTAINS expression");
+        }
+    });
+}
+
+/// Evaluates `col CONTAINS value` against the row held in \p bag.
+///
+/// Throws unsupported_operation_exception if \p col is subscripted.  A column with no value
+/// contains nothing, so the result is false in that case.
+bool contains(const column_value& col, const raw_value_view& value, const column_value_eval_bag& bag) {
+    if (col.sub) {
+        throw exceptions::unsupported_operation_exception("CONTAINS lhs is subscripted");
+    }
+    const auto serialized_collection = get_value(col, bag);
+    if (!serialized_collection) {
+        return false;
+    }
+    return contains(col.col->type->deserialize(*serialized_collection), value);
+}
+
+/// True iff a column is a map containing \p key.
+///
+/// Throws unsupported_operation_exception if \p col is subscripted.  A null \p key matches
+/// regardless of the column's contents; otherwise a column with no value matches nothing.
+/// Keys are compared with the collection type's name comparator.
+bool contains_key(const column_value& col, cql3::raw_value_view key, const column_value_eval_bag& bag) {
+    if (col.sub) {
+        throw exceptions::unsupported_operation_exception("CONTAINS KEY lhs is subscripted");
+    }
+    if (!key) {
+        return true; // Compatible with old code, which skips null terms in key comparisons.
+    }
+    auto type = col.col->type;
+    const auto collection = get_value(col, bag);
+    if (!collection) {
+        return false;
+    }
+    // NOTE(review): the cast assumes the column is a map; presumably guaranteed by upstream validation.
+    const auto data_map = value_cast<map_type_impl::native_type>(type->deserialize(*collection));
+    auto key_type = static_pointer_cast<const collection_type_impl>(type)->name_comparator();
+    // Linear search over the map entries, comparing each serialized key against the requested one.
+    auto found = with_linearized(*key, [&] (bytes_view k_bv) {
+        using entry = std::pair<data_value, data_value>;
+        return std::find_if(data_map.begin(), data_map.end(), [&] (const entry& element) {
+            return key_type->compare(element.first.serialize_nonnull(), k_bv) == 0;
+        });
+    });
+    return found != data_map.end();
+}
+
+/// Fetches the next cell value from iter and returns its (possibly null) value.
+///
+/// Multi-cell columns are read with next_collection_cell(), all others with next_atomic_cell().
+/// When a cell is present its contents are copied into a freshly allocated bytes object;
+/// when it is absent the result is std::nullopt.
+bytes_opt next_value(query::result_row_view::iterator_type& iter, const column_definition* cdef) {
+    if (cdef->type->is_multi_cell()) {
+        auto cell = iter.next_collection_cell();
+        if (cell) {
+            return cell->with_linearized([] (bytes_view data) {
+                return bytes(data.cbegin(), data.cend());
+            });
+        }
+    } else {
+        auto cell = iter.next_atomic_cell();
+        if (cell) {
+            return cell->value().with_linearized([] (bytes_view data) {
+                return bytes(data.cbegin(), data.cend());
+            });
+        }
+    }
+    return std::nullopt;
+}
+
+/// Collects the values of the static and regular columns named by \p selection.
+///
+/// The result has one slot per selected column, in selection order.  Slots belonging to columns
+/// of any other kind stay null, as do regular-column slots when \p row is null.
+std::vector<bytes_opt> get_non_pk_values(const selection& selection, const query::result_row_view& static_row,
+                                         const query::result_row_view* row) {
+    const auto& cols = selection.get_columns();
+    std::vector<bytes_opt> vals(cols.size());
+    auto static_row_iterator = static_row.iterator();
+    auto row_iterator = row ? std::optional<query::result_row_view::iterator_type>(row->iterator()) : std::nullopt;
+    for (size_t i = 0; i < cols.size(); ++i) {
+        const auto& cdef = cols[i];
+        const auto kind = cdef->kind;
+        if (kind == column_kind::static_column) {
+            vals[i] = next_value(static_row_iterator, cdef);
+        } else if (kind == column_kind::regular_column && row) {
+            // row_iterator is engaged exactly when row is non-null.
+            vals[i] = next_value(*row_iterator, cdef);
+        }
+        // Any other column kind is skipped, leaving its slot null.
+    }
+    return vals;
+}
+
+/// Evaluates `cv LIKE pattern` against the row held in \p bag.
+///
+/// Throws invalid_request_exception if the column is not of a string type.  The result is false
+/// when either the pattern or the column value is null.
+bool like(const column_value& cv, const bytes_opt& pattern, const column_value_eval_bag& bag) {
+    if (!cv.col->type->is_string()) {
+        throw exceptions::invalid_request_exception(
+                format("LIKE is allowed only on string types, which {} is not", cv.col->name_as_text()));
+    }
+    auto value = get_value(cv, bag);
+    if (!pattern || !value) {
+        return false;
+    }
+    // TODO: reuse matchers.
+    return like_matcher(*pattern)(*value);
+}
+
+/// True iff the column value is in the set defined by rhs.
+///
+/// \p rhs must be either a lists::delayed_value (literal IN list) or a lists::marker (bound IN
+/// list); any other term type results in std::logic_error.
+bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag& bag) {
+    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
+    if (auto dv = dynamic_cast<lists::delayed_value*>(&rhs)) {
+        // This is `a IN (1,2,3)`.  RHS elements are themselves terms.
+        return boost::algorithm::any_of(dv->get_elements(), [&] (const ::shared_ptr<term>& t) {
+                return equal(*t, col, bag);
+            });
+    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
+        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
+        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        // NOTE(review): the message uses a printf-style `%s` placeholder -- confirm that
+        // check_not_null() formats it as intended.
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
+        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
+                return equal(b, col, bag);
+            });
+    }
+    throw std::logic_error("unexpected term type in is_one_of(single column)");
+}
+
+/// True iff the tuple of column values is in the set defined by rhs.
+///
+/// \p rhs must be either a lists::delayed_value (literal list of tuples) or a tuples::in_marker
+/// (bound list of tuples); any other term type results in std::logic_error.
+bool is_one_of(const std::vector<column_value>& cvs, term& rhs, const column_value_eval_bag& bag) {
+    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
+    if (auto dv = dynamic_cast<lists::delayed_value*>(&rhs)) {
+        // This is `(a,b) IN ((1,1),(2,2),(3,3))`.  RHS elements are themselves terms.
+        return boost::algorithm::any_of(dv->get_elements(), [&] (const ::shared_ptr<term>& t) {
+                return equal(*t, cvs, bag);
+            });
+    } else if (auto mkr = dynamic_cast<tuples::in_marker*>(&rhs)) {
+        // This is `(a,b) IN ?`.  RHS elements are themselves tuples, represented as vector<bytes_opt>.
+        const auto marker_value = static_pointer_cast<tuples::in_value>(mkr->bind(bag.options));
+        // NOTE(review): unlike the single-column overload, marker_value is not null-checked here.
+        return boost::algorithm::any_of(marker_value->get_split_values(), [&] (const std::vector<bytes_opt>& el) {
+                // A tuple matches when it has as many elements as cvs and all of them are equal pairwise.
+                return boost::equal(cvs, el, [&] (const column_value& c, const bytes_opt& b) {
+                    return equal(b, c, bag);
+                });
+            });
+    }
+    throw std::logic_error("unexpected term type in is_one_of(multi-column)");
+}
+
+/// Tells whether operator \p op constrains the \p bnd side of a range.
+///
+/// GT/GTE constrain the start, LT/LTE constrain the end, and EQ constrains both.  Every other
+/// operator constrains neither.
+bool matches(oper_t op, statements::bound bnd) {
+    if (op == oper_t::GT || op == oper_t::GTE) {
+        return is_start(bnd); // These set a lower bound.
+    }
+    if (op == oper_t::LT || op == oper_t::LTE) {
+        return is_end(bnd); // These set an upper bound.
+    }
+    return op == oper_t::EQ; // Bounds from both sides.
+}
+
+/// The value set with no members: an empty value_list.
+const value_set empty_value_set = value_list{};
+/// The value set with no restriction: a range open-ended on both sides.
+const value_set unbounded_value_set = nonwrapping_range<bytes>::make_open_ended_both_sides();
+
+/// Visitor computing the intersection of two value_sets, one overload per combination of
+/// alternatives (list/list, range/list, list/range, range/range).  Values are ordered by \c type.
+struct intersection_visitor {
+    const abstract_type* type;
+    /// List with list: the elements common to both.
+    /// NOTE(review): boost::set_intersection expects both inputs sorted by the given comparator;
+    /// presumably every value_list reaching here comes out of to_sorted_vector -- confirm.
+    value_set operator()(const value_list& a, const value_list& b) const {
+        value_list common;
+        common.reserve(std::max(a.size(), b.size()));
+        boost::set_intersection(a, b, back_inserter(common), type->as_less_comparator());
+        return std::move(common);
+    }
+
+    /// Range with list: the list elements that fall inside the range.
+    value_set operator()(const nonwrapping_range<bytes>& a, const value_list& b) const {
+        const auto common = b | filtered([&] (const bytes& el) { return a.contains(el, type->as_tri_comparator()); });
+        return value_list(common.begin(), common.end());
+    }
+
+    /// List with range: same as above with the arguments swapped.
+    value_set operator()(const value_list& a, const nonwrapping_range<bytes>& b) const {
+        return (*this)(b, a);
+    }
+
+    /// Range with range: the overlapping range, or the empty set if they don't overlap.
+    value_set operator()(const nonwrapping_range<bytes>& a, const nonwrapping_range<bytes>& b) const {
+        const auto common_range = a.intersection(b, type->as_tri_comparator());
+        return common_range ? *common_range : empty_value_set;
+    }
+};
+
+/// Returns the values present in both \p a and \p b, ordering them according to \p type.
+value_set intersection(value_set a, value_set b, const abstract_type* type) {
+    const intersection_visitor visitor{type};
+    return std::visit(visitor, std::move(a), std::move(b));
+}
+
+/// True iff the row held in \p bag satisfies the single restriction \p opr.
+///
+/// Dispatches on the kind of left-hand side (single column, multiple columns, or token) and
+/// then on the operator.  Throws unsupported_operation_exception for an operator that the
+/// given kind of left-hand side doesn't handle.
+bool is_satisfied_by(const binary_operator& opr, const column_value_eval_bag& bag) {
+    return std::visit(overloaded_functor{
+            // Single-column restriction.
+            [&] (const column_value& col) {
+                if (opr.op == oper_t::EQ) {
+                    return equal(*opr.rhs, col, bag);
+                } else if (opr.op == oper_t::NEQ) {
+                    return !equal(*opr.rhs, col, bag);
+                } else if (is_slice(opr.op)) {
+                    return limits(col, opr.op, *opr.rhs, bag);
+                } else if (opr.op == oper_t::CONTAINS) {
+                    return contains(col, opr.rhs->bind_and_get(bag.options), bag);
+                } else if (opr.op == oper_t::CONTAINS_KEY) {
+                    return contains_key(col, opr.rhs->bind_and_get(bag.options), bag);
+                } else if (opr.op == oper_t::LIKE) {
+                    return like(col, to_bytes_opt(opr.rhs->bind_and_get(bag.options)), bag);
+                } else if (opr.op == oper_t::IN) {
+                    return is_one_of(col, *opr.rhs, bag);
+                } else {
+                    throw exceptions::unsupported_operation_exception(format("Unhandled binary_operator: {}", opr));
+                }
+            },
+            // Multi-column restriction: only EQ, slices, and IN are handled.
+            [&] (const std::vector<column_value>& cvs) {
+                if (opr.op == oper_t::EQ) {
+                    return equal(*opr.rhs, cvs, bag);
+                } else if (is_slice(opr.op)) {
+                    return limits(cvs, opr.op, *opr.rhs, bag);
+                } else if (opr.op == oper_t::IN) {
+                    return is_one_of(cvs, *opr.rhs, bag);
+                } else {
+                    throw exceptions::unsupported_operation_exception(
+                            format("Unhandled multi-column binary_operator: {}", opr));
+                }
+            },
+            [] (const token& tok) -> bool {
+                // The RHS value was already used to ensure we fetch only rows in the specified
+                // token range.  It is impossible for any fetched row not to match now.
+                return true;
+            },
+        }, opr.lhs);
+}
+
+/// True iff the row held in \p bag satisfies \p restr.
+///
+/// A Boolean constant evaluates to itself, a conjunction is satisfied when all of its children
+/// are, and a binary operator is evaluated by the binary_operator overload.
+bool is_satisfied_by(const expression& restr, const column_value_eval_bag& bag) {
+    return std::visit(overloaded_functor{
+            [&] (bool v) { return v; },
+            [&] (const conjunction& conj) {
+                return boost::algorithm::all_of(conj.children, [&] (const expression& c) {
+                    return is_satisfied_by(c, bag);
+                });
+            },
+            [&] (const binary_operator& opr) { return is_satisfied_by(opr, bag); },
+        }, restr);
+}
+
+/// Binds t, which must yield a tuple, and returns the tuple's k-th element (possibly null).
+/// Throws std::logic_error if the bound value is not a tuple.
+/// NOTE(review): k is not range-checked against the tuple's size; presumably ensured by callers.
+bytes_opt get_kth(size_t k, const query_options& options, const ::shared_ptr<term>& t) {
+    auto bound = t->bind(options);
+    if (auto tup = dynamic_pointer_cast<tuples::value>(bound)) {
+        return tup->get_elements()[k];
+    } else {
+        throw std::logic_error("non-tuple RHS for multi-column IN");
+    }
+}
+
+/// Copies the elements of \p r into a value_list, sorts them with \p comparator, and drops
+/// adjacent duplicates.  boost::unique is called without a predicate, so duplicates are
+/// detected with the elements' operator== rather than with \p comparator.
+template<typename Range>
+value_list to_sorted_vector(Range r, const serialized_compare& comparator) {
+    BOOST_CONCEPT_ASSERT((boost::ForwardRangeConcept<Range>));
+    value_list tmp(r.begin(), r.end()); // Need random-access range to sort (r is not necessarily random-access).
+    const auto unique = boost::unique(boost::sort(tmp, comparator));
+    return value_list(unique.begin(), unique.end());
+}
+
+/// Range adaptor that keeps only the bytes_opt elements holding a value.
+const auto non_null = boost::adaptors::filtered([] (const bytes_opt& b) { return b.has_value(); });
+
+/// Range adaptor that unwraps each bytes_opt element; value() throws on an empty optional, so
+/// apply it after non_null.
+const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return b.value(); });
+
+/// Returns possible values from t, which must be RHS of IN.
+///
+/// The result is sorted by \p comparator and free of adjacent duplicates; null elements are
+/// dropped.  For a bind marker, throws invalid_request_exception when the bound value is unset
+/// (naming \p column_name) and rejects a null bound list via check_not_null().  Throws
+/// std::logic_error when \p t is neither a lists::delayed_value nor a lists::marker.
+value_list get_IN_values(
+        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
+        sstring_view column_name) {
+    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
+    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
+        // Case `a IN (1,2,3)`.
+        const auto result_range = dv->get_elements()
+                | boost::adaptors::transformed([&] (const ::shared_ptr<term>& t) { return to_bytes_opt(t->bind_and_get(options)); })
+                | non_null | deref;
+        return to_sorted_vector(std::move(result_range), comparator);
+    } else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
+        // Case `a IN ?`.  Collect all list-element values.
+        const auto val = mkr->bind(options);
+        if (val == constants::UNSET_VALUE) {
+            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
+        }
+        // NOTE(review): the message says "IN tuple" although this is the single-column case.
+        statements::request_validations::check_not_null(val, "Invalid null value for IN tuple");
+        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
+    }
+    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
+}
+
+/// Returns possible values for k-th column from t, which must be RHS of IN.
+///
+/// The result is sorted by \p comparator and free of adjacent duplicates; null elements are
+/// dropped.  Throws std::logic_error when \p t is neither a lists::delayed_value nor a
+/// tuples::in_marker.
+value_list get_IN_values(const ::shared_ptr<term>& t, size_t k, const query_options& options,
+                         const serialized_compare& comparator) {
+    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
+    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
+        // Case `(a,b) in ((1,1),(2,2),(3,3))`.  Get kth value from each term element.
+        const auto result_range = dv->get_elements()
+                | boost::adaptors::transformed(std::bind_front(get_kth, k, options)) | non_null | deref;
+        return to_sorted_vector(std::move(result_range), comparator);
+    } else if (auto mkr = dynamic_pointer_cast<tuples::in_marker>(t)) {
+        // Case `(a,b) IN ?`.  Get kth value from each vector<bytes> element.
+        // NOTE(review): the bound value is dereferenced without a null check -- confirm it can't be null here.
+        const auto val = static_pointer_cast<tuples::in_value>(mkr->bind(options));
+        const auto split_values = val->get_split_values(); // Need lvalue from which to make std::view.
+        const auto result_range = split_values
+                | boost::adaptors::transformed([k] (const std::vector<bytes_opt>& v) { return v[k]; }) | non_null | deref;
+        return to_sorted_vector(std::move(result_range), comparator);
+    }
+    throw std::logic_error(format("get_IN_values(multi-column) on invalid term {}", *t));
+}
+
+/// Readable names for the Boolean inclusiveness argument passed to range_bound below.
+static constexpr bool inclusive = true, exclusive = false;
+
+/// Builds the half-open-ended range of all X satisfying `X op val`.
+///
+/// GT/GTE produce a range starting at \p val, LT/LTE a range ending at it; the bound is
+/// inclusive for GTE and LTE.  Any other operator results in std::logic_error.
+nonwrapping_range<bytes> to_range(oper_t op, const bytes& val) {
+    using range = nonwrapping_range<bytes>;
+    if (op == oper_t::GT) {
+        return range::make_starting_with(range_bound(val, exclusive));
+    }
+    if (op == oper_t::GTE) {
+        return range::make_starting_with(range_bound(val, inclusive));
+    }
+    if (op == oper_t::LT) {
+        return range::make_ending_with(range_bound(val, exclusive));
+    }
+    if (op == oper_t::LTE) {
+        return range::make_ending_with(range_bound(val, inclusive));
+    }
+    throw std::logic_error(format("to_range: unknown comparison operator {}", op));
+}
+
+} // anonymous namespace
+
+/// Builds a single conjunction out of \p a and \p b: both are broken up with
+/// explode_conjunction() and the resulting children are concatenated, a's first.
+expression make_conjunction(expression a, expression b) {
+    auto children = explode_conjunction(std::move(a));
+    for (const auto& child : explode_conjunction(std::move(b))) {
+        children.push_back(child);
+    }
+    return conjunction{std::move(children)};
+}
+
+/// True iff \p restr is satisfied by a row taken from a query result.
+///
+/// The static and regular column values are first extracted from \p static_row and \p row
+/// (which may be null) according to \p selection; together with the partition and clustering
+/// key components they form the row data against which \p restr is evaluated.
+bool is_satisfied_by(
+        const expression& restr,
+        const std::vector<bytes>& partition_key, const std::vector<bytes>& clustering_key,
+        const query::result_row_view& static_row, const query::result_row_view* row,
+        const selection& selection, const query_options& options) {
+    const auto regulars = get_non_pk_values(selection, static_row, row);
+    return is_satisfied_by(
+            restr, {options, row_data_from_partition_slice{partition_key, clustering_key, regulars, selection}});
+}
+
+/// True iff \p restr is satisfied by a row taken from a mutation, described by its partition
+/// key, clustering key prefix, and cells.  \p schema and \p now are forwarded as part of the
+/// row data against which \p restr is evaluated.
+bool is_satisfied_by(
+        const expression& restr,
+        const schema& schema, const partition_key& key, const clustering_key_prefix& ckey, const row& cells,
+        const query_options& options, gc_clock::time_point now) {
+    return is_satisfied_by(restr, {options, row_data_from_mutation{key, ckey, cells, schema, now}});
+}
+
+/// Finds the first multi-column binary operator in \p restr whose operator sets a bound of kind
+/// \p bnd, binds its right-hand side, and returns the resulting tuple elements.  Returns an
+/// empty vector when \p restr has no such operator.
+std::vector<bytes_opt> first_multicolumn_bound(
+        const expression& restr, const query_options& options, statements::bound bnd) {
+    const auto is_multicolumn_bound = [bnd] (const binary_operator& oper) {
+        return matches(oper.op, bnd) && std::holds_alternative<std::vector<column_value>>(oper.lhs);
+    };
+    auto found = find_atom(restr, is_multicolumn_bound);
+    if (!found) {
+        return std::vector<bytes_opt>{};
+    }
+    return static_pointer_cast<tuples::value>(found->rhs->bind(options))->get_elements();
+}
+
+/// Computes the set of values that column \p cdef may take in a row satisfying \p expr.
+///
+/// When \p cdef is null, the result instead describes the possible token values: column
+/// restrictions are then treated as unbounded, and token restrictions are treated as unbounded
+/// whenever \p cdef is non-null.  Restrictions that don't mention \p cdef never narrow the result.
+/// Throws std::logic_error on an operator that the matching branch doesn't handle.
+value_set possible_lhs_values(const column_definition* cdef, const expression& expr, const query_options& options) {
+    // Token values are compared as longs.
+    const auto type = cdef ? get_value_comparator(cdef) : long_type.get();
+    return std::visit(overloaded_functor{
+            [] (bool b) {
+                return b ? unbounded_value_set : empty_value_set;
+            },
+            [&] (const conjunction& conj) {
+                // A conjunction allows only the values that every child allows.
+                return boost::accumulate(conj.children, unbounded_value_set,
+                        [&] (const value_set& acc, const expression& child) {
+                            return intersection(
+                                    std::move(acc), possible_lhs_values(cdef, child, options), type);
+                        });
+            },
+            [&] (const binary_operator& oper) -> value_set {
+                return std::visit(overloaded_functor{
+                        [&] (const column_value& col) -> value_set {
+                            if (!cdef || cdef != col.col) {
+                                return unbounded_value_set;
+                            }
+                            if (is_compare(oper.op)) {
+                                const auto val = to_bytes_opt(oper.rhs->bind_and_get(options));
+                                if (!val) {
+                                    return empty_value_set; // All NULL comparisons fail; no column values match.
+                                }
+                                return oper.op == oper_t::EQ ? value_set(value_list{*val})
+                                        : to_range(oper.op, *val);
+                            } else if (oper.op == oper_t::IN) {
+                                return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
+                            }
+                            throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
+                        },
+                        [&] (const std::vector<column_value>& cvs) -> value_set {
+                            if (!cdef) {
+                                return unbounded_value_set;
+                            }
+                            const auto found = boost::find_if(
+                                    cvs, [&] (const column_value& c) { return c.col == cdef; });
+                            if (found == cvs.end()) {
+                                return unbounded_value_set;
+                            }
+                            const auto column_index_on_lhs = std::distance(cvs.begin(), found);
+                            if (is_compare(oper.op)) {
+                                // RHS must be a tuple due to upstream checks.
+                                bytes_opt val = get_tuple(*oper.rhs, options)->get_elements()[column_index_on_lhs];
+                                if (!val) {
+                                    return empty_value_set; // All NULL comparisons fail; no column values match.
+                                }
+                                if (oper.op == oper_t::EQ) {
+                                    return value_list{*val};
+                                }
+                                if (column_index_on_lhs > 0) {
+                                    // A multi-column comparison restricts only the first column, because
+                                    // comparison is lexicographical.
+                                    return unbounded_value_set;
+                                }
+                                return to_range(oper.op, *val);
+                            } else if (oper.op == oper_t::IN) {
+                                return get_IN_values(oper.rhs, column_index_on_lhs, options, type->as_less_comparator());
+                            }
+                            return unbounded_value_set;
+                        },
+                        [&] (token) -> value_set {
+                            if (cdef) {
+                                return unbounded_value_set;
+                            }
+                            const auto val = to_bytes_opt(oper.rhs->bind_and_get(options));
+                            if (!val) {
+                                return empty_value_set; // All NULL comparisons fail; no token values match.
+                            }
+                            if (oper.op == oper_t::EQ) {
+                                return value_list{*val};
+                            } else if (oper.op == oper_t::GT) {
+                                return nonwrapping_range<bytes>::make_starting_with(range_bound(*val, exclusive));
+                            } else if (oper.op == oper_t::GTE) {
+                                return nonwrapping_range<bytes>::make_starting_with(range_bound(*val, inclusive));
+                            }
+                            static const bytes MININT = serialized(std::numeric_limits<int64_t>::min()),
+                                    MAXINT = serialized(std::numeric_limits<int64_t>::max());
+                            // Undocumented feature: when the user types `token(...) < MININT`, we interpret
+                            // that as MAXINT for some reason.
+                            // NOTE(review): MAXINT is already serialized above, so serialized(MAXINT)
+                            // serializes it a second time -- confirm this is a no-op for bytes.
+                            const auto adjusted_val = (*val == MININT) ? serialized(MAXINT) : *val;
+                            if (oper.op == oper_t::LT) {
+                                return nonwrapping_range<bytes>::make_ending_with(range_bound(adjusted_val, exclusive));
+                            } else if (oper.op == oper_t::LTE) {
+                                return nonwrapping_range<bytes>::make_ending_with(range_bound(adjusted_val, inclusive));
+                            }
+                            throw std::logic_error(format("get_token_interval invalid operator {}", oper.op));
+                        },
+                    }, oper.lhs);
+            },
+        }, expr);
+}
+
+/// Converts \p s to a range.  A range is returned as is; a list must hold exactly one value,
+/// which becomes a singular range.  Throws std::logic_error for a list of any other size.
+nonwrapping_range<bytes> to_range(const value_set& s) {
+    return std::visit(overloaded_functor{
+            [] (const nonwrapping_range<bytes>& r) { return r; },
+            [] (const value_list& lst) {
+                if (lst.size() != 1) {
+                    throw std::logic_error(format("to_range called on list of size {}", lst.size()));
+                }
+                return nonwrapping_range<bytes>::make_singular(lst[0]);
+            },
+        }, s);
+}
+
+/// True iff index \p idx can serve \p expr.
+///
+/// A conjunction is supported when all of its children are.  A single-column restriction is
+/// supported when \p idx supports its column and operator; a multi-column one when \p idx
+/// supports the operator on at least one of its columns.  Token restrictions and all other
+/// expressions (such as Boolean constants) are not supported.
+bool is_supported_by(const expression& expr, const secondary_index::index& idx) {
+    using std::placeholders::_1;
+    return std::visit(overloaded_functor{
+            [&] (const conjunction& conj) {
+                return boost::algorithm::all_of(conj.children, std::bind(is_supported_by, _1, idx));
+            },
+            [&] (const binary_operator& oper) {
+                return std::visit(overloaded_functor{
+                        [&] (const column_value& col) {
+                            return idx.supports_expression(*col.col, oper.op);
+                        },
+                        [&] (const std::vector<column_value>& cvs) {
+                            return boost::algorithm::any_of(cvs, [&] (const column_value& c) {
+                                return idx.supports_expression(*c.col, oper.op);
+                            });
+                        },
+                        [&] (const token&) { return false; },
+                    }, oper.lhs);
+            },
+            [] (const auto& default_case) { return false; }
+        }, expr);
+}
+
+/// True iff at least one index listed by \p index_manager supports \p expr (see
+/// is_supported_by()).  Unless \p allow_local is set, local indexes are left out of the search.
+bool has_supporting_index(
+        const expression& expr,
+        const secondary_index::secondary_index_manager& index_manager,
+        allow_local_index allow_local) {
+    const auto indexes = index_manager.list_indexes();
+    const auto support = std::bind(is_supported_by, expr, std::placeholders::_1);
+    return allow_local ? boost::algorithm::any_of(indexes, support)
+            : boost::algorithm::any_of(
+                    indexes | filtered([] (const secondary_index::index& i) { return !i.metadata().local(); }),
+                    support);
+}
+
+/// Prints the column, followed by its subscript in square brackets when there is one.
+std::ostream& operator<<(std::ostream& os, const column_value& cv) {
+    os << *cv.col;
+    if (!cv.sub) {
+        return os;
+    }
+    return os << '[' << *cv.sub << ']';
+}
+
+/// Prints \p expr in a human-readable form: TRUE/FALSE for a Boolean constant, the
+/// parenthesized children joined by AND for a conjunction, and `lhs op rhs` for a binary
+/// operator, where a token LHS is printed simply as TOKEN.
+std::ostream& operator<<(std::ostream& os, const expression& expr) {
+    std::visit(overloaded_functor{
+            [&] (bool b) { os << (b ? "TRUE" : "FALSE"); },
+            [&] (const conjunction& conj) { fmt::print(os, "({})", fmt::join(conj.children, ") AND (")); },
+            [&] (const binary_operator& opr) {
+                std::visit(overloaded_functor{
+                        [&] (const token& t) { os << "TOKEN"; },
+                        [&] (const column_value& col) {
+                            fmt::print(os, "({})", col);
+                        },
+                        [&] (const std::vector<column_value>& cvs) {
+                            fmt::print(os, "(({}))", fmt::join(cvs, ","));
+                        },
+                    }, opr.lhs);
+                os << ' ' << opr.op << ' ' << *opr.rhs;
+            },
+        }, expr);
+    return os;
+}
+
+/// Returns the textual form of \p expr, as produced by its fmt formatter.
+sstring to_string(const expression& expr) {
+    return fmt::format("{}", expr);
+}
+
+/// True iff \p b is a CONTAINS or CONTAINS KEY restriction, or a multi-column restriction in
+/// which at least one column is subscripted.
+bool is_on_collection(const binary_operator& b) {
+    const bool is_contains_op = b.op == oper_t::CONTAINS || b.op == oper_t::CONTAINS_KEY;
+    if (is_contains_op) {
+        return true;
+    }
+    const auto multi_column_lhs = std::get_if<std::vector<column_value>>(&b.lhs);
+    if (!multi_column_lhs) {
+        return false;
+    }
+    return boost::algorithm::any_of(*multi_column_lhs, [] (const column_value& v) { return v.sub; });
+}
+
+/// Returns a copy of \p expr in which every single-column restriction has its left-hand side
+/// replaced by a column_value built from \p new_cdef, keeping the operator and right-hand side.
+///
+/// Boolean constants and token restrictions are returned unchanged; conjunctions are processed
+/// child by child.  Throws std::logic_error on a multi-column restriction.
+expression replace_column_def(const expression& expr, const column_definition* new_cdef) {
+    return std::visit(overloaded_functor{
+            [] (bool b){ return expression(b); },
+            [&] (const conjunction& conj) {
+                const auto applied = conj.children | transformed(
+                        std::bind(replace_column_def, std::placeholders::_1, new_cdef));
+                return expression(conjunction{std::vector(applied.begin(), applied.end())});
+            },
+            [&] (const binary_operator& oper) {
+                return std::visit(overloaded_functor{
+                        [&] (const column_value& col) {
+                            return expression(binary_operator{column_value{new_cdef}, oper.op, oper.rhs});
+                        },
+                        [&] (const std::vector<column_value>& cvs) -> expression {
+                            throw std::logic_error(format("replace_column_def invalid LHS: {}", to_string(oper)));
+                        },
+                        [&] (const token&) { return expr; },
+                    }, oper.lhs);
+            },
+        }, expr);
+}
+
+/// Prints the CQL spelling of \p op, for example "<=" or "CONTAINS KEY".
+std::ostream& operator<<(std::ostream& s, oper_t op) {
+    const char* spelling = nullptr;
+    switch (op) {
+    case oper_t::EQ:
+        spelling = "=";
+        break;
+    case oper_t::NEQ:
+        spelling = "!=";
+        break;
+    case oper_t::LT:
+        spelling = "<";
+        break;
+    case oper_t::LTE:
+        spelling = "<=";
+        break;
+    case oper_t::GT:
+        spelling = ">";
+        break;
+    case oper_t::GTE:
+        spelling = ">=";
+        break;
+    case oper_t::IN:
+        spelling = "IN";
+        break;
+    case oper_t::CONTAINS:
+        spelling = "CONTAINS";
+        break;
+    case oper_t::CONTAINS_KEY:
+        spelling = "CONTAINS KEY";
+        break;
+    case oper_t::IS_NOT:
+        spelling = "IS NOT";
+        break;
+    case oper_t::LIKE:
+        spelling = "LIKE";
+        break;
+    }
+    if (!spelling) {
+        // op held a value outside the cases above.
+        __builtin_unreachable();
+    }
+    return s << spelling;
+}
+
+} // namespace expr
+} // namespace cql3
+
+
+/// fmt support for cql3::expr::expression: formats it by way of its operator<<.
+template <>
+struct fmt::formatter<cql3::expr::expression> {
+    /// No format specifiers are supported, so nothing is consumed: per fmt's parse() contract,
+    /// return the end of the parsed range, which is where the spec begins.
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+
+    /// Writes the streamed form of \p expr to the context's output iterator and returns the
+    /// advanced iterator.
+    template <typename FormatContext>
+    auto format(const cql3::expr::expression& expr, FormatContext& ctx) const {
+        std::ostringstream os;
+        os << expr;
+        // Qualified so that the call can't be hijacked or made ambiguous by ADL.
+        return fmt::format_to(ctx.out(), "{}", os.str());
+    }
+};
+
+/// fmt support for cql3::expr::column_value: formats it by way of its operator<<.
+template <>
+struct fmt::formatter<cql3::expr::column_value> {
+    /// No format specifiers are supported, so nothing is consumed: per fmt's parse() contract,
+    /// return the end of the parsed range, which is where the spec begins.
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+
+    /// Writes the streamed form of \p col to the context's output iterator and returns the
+    /// advanced iterator.
+    template <typename FormatContext>
+    auto format(const cql3::expr::column_value& col, FormatContext& ctx) const {
+        std::ostringstream os;
+        os << col;
+        // Qualified so that the call can't be hijacked or made ambiguous by ADL.
+        return fmt::format_to(ctx.out(), "{}", os.str());
+    }
+};
--- a/cql3/expr/expression.hh
+++ b/cql3/expr/expression.hh
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <concepts>
+#include <fmt/core.h>
+#include <numeric>
+#include <ostream>
+#include <seastar/core/shared_ptr.hh>
+#include <utility>
+#include <variant>
+
+#include "bytes.hh"
+#include "cql3/query_options.hh"
+#include "cql3/selection/selection.hh"
+#include "cql3/statements/bound.hh"
+#include "cql3/term.hh"
+#include "database_fwd.hh"
+#include "gc_clock.hh"
+#include "mutation_partition.hh"
+#include "query-result-reader.hh"
+#include "range.hh"
+#include "seastarx.hh"
+#include "utils/overloaded_functor.hh"
+
+namespace secondary_index {
+class index;
+class secondary_index_manager;
+} // namespace secondary_index
+
+namespace cql3 {
+
+namespace expr {
+
+struct allow_local_index_tag {};
+using allow_local_index = bool_class<allow_local_index_tag>;
+
+// Forward-declared with the same class-key (struct) that their definitions later
+// in this header use, so compilers do not warn about mismatched tags.
+struct binary_operator;
+struct conjunction;
+
+/// A restriction expression -- union of all possible restriction types.  bool means a Boolean constant.
+using expression = std::variant<bool, conjunction, binary_operator>;
+
+/// A column, optionally subscripted by a term (eg, c1 or c2['abc']).
+struct column_value {
+    const column_definition* col;
+    ::shared_ptr<term> sub; ///< If present, this LHS is col[sub], otherwise just col.
+    /// Deliberately implicit: allows easy creation of vector<column_value> from
+    /// vector<column_definition*>.  sub is left default-constructed (no subscript).
+    column_value(const column_definition* col) : col(col) {}
+    /// Subscripted form.  The compiler doesn't auto-generate this due to the other
+    /// constructor's existence.  sub is taken by value, so it is moved into the
+    /// member instead of being copied a second time.
+    column_value(const column_definition* col, ::shared_ptr<term> sub) : col(col), sub(std::move(sub)) {}
+};
+
+/// Represents token function on LHS of an operator relation.  No need to list column definitions
+/// here -- token takes exactly the partition key as its argument.
+struct token {};
+
+enum class oper_t { EQ, NEQ, LT, LTE, GTE, GT, IN, CONTAINS, CONTAINS_KEY, IS_NOT, LIKE }; ///< Operator kinds a binary_operator can carry.
+
+/// Operator restriction: LHS op RHS.
+struct binary_operator {
+    std::variant<column_value, std::vector<column_value>, token> lhs; ///< One column, several columns, or the token function.
+    oper_t op; ///< The operator relating lhs to rhs.
+    ::shared_ptr<term> rhs; ///< Right-hand-side term.
+};
+
+/// A conjunction of restrictions.
+struct conjunction {
+    std::vector<expression> children; ///< The restrictions being conjoined.
+};
+
+/// Creates a conjunction of a and b.  If either a or b is itself a conjunction, its children are inserted
+/// directly into the resulting conjunction's children, flattening the expression tree.
+extern expression make_conjunction(expression a, expression b);
+
+extern std::ostream& operator<<(std::ostream&, oper_t);
+
+/// True iff restr is satisfied with respect to the row provided from a partition slice.
+extern bool is_satisfied_by(
+        const expression& restr,
+        const std::vector<bytes>& partition_key, const std::vector<bytes>& clustering_key,
+        const query::result_row_view& static_row, const query::result_row_view* row,
+        const selection::selection&, const query_options&);
+
+/// True iff restr is satisfied with respect to the row provided from a mutation.
+extern bool is_satisfied_by(
+        const expression& restr,
+        const schema& schema, const partition_key& key, const clustering_key_prefix& ckey, const row& cells,
+        const query_options& options, gc_clock::time_point now);
+
+/// Finds the first binary_operator in restr that represents a bound and returns its RHS as a tuple.  If no
+/// such binary_operator exists, returns an empty vector.  The search is depth first.
+extern std::vector<bytes_opt> first_multicolumn_bound(const expression&, const query_options&, statements::bound);
+
+/// A set of discrete values.
+using value_list = std::vector<bytes>; // Sorted and deduped using value comparator.
+
+/// General set of values.  Empty set and single-element sets are always value_list.  nonwrapping_range is
+/// never singular and never has start > end.  Universal set is a nonwrapping_range with both bounds null.
+using value_set = std::variant<value_list, nonwrapping_range<bytes>>;
+
+/// A set of all column values that would satisfy an expression.  If column is null, a set of all token values
+/// that satisfy.
+///
+/// An expression restricts possible values of a column or token:
+/// - `A>5` restricts A from below
+/// - `A>5 AND A>6 AND B<10 AND A=12 AND B>0` restricts A to 12 and B to between 0 and 10
+/// - `A IN (1, 3, 5)` restricts A to 1, 3, or 5
+/// - `A IN (1, 3, 5) AND A>3` restricts A to just 5
+/// - `A=1 AND A<=0` restricts A to an empty list; no value is able to satisfy the expression
+/// - `A>=NULL` also restricts A to an empty list; all comparisons to NULL are false
+/// - an expression without A "restricts" A to unbounded range
+extern value_set possible_lhs_values(const column_definition*, const expression&, const query_options&);
+
+/// Turns value_set into a range, unless it's a multi-valued list (in which case this throws).
+extern nonwrapping_range<bytes> to_range(const value_set&);
+
+/// True iff the index can support the entire expression.
+extern bool is_supported_by(const expression&, const secondary_index::index&);
+
+/// True iff any of the indices from the manager can support the entire expression.  If allow_local, use all
+/// indices; otherwise, use only global indices.
+extern bool has_supporting_index(
+        const expression&, const secondary_index::secondary_index_manager&, allow_local_index allow_local);
+
+extern sstring to_string(const expression&);
+
+extern std::ostream& operator<<(std::ostream&, const column_value&);
+
+extern std::ostream& operator<<(std::ostream&, const expression&);
+
+/// Depth-first search for a binary_operator atom accepted by the predicate f.
+/// Returns a pointer to the first such atom inside e, or null when none matches
+/// (a Boolean constant never matches).
+template<typename Fn>
+requires std::regular_invocable<Fn, const binary_operator&>
+const binary_operator* find_atom(const expression& e, Fn f) {
+    if (const auto* atom = std::get_if<binary_operator>(&e)) {
+        return f(*atom) ? atom : nullptr;
+    }
+    if (const auto* conj = std::get_if<conjunction>(&e)) {
+        // Children are tried in order; the first hit ends the search.
+        for (const expression& child : conj->children) {
+            if (const binary_operator* hit = find_atom(child, f)) {
+                return hit;
+            }
+        }
+    }
+    return nullptr;
+}
+
+/// Returns how many binary_operator atoms in e are accepted by the predicate f.
+/// Conjunctions contribute the sum over their children; Boolean constants contribute 0.
+template<typename Fn>
+requires std::regular_invocable<Fn, const binary_operator&>
+size_t count_if(const expression& e, Fn f) {
+    return std::visit(overloaded_functor{
+            [] (bool) -> size_t { return 0; },
+            [&] (const binary_operator& atom) -> size_t {
+                if (f(atom)) {
+                    return 1;
+                }
+                return 0;
+            },
+            [&] (const conjunction& conj) -> size_t {
+                size_t total = 0;
+                for (const expression& child : conj.children) {
+                    total += count_if(child, f);
+                }
+                return total;
+            },
+        }, e);
+}
+
+/// Returns the first binary_operator atom in e whose operator equals op, or null if there is none.
+inline const binary_operator* find(const expression& e, oper_t op) {
+    const auto has_wanted_op = [&] (const binary_operator& candidate) {
+        return candidate.op == op;
+    };
+    return find_atom(e, has_wanted_op);
+}
+
+/// True for CONTAINS, CONTAINS KEY and LIKE; false for every other operator.
+inline bool needs_filtering(oper_t op) {
+    switch (op) {
+    case oper_t::CONTAINS:
+    case oper_t::CONTAINS_KEY:
+    case oper_t::LIKE:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// Returns the first atom in e whose operator satisfies needs_filtering(), or null if there is none.
+inline auto find_needs_filtering(const expression& e) {
+    const auto filtering_atom = [] (const binary_operator& candidate) {
+        return needs_filtering(candidate.op);
+    };
+    return find_atom(e, filtering_atom);
+}
+
+/// True for the four inequality operators (<, <=, >, >=); false otherwise.
+inline bool is_slice(oper_t op) {
+    switch (op) {
+    case oper_t::LT:
+    case oper_t::LTE:
+    case oper_t::GT:
+    case oper_t::GTE:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/// True iff e contains at least one atom whose operator satisfies is_slice().
+inline bool has_slice(const expression& e) {
+    const auto slice_atom = [] (const binary_operator& candidate) {
+        return is_slice(candidate.op);
+    };
+    return find_atom(e, slice_atom) != nullptr;
+}
+
+/// True for the comparison operators =, !=, <, <=, > and >=; false otherwise.
+inline bool is_compare(oper_t op) {
+    return op == oper_t::EQ || op == oper_t::NEQ
+            || op == oper_t::LT || op == oper_t::LTE
+            || op == oper_t::GT || op == oper_t::GTE;
+}
+
+/// True iff e contains at least one atom whose left-hand side is the token alternative.
+inline bool has_token(const expression& e) {
+    const auto token_atom = [] (const binary_operator& candidate) {
+        return std::holds_alternative<token>(candidate.lhs);
+    };
+    return find_atom(e, token_atom) != nullptr;
+}
+
+/// True iff e contains at least one atom whose operator satisfies is_slice() or needs_filtering().
+inline bool has_slice_or_needs_filtering(const expression& e) {
+    const auto wanted_atom = [] (const binary_operator& candidate) {
+        if (is_slice(candidate.op)) {
+            return true;
+        }
+        return needs_filtering(candidate.op);
+    };
+    return find_atom(e, wanted_atom) != nullptr;
+}
+
+/// True iff binary_operator involves a collection.
+extern bool is_on_collection(const binary_operator&);
+
+/// Replaces every column_definition in an expression with this one.  Throws if any LHS is not a single
+/// column_value.
+extern expression replace_column_def(const expression&, const column_definition*);
+
+/// Maps a bound and its inclusiveness to an inequality operator:
+/// a start bound yields >= (inclusive) or >; any other bound yields <= (inclusive) or <.
+inline oper_t pick_operator(statements::bound b, bool inclusive) {
+    if (is_start(b)) {
+        return inclusive ? oper_t::GTE : oper_t::GT;
+    }
+    return inclusive ? oper_t::LTE : oper_t::LT;
+}
+
+} // namespace expr
+
+} // namespace cql3
+
+/// Required for fmt::join() to work on expression.
+template <> struct fmt::formatter<cql3::expr::expression>;
+
+/// Required for fmt::join() to work on column_value.
+template <> struct fmt::formatter<cql3::expr::column_value>;
--- a/cql3/functions/abstract_function.hh
+++ b/cql3/functions/abstract_function.hh
@@ -91,10 +91,6 @@ public:
            && _return_type == x._return_type;
    }

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
-        return _name.keyspace == ks_name && _name.name == function_name;
-    }
-
    virtual sstring column_name(const std::vector<sstring>& column_names) const override {
        return format("{}({})", _name, join(", ", column_names));
    }
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -52,7 +52,7 @@ using namespace aggregate_fcts;

 namespace {
 class impl_count_function : public aggregate_function::aggregate {
-    int64_t _count;
+    int64_t _count = 0;
 public:
    virtual void reset() override {
        _count = 0;
--- a/cql3/functions/as_json_function.hh
+++ b/cql3/functions/as_json_function.hh
@@ -140,10 +140,6 @@ public:
        os << ") -> " << utf8_type->as_cql3_type().to_string();
    }

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
-        return false;
-    }
-
    virtual sstring column_name(const std::vector<sstring>& column_names) const override {
        return "[json]";
    }
--- a/cql3/functions/castas_fcts.cc
+++ b/cql3/functions/castas_fcts.cc
@@ -207,7 +207,7 @@ castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
    }

    using kind = abstract_type::kind;
-    switch(cast_switch_case_val(to_type->get_kind(), from_type->get_kind())) {
+    switch (cast_switch_case_val(to_type->get_kind(), from_type->get_kind())) {
    case cast_switch_case_val(kind::byte, kind::short_kind):
        return castas_fctn_simple<int8_t, int16_t>;
    case cast_switch_case_val(kind::byte, kind::int32):
--- a/cql3/functions/function.hh
+++ b/cql3/functions/function.hh
@@ -81,7 +81,6 @@ public:
    virtual bool is_aggregate() const = 0;

    virtual void print(std::ostream& os) const = 0;
-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;

    /**
     * Returns the name of the function to use within a ResultSet.
--- a/cql3/functions/function_call.hh
+++ b/cql3/functions/function_call.hh
@@ -56,7 +56,6 @@ public:
    function_call(shared_ptr<scalar_function> fun, std::vector<shared_ptr<term>> terms)
            : _fun(std::move(fun)), _terms(std::move(terms)) {
    }
-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
    virtual void collect_marker_specification(variable_specifications& bound_names) const override;
    virtual shared_ptr<terminal> bind(const query_options& options) override;
    virtual cql3::raw_value_view bind_and_get(const query_options& options) override;
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -62,12 +62,22 @@ bool as_json_function::requires_thread() const { return false; }

 thread_local std::unordered_multimap<function_name, shared_ptr<function>> functions::_declared = init();

-void functions::clear_functions() {
+void functions::clear_functions() noexcept {
    functions::_declared = init();
 }

 std::unordered_multimap<function_name, shared_ptr<function>>
-functions::init() {
+functions::init() noexcept {
+    // It is possible that this function will fail with a
+    // std::bad_alloc causing std::terminate to be called. Since
+    // this is used during initialization, we would have to abort
+    // somehow. We could add a try/catch to print a better error
+    // message before aborting, but that would produce a core file
+    // that has less information in it. Given how unlikely it is that
+    // we will run out of memory this early, having a better core dump
+    // if we do seems like a good trade-off.
+    memory::disable_failure_guard dfg;
+
    std::unordered_multimap<function_name, shared_ptr<function>> ret;
    auto declare = [&ret] (shared_ptr<function> f) { ret.emplace(f->name(), f); };
    declare(aggregate_fcts::make_count_rows_function());
@@ -172,9 +182,9 @@ shared_ptr<function>
 make_from_json_function(database& db, const sstring& keyspace, data_type t) {
    return make_native_scalar_function<true>("fromjson", t, {utf8_type},
            [&db, &keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
-        Json::Value json_value = json::to_json_value(utf8_type->to_string(parameters[0].value()));
+        rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
        bytes_opt parsed_json_value;
-        if (!json_value.isNull()) {
+        if (!json_value.IsNull()) {
            parsed_json_value.emplace(from_json_object(*t, json_value, sf));
        }
        return parsed_json_value;
@@ -414,11 +424,6 @@ functions::type_equals(const std::vector<data_type>& t1, const std::vector<data_
    return t1 == t2;
 }

-bool
-function_call::uses_function(const sstring& ks_name, const sstring& function_name) const {
-    return _fun->uses_function(ks_name, function_name);
-}
-
 void
 function_call::collect_marker_specification(variable_specifications& bound_names) const {
    for (auto&& t : _terms) {
@@ -445,7 +450,7 @@ function_call::bind_and_get(const query_options& options) {
        buffers.push_back(std::move(to_bytes_opt(val)));
    }
    auto result = execute_internal(options.get_cql_serialization_format(), *_fun, std::move(buffers));
-    return options.make_temporary(cql3::raw_value::make_value(result));
+    return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(result));
 }

 bytes_opt
@@ -486,17 +491,17 @@ function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, c

    return visit(*fun->return_type(), make_visitor(
    [&] (const list_type_impl& ltype) -> shared_ptr<terminal> {
-        return make_shared(lists::value::from_serialized(to_buffer(result), ltype, sf));
+        return make_shared<lists::value>(lists::value::from_serialized(to_buffer(result), ltype, sf));
    },
    [&] (const set_type_impl& stype) -> shared_ptr<terminal> {
-        return make_shared(sets::value::from_serialized(to_buffer(result), stype, sf));
+        return make_shared<sets::value>(sets::value::from_serialized(to_buffer(result), stype, sf));
    },
    [&] (const map_type_impl& mtype) -> shared_ptr<terminal> {
-        return make_shared(maps::value::from_serialized(to_buffer(result), mtype, sf));
+        return make_shared<maps::value>(maps::value::from_serialized(to_buffer(result), mtype, sf));
    },
    [&] (const user_type_impl& utype) -> shared_ptr<terminal> {
        // TODO (kbraun): write a test for this case when User Defined Functions are implemented
-        return make_shared(user_types::value::from_serialized(to_buffer(result), utype));
+        return make_shared<user_types::value>(user_types::value::from_serialized(to_buffer(result), utype));
    },
    [&] (const abstract_type& type) -> shared_ptr<terminal> {
        if (type.is_collection()) {
--- a/cql3/functions/functions.hh
+++ b/cql3/functions/functions.hh
@@ -65,7 +65,7 @@ class functions {
    using declared_t = cql3::functions::declared_t;
    static thread_local declared_t _declared;
 private:
-    static std::unordered_multimap<function_name, shared_ptr<function>> init();
+    static std::unordered_multimap<function_name, shared_ptr<function>> init() noexcept;
 public:
    static lw_shared_ptr<column_specification> make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
            const function& fun, size_t i);
@@ -91,7 +91,7 @@ public:
    static boost::iterator_range<declared_t::iterator> find(const function_name& name);
    static declared_t::iterator find_iter(const function_name& name, const std::vector<data_type>& arg_types);
    static shared_ptr<function> find(const function_name& name, const std::vector<data_type>& arg_types);
-    static void clear_functions();
+    static void clear_functions() noexcept;
    static void add_function(shared_ptr<function>);
    static void replace_function(shared_ptr<function>);
    static void remove_function(const function_name& name, const std::vector<data_type>& arg_types);
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -81,7 +81,7 @@ lists::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<col
    if (all_terminal) {
        return value.bind(query_options::DEFAULT);
    } else {
-        return make_shared(std::move(value));
+        return make_shared<delayed_value>(std::move(value));
    }
 }

@@ -236,10 +236,11 @@ lists::marker::bind(const query_options& options) {
        try {
            return with_linearized(*value, [&] (bytes_view v) {
                ltype.validate(v, options.get_cql_serialization_format());
-                return make_shared(value::from_serialized(v, ltype, options.get_cql_serialization_format()));
+                return make_shared<lists::value>(value::from_serialized(v, ltype, options.get_cql_serialization_format()));
            });
        } catch (marshal_exception& e) {
-            throw exceptions::invalid_request_exception(e.what());
+            throw exceptions::invalid_request_exception(
+                    format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
        }
    }
 }
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -106,6 +106,9 @@ public:
        virtual bool contains_bind_marker() const override;
        virtual void collect_marker_specification(variable_specifications& bound_names) const override;
        virtual shared_ptr<terminal> bind(const query_options& options) override;
+        const std::vector<shared_ptr<term>>& get_elements() const {
+            return _elements;
+        }
    };

    /**
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -92,7 +92,7 @@ maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu
    if (all_terminal) {
        return value.bind(query_options::DEFAULT);
    } else {
-        return make_shared(std::move(value));
+        return make_shared<delayed_value>(std::move(value));
    }
 }

@@ -267,9 +267,10 @@ maps::marker::bind(const query_options& options) {
            _receiver->type->validate(value, options.get_cql_serialization_format());
        });
    } catch (marshal_exception& e) {
-        throw exceptions::invalid_request_exception(e.what());
+        throw exceptions::invalid_request_exception(
+                format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
    }
-    return ::make_shared(maps::value::from_serialized(*val, static_cast<const map_type_impl&>(*_receiver->type), options.get_cql_serialization_format()));
+    return ::make_shared<maps::value>(maps::value::from_serialized(*val, static_cast<const map_type_impl&>(*_receiver->type), options.get_cql_serialization_format()));
 }

 void
@@ -304,6 +305,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = _k->bind_and_get(params._options);
    auto value = _t->bind_and_get(params._options);
+    if (value.is_unset_value()) {
+        return;
+    }
+    if (key.is_unset_value() || value.is_unset_value()) {
+        throw invalid_request_exception("Invalid unset map key");
+    }
    if (!key) {
        throw invalid_request_exception("Invalid null map key");
    }
--- a/cql3/multi_column_relation.hh
+++ b/cql3/multi_column_relation.hh
@@ -41,6 +41,7 @@

 #pragma once

+#include "cql3/expr/expression.hh"
 #include "cql3/relation.hh"
 #include "cql3/term.hh"
 #include "cql3/tuples.hh"
@@ -64,8 +65,10 @@ private:
    std::vector<shared_ptr<term::multi_column_raw>> _in_values;
    shared_ptr<tuples::in_raw> _in_marker;

+public:
+
    multi_column_relation(std::vector<shared_ptr<column_identifier::raw>> entities,
-        const operator_type& relation_type, shared_ptr<term::multi_column_raw> values_or_marker,
+        expr::oper_t relation_type, shared_ptr<term::multi_column_raw> values_or_marker,
        std::vector<shared_ptr<term::multi_column_raw>> in_values, shared_ptr<tuples::in_raw> in_marker)
        : relation(relation_type)
        , _entities(std::move(entities))
@@ -73,7 +76,15 @@ private:
        , _in_values(std::move(in_values))
        , _in_marker(std::move(in_marker))
    { }
-public:
+
+    static shared_ptr<multi_column_relation> create_multi_column_relation(
+        std::vector<shared_ptr<column_identifier::raw>> entities, expr::oper_t relation_type,
+        shared_ptr<term::multi_column_raw> values_or_marker, std::vector<shared_ptr<term::multi_column_raw>> in_values,
+        shared_ptr<tuples::in_raw> in_marker) {
+        return ::make_shared<multi_column_relation>(std::move(entities), relation_type, std::move(values_or_marker),
+            std::move(in_values), std::move(in_marker));
+    }
+
    /**
     * Creates a multi-column EQ, LT, LTE, GT, or GTE relation.
     * For example: "SELECT ... WHERE (a, b) > (0, 1)"
@@ -83,9 +94,9 @@ public:
     * @return a new <code>MultiColumnRelation</code> instance
     */
    static shared_ptr<multi_column_relation> create_non_in_relation(std::vector<shared_ptr<column_identifier::raw>> entities,
-                                                                    const operator_type& relation_type, shared_ptr<term::multi_column_raw> values_or_marker) {
-        assert(relation_type != operator_type::IN);
-        return make_shared(multi_column_relation(std::move(entities), relation_type, std::move(values_or_marker), {}, {}));
+                                                                    expr::oper_t relation_type, shared_ptr<term::multi_column_raw> values_or_marker) {
+        assert(relation_type != expr::oper_t::IN);
+        return create_multi_column_relation(std::move(entities), relation_type, std::move(values_or_marker), {}, {});
    }

    /**
@@ -99,14 +110,14 @@ public:
                                                                std::vector<shared_ptr<tuples::literal>> in_values) {
        std::vector<shared_ptr<term::multi_column_raw>> values(in_values.size());
        std::copy(in_values.begin(), in_values.end(), values.begin());
-        return make_shared(multi_column_relation(std::move(entities), operator_type::IN, {}, std::move(values), {}));
+        return create_multi_column_relation(std::move(entities), expr::oper_t::IN, {}, std::move(values), {});
    }

    static shared_ptr<multi_column_relation> create_in_relation(std::vector<shared_ptr<column_identifier::raw>> entities,
                                                                std::vector<shared_ptr<tuples::raw>> in_values) {
        std::vector<shared_ptr<term::multi_column_raw>> values(in_values.size());
        std::copy(in_values.begin(), in_values.end(), values.begin());
-        return make_shared(multi_column_relation(std::move(entities), operator_type::IN, {}, std::move(values), {}));
+        return create_multi_column_relation(std::move(entities), expr::oper_t::IN, {}, std::move(values), {});
    }

    /**
@@ -118,7 +129,7 @@ public:
     */
    static shared_ptr<multi_column_relation> create_single_marker_in_relation(std::vector<shared_ptr<column_identifier::raw>> entities,
                                                                              shared_ptr<tuples::in_raw> in_marker) {
-        return make_shared(multi_column_relation(std::move(entities), operator_type::IN, {}, {}, std::move(in_marker)));
+        return create_multi_column_relation(std::move(entities), expr::oper_t::IN, {}, {}, std::move(in_marker));
    }

    const std::vector<shared_ptr<column_identifier::raw>>& get_entities() const {
@@ -131,7 +142,7 @@ private:
     * @return a Tuples.Literal for non-IN relations or Tuples.Raw marker for a single tuple.
     */
    shared_ptr<term::multi_column_raw> get_value() {
-        return _relation_type == operator_type::IN ? _in_marker : _values_or_marker;
+        return _relation_type == expr::oper_t::IN ? _in_marker : _values_or_marker;
    }
 public:
    virtual bool is_multi_column() const override { return true; }
@@ -197,7 +208,7 @@ protected:
        auto new_entities = boost::copy_range<decltype(_entities)>(_entities | boost::adaptors::transformed([&] (auto&& entity) {
            return *entity == from ? ::make_shared<column_identifier::raw>(to) : entity;
        }));
-        return ::make_shared(multi_column_relation(std::move(new_entities), _relation_type, _values_or_marker, _in_values, _in_marker));
+        return create_multi_column_relation(std::move(new_entities), _relation_type, _values_or_marker, _in_values, _in_marker);
    }

    virtual shared_ptr<term> to_term(const std::vector<lw_shared_ptr<column_specification>>& receivers,
@@ -238,9 +249,7 @@ protected:
            str += !_in_marker ? "?" : tuples::tuple_to_string(_in_values);
            return str;
        }
-        str += sstring(" ") + _relation_type.to_string() + " ";
-        str += _values_or_marker->to_string();
-        return str;
+        return format("{} {} {}", str, _relation_type, _values_or_marker->to_string());
    }
 };

--- a/cql3/operation.hh
+++ b/cql3/operation.hh
@@ -103,10 +103,6 @@ public:
        return params.make_counter_update_cell(delta);
    }

-    virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const {
-        return _t && _t->uses_function(ks_name, function_name);
-    }
-
    virtual bool is_raw_counter_shard_write() const {
        return false;
    }
--- a/Show More
+++ b/Show More