mutation_writer: feed_writer(): handle exceptions from consume_end_of_stream()

Currently the exception handling code of feed_writer() assumes that consume_end_of_stream() doesn't throw. This assumption is false, and an exception from said method can currently lead to an unclean destruction of the writer and reader. Fix by handling exceptions from consume_end_of_stream() too. Closes #10147 (cherry picked from commit 1963d1cc25)
release: prepare for 4.4.9
2022-03-03 10:45:40 +01:00 · 2022-02-16 14:24:54 +02:00 · 2022-02-03 18:40:12 +02:00 · 2022-01-30 20:08:43 +02:00 · 2022-01-30 11:00:21 +02:00 · 2022-01-27 10:27:45 +02:00
5064 changed files with 69149 additions and 29602 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -0,0 +1,87 @@
+# AUTH
+auth/* @elcallio @vladzcloudius
+
+# CACHE
+row_cache* @tgrabiec @haaawk
+*mutation* @tgrabiec @haaawk
+tests/mvcc* @tgrabiec @haaawk
+
+# CDC
+cdc/* @haaawk @kbr- @elcallio @piodul @jul-stas
+test/cql/cdc_* @haaawk @kbr- @elcallio @piodul @jul-stas
+test/boost/cdc_* @haaawk @kbr- @elcallio @piodul @jul-stas
+
+# COMMITLOG / BATCHLOG
+db/commitlog/* @elcallio
+db/batch* @elcallio
+
+# COORDINATOR
+service/storage_proxy* @gleb-cloudius
+
+# COMPACTION
+sstables/compaction* @raphaelsc @nyh
+
+# CQL TRANSPORT LAYER
+transport/* @penberg
+
+# CQL QUERY LANGUAGE
+cql3/* @tgrabiec @penberg @psarna
+
+# COUNTERS
+counters* @haaawk @jul-stas
+tests/counter_test* @haaawk @jul-stas
+
+# GOSSIP
+gms/* @tgrabiec @asias
+
+# DOCKER
+dist/docker/* @penberg
+
+# LSA
+utils/logalloc* @tgrabiec
+
+# MATERIALIZED VIEWS
+db/view/* @nyh @psarna
+cql3/statements/*view* @nyh @psarna
+test/boost/view_* @nyh @psarna
+
+# PACKAGING
+dist/* @syuu1228
+
+# REPAIR
+repair/* @tgrabiec @asias @nyh
+
+# SCHEMA MANAGEMENT
+db/schema_tables* @tgrabiec @nyh
+db/legacy_schema_migrator* @tgrabiec @nyh
+service/migration* @tgrabiec @nyh
+schema* @tgrabiec @nyh
+
+# SECONDARY INDEXES
+db/index/* @nyh @penberg @psarna
+cql3/statements/*index* @nyh @penberg @psarna
+test/boost/*index* @nyh @penberg @psarna
+
+# SSTABLES
+sstables/* @tgrabiec @raphaelsc @nyh
+
+# STREAMING
+streaming/* @tgrabiec @asias
+service/storage_service.* @tgrabiec @asias
+
+# ALTERNATOR
+alternator/* @nyh @psarna
+test/alternator/* @nyh @psarna
+
+# HINTED HANDOFF
+db/hints/* @haaawk @piodul @vladzcloudius
+
+# REDIS
+redis/* @nyh @syuu1228
+redis-test/* @nyh @syuu1228
+
+# READERS
+reader_* @denesb
+querier* @denesb
+test/boost/mutation_reader_test.cc @denesb
+test/boost/querier_cache_test.cc @denesb
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,33 @@
+name: "CI Docs"
+
+on:
+  push:
+    branches:
+    - master
+    paths:
+    - 'docs/**'
+jobs:
+  release:
+    name: Build
+    runs-on: ubuntu-latest
+    env:
+      LATEST_VERSION: master
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Build docs
+      run: |
+        export PATH=$PATH:~/.local/bin
+        cd docs
+        make multiversion
+    - name: Deploy
+      run : ./docs/_utils/deploy.sh
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,8 @@ resources
 .pytest_cache
 /expressions.tokens
 tags
-testlog/*
+testlog
 test/*/*.reject
+.vscode
+docs/_build
+docs/poetry.lock
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -13,8 +13,11 @@
 	path = abseil
 	url = ../abseil-cpp
 [submodule "scylla-jmx"]
-	path = scylla-jmx
+	path = tools/jmx
 	url = ../scylla-jmx
 [submodule "scylla-tools"]
-	path = scylla-tools
+	path = tools/java
 	url = ../scylla-tools-java
+[submodule "scylla-python3"]
+	path = tools/python3
+	url = ../scylla-python3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,5 @@
-##
-## For best results, first compile the project using the Ninja build-system.
-##
+cmake_minimum_required(VERSION 3.18)

-cmake_minimum_required(VERSION 3.7)
 project(scylla)

 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -20,136 +17,740 @@ else()
    set(BUILD_TYPE "release")
 endif()

-if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
-    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
-endif()
-
-# These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
-# Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
-set(SEASTAR_DPDK_INCLUDE_DIRS
-        seastar/dpdk/lib/librte_eal/common/include
-        seastar/dpdk/lib/librte_eal/common/include/generic
-        seastar/dpdk/lib/librte_eal/common/include/x86
-        seastar/dpdk/lib/librte_ether)
-
-find_package(PkgConfig REQUIRED)
-
-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
-pkg_check_modules(SEASTAR seastar)
-
-if(NOT SEASTAR_INCLUDE_DIRS)
-    # Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-    set(SEASTAR_INCLUDE_DIRS "seastar/include")
-endif()
-
-find_package(Boost COMPONENTS filesystem program_options system thread)
-
-##
-## Populate the names of all source and header files in the indicated paths in a designated variable.
-##
-## When RECURSIVE is specified, directories are traversed recursively.
-##
-## Use: scan_scylla_source_directories(VAR my_result_var [RECURSIVE] PATHS [path1 path2 ...])
-##
-function (scan_scylla_source_directories)
-    set(options RECURSIVE)
-    set(oneValueArgs VAR)
-    set(multiValueArgs PATHS)
-    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
-
-    set(globs "")
-
-    foreach (dir ${args_PATHS})
-        list(APPEND globs "${dir}/*.cc" "${dir}/*.hh")
-    endforeach()
-
-    if (args_RECURSIVE)
-        set(glob_kind GLOB_RECURSE)
+function(default_target_arch arch) # Sets the variable named by <arch> in the caller's scope to a default -march value ("" if the processor is not recognized)
+    set(x86_instruction_sets i386 i686 x86_64)
+    if(CMAKE_SYSTEM_PROCESSOR IN_LIST x86_instruction_sets)
+        set(${arch} "westmere" PARENT_SCOPE)
+    elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") # STREQUAL, not EQUAL: EQUAL compares numbers and never matches a processor name
+        set(${arch} "armv8-a+crc+crypto" PARENT_SCOPE)
    else()
-        set(glob_kind GLOB)
+        set(${arch} "" PARENT_SCOPE)
    endif()
+endfunction()
+default_target_arch(target_arch)
+if(target_arch)
+    set(target_arch_flag "-march=${target_arch}")
+endif()

-    file(${glob_kind} var
-            ${globs})
+# Configure Seastar compile options to align with Scylla
+set(Seastar_CXX_FLAGS -fcoroutines ${target_arch_flag} CACHE INTERNAL "" FORCE)
+set(Seastar_CXX_DIALECT gnu++20 CACHE INTERNAL "" FORCE)

-    set(${args_VAR} ${var} PARENT_SCOPE)
+add_subdirectory(seastar)
+add_subdirectory(abseil)
+# Exclude absl::strerror from the default "all" target since it's not
+# used in Scylla build and, moreover, makes use of deprecated glibc APIs,
+# such as sys_nerr, which are not exposed from "stdio.h" since glibc 2.32,
+# which happens to be the case for recent Fedora distribution versions.
+#
+# Need to use the internal "absl_strerror" target name instead of namespaced
+# variant because `set_target_properties` does not understand the latter form,
+# unfortunately.
+set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+# System libraries dependencies
+find_package(Boost COMPONENTS filesystem program_options system thread regex REQUIRED)
+find_package(Lua REQUIRED)
+find_package(ZLIB REQUIRED)
+find_package(ICU COMPONENTS uc REQUIRED)
+
+set(scylla_build_dir "${CMAKE_BINARY_DIR}/build/${BUILD_TYPE}")
+set(scylla_gen_build_dir "${scylla_build_dir}/gen")
+file(MAKE_DIRECTORY "${scylla_build_dir}" "${scylla_gen_build_dir}")
+
+# Place libraries, executables and archives in ${buildroot}/build/${mode}/
+foreach(mode RUNTIME LIBRARY ARCHIVE)
+    set(CMAKE_${mode}_OUTPUT_DIRECTORY "${scylla_build_dir}")
+endforeach()
+
+# Generate C++ source files from thrift definitions
+function(scylla_generate_thrift)
+    set(one_value_args TARGET VAR IN_FILE OUT_DIR SERVICE)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})
+
+    get_filename_component(in_file_name ${args_IN_FILE} NAME_WE)
+
+    set(aux_out_file_name ${args_OUT_DIR}/${in_file_name})
+    set(outputs
+        ${aux_out_file_name}_types.cpp
+        ${aux_out_file_name}_types.h
+        ${aux_out_file_name}_constants.cpp
+        ${aux_out_file_name}_constants.h
+        ${args_OUT_DIR}/${args_SERVICE}.cpp
+        ${args_OUT_DIR}/${args_SERVICE}.h)
+
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+            thrift
+        OUTPUT ${outputs}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${args_OUT_DIR}
+        COMMAND thrift -gen cpp:cob_style,no_skeleton -out "${args_OUT_DIR}" "${args_IN_FILE}")
+
+    add_custom_target(${args_TARGET}
+        DEPENDS ${outputs})
+
+    set(${args_VAR} ${outputs} PARENT_SCOPE)
 endfunction()

-## Although Seastar is an external project, it is common enough to explore the sources while doing
-## Scylla development that we'll treat the Seastar sources as part of this project for easier navigation.
-scan_scylla_source_directories(
-        VAR SEASTAR_SOURCE_FILES
-        RECURSIVE
+scylla_generate_thrift(
+    TARGET scylla_thrift_gen_cassandra
+    VAR scylla_thrift_gen_cassandra_files
+    IN_FILE interface/cassandra.thrift
+    OUT_DIR ${scylla_gen_build_dir}
+    SERVICE Cassandra)

-        PATHS
-          seastar/core
-          seastar/http
-          seastar/json
-          seastar/net
-          seastar/rpc
-          seastar/testing
-          seastar/util)
+# Parse antlr3 grammar files and generate C++ sources
+function(scylla_generate_antlr3)
+    set(one_value_args TARGET VAR IN_FILE OUT_DIR)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})

-scan_scylla_source_directories(
-        VAR SCYLLA_ROOT_SOURCE_FILES
-        PATHS .)
+    get_filename_component(in_file_pure_name ${args_IN_FILE} NAME)
+    get_filename_component(stem ${in_file_pure_name} NAME_WE)

-scan_scylla_source_directories(
-        VAR SCYLLA_SUB_SOURCE_FILES
-        RECURSIVE
+    set(outputs
+        "${args_OUT_DIR}/${stem}Lexer.hpp"
+        "${args_OUT_DIR}/${stem}Lexer.cpp"
+        "${args_OUT_DIR}/${stem}Parser.hpp"
+        "${args_OUT_DIR}/${stem}Parser.cpp")

-        PATHS
-          api
-          auth
-          cql3
-          db
-          dht
-          exceptions
-          gms
-          index
-          io
-          locator
-          message
-          repair
-          service
-          sstables
-          streaming
-          test
-          thrift
-          tracing
-          transport
-          utils)
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+        OUTPUT ${outputs}
+        # Remove #ifdef'ed code from the grammar source code
+        COMMAND sed -e "/^#if 0/,/^#endif/d" "${args_IN_FILE}" > "${args_OUT_DIR}/${in_file_pure_name}"
+        COMMAND antlr3 "${args_OUT_DIR}/${in_file_pure_name}"
+        # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
+        # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
+        # name, we also add a global typedef to avoid compilation errors.
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Lexer.hpp"
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Lexer.cpp"
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Parser.hpp"
+        COMMAND sed -i
+            -e "s/^\\( *\\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$/\\1const \\2/"
+            -e "/^.*On :.*$/d"
+            -e "1i using ExceptionBaseType = int;"
+            -e "s/^{/{ ExceptionBaseType\\* ex = nullptr;/; s/ExceptionBaseType\\* ex = new/ex = new/; s/exceptions::syntax_exception e/exceptions::syntax_exception\\& e/"
+            "${args_OUT_DIR}/${stem}Parser.cpp"
+        VERBATIM)

-scan_scylla_source_directories(
-        VAR SCYLLA_GEN_SOURCE_FILES
-        RECURSIVE
-        PATHS build/${BUILD_TYPE}/gen)
+    add_custom_target(${args_TARGET}
+        DEPENDS ${outputs})

-set(SCYLLA_SOURCE_FILES
-        ${SCYLLA_ROOT_SOURCE_FILES}
-        ${SCYLLA_GEN_SOURCE_FILES}
-        ${SCYLLA_SUB_SOURCE_FILES})
+    set(${args_VAR} ${outputs} PARENT_SCOPE)
+endfunction()
+
+set(antlr3_grammar_files
+    cql3/Cql.g
+    alternator/expressions.g)
+
+set(antlr3_gen_files)
+
+foreach(f ${antlr3_grammar_files})
+    get_filename_component(grammar_file_name "${f}" NAME_WE)
+    get_filename_component(f_dir "${f}" DIRECTORY)
+    scylla_generate_antlr3(
+        TARGET scylla_antlr3_gen_${grammar_file_name}
+        VAR scylla_antlr3_gen_${grammar_file_name}_files
+        IN_FILE ${f}
+        OUT_DIR ${scylla_gen_build_dir}/${f_dir})
+    list(APPEND antlr3_gen_files "${scylla_antlr3_gen_${grammar_file_name}_files}")
+endforeach()
+
+# Generate C++ sources from ragel grammar files
+seastar_generate_ragel(
+    TARGET scylla_ragel_gen_protocol_parser
+    VAR scylla_ragel_gen_protocol_parser_file
+    IN_FILE redis/protocol_parser.rl
+    OUT_FILE ${scylla_gen_build_dir}/redis/protocol_parser.hh)
+
+# Generate C++ sources from Swagger definitions
+set(swagger_files
+    api/api-doc/cache_service.json
+    api/api-doc/collectd.json
+    api/api-doc/column_family.json
+    api/api-doc/commitlog.json
+    api/api-doc/compaction_manager.json
+    api/api-doc/config.json
+    api/api-doc/endpoint_snitch_info.json
+    api/api-doc/error_injection.json
+    api/api-doc/failure_detector.json
+    api/api-doc/gossiper.json
+    api/api-doc/hinted_handoff.json
+    api/api-doc/lsa.json
+    api/api-doc/messaging_service.json
+    api/api-doc/storage_proxy.json
+    api/api-doc/storage_service.json
+    api/api-doc/stream_manager.json
+    api/api-doc/system.json
+    api/api-doc/utils.json)
+
+set(swagger_gen_files)
+
+foreach(f ${swagger_files})
+    get_filename_component(fname "${f}" NAME_WE)
+    get_filename_component(dir "${f}" DIRECTORY)
+    seastar_generate_swagger(
+        TARGET scylla_swagger_gen_${fname}
+        VAR scylla_swagger_gen_${fname}_files
+        IN_FILE "${f}"
+        OUT_DIR "${scylla_gen_build_dir}/${dir}")
+    list(APPEND swagger_gen_files "${scylla_swagger_gen_${fname}_files}")
+endforeach()
+
+# Create C++ bindings for IDL serializers
+function(scylla_generate_idl_serializer)
+    set(one_value_args TARGET VAR IN_FILE OUT_FILE)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})
+    get_filename_component(out_dir ${args_OUT_FILE} DIRECTORY)
+    set(idl_compiler "${CMAKE_SOURCE_DIR}/idl-compiler.py")
+
+    find_package(Python3 COMPONENTS Interpreter)
+
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+            ${idl_compiler}
+        OUTPUT ${args_OUT_FILE}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${out_dir}
+        COMMAND Python3::Interpreter ${idl_compiler} --ns ser -f ${args_IN_FILE} -o ${args_OUT_FILE})
+
+    add_custom_target(${args_TARGET}
+        DEPENDS ${args_OUT_FILE})
+
+    set(${args_VAR} ${args_OUT_FILE} PARENT_SCOPE)
+endfunction()
+
+set(idl_serializers
+    idl/cache_temperature.idl.hh
+    idl/commitlog.idl.hh
+    idl/consistency_level.idl.hh
+    idl/frozen_mutation.idl.hh
+    idl/frozen_schema.idl.hh
+    idl/gossip_digest.idl.hh
+    idl/idl_test.idl.hh
+    idl/keys.idl.hh
+    idl/messaging_service.idl.hh
+    idl/mutation.idl.hh
+    idl/paging_state.idl.hh
+    idl/partition_checksum.idl.hh
+    idl/paxos.idl.hh
+    idl/query.idl.hh
+    idl/range.idl.hh
+    idl/read_command.idl.hh
+    idl/reconcilable_result.idl.hh
+    idl/replay_position.idl.hh
+    idl/result.idl.hh
+    idl/ring_position.idl.hh
+    idl/streaming.idl.hh
+    idl/token.idl.hh
+    idl/tracing.idl.hh
+    idl/truncation_record.idl.hh
+    idl/uuid.idl.hh
+    idl/view.idl.hh)
+
+set(idl_gen_files)
+
+foreach(f ${idl_serializers})
+    get_filename_component(idl_name "${f}" NAME)
+    get_filename_component(idl_target "${idl_name}" NAME_WE)
+    get_filename_component(idl_dir "${f}" DIRECTORY)
+    string(REPLACE ".idl.hh" ".dist.hh" idl_out_hdr_name "${idl_name}")
+    scylla_generate_idl_serializer(
+        TARGET scylla_idl_gen_${idl_target}
+        VAR scylla_idl_gen_${idl_target}_files
+        IN_FILE ${f}
+        OUT_FILE ${scylla_gen_build_dir}/${idl_dir}/${idl_out_hdr_name})
+    list(APPEND idl_gen_files "${scylla_idl_gen_${idl_target}_files}")
+endforeach()
+
+set(scylla_sources
+    absl-flat_hash_map.cc
+    alternator/auth.cc
+    alternator/base64.cc
+    alternator/conditions.cc
+    alternator/executor.cc
+    alternator/expressions.cc
+    alternator/serialization.cc
+    alternator/server.cc
+    alternator/stats.cc
+    alternator/streams.cc
+    api/api.cc
+    api/cache_service.cc
+    api/collectd.cc
+    api/column_family.cc
+    api/commitlog.cc
+    api/compaction_manager.cc
+    api/config.cc
+    api/endpoint_snitch.cc
+    api/error_injection.cc
+    api/failure_detector.cc
+    api/gossiper.cc
+    api/hinted_handoff.cc
+    api/lsa.cc
+    api/messaging_service.cc
+    api/storage_proxy.cc
+    api/storage_service.cc
+    api/stream_manager.cc
+    api/system.cc
+    atomic_cell.cc
+    auth/allow_all_authenticator.cc
+    auth/allow_all_authorizer.cc
+    auth/authenticated_user.cc
+    auth/authentication_options.cc
+    auth/authenticator.cc
+    auth/common.cc
+    auth/default_authorizer.cc
+    auth/password_authenticator.cc
+    auth/passwords.cc
+    auth/permission.cc
+    auth/permissions_cache.cc
+    auth/resource.cc
+    auth/role_or_anonymous.cc
+    auth/roles-metadata.cc
+    auth/sasl_challenge.cc
+    auth/service.cc
+    auth/standard_role_manager.cc
+    auth/transitional.cc
+    bytes.cc
+    canonical_mutation.cc
+    cdc/cdc_partitioner.cc
+    cdc/generation.cc
+    cdc/log.cc
+    cdc/metadata.cc
+    cdc/split.cc
+    clocks-impl.cc
+    collection_mutation.cc
+    compress.cc
+    connection_notifier.cc
+    converting_mutation_partition_applier.cc
+    counters.cc
+    cql3/abstract_marker.cc
+    cql3/attributes.cc
+    cql3/cf_name.cc
+    cql3/column_condition.cc
+    cql3/column_identifier.cc
+    cql3/column_specification.cc
+    cql3/constants.cc
+    cql3/cql3_type.cc
+    cql3/expr/expression.cc
+    cql3/functions/aggregate_fcts.cc
+    cql3/functions/castas_fcts.cc
+    cql3/functions/error_injection_fcts.cc
+    cql3/functions/functions.cc
+    cql3/functions/user_function.cc
+    cql3/index_name.cc
+    cql3/keyspace_element_name.cc
+    cql3/lists.cc
+    cql3/maps.cc
+    cql3/operation.cc
+    cql3/query_options.cc
+    cql3/query_processor.cc
+    cql3/relation.cc
+    cql3/restrictions/statement_restrictions.cc
+    cql3/result_set.cc
+    cql3/role_name.cc
+    cql3/selection/abstract_function_selector.cc
+    cql3/selection/selectable.cc
+    cql3/selection/selection.cc
+    cql3/selection/selector.cc
+    cql3/selection/selector_factories.cc
+    cql3/selection/simple_selector.cc
+    cql3/sets.cc
+    cql3/single_column_relation.cc
+    cql3/statements/alter_keyspace_statement.cc
+    cql3/statements/alter_table_statement.cc
+    cql3/statements/alter_type_statement.cc
+    cql3/statements/alter_view_statement.cc
+    cql3/statements/authentication_statement.cc
+    cql3/statements/authorization_statement.cc
+    cql3/statements/batch_statement.cc
+    cql3/statements/cas_request.cc
+    cql3/statements/cf_prop_defs.cc
+    cql3/statements/cf_statement.cc
+    cql3/statements/create_function_statement.cc
+    cql3/statements/create_index_statement.cc
+    cql3/statements/create_keyspace_statement.cc
+    cql3/statements/create_table_statement.cc
+    cql3/statements/create_type_statement.cc
+    cql3/statements/create_view_statement.cc
+    cql3/statements/delete_statement.cc
+    cql3/statements/drop_function_statement.cc
+    cql3/statements/drop_index_statement.cc
+    cql3/statements/drop_keyspace_statement.cc
+    cql3/statements/drop_table_statement.cc
+    cql3/statements/drop_type_statement.cc
+    cql3/statements/drop_view_statement.cc
+    cql3/statements/function_statement.cc
+    cql3/statements/grant_statement.cc
+    cql3/statements/index_prop_defs.cc
+    cql3/statements/index_target.cc
+    cql3/statements/ks_prop_defs.cc
+    cql3/statements/list_permissions_statement.cc
+    cql3/statements/list_users_statement.cc
+    cql3/statements/modification_statement.cc
+    cql3/statements/permission_altering_statement.cc
+    cql3/statements/property_definitions.cc
+    cql3/statements/raw/parsed_statement.cc
+    cql3/statements/revoke_statement.cc
+    cql3/statements/role-management-statements.cc
+    cql3/statements/schema_altering_statement.cc
+    cql3/statements/select_statement.cc
+    cql3/statements/truncate_statement.cc
+    cql3/statements/update_statement.cc
+    cql3/statements/use_statement.cc
+    cql3/token_relation.cc
+    cql3/tuples.cc
+    cql3/type_json.cc
+    cql3/untyped_result_set.cc
+    cql3/update_parameters.cc
+    cql3/user_types.cc
+    cql3/ut_name.cc
+    cql3/util.cc
+    cql3/values.cc
+    cql3/variable_specifications.cc
+    data/cell.cc
+    database.cc
+    db/batchlog_manager.cc
+    db/commitlog/commitlog.cc
+    db/commitlog/commitlog_entry.cc
+    db/commitlog/commitlog_replayer.cc
+    db/config.cc
+    db/consistency_level.cc
+    db/cql_type_parser.cc
+    db/data_listeners.cc
+    db/extensions.cc
+    db/heat_load_balance.cc
+    db/hints/manager.cc
+    db/hints/resource_manager.cc
+    db/large_data_handler.cc
+    db/legacy_schema_migrator.cc
+    db/marshal/type_parser.cc
+    db/schema_tables.cc
+    db/size_estimates_virtual_reader.cc
+    db/snapshot-ctl.cc
+    db/sstables-format-selector.cc
+    db/system_distributed_keyspace.cc
+    db/system_keyspace.cc
+    db/view/row_locking.cc
+    db/view/view.cc
+    db/view/view_update_generator.cc
+    dht/boot_strapper.cc
+    dht/i_partitioner.cc
+    dht/murmur3_partitioner.cc
+    dht/range_streamer.cc
+    dht/token.cc
+    distributed_loader.cc
+    duration.cc
+    exceptions/exceptions.cc
+    flat_mutation_reader.cc
+    frozen_mutation.cc
+    frozen_schema.cc
+    gms/application_state.cc
+    gms/endpoint_state.cc
+    gms/failure_detector.cc
+    gms/feature_service.cc
+    gms/gossip_digest_ack.cc
+    gms/gossip_digest_ack2.cc
+    gms/gossip_digest_syn.cc
+    gms/gossiper.cc
+    gms/inet_address.cc
+    gms/version_generator.cc
+    gms/versioned_value.cc
+    hashers.cc
+    index/secondary_index.cc
+    index/secondary_index_manager.cc
+    init.cc
+    keys.cc
+    lister.cc
+    locator/abstract_replication_strategy.cc
+    locator/ec2_multi_region_snitch.cc
+    locator/ec2_snitch.cc
+    locator/everywhere_replication_strategy.cc
+    locator/gce_snitch.cc
+    locator/gossiping_property_file_snitch.cc
+    locator/local_strategy.cc
+    locator/network_topology_strategy.cc
+    locator/production_snitch_base.cc
+    locator/rack_inferring_snitch.cc
+    locator/simple_snitch.cc
+    locator/simple_strategy.cc
+    locator/snitch_base.cc
+    locator/token_metadata.cc
+    lua.cc
+    main.cc
+    memtable.cc
+    message/messaging_service.cc
+    multishard_mutation_query.cc
+    mutation.cc
+    raft/fsm.cc
+    raft/log.cc
+    raft/progress.cc
+    raft/raft.cc
+    raft/server.cc
+    mutation_fragment.cc
+    mutation_partition.cc
+    mutation_partition_serializer.cc
+    mutation_partition_view.cc
+    mutation_query.cc
+    mutation_reader.cc
+    mutation_writer/multishard_writer.cc
+    mutation_writer/shard_based_splitting_writer.cc
+    mutation_writer/timestamp_based_splitting_writer.cc
+    mutation_writer/feed_writers.cc
+    partition_slice_builder.cc
+    partition_version.cc
+    querier.cc
+    query-result-set.cc
+    query.cc
+    range_tombstone.cc
+    range_tombstone_list.cc
+    reader_concurrency_semaphore.cc
+    redis/abstract_command.cc
+    redis/command_factory.cc
+    redis/commands.cc
+    redis/keyspace_utils.cc
+    redis/lolwut.cc
+    redis/mutation_utils.cc
+    redis/options.cc
+    redis/query_processor.cc
+    redis/query_utils.cc
+    redis/server.cc
+    redis/service.cc
+    redis/stats.cc
+    repair/repair.cc
+    repair/row_level.cc
+    row_cache.cc
+    schema.cc
+    schema_mutations.cc
+    schema_registry.cc
+    service/client_state.cc
+    service/migration_manager.cc
+    service/migration_task.cc
+    service/misc_services.cc
+    service/pager/paging_state.cc
+    service/pager/query_pagers.cc
+    service/paxos/paxos_state.cc
+    service/paxos/prepare_response.cc
+    service/paxos/prepare_summary.cc
+    service/paxos/proposal.cc
+    service/priority_manager.cc
+    service/storage_proxy.cc
+    service/storage_service.cc
+    sstables/compaction.cc
+    sstables/compaction_manager.cc
+    sstables/compaction_strategy.cc
+    sstables/compress.cc
+    sstables/integrity_checked_file_impl.cc
+    sstables/kl/writer.cc
+    sstables/leveled_compaction_strategy.cc
+    sstables/m_format_read_helpers.cc
+    sstables/metadata_collector.cc
+    sstables/mp_row_consumer.cc
+    sstables/mx/writer.cc
+    sstables/partition.cc
+    sstables/prepended_input_stream.cc
+    sstables/random_access_reader.cc
+    sstables/size_tiered_compaction_strategy.cc
+    sstables/sstable_directory.cc
+    sstables/sstable_version.cc
+    sstables/sstables.cc
+    sstables/sstables_manager.cc
+    sstables/time_window_compaction_strategy.cc
+    sstables/writer.cc
+    streaming/progress_info.cc
+    streaming/session_info.cc
+    streaming/stream_coordinator.cc
+    streaming/stream_manager.cc
+    streaming/stream_plan.cc
+    streaming/stream_reason.cc
+    streaming/stream_receive_task.cc
+    streaming/stream_request.cc
+    streaming/stream_result_future.cc
+    streaming/stream_session.cc
+    streaming/stream_session_state.cc
+    streaming/stream_summary.cc
+    streaming/stream_task.cc
+    streaming/stream_transfer_task.cc
+    table.cc
+    table_helper.cc
+    thrift/controller.cc
+    thrift/handler.cc
+    thrift/server.cc
+    thrift/thrift_validation.cc
+    timeout_config.cc
+    tracing/trace_keyspace_helper.cc
+    tracing/trace_state.cc
+    tracing/traced_file.cc
+    tracing/tracing.cc
+    tracing/tracing_backend_registry.cc
+    transport/controller.cc
+    transport/cql_protocol_extension.cc
+    transport/event.cc
+    transport/event_notifier.cc
+    transport/messages/result_message.cc
+    transport/server.cc
+    types.cc
+    unimplemented.cc
+    utils/UUID_gen.cc
+    utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc
+    utils/array-search.cc
+    utils/ascii.cc
+    utils/big_decimal.cc
+    utils/bloom_calculations.cc
+    utils/bloom_filter.cc
+    utils/buffer_input_stream.cc
+    utils/build_id.cc
+    utils/config_file.cc
+    utils/directories.cc
+    utils/disk-error-handler.cc
+    utils/dynamic_bitset.cc
+    utils/error_injection.cc
+    utils/exceptions.cc
+    utils/file_lock.cc
+    utils/generation-number.cc
+    utils/gz/crc_combine.cc
+    utils/human_readable.cc
+    utils/i_filter.cc
+    utils/large_bitset.cc
+    utils/like_matcher.cc
+    utils/limiting_data_source.cc
+    utils/logalloc.cc
+    utils/managed_bytes.cc
+    utils/multiprecision_int.cc
+    utils/murmur_hash.cc
+    utils/rate_limiter.cc
+    utils/rjson.cc
+    utils/runtime.cc
+    utils/updateable_value.cc
+    utils/utf8.cc
+    utils/uuid.cc
+    validation.cc
+    vint-serialization.cc
+    zstd.cc
+    release.cc)
+
+set(scylla_gen_sources
+    "${scylla_thrift_gen_cassandra_files}"
+    "${scylla_ragel_gen_protocol_parser_file}"
+    "${swagger_gen_files}"
+    "${idl_gen_files}"
+    "${antlr3_gen_files}")

 add_executable(scylla
-        ${SEASTAR_SOURCE_FILES}
-        ${SCYLLA_SOURCE_FILES})
+    ${scylla_sources}
+    ${scylla_gen_sources})

-# If the Seastar pkg-config information is available, append to the default flags.
-#
-# For ease of browsing the source code, we always pretend that DPDK is enabled.
-target_compile_options(scylla PUBLIC
-        -std=gnu++20
-        -DHAVE_DPDK
-        -DHAVE_HWLOC
-        "${SEASTAR_CFLAGS}")
+target_link_libraries(scylla PRIVATE
+    seastar
+    # Boost dependencies
+    Boost::filesystem
+    Boost::program_options
+    Boost::system
+    Boost::thread
+    Boost::regex
+    Boost::headers
+    # Abseil libs
+    absl::hashtablez_sampler
+    absl::raw_hash_set
+    absl::synchronization
+    absl::graphcycles_internal
+    absl::stacktrace
+    absl::symbolize
+    absl::debugging_internal
+    absl::demangle_internal
+    absl::time
+    absl::time_zone
+    absl::int128
+    absl::city
+    absl::hash
+    absl::malloc_internal
+    absl::spinlock_wait
+    absl::base
+    absl::dynamic_annotations
+    absl::raw_logging_internal
+    absl::exponential_biased
+    absl::throw_delegate
+    # System libs
+    ZLIB::ZLIB
+    ICU::uc
+    systemd
+    zstd
+    snappy
+    ${LUA_LIBRARIES}
+    thrift
+    crypt)

-# The order matters here: prefer the "static" DPDK directories to any dynamic paths from pkg-config. Some files are only
-# available dynamically, though.
-target_include_directories(scylla PUBLIC
-        .
-        ${SEASTAR_DPDK_INCLUDE_DIRS}
-        ${SEASTAR_INCLUDE_DIRS}
-        ${Boost_INCLUDE_DIRS}
-        xxhash
-        libdeflate
-        build/${BUILD_TYPE}/gen)
+target_link_libraries(scylla PRIVATE
+    -Wl,--build-id=sha1 # Force SHA1 build-id generation
+    # TODO: Use lld linker if it's available, otherwise gold, else bfd
+    -fuse-ld=lld)
+# TODO: patch dynamic linker to match configure.py behavior
+
+target_compile_options(scylla PRIVATE
+    -std=gnu++20
+    -fcoroutines # TODO: Clang does not have this flag, adjust to both variants
+    ${target_arch_flag})
+# Hacks needed to expose internal APIs for xxhash dependencies
+target_compile_definitions(scylla PRIVATE XXH_PRIVATE_API HAVE_LZ4_COMPRESS_DEFAULT)
+
+target_include_directories(scylla PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    libdeflate
+    abseil
+    "${scylla_gen_build_dir}")
+
+###
+### Create crc_combine_table helper executable.
+### Use it to generate crc_combine_table.cc to be used in scylla at build time.
+###
+add_executable(crc_combine_table utils/gz/gen_crc_combine_table.cc)
+target_link_libraries(crc_combine_table PRIVATE seastar)
+target_include_directories(crc_combine_table PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+target_compile_options(crc_combine_table PRIVATE
+    -std=gnu++20
+    -fcoroutines
+    ${target_arch_flag})
+add_dependencies(scylla crc_combine_table)
+
+# Generate an additional source file at build time that is needed for Scylla compilation
+add_custom_command(OUTPUT "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc"
+    COMMAND $<TARGET_FILE:crc_combine_table> > "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc"
+    DEPENDS crc_combine_table)
+target_sources(scylla PRIVATE "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc")
+
+###
+### Generate version file and supply appropriate compile definitions for release.cc
+###
+execute_process(COMMAND ${CMAKE_SOURCE_DIR}/SCYLLA-VERSION-GEN RESULT_VARIABLE scylla_version_gen_res)
+if(scylla_version_gen_res)
+    message(SEND_ERROR "Version file generation failed. Return code: ${scylla_version_gen_res}")
+endif()
+
+file(READ build/SCYLLA-VERSION-FILE scylla_version)
+string(STRIP "${scylla_version}" scylla_version)
+
+file(READ build/SCYLLA-RELEASE-FILE scylla_release)
+string(STRIP "${scylla_release}" scylla_release)
+
+get_property(release_cdefs SOURCE "${CMAKE_SOURCE_DIR}/release.cc" PROPERTY COMPILE_DEFINITIONS)
+list(APPEND release_cdefs "SCYLLA_VERSION=\"${scylla_version}\"" "SCYLLA_RELEASE=\"${scylla_release}\"")
+set_source_files_properties("${CMAKE_SOURCE_DIR}/release.cc" PROPERTIES COMPILE_DEFINITIONS "${release_cdefs}")
+
+###
+### Custom command for building libdeflate. Link the library to scylla.
+###
+set(libdeflate_lib "${scylla_build_dir}/libdeflate/libdeflate.a")
+add_custom_command(OUTPUT "${libdeflate_lib}"
+    COMMAND make -C libdeflate
+        BUILD_DIR=../build/${BUILD_TYPE}/libdeflate/
+        CC=${CMAKE_C_COMPILER}
+        "CFLAGS=${target_arch_flag}"
+        ../build/${BUILD_TYPE}/libdeflate//libdeflate.a) # The two forward slashes are important (presumably to match BUILD_DIR's trailing '/' -- confirm against libdeflate's Makefile)
+# Hack to force generating custom command to produce libdeflate.a
+add_custom_target(libdeflate DEPENDS "${libdeflate_lib}")
+target_link_libraries(scylla PRIVATE "${libdeflate_lib}")
+
+# TODO: create cmake/ directory and move utilities (generate functions etc) there
+# TODO: Build tests if BUILD_TESTING=on (using CTest module)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,11 +1,13 @@
-# Asking questions or requesting help
+# Contributing
+
+## Asking questions or requesting help

 Use the [ScyllaDB user mailing list](https://groups.google.com/forum/#!forum/scylladb-users) or the [Slack workspace](http://slack.scylladb.com) for general questions and help.

-# Reporting an issue
+## Reporting an issue

 Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to report issues.  Fill in as much information as you can in the issue template, especially for performance problems.

-# Contributing Code to Scylla
+## Contributing Code to Scylla

 To contribute code to Scylla, you need to sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
--- a/114
+++ b/114
@@ -1,114 +0,0 @@
-M: Maintainer with commit access
-R: Reviewer with subsystem expertise
-F: Filename, directory, or pattern for the subsystem
-
---
-
-AUTH
-R: Calle Wilund <calle@scylladb.com>
-R: Vlad Zolotarov <vladz@scylladb.com>
-R: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
-F: auth/*
-
-CACHE
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Piotr Jastrzebski <piotr@scylladb.com>
-F: row_cache*
-F: *mutation*
-F: tests/mvcc*
-
-COMMITLOG / BATCHLOGa
-R: Calle Wilund <calle@scylladb.com>
-F: db/commitlog/*
-F: db/batch*
-
-COORDINATOR
-R: Gleb Natapov <gleb@scylladb.com>
-F: service/storage_proxy*
-
-COMPACTION
-R: Raphael S. Carvalho <raphaelsc@scylladb.com>
-R: Glauber Costa <glauber@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: sstables/compaction*
-
-CQL TRANSPORT LAYER
-M: Pekka Enberg <penberg@scylladb.com>
-F: transport/*
-
-CQL QUERY LANGUAGE
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Pekka Enberg <penberg@scylladb.com>
-F: cql3/*
-
-COUNTERS
-F: counters*
-F: tests/counter_test*
-
-GOSSIP
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-F: gms/*
-
-DOCKER
-M: Pekka Enberg <penberg@scylladb.com>
-F: dist/docker/*
-
-LSA
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-F: utils/logalloc*
-
-MATERIALIZED VIEWS
-M: Pekka Enberg <penberg@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-F: db/view/*
-F: cql3/statements/*view*
-
-PACKAGING
-R: Takuya ASADA <syuu@scylladb.com>
-F: dist/*
-
-REPAIR
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: repair/*
-
-SCHEMA MANAGEMENT
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Pekka Enberg <penberg@scylladb.com>
-F: db/schema_tables*
-F: db/legacy_schema_migrator*
-F: service/migration*
-F: schema*
-
-SECONDARY INDEXES
-M: Pekka Enberg <penberg@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-R: Pekka Enberg <penberg@scylladb.com>
-F: db/index/*
-F: cql3/statements/*index*
-
-SSTABLES
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Raphael S. Carvalho <raphaelsc@scylladb.com>
-R: Glauber Costa <glauber@scylladb.com>
-R: Nadav Har'El <nyh@scylladb.com>
-F: sstables/*
-
-STREAMING
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-R: Asias He <asias@scylladb.com>
-F: streaming/*
-F: service/storage_service.*
-
-ALTERNATOR
-M: Nadav Har'El <nyh@scylladb.com>
-F: alternator/*
-F: alternator-test/*
-
-THE REST
-M: Avi Kivity <avi@scylladb.com>
-M: Tomasz Grabiec <tgrabiec@scylladb.com>
-M: Nadav Har'El <nyh@scylladb.com>
-F: *
--- a/README.md
+++ b/README.md
@@ -1,43 +1,66 @@
 # Scylla

-## Quick-start
+[![Slack](https://img.shields.io/badge/slack-scylla-brightgreen.svg?logo=slack)](http://slack.scylladb.com)
+[![Twitter](https://img.shields.io/twitter/follow/ScyllaDB.svg?style=social&label=Follow)](https://twitter.com/intent/follow?screen_name=ScyllaDB)
+
+## What is Scylla?
+
+Scylla is the real-time big data database that is API-compatible with Apache Cassandra and Amazon DynamoDB.
+Scylla embraces a shared-nothing approach that increases throughput and storage capacity to realize order-of-magnitude performance improvements and reduce hardware costs.
+
+For more information, please see the [ScyllaDB web site].
+
+[ScyllaDB web site]: https://www.scylladb.com
+
+## Build Prerequisites

 Scylla is fairly fussy about its build environment, requiring very recent
 versions of the C++20 compiler and of many libraries to build. The document
 [HACKING.md](HACKING.md) includes detailed information on building and
 developing Scylla, but to get Scylla building quickly on (almost) any build
-machine, Scylla offers offers a [frozen toolchain](tools/toolchain/README.md),
+machine, Scylla offers a [frozen toolchain](tools/toolchain/README.md).
 This is a pre-configured Docker image which includes recent versions of all
 the required compilers, libraries and build tools. Using the frozen toolchain
 allows you to avoid changing anything in your build machine to meet Scylla's
 requirements - you just need to meet the frozen toolchain's prerequisites
 (mostly, Docker or Podman being available).

-Building and running Scylla with the frozen toolchain is as easy as:
+## Building Scylla
+
+Building Scylla with the frozen toolchain `dbuild` is as easy as:

 ```bash
-$ ./tools/toolchain/dbuild ./configure.py
-$ ./tools/toolchain/dbuild ninja build/release/scylla
-$ ./tools/toolchain/dbuild ./build/release/scylla --developer-mode 1
+$ git submodule update --init --force --recursive
+$ ./tools/toolchain/dbuild ./configure.py
+$ ./tools/toolchain/dbuild ninja build/release/scylla
 ```

+For further information, please see:
+
+* [Developer documentation] for more information on building Scylla.
+* [Build documentation] on how to build Scylla binaries, tests, and packages.
+* [Docker image build documentation] for information on how to build Docker images.
+
+[developer documentation]: HACKING.md
+[build documentation]: docs/building.md
+[docker image build documentation]: dist/docker/redhat/README.md
+
 ## Running Scylla

-* Run Scylla
-```
-./build/release/scylla
+To start Scylla server, run:

+```bash
+$ ./tools/toolchain/dbuild ./build/release/scylla --workdir tmp --smp 1 --developer-mode 1
 ```

-* run Scylla with one CPU and ./tmp as work directory
+This will start a Scylla node with one CPU core allocated to it and data files stored in the `tmp` directory.
+The `--developer-mode` option is needed to disable the various checks Scylla performs at startup to ensure the machine is configured for maximum performance (not relevant on development workstations).
+Please note that you need to run Scylla with `dbuild` if you built it with the frozen toolchain.

-```
-./build/release/scylla --workdir tmp --smp 1
-```
+For more run options, run:

-* For more run options:
-```
-./build/release/scylla --help
+```bash
+$ ./tools/toolchain/dbuild ./build/release/scylla --help
 ```

 ## Testing
@@ -46,19 +69,16 @@ See [test.py manual](docs/testing.md).

 ## Scylla APIs and compatibility
 By default, Scylla is compatible with Apache Cassandra and its APIs - CQL and
-Thrift. There is also experimental support for the API of Amazon DynamoDB,
-but being experimental it needs to be explicitly enabled to be used. For more
-information on how to enable the experimental DynamoDB compatibility in Scylla,
-and the current limitations of this feature, see
+Thrift. There is also support for the API of Amazon DynamoDB™,
+which needs to be enabled and configured in order to be used. For more
+information on how to enable the DynamoDB™ API in Scylla,
+and the current compatibility of this feature as well as Scylla-specific extensions, see
 [Alternator](docs/alternator/alternator.md) and
 [Getting started with Alternator](docs/alternator/getting-started.md).

 ## Documentation

-Documentation can be found in [./docs](./docs) and on the
-[wiki](https://github.com/scylladb/scylla/wiki). There is currently no clear
-definition of what goes where, so when looking for something be sure to check
-both.
+Documentation can be found [here](https://scylla.docs.scylladb.com).
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

@@ -69,27 +89,22 @@ The courses are free, self-paced and include hands-on examples. They cover a var
 administration, architecture, basic NoSQL concepts, using drivers for application development, Scylla setup, failover, compactions, 
 multi-datacenters and how Scylla integrates with third-party applications.

-## Building a CentOS-based Docker image
-
-Build a Docker image with:
-
-```
-cd dist/docker/redhat
-docker build -t <image-name> .
-```
-
-This build is based on executables downloaded from downloads.scylladb.com,
-**not** on the executables built in this source directory. See further
-instructions in dist/docker/redhat/README.md to build a docker image from
-your own executables.
-
-Run the image with:
-
-```
-docker run -p $(hostname -i):9042:9042 -i -t <image name>
-```
-
 ## Contributing to Scylla

-[Hacking howto](HACKING.md)
-[Guidelines for contributing](CONTRIBUTING.md)
+If you want to report a bug or submit a pull request or a patch, please read the [contribution guidelines].
+
+If you are a developer working on Scylla, please read the [developer guidelines].
+
+[contribution guidelines]: CONTRIBUTING.md
+[developer guidelines]: HACKING.md
+
+## Contact
+
+* The [users mailing list] and [Slack channel] are for users to discuss configuration, management, and operations of the ScyllaDB open source.
+* The [developers mailing list] is for developers and people interested in following the development of ScyllaDB to discuss technical topics.
+
+[Users mailing list]: https://groups.google.com/forum/#!forum/scylladb-users
+
+[Slack channel]: http://slack.scylladb.com/
+
+[Developers mailing list]: https://groups.google.com/forum/#!forum/scylladb-dev
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.4.9

 if test -f version
 then
--- a/2
+++ b/2
--- a/alternator/auth.cc
+++ b/alternator/auth.cc
@@ -78,12 +78,12 @@ void check_expiry(std::string_view signature_date) {
    std::string expiration_str = format_time_point(db_clock::now() - 15min);
    std::string validity_str = format_time_point(db_clock::now() + 15min);
    if (signature_date < expiration_str) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                fmt::format("Signature expired: {} is now earlier than {} (current time - 15 min.)",
                signature_date, expiration_str));
    }
    if (signature_date > validity_str) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                fmt::format("Signature not yet current: {} is still later than {} (current time + 15 min.)",
                signature_date, validity_str));
    }
@@ -94,13 +94,13 @@ std::string get_signature(std::string_view access_key_id, std::string_view secre
        std::string_view body_content, std::string_view region, std::string_view service, std::string_view query_string) {
    auto amz_date_it = signed_headers_map.find("x-amz-date");
    if (amz_date_it == signed_headers_map.end()) {
-        throw api_error("InvalidSignatureException", "X-Amz-Date header is mandatory for signature verification");
+        throw api_error::invalid_signature("X-Amz-Date header is mandatory for signature verification");
    }
    std::string_view amz_date = amz_date_it->second;
    check_expiry(amz_date);
    std::string_view datestamp = amz_date.substr(0, 8);
    if (datestamp != orig_datestamp) {
-        throw api_error("InvalidSignatureException",
+        throw api_error::invalid_signature(
                format("X-Amz-Date date does not match the provided datestamp. Expected {}, got {}",
                        orig_datestamp, datestamp));
    }
@@ -126,19 +126,19 @@ std::string get_signature(std::string_view access_key_id, std::string_view secre

 future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string username) {
    static const sstring query = format("SELECT salted_hash FROM {} WHERE {} = ?",
-            auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
+            auth::meta::roles_table::qualified_name, auth::meta::roles_table::role_col_name);

    auto cl = auth::password_authenticator::consistency_for_user(username);
-    auto timeout = auth::internal_distributed_timeout_config();
+    auto& timeout = auth::internal_distributed_timeout_config();
    return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
        auto res = f.get0();
        auto salted_hash = std::optional<sstring>();
        if (res->empty()) {
-            throw api_error("UnrecognizedClientException", fmt::format("User not found: {}", username));
+            throw api_error::unrecognized_client(fmt::format("User not found: {}", username));
        }
        salted_hash = res->one().get_opt<sstring>("salted_hash");
        if (!salted_hash) {
-            throw api_error("UnrecognizedClientException", fmt::format("No password found for user: {}", username));
+            throw api_error::unrecognized_client(fmt::format("No password found for user: {}", username));
        }
        return make_ready_future<std::string>(*salted_hash);
    });
--- a/alternator/base64.cc
+++ b/alternator/base64.cc
@@ -32,13 +32,13 @@
 // and the character used in base64 encoding to represent it.
 static class base64_chars {
 public:
-    static constexpr const char* to =
+    static constexpr const char to[] =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    int8_t from[255];
    base64_chars() {
-        static_assert(strlen(to) == 64);
+        static_assert(sizeof(to) == 64 + 1);
        for (int i = 0; i < 255; i++) {
-            from[i] = 255; // signal invalid character
+            from[i] = -1; // signal invalid character
        }
        for (int i = 0; i < 64; i++) {
            from[(unsigned) to[i]] = i;
--- a/alternator/base64.hh
+++ b/alternator/base64.hh
@@ -23,7 +23,7 @@

 #include <string_view>
 #include "bytes.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"

 std::string base64_encode(bytes_view);

--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -26,7 +26,7 @@
 #include "alternator/error.hh"
 #include "cql3/constants.hh"
 #include <unordered_map>
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "serialization.hh"
 #include "base64.hh"
 #include <stdexcept>
@@ -57,12 +57,12 @@ comparison_operator_type get_comparison_operator(const rjson::value& comparison_
            {"NOT_CONTAINS", comparison_operator_type::NOT_CONTAINS},
    };
    if (!comparison_operator.IsString()) {
-        throw api_error("ValidationException", format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
+        throw api_error::validation(format("Invalid comparison operator definition {}", rjson::print(comparison_operator)));
    }
    std::string op = comparison_operator.GetString();
    auto it = ops.find(op);
    if (it == ops.end()) {
-        throw api_error("ValidationException", format("Unsupported comparison operator {}", op));
+        throw api_error::validation(format("Unsupported comparison operator {}", op));
    }
    return it->second;
 }
@@ -98,11 +98,16 @@ struct nonempty : public size_check {

 // Check that array has the expected number of elements
 static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
+    if (!array && expected(0)) {
+        // If expected() allows an empty AttributeValueList, it is also fine
+        // that it is missing.
+        return;
+    }
    if (!array || !array->IsArray()) {
-        throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
+        throw api_error::validation("With ComparisonOperator, AttributeValueList must be given and an array");
    }
    if (!expected(array->Size())) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        format("{} operator requires AttributeValueList {}, instead found list size {}",
                               op, expected.what(), array->Size()));
    }
@@ -118,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -126,7 +131,40 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
        set1_raw.insert(&*it);
    }
    for (const auto& a : set2.GetArray()) {
-        if (set1_raw.count(&a) == 0) {
+        if (!set1_raw.contains(&a)) {
+            return false;
+        }
+    }
+    return true;
+}
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    // Lists are equal only if both are arrays of the same length whose
+    // elements are pairwise equal, in order.
+    const bool same_shape = list1.IsArray() && list2.IsArray() && list1.Size() == list2.Size();
+    if (!same_shape) {
+        return false;
+    }
+    // Note: Alternator limits an item's depth (rjson::parse() limits
+    // it to around 37 levels), so the recursion into check_EQ() is safe.
+    auto rhs = list2.Begin();
+    for (auto lhs = list1.Begin(); lhs != list1.End(); ++lhs, ++rhs) {
+        if (!check_EQ(&*lhs, *rhs)) {
+            return false;
+        }
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
            return false;
        }
    }
@@ -135,42 +173,71 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error("ValidationException", format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -228,12 +295,12 @@ static bool check_NOT_CONTAINS(const rjson::value* v1, const rjson::value& v2) {
 // Check if a JSON-encoded value equals any element of an array, which must have at least one element.
 static bool check_IN(const rjson::value* val, const rjson::value& array) {
    if (!array[0].IsObject() || array[0].MemberCount() != 1) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        format("IN operator encountered malformed AttributeValue: {}", array[0]));
    }
    const auto& type = array[0].MemberBegin()->name;
    if (type != "S" && type != "N" && type != "B") {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                        "IN operator requires AttributeValueList elements to be of type String, Number, or Binary ");
    }
    if (!val) {
@@ -242,7 +309,7 @@ static bool check_IN(const rjson::value* val, const rjson::value& array) {
    bool have_match = false;
    for (const auto& elem : array.GetArray()) {
        if (!elem.IsObject() || elem.MemberCount() != 1 || elem.MemberBegin()->name != type) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                            "IN operator requires all AttributeValueList elements to have the same type ");
        }
        if (!have_match && *val == elem) {
@@ -274,24 +341,40 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v.IsNull()), this
+// check returns false.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error("ValidationException",
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -305,7 +388,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -336,57 +420,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error("ValidationException",
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error("ValidationException", format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error(
-                "ValidationException",
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error("ValidationException",
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -404,24 +502,24 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
    // and requires a different combinations of parameters in the request
    if (value) {
        if (exists && (!exists->IsBool() || exists->GetBool() != true)) {
-            throw api_error("ValidationException", "Cannot combine Value with Exists!=true");
+            throw api_error::validation("Cannot combine Value with Exists!=true");
        }
        if (comparison_operator) {
-            throw api_error("ValidationException", "Cannot combine Value with ComparisonOperator");
+            throw api_error::validation("Cannot combine Value with ComparisonOperator");
        }
        return check_EQ(got, *value);
    } else if (exists) {
        if (comparison_operator) {
-            throw api_error("ValidationException", "Cannot combine Exists with ComparisonOperator");
+            throw api_error::validation("Cannot combine Exists with ComparisonOperator");
        }
        if (!exists->IsBool() || exists->GetBool() != false) {
-            throw api_error("ValidationException", "Exists!=false requires Value");
+            throw api_error::validation("Exists!=false requires Value");
        }
        // Remember Exists=false, so we're checking that the attribute does *not* exist:
        return !got;
    } else {
        if (!comparison_operator) {
-            throw api_error("ValidationException", "Missing ComparisonOperator, Value or Exists");
+            throw api_error::validation("Missing ComparisonOperator, Value or Exists");
        }
        comparison_operator_type op = get_comparison_operator(*comparison_operator);
        switch (op) {
@@ -433,19 +531,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -457,7 +555,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -466,7 +565,7 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
                const rjson::value& arg = (*attribute_value_list)[0];
                const auto& argtype = (*arg.MemberBegin()).name;
                if (argtype != "S" && argtype != "N" && argtype != "B") {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
                                    "got {} instead", argtype));
                }
@@ -480,7 +579,7 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
                const rjson::value& arg = (*attribute_value_list)[0];
                const auto& argtype = (*arg.MemberBegin()).name;
                if (argtype != "S" && argtype != "N" && argtype != "B") {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("CONTAINS operator requires a single AttributeValue of type String, Number, or Binary, "
                                    "got {} instead", argtype));
                }
@@ -497,7 +596,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
        return conditional_operator_type::MISSING;
    }
    if (!conditional_operator->IsString()) {
-        throw api_error("ValidationException", "'ConditionalOperator' parameter, if given, must be a string");
+        throw api_error::validation("'ConditionalOperator' parameter, if given, must be a string");
    }
    auto s = rjson::to_string_view(*conditional_operator);
    if (s == "AND") {
@@ -505,7 +604,7 @@ conditional_operator_type get_conditional_operator(const rjson::value& req) {
    } else if (s == "OR") {
        return conditional_operator_type::OR;
    } else {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("'ConditionalOperator' parameter must be AND, OR or missing. Found {}.", s));
    }
 }
@@ -520,13 +619,13 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
    auto conditional_operator = get_conditional_operator(req);
    if (conditional_operator != conditional_operator_type::MISSING &&
        (!expected || (expected->IsObject() && expected->GetObject().ObjectEmpty()))) {
-            throw api_error("ValidationException", "'ConditionalOperator' parameter cannot be specified for missing or empty Expression");
+            throw api_error::validation("'ConditionalOperator' parameter cannot be specified for missing or empty Expression");
    }
    if (!expected) {
        return true;
    }
    if (!expected->IsObject()) {
-        throw api_error("ValidationException", "'Expected' parameter, if given, must be an object");
+        throw api_error::validation("'Expected' parameter, if given, must be an object");
    }
    bool require_all = conditional_operator != conditional_operator_type::OR;
    return verify_condition(*expected, require_all, previous_item);
@@ -569,7 +668,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -584,7 +684,7 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
                return it->value.GetBool();
            }
        }
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("ConditionExpression: condition results in a non-boolean value: {}",
                        calculated_values[0]));
    default:
@@ -600,13 +700,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -26,12 +26,15 @@

 namespace alternator {

-// DynamoDB's error messages are described in detail in
+// api_error contains a DynamoDB error message to be returned to the user.
+// It can be returned by value (see executor::request_return_type) or thrown.
+// DynamoDB's error messages are described in detail in
 // https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html
-// Ah An error message has a "type", e.g., "ResourceNotFoundException", a coarser
-// HTTP code (almost always, 400), and a human readable message. Eventually these
-// will be wrapped into a JSON object returned to the client.
-class api_error : public std::exception {
+// An error message has an HTTP code (almost always 400), a type, e.g.,
+// "ResourceNotFoundException", and a human readable message.
+// Eventually alternator::api_handler will convert a returned or thrown
+// api_error into a JSON object, and that is returned to the user.
+class api_error final {
 public:
    using status_type = httpd::reply::status_type;
    status_type _http_code;
@@ -42,8 +45,44 @@ public:
        , _type(std::move(type))
        , _msg(std::move(msg))
    { }
-    api_error() = default;
-    virtual const char* what() const noexcept override { return _msg.c_str(); }
+
+    // Factory functions for some common types of DynamoDB API errors
+    static api_error validation(std::string msg) {
+        return api_error("ValidationException", std::move(msg));
+    }
+    static api_error resource_not_found(std::string msg) {
+        return api_error("ResourceNotFoundException", std::move(msg));
+    }
+    static api_error resource_in_use(std::string msg) {
+        return api_error("ResourceInUseException", std::move(msg));
+    }
+    static api_error invalid_signature(std::string msg) {
+        return api_error("InvalidSignatureException", std::move(msg));
+    }
+    static api_error missing_authentication_token(std::string msg) {
+        return api_error("MissingAuthenticationTokenException", std::move(msg));
+    }
+    static api_error unrecognized_client(std::string msg) {
+        return api_error("UnrecognizedClientException", std::move(msg));
+    }
+    static api_error unknown_operation(std::string msg) {
+        return api_error("UnknownOperationException", std::move(msg));
+    }
+    static api_error access_denied(std::string msg) {
+        return api_error("AccessDeniedException", std::move(msg));
+    }
+    static api_error conditional_check_failed(std::string msg) {
+        return api_error("ConditionalCheckFailedException", std::move(msg));
+    }
+    static api_error expired_iterator(std::string msg) {
+        return api_error("ExpiredIteratorException", std::move(msg));
+    }
+    static api_error trimmed_data_access_exception(std::string msg) {
+        return api_error("TrimmedDataAccessException", std::move(msg));
+    }
+    static api_error internal(std::string msg) {
+        return api_error("InternalServerError", std::move(msg), reply::status_type::internal_server_error);
+    }
 };

 }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -30,16 +30,121 @@
 #include "service/storage_proxy.hh"
 #include "service/migration_manager.hh"
 #include "service/client_state.hh"
+#include "db/timeout_clock.hh"

 #include "alternator/error.hh"
 #include "stats.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
+
+namespace db {
+    class system_distributed_keyspace;
+}
+
+namespace query {
+class partition_slice;
+class result;
+}
+
+namespace cql3::selection {
+    class selection;
+}
+
+namespace service {
+    class storage_service;
+}

 namespace alternator {

+class rmw_operation;
+
+struct make_jsonable : public json::jsonable {
+    rjson::value _value;
+public:
+    explicit make_jsonable(rjson::value&& value);
+    std::string to_json() const override;
+};
+struct json_string : public json::jsonable {
+    std::string _value;
+public:
+    explicit json_string(std::string&& value);
+    std::string to_json() const override;
+};
+
+namespace parsed {
+class path;
+};
+
+// An attribute_path_map object is used to hold data for various attribute
+// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path
+// has a root attribute, which is then modified by member and index operators -
+// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
+// "[2]" index, and finally ".c" member.
+// Data can be added to an attribute_path_map using the add() function, but
+// requires that attributes with data not be *overlapping* or *conflicting*:
+//
+// 1. Two attribute paths which are identical or an ancestor of one another
+//    are considered *overlapping* and not allowed. If a.b.c has data,
+//    we can't add more data in a.b.c or any of its descendants like a.b.c.d.
+//
+// 2. Two attribute paths which need the same parent to have both a member and
+//    an index are considered *conflicting* and not allowed. E.g., if a.b has
+//    data, you can't add a[1]. The meaning of adding both would be that the
+//    attribute a is both a map and an array, which isn't sensible.
+//
+// These two requirements are common to the two places where Alternator uses
+// this abstraction to describe how a hierarchical item is to be transformed:
+//
+// 1. In ProjectionExpression: for filtering from a full top-level attribute
+//    only the parts the user asked for in ProjectionExpression.
+//
+// 2. In UpdateExpression: for taking the previous value of a top-level
+//    attribute, and modifying it based on the instructions the user
+//    wrote in UpdateExpression.
+
+template<typename T>
+class attribute_path_map_node {
+public:
+    using data_t = T;
+    // We need the extra shared_ptr<> here because libstdc++ unordered_map
+    // doesn't work with incomplete types :-( We couldn't use lw_shared_ptr<>
+    // because it doesn't work for incomplete types either. We couldn't use
+    // std::unique_ptr<> because it makes the entire object uncopyable. We
+    // don't often need to copy such a map, but we do have some code that
+    // copies an attrs_to_get object, and is hard to find and remove.
+    // The shared_ptr should never be null.
+    using members_t =  std::unordered_map<std::string, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The indexes list is sorted because DynamoDB requires handling writes
+    // beyond the end of a list in index order.
+    using indexes_t = std::map<unsigned, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The prohibition on "overlap" and "conflict" explained above means
+    // that only one of data, members or indexes is non-empty.
+    std::optional<std::variant<data_t, members_t, indexes_t>> _content;
+
+    bool is_empty() const { return !_content; }
+    bool has_value() const { return _content && std::holds_alternative<data_t>(*_content); }
+    bool has_members() const { return _content && std::holds_alternative<members_t>(*_content); }
+    bool has_indexes() const { return _content && std::holds_alternative<indexes_t>(*_content); }
+    // get_members() assumes that has_members() is true
+    members_t& get_members() { return std::get<members_t>(*_content); }
+    const members_t& get_members() const { return std::get<members_t>(*_content); }
+    indexes_t& get_indexes() { return std::get<indexes_t>(*_content); }
+    const indexes_t& get_indexes() const { return std::get<indexes_t>(*_content); }
+    T& get_value() { return std::get<T>(*_content); }
+    const T& get_value() const { return std::get<T>(*_content); }
+};
+
+template<typename T>
+using attribute_path_map = std::unordered_map<std::string, attribute_path_map_node<T>>;
+
+using attrs_to_get_node = attribute_path_map_node<std::monostate>;
+using attrs_to_get = attribute_path_map<std::monostate>;
+
+
 class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
+    db::system_distributed_keyspace& _sdks;
+    service::storage_service& _ss;
    // An smp_service_group to be used for limiting the concurrency when
    // forwarding Alternator request between shards - if necessary for LWT.
    smp_service_group _ssg;
@@ -52,12 +157,13 @@ public:
    static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
    static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";

-    executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
-        : _proxy(proxy), _mm(mm), _ssg(ssg) {}
+    executor(service::storage_proxy& proxy, service::migration_manager& mm, db::system_distributed_keyspace& sdks, service::storage_service& ss, smp_service_group ssg)
+        : _proxy(proxy), _mm(mm), _sdks(sdks), _ss(ss), _ssg(ssg) {}

    future<request_return_type> create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
+    future<request_return_type> update_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
    future<request_return_type> delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request);
@@ -71,6 +177,10 @@ public:
    future<request_return_type> tag_resource(client_state& client_state, service_permit permit, rjson::value request);
    future<request_return_type> untag_resource(client_state& client_state, service_permit permit, rjson::value request);
    future<request_return_type> list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> list_streams(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> describe_stream(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_shard_iterator(client_state& client_state, service_permit permit, rjson::value request);
+    future<request_return_type> get_records(client_state& client_state, tracing::trace_state_ptr, service_permit permit, rjson::value request);

    future<> start();
    future<> stop() { return make_ready_future<>(); }
@@ -78,6 +188,39 @@ public:
    future<> create_keyspace(std::string_view keyspace_name);

    static tracing::trace_state_ptr maybe_trace_query(client_state& client_state, sstring_view op, sstring_view query);
+
+    static sstring table_name(const schema&);
+    static db::timeout_clock::time_point default_timeout();
+    static void set_default_timeout(db::timeout_clock::duration timeout);
+private:
+    static db::timeout_clock::duration s_default_timeout;
+public:
+    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);
+
+private:
+    friend class rmw_operation;
+
+    static bool is_alternator_keyspace(const sstring& ks_name);
+    static sstring make_keyspace_name(const sstring& table_name);
+    static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr);
+    static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>&);
+    
+public:    
+    static std::optional<rjson::value> describe_single_item(schema_ptr,
+        const query::partition_slice&,
+        const cql3::selection::selection&,
+        const query::result&,
+        const attrs_to_get&);
+
+    static void describe_single_item(const cql3::selection::selection&,
+        const std::vector<bytes_opt>&,
+        const attrs_to_get&,
+        rjson::value&,
+        bool = false);
+
+    void add_stream_options(const rjson::value& stream_spec, schema_builder&) const;
+    void supplement_table_info(rjson::value& descr, const schema& schema) const;
+    void supplement_table_stream_info(rjson::value& descr, const schema& schema) const;
 };

 }
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -130,6 +130,27 @@ void condition_expression::append(condition_expression&& a, char op) {
    }, _expression);
 }

+void path::check_depth_limit() {
+    if (1 + _operators.size() > depth_limit) {
+        throw expressions_syntax_error(format("Document path exceeded {} nesting levels", depth_limit));
+    }
+}
+
+std::ostream& operator<<(std::ostream& os, const path& p) {
+    os << p.root();
+    for (const auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                os << '.' << member;
+            },
+            [&] (unsigned index) {
+                os << '[' << index << ']';
+            }
+        }, op);
+    }
+    return os;
+}
+
 } // namespace parsed

 // The following resolve_*() functions resolve references in parsed
@@ -151,22 +172,44 @@ void condition_expression::append(condition_expression&& a, char op) {
 // we need to resolve the expression just once but then use it many times
 // (once for each item to be filtered).

-static void resolve_path(parsed::path& p,
+static std::optional<std::string> resolve_path_component(const std::string& column_name,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names) {
-    const std::string& column_name = p.root();
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                    format("ExpressionAttributeNames missing, entry '{}' required by expression", column_name));
        }
        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
        if (!value || !value->IsString()) {
-            throw api_error("ValidationException",
+            throw api_error::validation(
                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
-        p.set_root(std::string(rjson::to_string_view(*value)));
+        return std::string(rjson::to_string_view(*value));
+    }
+    return std::nullopt;
+}
+
+static void resolve_path(parsed::path& p,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) {
+    std::optional<std::string> r = resolve_path_component(p.root(), expression_attribute_names, used_attribute_names);
+    if (r) {
+        p.set_root(std::move(*r));
+    }
+    for (auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (std::string& s) {
+                r = resolve_path_component(s, expression_attribute_names, used_attribute_names);
+                if (r) {
+                    s = std::move(*r);
+                }
+            },
+            [&] (unsigned index) {
+                // nothing to resolve
+            }
+        }, op);
    }
 }

@@ -176,16 +219,16 @@ static void resolve_constant(parsed::constant& c,
    std::visit(overloaded_functor {
        [&] (const std::string& valref) {
            if (!expression_attribute_values) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues missing, entry '{}' required by expression", valref));
            }
            const rjson::value* value = rjson::find(*expression_attribute_values, valref);
            if (!value) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues missing entry '{}' required by expression", valref));
            }
            if (value->IsNull()) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("ExpressionAttributeValues null value for entry '{}' required by expression", valref));
            }
            validate_value(*value, "ExpressionAttributeValues");
@@ -348,6 +391,39 @@ bool condition_expression_on(const parsed::condition_expression& ce, std::string
    }, ce._expression);
 }

+// for_condition_expression_on() runs a given function over all the attributes
+// mentioned in the expression. If the same attribute is mentioned more than
+// once, the function will be called more than once for the same attribute.
+
+static void for_value_on(const parsed::value& v, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) { },
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::path& p) {
+            func(p.root());
+        }
+    }, v._value);
+}
+
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            for (const parsed::value& value : cond._values) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) {
+                for_condition_expression_on(cond, func);
+            }
+        }
+    }, ce._expression);
+}
+
 // The following calculate_value() functions calculate, or evaluate, a parsed
 // expression. The parsed expression is assumed to have been "resolved", with
 // the matching resolve_* function.
@@ -359,7 +435,7 @@ static rjson::value list_concatenate(const rjson::value& v1, const rjson::value&
    const rjson::value* list1 = unwrap_list(v1);
    const rjson::value* list2 = unwrap_list(v2);
    if (!list1 || !list2) {
-        throw api_error("ValidationException", "UpdateExpression: list_append() given a non-list");
+        throw api_error::validation("UpdateExpression: list_append() given a non-list");
    }
    rjson::value cat = rjson::copy(*list1);
    for (const auto& a : list2->GetArray()) {
@@ -380,28 +456,28 @@ static rjson::value calculate_size(const rjson::value& v) {
    // must come from the request itself, not from the database, so it makes
    // sense to throw a ValidationException if we see such a problem.
    if (!v.IsObject() || v.MemberCount() != 1) {
-        throw api_error("ValidationException", format("invalid object: {}", v));
+        throw api_error::validation(format("invalid object: {}", v));
    }
    auto it = v.MemberBegin();
    int ret;
    if (it->name == "S") {
        if (!it->value.IsString()) {
-            throw api_error("ValidationException", format("invalid string: {}", v));
+            throw api_error::validation(format("invalid string: {}", v));
        }
        ret = it->value.GetStringLength();
    } else if (it->name == "NS" || it->name == "SS" || it->name == "BS" || it->name == "L") {
        if (!it->value.IsArray()) {
-            throw api_error("ValidationException", format("invalid set: {}", v));
+            throw api_error::validation(format("invalid set: {}", v));
        }
        ret = it->value.Size();
    } else if (it->name == "M") {
        if (!it->value.IsObject()) {
-            throw api_error("ValidationException", format("invalid map: {}", v));
+            throw api_error::validation(format("invalid map: {}", v));
        }
        ret = it->value.MemberCount();
    } else if (it->name == "B") {
        if (!it->value.IsString()) {
-            throw api_error("ValidationException", format("invalid byte string: {}", v));
+            throw api_error::validation(format("invalid byte string: {}", v));
        }
        ret = base64_decoded_len(rjson::to_string_view(it->value));
    } else {
@@ -445,11 +521,11 @@ static const
 std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    {"list_append", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::UpdateExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: list_append() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: list_append() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -459,15 +535,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"if_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::UpdateExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: if_not_exists() must include path as its first argument", caller));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -477,11 +553,11 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"size", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpression) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: size() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: size() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -490,15 +566,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_exists()'s parameter must be a path", caller));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -507,15 +583,15 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_not_exists", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists() not allowed here", caller));
            }
            if (f._parameters.size() != 1) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists() accepts 1 parameter, got {}", caller, f._parameters.size()));
            }
            if (!std::holds_alternative<parsed::path>(f._parameters[0]._value)) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_not_exists()'s parameter must be a path", caller));
            }
            rjson::value v = calculate_value(f._parameters[0], caller, previous_item);
@@ -524,18 +600,18 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
    {"attribute_type", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            // There is no real reason for the following check (not
            // allowing the type to come from a document attribute), but
            // DynamoDB does this check, so we do too...
            if (!f._parameters[1].is_constant()) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_types()'s first parameter must be an expression attribute", caller));
            }
            rjson::value v0 = calculate_value(f._parameters[0], caller, previous_item);
@@ -544,7 +620,7 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                // If the type parameter is not one of the legal types
                // we should generate an error, not a failed condition:
                if (!known_type(rjson::to_string_view(v1.MemberBegin()->value))) {
-                    throw api_error("ValidationException",
+                    throw api_error::validation(
                            format("{}: attribute_types()'s second parameter, {}, is not a known type",
                                    caller, v1.MemberBegin()->value));
                }
@@ -554,77 +630,33 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
                    return to_bool_json(false);
                }
            } else {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: attribute_type() second parameter must refer to a string, got {}", caller, v1));
            }
        }
    },
    {"begins_with", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: begins_with() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: begins_with() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error("ValidationException", format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
            if (caller != calculate_value_caller::ConditionExpressionAlone) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: contains() not allowed here", caller));
            }
            if (f._parameters.size() != 2) {
-                throw api_error("ValidationException",
+                throw api_error::validation(
                        format("{}: contains() accepts 2 parameters, got {}", caller, f._parameters.size()));
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
@@ -634,6 +666,55 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
 };

+// Given a parsed::path and an item read from the table, extract the value
+// of a certain attribute path, such as "a" or "a.b.c[3]". Returns a null
+// value if the item or the requested attribute does not exist. "caller"
+// is only used to prefix the error thrown for a malformed item.
+// The item is assumed to be encoded in JSON using DynamoDB conventions -
+// each level of a nested document is a map with one key - a type (e.g.,
+// "M" for map) - whose value is the representation of that value.
+static rjson::value extract_path(const rjson::value* item,
+        const parsed::path& p, calculate_value_caller caller) {
+    if (!item) {
+        return rjson::null_value();
+    }
+    const rjson::value* v = rjson::find(*item, p.root());
+    if (!v) {
+        return rjson::null_value();
+    }
+    for (const auto& op : p.operators()) {
+        if (!v->IsObject() || v->MemberCount() != 1) {
+            // This shouldn't happen. We shouldn't have stored malformed
+            // objects. But today Alternator does not validate the structure
+            // of nested documents before storing them, so this can happen on
+            // read.
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+        }
+        const char* type = v->MemberBegin()->name.GetString();
+        v = &(v->MemberBegin()->value);
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (type[0] == 'M' && v->IsObject()) {
+                    v = rjson::find(*v, member);
+                } else {
+                    v = nullptr;
+                }
+            },
+            [&] (unsigned index) {
+                if (type[0] == 'L' && v->IsArray() && index < v->Size()) {
+                    v = &(v->GetArray()[index]);
+                } else {
+                    v = nullptr;
+                }
+            }
+        }, op);
+        if (!v) {
+            return rjson::null_value();
+        }
+    }
+    return rjson::copy(*v);
+}
+
 // Given a parsed::value, which can refer either to a constant value from
 // ExpressionAttributeValues, to the value of some attribute, or to a function
 // of other values, this function calculates the resulting value.
@@ -650,22 +731,13 @@ rjson::value calculate_value(const parsed::value& v,
        [&] (const parsed::value::function_call& f) -> rjson::value {
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
-                throw api_error("ValidationException",
-                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
+                throw api_error::validation(
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
            }
            return function_it->second(caller, previous_item, f);
        },
        [&] (const parsed::path& p) -> rjson::value {
-            if (!previous_item) {
-                return rjson::null_value();
-            }
-            std::string update_path = p.root();
-            if (p.has_operators()) {
-                // FIXME: support this
-                throw api_error("ValidationException", "Reading attribute paths not yet implemented");
-            }
-            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
-            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
+            return extract_path(previous_item, p, caller);
        }
    }, v._value);
 }
@@ -674,7 +746,7 @@ rjson::value calculate_value(const parsed::value& v,
 // either a single value, or v1+v2 or v1-v2.
 rjson::value calculate_value(const parsed::set_rhs& rhs,
        const rjson::value* previous_item) {
-    switch(rhs._op) {
+    switch (rhs._op) {
    case 'v':
        return calculate_value(rhs._v1, calculate_value_caller::UpdateExpression, previous_item);
    case '+': {
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -27,8 +27,10 @@
 #include <unordered_set>
 #include <string_view>

+#include <seastar/util/noncopyable_function.hh>
+
 #include "expressions_types.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"

 namespace alternator {

@@ -59,6 +61,11 @@ void validate_value(const rjson::value& v, const char* caller);

 bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute);

+// for_condition_expression_on() runs the given function on the attributes
+// that the expression uses. It may run for the same attribute more than once
+// if the same attribute is used more than once in the expression.
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func);
+
 // calculate_value() behaves slightly different (especially, different
 // functions supported) when used in different types of expressions, as
 // enumerated in this enum:
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -27,7 +27,7 @@

 #include <seastar/core/shared_ptr.hh>

-#include "rjson.hh"
+#include "utils/rjson.hh"

 /*
 * Parsed representation of expressions and their components.
@@ -49,15 +49,23 @@ class path {
    // dot (e.g., ".xyz").
    std::string _root;
    std::vector<std::variant<std::string, unsigned>> _operators;
+    // It is useful to limit the depth of a user-specified path, because it
+    // allows us to use recursive algorithms without worrying about recursion
+    // depth. DynamoDB officially limits the length of paths to 32 components
+    // (including the root) so let's use the same limit.
+    static constexpr unsigned depth_limit = 32;
+    void check_depth_limit();
 public:
    void set_root(std::string root) {
        _root = std::move(root);
    }
    void add_index(unsigned i) {
        _operators.emplace_back(i);
+        check_depth_limit();
    }
    void add_dot(std::string(name)) {
        _operators.emplace_back(std::move(name));
+        check_depth_limit();
    }
    const std::string& root() const {
        return _root;
@@ -65,6 +73,13 @@ public:
    bool has_operators() const {
        return !_operators.empty();
    }
+    const std::vector<std::variant<std::string, unsigned>>& operators() const {
+        return _operators;
+    }
+    std::vector<std::variant<std::string, unsigned>>& operators() {
+        return _operators;
+    }
+    friend std::ostream& operator<<(std::ostream&, const path&);
 };

 // When an expression is first parsed, all constants are references, like
--- a/alternator/rjson.hh
+++ b/alternator/rjson.hh
@@ -1,177 +0,0 @@
-/*
- * Copyright 2019 ScyllaDB
- */
-
-/*
- * This file is part of Scylla.
- *
- * Scylla is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Scylla is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-/*
- * rjson is a wrapper over rapidjson library, providing fast JSON parsing and generation.
- *
- * rapidjson has strict copy elision policies, which, among other things, involves
- * using provided char arrays without copying them and allows copying objects only explicitly.
- * As such, one should be careful when passing strings with limited liveness
- * (e.g. data underneath local std::strings) to rjson functions, because created JSON objects
- * may end up relying on dangling char pointers. All rjson functions that create JSONs from strings
- * by rjson have both APIs for string_ref_type (more optimal, used when the string is known to live
- * at least as long as the object, e.g. a static char array) and for std::strings. The more optimal
- * variants should be used *only* if the liveness of the string is guaranteed, otherwise it will
- * result in undefined behaviour.
- * Also, bear in mind that methods exposed by rjson::value are generic, but some of them
- * work fine only for specific types. In case the type does not match, an rjson::error will be thrown.
- * Examples of such mismatched usages is calling MemberCount() on a JSON value not of object type
- * or calling Size() on a non-array value.
- */
-
-#include <string>
-#include <stdexcept>
-
-namespace rjson {
-class error : public std::exception {
-    std::string _msg;
-public:
-    error() = default;
-    error(const std::string& msg) : _msg(msg) {}
-
-    virtual const char* what() const noexcept override { return _msg.c_str(); }
-};
-}
-
-// rapidjson configuration macros
-#define RAPIDJSON_HAS_STDSTRING 1
-// Default rjson policy is to use assert() - which is dangerous for two reasons:
-// 1. assert() can be turned off with -DNDEBUG
-// 2. assert() crashes a program
-// Fortunately, the default policy can be overridden, and so rapidjson errors will
-// throw an rjson::error exception instead.
-#define RAPIDJSON_ASSERT(x) do { if (!(x)) throw rjson::error(std::string("JSON error: condition not met: ") + #x); } while (0)
-
-#include <rapidjson/document.h>
-#include <rapidjson/writer.h>
-#include <rapidjson/stringbuffer.h>
-#include <rapidjson/error/en.h>
-#include <seastar/core/sstring.hh>
-#include "seastarx.hh"
-
-namespace rjson {
-
-using allocator = rapidjson::CrtAllocator;
-using encoding = rapidjson::UTF8<>;
-using document = rapidjson::GenericDocument<encoding, allocator>;
-using value = rapidjson::GenericValue<encoding, allocator>;
-using string_ref_type = value::StringRefType;
-using string_buffer = rapidjson::GenericStringBuffer<encoding>;
-using writer = rapidjson::Writer<string_buffer, encoding>;
-using type = rapidjson::Type;
-
-// Returns an object representing JSON's null
-inline rjson::value null_value() {
-    return rjson::value(rapidjson::kNullType);
-}
-
-// Returns an empty JSON object - {}
-inline rjson::value empty_object() {
-    return rjson::value(rapidjson::kObjectType);
-}
-
-// Returns an empty JSON array - []
-inline rjson::value empty_array() {
-    return rjson::value(rapidjson::kArrayType);
-}
-
-// Returns an empty JSON string - ""
-inline rjson::value empty_string() {
-    return rjson::value(rapidjson::kStringType);
-}
-
-// Convert the JSON value to a string with JSON syntax, the opposite of parse().
-// The representation is dense - without any redundant indentation.
-std::string print(const rjson::value& value);
-
-// Returns a string_view to the string held in a JSON value (which is
-// assumed to hold a string, i.e., v.IsString() == true). This is a view
-// to the existing data - no copying is done.
-inline std::string_view to_string_view(const rjson::value& v) {
-    return std::string_view(v.GetString(), v.GetStringLength());
-}
-
-// Copies given JSON value - involves allocation
-rjson::value copy(const rjson::value& value);
-
-// Parses a JSON value from given string or raw character array.
-// The string/char array liveness does not need to be persisted,
-// as parse() will allocate member names and values.
-// Throws rjson::error if parsing failed.
-rjson::value parse(std::string_view str);
-// Needs to be run in thread context
-rjson::value parse_yieldable(std::string_view str);
-
-// Creates a JSON value (of JSON string type) out of internal string representations.
-// The string value is copied, so str's liveness does not need to be persisted.
-rjson::value from_string(const std::string& str);
-rjson::value from_string(const sstring& str);
-rjson::value from_string(const char* str, size_t size);
-rjson::value from_string(std::string_view view);
-
-// Returns a pointer to JSON member if it exists, nullptr otherwise
-rjson::value* find(rjson::value& value, std::string_view name);
-const rjson::value* find(const rjson::value& value, std::string_view name);
-
-// Returns a reference to JSON member if it exists, throws otherwise
-rjson::value& get(rjson::value& value, std::string_view name);
-const rjson::value& get(const rjson::value& value, std::string_view name);
-
-// Sets a member in given JSON object by moving the member - allocates the name.
-// Throws if base is not a JSON object.
-void set_with_string_name(rjson::value& base, const std::string& name, rjson::value&& member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::value&& member);
-
-// Sets a string member in given JSON object by assigning its reference - allocates the name.
-// NOTICE: member string liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set_with_string_name(rjson::value& base, const std::string& name, rjson::string_ref_type member);
-void set_with_string_name(rjson::value& base, std::string_view name, rjson::string_ref_type member);
-
-// Sets a member in given JSON object by moving the member.
-// NOTICE: name liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set(rjson::value& base, rjson::string_ref_type name, rjson::value&& member);
-
-// Sets a string member in given JSON object by assigning its reference.
-// NOTICE: name liveness must be ensured to be at least as long as base's.
-// NOTICE: member liveness must be ensured to be at least as long as base's.
-// Throws if base is not a JSON object.
-void set(rjson::value& base, rjson::string_ref_type name, rjson::string_ref_type member);
-
-// Adds a value to a JSON list by moving the item to its end.
-// Throws if base_array is not a JSON array.
-void push_back(rjson::value& base_array, rjson::value&& item);
-
-// Remove a member from a JSON object. Throws if value isn't an object.
-bool remove_member(rjson::value& value, std::string_view name);
-
-struct single_value_comp {
-    bool operator()(const rjson::value& r1, const rjson::value& r2) const;
-};
-
-} // end namespace rjson
-
-namespace std {
-std::ostream& operator<<(std::ostream& os, const rjson::value& v);
-}
--- a/alternator/rmw_operation.hh
+++ b/alternator/rmw_operation.hh
@@ -24,7 +24,7 @@
 #include "seastarx.hh"
 #include "service/storage_proxy.hh"
 #include "service/storage_proxy.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "executor.hh"

 namespace alternator {
@@ -87,7 +87,11 @@ protected:
    // When _returnvalues != NONE, apply() should store here, in JSON form,
    // the values which are to be returned in the "Attributes" field.
    // The default null JSON means do not return an Attributes field at all.
-    rjson::value _return_attributes;
+    // This field is marked "mutable" so that the const apply() can modify
+    // it (see explanation below), but note that because apply() may be
+    // called more than once, if apply() will sometimes set this field it
+    // must set it (even if just to the default empty value) every time.
+    mutable rjson::value _return_attributes;
 public:
    // The constructor of a rmw_operation subclass should parse the request
    // and try to discover as many input errors as it can before really
@@ -100,7 +104,12 @@ public:
    // conditional expression, apply() should return an empty optional.
    // apply() may throw if it encounters input errors not discovered during
    // the constructor.
-    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
+    // apply() may be called more than once in case of contention, so it must
+    // not change the state saved in the object (issue #7218 was caused by
+    // violating this). We mark apply() "const" to let the compiler validate
+    // this for us. The output-only field _return_attributes is marked
+    // "mutable" above so that apply() can still write to it.
+    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
    // Convert the above apply() into the signature needed by cas_request:
    virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
    virtual ~rmw_operation() = default;
--- a/alternator/serialization.cc
+++ b/alternator/serialization.cc
@@ -65,7 +65,7 @@ struct from_json_visitor {

    void operator()(const reversed_type_impl& t) const { visit(*t.underlying_type(), from_json_visitor{v, bo}); };
    void operator()(const string_type_impl& t) {
-        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+        bo.write(t.from_string(rjson::to_string_view(v)));
    }
    void operator()(const bytes_type_impl& t) const {
        bo.write(base64_decode(v));
@@ -74,23 +74,27 @@ struct from_json_visitor {
        bo.write(boolean_type->decompose(v.GetBool()));
    }
    void operator()(const decimal_type_impl& t) const {
-        bo.write(t.from_string(sstring_view(v.GetString(), v.GetStringLength())));
+        try {
+            bo.write(t.from_string(rjson::to_string_view(v)));
+        } catch (const marshal_exception& e) {
+            throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", v));
+        }
    }
    // default
    void operator()(const abstract_type& t) const {
-        bo.write(from_json_object(t, Json::Value(rjson::print(v)), cql_serialization_format::internal()));
+        bo.write(from_json_object(t, v, cql_serialization_format::internal()));
    }
 };

 bytes serialize_item(const rjson::value& item) {
    if (item.IsNull() || item.MemberCount() != 1) {
-        throw api_error("ValidationException", format("An item can contain only one attribute definition: {}", item));
+        throw api_error::validation(format("An item can contain only one attribute definition: {}", item));
    }
    auto it = item.MemberBegin();
    type_info type_info = type_info_from_string(rjson::to_string_view(it->name)); // JSON keys are guaranteed to be strings

    if (type_info.atype == alternator_type::NOT_SUPPORTED_YET) {
-        slogger.trace("Non-optimal serialization of type {}", it->name.GetString());
+        slogger.trace("Non-optimal serialization of type {}", it->name);
        return bytes{int8_t(type_info.atype)} + to_bytes(rjson::print(item));
    }

@@ -128,7 +132,7 @@ struct to_json_visitor {
 rjson::value deserialize_item(bytes_view bv) {
    rjson::value deserialized(rapidjson::kObjectType);
    if (bv.empty()) {
-        throw api_error("ValidationException", "Serialized value empty");
+        throw api_error::validation("Serialized value empty");
    }

    alternator_type atype = alternator_type(bv[0]);
@@ -164,7 +168,7 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
    std::string column_name = column.name_as_text();
    const rjson::value* key_typed_value = rjson::find(item, column_name);
    if (!key_typed_value) {
-        throw api_error("ValidationException", format("Key column {} not found", column_name));
+        throw api_error::validation(format("Key column {} not found", column_name));
    }
    return get_key_from_typed_value(*key_typed_value, column);
 }
@@ -175,20 +179,20 @@ bytes get_key_column_value(const rjson::value& item, const column_definition& co
 bytes get_key_from_typed_value(const rjson::value& key_typed_value, const column_definition& column) {
    if (!key_typed_value.IsObject() || key_typed_value.MemberCount() != 1 ||
            !key_typed_value.MemberBegin()->value.IsString()) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("Malformed value object for key column {}: {}",
                        column.name_as_text(), key_typed_value));
    }

    auto it = key_typed_value.MemberBegin();
    if (it->name != type_to_string(column.type)) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("Type mismatch: expected type {} for key column {}, got type {}",
-                        type_to_string(column.type), column.name_as_text(), it->name.GetString()));
+                        type_to_string(column.type), column.name_as_text(), it->name));
    }
    std::string_view value_view = rjson::to_string_view(it->value);
    if (value_view.empty()) {
-        throw api_error("ValidationException",
+        throw api_error::validation(
                format("The AttributeValue for a key attribute cannot contain an empty string value. Key: {}", column.name_as_text()));
    }
    if (column.type == bytes_type) {
@@ -247,20 +251,24 @@ clustering_key ck_from_json(const rjson::value& item, schema_ptr schema) {

 big_decimal unwrap_number(const rjson::value& v, std::string_view diagnostic) {
    if (!v.IsObject() || v.MemberCount() != 1) {
-        throw api_error("ValidationException", format("{}: invalid number object", diagnostic));
+        throw api_error::validation(format("{}: invalid number object", diagnostic));
    }
    auto it = v.MemberBegin();
    if (it->name != "N") {
-        throw api_error("ValidationException", format("{}: expected number, found type '{}'", diagnostic, it->name));
+        throw api_error::validation(format("{}: expected number, found type '{}'", diagnostic, it->name));
    }
-    if (it->value.IsNumber()) {
-         // FIXME(sarna): should use big_decimal constructor with numeric values directly:
-        return big_decimal(rjson::print(it->value));
+    try {
+        if (it->value.IsNumber()) {
+             // FIXME(sarna): should use big_decimal constructor with numeric values directly:
+            return big_decimal(rjson::print(it->value));
+        }
+        if (!it->value.IsString()) {
+            throw api_error::validation(format("{}: improperly formatted number constant", diagnostic));
+        }
+        return big_decimal(rjson::to_string_view(it->value));
+    } catch (const marshal_exception& e) {
+        throw api_error::validation(format("The parameter cannot be converted to a numeric value: {}", it->value));
    }
-    if (!it->value.IsString()) {
-        throw api_error("ValidationException", format("{}: improperly formatted number constant", diagnostic));
-    }
-    return big_decimal(it->value.GetString());
 }

 const std::pair<std::string, const rjson::value*> unwrap_set(const rjson::value& v) {
@@ -312,10 +320,10 @@ rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+        throw api_error::validation(format("Mismatched set types: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
-        throw api_error("ValidationException", "UpdateExpression: ADD operation for sets must be given sets as arguments");
+        throw api_error::validation("UpdateExpression: ADD operation for sets must be given sets as arguments");
    }
    rjson::value sum = rjson::copy(*set1);
    std::set<rjson::value, rjson::single_value_comp> set1_raw;
@@ -323,7 +331,7 @@ rjson::value set_sum(const rjson::value& v1, const rjson::value& v2) {
        set1_raw.insert(rjson::copy(*it));
    }
    for (const auto& a : set2->GetArray()) {
-        if (set1_raw.count(a) == 0) {
+        if (!set1_raw.contains(a)) {
            rjson::push_back(sum, rjson::copy(a));
        }
    }
@@ -340,10 +348,10 @@ std::optional<rjson::value> set_diff(const rjson::value& v1, const rjson::value&
    auto [set1_type, set1] = unwrap_set(v1);
    auto [set2_type, set2] = unwrap_set(v2);
    if (set1_type != set2_type) {
-        throw api_error("ValidationException", format("Mismatched set types: {} and {}", set1_type, set2_type));
+        throw api_error::validation(format("Mismatched set types: {} and {}", set1_type, set2_type));
    }
    if (!set1 || !set2) {
-        throw api_error("ValidationException", "UpdateExpression: DELETE operation can only be performed on a set");
+        throw api_error::validation("UpdateExpression: DELETE operation can only be performed on a set");
    }
    std::set<rjson::value, rjson::single_value_comp> set1_raw;
    for (auto it = set1->Begin(); it != set1->End(); ++it) {
--- a/alternator/serialization.hh
+++ b/alternator/serialization.hh
@@ -26,7 +26,7 @@
 #include "types.hh"
 #include "schema_fwd.hh"
 #include "keys.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "utils/big_decimal.hh"

 namespace alternator {
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -25,7 +25,7 @@
 #include <seastar/json/json_elements.hh>
 #include "seastarx.hh"
 #include "error.hh"
-#include "rjson.hh"
+#include "utils/rjson.hh"
 #include "auth.hh"
 #include <cctype>
 #include "cql3/query_processor.hh"
@@ -75,20 +75,17 @@ public:
                 // returned to the client as expected. Other types of
                 // exceptions are unexpected, and returned to the user
                 // as an internal server error:
-                 api_error ret;
                 try {
                     resf.get();
                 } catch (api_error &ae) {
-                     ret = ae;
+                     generate_error_reply(*rep, ae);
                 } catch (rjson::error & re) {
-                     ret = api_error("ValidationException", re.what());
+                     generate_error_reply(*rep,
+                             api_error::validation(re.what()));
                 } catch (...) {
-                     ret = api_error(
-                             "Internal Server Error",
-                             format("Internal server error: {}", std::current_exception()),
-                             reply::status_type::internal_server_error);
+                     generate_error_reply(*rep,
+                             api_error::internal(format("Internal server error: {}", std::current_exception())));
                 }
-                 generate_error_reply(*rep, ret);
                 return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
             }
             auto res = resf.get0();
@@ -96,6 +93,10 @@ public:
                 [&] (const json::json_return_type& json_return_value) {
                     slogger.trace("api_handler success case");
                     if (json_return_value._body_writer) {
+                         // Unfortunately, write_body() forces us to choose
+                         // from a fixed and irrelevant list of "mime-types"
+                         // at this point. But we'll override it with the
+                         // one (application/x-amz-json-1.0) below.
                         rep->write_body("json", std::move(json_return_value._body_writer));
                     } else {
                         rep->_content += json_return_value._res;
@@ -108,14 +109,15 @@ public:

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
-    }), _type("json") { }
+    }) { }

    api_handler(const api_handler&) = default;
    future<std::unique_ptr<reply>> handle(const sstring& path,
            std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        return _f_handle(std::move(req), std::move(rep)).then(
                [this](std::unique_ptr<reply> rep) {
-                    rep->done(_type);
+                    rep->set_mime_type("application/x-amz-json-1.0");
+                    rep->done();
                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
                });
    }
@@ -129,7 +131,6 @@ protected:
    }

    future_handler_function _f_handle;
-    sstring _type;
 };

 class gated_handler : public handler_base {
@@ -188,31 +189,38 @@ future<> server::verify_signature(const request& req) {
    }
    auto host_it = req._headers.find("Host");
    if (host_it == req._headers.end()) {
-        throw api_error("InvalidSignatureException", "Host header is mandatory for signature verification");
+        throw api_error::invalid_signature("Host header is mandatory for signature verification");
    }
    auto authorization_it = req._headers.find("Authorization");
    if (authorization_it == req._headers.end()) {
-        throw api_error("InvalidSignatureException", "Authorization header is mandatory for signature verification");
+        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
-    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
+    std::string_view authorization_header = authorization_it->second;
+    auto pos = authorization_header.find_first_of(' ');
+    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
+        throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+    }
+    authorization_header.remove_prefix(pos+1);
    std::string credential;
    std::string user_signature;
    std::string signed_headers_str;
    std::vector<std::string_view> signed_headers;
-    for (std::string_view entry : credentials_raw) {
+    do {
+        // Either one of a comma or space can mark the end of an entry
+        pos = authorization_header.find_first_of(" ,");
+        std::string_view entry = authorization_header.substr(0, pos);
+        if (pos != std::string_view::npos) {
+            authorization_header.remove_prefix(pos + 1);
+        }
+        if (entry.empty()) {
+            continue;
+        }
        std::vector<std::string_view> entry_split = split(entry, '=');
        if (entry_split.size() != 2) {
-            if (entry != "AWS4-HMAC-SHA256") {
-                throw api_error("InvalidSignatureException", format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
-            }
            continue;
        }
        std::string_view auth_value = entry_split[1];
-        // Commas appear as an additional (quite redundant) delimiter
-        if (auth_value.back() == ',') {
-            auth_value.remove_suffix(1);
-        }
        if (entry_split[0] == "Credential") {
            credential = std::string(auth_value);
        } else if (entry_split[0] == "Signature") {
@@ -222,10 +230,11 @@ future<> server::verify_signature(const request& req) {
            signed_headers = split(auth_value, ';');
            std::sort(signed_headers.begin(), signed_headers.end());
        }
-    }
+    } while (pos != std::string_view::npos);
+
    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
-        throw api_error("ValidationException", format("Incorrect credential information format: {}", credential));
+        throw api_error::validation(format("Incorrect credential information format: {}", credential));
    }
    std::string user(credential_split[0]);
    std::string datestamp(credential_split[1]);
@@ -246,8 +255,8 @@ future<> server::verify_signature(const request& req) {
        }
    }

-    auto cache_getter = [] (std::string username) {
-        return get_key_from_roles(cql3::get_query_processor().local(), std::move(username));
+    auto cache_getter = [&qp = _qp] (std::string username) {
+        return get_key_from_roles(qp, std::move(username));
    };
    return _key_cache.get_ptr(user, cache_getter).then([this, &req,
                                                    user = std::move(user),
@@ -263,7 +272,7 @@ future<> server::verify_signature(const request& req) {

        if (signature != std::string_view(user_signature)) {
            _key_cache.remove(user);
-            throw api_error("UnrecognizedClientException", "The security token included in the request is invalid.");
+            throw api_error::unrecognized_client("The security token included in the request is invalid.");
        }
    });
 }
@@ -274,13 +283,12 @@ future<executor::request_return_type> server::handle_api_request(std::unique_ptr
    std::vector<std::string_view> split_target = split(target, '.');
    //NOTICE(sarna): Target consists of Dynamo API version followed by a dot '.' and operation type (e.g. CreateTable)
    std::string op = split_target.empty() ? std::string() : std::string(split_target.back());
-    slogger.trace("Request: {} {}", op, req->content);
+    slogger.trace("Request: {} {} {}", op, req->content, req->_headers);
    return verify_signature(*req).then([this, op, req = std::move(req)] () mutable {
        auto callback_it = _callbacks.find(op);
        if (callback_it == _callbacks.end()) {
            _executor._stats.unsupported_operations++;
-            throw api_error("UnknownOperationException",
-                    format("Unsupported operation {}", op));
+            throw api_error::unknown_operation(format("Unsupported operation {}", op));
        }
        return with_gate(_pending_requests, [this, callback_it = std::move(callback_it), op = std::move(op), req = std::move(req)] () mutable {
            //FIXME: Client state can provide more context, e.g. client's endpoint address
@@ -332,10 +340,11 @@ void server::set_routes(routes& r) {
 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(executor& exec)
+server::server(executor& exec, cql3::query_processor& qp)
        : _http_server("http-alternator")
        , _https_server("https-alternator")
        , _executor(exec)
+        , _qp(qp)
        , _key_cache(1024, 1min, slogger)
        , _enforce_authorization(false)
        , _enabled_servers{}
@@ -350,6 +359,9 @@ server::server(executor& exec)
        {"DeleteTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.delete_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
        }},
+        {"UpdateTable", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.update_table(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
        {"PutItem", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.put_item(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
        }},
@@ -389,6 +401,18 @@ server::server(executor& exec)
        {"ListTagsOfResource", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
            return e.list_tags_of_resource(client_state, std::move(permit), std::move(json_request));
        }},
+        {"ListStreams", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.list_streams(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"DescribeStream", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.describe_stream(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"GetShardIterator", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_shard_iterator(client_state, std::move(permit), std::move(json_request));
+        }},
+        {"GetRecords", [] (executor& e, executor::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value json_request, std::unique_ptr<request> req) {
+            return e.get_records(client_state, std::move(trace_state), std::move(permit), std::move(json_request));
+        }},
    } {
 }

--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -41,6 +41,7 @@ class server {
    http_server _http_server;
    http_server _https_server;
    executor& _executor;
+    cql3::query_processor& _qp;

    key_cache _key_cache;
    bool _enforce_authorization;
@@ -68,7 +69,7 @@ class server {
    json_parser _json_parser;

 public:
-    server(executor& executor);
+    server(executor& executor, cql3::query_processor& qp);

    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
            bool enforce_authorization, semaphore* memory_limiter);
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -20,7 +20,7 @@
 */

 #include "stats.hh"
-
+#include "utils/histogram_metrics_helper.hh"
 #include <seastar/core/metrics.hh>

 namespace alternator {
@@ -37,7 +37,8 @@ stats::stats() : api_operations{} {
                        seastar::metrics::description("number of operations via Alternator API"), {op(CamelCaseName)}),
 #define OPERATION_LATENCY(name, CamelCaseName) \
                seastar::metrics::make_histogram("op_latency", \
-                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return api_operations.name.get_histogram(1,20);}),
+                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
+            OPERATION(batch_get_item, "BatchGetItem")
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
@@ -77,6 +78,11 @@ stats::stats() : api_operations{} {
            OPERATION_LATENCY(get_item_latency, "GetItem")
            OPERATION_LATENCY(delete_item_latency, "DeleteItem")
            OPERATION_LATENCY(update_item_latency, "UpdateItem")
+            OPERATION(list_streams, "ListStreams")
+            OPERATION(describe_stream, "DescribeStream")
+            OPERATION(get_shard_iterator, "GetShardIterator")
+            OPERATION(get_records, "GetRecords")
+            OPERATION_LATENCY(get_records_latency, "GetRecords")
    });
    _metrics.add_group("alternator", {
            seastar::metrics::make_total_operations("unsupported_operations", unsupported_operations,
--- a/alternator/stats.hh
+++ b/alternator/stats.hh
@@ -74,11 +74,16 @@ public:
        uint64_t update_item = 0;
        uint64_t update_table = 0;
        uint64_t update_time_to_live = 0;
+        uint64_t list_streams = 0;
+        uint64_t describe_stream = 0;
+        uint64_t get_shard_iterator = 0;
+        uint64_t get_records = 0;

-        utils::estimated_histogram put_item_latency;
-        utils::estimated_histogram get_item_latency;
-        utils::estimated_histogram delete_item_latency;
-        utils::estimated_histogram update_item_latency;
+        utils::time_estimated_histogram put_item_latency;
+        utils::time_estimated_histogram get_item_latency;
+        utils::time_estimated_histogram delete_item_latency;
+        utils::time_estimated_histogram update_item_latency;
+        utils::time_estimated_histogram get_records_latency;
    } api_operations;
    // Miscellaneous event counters
    uint64_t total_operations = 0;
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -2925,6 +2925,10 @@
         "id":"toppartitions_query_results",
         "description":"nodetool toppartitions query results",
         "properties":{
+            "read_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "read":{
               "type":"array",
               "items":{
@@ -2932,6 +2936,10 @@
               },
               "description":"Read results"
            },
+            "write_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "write":{
               "type":"array",
               "items":{
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -148,6 +148,30 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/gossiper/force_remove_endpoint/{addr}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Force remove an endpoint from gossip",
+               "type":"void",
+               "nickname":"force_remove_endpoint",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"addr",
+                     "description":"The endpoint address",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
      }
   ]
 }
--- a/api/api-doc/messaging_service.json
+++ b/api/api-doc/messaging_service.json
@@ -249,7 +249,7 @@
                 "MIGRATION_REQUEST",
                 "PREPARE_MESSAGE",
                 "PREPARE_DONE_MESSAGE",
-                 "STREAM_MUTATION",
+                 "UNUSED__STREAM_MUTATION",
                 "STREAM_MUTATION_DONE",
                 "COMPLETE_MESSAGE",
                 "REPAIR_CHECKSUM_RANGE",
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -68,7 +68,7 @@
               "summary":"Get the hinted handoff enabled by dc",
               "type":"array",
               "items":{
-                  "type":"mapper_list"
+                  "type":"array"
               },
               "nickname":"get_hinted_handoff_enabled_by_dc",
               "produces":[
--- a/api/api-doc/storage_service.json
+++ b/api/api-doc/storage_service.json
@@ -833,6 +833,43 @@
            }
         ]
      },
+      {
+         "path":"/storage_service/repair_status/",
+         "operations":[
+            {
+               "method":"GET",
+               "summary":"Query the repair status and return when the repair is finished or timeout",
+               "type":"string",
+               "enum":[
+                  "RUNNING",
+                  "SUCCESSFUL",
+                  "FAILED"
+               ],
+               "nickname":"repair_await_completion",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"id",
+                     "description":"The repair ID to check for status",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type": "long",
+                     "paramType":"query"
+                  },
+                  {
+                     "name":"timeout",
+                     "description":"Seconds to wait before the query returns even if the repair is not finished. The value -1 or not providing this parameter means no timeout",
+                     "required":false,
+                     "allowMultiple":false,
+                     "type": "long",
+                     "paramType":"query"
+                  }
+               ]
+            }
+         ]
+      },
      {
         "path":"/storage_service/repair_async/{keyspace}",
         "operations":[
@@ -2431,7 +2468,7 @@
            "version":{
               "type":"string",
               "enum":[
-                  "ka", "la", "mc"
+                  "ka", "la", "mc", "md"
               ],
               "description":"SSTable version"
            },
--- a/api/api.cc
+++ b/api/api.cc
@@ -113,8 +113,20 @@ future<> set_server_storage_service(http_context& ctx) {
    return register_api(ctx, "storage_service", "The storage service API", set_storage_service);
 }

-future<> set_server_snapshot(http_context& ctx) {
-    return ctx.http_server.set_routes([&ctx] (routes& r) { set_snapshot(ctx, r); });
+future<> set_server_repair(http_context& ctx, sharded<netw::messaging_service>& ms) {
+    return ctx.http_server.set_routes([&ctx, &ms] (routes& r) { set_repair(ctx, r, ms); });
+}
+
+future<> unset_server_repair(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_repair(ctx, r); });
+}
+
+future<> set_server_snapshot(http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl) {
+    return ctx.http_server.set_routes([&ctx, &snap_ctl] (routes& r) { set_snapshot(ctx, r, snap_ctl); });
+}
+
+future<> unset_server_snapshot(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_snapshot(ctx, r); });
 }

 future<> set_server_snitch(http_context& ctx) {
@@ -131,9 +143,14 @@ future<> set_server_load_sstable(http_context& ctx) {
                "The column family API", set_column_family);
 }

-future<> set_server_messaging_service(http_context& ctx) {
+future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms) {
    return register_api(ctx, "messaging_service",
-                "The messaging service API", set_messaging_service);
+                "The messaging service API", [&ms] (http_context& ctx, routes& r) {
+                    set_messaging_service(ctx, r, ms);
+                });
+}
+future<> unset_server_messaging_service(http_context& ctx) {
+    return ctx.http_server.set_routes([&ctx] (routes& r) { unset_messaging_service(ctx, r); });
 }

 future<> set_server_storage_proxy(http_context& ctx) {
--- a/api/api.hh
+++ b/api/api.hh
@@ -256,4 +256,6 @@ public:
    operator T() const { return value; }
 };

+utils_json::estimated_histogram time_to_json_histogram(const utils::time_estimated_histogram& val);
+
 }
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -24,9 +24,11 @@
 #include <seastar/http/httpd.hh>

 namespace service { class load_meter; }
-namespace locator { class token_metadata; }
+namespace locator { class shared_token_metadata; }
 namespace cql_transport { class controller; }
 class thrift_controller;
+namespace db { class snapshot_ctl; }
+namespace netw { class messaging_service; }

 namespace api {

@@ -37,27 +39,33 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
-    sharded<locator::token_metadata>& token_metadata;
+    const sharded<locator::shared_token_metadata>& shared_token_metadata;

    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm, sharded<locator::token_metadata>& _tm)
-            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
+            service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm)
+            : db(_db), sp(_sp), lmeter(_lm), shared_token_metadata(_stm) {
    }
+
+    const locator::token_metadata& get_token_metadata();
 };

 future<> set_server_init(http_context& ctx);
 future<> set_server_config(http_context& ctx);
 future<> set_server_snitch(http_context& ctx);
 future<> set_server_storage_service(http_context& ctx);
+future<> set_server_repair(http_context& ctx, sharded<netw::messaging_service>& ms);
+future<> unset_server_repair(http_context& ctx);
 future<> set_transport_controller(http_context& ctx, cql_transport::controller& ctl);
 future<> unset_transport_controller(http_context& ctx);
 future<> set_rpc_controller(http_context& ctx, thrift_controller& ctl);
 future<> unset_rpc_controller(http_context& ctx);
-future<> set_server_snapshot(http_context& ctx);
+future<> set_server_snapshot(http_context& ctx, sharded<db::snapshot_ctl>& snap_ctl);
+future<> unset_server_snapshot(http_context& ctx);
 future<> set_server_gossip(http_context& ctx);
 future<> set_server_load_sstable(http_context& ctx);
-future<> set_server_messaging_service(http_context& ctx);
+future<> set_server_messaging_service(http_context& ctx, sharded<netw::messaging_service>& ms);
+future<> unset_server_messaging_service(http_context& ctx);
 future<> set_server_storage_proxy(http_context& ctx);
 future<> set_server_stream_manager(http_context& ctx);
 future<> set_server_gossip_settle(http_context& ctx);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -249,6 +249,12 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, bool total)
    });
 }

+future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, const sstring& name, std::function<utils::time_estimated_histogram(const column_family&)> f) {
+    return map_reduce_cf_raw(ctx, name, utils::time_estimated_histogram(), f, utils::time_estimated_histogram_merge).then([](const utils::time_estimated_histogram& res) {
+        return make_ready_future<json::json_return_type>(time_to_json_histogram(res));
+    });
+}
+
 template <typename T>
 class sum_ratio {
    uint64_t _n = 0;
@@ -304,7 +310,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return res;
    });

-    cf::get_column_family.set(r, [&ctx] (const_req req){
+    cf::get_column_family.set(r, [&ctx] (std::unique_ptr<request> req){
            vector<cf::column_family_info> res;
            for (auto i: ctx.db.local().get_column_families_mapping()) {
                cf::column_family_info info;
@@ -313,7 +319,7 @@ void set_column_family(http_context& ctx, routes& r) {
                info.type = "ColumnFamilies";
                res.push_back(info);
            }
-            return res;
+            return make_ready_future<json::json_return_type>(json::stream_object(std::move(res)));
        });

    cf::get_column_family_name_keyspace.set(r, [&ctx] (const_req req){
@@ -325,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
@@ -650,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -658,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -666,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -674,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -682,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -690,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -796,24 +802,21 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_cas_prepare.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_prepare;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_accept;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_cas_learn;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_sstables_per_read_histogram.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -862,7 +865,9 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_built_indexes.set(r, [&ctx](std::unique_ptr<request> req) {
-        auto [ks, cf_name] = parse_fully_qualified_cf_name(req->param["name"]);
+        auto ks_cf = parse_fully_qualified_cf_name(req->param["name"]);
+        auto&& ks = std::get<0>(ks_cf);
+        auto&& cf_name = std::get<1>(ks_cf);
        return db::system_keyspace::load_view_build_progress().then([ks, cf_name, &ctx](const std::vector<db::system_keyspace::view_build_progress>& vb) mutable {
            std::set<sstring> vp;
            for (auto b : vb) {
@@ -875,7 +880,7 @@ void set_column_family(http_context& ctx, routes& r) {
            column_family& cf = ctx.db.local().find_column_family(uuid);
            res.reserve(cf.get_index_manager().list_indexes().size());
            for (auto&& i : cf.get_index_manager().list_indexes()) {
-                if (vp.find(secondary_index::index_table_name(i.metadata().name())) == vp.end()) {
+                if (!vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
                    res.emplace_back(i.metadata().name());
                }
            }
@@ -909,17 +914,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_read_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_read;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::get_write_latency_estimated_histogram.set(r, [&ctx](std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
+        return map_reduce_cf_time_histogram(ctx, req->param["name"], [](const column_family& cf) {
            return cf.get_stats().estimated_write;
-        },
-        utils::estimated_histogram_merge, utils_json::estimated_histogram());
+        });
    });

    cf::set_compaction_strategy_class.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -988,6 +991,9 @@ void set_column_family(http_context& ctx, routes& r) {
                        apilog.debug("toppartitions query: processing results");
                        cf::toppartitions_query_results results;

+                        results.read_cardinality = topk_results.read.size();
+                        results.write_cardinality = topk_results.write.size();
+
                        for (auto& d: topk_results.read.top(q.list_size())) {
                            cf::toppartitions_record r;
                            r.partition = sstring(d.item);
--- a/api/column_family.hh
+++ b/api/column_family.hh
@@ -68,6 +68,8 @@ future<json::json_return_type> map_reduce_cf(http_context& ctx, const sstring& n
    });
 }

+future<json::json_return_type> map_reduce_cf_time_histogram(http_context& ctx, const sstring& name, std::function<utils::time_estimated_histogram(const column_family&)> f);
+
 struct map_reduce_column_families_locally {
    std::any init;
    std::function<std::unique_ptr<std::any>(column_family&)> mapper;
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -66,6 +66,13 @@ void set_gossiper(http_context& ctx, routes& r) {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
+
+    httpd::gossiper_json::force_remove_endpoint.set(r, [](std::unique_ptr<request> req) { // Route handler: asks the local gossiper to force-remove one endpoint.
+        gms::inet_address ep(req->param["addr"]); // endpoint address built from the "addr" request parameter
+        return gms::get_local_gossiper().force_remove_endpoint(ep).then([] {
+            return make_ready_future<json::json_return_type>(json_void()); // empty JSON reply, produced only after the removal future resolves
+        });
+    });
 }

 }
--- a/api/messaging_service.cc
+++ b/api/messaging_service.cc
@@ -53,8 +53,8 @@ std::vector<message_counter> map_to_message_counters(
 * according to a function that it gets as a parameter.
 *
 */
-future_json_function get_client_getter(std::function<uint64_t(const shard_info&)> f) {
-    return [f](std::unique_ptr<request> req) {
+future_json_function get_client_getter(sharded<netw::messaging_service>& ms, std::function<uint64_t(const shard_info&)> f) {
+    return [&ms, f](std::unique_ptr<request> req) {
        using map_type = std::unordered_map<gms::inet_address, uint64_t>;
        auto get_shard_map = [f](messaging_service& ms) {
            std::unordered_map<gms::inet_address, unsigned long> map;
@@ -63,15 +63,15 @@ future_json_function get_client_getter(std::function<uint64_t(const shard_info&)
            });
            return map;
        };
-        return  get_messaging_service().map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
+        return ms.map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
                then([](map_type&& map) {
            return make_ready_future<json::json_return_type>(map_to_message_counters(map));
        });
    };
 }

-future_json_function get_server_getter(std::function<uint64_t(const rpc::stats&)> f) {
-    return [f](std::unique_ptr<request> req) {
+future_json_function get_server_getter(sharded<netw::messaging_service>& ms, std::function<uint64_t(const rpc::stats&)> f) {
+    return [&ms, f](std::unique_ptr<request> req) {
        using map_type = std::unordered_map<gms::inet_address, uint64_t>;
        auto get_shard_map = [f](messaging_service& ms) {
            std::unordered_map<gms::inet_address, unsigned long> map;
@@ -80,53 +80,53 @@ future_json_function get_server_getter(std::function<uint64_t(const rpc::stats&)
            });
            return map;
        };
-        return  get_messaging_service().map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
+        return ms.map_reduce0(get_shard_map, map_type(), map_sum<map_type>).
                then([](map_type&& map) {
            return make_ready_future<json::json_return_type>(map_to_message_counters(map));
        });
    };
 }

-void set_messaging_service(http_context& ctx, routes& r) {
-    get_timeout_messages.set(r, get_client_getter([](const shard_info& c) {
+void set_messaging_service(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms) {
+    get_timeout_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().timeout;
    }));

-    get_sent_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_sent_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().sent_messages;
    }));

-    get_dropped_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_dropped_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        // We don't have the same drop message mechanism
        // as origin has.
        // hence we can always return 0
        return 0;
    }));

-    get_exception_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_exception_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().exception_received;
    }));

-    get_pending_messages.set(r, get_client_getter([](const shard_info& c) {
+    get_pending_messages.set(r, get_client_getter(ms, [](const shard_info& c) {
        return c.get_stats().pending;
    }));

-    get_respond_pending_messages.set(r, get_server_getter([](const rpc::stats& c) {
+    get_respond_pending_messages.set(r, get_server_getter(ms, [](const rpc::stats& c) {
        return c.pending;
    }));

-    get_respond_completed_messages.set(r, get_server_getter([](const rpc::stats& c) {
+    get_respond_completed_messages.set(r, get_server_getter(ms, [](const rpc::stats& c) {
        return c.sent_messages;
    }));

-    get_version.set(r, [](const_req req) {
-        return netw::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
+    get_version.set(r, [&ms](const_req req) {
+        return ms.local().get_raw_version(req.get_query_param("addr"));
    });

-    get_dropped_messages_by_ver.set(r, [](std::unique_ptr<request> req) {
+    get_dropped_messages_by_ver.set(r, [&ms](std::unique_ptr<request> req) {
        shared_ptr<std::vector<uint64_t>> map = make_shared<std::vector<uint64_t>>(num_verb);

-        return netw::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
+        return ms.map_reduce([map](const uint64_t* local_map) mutable {
            for (auto i = 0; i < num_verb; i++) {
                (*map)[i]+= local_map[i];
            }
@@ -151,5 +151,18 @@ void set_messaging_service(http_context& ctx, routes& r) {
        });
    });
 }
+
+void unset_messaging_service(http_context& ctx, routes& r) { // Unregisters from r the nine routes that set_messaging_service() installs; ctx is not used here.
+    get_timeout_messages.unset(r);
+    get_sent_messages.unset(r);
+    get_dropped_messages.unset(r);
+    get_exception_messages.unset(r);
+    get_pending_messages.unset(r);
+    get_respond_pending_messages.unset(r);
+    get_respond_completed_messages.unset(r);
+    get_version.unset(r);
+    get_dropped_messages_by_ver.unset(r);
+}
+
+
 }

--- a/api/messaging_service.hh
+++ b/api/messaging_service.hh
@@ -23,8 +23,11 @@

 #include "api.hh"

+namespace netw { class messaging_service; }
+
 namespace api {

-void set_messaging_service(http_context& ctx, routes& r);
+void set_messaging_service(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms); // registers the messaging-service routes on r; the handlers capture ms by reference
+void unset_messaging_service(http_context& ctx, routes& r); // removes the routes registered by set_messaging_service()

 }
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -201,29 +201,39 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<request> req)  {
-        auto enabled = ctx.db.local().get_config().hinted_handoff_enabled();
-        return make_ready_future<json::json_return_type>(enabled);
+        const auto& filter = service::get_storage_proxy().local().get_hints_host_filter();
+        return make_ready_future<json::json_return_type>(!filter.is_disabled_for_all());
    });

    sp::set_hinted_handoff_enabled.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
        auto enable = req->get_query_param("enable");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto filter = (enable == "true" || enable == "1") // only the exact strings "true" and "1" select the enabled-for-all filter
+                ? db::hints::host_filter(db::hints::host_filter::enabled_for_all_tag {})
+                : db::hints::host_filter(db::hints::host_filter::disabled_for_all_tag {}); // any other value (including an empty one) selects disabled-for-all
+        return service::get_storage_proxy().invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) { // the same filter value is handed to every storage_proxy instance
+            return sp.change_hints_host_filter(filter);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void()); // empty JSON reply once invoke_on_all has completed
+        });
    });

    sp::get_hinted_handoff_enabled_by_dc.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
-        std::vector<sp::mapper_list> res;
+        std::vector<sstring> res; // reply payload, filled from the hints host filter's get_dcs()
+        const auto& filter = service::get_storage_proxy().local().get_hints_host_filter();
+        const auto& dcs = filter.get_dcs();
+        res.reserve(dcs.size()); // size by the source container: res is still empty here, so reserving res.size() would reserve nothing
+        std::copy(dcs.begin(), dcs.end(), std::back_inserter(res));
        return make_ready_future<json::json_return_type>(res);
    });

    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
-        auto enable = req->get_query_param("dcs");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto dcs = req->get_query_param("dcs"); // raw value of the "dcs" query parameter
+        auto filter = db::hints::host_filter::parse_from_dc_list(std::move(dcs)); // NOTE(review): presumably throws on a malformed list, before any filter is changed -- confirm
+        return service::get_storage_proxy().invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) { // the parsed filter is handed to every storage_proxy instance
+            return sp.change_hints_host_filter(filter);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void()); // empty JSON reply once invoke_on_all has completed
+        });
    });

    sp::get_max_hint_window.set(r, [](std::unique_ptr<request> req)  {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -22,6 +22,7 @@
 #include "storage_service.hh"
 #include "api/api-doc/storage_service.json.hh"
 #include "db/config.hh"
+#include "db/schema_tables.hh"
 #include <optional>
 #include <time.h>
 #include <boost/range/adaptor/map.hpp>
@@ -41,11 +42,17 @@
 #include "sstables/sstables.hh"
 #include "database.hh"
 #include "db/extensions.hh"
+#include "db/snapshot-ctl.hh"
 #include "transport/controller.hh"
 #include "thrift/controller.hh"
+#include "locator/token_metadata.hh"

 namespace api {

+const locator::token_metadata& http_context::get_token_metadata() { // Convenience accessor used by the route handlers in place of direct token_metadata access.
+        return *shared_token_metadata.local().get(); // dereferences the pointer-like value returned by shared_token_metadata.local().get()
+}
+
 namespace ss = httpd::storage_service_json;
 using namespace json;

@@ -149,6 +156,104 @@ void unset_rpc_controller(http_context& ctx, routes& r) {
    ss::is_rpc_server_running.unset(r);
 }

+void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms) { // Registers the repair routes on r; ms is captured by reference and forwarded to repair_start().
+    ss::repair_async.set(r, [&ctx, &ms](std::unique_ptr<request> req) {
+        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental", // query parameters that are forwarded to repair_start() as options
+                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace",
+                "startToken", "endToken" };
+        std::unordered_map<sstring, sstring> options_map;
+        for (auto o : options) {
+            auto s = req->get_query_param(o);
+            if (s != "") { // parameters with an empty value are not forwarded
+                options_map[o] = s;
+            }
+        }
+
+        // The repair process is asynchronous: repair_start only starts it and
+        // returns immediately, not waiting for the repair to finish. The user
+        // then has other mechanisms to track the ongoing repair's progress,
+        // or stop it.
+        return repair_start(ctx.db, ms, validate_keyspace(ctx, req->param),
+                options_map).then([] (int i) {
+                    return make_ready_future<json::json_return_type>(i); // the int produced by repair_start() is the reply body
+                });
+    });
+
+    ss::get_active_repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
+        return get_active_repairs(ctx.db).then([] (std::vector<int> res){
+            return make_ready_future<json::json_return_type>(res);
+        });
+    });
+
+    ss::repair_async_status.set(r, [&ctx](std::unique_ptr<request> req) {
+        return repair_get_status(ctx.db, boost::lexical_cast<int>( req->get_query_param("id"))) // "id" is parsed here, outside the try block below
+                .then_wrapped([] (future<repair_status>&& fut) {
+            ss::ns_repair_async_status::return_type_wrapper res;
+            try {
+                res = fut.get0();
+            } catch(std::runtime_error& e) { // a runtime_error from the status lookup is re-thrown as bad_param_exception
+                throw httpd::bad_param_exception(e.what());
+            }
+            return make_ready_future<json::json_return_type>(json::json_return_type(res));
+        });
+    });
+
+    ss::repair_await_completion.set(r, [&ctx](std::unique_ptr<request> req) {
+        int id;
+        using clock = std::chrono::steady_clock;
+        clock::time_point expire;
+        try {
+            id = boost::lexical_cast<int>(req->get_query_param("id"));
+            // If timeout is not provided, it means no timeout.
+            sstring s = req->get_query_param("timeout");
+            int64_t timeout = s.empty() ? int64_t(-1) : boost::lexical_cast<int64_t>(s); // -1 is the "no timeout" sentinel
+            if (timeout < 0 && timeout != -1) {
+                return make_exception_future<json::json_return_type>(
+                        httpd::bad_param_exception("timeout can only be -1 (means no timeout) or non negative integer"));
+            }
+            if (timeout < 0) {
+                expire = clock::time_point::max(); // no timeout: use the largest representable deadline
+            } else {
+                expire = clock::now() + std::chrono::seconds(timeout); // timeout is interpreted as seconds from now
+            }
+        } catch (std::exception& e) { // any failure while parsing "id" or "timeout" becomes bad_param_exception
+            return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
+        }
+        return repair_await_completion(ctx.db, id, expire)
+                .then_wrapped([] (future<repair_status>&& fut) {
+            ss::ns_repair_async_status::return_type_wrapper res;
+            try {
+                res = fut.get0();
+            } catch (std::exception& e) { // here every std::exception (not only runtime_error) is mapped to bad_param_exception
+                return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
+            }
+            return make_ready_future<json::json_return_type>(json::json_return_type(res));
+        });
+    });
+
+    ss::force_terminate_all_repair_sessions.set(r, [](std::unique_ptr<request> req) {
+        return repair_abort_all(service::get_local_storage_service().db()).then([] { // uses the storage service's db() rather than ctx.db
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+    ss::force_terminate_all_repair_sessions_new.set(r, [](std::unique_ptr<request> req) { // same behaviour as force_terminate_all_repair_sessions, registered under a second route
+        return repair_abort_all(service::get_local_storage_service().db()).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
+
+}
+
+void unset_repair(http_context& ctx, routes& r) { // Unregisters from r the six routes that set_repair() installs; ctx is not used here.
+    ss::repair_async.unset(r);
+    ss::get_active_repair_async.unset(r);
+    ss::repair_async_status.unset(r);
+    ss::repair_await_completion.unset(r);
+    ss::force_terminate_all_repair_sessions.unset(r);
+    ss::force_terminate_all_repair_sessions_new.unset(r);
+}
+
 void set_storage_service(http_context& ctx, routes& r) {
    ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
        return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
@@ -157,14 +262,14 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -183,7 +288,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
-        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
+        return container_to_vec(ctx.get_token_metadata().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -192,7 +297,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
-        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
+        auto points = ctx.get_token_metadata().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -220,11 +325,26 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
        auto keyspace = validate_keyspace(ctx, req->param);
        std::vector<ss::maplist_mapper> res;
-        return make_ready_future<json::json_return_type>(res);
+        return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_range_to_address_map(keyspace),
+                [](const std::pair<dht::token_range, std::vector<gms::inet_address>>& entry){
+            ss::maplist_mapper m;
+            if (entry.first.start()) {
+                m.key.push(entry.first.start().value().value().to_sstring());
+            } else {
+                m.key.push("");
+            }
+            if (entry.first.end()) {
+                m.key.push(entry.first.end().value().value().to_sstring());
+            } else {
+                m.key.push("");
+            }
+            for (const gms::inet_address& address : entry.second) {
+                m.value.push(address.to_sstring());
+            }
+            return m;
+        }));
    });

    ss::get_pending_range_to_endpoint_map.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -246,7 +366,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -322,8 +442,8 @@ void set_storage_service(http_context& ctx, routes& r) {
                for (auto cf : column_families) {
                    column_families_vec.push_back(&db.find_column_family(keyspace, cf));
                }
-                return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
-                    return cm.perform_cleanup(cf);
+                return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
+                    return cm.perform_cleanup(db, cf);
                });
            }).then([]{
                return make_ready_future<json::json_return_type>(0);
@@ -338,7 +458,7 @@ void set_storage_service(http_context& ctx, routes& r) {
            return do_for_each(column_families, [=, &db](sstring cfname) {
                auto& cm = db.get_compaction_manager();
                auto& cf = db.find_column_family(keyspace, cfname);
-                return cm.perform_sstable_upgrade(&cf, exclude_current_version);
+                return cm.perform_sstable_upgrade(db, &cf, exclude_current_version);
            });
        }).then([]{
            return make_ready_future<json::json_return_type>(0);
@@ -361,59 +481,6 @@ void set_storage_service(http_context& ctx, routes& r) {
    });


-    ss::repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
-        static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
-                "jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace",
-                "startToken", "endToken" };
-        std::unordered_map<sstring, sstring> options_map;
-        for (auto o : options) {
-            auto s = req->get_query_param(o);
-            if (s != "") {
-                options_map[o] = s;
-            }
-        }
-
-        // The repair process is asynchronous: repair_start only starts it and
-        // returns immediately, not waiting for the repair to finish. The user
-        // then has other mechanisms to track the ongoing repair's progress,
-        // or stop it.
-        return repair_start(ctx.db, validate_keyspace(ctx, req->param),
-                options_map).then([] (int i) {
-                    return make_ready_future<json::json_return_type>(i);
-                });
-    });
-
-    ss::get_active_repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
-        return get_active_repairs(ctx.db).then([] (std::vector<int> res){
-            return make_ready_future<json::json_return_type>(res);
-        });
-    });
-
-    ss::repair_async_status.set(r, [&ctx](std::unique_ptr<request> req) {
-        return repair_get_status(ctx.db, boost::lexical_cast<int>( req->get_query_param("id")))
-                .then_wrapped([] (future<repair_status>&& fut) {
-            ss::ns_repair_async_status::return_type_wrapper res;
-            try {
-                res = fut.get0();
-            } catch(std::runtime_error& e) {
-                throw httpd::bad_param_exception(e.what());
-            }
-            return make_ready_future<json::json_return_type>(json::json_return_type(res));
-        });
-    });
-
-    ss::force_terminate_all_repair_sessions.set(r, [](std::unique_ptr<request> req) {
-        return repair_abort_all(service::get_local_storage_service().db()).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
-    ss::force_terminate_all_repair_sessions_new.set(r, [](std::unique_ptr<request> req) {
-        return repair_abort_all(service::get_local_storage_service().db()).then([] {
-            return make_ready_future<json::json_return_type>(json_void());
-        });
-    });
-
    ss::decommission.set(r, [](std::unique_ptr<request> req) {
        return service::get_local_storage_service().decommission().then([] {
            return make_ready_future<json::json_return_type>(json_void());
@@ -671,9 +738,12 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::reset_local_schema.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(json_void());
+        // FIXME: We should truncate schema tables if more than one node in the cluster.
+        auto& sp = service::get_storage_proxy();
+        auto& fs = service::get_local_storage_service().features();
+        return db::schema_tables::recalculate_schema_version(sp, fs).then([] { // currently the only action taken is recalculating the schema version
+            return make_ready_future<json::json_return_type>(json_void()); // empty JSON reply, sent after the recalculation future resolves
+        });
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
@@ -921,7 +991,7 @@ void set_storage_service(http_context& ctx, routes& r) {
                                    e.value = p.second;
                                    nm.attributes.push(std::move(e));
                                }
-                                if (!cp->options().count(compression_parameters::SSTABLE_COMPRESSION)) {
+                                if (!cp->options().contains(compression_parameters::SSTABLE_COMPRESSION)) {
                                    ss::mapper e;
                                    e.key = compression_parameters::SSTABLE_COMPRESSION;
                                    e.value = cp->name();
@@ -979,31 +1049,29 @@ void set_storage_service(http_context& ctx, routes& r) {

 }

-void set_snapshot(http_context& ctx, routes& r) {
-    ss::get_snapshot_details.set(r, [](std::unique_ptr<request> req) {
-        std::function<future<>(output_stream<char>&&)> f = [](output_stream<char>&& s) {
-            return do_with(output_stream<char>(std::move(s)), true, [] (output_stream<char>& s, bool& first){
-                return s.write("[").then([&s, &first] {
-                    return service::get_local_storage_service().get_snapshot_details().then([&s, &first] (std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>&& result) {
-                        return do_with(std::move(result), [&s, &first](const std::unordered_map<sstring, std::vector<service::storage_service::snapshot_details>>& result) {
-                            return do_for_each(result, [&s, &result,&first](std::tuple<sstring, std::vector<service::storage_service::snapshot_details>>&& map){
-                                return do_with(ss::snapshots(), [&s, &first, &result, &map](ss::snapshots& all_snapshots) {
-                                    all_snapshots.key = std::get<0>(map);
-                                    future<> f = first ? make_ready_future<>() : s.write(", ");
-                                    first = false;
-                                    std::vector<ss::snapshot> snapshot;
-                                    for (auto& cf: std::get<1>(map)) {
-                                        ss::snapshot snp;
-                                        snp.ks = cf.ks;
-                                        snp.cf = cf.cf;
-                                        snp.live = cf.live;
-                                        snp.total = cf.total;
-                                        snapshot.push_back(std::move(snp));
-                                    }
-                                    all_snapshots.value = std::move(snapshot);
-                                    return f.then([&s, &all_snapshots] {
-                                        return all_snapshots.write(s);
-                                    });
+void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_ctl) {
+    ss::get_snapshot_details.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+        return snap_ctl.local().get_snapshot_details().then([] (std::unordered_map<sstring, std::vector<db::snapshot_ctl::snapshot_details>>&& result) {
+            std::function<future<>(output_stream<char>&&)> f = [result = std::move(result)](output_stream<char>&& s) {
+                return do_with(output_stream<char>(std::move(s)), true, [&result] (output_stream<char>& s, bool& first){
+                    return s.write("[").then([&s, &first, &result] {
+                        return do_for_each(result, [&s, &first](std::tuple<sstring, std::vector<db::snapshot_ctl::snapshot_details>>&& map){
+                            return do_with(ss::snapshots(), [&s, &first, &map](ss::snapshots& all_snapshots) {
+                                all_snapshots.key = std::get<0>(map);
+                                future<> f = first ? make_ready_future<>() : s.write(", ");
+                                first = false;
+                                std::vector<ss::snapshot> snapshot;
+                                for (auto& cf: std::get<1>(map)) {
+                                    ss::snapshot snp;
+                                    snp.ks = cf.ks;
+                                    snp.cf = cf.cf;
+                                    snp.live = cf.live;
+                                    snp.total = cf.total;
+                                    snapshot.push_back(std::move(snp));
+                                }
+                                all_snapshots.value = std::move(snapshot);
+                                return f.then([&s, &all_snapshots] {
+                                    return all_snapshots.write(s);
                                });
                            });
                        });
@@ -1013,12 +1081,13 @@ void set_snapshot(http_context& ctx, routes& r) {
                        });
                    });
                });
-            });
-        };
-        return make_ready_future<json::json_return_type>(std::move(f));
+            };
+
+            return make_ready_future<json::json_return_type>(std::move(f));
+        });
    });

-    ss::take_snapshot.set(r, [](std::unique_ptr<request> req) {
+    ss::take_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
        auto tag = req->get_query_param("tag");
        auto column_families = split(req->get_query_param("cf"), ",");

@@ -1026,7 +1095,7 @@ void set_snapshot(http_context& ctx, routes& r) {

        auto resp = make_ready_future<>();
        if (column_families.empty()) {
-            resp = service::get_local_storage_service().take_snapshot(tag, keynames);
+            resp = snap_ctl.local().take_snapshot(tag, keynames);
        } else {
            if (keynames.empty()) {
                throw httpd::bad_param_exception("The keyspace of column families must be specified");
@@ -1034,37 +1103,37 @@ void set_snapshot(http_context& ctx, routes& r) {
            if (keynames.size() > 1) {
                throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
            }
-            resp = service::get_local_storage_service().take_column_family_snapshot(keynames[0], column_families, tag);
+            resp = snap_ctl.local().take_column_family_snapshot(keynames[0], column_families, tag);
        }
        return resp.then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::del_snapshot.set(r, [](std::unique_ptr<request> req) {
+    ss::del_snapshot.set(r, [&snap_ctl](std::unique_ptr<request> req) {
        auto tag = req->get_query_param("tag");
        auto column_family = req->get_query_param("cf");

        std::vector<sstring> keynames = split(req->get_query_param("kn"), ",");
-        return service::get_local_storage_service().clear_snapshot(tag, keynames, column_family).then([] {
+        return snap_ctl.local().clear_snapshot(tag, keynames, column_family).then([] {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });

-    ss::true_snapshots_size.set(r, [](std::unique_ptr<request> req) {
-        return service::get_local_storage_service().true_snapshots_size().then([] (int64_t size) {
+    ss::true_snapshots_size.set(r, [&snap_ctl](std::unique_ptr<request> req) {
+        return snap_ctl.local().true_snapshots_size().then([] (int64_t size) {
            return make_ready_future<json::json_return_type>(size);
        });
    });

-    ss::scrub.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
+    ss::scrub.set(r, wrap_ks_cf(ctx, [&snap_ctl] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
        const auto skip_corrupted = req_param<bool>(*req, "skip_corrupted", false);

        auto f = make_ready_future<>();
        if (!req_param<bool>(*req, "disable_snapshot", false)) {
            auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
-            f = parallel_for_each(column_families, [keyspace, tag](sstring cf) {
-                return service::get_local_storage_service().take_column_family_snapshot(keyspace, cf, tag);
+            f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
+                return snap_ctl.local().take_column_family_snapshot(keyspace, cf, tag);
            });
        }

@@ -1082,4 +1151,12 @@ void set_snapshot(http_context& ctx, routes& r) {
    }));
 }

+void unset_snapshot(http_context& ctx, routes& r) {
+    ss::get_snapshot_details.unset(r);
+    ss::take_snapshot.unset(r);
+    ss::del_snapshot.unset(r);
+    ss::true_snapshots_size.unset(r);
+    ss::scrub.unset(r);
+}
+
 }
--- a/api/storage_service.hh
+++ b/api/storage_service.hh
@@ -21,18 +21,24 @@

 #pragma once

+#include <seastar/core/sharded.hh>
 #include "api.hh"

 namespace cql_transport { class controller; }
 class thrift_controller;
+namespace db { class snapshot_ctl; }
+namespace netw { class messaging_service; }

 namespace api {

 void set_storage_service(http_context& ctx, routes& r);
+void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>& ms);
+void unset_repair(http_context& ctx, routes& r);
 void set_transport_controller(http_context& ctx, routes& r, cql_transport::controller& ctl);
 void unset_transport_controller(http_context& ctx, routes& r);
 void set_rpc_controller(http_context& ctx, routes& r, thrift_controller& ctl);
 void unset_rpc_controller(http_context& ctx, routes& r);
-void set_snapshot(http_context& ctx, routes& r);
+void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_ctl);
+void unset_snapshot(http_context& ctx, routes& r);

 }
--- a/atomic_cell.cc
+++ b/atomic_cell.cc
@@ -208,7 +208,7 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
            external_value_size = cell_view.value_size();
        }
        // Add overhead of chunk headers. The last one is a special case.
-        external_value_size += (external_value_size - 1) / data::cell::maximum_external_chunk_length * data::cell::external_chunk_overhead;
+        external_value_size += (external_value_size - 1) / data::cell::effective_external_chunk_length * data::cell::external_chunk_overhead;
        external_value_size += data::cell::external_last_chunk_overhead;
    }
    return data::cell::structure::serialized_object_size(_data.get(), ctx)
--- a/atomic_cell.hh
+++ b/atomic_cell.hh
@@ -38,6 +38,7 @@

 class abstract_type;
 class collection_type_impl;
+class atomic_cell_or_collection;

 using atomic_cell_value_view = data::value_view;
 using atomic_cell_value_mutable_view = data::value_mutable_view;
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -26,10 +26,7 @@

 namespace auth {

-const sstring& allow_all_authenticator_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthenticator";
-    return name;
-}
+constexpr std::string_view allow_all_authenticator_name("org.apache.cassandra.auth.AllowAllAuthenticator");

 // To ensure correct initialization order, we unfortunately need to use a string literal.
 static const class_registrator<
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -37,7 +37,7 @@ class migration_manager;

 namespace auth {

-const sstring& allow_all_authenticator_name();
+extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
@@ -53,7 +53,7 @@ public:
    }

    virtual std::string_view qualified_java_name() const override {
-        return allow_all_authenticator_name();
+        return allow_all_authenticator_name;
    }

    virtual bool require_authentication() const override {
--- a/auth/allow_all_authorizer.cc
+++ b/auth/allow_all_authorizer.cc
@@ -26,10 +26,7 @@

 namespace auth {

-const sstring& allow_all_authorizer_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "AllowAllAuthorizer";
-    return name;
-}
+constexpr std::string_view allow_all_authorizer_name("org.apache.cassandra.auth.AllowAllAuthorizer");

 // To ensure correct initialization order, we unfortunately need to use a string literal.
 static const class_registrator<
--- a/auth/allow_all_authorizer.hh
+++ b/auth/allow_all_authorizer.hh
@@ -34,7 +34,7 @@ class migration_manager;

 namespace auth {

-const sstring& allow_all_authorizer_name();
+extern const std::string_view allow_all_authorizer_name;

 class allow_all_authorizer final  : public authorizer {
 public:
@@ -50,7 +50,7 @@ public:
    }

    virtual std::string_view qualified_java_name() const override {
-        return allow_all_authorizer_name();
+        return allow_all_authorizer_name;
    }

    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override {
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -34,10 +34,9 @@ namespace auth {

 namespace meta {

-const sstring DEFAULT_SUPERUSER_NAME("cassandra");
-const sstring AUTH_KS("system_auth");
-const sstring USERS_CF("users");
-const sstring AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");
+constexpr std::string_view AUTH_KS("system_auth");
+constexpr std::string_view USERS_CF("users");
+constexpr std::string_view AUTH_PACKAGE_NAME("org.apache.cassandra.auth.");

 }

@@ -83,7 +82,7 @@ static future<> create_metadata_table_if_missing_impl(
    b.set_uuid(uuid);
    schema_ptr table = b.build();
    return ignore_existing([&mm, table = std::move(table)] () {
-        return mm.announce_new_column_family(table, false);
+        return mm.announce_new_column_family(table);
    });
 }

@@ -110,7 +109,12 @@ future<> wait_for_schema_agreement(::service::migration_manager& mm, const datab
 }

 const timeout_config& internal_distributed_timeout_config() noexcept {
+#ifdef DEBUG
+    // Give the much slower debug tests more headroom for completing auth queries.
+    static const auto t = 30s;
+#else
    static const auto t = 5s;
+#endif
    static const timeout_config tc{t, t, t, t, t, t, t};
    return tc;
 }
--- a/auth/common.hh
+++ b/auth/common.hh
@@ -53,10 +53,10 @@ namespace auth {

 namespace meta {

-extern const sstring DEFAULT_SUPERUSER_NAME;
-extern const sstring AUTH_KS;
-extern const sstring USERS_CF;
-extern const sstring AUTH_PACKAGE_NAME;
+constexpr std::string_view DEFAULT_SUPERUSER_NAME("cassandra");
+extern const std::string_view AUTH_KS;
+extern const std::string_view USERS_CF;
+extern const std::string_view AUTH_PACKAGE_NAME;

 }

--- a/auth/default_authorizer.cc
+++ b/auth/default_authorizer.cc
@@ -65,15 +65,14 @@ extern "C" {

 namespace auth {

-const sstring& default_authorizer_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "CassandraAuthorizer";
-    return name;
+std::string_view default_authorizer::qualified_java_name() const {
+    return "org.apache.cassandra.auth.CassandraAuthorizer";
 }

-static const sstring ROLE_NAME = "role";
-static const sstring RESOURCE_NAME = "resource";
-static const sstring PERMISSIONS_NAME = "permissions";
-static const sstring PERMISSIONS_CF = "role_permissions";
+static constexpr std::string_view ROLE_NAME = "role";
+static constexpr std::string_view RESOURCE_NAME = "resource";
+static constexpr std::string_view PERMISSIONS_NAME = "permissions";
+static constexpr std::string_view PERMISSIONS_CF = "role_permissions";

 static logging::logger alogger("default_authorizer");

--- a/auth/default_authorizer.hh
+++ b/auth/default_authorizer.hh
@@ -51,8 +51,6 @@

 namespace auth {

-const sstring& default_authorizer_name();
-
 class default_authorizer : public authorizer {
    cql3::query_processor& _qp;

@@ -71,9 +69,7 @@ public:

    virtual future<> stop() override;

-    virtual std::string_view qualified_java_name() const override {
-        return default_authorizer_name();
-    }
+    virtual std::string_view qualified_java_name() const override;

    virtual future<permission_set> authorize(const role_or_anonymous&, const resource&) const override;

--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -62,15 +62,12 @@

 namespace auth {

-const sstring& password_authenticator_name() {
-    static const sstring name = meta::AUTH_PACKAGE_NAME + "PasswordAuthenticator";
-    return name;
-}
+constexpr std::string_view password_authenticator_name("org.apache.cassandra.auth.PasswordAuthenticator");

 // name of the hash column.
-static const sstring SALTED_HASH = "salted_hash";
-static const sstring DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
-static const sstring DEFAULT_USER_PASSWORD = meta::DEFAULT_SUPERUSER_NAME;
+static constexpr std::string_view SALTED_HASH = "salted_hash";
+static constexpr std::string_view DEFAULT_USER_NAME = meta::DEFAULT_SUPERUSER_NAME;
+static const sstring DEFAULT_USER_PASSWORD = sstring(meta::DEFAULT_SUPERUSER_NAME);

 static logging::logger plogger("password_authenticator");

@@ -98,7 +95,7 @@ static bool has_salted_hash(const cql3::untyped_result_set_row& row) {

 static const sstring& update_row_query() {
    static const sstring update_row_query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            SALTED_HASH,
            meta::roles_table::role_col_name);
    return update_row_query;
@@ -198,7 +195,7 @@ db::consistency_level password_authenticator::consistency_for_user(std::string_v
 }

 std::string_view password_authenticator::qualified_java_name() const {
-    return password_authenticator_name();
+    return password_authenticator_name;
 }

 bool password_authenticator::require_authentication() const {
@@ -215,10 +212,10 @@ authentication_option_set password_authenticator::alterable_options() const {

 future<authenticated_user> password_authenticator::authenticate(
                const credentials_map& credentials) const {
-    if (!credentials.count(USERNAME_KEY)) {
+    if (!credentials.contains(USERNAME_KEY)) {
        throw exceptions::authentication_exception(format("Required key '{}' is missing", USERNAME_KEY));
    }
-    if (!credentials.count(PASSWORD_KEY)) {
+    if (!credentials.contains(PASSWORD_KEY)) {
        throw exceptions::authentication_exception(format("Required key '{}' is missing", PASSWORD_KEY));
    }

@@ -233,7 +230,7 @@ future<authenticated_user> password_authenticator::authenticate(
    return futurize_invoke([this, username, password] {
        static const sstring query = format("SELECT {} FROM {} WHERE {} = ?",
                SALTED_HASH,
-                meta::roles_table::qualified_name(),
+                meta::roles_table::qualified_name,
                meta::roles_table::role_col_name);

        return _qp.execute_internal(
@@ -283,7 +280,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
    }

    static const sstring query = format("UPDATE {} SET {} = ? WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            SALTED_HASH,
            meta::roles_table::role_col_name);

@@ -297,7 +294,7 @@ future<> password_authenticator::alter(std::string_view role_name, const authent
 future<> password_authenticator::drop(std::string_view name) const {
    static const sstring query = format("DELETE {} FROM {} WHERE {} = ?",
            SALTED_HASH,
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return _qp.execute_internal(
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -52,7 +52,7 @@ class migration_manager;

 namespace auth {

-const sstring& password_authenticator_name();
+extern const std::string_view password_authenticator_name;

 class password_authenticator : public authenticator {
    cql3::query_processor& _qp;
--- a/auth/roles-metadata.cc
+++ b/auth/roles-metadata.cc
@@ -45,16 +45,13 @@ std::string_view creation_query() {
            "  member_of set<text>,"
            "  salted_hash text"
            ")",
-            qualified_name(),
+            qualified_name,
            role_col_name);

    return instance;
 }

-std::string_view qualified_name() noexcept {
-    static const sstring instance = AUTH_KS + "." + sstring(name);
-    return instance;
-}
+constexpr std::string_view qualified_name("system_auth.roles");

 }

@@ -64,7 +61,7 @@ future<bool> default_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return do_with(std::move(p), [&qp](const auto& p) {
@@ -97,7 +94,7 @@ future<bool> default_role_row_satisfies(
 future<bool> any_nondefault_role_row_satisfies(
        cql3::query_processor& qp,
        std::function<bool(const cql3::untyped_result_set_row&)> p) {
-    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name());
+    static const sstring query = format("SELECT * FROM {}", meta::roles_table::qualified_name);

    return do_with(std::move(p), [&qp](const auto& p) {
        return qp.execute_internal(
--- a/auth/roles-metadata.hh
+++ b/auth/roles-metadata.hh
@@ -43,7 +43,7 @@ std::string_view creation_query();

 constexpr std::string_view name{"roles", 5};

-std::string_view qualified_name() noexcept;
+extern const std::string_view qualified_name;

 constexpr std::string_view role_col_name{"role", 4};

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -31,9 +31,7 @@
 #include "auth/allow_all_authenticator.hh"
 #include "auth/allow_all_authorizer.hh"
 #include "auth/common.hh"
-#include "auth/password_authenticator.hh"
 #include "auth/role_or_anonymous.hh"
-#include "auth/standard_role_manager.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
 #include "db/consistency_level_type.hh"
@@ -125,18 +123,7 @@ service::service(
            , _authorizer(std::move(z))
            , _authenticator(std::move(a))
            , _role_manager(std::move(r))
-            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer)) {
-    // The password authenticator requires that the `standard_role_manager` is running so that the roles metadata table
-    // it manages is created and updated. This cross-module dependency is rather gross, but we have to maintain it for
-    // the sake of compatibility with Apache Cassandra and its choice of auth. schema.
-    if ((_authenticator->qualified_java_name() == password_authenticator_name())
-            && (_role_manager->qualified_java_name() != standard_role_manager_name())) {
-        throw incompatible_module_combination(
-                format("The {} authenticator must be loaded alongside the {} role-manager.",
-                        password_authenticator_name(),
-                        standard_role_manager_name()));
-    }
-}
+            , _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer)) {}

 service::service(
        permissions_cache_config c,
@@ -167,7 +154,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp);
    }

    return make_ready_future<>();
@@ -376,25 +363,28 @@ future<permission_set> get_permissions(const service& ser, const authenticated_u
 }

 bool is_enforcing(const service& ser)  {
-    const bool enforcing_authorizer = ser.underlying_authorizer().qualified_java_name() != allow_all_authorizer_name();
+    const bool enforcing_authorizer = ser.underlying_authorizer().qualified_java_name() != allow_all_authorizer_name;

    const bool enforcing_authenticator = ser.underlying_authenticator().qualified_java_name()
-            != allow_all_authenticator_name();
+            != allow_all_authenticator_name;

    return enforcing_authorizer || enforcing_authenticator;
 }

-bool is_protected(const service& ser, const resource& r) noexcept {
-    return ser.underlying_role_manager().protected_resources().count(r)
-            || ser.underlying_authenticator().protected_resources().count(r)
-            || ser.underlying_authorizer().protected_resources().count(r);
+bool is_protected(const service& ser, command_desc cmd) noexcept {
+    if (cmd.type_ == command_desc::type::ALTER_WITH_OPTS) {
+        return false; // Table attributes are OK to modify; see #7057.
+    }
+    return ser.underlying_role_manager().protected_resources().contains(cmd.resource)
+            || ser.underlying_authenticator().protected_resources().contains(cmd.resource)
+            || ser.underlying_authorizer().protected_resources().contains(cmd.resource);
 }

 static void validate_authentication_options_are_supported(
        const authentication_options& options,
        const authentication_option_set& supported) {
    const auto check = [&supported](authentication_option k) {
-        if (supported.count(k) == 0) {
+        if (!supported.contains(k)) {
            throw unsupported_authentication_option(k);
        }
    };
@@ -474,7 +464,7 @@ future<bool> has_role(const service& ser, std::string_view grantee, std::string_
    return when_all_succeed(
            validate_role_exists(ser, name),
            ser.get_roles(grantee)).then_unpack([name](role_set all_roles) {
-        return make_ready_future<bool>(all_roles.count(sstring(name)) != 0);
+        return make_ready_future<bool>(all_roles.contains(sstring(name)));
    });
 }
 future<bool> has_role(const service& ser, const authenticated_user& u, std::string_view name) {
@@ -531,14 +521,9 @@ future<std::vector<permission_details>> list_filtered_permissions(
                    ? auth::expand_resource_family(r)
                    : auth::resource_set{r};

-            all_details.erase(
-                    std::remove_if(
-                            all_details.begin(),
-                            all_details.end(),
-                            [&resources](const permission_details& pd) {
-                        return resources.count(pd.resource) == 0;
-                    }),
-                    all_details.end());
+            std::erase_if(all_details, [&resources](const permission_details& pd) {
+                return !resources.contains(pd.resource);
+            });
        }

        std::transform(
@@ -551,11 +536,9 @@ future<std::vector<permission_details>> list_filtered_permissions(
                });

        // Eliminate rows with an empty permission set.
-        all_details.erase(
-                std::remove_if(all_details.begin(), all_details.end(), [](const permission_details& pd) {
-                    return pd.permissions.mask() == 0;
-                }),
-                all_details.end());
+        std::erase_if(all_details, [](const permission_details& pd) {
+            return pd.permissions.mask() == 0;
+        });

        if (!role_name) {
            return make_ready_future<std::vector<permission_details>>(std::move(all_details));
@@ -567,14 +550,9 @@ future<std::vector<permission_details>> list_filtered_permissions(

        return do_with(std::move(all_details), [&ser, role_name](auto& all_details) {
            return ser.get_roles(*role_name).then([&all_details](role_set all_roles) {
-                all_details.erase(
-                        std::remove_if(
-                                all_details.begin(),
-                                all_details.end(),
-                                [&all_roles](const permission_details& pd) {
-                            return all_roles.count(pd.role_name) == 0;
-                        }),
-                        all_details.end());
+                std::erase_if(all_details, [&all_roles](const permission_details& pd) {
+                    return !all_roles.contains(pd.role_name);
+                });

                return make_ready_future<std::vector<permission_details>>(std::move(all_details));
            });
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -181,10 +181,21 @@ future<permission_set> get_permissions(const service&, const authenticated_user&
 ///
 bool is_enforcing(const service&);

+/// A description of a CQL command from which auth::service can tell whether or not this command could endanger
+/// internal data on which auth::service depends.
+struct command_desc {
+    auth::permission permission; ///< Nature of the command's alteration.
+    const ::auth::resource& resource; ///< Resource impacted by this command.
+    enum class type {
+        ALTER_WITH_OPTS, ///< Command is ALTER ... WITH ...
+        OTHER
+    } type_ = type::OTHER;
+};
+
 ///
 /// Protected resources cannot be modified even if the performer has permissions to do so.
 ///
-bool is_protected(const service&, const resource&) noexcept;
+bool is_protected(const service&, command_desc) noexcept;

 ///
 /// Create a role with optional authentication information.
--- a/auth/standard_role_manager.cc
+++ b/auth/standard_role_manager.cc
@@ -49,11 +49,7 @@ namespace meta {
 namespace role_members_table {

 constexpr std::string_view name{"role_members" , 12};
-
-static std::string_view qualified_name() noexcept {
-    static const sstring instance = AUTH_KS + "." + sstring(name);
-    return instance;
-}
+constexpr std::string_view qualified_name("system_auth.role_members");

 }

@@ -84,7 +80,7 @@ static db::consistency_level consistency_for_role(std::string_view role_name) no

 static future<std::optional<record>> find_record(cql3::query_processor& qp, std::string_view role_name) {
    static const sstring query = format("SELECT * FROM {} WHERE {} = ?",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return qp.execute_internal(
@@ -124,13 +120,8 @@ static bool has_can_login(const cql3::untyped_result_set_row& row) {
    return row.has("can_login") && !(boolean_type->deserialize(row.get_blob("can_login")).is_null());
 }

-std::string_view standard_role_manager_name() noexcept {
-    static const sstring instance = meta::AUTH_PACKAGE_NAME + "CassandraRoleManager";
-    return instance;
-}
-
 std::string_view standard_role_manager::qualified_java_name() const noexcept {
-    return standard_role_manager_name();
+    return "org.apache.cassandra.auth.CassandraRoleManager";
 }

 const resource_set& standard_role_manager::protected_resources() const {
@@ -148,7 +139,7 @@ future<> standard_role_manager::create_metadata_tables_if_missing() const {
            "  member text,"
            "  PRIMARY KEY (role, member)"
            ")",
-            meta::role_members_table::qualified_name());
+            meta::role_members_table::qualified_name);


    return when_all_succeed(
@@ -168,7 +159,7 @@ future<> standard_role_manager::create_default_role_if_missing() const {
    return default_role_row_satisfies(_qp, &has_can_login).then([this](bool exists) {
        if (!exists) {
            static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, true, true)",
-                    meta::roles_table::qualified_name(),
+                    meta::roles_table::qualified_name,
                    meta::roles_table::role_col_name);

            return _qp.execute_internal(
@@ -256,7 +247,7 @@ future<> standard_role_manager::stop() {

 future<> standard_role_manager::create_or_replace(std::string_view role_name, const role_config& c) const {
    static const sstring query = format("INSERT INTO {} ({}, is_superuser, can_login) VALUES (?, ?, ?)",
-            meta::roles_table::qualified_name(),
+            meta::roles_table::qualified_name,
            meta::roles_table::role_col_name);

    return _qp.execute_internal(
@@ -301,7 +292,7 @@ standard_role_manager::alter(std::string_view role_name, const role_config_updat

        return _qp.execute_internal(
                format("UPDATE {} SET {} WHERE {} = ?",
-                        meta::roles_table::qualified_name(),
+                        meta::roles_table::qualified_name,
                        build_column_assignments(u),
                        meta::roles_table::role_col_name),
                consistency_for_role(role_name),
@@ -319,7 +310,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
        // First, revoke this role from all roles that are members of it.
        const auto revoke_from_members = [this, role_name] {
            static const sstring query = format("SELECT member FROM {} WHERE role = ?",
-                    meta::role_members_table::qualified_name());
+                    meta::role_members_table::qualified_name);

            return _qp.execute_internal(
                    query,
@@ -357,7 +348,7 @@ future<> standard_role_manager::drop(std::string_view role_name) const {
        // Finally, delete the role itself.
        auto delete_role = [this, role_name] {
            static const sstring query = format("DELETE FROM {} WHERE {} = ?",
-                    meta::roles_table::qualified_name(),
+                    meta::roles_table::qualified_name,
                    meta::roles_table::role_col_name);

            return _qp.execute_internal(
@@ -383,7 +374,7 @@ standard_role_manager::modify_membership(
    const auto modify_roles = [this, role_name, grantee_name, ch] {
        const auto query = format(
                "UPDATE {} SET member_of = member_of {} ? WHERE {} = ?",
-                meta::roles_table::qualified_name(),
+                meta::roles_table::qualified_name,
                (ch == membership_change::add ? '+' : '-'),
                meta::roles_table::role_col_name);

@@ -399,7 +390,7 @@ standard_role_manager::modify_membership(
            case membership_change::add:
                return _qp.execute_internal(
                        format("INSERT INTO {} (role, member) VALUES (?, ?)",
-                                meta::role_members_table::qualified_name()),
+                                meta::role_members_table::qualified_name),
                        consistency_for_role(role_name),
                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();
@@ -407,7 +398,7 @@ standard_role_manager::modify_membership(
            case membership_change::remove:
                return _qp.execute_internal(
                        format("DELETE FROM {} WHERE role = ? AND member = ?",
-                                meta::role_members_table::qualified_name()),
+                                meta::role_members_table::qualified_name),
                        consistency_for_role(role_name),
                        internal_distributed_timeout_config(),
                        {sstring(role_name), sstring(grantee_name)}).discard_result();
@@ -416,7 +407,7 @@ standard_role_manager::modify_membership(
        return make_ready_future<>();
    };

-    return when_all_succeed(modify_roles(), modify_role_members()).discard_result();
+    return when_all_succeed(modify_roles(), modify_role_members).discard_result();
 }

 future<>
@@ -425,7 +416,7 @@ standard_role_manager::grant(std::string_view grantee_name, std::string_view rol
        return this->query_granted(
                grantee_name,
                recursive_role_query::yes).then([role_name, grantee_name](role_set roles) {
-            if (roles.count(sstring(role_name)) != 0) {
+            if (roles.contains(sstring(role_name))) {
                throw role_already_included(grantee_name, role_name);
            }

@@ -437,7 +428,7 @@ standard_role_manager::grant(std::string_view grantee_name, std::string_view rol
        return this->query_granted(
                role_name,
                recursive_role_query::yes).then([role_name, grantee_name](role_set roles) {
-            if (roles.count(sstring(grantee_name)) != 0) {
+            if (roles.contains(sstring(grantee_name))) {
                throw role_already_included(role_name, grantee_name);
            }

@@ -460,7 +451,7 @@ standard_role_manager::revoke(std::string_view revokee_name, std::string_view ro
        return this->query_granted(
                revokee_name,
                recursive_role_query::no).then([revokee_name, role_name](role_set roles) {
-            if (roles.count(sstring(role_name)) == 0) {
+            if (!roles.contains(sstring(role_name))) {
                throw revoke_ungranted_role(revokee_name, role_name);
            }

@@ -504,7 +495,7 @@ future<role_set> standard_role_manager::query_granted(std::string_view grantee_n
 future<role_set> standard_role_manager::query_all() const {
    static const sstring query = format("SELECT {} FROM {}",
            meta::roles_table::role_col_name,
-            meta::roles_table::qualified_name());
+            meta::roles_table::qualified_name);

    // To avoid many copies of a view.
    static const auto role_col_name_string = sstring(meta::roles_table::role_col_name);
--- a/auth/standard_role_manager.hh
+++ b/auth/standard_role_manager.hh
@@ -42,8 +42,6 @@ class migration_manager;

 namespace auth {

-std::string_view standard_role_manager_name() noexcept;
-
 class standard_role_manager final : public role_manager {
    cql3::query_processor& _qp;
    ::service::migration_manager& _migration_manager;
--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -101,7 +101,7 @@ public:
    virtual future<authenticated_user> authenticate(const credentials_map& credentials) const override {
        auto i = credentials.find(authenticator::USERNAME_KEY);
        if ((i == credentials.end() || i->second.empty())
-                && (!credentials.count(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
+                && (!credentials.contains(PASSWORD_KEY) || credentials.at(PASSWORD_KEY).empty())) {
            // return anon user
            return make_ready_future<authenticated_user>(anonymous_user());
        }
--- a/bytes.cc
+++ b/bytes.cc
@@ -100,3 +100,7 @@ std::ostream& operator<<(std::ostream& os, const bytes_view& b) {
 }

 }
+
+std::ostream& operator<<(std::ostream& os, const fmt_hex& b) {
+    return os << to_hex(b.v);
+}
--- a/bytes.hh
+++ b/bytes.hh
@@ -28,6 +28,7 @@
 #include <iosfwd>
 #include <functional>
 #include "utils/mutable_view.hh"
+#include <xxhash.h>

 using bytes = basic_sstring<int8_t, uint32_t, 31, false>;
 using bytes_view = std::basic_string_view<int8_t>;
@@ -35,20 +36,24 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::optional<bytes>;
 using sstring_view = std::string_view;

+inline bytes to_bytes(bytes&& b) {
+    return std::move(b);
+}
+
 inline sstring_view to_sstring_view(bytes_view view) {
    return {reinterpret_cast<const char*>(view.data()), view.size()};
 }

-namespace std {
+inline bytes_view to_bytes_view(sstring_view view) {
+    return {reinterpret_cast<const int8_t*>(view.data()), view.size()};
+}

-template <>
-struct hash<bytes_view> {
-    size_t operator()(bytes_view v) const {
-        return hash<sstring_view>()({reinterpret_cast<const char*>(v.begin()), v.size()});
-    }
+struct fmt_hex {
+    bytes_view& v;
+    fmt_hex(bytes_view& v) noexcept : v(v) {}
 };

-}
+std::ostream& operator<<(std::ostream& os, const fmt_hex& hex);

 bytes from_hex(sstring_view s);
 sstring to_hex(bytes_view b);
@@ -83,10 +88,37 @@ struct appending_hash<bytes_view> {
    }
 };

+struct bytes_view_hasher : public hasher {
+    XXH64_state_t _state;
+    bytes_view_hasher(uint64_t seed = 0) noexcept {
+        XXH64_reset(&_state, seed);
+    }
+    void update(const char* ptr, size_t length) noexcept {
+        XXH64_update(&_state, ptr, length);
+    }
+    size_t finalize() {
+        return static_cast<size_t>(XXH64_digest(&_state));
+    }
+};
+
+namespace std {
+template <>
+struct hash<bytes_view> {
+    size_t operator()(bytes_view v) const {
+        bytes_view_hasher h;
+        appending_hash<bytes_view>{}(h, v);
+        return h.finalize();
+    }
+};
+} // namespace std
+
 inline int32_t compare_unsigned(bytes_view v1, bytes_view v2) {
-    auto n = memcmp(v1.begin(), v2.begin(), std::min(v1.size(), v2.size()));
+  auto size = std::min(v1.size(), v2.size());
+  if (size) {
+    auto n = memcmp(v1.begin(), v2.begin(), size);
    if (n) {
        return n;
    }
+  }
    return (int32_t) (v1.size() - v2.size());
 }
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -39,7 +39,7 @@ public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
    using fragment_type = bytes_view;
-    static constexpr size_type max_chunk_size() { return 128 * 1024; }
+    static constexpr size_type max_chunk_size() { return max_alloc_size() - sizeof(chunk); }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -59,13 +59,21 @@ private:
        void operator delete(void* ptr) { free(ptr); }
    };
    static constexpr size_type default_chunk_size{512};
+    static constexpr size_type max_alloc_size() { return 128 * 1024; }
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
    size_type _initial_chunk_size = default_chunk_size;
 public:
-    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
+    class fragment_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = bytes_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = bytes_view*;
+        using reference = bytes_view&;
+    private:
        chunk* _current = nullptr;
    public:
        fragment_iterator() = default;
@@ -125,16 +133,15 @@ private:
        return _current->size - _current->offset;
    }
    // Figure out next chunk size.
-    //   - must be enough for data_size
+    //   - must be enough for data_size + sizeof(chunk)
    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
-    //   - do not exceed max_chunk_size
+    //   - should not exceed max_alloc_size, unless data_size requires so
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
                : _initial_chunk_size;
-        next_size = std::min(next_size, max_chunk_size());
-        // FIXME: check for overflow?
+        next_size = std::min(next_size, max_alloc_size());
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
    }
    // Makes room for a contiguous region of given size.
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -28,7 +28,6 @@
 #include "partition_version.hh"
 #include "utils/logalloc.hh"
 #include "query-request.hh"
-#include "partition_snapshot_reader.hh"
 #include "partition_snapshot_row_cursor.hh"
 #include "read_context.hh"
 #include "flat_mutation_reader.hh"
@@ -134,7 +133,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    void maybe_add_to_cache(const static_row& sr);
    void maybe_set_static_row_continuous();
    void finish_reader() {
-        push_mutation_fragment(partition_end());
+        push_mutation_fragment(*_schema, _permit, partition_end());
        _end_of_stream = true;
        _state = state::end_of_stream;
    }
@@ -146,7 +145,7 @@ public:
                               lw_shared_ptr<read_context> ctx,
                               partition_snapshot_ptr snp,
                               row_cache& cache)
-        : flat_mutation_reader::impl(std::move(s))
+        : flat_mutation_reader::impl(std::move(s), ctx->permit())
        , _snp(std::move(snp))
        , _position_cmp(*_schema)
        , _ck_ranges(std::move(crr))
@@ -158,8 +157,8 @@ public:
        , _read_context(std::move(ctx))
        , _next_row(*_schema, *_snp)
    {
-        clogger.trace("csm {}: table={}.{}", this, _schema->ks_name(), _schema->cf_name());
-        push_mutation_fragment(partition_start(std::move(dk), _snp->partition_tombstone()));
+        clogger.trace("csm {}: table={}.{}", fmt::ptr(this), _schema->ks_name(), _schema->cf_name());
+        push_mutation_fragment(*_schema, _permit, partition_start(std::move(dk), _snp->partition_tombstone()));
    }
    cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
    cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
@@ -188,7 +187,7 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
            return _snp->static_row(_read_context->digest_requested());
        });
        if (!sr.empty()) {
-            push_mutation_fragment(mutation_fragment(std::move(sr)));
+            push_mutation_fragment(mutation_fragment(*_schema, _permit, std::move(sr)));
        }
        return make_ready_future<>();
    } else {
@@ -232,7 +231,7 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
            return after_static_row();
        }
    }
-    clogger.trace("csm {}: fill_buffer(), range={}, lb={}", this, *_ck_ranges_curr, _lower_bound);
+    clogger.trace("csm {}: fill_buffer(), range={}, lb={}", fmt::ptr(this), *_ck_ranges_curr, _lower_bound);
    return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this, timeout] {
        return do_fill_buffer(timeout);
    });
@@ -265,6 +264,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
+        if (!_read_context->partition_exists()) {
+            return read_from_underlying(timeout);
+        }
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -277,7 +279,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
    // assert(_state == state::reading_from_cache)
    return _lsa_manager.run_in_read_section([this] {
        auto next_valid = _next_row.iterators_valid();
-        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}", this, _lower_bound,
+        clogger.trace("csm {}: reading_from_cache, range=[{}, {}), next={}, valid={}", fmt::ptr(this), _lower_bound,
            _upper_bound, _next_row.position(), next_valid);
        // We assume that if there was eviction, and thus the range may
        // no longer be continuous, the cursor was invalidated.
@@ -291,7 +293,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
            }
        }
        _next_row.maybe_refresh();
-        clogger.trace("csm {}: next={}, cont={}", this, _next_row.position(), _next_row.continuous());
+        clogger.trace("csm {}: next={}, cont={}", fmt::ptr(this), _next_row.position(), _next_row.continuous());
        _lower_bound_changed = false;
        while (_state == state::reading_from_cache) {
            copy_from_cache_to_buffer();
@@ -357,7 +359,7 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
                                    e.release();
                                    auto next = std::next(it);
                                    it->set_continuous(next->continuous());
-                                    clogger.trace("csm {}: inserted dummy at {}, cont={}", this, it->position(), it->continuous());
+                                    clogger.trace("csm {}: inserted dummy at {}, cont={}", fmt::ptr(this), it->position(), it->continuous());
                                }
                            });
                        } else if (ensure_population_lower_bound()) {
@@ -368,11 +370,11 @@ future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::tim
                                auto insert_result = rows.insert_check(_next_row.get_iterator_in_latest_version(), *e, less);
                                auto inserted = insert_result.second;
                                if (inserted) {
-                                    clogger.trace("csm {}: inserted dummy at {}", this, _upper_bound);
+                                    clogger.trace("csm {}: inserted dummy at {}", fmt::ptr(this), _upper_bound);
                                    _snp->tracker()->insert(*e);
                                    e.release();
                                } else {
-                                    clogger.trace("csm {}: mark {} as continuous", this, insert_result.first->position());
+                                    clogger.trace("csm {}: mark {} as continuous", fmt::ptr(this), insert_result.first->position());
                                    insert_result.first->set_continuous(true);
                                }
                            });
@@ -413,7 +415,7 @@ bool cache_flat_mutation_reader::ensure_population_lower_bound() {
            auto insert_result = rows.insert_check(rows.end(), *e, less);
            auto inserted = insert_result.second;
            if (inserted) {
-                clogger.trace("csm {}: inserted lower bound dummy at {}", this, e->position());
+                clogger.trace("csm {}: inserted lower bound dummy at {}", fmt::ptr(this), e->position());
                _snp->tracker()->insert(*e);
                e.release();
            }
@@ -453,7 +455,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
        _read_context->cache().on_mispopulate();
        return;
    }
-    clogger.trace("csm {}: populate({})", this, clustering_row::printer(*_schema, cr));
+    clogger.trace("csm {}: populate({})", fmt::ptr(this), clustering_row::printer(*_schema, cr));
    _lsa_manager.run_in_update_section_with_allocator([this, &cr] {
        mutation_partition& mp = _snp->version()->partition();
        rows_entry::compare less(*_schema);
@@ -462,7 +464,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
            cr.cells().prepare_hash(*_schema, column_kind::regular_column);
        }
        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
-            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.tomb(), cr.marker(), cr.cells()));
+            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.as_deletable_row()));
        new_entry->set_continuous(false);
        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version()
                                              : mp.clustered_rows().lower_bound(cr.key(), less);
@@ -475,7 +477,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {

        rows_entry& e = *it;
        if (ensure_population_lower_bound()) {
-            clogger.trace("csm {}: set_continuous({})", this, e.position());
+            clogger.trace("csm {}: set_continuous({})", fmt::ptr(this), e.position());
            e.set_continuous(true);
        } else {
            _read_context->cache().on_mispopulate();
@@ -494,14 +496,14 @@ bool cache_flat_mutation_reader::after_current_range(position_in_partition_view

 inline
 void cache_flat_mutation_reader::start_reading_from_underlying() {
-    clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", this, _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound);
+    clogger.trace("csm {}: start_reading_from_underlying(), range=[{}, {})", fmt::ptr(this), _lower_bound, _next_row_in_range ? _next_row.position() : _upper_bound);
    _state = state::move_to_underlying;
    _next_row.touch();
 }

 inline
 void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
-    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", this, _next_row.position(), _next_row_in_range);
+    clogger.trace("csm {}: copy_from_cache, next={}, next_row_in_range={}", fmt::ptr(this), _next_row.position(), _next_row_in_range);
    _next_row.touch();
    position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
    for (auto &&rts : _snp->range_tombstones(_lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
@@ -509,7 +511,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
        // This guarantees that rts starts after any emitted clustering_row
        // and not before any emitted range tombstone.
        if (!less(_lower_bound, rts.position())) {
-            rts.set_start(*_schema, _lower_bound);
+            rts.set_start(_lower_bound);
        } else {
            _lower_bound = position_in_partition(rts.position());
            _lower_bound_changed = true;
@@ -517,7 +519,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
                return;
            }
        }
-        push_mutation_fragment(std::move(rts));
+        push_mutation_fragment(*_schema, _permit, std::move(rts));
    }
    // We add the row to the buffer even when it's full.
    // This simplifies the code. For more info see #3139.
@@ -533,7 +535,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
 inline
 void cache_flat_mutation_reader::move_to_end() {
    finish_reader();
-    clogger.trace("csm {}: eos", this);
+    clogger.trace("csm {}: eos", fmt::ptr(this));
 }

 inline
@@ -558,7 +560,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
    _ck_ranges_curr = next_it;
    auto adjacent = _next_row.advance_to(_lower_bound);
    _next_row_in_range = !after_current_range(_next_row.position());
-    clogger.trace("csm {}: move_to_range(), range={}, lb={}, ub={}, next={}", this, *_ck_ranges_curr, _lower_bound, _upper_bound, _next_row.position());
+    clogger.trace("csm {}: move_to_range(), range={}, lb={}, ub={}, next={}", fmt::ptr(this), *_ck_ranges_curr, _lower_bound, _upper_bound, _next_row.position());
    if (!adjacent && !_next_row.continuous()) {
        // FIXME: We don't insert a dummy for singular range to avoid allocating 3 entries
        // for a hit (before, at and after). If we supported the concept of an incomplete row,
@@ -568,7 +570,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
            // Insert dummy for lower bound
            if (can_populate()) {
                // FIXME: _lower_bound could be adjacent to the previous row, in which case we could skip this
-                clogger.trace("csm {}: insert dummy at {}", this, _lower_bound);
+                clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
                auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
                    auto& rows = _snp->version()->partition().clustered_rows();
                    auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
@@ -587,7 +589,7 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
 // _next_row must be inside the range.
 inline
 void cache_flat_mutation_reader::move_to_next_entry() {
-    clogger.trace("csm {}: move_to_next_entry(), curr={}", this, _next_row.position());
+    clogger.trace("csm {}: move_to_next_entry(), curr={}", fmt::ptr(this), _next_row.position());
    if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
        move_to_next_range();
    } else {
@@ -596,7 +598,7 @@ void cache_flat_mutation_reader::move_to_next_entry() {
            return;
        }
        _next_row_in_range = !after_current_range(_next_row.position());
-        clogger.trace("csm {}: next={}, cont={}, in_range={}", this, _next_row.position(), _next_row.continuous(), _next_row_in_range);
+        clogger.trace("csm {}: next={}, cont={}, in_range={}", fmt::ptr(this), _next_row.position(), _next_row.continuous(), _next_row_in_range);
        if (!_next_row.continuous()) {
            start_reading_from_underlying();
        }
@@ -605,7 +607,7 @@ void cache_flat_mutation_reader::move_to_next_entry() {

 inline
 void cache_flat_mutation_reader::add_to_buffer(mutation_fragment&& mf) {
-    clogger.trace("csm {}: add_to_buffer({})", this, mutation_fragment::printer(*_schema, mf));
+    clogger.trace("csm {}: add_to_buffer({})", fmt::ptr(this), mutation_fragment::printer(*_schema, mf));
    if (mf.is_clustering_row()) {
        add_clustering_row_to_buffer(std::move(mf));
    } else {
@@ -618,7 +620,7 @@ inline
 void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_cursor& row) {
    if (!row.dummy()) {
        _read_context->cache().on_row_hit();
-        add_clustering_row_to_buffer(row.row(_read_context->digest_requested()));
+        add_clustering_row_to_buffer(mutation_fragment(*_schema, _permit, row.row(_read_context->digest_requested())));
    }
 }

@@ -627,7 +629,7 @@ void cache_flat_mutation_reader::add_to_buffer(const partition_snapshot_row_curs
 //   (2) If _lower_bound > mf.position(), mf was emitted
 inline
 void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&& mf) {
-    clogger.trace("csm {}: add_clustering_row_to_buffer({})", this, mutation_fragment::printer(*_schema, mf));
+    clogger.trace("csm {}: add_clustering_row_to_buffer({})", fmt::ptr(this), mutation_fragment::printer(*_schema, mf));
    auto& row = mf.as_clustering_row();
    auto new_lower_bound = position_in_partition::after_key(row.key());
    push_mutation_fragment(std::move(mf));
@@ -637,7 +639,7 @@ void cache_flat_mutation_reader::add_clustering_row_to_buffer(mutation_fragment&

 inline
 void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
-    clogger.trace("csm {}: add_to_buffer({})", this, rt);
+    clogger.trace("csm {}: add_to_buffer({})", fmt::ptr(this), rt);
    // This guarantees that rt starts after any emitted clustering_row
    // and not before any emitted range tombstone.
    position_in_partition::less_compare less(*_schema);
@@ -645,18 +647,18 @@ void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
        return;
    }
    if (!less(_lower_bound, rt.position())) {
-        rt.set_start(*_schema, _lower_bound);
+        rt.set_start(_lower_bound);
    } else {
        _lower_bound = position_in_partition(rt.position());
        _lower_bound_changed = true;
    }
-    push_mutation_fragment(std::move(rt));
+    push_mutation_fragment(*_schema, _permit, std::move(rt));
 }

 inline
 void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
    if (can_populate()) {
-        clogger.trace("csm {}: maybe_add_to_cache({})", this, rt);
+        clogger.trace("csm {}: maybe_add_to_cache({})", fmt::ptr(this), rt);
        _lsa_manager.run_in_update_section_with_allocator([&] {
            _snp->version()->partition().row_tombstones().apply_monotonically(*_schema, rt);
        });
@@ -668,7 +670,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const range_tombstone& rt) {
 inline
 void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
    if (can_populate()) {
-        clogger.trace("csm {}: populate({})", this, static_row::printer(*_schema, sr));
+        clogger.trace("csm {}: populate({})", fmt::ptr(this), static_row::printer(*_schema, sr));
        _read_context->cache().on_static_row_insert();
        _lsa_manager.run_in_update_section_with_allocator([&] {
            if (_read_context->digest_requested()) {
@@ -684,7 +686,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const static_row& sr) {
 inline
 void cache_flat_mutation_reader::maybe_set_static_row_continuous() {
    if (can_populate()) {
-        clogger.trace("csm {}: set static row continuous", this);
+        clogger.trace("csm {}: set static row continuous", fmt::ptr(this));
        _snp->version()->partition().set_static_row_continuous(true);
    } else {
        _read_context->cache().on_mispopulate();
--- a/caching_options.hh
+++ b/caching_options.hh
@@ -23,7 +23,7 @@
 #include <seastar/core/sstring.hh>
 #include <boost/lexical_cast.hpp>
 #include "exceptions/exceptions.hh"
-#include "json.hh"
+#include "utils/rjson.hh"
 #include "seastarx.hh"

 class schema;
@@ -76,7 +76,7 @@ public:
    }

    sstring to_sstring() const {
-        return json::to_json(to_map());
+        return rjson::print(rjson::from_string_map(to_map()));
    }

    static caching_options get_disabled_caching_options() {
@@ -97,13 +97,14 @@ public:
            } else if (p.first == "enabled") {
                e = p.second == "true";
            } else {
-                throw exceptions::configuration_exception("Invalid caching option: " + p.first);
+                throw exceptions::configuration_exception(format("Invalid caching option: {}", p.first));
            }
        }
        return caching_options(k, r, e);
    }
+
    static caching_options from_sstring(const sstring& str) {
-        return from_map(json::to_map(str));
+        return from_map(rjson::parse_to_map<std::map<sstring, sstring>>(str));
    }

    bool operator==(const caching_options& other) const {
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -33,9 +33,13 @@ template<typename T>
 struct cartesian_product {
    const std::vector<std::vector<T>>& _vec_of_vecs;
 public:
-    class iterator : public std::iterator<std::forward_iterator_tag, std::vector<T>> {
+    class iterator {
    public:
+        using iterator_category = std::forward_iterator_tag;
        using value_type = std::vector<T>;
+        using difference_type = std::ptrdiff_t;
+        using pointer = std::vector<T>*;
+        using reference = std::vector<T>&;
    private:
        size_t _pos;
        const std::vector<std::vector<T>>* _vec_of_vecs;
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -20,10 +20,16 @@

 #pragma once

+#include <map>
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
 #include "serializer.hh"
 #include "db/extensions.hh"
 #include "cdc/cdc_options.hh"
 #include "schema.hh"
+#include "serializer_impl.hh"

 namespace cdc {

@@ -33,6 +39,7 @@ public:
    static constexpr auto NAME = "cdc";

    cdc_extension() = default;
+    cdc_extension(const options& opts) : _cdc_options(opts) {}
    explicit cdc_extension(std::map<sstring, sstring> tags) : _cdc_options(std::move(tags)) {}
    explicit cdc_extension(const bytes& b) : _cdc_options(cdc_extension::deserialize(b)) {}
    explicit cdc_extension(const sstring& s) {
--- a/cdc/cdc_options.hh
+++ b/cdc/cdc_options.hh
@@ -27,10 +27,32 @@

 namespace cdc {

+enum class delta_mode : uint8_t {
+    keys,
+    full,
+};
+
+/**
+ * (for now only pre-) image collection mode.
+ * Stating how much info to record.
+ * off == none
+ * on == changed columns
+ * full == all (changed and unmodified columns)
+ */
+enum class image_mode : uint8_t {
+    off, 
+    on,
+    full,
+};
+
+std::ostream& operator<<(std::ostream& os, delta_mode);
+std::ostream& operator<<(std::ostream& os, image_mode);
+
 class options final {
    bool _enabled = false;
-    bool _preimage = false;
+    image_mode _preimage = image_mode::off;
    bool _postimage = false;
+    delta_mode _delta_mode = delta_mode::full;
    int _ttl = 86400; // 24h in seconds
 public:
    options() = default;
@@ -40,10 +62,19 @@ public:
    sstring to_sstring() const;

    bool enabled() const { return _enabled; }
-    bool preimage() const { return _preimage; }
+    bool preimage() const { return _preimage != image_mode::off; }
+    bool full_preimage() const { return _preimage == image_mode::full; }
    bool postimage() const { return _postimage; }
+    delta_mode get_delta_mode() const { return _delta_mode; }
+    void set_delta_mode(delta_mode m) { _delta_mode = m; }
    int ttl() const { return _ttl; }

+    void enabled(bool b) { _enabled = b; }
+    void preimage(bool b) { preimage(b ? image_mode::on : image_mode::off); }
+    void preimage(image_mode m) { _preimage = m; }
+    void postimage(bool b) { _postimage = b; }
+    void ttl(int v) { _ttl = v; }
+
    bool operator==(const options& o) const;
    bool operator!=(const options& o) const;
 };
--- a/cdc/change_visitor.hh
+++ b/cdc/change_visitor.hh
@@ -0,0 +1,283 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mutation.hh"
+
+/*
+ * This file contains a general abstraction for walking over mutations,
+ * deconstructing them into ``atomic'' pieces, and consuming these pieces.
+ *
+ * The pieces considered atomic are:
+ * - atomic_cells, either in collections or in atomic columns
+ *   (see `live_collection_cell`, `dead_collection_cell`, `live_atomic_cell`, `dead_atomic_cell`),
+ * - collection tombstones (see `collection_tombstone`)
+ * - row markers (see `marker`)
+ * - row tombstones (see `clustered_row_delete`),
+ * - range tombstones (see `range_delete`),
+ * - partition tombstones (see `partition_delete`).
+ * We use the term ``changes'' to refer to these atomic pieces, hence the name ``ChangeVisitor''.
+ *
+ * IMPORTANT: this doesn't understand all possible states that a mutation can have, e.g. it doesn't understand
+ * the concept of ``continuity''. However, it is sufficient for analyzing mutations created by a write coordinator,
+ * e.g. obtained by parsing a CQL statement.
+ *
+ * To analyze a mutation, create a visitor (described by the `ChangeVisitor` concept below) and pass it
+ * together with the mutation to `inspect_mutation`.
+ *
+ * To analyze certain fragments of the mutation, the inspecting code requires further visitors to be passed.
+ * For example, when it encounters a clustered row update, it calls `clustered_row_cells` on the visitor,
+ * passing it the row's key and the callback. The visitor can then decide:
+ * - if it's not interested in the row's cells, it can simply not call the callback,
+ * - otherwise, it can call the callback with a value of type that satisfies the ``RowCellsVisitor'' concept.
+ * If the callback is called, the inspector walks over the row and passes the changes into the ``row cells visitor''.
+ * In either case, it will then proceed to analyze further parts of the mutation, if any.
+ *
+ * Note that the type passed to the callbacks provided by the inspector (such as in the example above)
+ * can be decided at runtime. This can be especially useful with the callback passed to `collection_column`
+ * in RowCellsVisitor, if different collection types require different logic to handle.
+ *
+ * The dummy visitors below are there only to define the concepts.
+ * For example, in the RowCellsVisitor concept I wanted to express that `visit_collection` in RowCellsVisitor
+ * is a function that handles *any* type which satisfies CollectionVisitor. I didn't find a way to do that
+ * other than providing a ``most generic'' concrete type which satisfies the interface (`dummy_collection_visitor`).
+ * Unfortunately C++ is still not Haskell.
+ *
+ * The inspector calls `finished()` after visiting each change, and sometimes before (e.g. when it starts
+ * visiting a static row, but before it visits any of its cells). If it returns true, the inspector
+ * will stop the visitation. Thus, if at any point during the walk the visitor decides it's not interested
+ * in any more changes, it can inform the inspector by returning `true` from `finished()`.
+ *
+ * IMPORTANT: if the visitor returns `true` from `finished()`, it should keep returning `true`. This is because
+ * the inspector may call `finished()` multiple times when exiting some nested loops.
+ *
+ * The order of visitation is as follows:
+ * - First the static row is visited, if it has any cells.
+ *   Within the row, its columns are visited in order of increasing column IDs.
+ *
+ * - Then, for each clustering key, if a change (row marker, cell, or tombstone) exists for this key:
+ *   - The row marker is visited, if there is one.
+ *   - Columns are visited in order of increasing column IDs.
+ *   - The row tombstone is visited, if there is one.
+ *
+ * For both the static row and a clustering row, for each column:
+ * - If the column is atomic, a corresponding atomic_cell is visited (if there is one).
+ * - Otherwise (the column is non-atomic):
+ *   - The collection tombstone is visited first.
+ *   - Cells are visited in order of increasing keys
+ *     (assuming that the mutation was correctly constructed, i.e. it stores cells in key order).
+ *
+ * WARNING: visited collection tombstone and cells
+ * are guaranteed to live only for the duration of `collection_column` call.
+ *
+ * - Then range tombstones are visited. The order is unspecified
+ *   (more accurately: if it's specified, I don't know what it is)
+ *
+ * - Finally, the partition tombstone is visited, if it exists.
+ */
+
+namespace cdc {
+
+template <typename V> // interface of a visitor that consumes the pieces of one collection column
+concept CollectionVisitor = requires(V v,
+        const tombstone& t,
+        bytes_view key,
+        const atomic_cell_view& cell) {
+
+    { v.collection_tombstone(t) }         -> std::same_as<void>; // receives the collection's tombstone, if one is set
+    { v.live_collection_cell(key, cell) } -> std::same_as<void>; // receives a live cell together with its key
+    { v.dead_collection_cell(key, cell) } -> std::same_as<void>; // receives a dead cell together with its key
+    { v.finished() } -> std::same_as<bool>; // true == the visitor wants no further changes
+};
+
+struct dummy_collection_visitor { // no-op CollectionVisitor; gives the concepts below a concrete visitor type to name
+    void collection_tombstone(const tombstone&) {}
+    void live_collection_cell(bytes_view, const atomic_cell_view&) {}
+    void dead_collection_cell(bytes_view, const atomic_cell_view&) {}
+    bool finished() { return false; } // never asks the inspector to stop
+};
+
+template <typename V> // interface of a visitor that consumes the cells of one row (static or clustered)
+concept RowCellsVisitor = requires(V v,
+        const column_definition& cdef,
+        const atomic_cell_view& cell,
+        noncopyable_function<void(dummy_collection_visitor&)> visit_collection) {
+
+    { v.live_atomic_cell(cdef, cell) }                         -> std::same_as<void>; // live cell of an atomic column
+    { v.dead_atomic_cell(cdef, cell) }                         -> std::same_as<void>; // dead cell of an atomic column
+    { v.collection_column(cdef, std::move(visit_collection)) } -> std::same_as<void>; // non-atomic column; the visitor may call the callback with a CollectionVisitor to descend into it
+    { v.finished() }                                           -> std::same_as<bool>; // true == stop visiting
+};
+
+struct dummy_row_cells_visitor { // no-op RowCellsVisitor; exists only so the concepts can refer to a concrete type
+    void live_atomic_cell(const column_definition&, const atomic_cell_view&) {}
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view&) {}
+    void collection_column(const column_definition&, auto&& visit_collection) {
+        dummy_collection_visitor v;
+        visit_collection(v); // always descends into the collection, with a visitor that ignores everything
+    }
+    bool finished() { return false; } // never asks the inspector to stop
+};
+
+template <typename V> // a RowCellsVisitor that can additionally consume a clustered row's marker
+concept ClusteredRowCellsVisitor = requires(V v,
+        const row_marker& rm) {
+    requires RowCellsVisitor<V>;
+    { v.marker(rm) } -> std::same_as<void>; // receives the row marker
+};
+
+struct dummy_clustered_row_cells_visitor : public dummy_row_cells_visitor { // no-op ClusteredRowCellsVisitor used by the ChangeVisitor concept
+    void marker(const row_marker&) {}
+};
+
+template <typename V> // interface of the top-level visitor accepted by inspect_mutation
+concept ChangeVisitor = requires(V v,
+        api::timestamp_type ts,
+        const clustering_key& ckey,
+        const range_tombstone& rt,
+        const tombstone& t,
+        noncopyable_function<void(dummy_clustered_row_cells_visitor&)> visit_clustered_row_cells,
+        noncopyable_function<void(dummy_row_cells_visitor&)> visit_row_cells) {
+
+    { v.static_row_cells(std::move(visit_row_cells)) }                    -> std::same_as<void>; // static row present; call the callback to walk its cells
+    { v.clustered_row_cells(ckey, std::move(visit_clustered_row_cells)) } -> std::same_as<void>; // clustered row with a live marker or cells; call the callback to walk them
+    { v.clustered_row_delete(ckey, t) }                                   -> std::same_as<void>; // row tombstone of the clustered row `ckey`
+    { v.range_delete(rt) }                                                -> std::same_as<void>; // range tombstone
+    { v.partition_delete(t) }                                             -> std::same_as<void>; // partition tombstone
+    { v.finished() }                                                      -> std::same_as<bool>; // true == stop the visitation
+};
+
+template <RowCellsVisitor V> // walks the cells of row `r` (columns of kind `ckind` in schema `s`) and feeds each change to `v`
+void inspect_row_cells(const schema& s, column_kind ckind, const row& r, V& v) {
+    r.for_each_cell_until([&s, ckind, &v] (column_id id, const atomic_cell_or_collection& acoc) {
+        auto& cdef = s.column_at(ckind, id);
+
+        if (cdef.is_atomic()) {
+            auto cell = acoc.as_atomic_cell(cdef);
+            if (cell.is_live()) {
+                v.live_atomic_cell(cdef, cell);
+            } else {
+                v.dead_atomic_cell(cdef, cell);
+            }
+
+            return stop_iteration(v.finished()); // the row visitor is consulted after every column
+        }
+
+        acoc.as_collection_mutation().with_deserialized(*cdef.type, [&v, &cdef] (collection_mutation_view_description view) {
+            v.collection_column(cdef, [&view] (CollectionVisitor auto& cv) { // runs only if the row visitor chooses to call it; `view` is captured by reference, so it is valid only during this call
+                if (cv.finished()) {
+                    return;
+                }
+
+                if (view.tomb) {
+                    cv.collection_tombstone(view.tomb); // the tombstone is reported before any cell
+                    if (cv.finished()) {
+                        return;
+                    }
+                }
+
+                for (auto& [key, cell]: view.cells) {
+                    if (cell.is_live()) {
+                        cv.live_collection_cell(key, cell);
+                    } else {
+                        cv.dead_collection_cell(key, cell);
+                    }
+
+                    if (cv.finished()) {
+                        return;
+                    }
+                }
+            });
+        });
+
+        return stop_iteration(v.finished()); // asks the row visitor (not the collection visitor) whether to continue
+    });
+}
+
+template <ChangeVisitor V> // walks mutation `m` and feeds its changes to `v`, returning early once `v.finished()` is true
+void inspect_mutation(const mutation& m, V& v) {
+    auto& p = m.partition();
+    auto& s = *m.schema();
+
+    if (!p.static_row().empty()) { // 1. static row, only if it has cells
+        v.static_row_cells([&s, &p] (RowCellsVisitor auto& srv) {
+            if (srv.finished()) {
+                return;
+            }
+            inspect_row_cells(s, column_kind::static_column, p.static_row().get(), srv);
+        });
+
+        if (v.finished()) {
+            return;
+        }
+    }
+
+    for (auto& cr: p.clustered_rows()) { // 2. clustered rows
+        auto& r = cr.row();
+
+        if (r.marker().is_live() || !r.cells().empty()) { // a row with neither a live marker nor cells is not reported here
+            v.clustered_row_cells(cr.key(), [&s, &r] (ClusteredRowCellsVisitor auto& crv) {
+                if (crv.finished()) {
+                    return;
+                }
+
+                auto& rm = r.marker();
+                if (rm.is_live()) { // only live markers are passed on, and before any cell
+                    crv.marker(rm);
+
+                    if (crv.finished()) {
+                        return;
+                    }
+                }
+
+                inspect_row_cells(s, column_kind::regular_column, r.cells(), crv);
+            });
+
+            if (v.finished()) {
+                return;
+            }
+        }
+
+        if (r.deleted_at()) { // the row tombstone is reported after the row's marker and cells
+            auto t = r.deleted_at().tomb();
+            assert(t.timestamp != api::missing_timestamp);
+            v.clustered_row_delete(cr.key(), t);
+            if (v.finished()) {
+                return;
+            }
+        }
+    }
+
+    for (auto& rt: p.row_tombstones()) { // 3. range tombstones
+        assert(rt.tomb.timestamp != api::missing_timestamp);
+        v.range_delete(rt);
+        if (v.finished()) {
+            return;
+        }
+    }
+
+    if (p.partition_tombstone()) { // 4. partition tombstone, last
+        v.partition_delete(p.partition_tombstone());
+    }
+}
+
+} // namespace cdc
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -22,10 +22,14 @@
 #include <boost/type.hpp>
 #include <random>
 #include <unordered_set>
+#include <algorithm>
 #include <seastar/core/sleep.hh>
+#include <algorithm>
+#include <seastar/core/coroutine.hh>

 #include "keys.hh"
 #include "schema_builder.hh"
+#include "database.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -36,6 +40,7 @@
 #include "gms/gossiper.hh"

 #include "cdc/generation.hh"
+#include "cdc/cdc_options.hh"

 extern logging::logger cdc_log;

@@ -59,14 +64,57 @@ static void copy_int_to_bytes(int64_t i, size_t offset, bytes& b) {
    std::copy_n(reinterpret_cast<int8_t*>(&i), sizeof(int64_t), b.begin() + offset);
 }

-stream_id::stream_id(int64_t first, int64_t second)
+static constexpr auto stream_id_version_bits = 4;
+static constexpr auto stream_id_random_bits = 38;
+static constexpr auto stream_id_index_bits = sizeof(uint64_t)*8 - stream_id_version_bits - stream_id_random_bits; // 64 - 4 - 38 = 22
+
+static constexpr auto stream_id_version_shift = 0;
+static constexpr auto stream_id_index_shift = stream_id_version_shift + stream_id_version_bits; // = 4
+static constexpr auto stream_id_random_shift = stream_id_index_shift + stream_id_index_bits; // = 26
+
+/**
+ * Responsibility for encoding stream_id moved from the factory method to
+ * this constructor, to keep knowledge of the composition in a single place.
+ * Note this is private and friended to topology_description_generator,
+ * because it is the one that defines the "order" in which we view vnodes etc.
+ */
+stream_id::stream_id(dht::token token, size_t vnode_index)
    : _value(bytes::initialized_later(), 2 * sizeof(int64_t))
 {
-    copy_int_to_bytes(first, 0, _value);
-    copy_int_to_bytes(second, sizeof(int64_t), _value);
+    static thread_local std::mt19937_64 rand_gen(std::random_device{}());
+    static thread_local std::uniform_int_distribution<uint64_t> rand_dist;
+
+    auto rand = rand_dist(rand_gen);
+    auto mask_shift = [](uint64_t val, size_t bits, size_t shift) {
+        return (val & ((1ull << bits) - 1u)) << shift;
+    };
+    /**
+     *  Low qword (bit ranges are inclusive, bit 0 is the least significant):
+     * 0-3: version
+     * 4-25: vnode index as when created (see generation below). This excludes shards
+     * 26-63: random value (maybe to be replaced with timestamp)
+     */
+    auto low_qword = mask_shift(version_1, stream_id_version_bits, stream_id_version_shift)
+        | mask_shift(vnode_index, stream_id_index_bits, stream_id_index_shift)
+        | mask_shift(rand, stream_id_random_bits, stream_id_random_shift)
+        ;
+
+    copy_int_to_bytes(dht::token::to_int64(token), 0, _value);
+    copy_int_to_bytes(low_qword, sizeof(int64_t), _value);
+    // not a hot code path. make sure we did not mess up the shifts and masks.
+    assert(version() == version_1);
+    assert(index() == vnode_index);
 }

-stream_id::stream_id(bytes b) : _value(std::move(b)) { }
+stream_id::stream_id(bytes b)
+    : _value(std::move(b))
+{
+    // this is not a very solid check. Ids created before GA (i.e. before versioned ids)
+    // have fully random bits in the low qword, so this could go either way...
+    if (version() > version_1) {
+        throw std::invalid_argument("Unknown CDC stream id version");
+    }
+}

 bool stream_id::is_set() const {
    return !_value.empty();
@@ -76,6 +124,10 @@ bool stream_id::operator==(const stream_id& o) const {
    return _value == o._value;
 }

+bool stream_id::operator!=(const stream_id& o) const {
+    return !operator==(o); // inequality is defined purely as the negation of operator==
+}
+
 bool stream_id::operator<(const stream_id& o) const {
    return _value < o._value;
 }
@@ -87,18 +139,26 @@ static int64_t bytes_to_int64(bytes_view b, size_t offset) {
    return net::ntoh(res);
 }

-int64_t stream_id::first() const {
-    return token_from_bytes(_value);
-}
-
-int64_t stream_id::second() const {
-    return bytes_to_int64(_value, sizeof(int64_t));
+dht::token stream_id::token() const {
+    return dht::token::from_int64(token_from_bytes(_value));
 }

 int64_t stream_id::token_from_bytes(bytes_view b) {
    return bytes_to_int64(b, 0);
 }

+static uint64_t unpack_value(bytes_view b, size_t off, size_t shift, size_t bits) { // extracts the `bits`-wide field that starts at bit `shift` of the 64-bit value read at byte offset `off`
+    return (uint64_t(bytes_to_int64(b, off)) >> shift) & ((1ull << bits) - 1u);
+}
+
+uint8_t stream_id::version() const { // version field of the second 8 bytes of the id
+    return unpack_value(_value, sizeof(int64_t), stream_id_version_shift, stream_id_version_bits);
+}
+
+size_t stream_id::index() const { // vnode index field of the second 8 bytes of the id
+    return unpack_value(_value, sizeof(int64_t), stream_id_index_shift, stream_id_index_bits);
+}
+
 const bytes& stream_id::to_bytes() const {
    return _value;
 }
@@ -119,26 +179,38 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

-static stream_id create_stream_id(dht::token t) {
-    static thread_local std::mt19937_64 rand_gen(std::random_device().operator()());
-    static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
+std::vector<token_range_description>&& topology_description::entries() && { // rvalue overload: lets a caller move the entries out of an expiring topology_description
+    return std::move(_entries);
+}

-    return {dht::token::to_int64(t), rand_dist(rand_gen)};
+static std::vector<stream_id> create_stream_ids( // builds `shard_count` stream ids for the token range from `start` to `end`, one per shard
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    std::vector<stream_id> result;
+    result.reserve(shard_count);
+    dht::sharder sharder(shard_count, ignore_msb);
+    for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+        auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
+        // compose the id from token and the "index" of the range end owning vnode
+        // as defined by token sort order. Basically grouping within this
+        // shard set.
+        result.emplace_back(stream_id(t, index)); // every id of this vnode carries the same `index`
+    }
+    return result;
 }

 class topology_description_generator final {
    const db::config& _cfg;
    const std::unordered_set<dht::token>& _bootstrap_tokens;
-    const locator::token_metadata& _token_metadata;
+    const locator::token_metadata_ptr _tmptr;
    const gms::gossiper& _gossiper;

    // Compute a set of tokens that split the token ring into vnodes
    auto get_tokens() const {
-        auto tokens = _token_metadata.sorted_tokens();
+        auto tokens = _tmptr->sorted_tokens();
        auto it = tokens.insert(
                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
        std::sort(it, tokens.end());
@@ -150,10 +222,10 @@ class topology_description_generator final {
    // Fetch sharding parameters for a node that owns vnode ending with this.end
    // Returns <shard_count, ignore_msb> pair.
    std::pair<size_t, uint8_t> get_sharding_info(dht::token end) const {
-        if (_bootstrap_tokens.count(end) > 0) {
+        if (_bootstrap_tokens.contains(end)) {
            return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
        } else {
-            auto endpoint = _token_metadata.get_endpoint(end);
+            auto endpoint = _tmptr->get_endpoint(end);
            if (!endpoint) {
                throw std::runtime_error(
                        format("Can't find endpoint for token {}", end));
@@ -163,32 +235,26 @@ class topology_description_generator final {
        }
    }

-    token_range_description create_description(dht::token start, dht::token end) const {
+    token_range_description create_description(size_t index, dht::token start, dht::token end) const {
        token_range_description desc;

        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            desc.streams.push_back(create_stream_id(t));
-        }
-
        return desc;
    }
 public:
    topology_description_generator(
            const db::config& cfg,
            const std::unordered_set<dht::token>& bootstrap_tokens,
-            const locator::token_metadata& token_metadata,
+            const locator::token_metadata_ptr tmptr,
            const gms::gossiper& gossiper)
        : _cfg(cfg)
        , _bootstrap_tokens(bootstrap_tokens)
-        , _token_metadata(token_metadata)
+        , _tmptr(std::move(tmptr))
        , _gossiper(gossiper)
    {}

@@ -213,10 +279,10 @@ public:
        vnode_descriptions.reserve(tokens.size());

        vnode_descriptions.push_back(
-                create_description(tokens.back(), tokens.front()));
+                create_description(0, tokens.back(), tokens.front()));
        for (size_t idx = 1; idx < tokens.size(); ++idx) {
            vnode_descriptions.push_back(
-                    create_description(tokens[idx - 1], tokens[idx]));
+                    create_description(idx, tokens[idx - 1], tokens[idx]));
        }

        return {std::move(vnode_descriptions)};
@@ -243,22 +309,67 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// non-static for testing
+size_t limit_of_streams_in_topology_description() {
+    // A stream id takes 16B and the generation row must not exceed 4MB, which
+    // caps the total at 262144 streams (but never less than 1 per vnode).
+    constexpr size_t max_row_bytes = 4 * 1024 * 1024, stream_id_bytes = 16;
+    return max_row_bytes / stream_id_bytes; }
+
+// non-static for testing
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) { // returns `desc` unchanged if it is within the stream limit, otherwise a copy with fewer streams per vnode
+    size_t streams_count = 0; // unsigned like `limit`, so the comparison below does not mix signed and unsigned
+    for (auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+
+    size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size()); // never below one stream per vnode
+    if (limit >= streams_count) {
+        return std::move(desc); // within the limit: hand the description back untouched
+    }
+    size_t streams_per_vnode_limit = limit / desc.entries().size(); // entries() is non-empty here: an empty description has 0 streams and returned above
+    auto entries = std::move(desc).entries();
+    auto start = entries.back().token_range_end; // the first vnode's range starts at the last entry's end
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) { // only vnodes above the per-vnode limit get their streams regenerated
+            entries[idx].streams =
+                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end;
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
+        const locator::token_metadata_ptr tmptr,
        const gms::gossiper& g,
        db::system_distributed_keyspace& sys_dist_ks,
        std::chrono::milliseconds ring_delay,
-        bool for_testing) {
-    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();
+        bool add_delay) {
+    using namespace std::chrono;
+    auto gen = topology_description_generator(cfg, bootstrap_tokens, tmptr, g).generate();
+
+    // If the cluster is large we may end up with a generation that contains
+    // large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with large number of rows
+    // this will lead to a row that can be as big as 32MB. This is much more
+    // than the limit imposed by commitlog_segment_size_in_mb. If the size of
+    // the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid such problem we make sure that such row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));

    // Begin the race.
    auto ts = db_clock::now() + (
-            for_testing ? std::chrono::milliseconds(0) : (
-                2 * ring_delay + std::chrono::duration_cast<std::chrono::milliseconds>(generation_leeway)));
-    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();
+            (!add_delay || ring_delay == milliseconds(0)) ? milliseconds(0) : ( // no extra delay when add_delay is false or ring_delay is zero
+                2 * ring_delay + duration_cast<milliseconds>(generation_leeway)));
+    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tmptr->count_normal_token_owners() }).get(); // blocks on the future with .get()

    return ts;
 }
@@ -269,31 +380,23 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
 }

-// Run inside seastar::async context.
-static void do_update_streams_description(
+static future<> do_update_streams_description( // coroutine: writes the streams description for generation `streams_ts` unless it already exists
        db_clock::time_point streams_ts,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
+    if (co_await sys_dist_ks.cdc_desc_exists(streams_ts, ctx)) {
+        cdc_log.info("Generation {}: streams description table already updated.", streams_ts);
+        co_return;
    }

    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.

-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    auto topo = co_await sys_dist_ks.read_cdc_topology_description(streams_ts, ctx);
    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+        throw no_generation_data_exception(streams_ts); // dedicated type: rewrite_streams_descriptions catches it and gives up instead of retrying
    }

-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    co_await sys_dist_ks.create_cdc_desc(streams_ts, *topo, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
 }

@@ -303,7 +406,7 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source& abort_src) {
    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
    } catch(...) {
        cdc_log.warn(
            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
@@ -316,7 +419,7 @@ void update_streams_description(
            while (true) {
                sleep_abortable(std::chrono::seconds(60), abort_src).get();
                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
                    return;
                } catch (...) {
                    cdc_log.warn(
@@ -328,4 +431,176 @@ void update_streams_description(
    }
 }

+static db_clock::time_point as_timepoint(const utils::UUID& uuid) { // interprets the UUID's adjusted timestamp as milliseconds on db_clock
+    return db_clock::time_point{std::chrono::milliseconds(utils::UUID_gen::get_adjusted_timestamp(uuid))};
+}
+
+static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps( // coroutine: fetches the v1 description timestamps, retrying on any exception
+        db::system_distributed_keyspace& sys_dist_ks,
+        abort_source& abort_src,
+        const noncopyable_function<unsigned()>& get_num_token_owners) {
+    while (true) {
+        try {
+            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() }); // the owner count is re-read on every attempt
+        } catch (...) {
+            cdc_log.warn(
+                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
+                    std::current_exception());
+        }
+        co_await sleep_abortable(std::chrono::seconds(60), abort_src); // sleep sits outside the catch block; NOTE(review): presumably because co_await is not permitted inside a handler - confirm
+    }
+}
+
+// Contains a CDC log table's creation time (extracted from its schema's id)
+// and its CDC TTL setting.
+struct time_and_ttl {
+    db_clock::time_point creation_time; // derived from the log table's schema id (see as_timepoint)
+    int ttl; // CDC TTL in seconds; 0 is treated as "entries never expire" by rewrite_streams_descriptions
+};
+
+/*
+ * See `maybe_rewrite_streams_descriptions`.
+ * This is the long-running-in-the-background part of that function.
+ * It returns the timestamp of the last rewritten generation (if any).
+ */
+static future<std::optional<db_clock::time_point>> rewrite_streams_descriptions(
+        std::vector<time_and_ttl> times_and_ttls,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    cdc_log.info("Retrieving generation timestamps for rewriting...");
+    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
+    cdc_log.info("Generation timestamps retrieved.");
+
+    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
+    // This predicate is monotonic w.r.t the timestamps.
+    auto now = db_clock::now();
+    std::sort(tss.begin(), tss.end()); // partition_point below requires the range to be partitioned by the predicate
+    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
+        // partition_point finds first element that does *not* satisfy the predicate.
+        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
+                [&] (const time_and_ttl& tat) {
+            // In this CDC log table there are no entries older than the table's creation time
+            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
+            // If ttl is set to 0 then entries in this table never expire. In that case we look
+            // only at the table's creation time.
+            auto no_entries_older_than =
+                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
+                    - std::chrono::seconds(10);
+            return no_entries_older_than < ts;
+        });
+    });
+
+    // Find first generation timestamp such that some CDC log table may contain data in this generation.
+    // This and all later generations need to be written to the new streams table.
+    if (first != tss.begin()) {
+        --first; // step back one: the generation preceding the found timestamp may still hold data
+    }
+
+    if (first == tss.end()) { // after the step back this can only happen when tss is empty
+        cdc_log.info("No generations to rewrite.");
+        co_return std::nullopt;
+    }
+
+    cdc_log.info("First generation to rewrite: {}", *first);
+
+    bool each_success = true;
+    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> { // 10 is passed as the concurrency argument
+        while (true) {
+            try {
+                co_return co_await do_update_streams_description(ts, *sys_dist_ks, { get_num_token_owners() });
+            } catch (const no_generation_data_exception& e) {
+                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
+                each_success = false; // missing data is permanent: do not retry this generation
+                co_return;
+            } catch (...) {
+                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
+            }
+            co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+        }
+    });
+
+    if (each_success) {
+        cdc_log.info("Rewriting stream tables finished successfully.");
+    } else {
+        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
+    }
+
+    if (first != tss.end()) { // always true here: the `first == tss.end()` case returned above
+        co_return *std::prev(tss.end()); // the last (largest) timestamp, returned even if some generations failed
+    }
+
+    co_return std::nullopt; // not reachable given the early return above
+}
+
+future<> maybe_rewrite_streams_descriptions( // coroutine: decides whether a rewrite is needed and, if so, starts it in the background
+        const database& db,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    if (!db.has_schema(sys_dist_ks->NAME, sys_dist_ks->CDC_DESC_V1)) {
+        // This cluster never went through a Scylla version which used this table
+        // or the user deleted the table. Nothing to do.
+        co_return;
+    }
+
+    if (co_await db::system_keyspace::cdc_is_rewritten()) { // NOTE(review): presumably reflects an earlier cdc_set_rewritten call - confirm
+        co_return;
+    }
+
+    if (db.get_config().cdc_dont_rewrite_streams()) {
+        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
+        co_return;
+    }
+
+    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
+    std::vector<time_and_ttl> times_and_ttls;
+    for (auto& [_, cf] : db.get_column_families()) {
+        auto& s = *cf->schema();
+        auto base = cdc::get_base_table(db, s.ks_name(), s.cf_name());
+        if (!base) {
+            // Not a CDC log table.
+            continue;
+        }
+        auto& cdc_opts = base->cdc_options();
+        if (!cdc_opts.enabled()) {
+            // This table is named like a CDC log table but it's not one.
+            continue;
+        }
+
+        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id()), cdc_opts.ttl()}); // creation time from the log table's id, TTL from the base table's CDC options
+    }
+
+    if (times_and_ttls.empty()) {
+        // There's no point in rewriting old generations' streams (they don't contain any data).
+        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
+        co_return co_await db::system_keyspace::cdc_set_rewritten(std::nullopt);
+    }
+
+    // It's safe to discard this future: the coroutine keeps system_distributed_keyspace alive
+    // and the abort source's lifetime extends the lifetime of any other service.
+    (void)(([_times_and_ttls = std::move(times_and_ttls), _sys_dist_ks = std::move(sys_dist_ks),
+                _get_num_token_owners = std::move(get_num_token_owners), &_abort_src = abort_src] () mutable -> future<> {
+        auto times_and_ttls = std::move(_times_and_ttls); // captures are moved into locals; NOTE(review): presumably so they live in the coroutine frame rather than in the temporary lambda - confirm
+        auto sys_dist_ks = std::move(_sys_dist_ks);
+        auto get_num_token_owners = std::move(_get_num_token_owners);
+        auto& abort_src = _abort_src;
+
+        // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
+        // and some nodes that are UP may still be marked as DOWN by us.
+        // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
+        // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
+        co_await sleep_abortable(std::chrono::seconds(10), abort_src);
+
+        cdc_log.info("Rewriting stream tables in the background...");
+        auto last_rewritten = co_await rewrite_streams_descriptions(
+                std::move(times_and_ttls),
+                std::move(sys_dist_ks),
+                std::move(get_num_token_owners),
+                abort_src);
+
+        co_await db::system_keyspace::cdc_set_rewritten(last_rewritten); // records completion together with the last rewritten timestamp (if any)
+    })());
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -40,6 +40,8 @@
 #include "database_fwd.hh"
 #include "db_clock.hh"
 #include "dht/token.hh"
+#include "locator/token_metadata.hh"
+#include "utils/chunked_vector.hh"

 namespace seastar {
    class abort_source;
@@ -55,26 +57,26 @@ namespace gms {
    class gossiper;
 } // namespace gms

-namespace locator {
-    class token_metadata;
-} // namespace locator
-
 namespace cdc {

 class stream_id final {
    bytes _value;
 public:
+    static constexpr uint8_t version_1 = 1;
+
    stream_id() = default;
-    stream_id(int64_t, int64_t);
    stream_id(bytes);
+    stream_id(dht::token, size_t);
+
    bool is_set() const;
    bool operator==(const stream_id&) const;
+    bool operator!=(const stream_id&) const;
    bool operator<(const stream_id&) const;

-    int64_t first() const;
-    int64_t second() const;
-
+    uint8_t version() const;
+    size_t index() const;
    const bytes& to_bytes() const;
+    dht::token token() const;

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
@@ -110,7 +112,30 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
+};
+
+/**
+ * The set of streams for a single topology version/generation,
+ * i.e. the stream IDs at a given time.
+ */
+class streams_version {
+public:
+    utils::chunked_vector<stream_id> streams; // stream IDs belonging to this version
+    db_clock::time_point timestamp; // NOTE(review): presumably the generation's timestamp -- confirm against callers
+
+    streams_version(utils::chunked_vector<stream_id> s, db_clock::time_point ts)
+        : streams(std::move(s)) // `s` is taken by value and moved into place
+        , timestamp(ts)
+    {}
+};
+
+class no_generation_data_exception : public std::runtime_error { // error whose message names the generation timestamp that had no data
+public:
+    no_generation_data_exception(db_clock::time_point generation_ts)
+        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
+    {}
 };

 /* Should be called when we're restarting and we noticed that we didn't save any streams timestamp in our local tables,
@@ -130,8 +155,8 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
 */
 future<db_clock::time_point> get_local_streams_timestamp();

-/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
- * Returns the timestamp of this new generation.
+/* Generate a new set of CDC streams and insert it into the distributed cdc_generation_descriptions table.
+ * Returns the timestamp of this new generation.
 *
 * Should be called when starting the node for the first time (i.e., joining the ring).
 *
@@ -145,11 +170,11 @@ future<db_clock::time_point> get_local_streams_timestamp();
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
+        const locator::token_metadata_ptr tmptr,
        const gms::gossiper& g,
        db::system_distributed_keyspace& sys_dist_ks,
        std::chrono::milliseconds ring_delay,
-        bool for_testing);
+        bool add_delay);

 /* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
 * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
@@ -161,7 +186,7 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
 /* Inform CDC users about a generation of streams (identified by the given timestamp)
 * by inserting it into the cdc_streams table.
 *
- * Assumes that the cdc_generations table contains this generation.
+ * Assumes that the cdc_generation_descriptions table contains this generation.
 *
 * Returning from this function does not mean that the table update was successful: the function
 * might run an asynchronous task in the background.
@@ -174,4 +199,15 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source&);

+/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
+ * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
+ * table contains streams of all generations that were present in the old table and may still contain data
+ * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
+ * these generations). */
+future<> maybe_rewrite_streams_descriptions(
+        const database&,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
 } // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -41,7 +41,6 @@
 #include "exceptions/exceptions.hh"
 #include "timestamp.hh"
 #include "tracing/trace_state.hh"
-#include "cdc_options.hh"
 #include "utils/UUID.hh"

 class schema;
@@ -63,6 +62,7 @@ class query_state;

 class mutation;
 class partition_key;
+class database;

 namespace cdc {

@@ -100,19 +100,16 @@ public:
 struct db_context final {
    service::storage_proxy& _proxy;
    service::migration_notifier& _migration_notifier;
-    locator::token_metadata& _token_metadata;
    cdc::metadata& _cdc_metadata;

    class builder final {
        service::storage_proxy& _proxy;
        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<locator::token_metadata>> _token_metadata;
        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
    public:
        builder(service::storage_proxy& proxy);

        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(locator::token_metadata& token_metadata);
        builder& with_cdc_metadata(cdc::metadata&);

        db_context build();
@@ -129,7 +126,12 @@ enum class operation : int8_t {
 };

 bool is_log_for_some_table(const sstring& ks_name, const std::string_view& table_name);
-seastar::sstring log_name(const seastar::sstring& table_name);
+
+schema_ptr get_base_table(const database&, const schema&);
+schema_ptr get_base_table(const database&, sstring_view, std::string_view);
+
+seastar::sstring base_name(std::string_view log_name);
+seastar::sstring log_name(std::string_view table_name);
 seastar::sstring log_data_column_name(std::string_view column_name);
 seastar::sstring log_meta_column_name(std::string_view column_name);
 bytes log_data_column_name_bytes(const bytes& column_name);
@@ -141,6 +143,8 @@ bytes log_data_column_deleted_name_bytes(const bytes& column_name);
 seastar::sstring log_data_column_deleted_elements_name(std::string_view column_name);
 bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name);

+bool is_cdc_metacolumn_name(const sstring& name);
+
 utils::UUID generate_timeuuid(api::timestamp_type t);

 } // namespace cdc
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
@@ -77,6 +78,12 @@ cdc::metadata::container_t::const_iterator cdc::metadata::gen_used_at(api::times
    return std::prev(it);
 }

+bool cdc::metadata::streams_available() const {
+    // True when gen_used_at() finds a generation for the current timestamp.
+    const auto current_ts = api::new_timestamp();
+    return gen_used_at(current_ts) != _gens.end();
+}
+
 cdc::stream_id cdc::metadata::get_stream(api::timestamp_type ts, dht::token tok) {
    auto now = api::new_timestamp();
    if (ts > now + generation_leeway.count()) {
--- a/cdc/metadata.hh
+++ b/cdc/metadata.hh
@@ -57,6 +57,10 @@ public:
    /* Is a generation with the given timestamp already known or superseded by a newer generation? */
    bool known_or_obsolete(db_clock::time_point) const;

+    /* Are there streams available, i.e. valid for time == now? If this is false, any writes to
+     * CDC logs will fail fast.
+     */
+    bool streams_available() const;
    /* Return the stream for the base partition whose token is `tok` to which a corresponding log write should go
     * according to the generation used at time `ts` (i.e, the latest generation whose timestamp is less or equal to `ts`).
     *
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -22,8 +22,14 @@
 #include "mutation.hh"
 #include "schema.hh"

+#include "concrete_types.hh"
+#include "types/user.hh"
+
 #include "split.hh"
 #include "log.hh"
+#include "change_visitor.hh"
+
+#include <type_traits>

 struct atomic_column_update {
    column_id id;
@@ -70,6 +76,37 @@ struct partition_deletion {
    tombstone t;
 };

+using clustered_column_set = std::map<clustering_key, cdc::one_kind_column_set, clustering_key::less_compare>;
+
+template<typename Container>
+concept EntryContainer = requires(Container& container) {
+    // Parenthesized due to https://bugs.llvm.org/show_bug.cgi?id=45088
+    { (container.atomic_entries) } -> std::same_as<std::vector<atomic_column_update>&>;
+    { (container.nonatomic_entries) } -> std::same_as<std::vector<nonatomic_column_update>&>;
+};
+
+template<EntryContainer Container>
+static void add_columns_affected_by_entries(cdc::one_kind_column_set& cset, const Container& cont) {
+    // Flag the column id of every entry, atomic first and then non-atomic, in `cset`.
+    const auto mark_all = [&cset] (const auto& entries) {
+        for (const auto& e : entries) { cset.set(e.id); }
+    };
+    mark_all(cont.atomic_entries);
+    mark_all(cont.nonatomic_entries);
+}
+
+/* Given a mutation with multiple timestamps/ttl/types of changes, we split it into multiple mutations
+ * before passing it into `process_change` (see comment above `should_split_visitor` for more details).
+ *
+ * The first step of the splitting is to walk over the mutation and put each change into an appropriate bucket
+ * (see `batch`). The buckets are sorted by timestamps (see `set_of_changes`), and within each bucket,
+ * the changes are split according to their types (`static_updates`, `clustered_inserts`, and so on).
+ * Within each type, the changes are sorted w.r.t TTLs. Changes without a TTL are treated as if they had TTL = 0.
+ *
+ * The function that puts changes into buckets is called `extract_changes`. Underneath, it uses
+ * `extract_changes_visitor`, `extract_collection_visitor` and `extract_row_visitor`.
+ */
+
 struct batch {
    std::vector<static_row_update> static_updates;
    std::vector<clustered_row_insert> clustered_inserts;
@@ -77,6 +114,40 @@ struct batch {
    std::vector<clustered_row_deletion> clustered_row_deletions;
    std::vector<clustered_range_deletion> clustered_range_deletions;
    std::optional<partition_deletion> partition_deletions;
+
+    clustered_column_set get_affected_clustered_columns_per_row(const schema& s) const { // maps each clustering key in this batch to the regular columns affected for that row
+        clustered_column_set ret{clustering_key::less_compare(s)};
+
+        if (!clustered_row_deletions.empty()) {
+            // When deleting a row, all columns are affected
+            cdc::one_kind_column_set all_columns{s.regular_columns_count()};
+            all_columns.set(0, s.regular_columns_count(), true);
+            for (const auto& change : clustered_row_deletions) {
+                ret.insert(std::make_pair(change.key, all_columns)); // insert() leaves an already-present key untouched
+            }
+        }
+
+        auto process_change_type = [&] (const auto& changes) {
+            for (const auto& change : changes) {
+                auto& cset = ret[change.key]; // creates the entry if this key was not seen yet
+                cset.resize(s.regular_columns_count()); // make sure the set spans all regular columns before marking
+                add_columns_affected_by_entries(cset, change);
+            }
+        };
+
+        process_change_type(clustered_inserts);
+        process_change_type(clustered_updates);
+
+        return ret;
+    }
+
+    cdc::one_kind_column_set get_affected_static_columns(const schema& s) const { // static columns touched by any static row update in this batch
+        cdc::one_kind_column_set ret{s.static_columns_count()};
+        for (const auto& change : static_updates) {
+            add_columns_affected_by_entries(ret, change); // accumulates into the same set across updates
+        }
+        return ret;
+    }
 };

 using set_of_changes = std::map<api::timestamp_type, batch>;
@@ -86,100 +157,179 @@ struct row_update {
    std::vector<nonatomic_column_update> nonatomic_entries;
 };

-static
-std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update>
-extract_row_updates(const row& r, column_kind ckind, const schema& schema) {
-    std::map<std::pair<api::timestamp_type, gc_clock::duration>, row_update> result;
-    r.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        auto& cdef = schema.column_at(ckind, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            auto timestamp_and_ttl = std::pair(
-                    view.timestamp(),
-                    view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0)
-                );
-            result[timestamp_and_ttl].atomic_entries.push_back({id, atomic_cell(*cdef.type, view)});
-            return;
+static gc_clock::duration get_ttl(const atomic_cell_view& acv) { // the cell's TTL, or 0 when the cell is not live-with-TTL
+    return acv.is_live_and_has_ttl() ? acv.ttl() : gc_clock::duration(0);
+}
+
+static gc_clock::duration get_ttl(const row_marker& rm) { // the marker's TTL, or 0 when the marker is not expiring
+    return rm.is_expiring() ? rm.ttl() : gc_clock::duration(0);
+}
+
+using change_key_t = std::pair<api::timestamp_type, gc_clock::duration>;
+
+/* Visits the cells and tombstone of a collection, putting the encountered changes into buckets
+ * sorted by timestamp first and ttl second (see `_updates`).
+ */
+template <typename V>
+struct extract_collection_visitor {
+private:
+    const column_id _id;
+    std::map<change_key_t, row_update>& _updates;
+
+    nonatomic_column_update& get_or_append_entry(api::timestamp_type ts, gc_clock::duration ttl) {
+        auto& updates = this->_updates[std::pair(ts, ttl)].nonatomic_entries;
+        if (updates.empty() || updates.back().id != _id) {
+            updates.push_back({_id});
        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                auto timestamp_and_ttl = std::pair(
-                        v.timestamp(),
-                        v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0)
-                    );
-                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
-                if (updates.empty() || updates.back().id != id) {
-                    updates.push_back({id, {}});
-                }
-                updates.back().cells.push_back({std::move(k), std::move(v)});
-            }
-
-            if (desc.tomb) {
-                auto timestamp_and_ttl = std::pair(desc.tomb.timestamp + 1, gc_clock::duration(0));
-                auto& updates = result[timestamp_and_ttl].nonatomic_entries;
-                if (updates.empty() || updates.back().id != id) {
-                    updates.push_back({id, {}});
-                }
-                updates.back().t = std::move(desc.tomb);
-            }
-        });
-    });
-    return result;
-};
-
-set_of_changes extract_changes(const mutation& base_mutation, const schema& base_schema) {
-    set_of_changes res;
-    auto& p = base_mutation.partition();
-
-    auto sr_updates = extract_row_updates(p.static_row().get(), column_kind::static_column, base_schema);
-    for (auto& [k, up]: sr_updates) {
-        auto [timestamp, ttl] = k;
-        res[timestamp].static_updates.push_back({
-                ttl,
-                std::move(up.atomic_entries),
-                std::move(up.nonatomic_entries)
-            });
+        return updates.back();
    }

-    for (const rows_entry& cr : p.clustered_rows()) {
-        auto cr_updates = extract_row_updates(cr.row().cells(), column_kind::regular_column, base_schema);
+    /* To copy a value from a collection/non-frozen UDT (in order to put it into a bucket) we need to know the value's type.
+     * The method of obtaining the type depends on the collection type; in particular, for non-frozen UDT, each value
+     * might have a different type, thus in general we need a method that, given a key (identifying the value in the collection),
+     * returns the value's type.
+     *
+     * We use the `Curiously Recurring Template Pattern' to avoid performing a dynamic dispatch on the collection's type for each visited cell.
+     * Instead we perform a single dynamic dispatch at the beginning, when encountering the collection column;
+     * the dispatch provides us with a correct `get_value_type` method.
+     * See `extract_row_visitor::collection_column` where the dispatch is done.

-        const auto& marker = cr.row().marker();
-        auto marker_timestamp = marker.timestamp();
-        auto marker_ttl = marker.is_expiring() ? marker.ttl() : gc_clock::duration(0);
-        if (marker.is_live()) {
-            // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
-            (void)cr_updates[std::pair(marker_timestamp, marker_ttl)];
+    data_type get_value_type(bytes_view);
+    */
+
+    void cell(bytes_view key, const atomic_cell_view& c) {
+        auto& entry = get_or_append_entry(c.timestamp(), get_ttl(c));
+        entry.cells.emplace_back(to_bytes(key), atomic_cell(*static_cast<V&>(*this).get_value_type(key), c));
+    }
+
+public:
+    extract_collection_visitor(column_id id, std::map<change_key_t, row_update>& updates)
+        : _id(id), _updates(updates) {}
+
+    void collection_tombstone(const tombstone& t) {
+        auto& entry = get_or_append_entry(t.timestamp + 1, gc_clock::duration(0)); // bucketed under timestamp T + 1 with TTL 0
+        entry.t = t; // the stored tombstone itself keeps its original timestamp T
+    }
+
+    void live_collection_cell(bytes_view key, const atomic_cell_view& c) {
+        cell(key, c);
+    }
+
+    void dead_collection_cell(bytes_view key, const atomic_cell_view& c) {
+        cell(key, c);
+    }
+
+    constexpr bool finished() const { return false; }
+};
+
+/* Visits all cells and tombstones in a row, putting the encountered changes into buckets
+ * sorted by timestamp first and ttl second (see `_updates`).
+ */
+struct extract_row_visitor {
+    std::map<change_key_t, row_update> _updates;
+
+    void cell(const column_definition& cdef, const atomic_cell_view& cell) {
+        _updates[std::pair(cell.timestamp(), get_ttl(cell))].atomic_entries.push_back({cdef.id, atomic_cell(*cdef.type, cell)});
+    }
+
+    void live_atomic_cell(const column_definition& cdef, const atomic_cell_view& c) {
+        cell(cdef, c);
+    }
+
+    void dead_atomic_cell(const column_definition& cdef, const atomic_cell_view& c) {
+        cell(cdef, c);
+    }
+
+    void collection_column(const column_definition& cdef, auto&& visit_collection) { // runs visit_collection with a visitor picked once per column from cdef's type
+        visit(*cdef.type, make_visitor(
+        [&] (const collection_type_impl& ctype) { // collections: every value has the same type
+            struct collection_visitor : public extract_collection_visitor<collection_visitor> {
+                data_type _value_type;
+
+                collection_visitor(column_id id, std::map<change_key_t, row_update>& updates, const collection_type_impl& ctype)
+                    : extract_collection_visitor<collection_visitor>(id, updates), _value_type(ctype.value_comparator()) {}
+
+                data_type get_value_type(bytes_view) {
+                    return _value_type;
+                }
+            } v(cdef.id, _updates, ctype);
+
+            visit_collection(v);
+        },
+        [&] (const user_type_impl& utype) { // non-frozen UDTs: the value type depends on the field index encoded in the key
+            struct udt_visitor : public extract_collection_visitor<udt_visitor> {
+                const user_type_impl& _utype;
+
+                udt_visitor(column_id id, std::map<change_key_t, row_update>& updates, const user_type_impl& utype)
+                    : extract_collection_visitor<udt_visitor>(id, updates), _utype(utype) {}
+
+                data_type get_value_type(bytes_view key) {
+                    return _utype.type(deserialize_field_index(key));
+                }
+            } v(cdef.id, _updates, utype);
+
+            visit_collection(v);
+        },
+        [&] (const abstract_type& o) { // any other type is unexpected here: report it by name
+            throw std::runtime_error(format("extract_changes: unknown collection type: {}", o.name()));
         }
+        ));
+    }

-        auto is_insert = [&] (api::timestamp_type timestamp, gc_clock::duration ttl) {
-            if (!marker.is_live()) {
-                return false;
+    constexpr bool finished() const { return false; }
+};
+
+struct extract_changes_visitor {
+    set_of_changes _result;
+
+    void static_row_cells(auto&& visit_row_cells) { // gathers the static row's changes and files them under their timestamp as static updates
+        extract_row_visitor v;
+        visit_row_cells(v);
+
+        for (auto& [ts_ttl, row_update]: v._updates) {
+            _result[ts_ttl.first].static_updates.push_back({ // keyed by timestamp
+                ts_ttl.second, // TTL
+                std::move(row_update.atomic_entries),
+                std::move(row_update.nonatomic_entries)
+            });
+        }
+    }
+
+    void clustered_row_cells(const clustering_key& ckey, auto&& visit_row_cells) {
+        struct clustered_cells_visitor : public extract_row_visitor {
+            api::timestamp_type _marker_ts;
+            gc_clock::duration _marker_ttl;
+            std::optional<row_marker> _marker;
+
+            void marker(const row_marker& rm) {
+                _marker_ts = rm.timestamp();
+                _marker_ttl = get_ttl(rm);
+                _marker = rm;
+
+                // make sure that an entry corresponding to the row marker's timestamp and ttl is in the map
+                (void)_updates[std::pair(_marker_ts, _marker_ttl)];
            }
+        } v;
+        visit_row_cells(v);

-            return timestamp == marker_timestamp && ttl == marker_ttl;
-        };
-
-        for (auto& [k, up]: cr_updates) {
+        for (auto& [ts_ttl, row_update]: v._updates) {
            // It is important that changes in the resulting `set_of_changes` are listed
            // in increasing TTL order. The reason is explained in a comment in cdc/log.cc,
            // search for "#6070".
-            auto [timestamp, ttl] = k;
+            auto [ts, ttl] = ts_ttl;

-            if (is_insert(timestamp, ttl)) {
-                res[timestamp].clustered_inserts.push_back({
+            if (v._marker && ts == v._marker_ts && ttl == v._marker_ttl) {
+                _result[ts].clustered_inserts.push_back({
                        ttl,
-                        cr.key(),
-                        marker,
-                        std::move(up.atomic_entries),
+                        ckey,
+                        *v._marker,
+                        std::move(row_update.atomic_entries),
                        {}
                    });

-                auto& cr_insert = res[timestamp].clustered_inserts.back();
+                auto& cr_insert = _result[ts].clustered_inserts.back();
                bool clustered_update_exists = false;
-                for (auto& nonatomic_up: up.nonatomic_entries) {
+                for (auto& nonatomic_up: row_update.nonatomic_entries) {
                    // Updating a collection column with an INSERT statement implies inserting a tombstone.
                    //
                    // For example, suppose that we have:
@@ -205,9 +355,9 @@ set_of_changes extract_changes(const mutation& base_mutation, const schema& base
                        cr_insert.nonatomic_entries.push_back(std::move(nonatomic_up));
                    } else {
                        if (!clustered_update_exists) {
-                            res[timestamp].clustered_updates.push_back({
+                            _result[ts].clustered_updates.push_back({
                                ttl,
-                                cr.key(),
+                                ckey,
                                {},
                                {}
                            });
@@ -228,201 +378,239 @@ set_of_changes extract_changes(const mutation& base_mutation, const schema& base
                            clustered_update_exists = true;
                        }

-                        auto& cr_update = res[timestamp].clustered_updates.back();
+                        auto& cr_update = _result[ts].clustered_updates.back();
                        cr_update.nonatomic_entries.push_back(std::move(nonatomic_up));
                    }
                }
            } else {
-                res[timestamp].clustered_updates.push_back({
+                _result[ts].clustered_updates.push_back({
                        ttl,
-                        cr.key(),
-                        std::move(up.atomic_entries),
-                        std::move(up.nonatomic_entries)
+                        ckey,
+                        std::move(row_update.atomic_entries),
+                        std::move(row_update.nonatomic_entries)
                    });
            }
        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            res[row_tomb.timestamp].clustered_row_deletions.push_back({cr.key(), row_tomb});
-        }
    }

-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb.timestamp != api::missing_timestamp) {
-            res[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
-        }
+    void clustered_row_delete(const clustering_key& ckey, const tombstone& t) {
+        _result[t.timestamp].clustered_row_deletions.push_back({ckey, t});
    }

-    auto partition_tomb_timestamp = p.partition_tombstone().timestamp;
-    if (partition_tomb_timestamp != api::missing_timestamp) {
-        res[partition_tomb_timestamp].partition_deletions = {p.partition_tombstone()};
+    void range_delete(const range_tombstone& rt) {
+        _result[rt.tomb.timestamp].clustered_range_deletions.push_back({rt});
    }

-    return res;
+    void partition_delete(const tombstone& t) {
+        _result[t.timestamp].partition_deletions = {t};
+    }
+
+    constexpr bool finished() const { return false; }
+};
+
+set_of_changes extract_changes(const mutation& m) { // buckets the changes found in `m` by timestamp (see `set_of_changes`)
+    extract_changes_visitor v;
+    cdc::inspect_mutation(m, v);
+    return std::move(v._result); // _result is a member of the local visitor, so it needs an explicit move to avoid a copy
 }

 namespace cdc {

-bool should_split(const mutation& base_mutation, const schema& base_schema) {
-    auto& p = base_mutation.partition();
+struct find_timestamp_visitor {
+    api::timestamp_type _ts = api::missing_timestamp;

-    api::timestamp_type found_ts = api::missing_timestamp;
-    std::optional<gc_clock::duration> found_ttl; // 0 = "no ttl"
+    bool finished() const { return _ts != api::missing_timestamp; }

-    auto check_or_set = [&] (api::timestamp_type ts, gc_clock::duration ttl) {
-        if (found_ts != api::missing_timestamp && found_ts != ts) {
-            return true;
-        }
-        found_ts = ts;
+    void visit(api::timestamp_type ts) { _ts = ts; }
+    void visit(const atomic_cell_view& cell) { visit(cell.timestamp()); }

-        if (found_ttl && *found_ttl != ttl) {
-            return true;
-        }
-        found_ttl = ttl;
+    void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void collection_tombstone(const tombstone& t) {
+        // A collection tombstone with timestamp T can be created with:
+        // UPDATE ks.t USING TIMESTAMP T + 1 SET X = null WHERE ...
+        // (where X is a collection column).
+        // This is, among others, the reason why we show it in the CDC log
+        // with cdc$time using timestamp T + 1 instead of T.
+        visit(t.timestamp + 1);
+    }
+    void live_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
+    void marker(const row_marker& rm) { visit(rm.timestamp()); }
+    void static_row_cells(auto&& visit_row_cells) { visit_row_cells(*this); }
+    void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) { visit_row_cells(*this); }
+    void clustered_row_delete(const clustering_key&, const tombstone& t) { visit(t.timestamp); }
+    void range_delete(const range_tombstone& t) { visit(t.tomb.timestamp); }
+    void partition_delete(const tombstone& t) { visit(t.timestamp); }
+};

-        return false;
-    };
+/* Find some timestamp inside the given mutation.
+ *
+ * If this mutation was created using a single insert/update/delete statement, then it will have a single,
+ * well-defined timestamp (even if this timestamp occurs multiple times, e.g. in a cell and row_marker).
+ *
+ * This function shouldn't be used for mutations that have multiple different timestamps: the function
+ * would only find one of them. When dealing with such mutations, the caller should first split the mutation
+ * into multiple ones, each with a single timestamp.
+ */
+api::timestamp_type find_timestamp(const mutation& m) {
+    find_timestamp_visitor v;

-    bool had_static_row = false;
+    cdc::inspect_mutation(m, v);

-    bool should_split = false;
-    p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-        had_static_row = true;
-
-        auto& cdef = base_schema.column_at(column_kind::static_column, id);
-        if (cdef.is_atomic()) {
-            auto view = cell.as_atomic_cell(cdef);
-            if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                should_split = true;
-            }
-            return;
-        }
-
-        cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-            auto desc = mview.materialize(*cdef.type);
-            for (auto& [k, v]: desc.cells) {
-                if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-
-            if (desc.tomb) {
-                if (check_or_set(desc.tomb.timestamp + 1, gc_clock::duration(0))) {
-                    should_split = true;
-                    return;
-                }
-            }
-        });
-    });
-
-    if (should_split) {
-        return true;
+    if (v._ts == api::missing_timestamp) {
+        throw std::runtime_error("cdc: could not find timestamp of mutation");
    }

-    bool had_clustered_row = false;
-
-    if (!p.clustered_rows().empty() && had_static_row) {
-        return true;
-    }
-    for (const rows_entry& cr : p.clustered_rows()) {
-        had_clustered_row = true;
-
-        const auto& marker = cr.row().marker();
-        if (marker.is_live() && check_or_set(marker.timestamp(), marker.is_expiring() ? marker.ttl() : gc_clock::duration(0))) {
-            return true;
-        }
-
-        bool is_insert = marker.is_live();
-
-        bool had_cells = false;
-        cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
-            had_cells = true;
-
-            auto& cdef = base_schema.column_at(column_kind::regular_column, id);
-            if (cdef.is_atomic()) {
-                auto view = cell.as_atomic_cell(cdef);
-                if (check_or_set(view.timestamp(), view.is_live_and_has_ttl() ? view.ttl() : gc_clock::duration(0))) {
-                    should_split = true;
-                }
-                return;
-            }
-
-            cell.as_collection_mutation().with_deserialized(*cdef.type, [&] (collection_mutation_view_description mview) {
-                for (auto& [k, v]: mview.cells) {
-                    if (check_or_set(v.timestamp(), v.is_live_and_has_ttl() ? v.ttl() : gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-
-                    if (is_insert) {
-                        // nonatomic updates cannot be expressed with an INSERT.
-                        should_split = true;
-                        return;
-                    }
-                }
-
-                if (mview.tomb) {
-                    if (check_or_set(mview.tomb.timestamp + 1, gc_clock::duration(0))) {
-                        should_split = true;
-                        return;
-                    }
-                }
-            });
-        });
-
-        if (should_split) {
-            return true;
-        }
-
-        auto row_tomb = cr.row().deleted_at().regular();
-        if (row_tomb) {
-            if (had_cells) {
-                return true;
-            }
-
-            // there were no cells, so no ttl
-            assert(!found_ttl);
-            if (found_ts != api::missing_timestamp && found_ts != row_tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = row_tomb.timestamp;
-        }
-    }
-
-    if (!p.row_tombstones().empty() && (had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    for (const auto& rt: p.row_tombstones()) {
-        if (rt.tomb) {
-            if (found_ts != api::missing_timestamp && found_ts != rt.tomb.timestamp) {
-                return true;
-            }
-
-            found_ts = rt.tomb.timestamp;
-        }
-    }
-
-    if (p.partition_tombstone().timestamp != api::missing_timestamp
-            && (!p.row_tombstones().empty() || had_static_row || had_clustered_row)) {
-        return true;
-    }
-
-    // A mutation with no timestamp will be split into 0 mutations
-    return found_ts == api::missing_timestamp;
+    return v._ts;
 }

-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)> f) {
-    auto changes = extract_changes(base_mutation, *base_schema);
+/* If a mutation contains multiple timestamps, multiple ttls, or multiple types of changes
+ * (e.g. it was created from a batch that both updated a clustered row and deleted a clustered row),
+ * we split it into multiple mutations, each with exactly one timestamp, at most one ttl, and a single type of change.
+ * We also split if we find both a change with no ttl (e.g. a cell tombstone) and a change with ttl (e.g. a ttled cell update).
+ *
+ * The `should_split` function checks whether the mutation requires such splitting, using `should_split_visitor`.
+ * The visitor uses the order in which the mutation is being visited (see the documentation of ChangeVisitor),
+ * remembers a bunch of state based on whatever was visited until now (e.g. was there a static row update?
+ * Was there a clustered row update? Was there a clustered row delete? Was there a TTL?)
+ * and tells the caller to stop on the first occurrence of a second timestamp/ttl/type of change.
+ */
+struct should_split_visitor {
+    bool _had_static_row = false;
+    bool _had_clustered_row = false;
+    bool _had_upsert = false;
+    bool _had_row_marker = false;
+    bool _had_range_delete = false;
+
+    bool _result = false;
+
+    // This becomes a valid (non-missing) timestamp after visiting the first change.
+    // Then, if we encounter any different timestamp, it means that we should split.
+    api::timestamp_type _ts = api::missing_timestamp;
+
+    // This becomes non-null after visiting the first change.
+    // If the change did not have a ttl (e.g. a non-ttled cell, or a tombstone), we store gc_clock::duration(0) there,
+    // because specifying ttl = 0 is equivalent to not specifying a TTL.
+    // Otherwise we store the change's ttl.
+    std::optional<gc_clock::duration> _ttl = std::nullopt;
+
+    inline bool finished() const { return _result; }
+    inline void stop() { _result = true; }
+
+    void visit(api::timestamp_type ts, gc_clock::duration ttl = gc_clock::duration(0)) {
+        if (_ts != api::missing_timestamp && _ts != ts) {
+            return stop();
+        }
+        _ts = ts;
+
+        if (_ttl && *_ttl != ttl) {
+            return stop();
+        }
+        _ttl = { ttl };
+    }
+
+    void visit(const atomic_cell_view& cell) { visit(cell.timestamp(), get_ttl(cell)); }
+
+    void live_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+    void dead_atomic_cell(const column_definition&, const atomic_cell_view& cell) { visit(cell); }
+
+    void collection_tombstone(const tombstone& t) { visit(t.timestamp + 1); }
+
+    void live_collection_cell(bytes_view, const atomic_cell_view& cell) {
+        if (_had_row_marker) {
+            // nonatomic updates cannot be expressed with an INSERT.
+            return stop();
+        }
+        visit(cell);
+    }
+    void dead_collection_cell(bytes_view, const atomic_cell_view& cell) { visit(cell); }
+    void collection_column(const column_definition&, auto&& visit_collection) { visit_collection(*this); }
+
+    void marker(const row_marker& rm) {
+        _had_row_marker = true;
+        visit(rm.timestamp(), get_ttl(rm));
+    }
+
+    void static_row_cells(auto&& visit_row_cells) {
+        _had_static_row = true;
+        visit_row_cells(*this);
+    }
+
+    void clustered_row_cells(const clustering_key&, auto&& visit_row_cells) {
+        if (_had_static_row) {
+            return stop();
+        }
+        _had_clustered_row = _had_upsert = true;
+        visit_row_cells(*this);
+    }
+
+    void clustered_row_delete(const clustering_key&, const tombstone& t) {
+        if (_had_static_row || _had_upsert) {
+            return stop();
+        }
+        _had_clustered_row = true;
+        visit(t.timestamp);
+    }
+
+    void range_delete(const range_tombstone& t) {
+        if (_had_static_row || _had_clustered_row) {
+            return stop();
+        }
+        _had_range_delete = true;
+        visit(t.tomb.timestamp);
+    }
+
+    void partition_delete(const tombstone&) {
+        if (_had_range_delete || _had_static_row || _had_clustered_row) {
+            return stop();
+        }
+    }
+};
+
+bool should_split(const mutation& m) {
+    should_split_visitor v;
+
+    cdc::inspect_mutation(m, v);
+
+    return v._result
+    // A mutation with no timestamp will be split into 0 mutations:
+        || v._ts == api::missing_timestamp;
+}
+
+void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage) {
+    const auto base_schema = base_mutation.schema();
+    auto changes = extract_changes(base_mutation);
    auto pk = base_mutation.key();

+    if (changes.empty()) {
+        return;
+    }
+
+    const auto last_timestamp = changes.rbegin()->first;
+
    for (auto& [change_ts, btch] : changes) {
-        auto tuuid = timeuuid_type->decompose(generate_timeuuid(change_ts));
-        int batch_no = 0;
+        const bool is_last = change_ts == last_timestamp;
+        processor.begin_timestamp(change_ts, is_last);
+
+        clustered_column_set affected_clustered_columns_per_row{clustering_key::less_compare(*base_schema)};
+        one_kind_column_set affected_static_columns{base_schema->static_columns_count()};
+
+        if (enable_preimage || enable_postimage) {
+            affected_static_columns = btch.get_affected_static_columns(*base_schema);
+            affected_clustered_columns_per_row = btch.get_affected_clustered_columns_per_row(*base_mutation.schema());
+        }
+
+        if (enable_preimage) {
+            if (affected_static_columns.count() > 0) {
+                processor.produce_preimage(nullptr, affected_static_columns);
+            }
+            for (const auto& [ck, affected_row_cells] : affected_clustered_columns_per_row) {
+                processor.produce_preimage(&ck, affected_row_cells);
+            }
+        }

        for (auto& sr_update : btch.static_updates) {
            mutation m(base_schema, pk);
@@ -434,7 +622,7 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
                auto& cdef = base_schema->column_at(column_kind::static_column, nonatomic_update.id);
                m.set_static_cell(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
            }
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_insert : btch.clustered_inserts) {
@@ -451,7 +639,7 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
            }
            row.apply(cr_insert.marker);

-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_update : btch.clustered_updates) {
@@ -467,27 +655,86 @@ void for_each_change(const mutation& base_mutation, const schema_ptr& base_schem
                row.apply(cdef, collection_mutation_description{nonatomic_update.t, std::move(nonatomic_update.cells)}.serialize(*cdef.type));
            }

-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& cr_delete : btch.clustered_row_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply_delete(*base_schema, cr_delete.key, cr_delete.t);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        for (auto& crange_delete : btch.clustered_range_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply_delete(*base_schema, crange_delete.rt);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }

        if (btch.partition_deletions) {
            mutation m(base_schema, pk);
            m.partition().apply(btch.partition_deletions->t);
-            f(std::move(m), change_ts, tuuid, batch_no);
+            processor.process_change(m);
        }
+
+        if (enable_postimage) {
+            if (affected_static_columns.count() > 0) {
+                processor.produce_postimage(nullptr);
+            }
+            for (const auto& [ck, crow] : affected_clustered_columns_per_row) {
+                processor.produce_postimage(&ck);
+            }
+        }
+
+        processor.end_record();
    }
 }

+void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage) {
+    auto ts = find_timestamp(base_mutation);
+    processor.begin_timestamp(ts, true);
+
+    const auto base_schema = base_mutation.schema();
+
+    if (enable_preimage) {
+        const auto& p = base_mutation.partition();
+
+        one_kind_column_set columns{base_schema->static_columns_count()};
+        if (!p.static_row().empty()) {
+            p.static_row().get().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+                columns.set(id);
+            });
+            processor.produce_preimage(nullptr, columns);
+        }
+
+        columns.resize(base_schema->regular_columns_count());
+        for (const rows_entry& cr : p.clustered_rows()) {
+            columns.reset();
+            if (cr.row().deleted_at().regular()) {
+                // Row deleted - include all columns in preimage
+                columns.set(0, base_schema->regular_columns_count(), true);
+            } else {
+                cr.row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
+                    columns.set(id);
+                });
+            }
+            processor.produce_preimage(&cr.key(), columns);
+        }
+    }
+
+    processor.process_change(base_mutation);
+
+    if (enable_postimage) {
+        const auto& p = base_mutation.partition();
+        if (!p.static_row().empty()) {
+            processor.produce_postimage(nullptr);
+        }
+        for (const rows_entry& cr : p.clustered_rows()) {
+            processor.produce_postimage(&cr.key());
+        }
+    }
+
+    processor.end_record();
+}
+
 } // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -22,6 +22,7 @@
 #pragma once

 #include <vector>
+#include <boost/dynamic_bitset.hpp>
 #include "schema_fwd.hh"
 #include "timestamp.hh"
 #include "bytes.hh"
@@ -31,8 +32,61 @@ class mutation;

 namespace cdc {

-bool should_split(const mutation& base_mutation, const schema& base_schema);
-void for_each_change(const mutation& base_mutation, const schema_ptr& base_schema,
-        seastar::noncopyable_function<void(mutation, api::timestamp_type, bytes, int&)>);
+// Represents a set of column ids of one kind (partition key, clustering key, regular row or static row).
+// There already exists a column_set type, but it keeps ordinal_column_ids, not column_ids (ordinal column ids
+// are unique across whole table, while kind-specific ids are unique only within one column kind).
+// To avoid converting back and forth between ordinal and kind-specific ids, one_kind_column_set is used instead.
+using one_kind_column_set = boost::dynamic_bitset<uint64_t>;
+
+// An object that processes changes from a single, big mutation.
+// It is intended to be used with process_changes_xxx_splitting. Those functions define the order and layout in which
+// changes should appear in CDC log, and change_processor is responsible for producing CDC log rows from changes given
+// by those two functions.
+//
+// The flow of calling its methods should go as follows:
+//   -> begin_timestamp #1
+//     -> produce_preimage (one call for each preimage row to be generated)
+//     -> process_change (one call for each part generated by the splitting function)
+//     -> produce_postimage (one call for each postimage row to be generated)
+//   -> begin_timestamp #2
+//   ...
+class change_processor {
+protected:
+    ~change_processor() {};
+public:
+    // Tells the processor that changes that follow from now on will be of given timestamp.
+    // This method must be called in increasing timestamp order.
+    // begin_timestamp can be called only once for a given timestamp and change_processor object.
+    //   ts - timestamp of mutation parts
+    //   is_last - determines if this will be the last timestamp to be processed by this change_processor instance.
+    virtual void begin_timestamp(api::timestamp_type ts, bool is_last) = 0;
+
+    // Tells the processor to produce a preimage for a given clustering/static row.
+    //   ck - clustering key of the row for which to produce a preimage; if nullptr, static row preimage is requested
+    //   columns_to_include - include information about the current state of those columns only, leave others as null
+    virtual void produce_preimage(const clustering_key* ck, const one_kind_column_set& columns_to_include) = 0;
+
+    // Tells the processor to produce a postimage for a given clustering/static row.
+    // Contrary to preimage, this requires data from all columns to be present.
+    //   ck - clustering key of the row for which to produce a postimage; if nullptr, static row postimage is requested
+    virtual void produce_postimage(const clustering_key* ck) = 0;
+
+    // Processes a smaller mutation which is a subset of the big mutation.
+    // The mutation provided to process_change should be simple enough for it to be possible to convert it
+    // into CDC log rows - for example, it cannot represent a write to two columns of the same row, where
+    // both columns have different timestamp or TTL set.
+    //   m - the small mutation to be converted into CDC log rows.
+    virtual void process_change(const mutation& m) = 0;
+
+    // Tells processor we have reached end of record - last part
+    // of a given timestamp batch
+    virtual void end_record() = 0;
+};
+
+bool should_split(const mutation& base_mutation);
+void process_changes_with_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage);
+void process_changes_without_splitting(const mutation& base_mutation, change_processor& processor,
+        bool enable_preimage, bool enable_postimage);

 }
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -67,8 +67,8 @@ public:
        int operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
            auto type = _s.get().clustering_key_prefix_type();
            auto res = prefix_equality_tri_compare(type->types().begin(),
-                type->begin(p1), type->end(p1),
-                type->begin(p2), type->end(p2),
+                type->begin(p1.representation()), type->end(p1.representation()),
+                type->begin(p2.representation()), type->end(p2.representation()),
                ::tri_compare);
            if (res) {
                return res;
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -72,7 +72,14 @@ public:
        }
        return result;
    }
-    class position_range_iterator : public std::iterator<std::input_iterator_tag, const position_range> {
+    class position_range_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const position_range;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const position_range*;
+        using reference = const position_range&;
+    private:
        set_type::iterator _i;
    public:
        position_range_iterator(set_type::iterator i) : _i(i) {}
--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -136,4 +136,4 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
-bytes serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
+bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -21,8 +21,7 @@

 #pragma once

-#include <json/json.h>
-
+#include "utils/rjson.hh"
 #include "bytes.hh"

 class schema;
@@ -47,7 +46,7 @@ public:
    virtual ~column_computation() = default;

    static column_computation_ptr deserialize(bytes_view raw);
-    static column_computation_ptr deserialize(const Json::Value& json);
+    static column_computation_ptr deserialize(const rjson::value& json);

    virtual column_computation_ptr clone() const = 0;

@@ -55,6 +54,36 @@ public:
    virtual bytes_opt compute_value(const schema& schema, const partition_key& key, const clustering_row& row) const = 0;
 };

+/*
+ * Computes token value of partition key and returns it as bytes.
+ *
+ * Should NOT be used (use token_column_computation), because ordering
+ * of bytes is different than ordering of tokens (signed vs unsigned comparison).
+ *
+ * The type name stored for computations of this class is "token" - this was
+ * the original implementation. (now deprecated for new tables)
+ */
+class legacy_token_column_computation : public column_computation {
+public:
+    virtual column_computation_ptr clone() const override {
+        return std::make_unique<legacy_token_column_computation>(*this);
+    }
+    virtual bytes serialize() const override;
+    virtual bytes_opt compute_value(const schema& schema, const partition_key& key, const clustering_row& row) const override;
+};
+
+
+/*
+ * Computes token value of partition key and returns it as long_type.
+ * The return type means that it can be trivially sorted (for example
+ * if computed column using this computation is a clustering key),
+ * preserving the correct order of tokens (using signed comparisons).
+ *
+ * Please use this class instead of legacy_token_column_computation.
+ * 
+ * The type name stored for computations of this class is "token_v2".
+ * (the name "token" refers to the deprecated legacy_token_column_computation)
+ */
 class token_column_computation : public column_computation {
 public:
    virtual column_computation_ptr clone() const override {
--- a/compound.hh
+++ b/compound.hh
@@ -73,12 +73,19 @@ private:
     *   <len(value1)><value1><len(value2)><value2>...<len(value_n)><value_n>
     *
     */
-    template<typename RangeOfSerializedComponents, typename CharOutputIterator>
-    static void serialize_value(RangeOfSerializedComponents&& values, CharOutputIterator& out) {
+    template<typename RangeOfSerializedComponents, FragmentedMutableView Out>
+    static void serialize_value(RangeOfSerializedComponents&& values, Out out) {
        for (auto&& val : values) {
            assert(val.size() <= std::numeric_limits<size_type>::max());
            write<size_type>(out, size_type(val.size()));
-            out = std::copy(val.begin(), val.end(), out);
+            using val_type = std::remove_cvref_t<decltype(val)>;
+            if constexpr (FragmentedView<val_type>) {
+                write_fragmented(out, val);
+            } else if constexpr (std::same_as<val_type, managed_bytes>) {
+                write_fragmented(out, managed_bytes_view(val));
+            } else {
+                write_fragmented(out, single_fragmented_view(val));
+            }
        }
    }
    template <typename RangeOfSerializedComponents>
@@ -90,25 +97,27 @@ private:
        return len;
    }
 public:
-    bytes serialize_single(bytes&& v) const {
+    managed_bytes serialize_single(managed_bytes&& v) const {
+        return serialize_value({std::move(v)});
+    }
+    managed_bytes serialize_single(bytes&& v) const {
        return serialize_value({std::move(v)});
    }
    template<typename RangeOfSerializedComponents>
-    static bytes serialize_value(RangeOfSerializedComponents&& values) {
+    static managed_bytes serialize_value(RangeOfSerializedComponents&& values) {
        auto size = serialized_size(values);
        if (size > std::numeric_limits<size_type>::max()) {
            throw std::runtime_error(format("Key size too large: {:d} > {:d}", size, std::numeric_limits<size_type>::max()));
        }
-        bytes b(bytes::initialized_later(), size);
-        auto i = b.begin();
-        serialize_value(values, i);
+        managed_bytes b(managed_bytes::initialized_later(), size);
+        serialize_value(values, managed_bytes_mutable_view(b));
        return b;
    }
    template<typename T>
-    static bytes serialize_value(std::initializer_list<T> values) {
+    static managed_bytes serialize_value(std::initializer_list<T> values) {
        return serialize_value(boost::make_iterator_range(values.begin(), values.end()));
    }
-    bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
+    managed_bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
        return serialize_value(values | boost::adaptors::transformed([] (const bytes_opt& bo) -> bytes_view {
            if (!bo) {
                throw std::logic_error("attempted to create key component from empty optional");
@@ -116,7 +125,7 @@ public:
            return *bo;
        }));
    }
-    bytes serialize_value_deep(const std::vector<data_value>& values) const {
+    managed_bytes serialize_value_deep(const std::vector<data_value>& values) const {
        // TODO: Optimize
        std::vector<bytes> partial;
        partial.reserve(values.size());
@@ -127,19 +136,26 @@ public:
        }
        return serialize_value(partial);
    }
-    bytes decompose_value(const value_type& values) const {
+    managed_bytes decompose_value(const value_type& values) const {
        return serialize_value(values);
    }
-    class iterator : public std::iterator<std::input_iterator_tag, const bytes_view> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const managed_bytes_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const value_type*;
+        using reference = const value_type&;
    private:
-        bytes_view _v;
-        bytes_view _current;
+        managed_bytes_view _v;
+        managed_bytes_view _current;
+        size_t _remaining = 0;
    private:
        void read_current() {
+            _remaining = _v.size_bytes();
            size_type len;
            {
                if (_v.empty()) {
-                    _v = bytes_view(nullptr, 0);
                    return;
                }
                len = read_simple<size_type>(_v);
@@ -147,15 +163,16 @@ public:
                    throw_with_backtrace<marshal_exception>(format("compound_type iterator - not enough bytes, expected {:d}, got {:d}", len, _v.size()));
                }
            }
-            _current = bytes_view(_v.begin(), len);
-            _v.remove_prefix(len);
+            _current = _v.prefix(len);
+            _v.remove_prefix(_current.size_bytes());
        }
    public:
        struct end_iterator_tag {};
-        iterator(const bytes_view& v) : _v(v) {
+        iterator(const managed_bytes_view& v) : _v(v) {
            read_current();
        }
-        iterator(end_iterator_tag, const bytes_view& v) : _v(nullptr, 0) {}
+        iterator(end_iterator_tag, const managed_bytes_view& v) : _v() {}
+        iterator() {}
        iterator& operator++() {
            read_current();
            return *this;
@@ -167,29 +184,40 @@ public:
        }
        const value_type& operator*() const { return _current; }
        const value_type* operator->() const { return &_current; }
-        bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
-        bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
+        bool operator==(const iterator& i) const { return _remaining == i._remaining; }
    };
-    static iterator begin(const bytes_view& v) {
+    static iterator begin(managed_bytes_view v) {
        return iterator(v);
    }
-    static iterator end(const bytes_view& v) {
+    static iterator end(managed_bytes_view v) {
        return iterator(typename iterator::end_iterator_tag(), v);
    }
-    static boost::iterator_range<iterator> components(const bytes_view& v) {
+    static boost::iterator_range<iterator> components(managed_bytes_view v) {
        return { begin(v), end(v) };
    }
-    value_type deserialize_value(bytes_view v) const {
+    value_type deserialize_value(managed_bytes_view v) const {
        std::vector<bytes> result;
        result.reserve(_types.size());
        std::transform(begin(v), end(v), std::back_inserter(result), [] (auto&& v) {
-            return bytes(v.begin(), v.end());
+            return to_bytes(v);
        });
        return result;
    }
+    bool less(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return less(bv1, bv2);
+            });
+        });
+    }
    bool less(bytes_view b1, bytes_view b2) const {
        return compare(b1, b2) < 0;
    }
+    size_t hash(managed_bytes_view v) const{
+        return with_linearized(v, [&] (bytes_view v) {
+            return hash(v);
+        });
+    }
    size_t hash(bytes_view v) const {
        if (_byte_order_equal) {
            return std::hash<bytes_view>()(v);
@@ -202,6 +230,13 @@ public:
        }
        return h;
    }
+    int compare(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return compare(bv1, bv2);
+            });
+        });
+    }
    int compare(bytes_view b1, bytes_view b2) const {
        if (_byte_order_comparable) {
            if (_is_reversed) {
@@ -216,15 +251,21 @@ public:
            });
    }
    // Retruns true iff given prefix has no missing components
-    bool is_full(bytes_view v) const {
+    bool is_full(managed_bytes_view v) const {
        assert(AllowPrefixes == allow_prefixes::yes);
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
+    bool is_empty(managed_bytes_view v) const {
+        return v.empty();
+    }
+    bool is_empty(const managed_bytes& v) const {
+        return v.empty();
+    }
    bool is_empty(bytes_view v) const {
        return begin(v) == end(v);
    }
-    void validate(bytes_view v) const {
-        std::vector<bytes_view> values(begin(v), end(v));
+    void validate(managed_bytes_view v) const {
+        std::vector<managed_bytes_view> values(begin(v), end(v));
        if (AllowPrefixes == allow_prefixes::no && values.size() < _types.size()) {
            throw marshal_exception(fmt::format("compound::validate(): non-prefixable compound cannot be a prefix"));
        }
@@ -237,6 +278,13 @@ public:
            _types[i]->validate(values[i], cql_serialization_format::internal());
        }
    }
+    bool equal(managed_bytes_view v1, managed_bytes_view v2) const {
+        return with_linearized(v1, [&] (bytes_view bv1) {
+            return with_linearized(v2, [&] (bytes_view bv2) {
+                return equal(bv1, bv2);
+            });
+        });
+    }
    bool equal(bytes_view v1, bytes_view v2) const {
        if (_byte_order_equal) {
            return compare_unsigned(v1, v2) == 0;
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -54,14 +54,21 @@ template <typename CompoundType>
 class legacy_compound_view {
    static_assert(!CompoundType::is_prefixable, "Legacy view not defined for prefixes");
    CompoundType& _type;
-    bytes_view _packed;
+    managed_bytes_view _packed;
 public:
-    legacy_compound_view(CompoundType& c, bytes_view packed)
+    legacy_compound_view(CompoundType& c, managed_bytes_view packed)
        : _type(c)
        , _packed(packed)
    { }

-    class iterator : public std::iterator<std::input_iterator_tag, bytes::value_type> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = bytes::value_type;
+        using difference_type = std::ptrdiff_t;
+        using pointer = bytes::value_type*;
+        using reference = bytes::value_type&;
+    private:
        bool _singular;
        // Offset within virtual output space of a component.
        //
@@ -140,18 +147,18 @@ public:
        { }

        // @k1 and @k2 must be serialized using @type, which was passed to the constructor.
-        int operator()(bytes_view k1, bytes_view k2) const {
+        int operator()(managed_bytes_view k1, managed_bytes_view k2) const {
            if (_type.is_singular()) {
                return compare_unsigned(*_type.begin(k1), *_type.begin(k2));
            }
            return lexicographical_tri_compare(
                _type.begin(k1), _type.end(k1),
                _type.begin(k2), _type.end(k2),
-                [] (const bytes_view& c1, const bytes_view& c2) -> int {
-                    if (c1.size() != c2.size()) {
-                        return c1.size() < c2.size() ? -1 : 1;
+                [] (const managed_bytes_view& c1, const managed_bytes_view& c2) -> int {
+                    if (c1.size() != c2.size() || !c1.size()) {
+                        return c1.size() < c2.size() ? -1 : c1.size() ? 1 : 0;
                    }
-                    return memcmp(c1.begin(), c2.begin(), c1.size());
+                    return compare_unsigned(c1, c2);
                });
        }
    };
@@ -181,7 +188,7 @@ public:
 // @packed is assumed to be serialized using supplied @type.
 template <typename CompoundType>
 static inline
-bytes to_legacy(CompoundType& type, bytes_view packed) {
+bytes to_legacy(CompoundType& type, managed_bytes_view packed) {
    legacy_compound_view<CompoundType> lv(type, packed);
    bytes legacy_form(bytes::initialized_later(), lv.size());
    std::copy(lv.begin(), lv.end(), legacy_form.begin());
@@ -257,6 +264,12 @@ private:
    static void write_value(Value&& val, CharOutputIterator& out) {
        out = std::copy(val.begin(), val.end(), out);
    }
+    template<typename CharOutputIterator>
+    static void write_value(managed_bytes_view val, CharOutputIterator& out) {
+        for (bytes_view frag : fragment_range(val)) {
+            out = std::copy(frag.begin(), frag.end(), out);
+        }
+    }
    template <typename CharOutputIterator>
    static void write_value(const data_value& val, CharOutputIterator& out) {
        val.serialize(out);
@@ -339,7 +352,14 @@ public:
        return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
    }

-    class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const component_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const component_view*;
+        using reference = const component_view&;
+    private:
        bytes_view _v;
        component_view _current;
        bool _strict_mode = true;
@@ -391,6 +411,7 @@ public:
        iterator(end_iterator_tag) : _v(nullptr, 0) {}

    public:
+        iterator() : iterator(end_iterator_tag()) {}
        iterator& operator++() {
            read_current();
            return *this;
--- a/compress.cc
+++ b/compress.cc
@@ -205,7 +205,7 @@ void compression_parameters::validate_options(const std::map<sstring, sstring>&
        ckw = _compressor->option_names();
    }
    for (auto&& opt : options) {
-        if (!keywords.count(opt.first) && !ckw.count(opt.first)) {
+        if (!keywords.contains(opt.first) && !ckw.contains(opt.first)) {
            throw exceptions::configuration_exception(format("Unknown compression option '{}'.", opt.first));
        }
    }
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -99,9 +99,14 @@ listen_address: localhost
 # listen_on_broadcast_address: false

 # port for the CQL native transport to listen for clients on
-# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+# For security reasons, you should not expose this port to the internet. Firewall it if needed.
+# To disable the CQL native transport, remove this option and configure native_transport_port_ssl.
 native_transport_port: 9042

+# Like native_transport_port, but clients are forwarded to specific shards, based on the
+# client-side port numbers.
+native_shard_aware_transport_port: 19042
+
 # Enabling native transport encryption in client_encryption_options allows you to either use
 # encryption for the standard port or to use a dedicated, additional port along with the unencrypted
 # standard native_transport_port.
@@ -111,6 +116,10 @@ native_transport_port: 9042
 # keeping native_transport_port unencrypted.
 #native_transport_port_ssl: 9142

+# Like native_transport_port_ssl, but clients are forwarded to specific shards, based on the
+# client-side port numbers.
+#native_shard_aware_transport_port_ssl: 19142
+
 # How long the coordinator should wait for read operations to complete
 read_request_timeout_in_ms: 5000

@@ -221,6 +230,9 @@ batch_size_fail_threshold_in_kb: 50
 # - PasswordAuthenticator relies on username/password pairs to authenticate
 #   users. It keeps usernames and hashed passwords in system_auth.credentials table.
 #   Please increase system_auth keyspace replication factor if you use this authenticator.
+# - com.scylladb.auth.TransitionalAuthenticator requires username/password pair
+#   to authenticate in the same manner as PasswordAuthenticator, but improper credentials
+#   result in being logged in as an anonymous user. Use for upgrading clusters' auth.
 # authenticator: AllowAllAuthenticator

 # Authorization backend, implementing IAuthorizer; used to limit access/provide permissions
@@ -230,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50
 # - AllowAllAuthorizer allows any action to any user - set it to disable authorization.
 # - CassandraAuthorizer stores permissions in system_auth.permissions table. Please
 #   increase system_auth keyspace replication factor if you use this authorizer.
+# - com.scylladb.auth.TransitionalAuthorizer wraps around the CassandraAuthorizer, using it for
+#   authorizing permission management. Otherwise, it allows all. Use for upgrading
+#   clusters' auth.
 # authorizer: AllowAllAuthorizer

 # initial_token allows you to specify tokens manually.  While you can use # it with
--- a/configure.py
+++ b/configure.py
@@ -34,7 +34,9 @@ from distutils.spawn import find_executable

 curdir = os.getcwd()

-tempfile.tempdir = "./build/tmp"
+outdir = 'build'
+
+tempfile.tempdir = f"{outdir}/tmp"

 configure_args = str.join(' ', [shlex.quote(x) for x in sys.argv[1:]])

@@ -56,6 +58,10 @@ i18n_xlat = {
    },
 }

+python3_dependencies = subprocess.run('./install-dependencies.sh --print-python3-runtime-packages', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_filename = subprocess.run('./install-dependencies.sh --print-node-exporter-filename', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_dirname = re.sub(r'\.tar\.gz$', '', os.path.basename(node_exporter_filename))  # drop only the literal suffix; str.rstrip() would strip a character set
+

 def pkgname(name):
    if name in i18n_xlat:
@@ -249,28 +255,33 @@ def find_headers(repodir, excluded_dirs):

 modes = {
    'debug': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-Wstack-usage=%s' % (1024*40),
+        'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
+        'stack-usage-threshold': 1024*40,
    },
    'release': {
-        'cxxflags': '',
-        'cxx_ld_flags': '-O3 -Wstack-usage=%s' % (1024*13),
+        'cxxflags': '-O3 -ffunction-sections -fdata-sections ',
+        'cxx_ld_flags': '-Wl,--gc-sections',
+        'stack-usage-threshold': 1024*13,
    },
    'dev': {
-        'cxxflags': '-DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-O1 -Wstack-usage=%s' % (1024*21),
+        'cxxflags': '-O1 -DDEVEL -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
+        'stack-usage-threshold': 1024*21,
    },
    'sanitize': {
-        'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-Os -Wstack-usage=%s' % (1024*50),
+        'cxxflags': '-Os -DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
+        'stack-usage-threshold': 1024*50,
    }
 }

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/anchorless_list_test',
    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
@@ -290,6 +301,7 @@ scylla_tests = set([
    'test/boost/checksum_utils_test',
    'test/boost/chunked_vector_test',
    'test/boost/clustering_ranges_walker_test',
+    'test/boost/column_mapping_test',
    'test/boost/commitlog_test',
    'test/boost/compound_test',
    'test/boost/compress_test',
@@ -306,6 +318,7 @@ scylla_tests = set([
    'test/boost/crc_test',
    'test/boost/data_listeners_test',
    'test/boost/database_test',
+    'test/boost/double_decker_test',
    'test/boost/duration_test',
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
@@ -320,10 +333,14 @@ scylla_tests = set([
    'test/boost/gossip_test',
    'test/boost/gossiping_property_file_snitch_test',
    'test/boost/hash_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
+    'test/boost/imr_test',
    'test/boost/input_stream_test',
    'test/boost/json_cql_query_test',
+    'test/boost/json_test',
    'test/boost/keys_test',
+    'test/boost/large_paging_state_test',
    'test/boost/like_matcher_test',
    'test/boost/limiting_data_source_test',
    'test/boost/linearizing_input_stream_test',
@@ -332,6 +349,8 @@ scylla_tests = set([
    'test/boost/estimated_histogram_test',
    'test/boost/logalloc_test',
    'test/boost/managed_vector_test',
+    'test/boost/managed_bytes_test',
+    'test/boost/intrusive_array_test',
    'test/boost/map_difference_test',
    'test/boost/memtable_test',
    'test/boost/meta_test',
@@ -353,6 +372,7 @@ scylla_tests = set([
    'test/boost/range_test',
    'test/boost/range_tombstone_list_test',
    'test/boost/reusable_buffer_test',
+    'test/boost/restrictions_test',
    'test/boost/role_manager_test',
    'test/boost/row_cache_test',
    'test/boost/schema_change_test',
@@ -371,10 +391,10 @@ scylla_tests = set([
    'test/boost/sstable_resharding_test',
    'test/boost/sstable_directory_test',
    'test/boost/sstable_test',
+    'test/boost/sstable_move_test',
    'test/boost/storage_proxy_test',
    'test/boost/top_k_test',
    'test/boost/transport_test',
-    'test/boost/truncation_migration_test',
    'test/boost/types_test',
    'test/boost/user_function_test',
    'test/boost/user_types_test',
@@ -386,12 +406,15 @@ scylla_tests = set([
    'test/boost/view_schema_ckey_test',
    'test/boost/vint_serialization_test',
    'test/boost/virtual_reader_test',
+    'test/boost/bptree_test',
+    'test/boost/double_decker_test',
+    'test/boost/stall_free_test',
+    'test/boost/imr_test',
    'test/manual/ec2_snitch_test',
+    'test/manual/enormous_table_scan_test',
    'test/manual/gce_snitch_test',
    'test/manual/gossip',
    'test/manual/hint_test',
-    'test/manual/imr_test',
-    'test/manual/json_test',
    'test/manual/message',
    'test/manual/partition_data_test',
    'test/manual/row_locker_test',
@@ -403,6 +426,7 @@ scylla_tests = set([
    'test/perf/perf_fast_forward',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
+    'test/perf/perf_collection',
    'test/perf/perf_row_cache_update',
    'test/perf/perf_simple_query',
    'test/perf/perf_sstable',
@@ -410,6 +434,8 @@ scylla_tests = set([
    'test/unit/lsa_sync_eviction_test',
    'test/unit/row_cache_alloc_stress_test',
    'test/unit/row_cache_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
 ])

 perf_tests = set([
@@ -421,13 +447,19 @@ perf_tests = set([
    'test/perf/perf_big_decimal',
 ])

+raft_tests = set([
+    'test/raft/replication_test',
+    'test/boost/raft_fsm_test',
+])
+
 apps = set([
    'scylla',
    'test/tools/cql_repl',
    'tools/scylla-types',
+    'tools/scylla-sstable-index',
 ])

-tests = scylla_tests | perf_tests
+tests = scylla_tests | perf_tests | raft_tests

 other = set([
    'iotune',
@@ -446,18 +478,18 @@ arg_parser.add_argument('--so', dest='so', action='store_true',
 arg_parser.add_argument('--mode', action='append', choices=list(modes.keys()), dest='selected_modes')
 arg_parser.add_argument('--with', dest='artifacts', action='append', choices=all_artifacts, default=[])
 arg_parser.add_argument('--with-seastar', action='store', dest='seastar_path', default='seastar', help='Path to Seastar sources')
+add_tristate(arg_parser, name='dist', dest='enable_dist',
+                        help='scylla-tools-java, scylla-jmx and packages')
 arg_parser.add_argument('--cflags', action='store', dest='user_cflags', default='',
                        help='Extra flags for the C++ compiler')
 arg_parser.add_argument('--ldflags', action='store', dest='user_ldflags', default='',
                        help='Extra flags for the linker')
 arg_parser.add_argument('--target', action='store', dest='target', default=default_target_arch(),
                        help='Target architecture (-march)')
-arg_parser.add_argument('--compiler', action='store', dest='cxx', default='g++',
+arg_parser.add_argument('--compiler', action='store', dest='cxx', default='clang++',
                        help='C++ compiler path')
-arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='gcc',
+arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clang',
                        help='C compiler path')
-arg_parser.add_argument('--with-osv', action='store', dest='with_osv', default='',
-                        help='Shortcut for compile for OSv')
 add_tristate(arg_parser, name='dpdk', dest='dpdk',
                        help='Use dpdk (from seastar dpdk sources) (default=True for release builds)')
 arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -480,21 +512,32 @@ arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true'
                        help='use of split dwarf (https://gcc.gnu.org/wiki/DebugFission) to speed up linking')
 arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_injector', action='store_true', default=False,
                        help='enable allocation failure injection')
+arg_parser.add_argument('--enable-seastar-debug-allocations', dest='seastar_debug_allocations', action='store_true', default=False,
+                        help='enable seastar debug allocations')
 arg_parser.add_argument('--with-antlr3', dest='antlr3_exec', action='store', default=None,
                        help='path to antlr3 executable')
 arg_parser.add_argument('--with-ragel', dest='ragel_exec', action='store', default='ragel',
        help='path to ragel executable')
+arg_parser.add_argument('--build-raft', dest='build_raft', action='store_true', default=False,
+                        help='build raft code')
 add_tristate(arg_parser, name='stack-guards', dest='stack_guards', help='Use stack guards')
+arg_parser.add_argument('--verbose', dest='verbose', action='store_true',
+                        help='Make configure.py output more verbose (useful for debugging the build process itself)')
+arg_parser.add_argument('--test-repeat', dest='test_repeat', action='store', type=str, default='1',
+                         help='Set number of times to repeat each unittest.')
+arg_parser.add_argument('--test-timeout', dest='test_timeout', action='store', type=str, default='7200')
 args = arg_parser.parse_args()

+if not args.build_raft:
+    all_artifacts.difference_update(raft_tests)
+    tests.difference_update(raft_tests)
+
 defines = ['XXH_PRIVATE_API',
           'SEASTAR_TESTING_MAIN',
 ]

 extra_cxxflags = {}

-cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
-
 scylla_core = (['database.cc',
                'absl-flat_hash_map.cc',
                'table.cc',
@@ -515,6 +558,7 @@ scylla_core = (['database.cc',
                'frozen_mutation.cc',
                'memtable.cc',
                'schema_mutations.cc',
+                'utils/array-search.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
                'utils/buffer_input_stream.cc',
@@ -522,6 +566,8 @@ scylla_core = (['database.cc',
                'utils/updateable_value.cc',
                'utils/directories.cc',
                'utils/generation-number.cc',
+                'utils/rjson.cc',
+                'utils/human_readable.cc',
                'mutation_partition.cc',
                'mutation_partition_view.cc',
                'mutation_partition_serializer.cc',
@@ -529,7 +575,6 @@ scylla_core = (['database.cc',
                'mutation_reader.cc',
                'flat_mutation_reader.cc',
                'mutation_query.cc',
-                'json.cc',
                'keys.cc',
                'counters.cc',
                'compress.cc',
@@ -537,7 +582,9 @@ scylla_core = (['database.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
-                'sstables/mc/writer.cc',
+                'sstables/sstable_set.cc',
+                'sstables/mx/writer.cc',
+                'sstables/kl/writer.cc',
                'sstables/sstable_version.cc',
                'sstables/compress.cc',
                'sstables/partition.cc',
@@ -551,6 +598,10 @@ scylla_core = (['database.cc',
                'sstables/prepended_input_stream.cc',
                'sstables/m_format_read_helpers.cc',
                'sstables/sstable_directory.cc',
+                'sstables/random_access_reader.cc',
+                'sstables/metadata_collector.cc',
+                'sstables/writer.cc',
+                'transport/cql_protocol_extension.cc',
                'transport/event.cc',
                'transport/event_notifier.cc',
                'transport/server.cc',
@@ -573,6 +624,8 @@ scylla_core = (['database.cc',
                'cql3/sets.cc',
                'cql3/tuples.cc',
                'cql3/maps.cc',
+                'cql3/values.cc',
+                'cql3/expr/expression.cc',
                'cql3/functions/user_function.cc',
                'cql3/functions/functions.cc',
                'cql3/functions/aggregate_fcts.cc',
@@ -620,6 +673,7 @@ scylla_core = (['database.cc',
                'cql3/statements/alter_keyspace_statement.cc',
                'cql3/statements/role-management-statements.cc',
                'cql3/update_parameters.cc',
+                'cql3/util.cc',
                'cql3/ut_name.cc',
                'cql3/role_name.cc',
                'thrift/handler.cc',
@@ -639,7 +693,6 @@ scylla_core = (['database.cc',
                'service/paxos/prepare_response.cc',
                'service/paxos/paxos_state.cc',
                'service/paxos/prepare_summary.cc',
-                'cql3/operator.cc',
                'cql3/relation.cc',
                'cql3/column_identifier.cc',
                'cql3/column_specification.cc',
@@ -673,6 +726,7 @@ scylla_core = (['database.cc',
                'db/data_listeners.cc',
                'db/hints/manager.cc',
                'db/hints/resource_manager.cc',
+                'db/hints/host_filter.cc',
                'db/config.cc',
                'db/extensions.cc',
                'db/heat_load_balance.cc',
@@ -683,6 +737,7 @@ scylla_core = (['database.cc',
                'db/view/view_update_generator.cc',
                'db/view/row_locking.cc',
                'db/sstables-format-selector.cc',
+                'db/snapshot-ctl.cc',
                'index/secondary_index_manager.cc',
                'index/secondary_index.cc',
                'utils/UUID_gen.cc',
@@ -750,6 +805,7 @@ scylla_core = (['database.cc',
                'streaming/stream_manager.cc',
                'streaming/stream_result_future.cc',
                'streaming/stream_session_state.cc',
+                'streaming/stream_reason.cc',
                'clocks-impl.cc',
                'partition_slice_builder.cc',
                'init.cc',
@@ -799,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
@@ -850,8 +907,8 @@ alternator = [
       'alternator/expressions.cc',
       Antlr3Grammar('alternator/expressions.g'),
       'alternator/conditions.cc',
-       'alternator/rjson.cc',
       'alternator/auth.cc',
+       'alternator/streams.cc',
 ]

 redis = [
@@ -906,6 +963,8 @@ scylla_tests_generic_dependencies = [
    'test/lib/log.cc',
    'test/lib/reader_permit.cc',
    'test/lib/test_utils.cc',
+    'test/lib/tmpdir.cc',
+    'test/lib/sstable_run_based_compaction_strategy_for_tests.cc',
 ]

 scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependencies + [
@@ -918,11 +977,21 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci
    'test/lib/random_schema.cc',
 ]

+scylla_raft_dependencies = [
+    'raft/raft.cc',
+    'raft/server.cc',
+    'raft/fsm.cc',
+    'raft/progress.cc',
+    'raft/log.cc',
+    'utils/uuid.cc'
+]
+
 deps = {
-    'scylla': idls + ['main.cc', 'release.cc', 'build_id.cc'] + scylla_core + api + alternator + redis,
+    'scylla': idls + ['main.cc', 'release.cc', 'utils/build_id.cc'] + scylla_core + api + alternator + redis,
    'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
    #FIXME: we don't need all of scylla_core here, only the types module, need to modularize scylla_core.
    'tools/scylla-types': idls + ['tools/scylla-types.cc'] + scylla_core,
+    'tools/scylla-sstable-index': idls + ['tools/scylla-sstable-index.cc'] + scylla_core,
 }

 pure_boost_tests = set([
@@ -942,7 +1011,9 @@ pure_boost_tests = set([
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
+    'test/boost/json_test',
    'test/boost/keys_test',
    'test/boost/like_matcher_test',
    'test/boost/linearizing_input_stream_test',
@@ -956,12 +1027,13 @@ pure_boost_tests = set([
    'test/boost/small_vector_test',
    'test/boost/top_k_test',
    'test/boost/vint_serialization_test',
-    'test/manual/json_test',
+    'test/boost/bptree_test',
+    'test/boost/utf8_test',
    'test/manual/streaming_histogram_test',
 ])

 tests_not_using_seastar_test_framework = set([
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/small_vector_test',
    'test/manual/gossip',
    'test/manual/message',
@@ -970,10 +1042,13 @@ tests_not_using_seastar_test_framework = set([
    'test/perf/perf_cql_parser',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
+    'test/perf/perf_collection',
    'test/perf/perf_row_cache_update',
    'test/unit/lsa_async_eviction_test',
    'test/unit/lsa_sync_eviction_test',
    'test/unit/row_cache_alloc_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
    'test/manual/sstable_scan_footprint_test',
 ]) | pure_boost_tests

@@ -1017,7 +1092,7 @@ deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']
 deps['test/perf/perf_fast_forward'] += ['release.cc']
 deps['test/perf/perf_simple_query'] += ['release.cc']
 deps['test/boost/meta_test'] = ['test/boost/meta_test.cc']
-deps['test/manual/imr_test'] = ['test/manual/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
+deps['test/boost/imr_test'] = ['test/boost/imr_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
 deps['test/boost/reusable_buffer_test'] = [
    "test/boost/reusable_buffer_test.cc",
    "test/lib/log.cc",
@@ -1032,10 +1107,14 @@ deps['test/boost/linearizing_input_stream_test'] = [
 ]

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
-deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
+deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']
+
+deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
+deps['test/boost/raft_fsm_test'] =  ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies

 deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']

+
 warnings = [
    '-Wall',
    '-Werror',
@@ -1057,6 +1136,28 @@ warnings = [
    '-Wno-ignored-attributes',
    '-Wno-overloaded-virtual',
    '-Wno-stringop-overflow',
+    '-Wno-unused-command-line-argument',
+    '-Wno-inconsistent-missing-override',
+    '-Wno-defaulted-function-deleted',
+    '-Wno-redeclared-class-member',
+    '-Wno-pessimizing-move',
+    '-Wno-redundant-move',
+    '-Wno-gnu-designator',
+    '-Wno-instantiation-after-specialization',
+    '-Wno-unused-private-field',
+    '-Wno-unsupported-friend',
+    '-Wno-unused-variable',
+    '-Wno-return-std-move',
+    '-Wno-delete-non-abstract-non-virtual-dtor',
+    '-Wno-unknown-attributes',
+    '-Wno-braced-scalar-init',
+    '-Wno-range-loop-construct',
+    '-Wno-unused-function',
+    '-Wno-implicit-int-float-conversion',
+    '-Wno-delete-abstract-non-virtual-dtor',
+    '-Wno-uninitialized-const-reference',
+    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+    '-Wno-psabi',
 ]

 warnings = [w
@@ -1066,12 +1167,17 @@ warnings = [w
 warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])

 optimization_flags = [
-    '--param inline-unit-growth=300',
+    '--param inline-unit-growth=300', # gcc
+    '-mllvm -inline-threshold=2500',  # clang
 ]
 optimization_flags = [o
                      for o in optimization_flags
                      if flag_supported(flag=o, compiler=args.cxx)]
-modes['release']['cxx_ld_flags'] += ' ' + ' '.join(optimization_flags)
+modes['release']['cxxflags'] += ' ' + ' '.join(optimization_flags)
+
+if flag_supported(flag='-Wstack-usage=4096', compiler=args.cxx):
+    for mode in modes:
+        modes[mode]['cxxflags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='

 linker_flags = linker_flags(compiler=args.cxx)

@@ -1106,9 +1212,20 @@ pkgs.append('libsystemd')


 compiler_test_src = '''
-#if __GNUC__ < 8
+
+// clang pretends to be gcc (defined __GNUC__), so we
+// must check it first
+#ifdef __clang__
+
+#if __clang_major__ < 10
    #error "MAJOR"
-#elif __GNUC__ == 8
+#endif
+
+#elif defined(__GNUC__)
+
+#if __GNUC__ < 10
+    #error "MAJOR"
+#elif __GNUC__ == 10
    #if __GNUC_MINOR__ < 1
        #error "MINOR"
    #elif __GNUC_MINOR__ == 1
@@ -1118,10 +1235,16 @@ compiler_test_src = '''
    #endif
 #endif

+#else
+
+#error "Unrecognized compiler"
+
+#endif
+
 int main() { return 0; }
 '''
 if not try_compile_and_link(compiler=args.cxx, source=compiler_test_src):
-    print('Wrong GCC version. Scylla needs GCC >= 8.1.1 to compile.')
+    print('Wrong compiler version. Scylla needs GCC >= 10.1.1 or Clang >= 10 to compile.')  # the probe above accepts either compiler
    sys.exit(1)

 if not try_compile(compiler=args.cxx, source='#include <boost/version.hpp>'):
@@ -1165,10 +1288,12 @@ if status != 0:
    print('Version file generation failed')
    sys.exit(1)

-file = open('build/SCYLLA-VERSION-FILE', 'r')
+file = open(f'{outdir}/SCYLLA-VERSION-FILE', 'r')
 scylla_version = file.read().strip()
-file = open('build/SCYLLA-RELEASE-FILE', 'r')
+file = open(f'{outdir}/SCYLLA-RELEASE-FILE', 'r')
 scylla_release = file.read().strip()
+file = open(f'{outdir}/SCYLLA-PRODUCT-FILE', 'r')
+scylla_product = file.read().strip()

 extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""

@@ -1206,11 +1331,10 @@ forced_ldflags += f'--dynamic-linker={dynamic_linker}'

 args.user_ldflags = forced_ldflags + ' ' + args.user_ldflags

-args.user_cflags += ' -Wno-error=stack-usage='
-
-args.user_cflags += f"-ffile-prefix-map={curdir}=."
+args.user_cflags += f" -ffile-prefix-map={curdir}=."

 seastar_cflags = args.user_cflags
+
 if args.target != '':
    seastar_cflags += ' -march=' + args.target
 seastar_ldflags = args.user_ldflags
@@ -1219,6 +1343,13 @@ libdeflate_cflags = seastar_cflags

 MODE_TO_CMAKE_BUILD_TYPE = {'release' : 'RelWithDebInfo', 'debug' : 'Debug', 'dev' : 'Dev', 'sanitize' : 'Sanitize' }

+# cmake likes its flag lists separated with semicolons rather than spaces
+def semicolon_separated(*flags):
+    # Each argument may itself contain several space-separated flags, so join
+    # everything with spaces first, then turn each run of spaces into one ';'.
+    f = ' '.join(flags)
+    return re.sub(' +', ';', f)
+
 def configure_seastar(build_dir, mode):
    seastar_build_dir = os.path.join(build_dir, mode, 'seastar')

@@ -1227,10 +1358,10 @@ def configure_seastar(build_dir, mode):
        '-DCMAKE_C_COMPILER={}'.format(args.cc),
        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
        '-DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON',
-        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']).replace(' ', ';')),
-        '-DSeastar_LD_FLAGS={}'.format(seastar_ldflags),
+        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags).replace(' ', ';')),
+        '-DSeastar_LD_FLAGS={}'.format(semicolon_separated(seastar_ldflags, modes[mode]['cxx_ld_flags'])),
        '-DSeastar_CXX_DIALECT=gnu++20',
-        '-DSeastar_API_LEVEL=4',
+        '-DSeastar_API_LEVEL=6',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
    ]

@@ -1240,13 +1371,15 @@ def configure_seastar(build_dir, mode):

    dpdk = args.dpdk
    if dpdk is None:
-        dpdk = mode == 'release'
+        dpdk = platform.machine() == 'x86_64' and mode == 'release'
    if dpdk:
        seastar_cmake_args += ['-DSeastar_DPDK=ON', '-DSeastar_DPDK_MACHINE=wsm']
    if args.split_dwarf:
        seastar_cmake_args += ['-DSeastar_SPLIT_DWARF=ON']
    if args.alloc_failure_injector:
        seastar_cmake_args += ['-DSeastar_ALLOC_FAILURE_INJECTION=ON']
+    if args.seastar_debug_allocations:
+        seastar_cmake_args += ['-DSeastar_DEBUG_ALLOCATIONS=ON']

    seastar_cmd = ['cmake', '-G', 'Ninja', os.path.relpath(args.seastar_path, seastar_build_dir)] + seastar_cmake_args
    cmake_dir = seastar_build_dir
@@ -1256,14 +1389,15 @@ def configure_seastar(build_dir, mode):
        relative_seastar_build_dir = os.path.join('..', seastar_build_dir)  # relative to seastar/
        seastar_cmd = ['./cooking.sh', '-i', 'dpdk', '-d', relative_seastar_build_dir, '--'] + seastar_cmd[4:]

-    print(seastar_cmd)
+    if args.verbose:
+        print(" \\\n  ".join(seastar_cmd))
    os.makedirs(seastar_build_dir, exist_ok=True)
    subprocess.check_call(seastar_cmd, shell=False, cwd=cmake_dir)

 for mode in build_modes:
-    configure_seastar('build', mode)
+    configure_seastar(outdir, mode)

-pc = {mode: 'build/{}/seastar/seastar.pc'.format(mode) for mode in build_modes}
+pc = {mode: f'{outdir}/{mode}/seastar/seastar.pc' for mode in build_modes}
 ninja = find_executable('ninja') or find_executable('ninja-build')
 if not ninja:
    print('Ninja executable (ninja or ninja-build) not found on PATH\n')
@@ -1318,7 +1452,6 @@ abseil_libs = ['absl/' + lib for lib in [
    'base/libabsl_malloc_internal.a',
    'base/libabsl_spinlock_wait.a',
    'base/libabsl_base.a',
-    'base/libabsl_dynamic_annotations.a',
    'base/libabsl_raw_logging_internal.a',
    'base/libabsl_exponential_biased.a',
    'base/libabsl_throw_delegate.a']]
@@ -1330,18 +1463,15 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
                 # Must link with static version of libzstd, since
                 # experimental APIs that we use are only present there.
                 maybe_static(True, '-lzstd'),
-                 maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc'), ])
-
-pkgconfig_libs = [
-    'libxxhash',
-]
-
-args.user_cflags += ' ' + ' '.join([pkg_config(lib, '--cflags') for lib in pkgconfig_libs])
-libs += ' ' + ' '.join([pkg_config(lib, '--libs') for lib in pkgconfig_libs])
+                 maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc -licui18n'),
+                 '-lxxhash'])

 if not args.staticboost:
    args.user_cflags += ' -DBOOST_TEST_DYN_LINK'

+if args.build_raft:  # set by --build-raft; add the ENABLE_SCYLLA_RAFT define only on request
+    args.user_cflags += ' -DENABLE_SCYLLA_RAFT'
+
 # thrift version detection, see #4538
 proc_res = subprocess.run(["thrift", "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 proc_res_output = proc_res.stdout.decode("utf-8")
@@ -1366,13 +1496,9 @@ if args.staticthrift:
 else:
    thrift_libs = "-lthrift"

-outdir = 'build'
 buildfile = 'build.ninja'

 os.makedirs(outdir, exist_ok=True)
-do_sanitize = True
-if args.static:
-    do_sanitize = False

 if args.antlr3_exec:
    antlr3_exec = args.antlr3_exec
@@ -1398,7 +1524,7 @@ with open(buildfile_tmp, 'w') as f:
        configure_args = {configure_args}
        builddir = {outdir}
        cxx = {cxx}
-        cxxflags = {user_cflags} {warnings} {defines}
+        cxxflags = --std=gnu++20 {user_cflags} {warnings} {defines}
        ldflags = {linker_flags} {user_ldflags}
        ldflags_build = {linker_flags}
        libs = {libs}
@@ -1428,7 +1554,7 @@ with open(buildfile_tmp, 'w') as f:
            command = $in > $out
            description = GEN $out
        rule copy
-            command = cp $in $out
+            command = cp --reflink=auto $in $out
            description = COPY $out
        rule package
            command = scripts/create-relocatable-package.py --mode $mode $out
@@ -1436,6 +1562,8 @@ with open(buildfile_tmp, 'w') as f:
            command = reloc/build_rpm.sh --reloc-pkg $in --builddir $out
        rule debbuild
            command = reloc/build_deb.sh --reloc-pkg $in --builddir $out
+        rule unified
+            command = unified/build_unified.sh --mode $mode --unified-pkg $out
        ''').format(**globals()))
    for mode in build_modes:
        modeval = modes[mode]
@@ -1468,45 +1596,49 @@ with open(buildfile_tmp, 'w') as f:
            rule thrift.{mode}
                command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
                description = THRIFT $in
+                restat = 1
            rule antlr3.{mode}
                # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
                # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
                # name, we also add a global typedef to avoid compilation errors.
                command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
                     && {antlr3_exec} $builddir/{mode}/gen/$in $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Lexer.hpp $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Lexer.cpp $
-                     && sed -i -e '/^.*On :.*$$/d' build/{mode}/gen/${{stem}}Parser.hpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.hpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Lexer.cpp $
+                     && sed -i -e '/^.*On :.*$$/d' $builddir/{mode}/gen/${{stem}}Parser.hpp $
                     && sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
                        -e '/^.*On :.*$$/d' $
                        -e '1i using ExceptionBaseType = int;' $
                        -e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
                            s/ExceptionBaseType\* ex = new/ex = new/; $
                            s/exceptions::syntax_exception e/exceptions::syntax_exception\& e/' $
-                        build/{mode}/gen/${{stem}}Parser.cpp
+                        $builddir/{mode}/gen/${{stem}}Parser.cpp
                description = ANTLR3 $in
            rule checkhh.{mode}
-              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out build/{mode}/gen/empty.cc
+              command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out $builddir/{mode}/gen/empty.cc
              description = CHECKHH $in
              depfile = $out.d
            rule test.{mode}
-              command = ./test.py --mode={mode}
+              command = ./test.py --mode={mode} --repeat={test_repeat} --timeout={test_timeout}
+              pool = console
              description = TEST {mode}
-            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, **modeval))
+            ''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, test_repeat=test_repeat, test_timeout=test_timeout, **modeval))
        f.write(
-            'build {mode}: phony {artifacts}\n'.format(
+            'build {mode}-build: phony {artifacts}\n'.format(
                mode=mode,
                artifacts=str.join(' ', ('$builddir/' + mode + '/' + x for x in build_artifacts))
            )
        )
+        include_dist_target = f'dist-{mode}' if args.enable_dist is None or args.enable_dist else '' # Empty only when args.enable_dist is a false value other than None.
+        f.write(f'build {mode}: phony {mode}-build {include_dist_target}\n') # The plain mode target aliases {mode}-build plus, when selected, dist-{mode}.
        compiles = {}
        swaggers = set()
        serializers = {}
        thrifts = set()
        ragels = {}
        antlr3_grammars = set()
-        seastar_dep = 'build/{}/seastar/libseastar.a'.format(mode)
-        seastar_testing_dep = 'build/{}/seastar/libseastar_testing.a'.format(mode)
+        seastar_dep = '$builddir/{}/seastar/libseastar.a'.format(mode)
+        seastar_testing_dep = '$builddir/{}/seastar/libseastar_testing.a'.format(mode)
        for binary in build_artifacts:
            if binary in other:
                continue
@@ -1594,7 +1726,7 @@ with open(buildfile_tmp, 'w') as f:
        )

        f.write(
-            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/test/tools/cql_repl\n'.format(
+            'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/test/tools/cql_repl $builddir/{mode}/scylla\n'.format(
                mode=mode,
                test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in tests]),
            )
@@ -1654,106 +1786,136 @@ with open(buildfile_tmp, 'w') as f:
                    if has_sanitize_address_use_after_scope:
                        flags += ' -fno-sanitize-address-use-after-scope'
                    f.write('  obj_cxxflags = %s\n' % flags)
-        f.write(f'build build/{mode}/gen/empty.cc: gen\n')
+        f.write(f'build $builddir/{mode}/gen/empty.cc: gen\n')
        for hh in headers:
-            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | build/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
+            f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | $builddir/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
                    mode=mode, hh=hh, gen_headers_dep=gen_headers_dep))

-        f.write('build build/{mode}/seastar/libseastar.a: ninja | always\n'
+        f.write('build $builddir/{mode}/seastar/libseastar.a: ninja | always\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = seastar\n'.format(**locals()))
-        f.write('build build/{mode}/seastar/libseastar_testing.a: ninja | always\n'
+        f.write('build $builddir/{mode}/seastar/libseastar_testing.a: ninja | always\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = seastar_testing\n'.format(**locals()))
-        f.write('build build/{mode}/seastar/apps/iotune/iotune: ninja\n'
+        f.write('build $builddir/{mode}/seastar/apps/iotune/iotune: ninja\n'
                .format(**locals()))
        f.write('  pool = submodule_pool\n')
-        f.write('  subdir = build/{mode}/seastar\n'.format(**locals()))
+        f.write('  subdir = $builddir/{mode}/seastar\n'.format(**locals()))
        f.write('  target = iotune\n'.format(**locals()))
        f.write(textwrap.dedent('''\
-            build build/{mode}/iotune: copy build/{mode}/seastar/apps/iotune/iotune
+            build $builddir/{mode}/iotune: copy $builddir/{mode}/seastar/apps/iotune/iotune
            ''').format(**locals()))
-        f.write('build build/{mode}/scylla-package.tar.gz: package build/{mode}/scylla build/{mode}/iotune build/SCYLLA-RELEASE-FILE build/SCYLLA-VERSION-FILE build/debian/debian | always\n'.format(**locals()))
-        f.write('  pool = submodule_pool\n')
+        f.write('build $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian $builddir/node_exporter | always\n'.format(**locals()))
        f.write('  mode = {mode}\n'.format(**locals()))
-        f.write(f'build build/dist/{mode}/redhat: rpmbuild build/{mode}/scylla-package.tar.gz\n')
-        f.write(f'  pool = submodule_pool\n')
+        f.write(f'build $builddir/dist/{mode}/redhat: rpmbuild $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
        f.write(f'  mode = {mode}\n')
-        f.write(f'build build/dist/{mode}/debian: debbuild build/{mode}/scylla-package.tar.gz\n')
-        f.write(f'  pool = submodule_pool\n')
+        f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
+        f.write(f'  mode = {mode}\n')
+        f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian\n')
+        f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz dist-jmx-rpm dist-jmx-deb\n')
+        f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz dist-tools-rpm dist-tools-deb\n')
+        f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb compat-python3-rpm compat-python3-deb\n')
+        f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz\n')
+        f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz: unified $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz | always\n')
        f.write(f'  mode = {mode}\n')
-        f.write(f'build dist-server-{mode}: phony build/dist/{mode}/redhat build/dist/{mode}/debian\n')
        f.write('rule libdeflate.{mode}\n'.format(**locals()))
-        f.write('  command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../build/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
-        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+        f.write('  command = make -C libdeflate BUILD_DIR=../$builddir/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../$builddir/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
+        f.write('build $builddir/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
        f.write('  pool = submodule_pool\n')

        for lib in abseil_libs:
-            f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
+            f.write('build $builddir/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
            f.write('  pool = submodule_pool\n')
-            f.write('  subdir = build/{mode}/abseil\n'.format(**locals()))
+            f.write('  subdir = $builddir/{mode}/abseil\n'.format(**locals()))
            f.write('  target = {lib}\n'.format(**locals()))

-    mode = 'dev' if 'dev' in modes else modes[0]
-    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))
+    checkheaders_mode = 'dev' if 'dev' in modes else modes[0] # NOTE(review): modes is indexed by mode name elsewhere (modes[mode]); modes[0] looks like it would fail if 'dev' were absent - confirm.
+    f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(checkheaders_mode, hh) for hh in headers]))) # Header checks are order-only inputs of the phony target, for one mode only.

    f.write(
-            'build test: phony {}\n'.format(' '.join(['{mode}-test'.format(mode=mode) for mode in modes]))
+            'build build: phony {}\n'.format(' '.join([f'{mode}-build' for mode in build_modes]))
    )
    f.write(
-            'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in modes]))
+            'build test: phony {}\n'.format(' '.join(['{mode}-test'.format(mode=mode) for mode in build_modes]))
+    )
+    f.write(
+            'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in build_modes]))
    )

    f.write(textwrap.dedent(f'''\
-        build dist-server-deb: phony {' '.join(['build/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
-        build dist-server-rpm: phony {' '.join(['build/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
-        build dist-server: phony dist-server-rpm dist-server-deb
+        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz' for mode in build_modes])}
+        build dist-unified: phony dist-unified-tar
+
+        build dist-server-deb: phony {' '.join(['$builddir/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
+        build dist-server-rpm: phony {' '.join(['$builddir/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
+        build dist-server-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
+        build dist-server: phony dist-server-tar dist-server-rpm dist-server-deb

        rule build-submodule-reloc
-          command = cd $reloc_dir && ./reloc/build_reloc.sh
+          command = cd $reloc_dir && ./reloc/build_reloc.sh --version $$(<../../build/SCYLLA-PRODUCT-FILE)-$$(<../../build/SCYLLA-VERSION-FILE)-$$(<../../build/SCYLLA-RELEASE-FILE) --nodeps $args
        rule build-submodule-rpm
          command = cd $dir && ./reloc/build_rpm.sh --reloc-pkg $artifact
        rule build-submodule-deb
          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact

-        build scylla-jmx/build/scylla-jmx-package.tar.gz: build-submodule-reloc
-          reloc_dir = scylla-jmx
-        build dist-jmx-rpm: build-submodule-rpm scylla-jmx/build/scylla-jmx-package.tar.gz
-          dir = scylla-jmx
-          artifact = build/scylla-jmx-package.tar.gz
-        build dist-jmx-deb: build-submodule-deb scylla-jmx/build/scylla-jmx-package.tar.gz
-          dir = scylla-jmx
-          artifact = build/scylla-jmx-package.tar.gz
-        build dist-jmx: phony dist-jmx-rpm dist-jmx-deb
+        build tools/jmx/build/{scylla_product}-jmx-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/jmx
+        build dist-jmx-rpm: build-submodule-rpm tools/jmx/build/{scylla_product}-jmx-package.tar.gz
+          dir = tools/jmx
+          artifact = $builddir/{scylla_product}-jmx-package.tar.gz
+        build dist-jmx-deb: build-submodule-deb tools/jmx/build/{scylla_product}-jmx-package.tar.gz
+          dir = tools/jmx
+          artifact = $builddir/{scylla_product}-jmx-package.tar.gz
+        build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
+        build dist-jmx: phony dist-jmx-tar dist-jmx-rpm dist-jmx-deb

-        build scylla-tools/build/scylla-tools-package.tar.gz: build-submodule-reloc
-          reloc_dir = scylla-tools
-        build dist-tools-rpm: build-submodule-rpm scylla-tools/build/scylla-tools-package.tar.gz
-          dir = scylla-tools
-          artifact = build/scylla-tools-package.tar.gz
-        build dist-tools-deb: build-submodule-deb scylla-tools/build/scylla-tools-package.tar.gz
-          dir = scylla-tools
-          artifact = build/scylla-tools-package.tar.gz
-        build dist-tools: phony dist-tools-rpm dist-tools-deb
+        build tools/java/build/{scylla_product}-tools-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/java
+        build dist-tools-rpm: build-submodule-rpm tools/java/build/{scylla_product}-tools-package.tar.gz
+          dir = tools/java
+          artifact = $builddir/{scylla_product}-tools-package.tar.gz
+        build dist-tools-deb: build-submodule-deb tools/java/build/{scylla_product}-tools-package.tar.gz
+          dir = tools/java
+          artifact = $builddir/{scylla_product}-tools-package.tar.gz
+        build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
+        build dist-tools: phony dist-tools-tar dist-tools-rpm dist-tools-deb

-        rule build-python-reloc
-          command = ./reloc/python3/build_reloc.sh
-        rule build-python-rpm
-          command = ./reloc/python3/build_rpm.sh
-        rule build-python-deb
-          command = ./reloc/python3/build_deb.sh
+        rule compat-python3-reloc
+          command = mkdir -p $builddir/release && ln -f $dir/$artifact $builddir/release/
+        rule compat-python3-rpm
+          command = cd $dir && ./reloc/build_rpm.sh --reloc-pkg $artifact --builddir ../../build/redhat
+        rule compat-python3-deb
+          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact --builddir ../../build/debian
+        build $builddir/release/{scylla_product}-python3-package.tar.gz: compat-python3-reloc tools/python3/build/{scylla_product}-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build compat-python3-rpm: compat-python3-rpm tools/python3/build/{scylla_product}-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build compat-python3-deb: compat-python3-deb tools/python3/build/{scylla_product}-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz

-        build build/release/scylla-python3-package.tar.gz: build-python-reloc
-        build dist-python-rpm: build-python-rpm build/release/scylla-python3-package.tar.gz
-        build dist-python-deb: build-python-deb build/release/scylla-python3-package.tar.gz
-        build dist-python: phony dist-python-rpm dist-python-deb
-        build dist-deb: phony dist-server-deb dist-python-deb dist-jmx-deb dist-tools-deb
-        build dist-rpm: phony dist-server-rpm dist-python-rpm dist-jmx-rpm dist-tools-rpm
-        build dist: phony dist-server dist-python dist-jmx dist-tools
+        build tools/python3/build/{scylla_product}-python3-package.tar.gz: build-submodule-reloc
+          reloc_dir = tools/python3
+          args = --packages "{python3_dependencies}"
+        build dist-python3-rpm: build-submodule-rpm tools/python3/build/{scylla_product}-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build dist-python3-deb: build-submodule-deb tools/python3/build/{scylla_product}-python3-package.tar.gz
+          dir = tools/python3
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
+        build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb $builddir/release/{scylla_product}-python3-package.tar.gz compat-python3-rpm compat-python3-deb
+        build dist-deb: phony dist-server-deb dist-python3-deb dist-jmx-deb dist-tools-deb
+        build dist-rpm: phony dist-server-rpm dist-python3-rpm dist-jmx-rpm dist-tools-rpm
+        build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-jmx-tar dist-tools-tar
+
+        build dist: phony dist-unified dist-server dist-python3 dist-jmx dist-tools
        '''))

    f.write(textwrap.dedent(f'''\
@@ -1763,6 +1925,11 @@ with open(buildfile_tmp, 'w') as f:
        '''))
    for mode in build_modes:
        f.write(textwrap.dedent(f'''\
+        build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-package.tar.gz
+        build $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz: copy tools/java/build/{scylla_product}-tools-package.tar.gz
+        build $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz: copy tools/jmx/build/{scylla_product}-jmx-package.tar.gz
+
+        build dist-{mode}: phony dist-server-{mode} dist-python3-{mode} dist-tools-{mode} dist-jmx-{mode} dist-unified-{mode}
        build dist-check-{mode}: dist-check
          mode = {mode}
            '''))
@@ -1786,14 +1953,24 @@ with open(buildfile_tmp, 'w') as f:
        build mode_list: mode_list
        default {modes_list}
        ''').format(modes_list=' '.join(default_modes), **globals()))
+    unit_test_list = set(test for test in build_artifacts if test in set(tests))
+    f.write(textwrap.dedent('''\
+        rule unit_test_list
+            command = /usr/bin/env echo -e '{unit_test_list}'
+            description = List configured unit tests
+        build unit_test_list: unit_test_list
+        ''').format(unit_test_list="\\n".join(unit_test_list)))
    f.write(textwrap.dedent('''\
        build always: phony
        rule scylla_version_gen
            command = ./SCYLLA-VERSION-GEN
-        build build/SCYLLA-RELEASE-FILE build/SCYLLA-VERSION-FILE: scylla_version_gen
+        build $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE: scylla_version_gen
        rule debian_files_gen
            command = ./dist/debian/debian_files_gen.py
-        build build/debian/debian: debian_files_gen | always
-        ''').format(modes_list=' '.join(build_modes), **globals()))
+        build $builddir/debian/debian: debian_files_gen | always
+        rule extract_node_exporter
+            command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
+        build $builddir/node_exporter: extract_node_exporter | always
+        ''').format(**globals()))

 os.rename(buildfile_tmp, buildfile)
--- a/connection_notifier.cc
+++ b/connection_notifier.cc
@@ -20,44 +20,47 @@
 */

 #include "connection_notifier.hh"
-#include "db/query_context.hh"
 #include "cql3/constants.hh"
 #include "database.hh"
-#include "service/storage_proxy.hh"

 #include <stdexcept>

-namespace db::system_keyspace {
-extern const char *const CLIENTS;
-}
-
-static sstring to_string(client_type ct) {
+sstring to_string(client_type ct) { // Text form of a client_type; not file-local, so other translation units can call it.
    switch (ct) {
        case client_type::cql: return "cql";
        case client_type::thrift: return "thrift";
        case client_type::alternator: return "alternator";
-        default: throw std::runtime_error("Invalid client_type");
    }
+    throw std::runtime_error("Invalid client_type"); // Reached only when ct matches none of the cases above.
+}
+
+static sstring to_string(client_connection_stage ccs) { // File-local: text form of a connection stage, taken from connection_stage_literal.
+    switch (ccs) {
+        case client_connection_stage::established: return connection_stage_literal<client_connection_stage::established>;
+        case client_connection_stage::authenticating: return connection_stage_literal<client_connection_stage::authenticating>;
+        case client_connection_stage::ready: return connection_stage_literal<client_connection_stage::ready>;
+    }
+    throw std::runtime_error("Invalid client_connection_stage"); // Reached only when ccs matches none of the cases above.
 }

 future<> notify_new_client(client_data cd) {
    // FIXME: consider prepared statement
    const static sstring req
-            = format("INSERT INTO system.{} (address, port, client_type, shard_id, protocol_version, username) "
-                     "VALUES (?, ?, ?, ?, ?, ?);", db::system_keyspace::CLIENTS);
+            = format("INSERT INTO system.{} (address, port, client_type, connection_stage, shard_id, protocol_version, username) "
+                     "VALUES (?, ?, ?, ?, ?, ?, ?);", db::system_keyspace::CLIENTS); // Seven placeholders, one per column listed above.
    
-    return db::execute_cql(req,
-            std::move(cd.ip), cd.port, to_string(cd.ct), cd.shard_id,
+    return db::qctx->execute_cql(req, // The statement is executed through db::qctx and its result is discarded.
+            std::move(cd.ip), cd.port, to_string(cd.ct), to_string(cd.connection_stage), cd.shard_id, // Bind order follows the column list; type and stage are bound as text.
            cd.protocol_version.has_value() ? data_value(*cd.protocol_version) : data_value::make_null(int32_type),
            cd.username.value_or("anonymous")).discard_result();
 }

-future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port) {
+future<> notify_disconnected_client(net::inet_address addr, int port, client_type ct) { // Deletes the rows matching (address, port, client_type); parameters are in the same order as the WHERE clause.
    // FIXME: consider prepared statement
    const static sstring req
            = format("DELETE FROM system.{} where address=? AND port=? AND client_type=?;",
                     db::system_keyspace::CLIENTS);
-    return db::execute_cql(req, addr.addr(), port, to_string(ct)).discard_result();
+    return db::qctx->execute_cql(req, std::move(addr), port, to_string(ct)).discard_result(); // The address object itself is bound; the client type is bound as text.
 }

 future<> clear_clientlist() {
--- a/connection_notifier.hh
+++ b/connection_notifier.hh
@@ -20,27 +20,65 @@
 */
 #pragma once

-#include "gms/inet_address.hh"
+#include "db/query_context.hh"
+
+#include <seastar/net/inet_address.hh>
 #include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
 #include <optional>

+namespace db::system_keyspace {
+extern const char *const CLIENTS;
+}
+
 enum class client_type {
    cql = 0,
    thrift,
    alternator,
 };

+sstring to_string(client_type ct); // Text form of a client_type ("cql", "thrift" or "alternator"); defined in connection_notifier.cc.
+
+enum class changed_column { // Selects a column name through column_literal; used as the template argument of notify_client_change.
+    username = 0,
+    connection_stage,
+    driver_name,
+    driver_version,
+    hostname,
+    protocol_version,
+};
+
+template <changed_column column> constexpr const char* column_literal = ""; // Primary template yields an empty name; every enumerator is specialized below.
+template <> inline constexpr const char* column_literal<changed_column::username> = "username";
+template <> inline constexpr const char* column_literal<changed_column::connection_stage> = "connection_stage";
+template <> inline constexpr const char* column_literal<changed_column::driver_name> = "driver_name";
+template <> inline constexpr const char* column_literal<changed_column::driver_version> = "driver_version";
+template <> inline constexpr const char* column_literal<changed_column::hostname> = "hostname";
+template <> inline constexpr const char* column_literal<changed_column::protocol_version> = "protocol_version";
+
+enum class client_connection_stage { // Type of client_data::connection_stage, whose default member initializer is `established`.
+    established = 0,
+    authenticating,
+    ready,
+};
+
+template <client_connection_stage ccs> constexpr const char* connection_stage_literal = ""; // Primary template yields empty text; the specializations below give each stage its upper-case name.
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::established> = "ESTABLISHED";
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::authenticating> = "AUTHENTICATING";
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::ready> = "READY";
+
 // Representation of a row in `system.clients'. std::optionals are for nullable cells.
 struct client_data {
-    gms::inet_address ip;
+    net::inet_address ip;
    int32_t port;
    client_type ct;
+    client_connection_stage connection_stage = client_connection_stage::established;
    int32_t shard_id;  /// ID of server-side shard which is processing the connection.

    // `optional' column means that it's nullable (possibly because it's
    // unimplemented yet). If you want to fill ("implement") any of them,
    // remember to update the query in `notify_new_client()'.
-    std::optional<sstring> connection_stage;
    std::optional<sstring> driver_name;
    std::optional<sstring> driver_version;
    std::optional<sstring> hostname;
@@ -52,6 +90,17 @@ struct client_data {
 };

 future<> notify_new_client(client_data cd);
-future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port);
-
+future<> notify_disconnected_client(net::inet_address addr, int port, client_type ct);
 future<> clear_clientlist();
+
+template <changed_column column_enum_val> // The column to update is fixed at compile time by this argument.
+struct notify_client_change {
+    template <typename T>
+    future<> operator()(net::inet_address addr, int port, client_type ct, T&& value) { // Sets that column to `value` in the rows matching address, port and client_type.
+        const static sstring req // Formatted once per instantiation of operator() and reused on later calls.
+                = format("UPDATE system.{} SET {}=? WHERE address=? AND port=? AND client_type=?;",
+                        db::system_keyspace::CLIENTS, column_literal<column_enum_val>);
+
+        return db::qctx->execute_cql(req, std::forward<T>(value), std::move(addr), port, to_string(ct)).discard_result(); // Bind order follows the placeholders: new value first, then address, port and client type.
+    }
+};
--- a/counters.cc
+++ b/counters.cc
@@ -19,25 +19,10 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "service/storage_service.hh"
 #include "counters.hh"
 #include "mutation.hh"
 #include "combine.hh"

-counter_id counter_id::local()
-{
-    return counter_id(service::get_local_storage_service().get_local_id());
-}
-
-bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
-{
-    if (a._most_significant != b._most_significant) {
-        return a._most_significant < b._most_significant;
-    } else {
-        return a._least_significant < b._least_significant;
-    }
-}
-
 std::ostream& operator<<(std::ostream& os, const counter_id& id) {
    return os << id.to_uuid();
 }
@@ -68,16 +53,6 @@ void counter_cell_builder::do_sort_and_remove_duplicates()
    _sorted = true;
 }

-std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
-{
-    auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
-    counter_id::less_compare_1_7_4 cmp;
-    boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
-        return cmp(a.id(), b.id());
-    });
-    return sorted_shards;
-}
-
 static bool apply_in_place(const column_definition& cdef, atomic_cell_mutable_view dst, atomic_cell_mutable_view src)
 {
    auto dst_ccmv = counter_cell_mutable_view(dst);
@@ -216,10 +191,10 @@ std::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, ato
 }


-void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
+void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id) {
    // FIXME: allow current_state to be frozen_mutation

-    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& cells) {
+    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& cells) {
        cells.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
            auto acv = ac_o_c.as_atomic_cell(cdef);
@@ -227,7 +202,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
            auto delta = acv.counter_update_value();
-            auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+            auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
            ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
        });
    };
@@ -242,7 +217,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st

    clustering_key::less_compare cmp(*m.schema());

-    auto transform_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& transformee, auto& state) {
+    auto transform_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& transformee, auto& state) {
        std::deque<std::pair<column_id, counter_shard>> shards;
        state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
@@ -251,7 +226,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
          counter_cell_view::with_linearized(acv, [&] (counter_cell_view ccv) {
-            auto cs = ccv.local_shard();
+            auto cs = ccv.get_shard(counter_id(local_id));
            if (!cs) {
                return; // continue
            }
@@ -272,7 +247,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
            auto delta = acv.counter_update_value();

            if (shards.empty() || shards.front().first > id) {
-                auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+                auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
                ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
            } else {
                auto& cs = shards.front().second;
--- a/counters.hh
+++ b/counters.hh
@@ -61,13 +61,6 @@ public:
        return !(*this == other);
    }
 public:
-    // (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
-    struct less_compare_1_7_4 {
-        bool operator()(const counter_id& a, const counter_id& b) const;
-    };
-public:
-    static counter_id local();
-
    // For tests.
    static counter_id generate_random() {
        return counter_id(utils::make_random_uuid());
@@ -186,7 +179,7 @@ public:
    int64_t logical_clock() const { return _logical_clock; }

    counter_shard& update(int64_t value_delta, int64_t clock_increment) noexcept {
-        _value += value_delta;
+        _value = uint64_t(_value) + uint64_t(value_delta); // signed int overflow is undefined hence the cast
        _logical_clock += clock_increment;
        return *this;
    }
@@ -282,7 +275,14 @@ public:
        return ac;
    }

-    class inserter_iterator : public std::iterator<std::output_iterator_tag, counter_shard> {
+    class inserter_iterator {
+    public:
+        using iterator_category = std::output_iterator_tag;
+        using value_type = counter_shard;
+        using difference_type = std::ptrdiff_t;
+        using pointer = counter_shard*;
+        using reference = counter_shard&;
+    private:
        counter_cell_builder* _builder;
    public:
        explicit inserter_iterator(counter_cell_builder& b) : _builder(&b) { }
@@ -291,7 +291,7 @@ public:
            return *this;
        }
        inserter_iterator& operator=(const counter_shard_view& csv) {
-            return operator=(counter_shard(csv));
+            return this->operator=(counter_shard(csv));
        }
        inserter_iterator& operator++() { return *this; }
        inserter_iterator& operator++(int) { return *this; }
@@ -316,7 +316,14 @@ protected:
    basic_atomic_cell_view<is_mutable> _cell;
    linearized_value_view _value;
 private:
-    class shard_iterator : public std::iterator<std::input_iterator_tag, basic_counter_shard_view<is_mutable>> {
+    class shard_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = basic_counter_shard_view<is_mutable>;
+        using difference_type = std::ptrdiff_t;
+        using pointer = basic_counter_shard_view<is_mutable>*;
+        using reference = basic_counter_shard_view<is_mutable>&;
+    private:
        pointer_type _current;
        basic_counter_shard_view<is_mutable> _current_view;
    public:
@@ -396,11 +403,6 @@ public:
        return *it;
    }

-    std::optional<counter_shard_view> local_shard() const {
-        // TODO: consider caching local shard position
-        return get_shard(counter_id::local());
-    }
-
    bool operator==(const basic_counter_cell_view& other) const {
        return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
    }
@@ -417,9 +419,6 @@ struct counter_cell_view : basic_counter_cell_view<mutable_view::no> {
        });
    }

-    // Returns counter shards in an order that is compatible with Scylla 1.7.4.
-    std::vector<counter_shard> shards_compatible_with_1_7_4() const;
-
    // Reversibly applies two counter cells, at least one of them must be live.
    static void apply(const column_definition& cdef, atomic_cell_or_collection& dst, atomic_cell_or_collection& src);

@@ -445,7 +444,7 @@ struct counter_cell_mutable_view : basic_counter_cell_view<mutable_view::yes> {
 // Transforms mutation dst from counter updates to counter shards using state
 // stored in current_state.
 // If current_state is present it has to be in the same schema as dst.
-void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset);
+void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id);

 template<>
 struct appending_hash<counter_shard_view> {
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -93,6 +93,7 @@ options {
 #include "cql3/ut_name.hh"
 #include "cql3/functions/function_name.hh"
 #include "cql3/functions/function_call.hh"
+#include "cql3/expr/expression.hh"
 #include <seastar/core/sstring.hh>
 #include "CqlLexer.hpp"

@@ -393,6 +394,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool allow_filtering = false;
        bool is_json = false;
        bool bypass_cache = false;
+        auto attrs = std::make_unique<cql3::attributes::raw>();
    }
    : K_SELECT (
                ( K_JSON { is_json = true; } )?
@@ -407,11 +409,12 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
      ( K_LIMIT rows=intValue { limit = rows; } )?
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
      ( K_BYPASS K_CACHE { bypass_cache = true; })?
+      ( usingClause[attrs] )?
      {
          auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
          $expr = std::make_unique<raw::select_statement>(std::move(cf), std::move(params),
            std::move(sclause), std::move(wclause), std::move(limit), std::move(per_partition_limit),
-            std::move(gbcolumns));
+            std::move(gbcolumns), std::move(attrs));
      }
    ;

@@ -520,6 +523,7 @@ usingClause[std::unique_ptr<cql3::attributes::raw>& attrs]
 usingClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_TIMESTAMP ts=intValue { attrs->timestamp = ts; }
    | K_TTL t=intValue { attrs->time_to_live = t; }
+    | K_TIMEOUT to=term { attrs->timeout = to; }
    ;

 /**
@@ -1332,7 +1336,7 @@ setOrMapLiteral[shared_ptr<cql3::term::raw> t] returns [shared_ptr<cql3::term::r
      { $value = ::make_shared<cql3::maps::literal>(std::move(m)); }
    | { s.push_back(t); }
          ( ',' tn=term { s.push_back(tn); } )*
-      { $value = make_shared(cql3::sets::literal(std::move(s))); }
+      { $value = ::make_shared<cql3::sets::literal>(std::move(s)); }
    ;

 collectionLiteral returns [shared_ptr<cql3::term::raw> value]
@@ -1343,7 +1347,7 @@ collectionLiteral returns [shared_ptr<cql3::term::raw> value]
    | '{' t=term v=setOrMapLiteral[t] { $value = v; } '}'
    // Note that we have an ambiguity between maps and set for "{}". So we force it to a set literal,
    // and deal with it later based on the type of the column (SetLiteral.java).
-    | '{' '}' { $value = make_shared(cql3::sets::literal({})); }
+    | '{' '}' { $value = ::make_shared<cql3::sets::literal>(std::vector<shared_ptr<cql3::term::raw>>()); }
    ;

 usertypeLiteral returns [shared_ptr<cql3::user_types::literal> ut]
@@ -1474,13 +1478,13 @@ udtColumnOperation[operations_type& operations,
 columnCondition[conditions_type& conditions]
    // Note: we'll reject duplicates later
    : key=cident
-        ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, {}, *op)); }
+        ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, {}, op)); }
        | K_IN
            ( values=singleColumnInValues { conditions.emplace_back(key, cql3::column_condition::raw::in_condition({}, {}, values)); }
            | marker=inMarker { conditions.emplace_back(key, cql3::column_condition::raw::in_condition({}, marker, {})); }
            )
        | '[' element=term ']'
-            ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, element, *op)); }
+            ( op=relationType t=term { conditions.emplace_back(key, cql3::column_condition::raw::simple_condition(t, element, op)); }
            | K_IN
                ( values=singleColumnInValues { conditions.emplace_back(key, cql3::column_condition::raw::in_condition(element, {}, values)); }
                | marker=inMarker { conditions.emplace_back(key, cql3::column_condition::raw::in_condition(element, marker, {})); }
@@ -1503,31 +1507,31 @@ propertyValue returns [sstring str]
    | u=unreserved_keyword { $str = u; }
    ;

-relationType returns [const cql3::operator_type* op = nullptr]
-    : '='  { $op = &cql3::operator_type::EQ; }
-    | '<'  { $op = &cql3::operator_type::LT; }
-    | '<=' { $op = &cql3::operator_type::LTE; }
-    | '>'  { $op = &cql3::operator_type::GT; }
-    | '>=' { $op = &cql3::operator_type::GTE; }
-    | '!=' { $op = &cql3::operator_type::NEQ; }
-    | K_LIKE { $op = &cql3::operator_type::LIKE; }
+relationType returns [cql3::expr::oper_t op]
+    : '='  { $op = cql3::expr::oper_t::EQ; }
+    | '<'  { $op = cql3::expr::oper_t::LT; }
+    | '<=' { $op = cql3::expr::oper_t::LTE; }
+    | '>'  { $op = cql3::expr::oper_t::GT; }
+    | '>=' { $op = cql3::expr::oper_t::GTE; }
+    | '!=' { $op = cql3::expr::oper_t::NEQ; }
+    | K_LIKE { $op = cql3::expr::oper_t::LIKE; }
    ;

 relation[std::vector<cql3::relation_ptr>& clauses]
-    @init{ const cql3::operator_type* rt = nullptr; }
-    : name=cident type=relationType t=term { $clauses.emplace_back(::make_shared<cql3::single_column_relation>(std::move(name), *type, std::move(t))); }
+    @init{ cql3::expr::oper_t rt; }
+    : name=cident type=relationType t=term { $clauses.emplace_back(::make_shared<cql3::single_column_relation>(std::move(name), type, std::move(t))); }

    | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
-        { $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), *type, std::move(t))); }
+        { $clauses.emplace_back(::make_shared<cql3::token_relation>(std::move(l), type, std::move(t))); }
    | name=cident K_IS K_NOT K_NULL {
-          $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IS_NOT, cql3::constants::NULL_LITERAL)); }
+          $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::expr::oper_t::IS_NOT, cql3::constants::NULL_LITERAL)); }
    | name=cident K_IN marker=inMarker
-        { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::operator_type::IN, std::move(marker))); }
+        { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), cql3::expr::oper_t::IN, std::move(marker))); }
    | name=cident K_IN in_values=singleColumnInValues
        { $clauses.emplace_back(cql3::single_column_relation::create_in_relation(std::move(name), std::move(in_values))); }
-    | name=cident K_CONTAINS { rt = &cql3::operator_type::CONTAINS; } (K_KEY { rt = &cql3::operator_type::CONTAINS_KEY; })?
-        t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), *rt, std::move(t))); }
-    | name=cident '[' key=term ']' type=relationType t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), std::move(key), *type, std::move(t))); }
+    | name=cident K_CONTAINS { rt = cql3::expr::oper_t::CONTAINS; } (K_KEY { rt = cql3::expr::oper_t::CONTAINS_KEY; })?
+        t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), rt, std::move(t))); }
+    | name=cident '[' key=term ']' type=relationType t=term { $clauses.emplace_back(make_shared<cql3::single_column_relation>(std::move(name), std::move(key), type, std::move(t))); }
    | ids=tupleOfIdentifiers
      ( K_IN
          ( '(' ')'
@@ -1543,10 +1547,10 @@ relation[std::vector<cql3::relation_ptr>& clauses]
          )
      | type=relationType literal=tupleLiteral /* (a, b, c) > (1, 2, 3) or (a, b, c) > (?, ?, ?) */
          {
-              $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, *type, literal));
+              $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, type, literal));
          }
      | type=relationType tupleMarker=markerForTuple /* (a, b, c) >= ? */
-          { $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, *type, tupleMarker)); }
+          { $clauses.emplace_back(cql3::multi_column_relation::create_non_in_relation(ids, type, tupleMarker)); }
      )
    | '(' relation[$clauses] ')'
    ;
@@ -1694,7 +1698,7 @@ username returns [sstring str]
 // Basically the same as cident, but we need to exlude existing CQL3 types
 // (which for some reason are not reserved otherwise)
 non_type_ident returns [shared_ptr<cql3::column_identifier> id]
-    : t=IDENT                    { if (_reserved_type_names().count($t.text)) { add_recognition_error("Invalid (reserved) user type name " + $t.text); } $id = ::make_shared<cql3::column_identifier>($t.text, false); }
+    : t=IDENT                    { if (_reserved_type_names().contains($t.text)) { add_recognition_error("Invalid (reserved) user type name " + $t.text); } $id = ::make_shared<cql3::column_identifier>($t.text, false); }
    | t=QUOTED_NAME              { $id = ::make_shared<cql3::column_identifier>($t.text, true); }
    | k=basic_unreserved_keyword { $id = ::make_shared<cql3::column_identifier>(k, false); }
    | kk=K_KEY                   { $id = ::make_shared<cql3::column_identifier>($kk.text, false); }
@@ -1760,6 +1764,7 @@ basic_unreserved_keyword returns [sstring str]
        | K_PER
        | K_PARTITION
        | K_GROUP
+        | K_TIMEOUT
        ) { $str = $k.text; }
    ;

@@ -1915,6 +1920,8 @@ K_GROUP:       G R O U P;

 K_LIKE:        L I K E;

+K_TIMEOUT:     T I M E O U T;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -70,11 +70,11 @@ abstract_marker::raw::raw(int32_t bind_index)
 ::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
 {
    if (receiver->type->is_collection()) {
-        if (receiver->type->get_kind() == abstract_type::kind::list) {
+        if (receiver->type->without_reversed().is_list()) {
            return ::make_shared<lists::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::set) {
+        } else if (receiver->type->without_reversed().is_set()) {
            return ::make_shared<sets::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::map) {
+        } else if (receiver->type->without_reversed().is_map()) {
            return ::make_shared<maps::marker>(_bind_index, receiver);
        }
        assert(0);
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -44,19 +44,15 @@
 namespace cql3 {

 std::unique_ptr<attributes> attributes::none() {
-    return std::unique_ptr<attributes>{new attributes{{}, {}}};
+    return std::unique_ptr<attributes>{new attributes{{}, {}, {}}};
 }

-attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live)
+attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout)
    : _timestamp{std::move(timestamp)}
    , _time_to_live{std::move(time_to_live)}
+    , _timeout{std::move(timeout)}
 { }

-bool attributes::uses_function(const sstring& ks_name, const sstring& function_name) const {
-    return (_timestamp && _timestamp->uses_function(ks_name, function_name))
-        || (_time_to_live && _time_to_live->uses_function(ks_name, function_name));
-}
-
 bool attributes::is_timestamp_set() const {
    return bool(_timestamp);
 }
@@ -65,6 +61,10 @@ bool attributes::is_time_to_live_set() const {
    return bool(_time_to_live);
 }

+bool attributes::is_timeout_set() const {
+    return bool(_timeout);
+}
+
 int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (!_timestamp) {
        return now;
@@ -77,14 +77,12 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_unset_value()) {
        return now;
    }
-  return with_linearized(*tval, [&] (bytes_view val) {
    try {
-        data_type_for<int64_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int64_t>()->validate(*tval, options.get_cql_serialization_format());
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
-    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
-  });
+    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
 }

 int32_t attributes::get_time_to_live(const query_options& options) {
@@ -98,16 +96,15 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    if (tval.is_unset_value()) {
        return 0;
    }
-  auto ttl = with_linearized(*tval, [&] (bytes_view val) {
+
    try {
-        data_type_for<int32_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int32_t>()->validate(*tval, options.get_cql_serialization_format());
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }
+    auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));

-    return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
-  });
    if (ttl < 0) {
        throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
    }
@@ -120,6 +117,25 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    return ttl;
 }

+
+db::timeout_clock::duration attributes::get_timeout(const query_options& options) const {
+    auto timeout = _timeout->bind_and_get(options);
+    if (timeout.is_null() || timeout.is_unset_value()) {
+        throw exceptions::invalid_request_exception("Timeout value cannot be unset/null");
+    }
+    cql_duration duration = value_cast<cql_duration>(duration_type->deserialize(*timeout));
+    if (duration.months || duration.days) {
+        throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
+    }
+    if (duration.nanoseconds % 1'000'000 != 0) {
+        throw exceptions::invalid_request_exception("Timeout values cannot have granularity finer than milliseconds");
+    }
+    if (duration.nanoseconds < 0) {
+        throw exceptions::invalid_request_exception("Timeout values must be non-negative");
+    }
+    return std::chrono::duration_cast<db::timeout_clock::duration>(std::chrono::nanoseconds(duration.nanoseconds));
+}
+
 void attributes::collect_marker_specification(variable_specifications& bound_names) const {
    if (_timestamp) {
        _timestamp->collect_marker_specification(bound_names);
@@ -127,12 +143,16 @@ void attributes::collect_marker_specification(variable_specifications& bound_nam
    if (_time_to_live) {
        _time_to_live->collect_marker_specification(bound_names);
    }
+    if (_timeout) {
+        _timeout->collect_marker_specification(bound_names);
+    }
 }

 std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring& ks_name, const sstring& cf_name) const {
    auto ts = !timestamp ? ::shared_ptr<term>{} : timestamp->prepare(db, ks_name, timestamp_receiver(ks_name, cf_name));
    auto ttl = !time_to_live ? ::shared_ptr<term>{} : time_to_live->prepare(db, ks_name, time_to_live_receiver(ks_name, cf_name));
-    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl)}};
+    auto to = !timeout ? ::shared_ptr<term>{} : timeout->prepare(db, ks_name, timeout_receiver(ks_name, cf_name));
+    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl), std::move(to)}};
 }

 lw_shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
@@ -143,4 +163,8 @@ lw_shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const
    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
 }

+lw_shared_ptr<column_specification> attributes::raw::timeout_receiver(const sstring& ks_name, const sstring& cf_name) const {
+    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timeout]", true), duration_type);
+}
+
 }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -54,33 +54,39 @@ class attributes final {
 private:
    const ::shared_ptr<term> _timestamp;
    const ::shared_ptr<term> _time_to_live;
+    const ::shared_ptr<term> _timeout;
 public:
    static std::unique_ptr<attributes> none();
 private:
-    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live);
+    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout);
 public:
-    bool uses_function(const sstring& ks_name, const sstring& function_name) const;
-
    bool is_timestamp_set() const;

    bool is_time_to_live_set() const;

+    bool is_timeout_set() const;
+
    int64_t get_timestamp(int64_t now, const query_options& options);

    int32_t get_time_to_live(const query_options& options);

+    db::timeout_clock::duration get_timeout(const query_options& options) const;
+
    void collect_marker_specification(variable_specifications& bound_names) const;

    class raw final {
    public:
        ::shared_ptr<term::raw> timestamp;
        ::shared_ptr<term::raw> time_to_live;
+        ::shared_ptr<term::raw> timeout;

        std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name) const;
    private:
        lw_shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;

        lw_shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
+
+        lw_shared_ptr<column_specification> timeout_receiver(const sstring& ks_name, const sstring& cf_name) const;
    };
 };

--- a/cql3/column_condition.cc
+++ b/cql3/column_condition.cc
@@ -48,13 +48,14 @@
 #include "types/map.hh"
 #include "types/list.hh"
 #include "utils/like_matcher.hh"
+#include "expr/expression.hh"

 namespace {

-void validate_operation_on_durations(const abstract_type& type, const cql3::operator_type& op) {
+void validate_operation_on_durations(const abstract_type& type, cql3::expr::oper_t op) {
    using cql3::statements::request_validations::check_false;

-    if (op.is_slice() && type.references_duration()) {
+    if (is_slice(op) && type.references_duration()) {
        check_false(type.is_collection(), "Slice conditions are not supported on collections containing durations");
        check_false(type.is_tuple(), "Slice conditions are not supported on tuples containing durations");
        check_false(type.is_user_type(), "Slice conditions are not supported on UDTs containing durations");
@@ -64,7 +65,7 @@ void validate_operation_on_durations(const abstract_type& type, const cql3::oper
    }
 }

-int is_satisfied_by(const cql3::operator_type &op, const abstract_type& cell_type,
+int is_satisfied_by(cql3::expr::oper_t op, const abstract_type& cell_type,
        const abstract_type& param_type, const data_value& cell_value, const bytes& param) {

        int rc;
@@ -82,21 +83,24 @@ int is_satisfied_by(const cql3::operator_type &op, const abstract_type& cell_typ
        } else {
            rc = cell_type.compare(cell_type.decompose(cell_value), param);
        }
-        if (op == cql3::operator_type::EQ) {
+        switch (op) {
+            using cql3::expr::oper_t;
+        case oper_t::EQ:
            return rc == 0;
-        } else if (op == cql3::operator_type::NEQ) {
+        case oper_t::NEQ:
            return rc != 0;
-        } else if (op == cql3::operator_type::GTE) {
+        case oper_t::GTE:
            return rc >= 0;
-        } else if (op == cql3::operator_type::LTE) {
+        case oper_t::LTE:
            return rc <= 0;
-        } else if (op == cql3::operator_type::GT) {
+        case oper_t::GT:
            return rc > 0;
-        } else if (op == cql3::operator_type::LT) {
+        case oper_t::LT:
            return rc < 0;
+        default:
+            assert(false);
+            return false;
        }
-        assert(false);
-        return false;
 }

 // Read the list index from key and check that list index is not
@@ -114,24 +118,6 @@ uint32_t read_and_check_list_index(const cql3::raw_value_view& key) {

 namespace cql3 {

-bool
-column_condition::uses_function(const sstring& ks_name, const sstring& function_name) const {
-    if (bool(_collection_element) && _collection_element->uses_function(ks_name, function_name)) {
-        return true;
-    }
-    if (bool(_value) && _value->uses_function(ks_name, function_name)) {
-        return true;
-    }
-    if (!_in_values.empty()) {
-        for (auto&& value : _in_values) {
-            if (bool(value) && value->uses_function(ks_name, function_name)) {
-                return true;
-            }
-        }
-    }
-    return false;
-}
-
 void column_condition::collect_marker_specificaton(variable_specifications& bound_names) const {
    if (_collection_element) {
        _collection_element->collect_marker_specification(bound_names);
@@ -223,7 +209,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        }
    }

-    if (_op.is_compare()) {
+    if (is_compare(_op)) {
        // <, >, >=, <=, !=
        cql3::raw_value_view param = _value->bind_and_get(options);

@@ -231,23 +217,23 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
            throw exceptions::invalid_request_exception("Invalid 'unset' value in condition");
        }
        if (param.is_null()) {
-            if (_op == operator_type::EQ) {
+            if (_op == expr::oper_t::EQ) {
                return cell_value == nullptr;
-            } else if (_op == operator_type::NEQ) {
+            } else if (_op == expr::oper_t::NEQ) {
                return cell_value != nullptr;
            } else {
                throw exceptions::invalid_request_exception(format("Invalid comparison with null for operator \"{}\"", _op));
            }
        } else if (cell_value == nullptr) {
            // The condition parameter is not null, so only NEQ can return true
-            return _op == operator_type::NEQ;
+            return _op == expr::oper_t::NEQ;
        }
        // type::validate() is called by bind_and_get(), so it's safe to pass to_bytes() result
        // directly to compare.
        return is_satisfied_by(_op, *cell_value->type(), *column.type, *cell_value, to_bytes(param));
    }

-    if (_op == operator_type::LIKE) {
+    if (_op == expr::oper_t::LIKE) {
        if (cell_value == nullptr) {
            return false;
        }
@@ -266,7 +252,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
        }
    }

-    assert(_op == operator_type::IN);
+    assert(_op == expr::oper_t::IN);

    std::vector<bytes_opt> in_values;

@@ -284,7 +270,7 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
    // If cell value is NULL, IN list must contain NULL or an empty set/list. Otherwise it must contain cell value.
    if (cell_value) {
        return std::any_of(in_values.begin(), in_values.end(), [this, cell_value] (const bytes_opt& value) {
-            return value.has_value() && is_satisfied_by(operator_type::EQ, *cell_value->type(), *column.type, *cell_value, *value);
+            return value.has_value() && is_satisfied_by(expr::oper_t::EQ, *cell_value->type(), *column.type, *cell_value, *value);
        });
    } else {
        return std::any_of(in_values.begin(), in_values.end(), [] (const bytes_opt& value) { return !value.has_value() || value->empty(); });
@@ -325,13 +311,13 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
        collection_element_term = _collection_element->prepare(db, keyspace, element_spec);
    }

-    if (_op.is_compare()) {
+    if (is_compare(_op)) {
        validate_operation_on_durations(*receiver.type, _op);
        return column_condition::condition(receiver, collection_element_term,
                _value->prepare(db, keyspace, value_spec), nullptr, _op);
    }

-    if (_op == operator_type::LIKE) {
+    if (_op == expr::oper_t::LIKE) {
        auto literal_term = dynamic_pointer_cast<constants::literal>(_value);
        if (literal_term) {
            // Pass matcher object
@@ -348,7 +334,7 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
        }
    }

-    if (_op != operator_type::IN) {
+    if (_op != expr::oper_t::IN) {
        throw exceptions::invalid_request_exception(format("Unsupported operator type {} in a condition ", _op));
    }

--- a/Show More
+++ b/Show More