mutation_writer: feed_writer(): handle exceptions from consume_end_of_stream()

Currently the exception handling code of feed_writer() assumes consume_end_of_stream() doesn't throw. This is false, and an exception from said method can currently lead to an unclean destruction of the writer and reader. Fix by also handling exceptions from consume_end_of_stream(). Closes #10147 (cherry picked from commit 1963d1cc25)
release: prepare for 4.4.9
2022-03-03 10:45:40 +01:00 · 2022-02-16 14:24:54 +02:00 · 2022-02-03 18:40:12 +02:00 · 2022-01-30 20:08:43 +02:00 · 2022-01-30 11:00:21 +02:00 · 2022-01-27 10:27:45 +02:00
1280 changed files with 26771 additions and 7780 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -79,3 +79,9 @@ db/hints/* @haaawk @piodul @vladzcloudius
 # REDIS
 redis/* @nyh @syuu1228
 redis-test/* @nyh @syuu1228
+
+# READERS
+reader_* @denesb
+querier* @denesb
+test/boost/mutation_reader_test.cc @denesb
+test/boost/querier_cache_test.cc @denesb
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,33 @@
+name: "CI Docs"
+
+on:
+  push:
+    branches:
+    - master
+    paths:
+    - 'docs/**'
+jobs:
+  release:
+    name: Build
+    runs-on: ubuntu-latest
+    env:
+      LATEST_VERSION: master
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Build docs
+      run: |
+        export PATH=$PATH:~/.local/bin
+        cd docs
+        make multiversion
+    - name: Deploy
+      run : ./docs/_utils/deploy.sh
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,5 @@ tags
 testlog
 test/*/*.reject
 .vscode
+docs/_build
+docs/poetry.lock
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,5 @@
-##
-## For best results, first compile the project using the Ninja build-system.
-##
+cmake_minimum_required(VERSION 3.18)

-cmake_minimum_required(VERSION 3.7)
 project(scylla)

 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -20,138 +17,740 @@ else()
    set(BUILD_TYPE "release")
 endif()

-if (NOT DEFINED FOR_IDE AND NOT DEFINED ENV{FOR_IDE} AND NOT DEFINED ENV{CLION_IDE})
-    message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in IDEs, please define FOR_IDE to acknowledge this.")
-endif()
-
-# These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
-# Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
-set(SEASTAR_DPDK_INCLUDE_DIRS
-        seastar/dpdk/lib/librte_eal/common/include
-        seastar/dpdk/lib/librte_eal/common/include/generic
-        seastar/dpdk/lib/librte_eal/common/include/x86
-        seastar/dpdk/lib/librte_ether)
-
-find_package(PkgConfig REQUIRED)
-
-set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/build/${BUILD_TYPE}/seastar:$ENV{PKG_CONFIG_PATH}")
-pkg_check_modules(SEASTAR seastar)
-
-if(NOT SEASTAR_INCLUDE_DIRS)
-    # Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
-    set(SEASTAR_INCLUDE_DIRS "seastar/include")
-endif()
-
-find_package(Boost COMPONENTS filesystem program_options system thread)
-
-##
-## Populate the names of all source and header files in the indicated paths in a designated variable.
-##
-## When RECURSIVE is specified, directories are traversed recursively.
-##
-## Use: scan_scylla_source_directories(VAR my_result_var [RECURSIVE] PATHS [path1 path2 ...])
-##
-function (scan_scylla_source_directories)
-    set(options RECURSIVE)
-    set(oneValueArgs VAR)
-    set(multiValueArgs PATHS)
-    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
-
-    set(globs "")
-
-    foreach (dir ${args_PATHS})
-        list(APPEND globs "${dir}/*.cc" "${dir}/*.hh")
-    endforeach()
-
-    if (args_RECURSIVE)
-        set(glob_kind GLOB_RECURSE)
+function(default_target_arch arch)
+    set(x86_instruction_sets i386 i686 x86_64)
+    if(CMAKE_SYSTEM_PROCESSOR IN_LIST x86_instruction_sets)
+        set(${arch} "westmere" PARENT_SCOPE)
+    elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") # string compare: EQUAL is numeric-only and never matches
+        set(${arch} "armv8-a+crc+crypto" PARENT_SCOPE)
    else()
-        set(glob_kind GLOB)
+        set(${arch} "" PARENT_SCOPE)
    endif()
+endfunction()
+default_target_arch(target_arch)
+if(target_arch)
+    set(target_arch_flag "-march=${target_arch}")
+endif()

-    file(${glob_kind} var
-            ${globs})
+# Configure Seastar compile options to align with Scylla
+set(Seastar_CXX_FLAGS -fcoroutines ${target_arch_flag} CACHE INTERNAL "" FORCE)
+set(Seastar_CXX_DIALECT gnu++20 CACHE INTERNAL "" FORCE)

-    set(${args_VAR} ${var} PARENT_SCOPE)
+add_subdirectory(seastar)
+add_subdirectory(abseil)
+# Exclude absl::strerror from the default "all" target since it's not
+# used in Scylla build and, moreover, makes use of deprecated glibc APIs,
+# such as sys_nerr, which are not exposed from "stdio.h" since glibc 2.32,
+# which happens to be the case for recent Fedora distribution versions.
+#
+# Need to use the internal "absl_strerror" target name instead of namespaced
+# variant because `set_target_properties` does not understand the latter form,
+# unfortunately.
+set_target_properties(absl_strerror PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+# System libraries dependencies
+find_package(Boost COMPONENTS filesystem program_options system thread regex REQUIRED)
+find_package(Lua REQUIRED)
+find_package(ZLIB REQUIRED)
+find_package(ICU COMPONENTS uc REQUIRED)
+
+set(scylla_build_dir "${CMAKE_BINARY_DIR}/build/${BUILD_TYPE}")
+set(scylla_gen_build_dir "${scylla_build_dir}/gen")
+file(MAKE_DIRECTORY "${scylla_build_dir}" "${scylla_gen_build_dir}")
+
+# Place libraries, executables and archives in ${buildroot}/build/${mode}/
+foreach(mode RUNTIME LIBRARY ARCHIVE)
+    set(CMAKE_${mode}_OUTPUT_DIRECTORY "${scylla_build_dir}")
+endforeach()
+
+# Generate C++ source files from thrift definitions
+function(scylla_generate_thrift)
+    set(one_value_args TARGET VAR IN_FILE OUT_DIR SERVICE)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})
+
+    get_filename_component(in_file_name ${args_IN_FILE} NAME_WE)
+
+    set(aux_out_file_name ${args_OUT_DIR}/${in_file_name})
+    set(outputs
+        ${aux_out_file_name}_types.cpp
+        ${aux_out_file_name}_types.h
+        ${aux_out_file_name}_constants.cpp
+        ${aux_out_file_name}_constants.h
+        ${args_OUT_DIR}/${args_SERVICE}.cpp
+        ${args_OUT_DIR}/${args_SERVICE}.h)
+
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+            thrift
+        OUTPUT ${outputs}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${args_OUT_DIR}
+        COMMAND thrift -gen cpp:cob_style,no_skeleton -out "${args_OUT_DIR}" "${args_IN_FILE}")
+
+    add_custom_target(${args_TARGET}
+        DEPENDS ${outputs})
+
+    set(${args_VAR} ${outputs} PARENT_SCOPE)
 endfunction()

-## Although Seastar is an external project, it is common enough to explore the sources while doing
-## Scylla development that we'll treat the Seastar sources as part of this project for easier navigation.
-scan_scylla_source_directories(
-        VAR SEASTAR_SOURCE_FILES
-        RECURSIVE
+scylla_generate_thrift(
+    TARGET scylla_thrift_gen_cassandra
+    VAR scylla_thrift_gen_cassandra_files
+    IN_FILE interface/cassandra.thrift
+    OUT_DIR ${scylla_gen_build_dir}
+    SERVICE Cassandra)

-        PATHS
-          seastar/core
-          seastar/http
-          seastar/json
-          seastar/net
-          seastar/rpc
-          seastar/testing
-          seastar/util)
+# Parse antlr3 grammar files and generate C++ sources
+function(scylla_generate_antlr3)
+    set(one_value_args TARGET VAR IN_FILE OUT_DIR)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})

-scan_scylla_source_directories(
-        VAR SCYLLA_ROOT_SOURCE_FILES
-        PATHS .)
+    get_filename_component(in_file_pure_name ${args_IN_FILE} NAME)
+    get_filename_component(stem ${in_file_pure_name} NAME_WE)

-scan_scylla_source_directories(
-        VAR SCYLLA_SUB_SOURCE_FILES
-        RECURSIVE
+    set(outputs
+        "${args_OUT_DIR}/${stem}Lexer.hpp"
+        "${args_OUT_DIR}/${stem}Lexer.cpp"
+        "${args_OUT_DIR}/${stem}Parser.hpp"
+        "${args_OUT_DIR}/${stem}Parser.cpp")

-        PATHS
-          api
-          auth
-          cql3
-          db
-          dht
-          exceptions
-          gms
-          index
-          io
-          locator
-          message
-          raft
-          repair
-          service
-          sstables
-          streaming
-          test
-          thrift
-          tracing
-          transport
-          utils)
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+        OUTPUT ${outputs}
+        # Remove #ifdef'ed code from the grammar source code
+        COMMAND sed -e "/^#if 0/,/^#endif/d" "${args_IN_FILE}" > "${args_OUT_DIR}/${in_file_pure_name}"
+        COMMAND antlr3 "${args_OUT_DIR}/${in_file_pure_name}"
+        # We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
+        # Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
+        # name, we also add a global typedef to avoid compilation errors.
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Lexer.hpp"
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Lexer.cpp"
+        COMMAND sed -i -e "/^.*On :.*$/d" "${args_OUT_DIR}/${stem}Parser.hpp"
+        COMMAND sed -i
+            -e "s/^\\( *\\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$/\\1const \\2/"
+            -e "/^.*On :.*$/d"
+            -e "1i using ExceptionBaseType = int;"
+            -e "s/^{/{ ExceptionBaseType\\* ex = nullptr;/; s/ExceptionBaseType\\* ex = new/ex = new/; s/exceptions::syntax_exception e/exceptions::syntax_exception\\& e/"
+            "${args_OUT_DIR}/${stem}Parser.cpp"
+        VERBATIM)

-scan_scylla_source_directories(
-        VAR SCYLLA_GEN_SOURCE_FILES
-        RECURSIVE
-        PATHS build/${BUILD_TYPE}/gen)
+    add_custom_target(${args_TARGET}
+        DEPENDS ${outputs})

-set(SCYLLA_SOURCE_FILES
-        ${SCYLLA_ROOT_SOURCE_FILES}
-        ${SCYLLA_GEN_SOURCE_FILES}
-        ${SCYLLA_SUB_SOURCE_FILES})
+    set(${args_VAR} ${outputs} PARENT_SCOPE)
+endfunction()
+
+set(antlr3_grammar_files
+    cql3/Cql.g
+    alternator/expressions.g)
+
+set(antlr3_gen_files)
+
+foreach(f ${antlr3_grammar_files})
+    get_filename_component(grammar_file_name "${f}" NAME_WE)
+    get_filename_component(f_dir "${f}" DIRECTORY)
+    scylla_generate_antlr3(
+        TARGET scylla_antlr3_gen_${grammar_file_name}
+        VAR scylla_antlr3_gen_${grammar_file_name}_files
+        IN_FILE ${f}
+        OUT_DIR ${scylla_gen_build_dir}/${f_dir})
+    list(APPEND antlr3_gen_files "${scylla_antlr3_gen_${grammar_file_name}_files}")
+endforeach()
+
+# Generate C++ sources from ragel grammar files
+seastar_generate_ragel(
+    TARGET scylla_ragel_gen_protocol_parser
+    VAR scylla_ragel_gen_protocol_parser_file
+    IN_FILE redis/protocol_parser.rl
+    OUT_FILE ${scylla_gen_build_dir}/redis/protocol_parser.hh)
+
+# Generate C++ sources from Swagger definitions
+set(swagger_files
+    api/api-doc/cache_service.json
+    api/api-doc/collectd.json
+    api/api-doc/column_family.json
+    api/api-doc/commitlog.json
+    api/api-doc/compaction_manager.json
+    api/api-doc/config.json
+    api/api-doc/endpoint_snitch_info.json
+    api/api-doc/error_injection.json
+    api/api-doc/failure_detector.json
+    api/api-doc/gossiper.json
+    api/api-doc/hinted_handoff.json
+    api/api-doc/lsa.json
+    api/api-doc/messaging_service.json
+    api/api-doc/storage_proxy.json
+    api/api-doc/storage_service.json
+    api/api-doc/stream_manager.json
+    api/api-doc/system.json
+    api/api-doc/utils.json)
+
+set(swagger_gen_files)
+
+foreach(f ${swagger_files})
+    get_filename_component(fname "${f}" NAME_WE)
+    get_filename_component(dir "${f}" DIRECTORY)
+    seastar_generate_swagger(
+        TARGET scylla_swagger_gen_${fname}
+        VAR scylla_swagger_gen_${fname}_files
+        IN_FILE "${f}"
+        OUT_DIR "${scylla_gen_build_dir}/${dir}")
+    list(APPEND swagger_gen_files "${scylla_swagger_gen_${fname}_files}")
+endforeach()
+
+# Create C++ bindings for IDL serializers
+function(scylla_generate_idl_serializer)
+    set(one_value_args TARGET VAR IN_FILE OUT_FILE)
+    cmake_parse_arguments(args "" "${one_value_args}" "" ${ARGN})
+    get_filename_component(out_dir ${args_OUT_FILE} DIRECTORY)
+    set(idl_compiler "${CMAKE_SOURCE_DIR}/idl-compiler.py")
+
+    find_package(Python3 REQUIRED COMPONENTS Interpreter) # fail at configure time if the interpreter used below is missing
+
+    add_custom_command(
+        DEPENDS
+            ${args_IN_FILE}
+            ${idl_compiler}
+        OUTPUT ${args_OUT_FILE}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${out_dir}
+        COMMAND Python3::Interpreter ${idl_compiler} --ns ser -f ${args_IN_FILE} -o ${args_OUT_FILE})
+
+    add_custom_target(${args_TARGET}
+        DEPENDS ${args_OUT_FILE})
+
+    set(${args_VAR} ${args_OUT_FILE} PARENT_SCOPE)
+endfunction()
+
+set(idl_serializers
+    idl/cache_temperature.idl.hh
+    idl/commitlog.idl.hh
+    idl/consistency_level.idl.hh
+    idl/frozen_mutation.idl.hh
+    idl/frozen_schema.idl.hh
+    idl/gossip_digest.idl.hh
+    idl/idl_test.idl.hh
+    idl/keys.idl.hh
+    idl/messaging_service.idl.hh
+    idl/mutation.idl.hh
+    idl/paging_state.idl.hh
+    idl/partition_checksum.idl.hh
+    idl/paxos.idl.hh
+    idl/query.idl.hh
+    idl/range.idl.hh
+    idl/read_command.idl.hh
+    idl/reconcilable_result.idl.hh
+    idl/replay_position.idl.hh
+    idl/result.idl.hh
+    idl/ring_position.idl.hh
+    idl/streaming.idl.hh
+    idl/token.idl.hh
+    idl/tracing.idl.hh
+    idl/truncation_record.idl.hh
+    idl/uuid.idl.hh
+    idl/view.idl.hh)
+
+set(idl_gen_files)
+
+foreach(f ${idl_serializers})
+    get_filename_component(idl_name "${f}" NAME)
+    get_filename_component(idl_target "${idl_name}" NAME_WE)
+    get_filename_component(idl_dir "${f}" DIRECTORY)
+    string(REPLACE ".idl.hh" ".dist.hh" idl_out_hdr_name "${idl_name}")
+    scylla_generate_idl_serializer(
+        TARGET scylla_idl_gen_${idl_target}
+        VAR scylla_idl_gen_${idl_target}_files
+        IN_FILE ${f}
+        OUT_FILE ${scylla_gen_build_dir}/${idl_dir}/${idl_out_hdr_name})
+    list(APPEND idl_gen_files "${scylla_idl_gen_${idl_target}_files}")
+endforeach()
+
+set(scylla_sources
+    absl-flat_hash_map.cc
+    alternator/auth.cc
+    alternator/base64.cc
+    alternator/conditions.cc
+    alternator/executor.cc
+    alternator/expressions.cc
+    alternator/serialization.cc
+    alternator/server.cc
+    alternator/stats.cc
+    alternator/streams.cc
+    api/api.cc
+    api/cache_service.cc
+    api/collectd.cc
+    api/column_family.cc
+    api/commitlog.cc
+    api/compaction_manager.cc
+    api/config.cc
+    api/endpoint_snitch.cc
+    api/error_injection.cc
+    api/failure_detector.cc
+    api/gossiper.cc
+    api/hinted_handoff.cc
+    api/lsa.cc
+    api/messaging_service.cc
+    api/storage_proxy.cc
+    api/storage_service.cc
+    api/stream_manager.cc
+    api/system.cc
+    atomic_cell.cc
+    auth/allow_all_authenticator.cc
+    auth/allow_all_authorizer.cc
+    auth/authenticated_user.cc
+    auth/authentication_options.cc
+    auth/authenticator.cc
+    auth/common.cc
+    auth/default_authorizer.cc
+    auth/password_authenticator.cc
+    auth/passwords.cc
+    auth/permission.cc
+    auth/permissions_cache.cc
+    auth/resource.cc
+    auth/role_or_anonymous.cc
+    auth/roles-metadata.cc
+    auth/sasl_challenge.cc
+    auth/service.cc
+    auth/standard_role_manager.cc
+    auth/transitional.cc
+    bytes.cc
+    canonical_mutation.cc
+    cdc/cdc_partitioner.cc
+    cdc/generation.cc
+    cdc/log.cc
+    cdc/metadata.cc
+    cdc/split.cc
+    clocks-impl.cc
+    collection_mutation.cc
+    compress.cc
+    connection_notifier.cc
+    converting_mutation_partition_applier.cc
+    counters.cc
+    cql3/abstract_marker.cc
+    cql3/attributes.cc
+    cql3/cf_name.cc
+    cql3/column_condition.cc
+    cql3/column_identifier.cc
+    cql3/column_specification.cc
+    cql3/constants.cc
+    cql3/cql3_type.cc
+    cql3/expr/expression.cc
+    cql3/functions/aggregate_fcts.cc
+    cql3/functions/castas_fcts.cc
+    cql3/functions/error_injection_fcts.cc
+    cql3/functions/functions.cc
+    cql3/functions/user_function.cc
+    cql3/index_name.cc
+    cql3/keyspace_element_name.cc
+    cql3/lists.cc
+    cql3/maps.cc
+    cql3/operation.cc
+    cql3/query_options.cc
+    cql3/query_processor.cc
+    cql3/relation.cc
+    cql3/restrictions/statement_restrictions.cc
+    cql3/result_set.cc
+    cql3/role_name.cc
+    cql3/selection/abstract_function_selector.cc
+    cql3/selection/selectable.cc
+    cql3/selection/selection.cc
+    cql3/selection/selector.cc
+    cql3/selection/selector_factories.cc
+    cql3/selection/simple_selector.cc
+    cql3/sets.cc
+    cql3/single_column_relation.cc
+    cql3/statements/alter_keyspace_statement.cc
+    cql3/statements/alter_table_statement.cc
+    cql3/statements/alter_type_statement.cc
+    cql3/statements/alter_view_statement.cc
+    cql3/statements/authentication_statement.cc
+    cql3/statements/authorization_statement.cc
+    cql3/statements/batch_statement.cc
+    cql3/statements/cas_request.cc
+    cql3/statements/cf_prop_defs.cc
+    cql3/statements/cf_statement.cc
+    cql3/statements/create_function_statement.cc
+    cql3/statements/create_index_statement.cc
+    cql3/statements/create_keyspace_statement.cc
+    cql3/statements/create_table_statement.cc
+    cql3/statements/create_type_statement.cc
+    cql3/statements/create_view_statement.cc
+    cql3/statements/delete_statement.cc
+    cql3/statements/drop_function_statement.cc
+    cql3/statements/drop_index_statement.cc
+    cql3/statements/drop_keyspace_statement.cc
+    cql3/statements/drop_table_statement.cc
+    cql3/statements/drop_type_statement.cc
+    cql3/statements/drop_view_statement.cc
+    cql3/statements/function_statement.cc
+    cql3/statements/grant_statement.cc
+    cql3/statements/index_prop_defs.cc
+    cql3/statements/index_target.cc
+    cql3/statements/ks_prop_defs.cc
+    cql3/statements/list_permissions_statement.cc
+    cql3/statements/list_users_statement.cc
+    cql3/statements/modification_statement.cc
+    cql3/statements/permission_altering_statement.cc
+    cql3/statements/property_definitions.cc
+    cql3/statements/raw/parsed_statement.cc
+    cql3/statements/revoke_statement.cc
+    cql3/statements/role-management-statements.cc
+    cql3/statements/schema_altering_statement.cc
+    cql3/statements/select_statement.cc
+    cql3/statements/truncate_statement.cc
+    cql3/statements/update_statement.cc
+    cql3/statements/use_statement.cc
+    cql3/token_relation.cc
+    cql3/tuples.cc
+    cql3/type_json.cc
+    cql3/untyped_result_set.cc
+    cql3/update_parameters.cc
+    cql3/user_types.cc
+    cql3/ut_name.cc
+    cql3/util.cc
+    cql3/values.cc
+    cql3/variable_specifications.cc
+    data/cell.cc
+    database.cc
+    db/batchlog_manager.cc
+    db/commitlog/commitlog.cc
+    db/commitlog/commitlog_entry.cc
+    db/commitlog/commitlog_replayer.cc
+    db/config.cc
+    db/consistency_level.cc
+    db/cql_type_parser.cc
+    db/data_listeners.cc
+    db/extensions.cc
+    db/heat_load_balance.cc
+    db/hints/manager.cc
+    db/hints/resource_manager.cc
+    db/large_data_handler.cc
+    db/legacy_schema_migrator.cc
+    db/marshal/type_parser.cc
+    db/schema_tables.cc
+    db/size_estimates_virtual_reader.cc
+    db/snapshot-ctl.cc
+    db/sstables-format-selector.cc
+    db/system_distributed_keyspace.cc
+    db/system_keyspace.cc
+    db/view/row_locking.cc
+    db/view/view.cc
+    db/view/view_update_generator.cc
+    dht/boot_strapper.cc
+    dht/i_partitioner.cc
+    dht/murmur3_partitioner.cc
+    dht/range_streamer.cc
+    dht/token.cc
+    distributed_loader.cc
+    duration.cc
+    exceptions/exceptions.cc
+    flat_mutation_reader.cc
+    frozen_mutation.cc
+    frozen_schema.cc
+    gms/application_state.cc
+    gms/endpoint_state.cc
+    gms/failure_detector.cc
+    gms/feature_service.cc
+    gms/gossip_digest_ack.cc
+    gms/gossip_digest_ack2.cc
+    gms/gossip_digest_syn.cc
+    gms/gossiper.cc
+    gms/inet_address.cc
+    gms/version_generator.cc
+    gms/versioned_value.cc
+    hashers.cc
+    index/secondary_index.cc
+    index/secondary_index_manager.cc
+    init.cc
+    keys.cc
+    lister.cc
+    locator/abstract_replication_strategy.cc
+    locator/ec2_multi_region_snitch.cc
+    locator/ec2_snitch.cc
+    locator/everywhere_replication_strategy.cc
+    locator/gce_snitch.cc
+    locator/gossiping_property_file_snitch.cc
+    locator/local_strategy.cc
+    locator/network_topology_strategy.cc
+    locator/production_snitch_base.cc
+    locator/rack_inferring_snitch.cc
+    locator/simple_snitch.cc
+    locator/simple_strategy.cc
+    locator/snitch_base.cc
+    locator/token_metadata.cc
+    lua.cc
+    main.cc
+    memtable.cc
+    message/messaging_service.cc
+    multishard_mutation_query.cc
+    mutation.cc
+    raft/fsm.cc
+    raft/log.cc
+    raft/progress.cc
+    raft/raft.cc
+    raft/server.cc
+    mutation_fragment.cc
+    mutation_partition.cc
+    mutation_partition_serializer.cc
+    mutation_partition_view.cc
+    mutation_query.cc
+    mutation_reader.cc
+    mutation_writer/multishard_writer.cc
+    mutation_writer/shard_based_splitting_writer.cc
+    mutation_writer/timestamp_based_splitting_writer.cc
+    mutation_writer/feed_writers.cc
+    partition_slice_builder.cc
+    partition_version.cc
+    querier.cc
+    query-result-set.cc
+    query.cc
+    range_tombstone.cc
+    range_tombstone_list.cc
+    reader_concurrency_semaphore.cc
+    redis/abstract_command.cc
+    redis/command_factory.cc
+    redis/commands.cc
+    redis/keyspace_utils.cc
+    redis/lolwut.cc
+    redis/mutation_utils.cc
+    redis/options.cc
+    redis/query_processor.cc
+    redis/query_utils.cc
+    redis/server.cc
+    redis/service.cc
+    redis/stats.cc
+    repair/repair.cc
+    repair/row_level.cc
+    row_cache.cc
+    schema.cc
+    schema_mutations.cc
+    schema_registry.cc
+    service/client_state.cc
+    service/migration_manager.cc
+    service/migration_task.cc
+    service/misc_services.cc
+    service/pager/paging_state.cc
+    service/pager/query_pagers.cc
+    service/paxos/paxos_state.cc
+    service/paxos/prepare_response.cc
+    service/paxos/prepare_summary.cc
+    service/paxos/proposal.cc
+    service/priority_manager.cc
+    service/storage_proxy.cc
+    service/storage_service.cc
+    sstables/compaction.cc
+    sstables/compaction_manager.cc
+    sstables/compaction_strategy.cc
+    sstables/compress.cc
+    sstables/integrity_checked_file_impl.cc
+    sstables/kl/writer.cc
+    sstables/leveled_compaction_strategy.cc
+    sstables/m_format_read_helpers.cc
+    sstables/metadata_collector.cc
+    sstables/mp_row_consumer.cc
+    sstables/mx/writer.cc
+    sstables/partition.cc
+    sstables/prepended_input_stream.cc
+    sstables/random_access_reader.cc
+    sstables/size_tiered_compaction_strategy.cc
+    sstables/sstable_directory.cc
+    sstables/sstable_version.cc
+    sstables/sstables.cc
+    sstables/sstables_manager.cc
+    sstables/time_window_compaction_strategy.cc
+    sstables/writer.cc
+    streaming/progress_info.cc
+    streaming/session_info.cc
+    streaming/stream_coordinator.cc
+    streaming/stream_manager.cc
+    streaming/stream_plan.cc
+    streaming/stream_reason.cc
+    streaming/stream_receive_task.cc
+    streaming/stream_request.cc
+    streaming/stream_result_future.cc
+    streaming/stream_session.cc
+    streaming/stream_session_state.cc
+    streaming/stream_summary.cc
+    streaming/stream_task.cc
+    streaming/stream_transfer_task.cc
+    table.cc
+    table_helper.cc
+    thrift/controller.cc
+    thrift/handler.cc
+    thrift/server.cc
+    thrift/thrift_validation.cc
+    timeout_config.cc
+    tracing/trace_keyspace_helper.cc
+    tracing/trace_state.cc
+    tracing/traced_file.cc
+    tracing/tracing.cc
+    tracing/tracing_backend_registry.cc
+    transport/controller.cc
+    transport/cql_protocol_extension.cc
+    transport/event.cc
+    transport/event_notifier.cc
+    transport/messages/result_message.cc
+    transport/server.cc
+    types.cc
+    unimplemented.cc
+    utils/UUID_gen.cc
+    utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc
+    utils/array-search.cc
+    utils/ascii.cc
+    utils/big_decimal.cc
+    utils/bloom_calculations.cc
+    utils/bloom_filter.cc
+    utils/buffer_input_stream.cc
+    utils/build_id.cc
+    utils/config_file.cc
+    utils/directories.cc
+    utils/disk-error-handler.cc
+    utils/dynamic_bitset.cc
+    utils/error_injection.cc
+    utils/exceptions.cc
+    utils/file_lock.cc
+    utils/generation-number.cc
+    utils/gz/crc_combine.cc
+    utils/human_readable.cc
+    utils/i_filter.cc
+    utils/large_bitset.cc
+    utils/like_matcher.cc
+    utils/limiting_data_source.cc
+    utils/logalloc.cc
+    utils/managed_bytes.cc
+    utils/multiprecision_int.cc
+    utils/murmur_hash.cc
+    utils/rate_limiter.cc
+    utils/rjson.cc
+    utils/runtime.cc
+    utils/updateable_value.cc
+    utils/utf8.cc
+    utils/uuid.cc
+    validation.cc
+    vint-serialization.cc
+    zstd.cc
+    release.cc)
+
+set(scylla_gen_sources
+    "${scylla_thrift_gen_cassandra_files}"
+    "${scylla_ragel_gen_protocol_parser_file}"
+    "${swagger_gen_files}"
+    "${idl_gen_files}"
+    "${antlr3_gen_files}")

 add_executable(scylla
-        ${SEASTAR_SOURCE_FILES}
-        ${SCYLLA_SOURCE_FILES})
+    ${scylla_sources}
+    ${scylla_gen_sources})

-# If the Seastar pkg-config information is available, append to the default flags.
-#
-# For ease of browsing the source code, we always pretend that DPDK is enabled.
-target_compile_options(scylla PUBLIC
-        -std=gnu++20
-        -DHAVE_DPDK
-        -DHAVE_HWLOC
-        "${SEASTAR_CFLAGS}")
+target_link_libraries(scylla PRIVATE
+    seastar
+    # Boost dependencies
+    Boost::filesystem
+    Boost::program_options
+    Boost::system
+    Boost::thread
+    Boost::regex
+    Boost::headers
+    # Abseil libs
+    absl::hashtablez_sampler
+    absl::raw_hash_set
+    absl::synchronization
+    absl::graphcycles_internal
+    absl::stacktrace
+    absl::symbolize
+    absl::debugging_internal
+    absl::demangle_internal
+    absl::time
+    absl::time_zone
+    absl::int128
+    absl::city
+    absl::hash
+    absl::malloc_internal
+    absl::spinlock_wait
+    absl::base
+    absl::dynamic_annotations
+    absl::raw_logging_internal
+    absl::exponential_biased
+    absl::throw_delegate
+    # System libs
+    ZLIB::ZLIB
+    ICU::uc
+    systemd
+    zstd
+    snappy
+    ${LUA_LIBRARIES}
+    thrift
+    crypt)

-# The order matters here: prefer the "static" DPDK directories to any dynamic paths from pkg-config. Some files are only
-# available dynamically, though.
-target_include_directories(scylla PUBLIC
-        .
-        ${SEASTAR_DPDK_INCLUDE_DIRS}
-        ${SEASTAR_INCLUDE_DIRS}
-        ${Boost_INCLUDE_DIRS}
-        xxhash
-        libdeflate
-        abseil
-        build/${BUILD_TYPE}/gen)
+target_link_libraries(scylla PRIVATE
+    -Wl,--build-id=sha1 # Force SHA1 build-id generation
+    # TODO: Use lld linker if it's available, otherwise gold, else bfd
+    -fuse-ld=lld)
+# TODO: patch dynamic linker to match configure.py behavior
+
+target_compile_options(scylla PRIVATE
+    -std=gnu++20
+    -fcoroutines # TODO: Clang does not have this flag, adjust to both variants
+    ${target_arch_flag})
+# Hacks needed to expose internal APIs for xxhash dependencies
+target_compile_definitions(scylla PRIVATE XXH_PRIVATE_API HAVE_LZ4_COMPRESS_DEFAULT)
+
+target_include_directories(scylla PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    libdeflate
+    abseil
+    "${scylla_gen_build_dir}")
+
+###
+### Create crc_combine_table helper executable.
+### Use it to generate crc_combine_table.cc to be used in scylla at build time.
+###
+add_executable(crc_combine_table utils/gz/gen_crc_combine_table.cc)
+target_link_libraries(crc_combine_table PRIVATE seastar)
+target_include_directories(crc_combine_table PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+target_compile_options(crc_combine_table PRIVATE
+    -std=gnu++20
+    -fcoroutines
+    ${target_arch_flag})
+add_dependencies(scylla crc_combine_table)
+
+# Generate an additional source file at build time that is needed for Scylla compilation
+add_custom_command(OUTPUT "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc"
+    # The output redirection below cannot create missing parent directories, so make sure it exists first
+    COMMAND ${CMAKE_COMMAND} -E make_directory "${scylla_gen_build_dir}/utils/gz"
+    COMMAND $<TARGET_FILE:crc_combine_table> > "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc"
+    DEPENDS crc_combine_table)
+target_sources(scylla PRIVATE "${scylla_gen_build_dir}/utils/gz/crc_combine_table.cc")
+
+###
+### Generate version file and supply appropriate compile definitions for release.cc
+###
+execute_process(COMMAND ${CMAKE_SOURCE_DIR}/SCYLLA-VERSION-GEN RESULT_VARIABLE scylla_version_gen_res)
+if(scylla_version_gen_res)
+    message(SEND_ERROR "Version file generation failed. Return code: ${scylla_version_gen_res}")
+endif()
+
+file(READ build/SCYLLA-VERSION-FILE scylla_version)
+string(STRIP "${scylla_version}" scylla_version)
+
+file(READ build/SCYLLA-RELEASE-FILE scylla_release)
+string(STRIP "${scylla_release}" scylla_release)
+
+get_property(release_cdefs SOURCE "${CMAKE_SOURCE_DIR}/release.cc" PROPERTY COMPILE_DEFINITIONS)
+list(APPEND release_cdefs "SCYLLA_VERSION=\"${scylla_version}\"" "SCYLLA_RELEASE=\"${scylla_release}\"")
+set_source_files_properties("${CMAKE_SOURCE_DIR}/release.cc" PROPERTIES COMPILE_DEFINITIONS "${release_cdefs}")
+
+###
+### Custom command for building libdeflate. Link the library to scylla.
+###
+set(libdeflate_lib "${scylla_build_dir}/libdeflate/libdeflate.a")
+add_custom_command(OUTPUT "${libdeflate_lib}"
+    COMMAND make -C libdeflate
+        BUILD_DIR=../build/${BUILD_TYPE}/libdeflate/
+        CC=${CMAKE_C_COMPILER}
+        "CFLAGS=${target_arch_flag}"
+        ../build/${BUILD_TYPE}/libdeflate//libdeflate.a) # The double slash is important! (presumably BUILD_DIR's trailing slash plus the Makefile's own separator - confirm against libdeflate's Makefile)
+# Hack to force generating custom command to produce libdeflate.a
+add_custom_target(libdeflate DEPENDS "${libdeflate_lib}")
+target_link_libraries(scylla PRIVATE "${libdeflate_lib}")
+
+# TODO: create cmake/ directory and move utilities (generate functions etc) there
+# TODO: Build tests if BUILD_TESTING=on (using CTest module)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,11 +1,13 @@
-# Asking questions or requesting help
+# Contributing
+
+## Asking questions or requesting help

 Use the [ScyllaDB user mailing list](https://groups.google.com/forum/#!forum/scylladb-users) or the [Slack workspace](http://slack.scylladb.com) for general questions and help.

-# Reporting an issue
+## Reporting an issue

 Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to report issues.  Fill in as much information as you can in the issue template, especially for performance problems.

-# Contributing Code to Scylla
+## Contributing Code to Scylla

 To contribute code to Scylla, you need to sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
--- a/README.md
+++ b/README.md
@@ -78,10 +78,7 @@ and the current compatibility of this feature as well as Scylla-specific extensi

 ## Documentation

-Documentation can be found in [./docs](./docs) and on the
-[wiki](https://github.com/scylladb/scylla/wiki). There is currently no clear
-definition of what goes where, so when looking for something be sure to check
-both.
+Documentation can be found [here](https://scylla.docs.scylladb.com).
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.4.9

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
 // as internally they're stored in an array, and the order of elements is
 // not important in set equality. See issue #5021
 static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
-    if (set1.Size() != set2.Size()) {
+    if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
        return false;
    }
    std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
@@ -137,45 +137,107 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
    }
    return true;
 }
+// Moreover, the JSON being compared can be a nested document with outer
+// layers of lists and maps and some inner set - and we need to get to that
+// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
+static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
+static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
+        return false;
+    }
+    auto it1 = list1.Begin();
+    auto it2 = list2.Begin();
+    while (it1 != list1.End()) {
+        // Note: Alternator limits an item's depth (rjson::parse() limits
+        // it to around 37 levels), so this recursion is safe.
+        if (!check_EQ(&*it1, *it2)) {
+            return false;
+        }
+        ++it1;
+        ++it2;
+    }
+    return true;
+}
+static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
+    if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
+        return false;
+    }
+    for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
+        auto it2 = list2.FindMember(it1->name);
+        if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
+            return false;
+        }
+    }
+    return true;
+}

 // Check if two JSON-encoded values match with the EQ relation
 static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
-    if (!v1) {
-        return false;
-    }
-    if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
+    if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
        auto it1 = v1->MemberBegin();
        auto it2 = v2.MemberBegin();
-        if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
-            return check_EQ_for_sets(it1->value, it2->value);
+        if (it1->name != it2->name) {
+            return false;
        }
+        if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
+            return check_EQ_for_sets(it1->value, it2->value);
+        } else if(it1->name == "L") {
+            return check_EQ_for_lists(it1->value, it2->value);
+        } else if(it1->name == "M") {
+            return check_EQ_for_maps(it1->value, it2->value);
+        } else {
+            // Other, non-nested types (number, string, etc.) can be compared
+            // literally, comparing their JSON representation.
+            return it1->value == it2->value;
+        }
+    } else {
+        // If v1 and/or v2 are missing (IsNull()) the result should be false.
+        // In the unlikely case that the object is malformed (issue #8070),
+        // let's also return false.
+        return false;
    }
-    return *v1 == v2;
 }

 // Check if two JSON-encoded values match with the NE relation
 static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
-    return !v1 || *v1 != v2; // null is unequal to anything.
+    return !check_EQ(v1, v2);
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error::validation(format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -279,24 +341,40 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+// Note that in particular, if the value is missing (v.IsNull()), this
+// check returns false.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -310,7 +388,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -341,56 +420,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error::validation(
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error::validation(
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error::validation(
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -437,19 +531,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -461,7 +555,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -573,7 +668,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -604,13 +700,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -59,6 +59,9 @@ public:
    static api_error invalid_signature(std::string msg) {
        return api_error("InvalidSignatureException", std::move(msg));
    }
+    static api_error missing_authentication_token(std::string msg) {
+        return api_error("MissingAuthenticationTokenException", std::move(msg));
+    }
    static api_error unrecognized_client(std::string msg) {
        return api_error("UnrecognizedClientException", std::move(msg));
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -70,6 +70,76 @@ public:
    std::string to_json() const override;
 };

+namespace parsed {
+class path;
+};
+
+// An attribute_path_map object is used to hold data for various attribute
+// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path
+// has a root attribute, which is then modified by member and index operators -
+// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
+// "[2]" index, and finally ".c" member.
+// Data can be added to an attribute_path_map using the add() function, but
+// requires that attributes with data not be *overlapping* or *conflicting*:
+//
+// 1. Two attribute paths which are identical or an ancestor of one another
+//    are considered *overlapping* and not allowed. If a.b.c has data,
+//    we can't add more data in a.b.c or any of its descendants like a.b.c.d.
+//
+// 2. Two attribute paths which need the same parent to have both a member and
+//    an index are considered *conflicting* and not allowed. E.g., if a.b has
+//    data, you can't add a[1]. The meaning of adding both would be that the
+//    attribute a is both a map and an array, which isn't sensible.
+//
+// These two requirements are common to the two places where Alternator uses
+// this abstraction to describe how a hierarchical item is to be transformed:
+//
+// 1. In ProjectionExpression: for filtering from a full top-level attribute
+//    only the parts for which user asked in ProjectionExpression.
+//
+// 2. In UpdateExpression: for taking the previous value of a top-level
+//    attribute, and modifying it based on the instructions the user
+//    wrote in UpdateExpression.
+
+template<typename T>
+class attribute_path_map_node {
+public:
+    using data_t = T;
+    // We need the extra shared_ptr<> here because libstdc++ unordered_map
+    // doesn't work with incomplete types :-( We couldn't use lw_shared_ptr<>
+    // because it doesn't work for incomplete types either. We couldn't use
+    // std::unique_ptr<> because it makes the entire object uncopyable. We
+    // don't often need to copy such a map, but we do have some code that
+    // copies an attrs_to_get object, and is hard to find and remove.
+    // The shared_ptr should never be null.
+    using members_t =  std::unordered_map<std::string, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The indexes list is sorted because DynamoDB requires handling writes
+    // beyond the end of a list in index order.
+    using indexes_t = std::map<unsigned, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The prohibition on "overlap" and "conflict" explained above means
+    // that only one of data, members or indexes is non-empty.
+    std::optional<std::variant<data_t, members_t, indexes_t>> _content;
+
+    bool is_empty() const { return !_content; }
+    bool has_value() const { return _content && std::holds_alternative<data_t>(*_content); }
+    bool has_members() const { return _content && std::holds_alternative<members_t>(*_content); }
+    bool has_indexes() const { return _content && std::holds_alternative<indexes_t>(*_content); }
+    // get_members() assumes that has_members() is true
+    members_t& get_members() { return std::get<members_t>(*_content); }
+    const members_t& get_members() const { return std::get<members_t>(*_content); }
+    indexes_t& get_indexes() { return std::get<indexes_t>(*_content); }
+    const indexes_t& get_indexes() const { return std::get<indexes_t>(*_content); }
+    T& get_value() { return std::get<T>(*_content); }
+    const T& get_value() const { return std::get<T>(*_content); }
+};
+
+template<typename T>
+using attribute_path_map = std::unordered_map<std::string, attribute_path_map_node<T>>;
+
+using attrs_to_get_node = attribute_path_map_node<std::monostate>;
+using attrs_to_get = attribute_path_map<std::monostate>;
+
+
 class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
@@ -121,6 +191,10 @@ public:

    static sstring table_name(const schema&);
    static db::timeout_clock::time_point default_timeout();
+    static void set_default_timeout(db::timeout_clock::duration timeout);
+private:
+    static db::timeout_clock::duration s_default_timeout;
+public:
    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);

 private:
@@ -136,16 +210,14 @@ public:
        const query::partition_slice&,
        const cql3::selection::selection&,
        const query::result&,
-        const std::unordered_set<std::string>&);
+        const attrs_to_get&);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<bytes_opt>&,
-        const std::unordered_set<std::string>&,
+        const attrs_to_get&,
        rjson::value&,
        bool = false);

-
-
    void add_stream_options(const rjson::value& stream_spec, schema_builder&) const;
    void supplement_table_info(rjson::value& descr, const schema& schema) const;
    void supplement_table_stream_info(rjson::value& descr, const schema& schema) const;
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -130,6 +130,27 @@ void condition_expression::append(condition_expression&& a, char op) {
    }, _expression);
 }

+void path::check_depth_limit() {
+    if (1 + _operators.size() > depth_limit) {
+        throw expressions_syntax_error(format("Document path exceeded {} nesting levels", depth_limit));
+    }
+}
+
+std::ostream& operator<<(std::ostream& os, const path& p) {
+    os << p.root();
+    for (const auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                os << '.' << member;
+            },
+            [&] (unsigned index) {
+                os << '[' << index << ']';
+            }
+        }, op);
+    }
+    return os;
+}
+
 } // namespace parsed

 // The following resolve_*() functions resolve references in parsed
@@ -151,10 +172,9 @@ void condition_expression::append(condition_expression&& a, char op) {
 // we need to resolve the expression just once but then use it many times
 // (once for each item to be filtered).

-static void resolve_path(parsed::path& p,
+static std::optional<std::string> resolve_path_component(const std::string& column_name,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names) {
-    const std::string& column_name = p.root();
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
            throw api_error::validation(
@@ -166,7 +186,30 @@ static void resolve_path(parsed::path& p,
                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
-        p.set_root(std::string(rjson::to_string_view(*value)));
+        return std::string(rjson::to_string_view(*value));
+    }
+    return std::nullopt;
+}
+
+static void resolve_path(parsed::path& p,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) {
+    std::optional<std::string> r = resolve_path_component(p.root(), expression_attribute_names, used_attribute_names);
+    if (r) {
+        p.set_root(std::move(*r));
+    }
+    for (auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (std::string& s) {
+                r = resolve_path_component(s, expression_attribute_names, used_attribute_names);
+                if (r) {
+                    s = std::move(*r);
+                }
+            },
+            [&] (unsigned index) {
+                // nothing to resolve
+            }
+        }, op);
    }
 }

@@ -348,6 +391,39 @@ bool condition_expression_on(const parsed::condition_expression& ce, std::string
    }, ce._expression);
 }

+// for_condition_expression_on() runs a given function over all the attributes
+// mentioned in the expression. If the same attribute is mentioned more than
+// once, the function will be called more than once for the same attribute.
+
+static void for_value_on(const parsed::value& v, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::constant& c) { },
+        [&] (const parsed::value::function_call& f) {
+            for (const parsed::value& value : f._parameters) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::path& p) {
+            func(p.root());
+        }
+    }, v._value);
+}
+
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func) {
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            for (const parsed::value& value : cond._values) {
+                for_value_on(value, func);
+            }
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            for (const parsed::condition_expression& cond : list.conditions) {
+                for_condition_expression_on(cond, func);
+            }
+        }
+    }, ce._expression);
+}
+
 // The following calculate_value() functions calculate, or evaluate, a parsed
 // expression. The parsed expression is assumed to have been "resolved", with
 // the matching resolve_* function.
@@ -570,52 +646,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -634,6 +666,55 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
 };

+// Given a parsed::path and an item read from the table, extract the value
+// of a certain attribute path, such as "a" or "a.b.c[3]". Returns a null
+// value if the item or the requested attribute does not exist. The item is
+// assumed to be encoded in JSON using DynamoDB conventions - each level of
+// a nested document is a map with one key - a type (e.g., "M" for map) -
+// whose value is the representation of that value. Throws a validation
+// api_error, prefixed by "caller", if a malformed nested object is found.
+static rjson::value extract_path(const rjson::value* item,
+        const parsed::path& p, calculate_value_caller caller) {
+    if (!item) {
+        return rjson::null_value();
+    }
+    const rjson::value* v = rjson::find(*item, p.root());
+    if (!v) {
+        return rjson::null_value();
+    }
+    for (const auto& op : p.operators()) {
+        if (!v->IsObject() || v->MemberCount() != 1) {
+            // This shouldn't happen. We shouldn't have stored malformed
+            // objects. But today Alternator does not validate the structure
+            // of nested documents before storing them, so this can happen on
+            // read.
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+        }
+        const char* type = v->MemberBegin()->name.GetString();
+        v = &(v->MemberBegin()->value);
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (type[0] == 'M' && v->IsObject()) {
+                    v = rjson::find(*v, member);
+                } else {
+                    v = nullptr;
+                }
+            },
+            [&] (unsigned index) {
+                if (type[0] == 'L' && v->IsArray() && index < v->Size()) {
+                    v = &(v->GetArray()[index]);
+                } else {
+                    v = nullptr;
+                }
+            }
+        }, op);
+        if (!v) {
+            return rjson::null_value();
+        }
+    }
+    return rjson::copy(*v);
+}
+
 // Given a parsed::value, which can refer either to a constant value from
 // ExpressionAttributeValues, to the value of some attribute, or to a function
 // of other values, this function calculates the resulting value.
@@ -651,21 +732,12 @@ rjson::value calculate_value(const parsed::value& v,
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
                throw api_error::validation(
-                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
            }
            return function_it->second(caller, previous_item, f);
        },
        [&] (const parsed::path& p) -> rjson::value {
-            if (!previous_item) {
-                return rjson::null_value();
-            }
-            std::string update_path = p.root();
-            if (p.has_operators()) {
-                // FIXME: support this
-                throw api_error::validation("Reading attribute paths not yet implemented");
-            }
-            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
-            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
+            return extract_path(previous_item, p, caller);
        }
    }, v._value);
 }
--- a/alternator/expressions.hh
+++ b/alternator/expressions.hh
@@ -27,6 +27,8 @@
 #include <unordered_set>
 #include <string_view>

+#include <seastar/util/noncopyable_function.hh>
+
 #include "expressions_types.hh"
 #include "utils/rjson.hh"

@@ -59,6 +61,11 @@ void validate_value(const rjson::value& v, const char* caller);

 bool condition_expression_on(const parsed::condition_expression& ce, std::string_view attribute);

+// for_condition_expression_on() runs the given function on the attributes
+// that the expression uses. It may run for the same attribute more than once
+// if the same attribute is used more than once in the expression.
+void for_condition_expression_on(const parsed::condition_expression& ce, const noncopyable_function<void(std::string_view)>& func);
+
 // calculate_value() behaves slightly different (especially, different
 // functions supported) when used in different types of expressions, as
 // enumerated in this enum:
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -49,15 +49,23 @@ class path {
    // dot (e.g., ".xyz").
    std::string _root;
    std::vector<std::variant<std::string, unsigned>> _operators;
+    // It is useful to limit the depth of a user-specified path, because it
+    // allows us to use recursive algorithms without worrying about recursion
+    // depth. DynamoDB officially limits the length of paths to 32 components
+    // (including the root) so let's use the same limit.
+    static constexpr unsigned depth_limit = 32;
+    void check_depth_limit();
 public:
    void set_root(std::string root) {
        _root = std::move(root);
    }
    void add_index(unsigned i) {
        _operators.emplace_back(i);
+        check_depth_limit();
    }
    void add_dot(std::string(name)) {
        _operators.emplace_back(std::move(name));
+        check_depth_limit();
    }
    const std::string& root() const {
        return _root;
@@ -65,6 +73,13 @@ public:
    bool has_operators() const {
        return !_operators.empty();
    }
+    const std::vector<std::variant<std::string, unsigned>>& operators() const {
+        return _operators;
+    }
+    std::vector<std::variant<std::string, unsigned>>& operators() {
+        return _operators;
+    }
+    friend std::ostream& operator<<(std::ostream&, const path&);
 };

 // When an expression is first parsed, all constants are references, like
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -93,6 +93,10 @@ public:
                 [&] (const json::json_return_type& json_return_value) {
                     slogger.trace("api_handler success case");
                     if (json_return_value._body_writer) {
+                         // Unfortunately, write_body() forces us to choose
+                         // from a fixed and irrelevant list of "mime-types"
+                         // at this point. But we'll override it with the
+                         // one (application/x-amz-json-1.0) below.
                         rep->write_body("json", std::move(json_return_value._body_writer));
                     } else {
                         rep->_content += json_return_value._res;
@@ -105,14 +109,15 @@ public:

             return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
         });
-    }), _type("json") { }
+    }) { }

    api_handler(const api_handler&) = default;
    future<std::unique_ptr<reply>> handle(const sstring& path,
            std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
        return _f_handle(std::move(req), std::move(rep)).then(
                [this](std::unique_ptr<reply> rep) {
-                    rep->done(_type);
+                    rep->set_mime_type("application/x-amz-json-1.0");
+                    rep->done();
                    return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
                });
    }
@@ -126,7 +131,6 @@ protected:
    }

    future_handler_function _f_handle;
-    sstring _type;
 };

 class gated_handler : public handler_base {
@@ -189,27 +193,34 @@ future<> server::verify_signature(const request& req) {
    }
    auto authorization_it = req._headers.find("Authorization");
    if (authorization_it == req._headers.end()) {
-        throw api_error::invalid_signature("Authorization header is mandatory for signature verification");
+        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
-    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
+    std::string_view authorization_header = authorization_it->second;
+    auto pos = authorization_header.find_first_of(' ');
+    if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
+        throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
+    }
+    authorization_header.remove_prefix(pos+1);
    std::string credential;
    std::string user_signature;
    std::string signed_headers_str;
    std::vector<std::string_view> signed_headers;
-    for (std::string_view entry : credentials_raw) {
+    do {
+        // Either one of a comma or space can mark the end of an entry
+        pos = authorization_header.find_first_of(" ,");
+        std::string_view entry = authorization_header.substr(0, pos);
+        if (pos != std::string_view::npos) {
+            authorization_header.remove_prefix(pos + 1);
+        }
+        if (entry.empty()) {
+            continue;
+        }
        std::vector<std::string_view> entry_split = split(entry, '=');
        if (entry_split.size() != 2) {
-            if (entry != "AWS4-HMAC-SHA256") {
-                throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
-            }
            continue;
        }
        std::string_view auth_value = entry_split[1];
-        // Commas appear as an additional (quite redundant) delimiter
-        if (auth_value.back() == ',') {
-            auth_value.remove_suffix(1);
-        }
        if (entry_split[0] == "Credential") {
            credential = std::string(auth_value);
        } else if (entry_split[0] == "Signature") {
@@ -219,7 +230,8 @@ future<> server::verify_signature(const request& req) {
            signed_headers = split(auth_value, ';');
            std::sort(signed_headers.begin(), signed_headers.end());
        }
-    }
+    } while (pos != std::string_view::npos);
+
    std::vector<std::string_view> credential_split = split(credential, '/');
    if (credential_split.size() != 5) {
        throw api_error::validation(format("Incorrect credential information format: {}", credential));
@@ -243,8 +255,8 @@ future<> server::verify_signature(const request& req) {
        }
    }

-    auto cache_getter = [] (std::string username) {
-        return get_key_from_roles(cql3::get_query_processor().local(), std::move(username));
+    auto cache_getter = [&qp = _qp] (std::string username) {
+        return get_key_from_roles(qp, std::move(username));
    };
    return _key_cache.get_ptr(user, cache_getter).then([this, &req,
                                                    user = std::move(user),
@@ -328,10 +340,11 @@ void server::set_routes(routes& r) {
 //FIXME: A way to immediately invalidate the cache should be considered,
 // e.g. when the system table which stores the keys is changed.
 // For now, this propagation may take up to 1 minute.
-server::server(executor& exec)
+server::server(executor& exec, cql3::query_processor& qp)
        : _http_server("http-alternator")
        , _https_server("https-alternator")
        , _executor(exec)
+        , _qp(qp)
        , _key_cache(1024, 1min, slogger)
        , _enforce_authorization(false)
        , _enabled_servers{}
--- a/alternator/server.hh
+++ b/alternator/server.hh
@@ -41,6 +41,7 @@ class server {
    http_server _http_server;
    http_server _https_server;
    executor& _executor;
+    cql3::query_processor& _qp;

    key_cache _key_cache;
    bool _enforce_authorization;
@@ -68,7 +69,7 @@ class server {
    json_parser _json_parser;

 public:
-    server(executor& executor);
+    server(executor& executor, cql3::query_processor& qp);

    future<> init(net::inet_address addr, std::optional<uint16_t> port, std::optional<uint16_t> https_port, std::optional<tls::credentials_builder> creds,
            bool enforce_authorization, semaphore* memory_limiter);
--- a/alternator/stats.cc
+++ b/alternator/stats.cc
@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
 #define OPERATION_LATENCY(name, CamelCaseName) \
                seastar::metrics::make_histogram("op_latency", \
                        seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
+            OPERATION(batch_get_item, "BatchGetItem")
            OPERATION(batch_write_item, "BatchWriteItem")
            OPERATION(create_backup, "CreateBackup")
            OPERATION(create_global_table, "CreateGlobalTable")
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -290,7 +290,9 @@ struct sequence_number {
 sequence_number::sequence_number(std::string_view v) 
    : uuid([&] {
        using namespace boost::multiprecision;
-        uint128_t tmp{v};
+        // workaround for weird clang 10 bug when calling constructor with
+        // view directly.
+        uint128_t tmp{std::string(v)};
        // see above
        return utils::UUID_gen::get_time_UUID_raw(uint64_t(tmp >> 64), uint64_t(tmp & std::numeric_limits<uint64_t>::max()));
    }())
@@ -475,6 +477,8 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
            status = "ENABLED";
        }
    } 
+
+    auto ttl = std::chrono::seconds(opts.ttl());
    
    rjson::set(stream_desc, "StreamStatus", rjson::from_string(status));

@@ -494,20 +498,12 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // TODO: label
    // TODO: creation time

-    const auto& tm = _proxy.get_token_metadata();
-    // cannot really "resume" query, must iterate all data. because we cannot query neither "time" (pk) > something,
-    // or on expired...
-    // TODO: maybe add secondary index to topology table to enable this?
-    return _sdks.cdc_get_versioned_streams({ tm.count_normal_token_owners() }).then([this, &db, schema, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)](std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {
+    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();

-        // filter out cdc generations older than the table or now() - dynamodb_streams_max_window (24h)
-        auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - dynamodb_streams_max_window);
+    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
+    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-        auto i = topologies.lower_bound(low_ts);
-        // need first gen _intersecting_ the timestamp.
-        if (i != topologies.begin()) {
-            i = std::prev(i);
-        }
+    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([this, &db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

        auto e = topologies.end();
        auto prev = e;
@@ -515,9 +511,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

        std::optional<shard_id> last;

-        // i is now at the youngest generation we include. make a mark of it.
-        auto first = i;
-
+        auto i = topologies.begin();
        // if we're a paged query, skip to the generation where we left of.
        if (shard_start) {
            i = topologies.find(shard_start->time);
@@ -543,7 +537,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        };

        // need a prev even if we are skipping stuff
-        if (i != first) {
+        if (i != topologies.begin()) {
            prev = std::prev(i);
        }

@@ -849,17 +843,20 @@ future<executor::request_return_type> executor::get_records(client_state& client

    static const bytes timestamp_column_name = cdc::log_meta_column_name_bytes("time");
    static const bytes op_column_name = cdc::log_meta_column_name_bytes("operation");
+    static const bytes eor_column_name = cdc::log_meta_column_name_bytes("end_of_batch");

-    auto key_names = boost::copy_range<std::unordered_set<std::string>>(
+    auto key_names = boost::copy_range<attrs_to_get>(
        boost::range::join(std::move(base->partition_key_columns()), std::move(base->clustering_key_columns()))
-        | boost::adaptors::transformed([&] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([&] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );
    // Include all base table columns as values (in case pre or post is enabled).
    // This will include attributes not stored in the frozen map column
-    auto attr_names = boost::copy_range<std::unordered_set<std::string>>(base->regular_columns()
+    auto attr_names = boost::copy_range<attrs_to_get>(base->regular_columns()
        // this will include the :attrs column, which we will also force evaluating. 
        // But not having this set empty forces out any cdc columns from actual result 
-        | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );

    std::vector<const column_definition*> columns;
@@ -872,7 +869,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
    std::transform(cks.begin(), cks.end(), std::back_inserter(columns), [](auto& c) { return &c; });

    auto regular_columns = boost::copy_range<query::column_id_vector>(schema->regular_columns() 
-        | boost::adaptors::filtered([](const column_definition& cdef) { return cdef.name() == op_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); })
+        | boost::adaptors::filtered([](const column_definition& cdef) { return cdef.name() == op_column_name || cdef.name() == eor_column_name || !cdc::is_cdc_metacolumn_name(cdef.name_as_text()); })
        | boost::adaptors::transformed([&] (const column_definition& cdef) { columns.emplace_back(&cdef); return cdef.id; })
    );

@@ -882,8 +879,17 @@ future<executor::request_return_type> executor::get_records(client_state& client
    auto partition_slice = query::partition_slice(
        std::move(bounds)
        , {}, std::move(regular_columns), selection->get_query_options());
+
+	auto& opts = base->cdc_options();
+	auto mul = 2; // key-only, allow for delete + insert
+    if (opts.preimage()) {
+        ++mul;
+    }
+    if (opts.postimage()) {
+        ++mul;
+    }
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
-            query::row_limit(limit * 4));
+            query::row_limit(limit * mul));

    return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {       
@@ -905,6 +911,11 @@ future<executor::request_return_type> executor::get_records(client_state& client
                return cdef->name->name() == timestamp_column_name;
            })
        );
+        auto eor_index = std::distance(metadata.get_names().begin(), 
+            std::find_if(metadata.get_names().begin(), metadata.get_names().end(), [](const lw_shared_ptr<cql3::column_specification>& cdef) {
+                return cdef->name->name() == eor_column_name;
+            })
+        );

        std::optional<utils::UUID> timestamp;
        auto dynamodb = rjson::empty_object();
@@ -930,15 +941,7 @@ future<executor::request_return_type> executor::get_records(client_state& client
        for (auto& row : result_set->rows()) {
            auto op = static_cast<cdc::operation>(value_cast<op_utype>(data_type_for<op_utype>()->deserialize(*row[op_index])));
            auto ts = value_cast<utils::UUID>(data_type_for<utils::UUID>()->deserialize(*row[ts_index]));
-
-            if (timestamp && timestamp != ts) {
-                maybe_add_record();
-                if (limit == 0) {
-                    break;
-                }
-            }
-
-            timestamp = ts;
+            auto eor = row[eor_index].has_value() ? value_cast<bool>(boolean_type->deserialize(*row[eor_index])) : false;

            if (!dynamodb.HasMember("Keys")) {
                auto keys = rjson::empty_object();
@@ -991,9 +994,13 @@ future<executor::request_return_type> executor::get_records(client_state& client
                rjson::set(record, "eventName", "REMOVE");
                break;
            }
-        }
-        if (limit > 0 && timestamp) {
-            maybe_add_record();
+            if (eor) {
+                maybe_add_record();
+                timestamp = ts;
+                if (limit == 0) {
+                    break;
+                }
+            }
        }

        auto ret = rjson::empty_object();
@@ -1013,7 +1020,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
        }

        // ugh. figure out if we are and end-of-shard
-        return cdc::get_local_streams_timestamp().then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+        
+        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
            auto& shard = iter.shard;            

            if (shard.time < ts && ts < high_ts) {
@@ -1047,6 +1056,9 @@ void executor::add_stream_options(const rjson::value& stream_specification, sche
        if (!db.features().cluster_supports_cdc()) {
            throw api_error::validation("StreamSpecification: streams (CDC) feature not enabled in cluster.");
        }
+        if (!db.features().cluster_supports_alternator_streams()) {
+            throw api_error::validation("StreamSpecification: alternator streams feature not enabled in cluster.");
+        }

        cdc::options opts;
        opts.enabled(true);
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -2925,6 +2925,10 @@
         "id":"toppartitions_query_results",
         "description":"nodetool toppartitions query results",
         "properties":{
+            "read_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "read":{
               "type":"array",
               "items":{
@@ -2932,6 +2936,10 @@
               },
               "description":"Read results"
            },
+            "write_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "write":{
               "type":"array",
               "items":{
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -148,6 +148,30 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/gossiper/force_remove_endpoint/{addr}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Force remove an endpoint from gossip",
+               "type":"void",
+               "nickname":"force_remove_endpoint",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"addr",
+                     "description":"The endpoint address",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
      }
   ]
 }
--- a/api/api-doc/storage_proxy.json
+++ b/api/api-doc/storage_proxy.json
@@ -68,7 +68,7 @@
               "summary":"Get the hinted handoff enabled by dc",
               "type":"array",
               "items":{
-                  "type":"mapper_list"
+                  "type":"array"
               },
               "nickname":"get_hinted_handoff_enabled_by_dc",
               "produces":[
--- a/api/api_init.hh
+++ b/api/api_init.hh
@@ -24,7 +24,7 @@
 #include <seastar/http/httpd.hh>

 namespace service { class load_meter; }
-namespace locator { class token_metadata; }
+namespace locator { class shared_token_metadata; }
 namespace cql_transport { class controller; }
 class thrift_controller;
 namespace db { class snapshot_ctl; }
@@ -39,13 +39,15 @@ struct http_context {
    distributed<database>& db;
    distributed<service::storage_proxy>& sp;
    service::load_meter& lmeter;
-    const sharded<locator::token_metadata>& token_metadata;
+    const sharded<locator::shared_token_metadata>& shared_token_metadata;

    http_context(distributed<database>& _db,
            distributed<service::storage_proxy>& _sp,
-            service::load_meter& _lm, const sharded<locator::token_metadata>& _tm)
-            : db(_db), sp(_sp), lmeter(_lm), token_metadata(_tm) {
+            service::load_meter& _lm, const sharded<locator::shared_token_metadata>& _stm)
+            : db(_db), sp(_sp), lmeter(_lm), shared_token_metadata(_stm) {
    }
+
+    const locator::token_metadata& get_token_metadata();
 };

 future<> set_server_init(http_context& ctx);
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -310,7 +310,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return res;
    });

-    cf::get_column_family.set(r, [&ctx] (const_req req){
+    cf::get_column_family.set(r, [&ctx] (std::unique_ptr<request> req){
            vector<cf::column_family_info> res;
            for (auto i: ctx.db.local().get_column_families_mapping()) {
                cf::column_family_info info;
@@ -319,7 +319,7 @@ void set_column_family(http_context& ctx, routes& r) {
                info.type = "ColumnFamilies";
                res.push_back(info);
            }
-            return res;
+            return make_ready_future<json::json_return_type>(json::stream_object(std::move(res)));
        });

    cf::get_column_family_name_keyspace.set(r, [&ctx] (const_req req){
@@ -331,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
    });

    cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return map_reduce_cf(ctx, 0, [](column_family& cf) {
+        return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
            return cf.active_memtable().partition_count();
-        }, std::plus<int>());
+        }, std::plus<>());
    });

    cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
@@ -656,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -664,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -672,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -680,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -696,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -991,6 +991,9 @@ void set_column_family(http_context& ctx, routes& r) {
                        apilog.debug("toppartitions query: processing results");
                        cf::toppartitions_query_results results;

+                        results.read_cardinality = topk_results.read.size();
+                        results.write_cardinality = topk_results.write.size();
+
                        for (auto& d: topk_results.read.top(q.list_size())) {
                            cf::toppartitions_record r;
                            r.partition = sstring(d.item);
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -66,6 +66,13 @@ void set_gossiper(http_context& ctx, routes& r) {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
+
+    httpd::gossiper_json::force_remove_endpoint.set(r, [](std::unique_ptr<request> req) { // handler: force-remove one endpoint via the local gossiper
+        gms::inet_address ep(req->param["addr"]); // endpoint address comes from the "addr" request parameter
+        return gms::get_local_gossiper().force_remove_endpoint(ep).then([] { // reply only after the removal future resolves
+            return make_ready_future<json::json_return_type>(json_void()); // success carries an empty (void) JSON result
+        });
+    });
 }

 }
--- a/api/storage_proxy.cc
+++ b/api/storage_proxy.cc
@@ -201,29 +201,39 @@ void set_storage_proxy(http_context& ctx, routes& r) {
    });

    sp::get_hinted_handoff_enabled.set(r, [&ctx](std::unique_ptr<request> req)  {
-        auto enabled = ctx.db.local().get_config().hinted_handoff_enabled();
-        return make_ready_future<json::json_return_type>(enabled);
+        const auto& filter = service::get_storage_proxy().local().get_hints_host_filter();
+        return make_ready_future<json::json_return_type>(!filter.is_disabled_for_all());
    });

    sp::set_hinted_handoff_enabled.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
        auto enable = req->get_query_param("enable");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto filter = (enable == "true" || enable == "1") // only the exact strings "true" and "1" enable hints; anything else disables them
+                ? db::hints::host_filter(db::hints::host_filter::enabled_for_all_tag {})
+                : db::hints::host_filter(db::hints::host_filter::disabled_for_all_tag {});
+        return service::get_storage_proxy().invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) { // apply the same filter to every storage_proxy instance
+            return sp.change_hints_host_filter(filter);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void()); // reply with an empty result once all instances are updated
+        });
    });

    sp::get_hinted_handoff_enabled_by_dc.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
-        std::vector<sp::mapper_list> res;
+        std::vector<sstring> res; // DC names taken from the local hints host filter
+        const auto& filter = service::get_storage_proxy().local().get_hints_host_filter();
+        const auto& dcs = filter.get_dcs();
+        res.reserve(dcs.size()); // reserve for the elements about to be copied (res itself is still empty here)
+        std::copy(dcs.begin(), dcs.end(), std::back_inserter(res));
        return make_ready_future<json::json_return_type>(res);
    });

    sp::set_hinted_handoff_enabled_by_dc_list.set(r, [](std::unique_ptr<request> req)  {
-        //TBD
-        unimplemented();
-        auto enable = req->get_query_param("dcs");
-        return make_ready_future<json::json_return_type>(json_void());
+        auto dcs = req->get_query_param("dcs");
+        auto filter = db::hints::host_filter::parse_from_dc_list(std::move(dcs));
+        return service::get_storage_proxy().invoke_on_all([filter = std::move(filter)] (service::storage_proxy& sp) {
+            return sp.change_hints_host_filter(filter);
+        }).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    sp::get_max_hint_window.set(r, [](std::unique_ptr<request> req)  {
--- a/api/storage_service.cc
+++ b/api/storage_service.cc
@@ -22,6 +22,7 @@
 #include "storage_service.hh"
 #include "api/api-doc/storage_service.json.hh"
 #include "db/config.hh"
+#include "db/schema_tables.hh"
 #include <optional>
 #include <time.h>
 #include <boost/range/adaptor/map.hpp>
@@ -44,9 +45,14 @@
 #include "db/snapshot-ctl.hh"
 #include "transport/controller.hh"
 #include "thrift/controller.hh"
+#include "locator/token_metadata.hh"

 namespace api {

+const locator::token_metadata& http_context::get_token_metadata() { // convenience accessor used by the REST handlers in this file
+        return *shared_token_metadata.local().get(); // NOTE(review): if get() returns an owning pointer by value, the reference is only valid while the shared instance keeps this version alive - confirm
+}
+
 namespace ss = httpd::storage_service_json;
 using namespace json;

@@ -219,7 +225,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
            try {
                res = fut.get0();
            } catch (std::exception& e) {
-                return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
+                return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
            }
            return make_ready_future<json::json_return_type>(json::json_return_type(res));
        });
@@ -256,14 +262,14 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().sorted_tokens(), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().sorted_tokens(), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
        }));
    });

    ss::get_node_tokens.set(r, [&ctx] (std::unique_ptr<request> req) {
        gms::inet_address addr(req->param["endpoint"]);
-        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.token_metadata.local().get_tokens(addr), [](const dht::token& i) {
+        return make_ready_future<json::json_return_type>(stream_range_as_array(ctx.get_token_metadata().get_tokens(addr), [](const dht::token& i) {
           return boost::lexical_cast<std::string>(i);
       }));
    });
@@ -282,7 +288,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_leaving_nodes.set(r, [&ctx](const_req req) {
-        return container_to_vec(ctx.token_metadata.local().get_leaving_endpoints());
+        return container_to_vec(ctx.get_token_metadata().get_leaving_endpoints());
    });

    ss::get_moving_nodes.set(r, [](const_req req) {
@@ -291,7 +297,7 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::get_joining_nodes.set(r, [&ctx](const_req req) {
-        auto points = ctx.token_metadata.local().get_bootstrap_tokens();
+        auto points = ctx.get_token_metadata().get_bootstrap_tokens();
        std::unordered_set<sstring> addr;
        for (auto i: points) {
            addr.insert(boost::lexical_cast<std::string>(i.second));
@@ -360,7 +366,7 @@ void set_storage_service(http_context& ctx, routes& r) {

    ss::get_host_id_map.set(r, [&ctx](const_req req) {
        std::vector<ss::mapper> res;
-        return map_to_key_value(ctx.token_metadata.local().get_endpoint_to_host_id_map_for_reading(), res);
+        return map_to_key_value(ctx.get_token_metadata().get_endpoint_to_host_id_map_for_reading(), res);
    });

    ss::get_load.set(r, [&ctx](std::unique_ptr<request> req) {
@@ -732,9 +738,12 @@ void set_storage_service(http_context& ctx, routes& r) {
    });

    ss::reset_local_schema.set(r, [](std::unique_ptr<request> req) {
-        //TBD
-        unimplemented();
-        return make_ready_future<json::json_return_type>(json_void());
+        // FIXME: We should truncate schema tables if more than one node in the cluster.
+        auto& sp = service::get_storage_proxy();
+        auto& fs = service::get_local_storage_service().features();
+        return db::schema_tables::recalculate_schema_version(sp, fs).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
    });

    ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -82,7 +82,7 @@ static future<> create_metadata_table_if_missing_impl(
    b.set_uuid(uuid);
    schema_ptr table = b.build();
    return ignore_existing([&mm, table = std::move(table)] () {
-        return mm.announce_new_column_family(table, false);
+        return mm.announce_new_column_family(table);
    });
 }

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -154,7 +154,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp);
    }

    return make_ready_future<>();
@@ -371,10 +371,13 @@ bool is_enforcing(const service& ser)  {
    return enforcing_authorizer || enforcing_authenticator;
 }

-bool is_protected(const service& ser, const resource& r) noexcept {
-    return ser.underlying_role_manager().protected_resources().contains(r)
-            || ser.underlying_authenticator().protected_resources().contains(r)
-            || ser.underlying_authorizer().protected_resources().contains(r);
+bool is_protected(const service& ser, command_desc cmd) noexcept {
+    if (cmd.type_ == command_desc::type::ALTER_WITH_OPTS) {
+        return false; // Table attributes are OK to modify; see #7057.
+    }
+    return ser.underlying_role_manager().protected_resources().contains(cmd.resource)
+            || ser.underlying_authenticator().protected_resources().contains(cmd.resource)
+            || ser.underlying_authorizer().protected_resources().contains(cmd.resource);
 }

 static void validate_authentication_options_are_supported(
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -181,10 +181,21 @@ future<permission_set> get_permissions(const service&, const authenticated_user&
 ///
 bool is_enforcing(const service&);

+/// A description of a CQL command from which auth::service can tell whether or not this command could endanger
+/// internal data on which auth::service depends.
+struct command_desc {
+    auth::permission permission; ///< Nature of the command's alteration.
+    const ::auth::resource& resource; ///< Resource impacted by this command.
+    enum class type {
+        ALTER_WITH_OPTS, ///< Command is ALTER ... WITH ...
+        OTHER
+    } type_ = type::OTHER;
+};
+
 ///
 /// Protected resources cannot be modified even if the performer has permissions to do so.
 ///
-bool is_protected(const service&, const resource&) noexcept;
+bool is_protected(const service&, command_desc) noexcept;

 ///
 /// Create a role with optional authentication information.
--- a/bytes.hh
+++ b/bytes.hh
@@ -28,6 +28,7 @@
 #include <iosfwd>
 #include <functional>
 #include "utils/mutable_view.hh"
+#include <xxhash.h>

 using bytes = basic_sstring<int8_t, uint32_t, 31, false>;
 using bytes_view = std::basic_string_view<int8_t>;
@@ -35,6 +36,10 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::optional<bytes>;
 using sstring_view = std::string_view;

+inline bytes to_bytes(bytes&& b) { // identity overload for rvalue bytes
+    return std::move(b); // hands the argument back by move rather than copying it
+}
+
 inline sstring_view to_sstring_view(bytes_view view) {
    return {reinterpret_cast<const char*>(view.data()), view.size()};
 }
@@ -43,17 +48,6 @@ inline bytes_view to_bytes_view(sstring_view view) {
    return {reinterpret_cast<const int8_t*>(view.data()), view.size()};
 }

-namespace std {
-
-template <>
-struct hash<bytes_view> {
-    size_t operator()(bytes_view v) const {
-        return hash<sstring_view>()({reinterpret_cast<const char*>(v.begin()), v.size()});
-    }
-};
-
-}
-
 struct fmt_hex {
    bytes_view& v;
    fmt_hex(bytes_view& v) noexcept : v(v) {}
@@ -94,6 +88,30 @@ struct appending_hash<bytes_view> {
    }
 };

+struct bytes_view_hasher : public hasher { // incremental hasher backed by the XXH64 streaming API
+    XXH64_state_t _state; // running XXH64 state, (re)initialised in the constructor
+    bytes_view_hasher(uint64_t seed = 0) noexcept {
+        XXH64_reset(&_state, seed); // start a fresh digest with the given seed
+    }
+    void update(const char* ptr, size_t length) noexcept { // feed `length` bytes starting at `ptr` into the digest
+        XXH64_update(&_state, ptr, length);
+    }
+    size_t finalize() { // returns the 64-bit digest of everything fed so far, cast to size_t
+        return static_cast<size_t>(XXH64_digest(&_state));
+    }
+};
+
+namespace std {
+template <>
+struct hash<bytes_view> { // std::hash specialisation so bytes_view can key unordered containers
+    size_t operator()(bytes_view v) const {
+        bytes_view_hasher h; // default seed (0)
+        appending_hash<bytes_view>{}(h, v); // feed the view through the appending_hash protocol
+        return h.finalize();
+    }
+};
+} // namespace std
+
 inline int32_t compare_unsigned(bytes_view v1, bytes_view v2) {
  auto size = std::min(v1.size(), v2.size());
  if (size) {
--- a/bytes_ostream.hh
+++ b/bytes_ostream.hh
@@ -39,7 +39,7 @@ public:
    using size_type = bytes::size_type;
    using value_type = bytes::value_type;
    using fragment_type = bytes_view;
-    static constexpr size_type max_chunk_size() { return 128 * 1024; }
+    static constexpr size_type max_chunk_size() { return max_alloc_size() - sizeof(chunk); }
 private:
    static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
    struct chunk {
@@ -59,13 +59,21 @@ private:
        void operator delete(void* ptr) { free(ptr); }
    };
    static constexpr size_type default_chunk_size{512};
+    static constexpr size_type max_alloc_size() { return 128 * 1024; }
 private:
    std::unique_ptr<chunk> _begin;
    chunk* _current;
    size_type _size;
    size_type _initial_chunk_size = default_chunk_size;
 public:
-    class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
+    class fragment_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = bytes_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = bytes_view*;
+        using reference = bytes_view&;
+    private:
        chunk* _current = nullptr;
    public:
        fragment_iterator() = default;
@@ -125,16 +133,15 @@ private:
        return _current->size - _current->offset;
    }
    // Figure out next chunk size.
-    //   - must be enough for data_size
+    //   - must be enough for data_size + sizeof(chunk)
    //   - must be at least _initial_chunk_size
    //   - try to double each time to prevent too many allocations
-    //   - do not exceed max_chunk_size
+    //   - should not exceed max_alloc_size, unless data_size requires so
    size_type next_alloc_size(size_t data_size) const {
        auto next_size = _current
                ? _current->size * 2
                : _initial_chunk_size;
-        next_size = std::min(next_size, max_chunk_size());
-        // FIXME: check for overflow?
+        next_size = std::min(next_size, max_alloc_size());
        return std::max<size_type>(next_size, data_size + sizeof(chunk));
    }
    // Makes room for a contiguous region of given size.
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -264,6 +264,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
        }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
+        if (!_read_context->partition_exists()) {
+            return read_from_underlying(timeout);
+        }
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
        return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
@@ -461,7 +464,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
            cr.cells().prepare_hash(*_schema, column_kind::regular_column);
        }
        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
-            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.tomb(), cr.marker(), cr.cells()));
+            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.as_deletable_row()));
        new_entry->set_continuous(false);
        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version()
                                              : mp.clustered_rows().lower_bound(cr.key(), less);
@@ -508,7 +511,7 @@ void cache_flat_mutation_reader::copy_from_cache_to_buffer() {
        // This guarantees that rts starts after any emitted clustering_row
        // and not before any emitted range tombstone.
        if (!less(_lower_bound, rts.position())) {
-            rts.set_start(*_schema, _lower_bound);
+            rts.set_start(_lower_bound);
        } else {
            _lower_bound = position_in_partition(rts.position());
            _lower_bound_changed = true;
@@ -644,7 +647,7 @@ void cache_flat_mutation_reader::add_to_buffer(range_tombstone&& rt) {
        return;
    }
    if (!less(_lower_bound, rt.position())) {
-        rt.set_start(*_schema, _lower_bound);
+        rt.set_start(_lower_bound);
    } else {
        _lower_bound = position_in_partition(rt.position());
        _lower_bound_changed = true;
--- a/cartesian_product.hh
+++ b/cartesian_product.hh
@@ -33,9 +33,13 @@ template<typename T>
 struct cartesian_product {
    const std::vector<std::vector<T>>& _vec_of_vecs;
 public:
-    class iterator : public std::iterator<std::forward_iterator_tag, std::vector<T>> {
+    class iterator {
    public:
+        using iterator_category = std::forward_iterator_tag;
        using value_type = std::vector<T>;
+        using difference_type = std::ptrdiff_t;
+        using pointer = std::vector<T>*;
+        using reference = std::vector<T>&;
    private:
        size_t _pos;
        const std::vector<std::vector<T>>* _vec_of_vecs;
--- a/cdc/cdc_extension.hh
+++ b/cdc/cdc_extension.hh
@@ -20,10 +20,16 @@

 #pragma once

+#include <map>
+
+#include <seastar/core/sstring.hh>
+
+#include "bytes.hh"
 #include "serializer.hh"
 #include "db/extensions.hh"
 #include "cdc/cdc_options.hh"
 #include "schema.hh"
+#include "serializer_impl.hh"

 namespace cdc {

--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -22,10 +22,14 @@
 #include <boost/type.hpp>
 #include <random>
 #include <unordered_set>
+#include <algorithm>
 #include <seastar/core/sleep.hh>
+#include <algorithm>
+#include <seastar/core/coroutine.hh>

 #include "keys.hh"
 #include "schema_builder.hh"
+#include "database.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -36,6 +40,7 @@
 #include "gms/gossiper.hh"

 #include "cdc/generation.hh"
+#include "cdc/cdc_options.hh"

 extern logging::logger cdc_log;

@@ -174,19 +179,38 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

+std::vector<token_range_description>&& topology_description::entries() && { // rvalue-qualified overload: lets callers steal the entries from an expiring description
+    return std::move(_entries); // returns an rvalue reference to the member; nothing is moved until the caller consumes it
+}
+
+static std::vector<stream_id> create_stream_ids( // builds exactly shard_count stream ids for one token range
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    std::vector<stream_id> result;
+    result.reserve(shard_count); // one id is appended per shard below
+    dht::sharder sharder(shard_count, ignore_msb);
+    for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
+        auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx); // token picked for this shard given start/end
+        // compose the id from token and the "index" of the range end owning vnode
+        // as defined by token sort order. Basically grouping within this
+        // shard set.
+        result.emplace_back(stream_id(t, index));
+    }
+    return result;
+}
+
 class topology_description_generator final {
    const db::config& _cfg;
    const std::unordered_set<dht::token>& _bootstrap_tokens;
-    const locator::token_metadata& _token_metadata;
+    const locator::token_metadata_ptr _tmptr;
    const gms::gossiper& _gossiper;

    // Compute a set of tokens that split the token ring into vnodes
    auto get_tokens() const {
-        auto tokens = _token_metadata.sorted_tokens();
+        auto tokens = _tmptr->sorted_tokens();
        auto it = tokens.insert(
                tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
        std::sort(it, tokens.end());
@@ -201,7 +225,7 @@ class topology_description_generator final {
        if (_bootstrap_tokens.contains(end)) {
            return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
        } else {
-            auto endpoint = _token_metadata.get_endpoint(end);
+            auto endpoint = _tmptr->get_endpoint(end);
            if (!endpoint) {
                throw std::runtime_error(
                        format("Can't find endpoint for token {}", end));
@@ -217,29 +241,20 @@ class topology_description_generator final {
        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            // compose the id from token and the "index" of the range end owning vnode
-            // as defined by token sort order. Basically grouping within this
-            // shard set.
-            desc.streams.emplace_back(stream_id(t, index));
-        }
-
        return desc;
    }
 public:
    topology_description_generator(
            const db::config& cfg,
            const std::unordered_set<dht::token>& bootstrap_tokens,
-            const locator::token_metadata& token_metadata,
+            const locator::token_metadata_ptr tmptr,
            const gms::gossiper& gossiper)
        : _cfg(cfg)
        , _bootstrap_tokens(bootstrap_tokens)
-        , _token_metadata(token_metadata)
+        , _tmptr(std::move(tmptr))
        , _gossiper(gossiper)
    {}

@@ -294,23 +309,67 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// non-static for testing
+size_t limit_of_streams_in_topology_description() {
+    // Each stream takes 16B and we don't want to exceed 4MB so we can have
+    // at most 262144 streams but not less than 1 per vnode.
+    return 4 * 1024 * 1024 / 16; // 4 MiB budget / 16 B per stream = 262144; the per-vnode floor is applied by the caller
+}
+
+// non-static for testing; shrinks per-vnode stream lists so the total fits limit_of_streams_in_topology_description()
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
+    size_t streams_count = 0; // unsigned like `limit`, so the comparison below is not mixed-sign
+    for (auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+
+    size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size()); // never below one stream per entry
+    if (limit >= streams_count) {
+        return std::move(desc); // already within the limit: hand the description back unchanged
+    }
+    size_t streams_per_vnode_limit = limit / desc.entries().size(); // entries cannot be empty here: an empty description returned above
+    auto entries = std::move(desc).entries(); // take ownership of the entries via the rvalue overload
+    auto start = entries.back().token_range_end; // presumably the ring wraps, so the first range starts at the last range's end
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) { // only regenerate ranges that exceed the per-vnode cap
+            entries[idx].streams =
+                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end; // the next range begins where this one ended
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
+        const locator::token_metadata_ptr tmptr,
        const gms::gossiper& g,
        db::system_distributed_keyspace& sys_dist_ks,
        std::chrono::milliseconds ring_delay,
-        bool for_testing) {
+        bool add_delay) {
    using namespace std::chrono;
-    auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();
+    auto gen = topology_description_generator(cfg, bootstrap_tokens, tmptr, g).generate();
+
+    // If the cluster is large we may end up with a generation that contains
+    // large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with large number of rows
+    // this will lead to a row that can be as big as 32MB. This is much more
+    // than the limit imposed by commitlog_segment_size_in_mb. If the size of
+    // the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid such problem we make sure that such row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));

    // Begin the race.
    auto ts = db_clock::now() + (
-            (for_testing || ring_delay == milliseconds(0)) ? milliseconds(0) : (
+            (!add_delay || ring_delay == milliseconds(0)) ? milliseconds(0) : (
                2 * ring_delay + duration_cast<milliseconds>(generation_leeway)));
-    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tm.count_normal_token_owners() }).get();
+    sys_dist_ks.insert_cdc_topology_description(ts, std::move(gen), { tmptr->count_normal_token_owners() }).get();

    return ts;
 }
@@ -321,31 +380,23 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
 }

-// Run inside seastar::async context.
-static void do_update_streams_description(
+static future<> do_update_streams_description(
        db_clock::time_point streams_ts,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
+    if (co_await sys_dist_ks.cdc_desc_exists(streams_ts, ctx)) {
+        cdc_log.info("Generation {}: streams description table already updated.", streams_ts);
+        co_return;
    }

    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.

-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    auto topo = co_await sys_dist_ks.read_cdc_topology_description(streams_ts, ctx);
    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+        throw no_generation_data_exception(streams_ts);
    }

-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    co_await sys_dist_ks.create_cdc_desc(streams_ts, *topo, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
 }

@@ -355,7 +406,7 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source& abort_src) {
    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
    } catch(...) {
        cdc_log.warn(
            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
@@ -368,7 +419,7 @@ void update_streams_description(
            while (true) {
                sleep_abortable(std::chrono::seconds(60), abort_src).get();
                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
                    return;
                } catch (...) {
                    cdc_log.warn(
@@ -380,4 +431,176 @@ void update_streams_description(
    }
 }

+static db_clock::time_point as_timepoint(const utils::UUID& uuid) { // converts a UUID's embedded timestamp to a db_clock time point
+    return db_clock::time_point{std::chrono::milliseconds(utils::UUID_gen::get_adjusted_timestamp(uuid))}; // the adjusted timestamp is interpreted as milliseconds
+}
+
+static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps( // fetches the v1 description timestamps, retrying until the read succeeds
+        db::system_distributed_keyspace& sys_dist_ks,
+        abort_source& abort_src,
+        const noncopyable_function<unsigned()>& get_num_token_owners) {
+    while (true) { // loop ends only via co_return on success or an exception escaping the sleep below
+        try {
+            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() }); // token-owner count is re-evaluated on every attempt
+        } catch (...) {
+            cdc_log.warn(
+                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
+                    std::current_exception());
+        }
+        co_await sleep_abortable(std::chrono::seconds(60), abort_src); // back off before retrying; this sleep is outside the try, so its exceptions propagate
+    }
+}
+
+// Contains a CDC log table's creation time (extracted from its schema's id)
+// and its CDC TTL setting.
+struct time_and_ttl {
+    db_clock::time_point creation_time; // derived from the log table's schema id
+    int ttl; // used as a number of seconds; 0 is treated as "entries never expire"
+};
+
+/*
+ * See `maybe_rewrite_streams_descriptions`.
+ * This is the long-running-in-the-background part of that function.
+ * It returns the timestamp of the last rewritten generation (if any).
+ */
+static future<std::optional<db_clock::time_point>> rewrite_streams_descriptions(
+        std::vector<time_and_ttl> times_and_ttls,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    cdc_log.info("Retrieving generation timestamps for rewriting...");
+    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners); // retries internally until it succeeds
+    cdc_log.info("Generation timestamps retrieved.");
+
+    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
+    // This predicate is monotonic w.r.t the timestamps.
+    auto now = db_clock::now();
+    std::sort(tss.begin(), tss.end()); // partition_point below requires the timestamps in ascending order
+    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
+        // partition_point finds first element that does *not* satisfy the predicate.
+        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
+                [&] (const time_and_ttl& tat) {
+            // In this CDC log table there are no entries older than the table's creation time
+            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
+            // If ttl is set to 0 then entries in this table never expire. In that case we look
+            // only at the table's creation time.
+            auto no_entries_older_than =
+                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
+                    - std::chrono::seconds(10);
+            return no_entries_older_than < ts;
+        });
+    });
+
+    // Find first generation timestamp such that some CDC log table may contain data in this generation.
+    // This and all later generations need to be written to the new streams table.
+    if (first != tss.begin()) {
+        --first; // step back one generation: the one in effect just before the boundary found above
+    }
+
+    if (first == tss.end()) { // after the step back this can only hold when tss is empty
+        cdc_log.info("No generations to rewrite.");
+        co_return std::nullopt;
+    }
+
+    cdc_log.info("First generation to rewrite: {}", *first);
+
+    bool each_success = true; // cleared when any generation is given up on for lack of data
+    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> { // 10 is passed as the concurrency argument
+        while (true) {
+            try {
+                co_return co_await do_update_streams_description(ts, *sys_dist_ks, { get_num_token_owners() });
+            } catch (const no_generation_data_exception& e) { // missing generation data is treated as permanent: no retry
+                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
+                each_success = false;
+                co_return;
+            } catch (...) { // any other failure is retried after the sleep below
+                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
+            }
+            co_await sleep_abortable(std::chrono::seconds(60), abort_src);
+        }
+    });
+
+    if (each_success) {
+        cdc_log.info("Rewriting stream tables finished successfully.");
+    } else {
+        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
+    }
+
+    if (first != tss.end()) { // NOTE(review): appears always true here, since the first == tss.end() case returned above
+        co_return *std::prev(tss.end()); // the largest (latest) timestamp, as tss is sorted ascending
+    }
+
+    co_return std::nullopt; // NOTE(review): looks unreachable given the earlier return; kept as a defensive fallback
+}
+
+future<> maybe_rewrite_streams_descriptions( // decides whether old stream descriptions need rewriting and, if so, starts the rewrite in the background
+        const database& db,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    if (!db.has_schema(sys_dist_ks->NAME, sys_dist_ks->CDC_DESC_V1)) {
+        // This cluster never went through a Scylla version which used this table
+        // or the user deleted the table. Nothing to do.
+        co_return;
+    }
+
+    if (co_await db::system_keyspace::cdc_is_rewritten()) { // rewrite already recorded as done: nothing to do
+        co_return;
+    }
+
+    if (db.get_config().cdc_dont_rewrite_streams()) { // configuration opt-out: warn and skip
+        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
+        co_return;
+    }
+
+    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
+    std::vector<time_and_ttl> times_and_ttls;
+    for (auto& [_, cf] : db.get_column_families()) {
+        auto& s = *cf->schema();
+        auto base = cdc::get_base_table(db, s.ks_name(), s.cf_name());
+        if (!base) {
+            // Not a CDC log table.
+            continue;
+        }
+        auto& cdc_opts = base->cdc_options();
+        if (!cdc_opts.enabled()) {
+            // This table is named like a CDC log table but it's not one.
+            continue;
+        }
+
+        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id()), cdc_opts.ttl()}); // creation time comes from the log table's schema id
+    }
+
+    if (times_and_ttls.empty()) {
+        // There's no point in rewriting old generations' streams (they don't contain any data).
+        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
+        co_return co_await db::system_keyspace::cdc_set_rewritten(std::nullopt); // record completion with no rewritten generation
+    }
+
+    // It's safe to discard this future: the coroutine keeps system_distributed_keyspace alive
+    // and the abort source's lifetime extends the lifetime of any other service.
+    (void)(([_times_and_ttls = std::move(times_and_ttls), _sys_dist_ks = std::move(sys_dist_ks),
+                _get_num_token_owners = std::move(get_num_token_owners), &_abort_src = abort_src] () mutable -> future<> {
+        auto times_and_ttls = std::move(_times_and_ttls); // captures are moved into locals before the first co_await; presumably because the temporary lambda object does not outlive this call
+        auto sys_dist_ks = std::move(_sys_dist_ks);
+        auto get_num_token_owners = std::move(_get_num_token_owners);
+        auto& abort_src = _abort_src;
+
+        // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
+        // and some nodes that are UP may still be marked as DOWN by us.
+        // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
+        // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
+        co_await sleep_abortable(std::chrono::seconds(10), abort_src);
+
+        cdc_log.info("Rewriting stream tables in the background...");
+        auto last_rewritten = co_await rewrite_streams_descriptions(
+                std::move(times_and_ttls),
+                std::move(sys_dist_ks),
+                std::move(get_num_token_owners),
+                abort_src);
+
+        co_await db::system_keyspace::cdc_set_rewritten(last_rewritten); // persist the result so later calls see cdc_is_rewritten()
+    })());
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -40,6 +40,8 @@
 #include "database_fwd.hh"
 #include "db_clock.hh"
 #include "dht/token.hh"
+#include "locator/token_metadata.hh"
+#include "utils/chunked_vector.hh"

 namespace seastar {
    class abort_source;
@@ -55,10 +57,6 @@ namespace gms {
    class gossiper;
 } // namespace gms

-namespace locator {
-    class token_metadata;
-} // namespace locator
-
 namespace cdc {

 class stream_id final {
@@ -68,6 +66,7 @@ public:

    stream_id() = default;
    stream_id(bytes);
+    stream_id(dht::token, size_t);

    bool is_set() const;
    bool operator==(const stream_id&) const;
@@ -81,9 +80,6 @@ public:

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
-private:
-    friend class topology_description_generator;
-    stream_id(dht::token, size_t);
 };

 /* Describes a mapping of tokens to CDC streams in a token range.
@@ -116,7 +112,8 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -125,14 +122,19 @@ public:
 */ 
 class streams_version {
 public:
-    std::vector<stream_id> streams;
+    utils::chunked_vector<stream_id> streams;
    db_clock::time_point timestamp;
-    std::optional<db_clock::time_point> expired;

-    streams_version(std::vector<stream_id> s, db_clock::time_point ts, std::optional<db_clock::time_point> exp)
+    streams_version(utils::chunked_vector<stream_id> s, db_clock::time_point ts)
        : streams(std::move(s))
        , timestamp(ts)
-        , expired(std::move(exp))
+    {}
+};
+
+class no_generation_data_exception : public std::runtime_error {
+public:
+    no_generation_data_exception(db_clock::time_point generation_ts)
+        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
    {}
 };

@@ -154,7 +156,7 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
 future<db_clock::time_point> get_local_streams_timestamp();

 /* Generate a new set of CDC streams and insert it into the distributed cdc_generation_descriptions table.
- * Returns the timestamp of this new generation.
+ * Returns the timestamp of this new generation
 *
 * Should be called when starting the node for the first time (i.e., joining the ring).
 *
@@ -168,11 +170,11 @@ future<db_clock::time_point> get_local_streams_timestamp();
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
        const std::unordered_set<dht::token>& bootstrap_tokens,
-        const locator::token_metadata& tm,
+        const locator::token_metadata_ptr tmptr,
        const gms::gossiper& g,
        db::system_distributed_keyspace& sys_dist_ks,
        std::chrono::milliseconds ring_delay,
-        bool for_testing);
+        bool add_delay);

 /* Retrieves CDC streams generation timestamp from the given endpoint's application state (broadcasted through gossip).
 * We might be during a rolling upgrade, so the timestamp might not be there (if the other node didn't upgrade yet),
@@ -197,4 +199,15 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source&);

+/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
+ * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
+ * table contains streams of all generations that were present in the old table and may still contain data
+ * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
+ * these generations). */
+future<> maybe_rewrite_streams_descriptions(
+        const database&,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
 } // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -220,7 +220,7 @@ public:
            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);

            auto log_mut = log_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
+                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_log_schema, timestamp)
                ;

@@ -519,6 +519,7 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
    b.with_column(log_meta_column_name_bytes("batch_seq_no"), int32_type, column_kind::clustering_key);
    b.with_column(log_meta_column_name_bytes("operation"), data_type_for<operation_native_type>());
    b.with_column(log_meta_column_name_bytes("ttl"), long_type);
+    b.with_column(log_meta_column_name_bytes("end_of_batch"), boolean_type);
    b.set_caching_options(caching_options::get_disabled_caching_options());
    auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
        for (const auto& column : columns) {
@@ -578,11 +579,6 @@ db_context::builder& db_context::builder::with_migration_notifier(service::migra
    return *this;
 }

-db_context::builder& db_context::builder::with_token_metadata(const locator::token_metadata& token_metadata) {
-    _token_metadata = token_metadata;
-    return *this;
-}
-
 db_context::builder& db_context::builder::with_cdc_metadata(cdc::metadata& cdc_metadata) {
    _cdc_metadata = cdc_metadata;
    return *this;
@@ -592,14 +588,20 @@ db_context db_context::builder::build() {
    return db_context{
        _proxy,
        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
-        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
        _cdc_metadata ? _cdc_metadata->get() : service::get_local_storage_service().get_cdc_metadata(),
    };
 }

 // iterators for collection merge
 template<typename T>
-class collection_iterator : public std::iterator<std::input_iterator_tag, const T> {
+class collection_iterator {
+public:
+    using iterator_category = std::input_iterator_tag;
+    using value_type = const T;
+    using difference_type = std::ptrdiff_t;
+    using pointer = const T*;
+    using reference = const T&;
+private:
    bytes_view _v, _next;
    size_t _rem = 0;
    T _current;
@@ -707,16 +709,16 @@ private:
       }
       return false;
    }
-    bool compare(const T&, const value_type& v);
+    int32_t compare(const T&, const value_type& v);
 };

 template<>
-bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v.first);
 }

 template<>
-bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
+int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
    return _type.compare(t, v);
 }

@@ -880,14 +882,26 @@ public:
        return _base_schema;
    }

+    clustering_key create_ck(int batch) const {
+        return clustering_key::from_exploded(_log_schema, { _tuuid, int32_type->decompose(batch) });
+    }
+
    // Creates a new clustering row in the mutation, assigning it the next `cdc$batch_seq_no`.
    // The numbering of batch sequence numbers starts from 0.
    clustering_key allocate_new_log_row() {
-        auto log_ck = clustering_key::from_exploded(_log_schema, { _tuuid, int32_type->decompose(_batch_no++) });
+        auto log_ck = create_ck(_batch_no++);
        set_key_columns(log_ck, _base_schema.partition_key_columns(), _base_pk);
        return log_ck;
    }

+    bool has_rows() const {
+        return _batch_no != 0;
+    }
+
+    clustering_key last_row_key() const {
+        return create_ck(_batch_no - 1);
+    }
+
    // A common pattern is to allocate a row and then immediately set its `cdc$operation` column.
    clustering_key allocate_new_log_row(operation op) {
        auto log_ck = allocate_new_log_row();
@@ -944,6 +958,11 @@ public:
        _log_mut.set_cell(log_ck, log_cdef, atomic_cell::make_live(*log_cdef.type, _ts, deleted_elements, _ttl));
    }

+    void end_record() {
+        if (has_rows()) {
+            _log_mut.set_cell(last_row_key(), log_meta_column_name_bytes("end_of_batch"), data_value(true), _ts, _ttl);
+        }
+    }
 private:
    void set_key_columns(const clustering_key& log_ck, schema::const_iterator_range_type columns, const std::vector<bytes>& key) {
        size_t pos = 0;
@@ -962,9 +981,9 @@ static bytes get_bytes(const atomic_cell_view& acv) {
    return acv.value().linearize();
 }

-static bytes_view get_bytes_view(const atomic_cell_view& acv, std::vector<bytes>& buf) {
+static bytes_view get_bytes_view(const atomic_cell_view& acv, std::forward_list<bytes>& buf) {
    return acv.value().is_fragmented()
-        ? bytes_view{buf.emplace_back(acv.value().linearize())}
+        ? bytes_view{buf.emplace_front(acv.value().linearize())}
        : acv.value().first_fragment();
 }

@@ -1119,9 +1138,9 @@ struct process_row_visitor {

                struct udt_visitor : public collection_visitor {
                    std::vector<bytes_opt> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::vector<bytes>& buf)
+                    udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _added_cells(num_keys), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1130,7 +1149,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                udt_visitor v(_ttl_column, type.size(), buf);

                visit_collection(v);
@@ -1149,9 +1168,9 @@ struct process_row_visitor {

                struct map_or_list_visitor : public collection_visitor {
                    std::vector<std::pair<bytes_view, bytes_view>> _added_cells;
-                    std::vector<bytes>& _buf;
+                    std::forward_list<bytes>& _buf;

-                    map_or_list_visitor(ttl_opt& ttl_column, std::vector<bytes>& buf)
+                    map_or_list_visitor(ttl_opt& ttl_column, std::forward_list<bytes>& buf)
                        : collection_visitor(ttl_column), _buf(buf) {}

                    void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
@@ -1160,7 +1179,7 @@ struct process_row_visitor {
                    }
                };

-                std::vector<bytes> buf;
+                std::forward_list<bytes> buf;
                map_or_list_visitor v(_ttl_column, buf);

                visit_collection(v);
@@ -1272,6 +1291,13 @@ struct process_change_visitor {
                _clustering_row_states, _generate_delta_values);
        visit_row_cells(v);

+        if (_enable_updating_state) {
+            // #7716: if there are no regular columns, our visitor would not have visited any cells,
+            // hence it would not have created a row_state for this row. In effect, postimage wouldn't be produced.
+            // Ensure that the row state exists.
+            _clustering_row_states.try_emplace(ckey);
+        }
+
        _builder.set_operation(log_ck, v._cdc_op);
        _builder.set_ttl(log_ck, v._ttl_column);
    }
@@ -1519,6 +1545,11 @@ public:
        cdc::inspect_mutation(m, v);
    }

+    void end_record() override {
+        assert(_builder);
+        _builder->end_record();
+    }
+
    // Takes and returns generated cdc log mutations and associated statistics about parts touched during transformer's lifetime.
    // The `transformer` object on which this method was called on should not be used anymore.
    std::tuple<std::vector<mutation>, stats::part_type_set> finish() && {
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -100,19 +100,16 @@ public:
 struct db_context final {
    service::storage_proxy& _proxy;
    service::migration_notifier& _migration_notifier;
-    const locator::token_metadata& _token_metadata;
    cdc::metadata& _cdc_metadata;

    class builder final {
        service::storage_proxy& _proxy;
        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<const locator::token_metadata>> _token_metadata;
        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
    public:
        builder(service::storage_proxy& proxy);

        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(const locator::token_metadata& token_metadata);
        builder& with_cdc_metadata(cdc::metadata&);

        db_context build();
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
--- a/cdc/split.cc
+++ b/cdc/split.cc
@@ -684,6 +684,8 @@ void process_changes_with_splitting(const mutation& base_mutation, change_proces
                processor.produce_postimage(&ck);
            }
        }
+
+        processor.end_record();
    }
 }

@@ -731,6 +733,8 @@ void process_changes_without_splitting(const mutation& base_mutation, change_pro
            processor.produce_postimage(&cr.key());
        }
    }
+
+    processor.end_record();
 }

 } // namespace cdc
--- a/cdc/split.hh
+++ b/cdc/split.hh
@@ -77,6 +77,10 @@ public:
    // both columns have different timestamp or TTL set.
    //   m - the small mutation to be converted into CDC log rows.
    virtual void process_change(const mutation& m) = 0;
+
+    // Tells processor we have reached end of record - last part
+    // of a given timestamp batch
+    virtual void end_record() = 0;
 };

 bool should_split(const mutation& base_mutation);
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -67,8 +67,8 @@ public:
        int operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
            auto type = _s.get().clustering_key_prefix_type();
            auto res = prefix_equality_tri_compare(type->types().begin(),
-                type->begin(p1), type->end(p1),
-                type->begin(p2), type->end(p2),
+                type->begin(p1.representation()), type->end(p1.representation()),
+                type->begin(p2.representation()), type->end(p2.representation()),
                ::tri_compare);
            if (res) {
                return res;
--- a/clustering_interval_set.hh
+++ b/clustering_interval_set.hh
@@ -72,7 +72,14 @@ public:
        }
        return result;
    }
-    class position_range_iterator : public std::iterator<std::input_iterator_tag, const position_range> {
+    class position_range_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const position_range;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const position_range*;
+        using reference = const position_range&;
+    private:
        set_type::iterator _i;
    public:
        position_range_iterator(set_type::iterator i) : _i(i) {}
--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -136,4 +136,4 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
-bytes serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
+bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
--- a/column_computation.hh
+++ b/column_computation.hh
@@ -54,6 +54,36 @@ public:
    virtual bytes_opt compute_value(const schema& schema, const partition_key& key, const clustering_row& row) const = 0;
 };

+/*
+ * Computes token value of partition key and returns it as bytes.
+ *
+ * Should NOT be used (use token_column_computation), because ordering
+ * of bytes is different than ordering of tokens (signed vs unsigned comparison).
+ *
+ * The type name stored for computations of this class is "token" - this was
+ * the original implementation. (now deprecated for new tables)
+ */
+class legacy_token_column_computation : public column_computation {
+public:
+    virtual column_computation_ptr clone() const override {
+        return std::make_unique<legacy_token_column_computation>(*this);
+    }
+    virtual bytes serialize() const override;
+    virtual bytes_opt compute_value(const schema& schema, const partition_key& key, const clustering_row& row) const override;
+};
+
+
+/*
+ * Computes token value of partition key and returns it as long_type.
+ * The return type means that it can be trivially sorted (for example
+ * if computed column using this computation is a clustering key),
+ * preserving the correct order of tokens (using signed comparisons).
+ *
+ * Please use this class instead of legacy_token_column_computation.
+ * 
+ * The type name stored for computations of this class is "token_v2".
+ * (the name "token" refers to the deprecated legacy_token_column_computation)
+ */
 class token_column_computation : public column_computation {
 public:
    virtual column_computation_ptr clone() const override {
--- a/compound.hh
+++ b/compound.hh
@@ -73,12 +73,19 @@ private:
     *   <len(value1)><value1><len(value2)><value2>...<len(value_n)><value_n>
     *
     */
-    template<typename RangeOfSerializedComponents, typename CharOutputIterator>
-    static void serialize_value(RangeOfSerializedComponents&& values, CharOutputIterator& out) {
+    template<typename RangeOfSerializedComponents, FragmentedMutableView Out>
+    static void serialize_value(RangeOfSerializedComponents&& values, Out out) {
        for (auto&& val : values) {
            assert(val.size() <= std::numeric_limits<size_type>::max());
            write<size_type>(out, size_type(val.size()));
-            out = std::copy(val.begin(), val.end(), out);
+            using val_type = std::remove_cvref_t<decltype(val)>;
+            if constexpr (FragmentedView<val_type>) {
+                write_fragmented(out, val);
+            } else if constexpr (std::same_as<val_type, managed_bytes>) {
+                write_fragmented(out, managed_bytes_view(val));
+            } else {
+                write_fragmented(out, single_fragmented_view(val));
+            }
        }
    }
    template <typename RangeOfSerializedComponents>
@@ -90,25 +97,27 @@ private:
        return len;
    }
 public:
-    bytes serialize_single(bytes&& v) const {
+    managed_bytes serialize_single(managed_bytes&& v) const {
+        return serialize_value({std::move(v)});
+    }
+    managed_bytes serialize_single(bytes&& v) const {
        return serialize_value({std::move(v)});
    }
    template<typename RangeOfSerializedComponents>
-    static bytes serialize_value(RangeOfSerializedComponents&& values) {
+    static managed_bytes serialize_value(RangeOfSerializedComponents&& values) {
        auto size = serialized_size(values);
        if (size > std::numeric_limits<size_type>::max()) {
            throw std::runtime_error(format("Key size too large: {:d} > {:d}", size, std::numeric_limits<size_type>::max()));
        }
-        bytes b(bytes::initialized_later(), size);
-        auto i = b.begin();
-        serialize_value(values, i);
+        managed_bytes b(managed_bytes::initialized_later(), size);
+        serialize_value(values, managed_bytes_mutable_view(b));
        return b;
    }
    template<typename T>
-    static bytes serialize_value(std::initializer_list<T> values) {
+    static managed_bytes serialize_value(std::initializer_list<T> values) {
        return serialize_value(boost::make_iterator_range(values.begin(), values.end()));
    }
-    bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
+    managed_bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
        return serialize_value(values | boost::adaptors::transformed([] (const bytes_opt& bo) -> bytes_view {
            if (!bo) {
                throw std::logic_error("attempted to create key component from empty optional");
@@ -116,7 +125,7 @@ public:
            return *bo;
        }));
    }
-    bytes serialize_value_deep(const std::vector<data_value>& values) const {
+    managed_bytes serialize_value_deep(const std::vector<data_value>& values) const {
        // TODO: Optimize
        std::vector<bytes> partial;
        partial.reserve(values.size());
@@ -127,19 +136,26 @@ public:
        }
        return serialize_value(partial);
    }
-    bytes decompose_value(const value_type& values) const {
+    managed_bytes decompose_value(const value_type& values) const {
        return serialize_value(values);
    }
-    class iterator : public std::iterator<std::input_iterator_tag, const bytes_view> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const managed_bytes_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const value_type*;
+        using reference = const value_type&;
    private:
-        bytes_view _v;
-        bytes_view _current;
+        managed_bytes_view _v;
+        managed_bytes_view _current;
+        size_t _remaining = 0;
    private:
        void read_current() {
+            _remaining = _v.size_bytes();
            size_type len;
            {
                if (_v.empty()) {
-                    _v = bytes_view(nullptr, 0);
                    return;
                }
                len = read_simple<size_type>(_v);
@@ -147,15 +163,16 @@ public:
                    throw_with_backtrace<marshal_exception>(format("compound_type iterator - not enough bytes, expected {:d}, got {:d}", len, _v.size()));
                }
            }
-            _current = bytes_view(_v.begin(), len);
-            _v.remove_prefix(len);
+            _current = _v.prefix(len);
+            _v.remove_prefix(_current.size_bytes());
        }
    public:
        struct end_iterator_tag {};
-        iterator(const bytes_view& v) : _v(v) {
+        iterator(const managed_bytes_view& v) : _v(v) {
            read_current();
        }
-        iterator(end_iterator_tag, const bytes_view& v) : _v(nullptr, 0) {}
+        iterator(end_iterator_tag, const managed_bytes_view& v) : _v() {}
+        iterator() {}
        iterator& operator++() {
            read_current();
            return *this;
@@ -167,29 +184,40 @@ public:
        }
        const value_type& operator*() const { return _current; }
        const value_type* operator->() const { return &_current; }
-        bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
-        bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
+        bool operator==(const iterator& i) const { return _remaining == i._remaining; }
    };
-    static iterator begin(const bytes_view& v) {
+    static iterator begin(managed_bytes_view v) {
        return iterator(v);
    }
-    static iterator end(const bytes_view& v) {
+    static iterator end(managed_bytes_view v) {
        return iterator(typename iterator::end_iterator_tag(), v);
    }
-    static boost::iterator_range<iterator> components(const bytes_view& v) {
+    static boost::iterator_range<iterator> components(managed_bytes_view v) {
        return { begin(v), end(v) };
    }
-    value_type deserialize_value(bytes_view v) const {
+    value_type deserialize_value(managed_bytes_view v) const {
        std::vector<bytes> result;
        result.reserve(_types.size());
        std::transform(begin(v), end(v), std::back_inserter(result), [] (auto&& v) {
-            return bytes(v.begin(), v.end());
+            return to_bytes(v);
        });
        return result;
    }
+    bool less(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return less(bv1, bv2);
+            });
+        });
+    }
    bool less(bytes_view b1, bytes_view b2) const {
        return compare(b1, b2) < 0;
    }
+    size_t hash(managed_bytes_view v) const{
+        return with_linearized(v, [&] (bytes_view v) {
+            return hash(v);
+        });
+    }
    size_t hash(bytes_view v) const {
        if (_byte_order_equal) {
            return std::hash<bytes_view>()(v);
@@ -202,6 +230,13 @@ public:
        }
        return h;
    }
+    int compare(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return compare(bv1, bv2);
+            });
+        });
+    }
    int compare(bytes_view b1, bytes_view b2) const {
        if (_byte_order_comparable) {
            if (_is_reversed) {
@@ -216,15 +251,21 @@ public:
            });
    }
    // Retruns true iff given prefix has no missing components
-    bool is_full(bytes_view v) const {
+    bool is_full(managed_bytes_view v) const {
        assert(AllowPrefixes == allow_prefixes::yes);
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
+    bool is_empty(managed_bytes_view v) const {
+        return v.empty();
+    }
+    bool is_empty(const managed_bytes& v) const {
+        return v.empty();
+    }
    bool is_empty(bytes_view v) const {
        return begin(v) == end(v);
    }
-    void validate(bytes_view v) const {
-        std::vector<bytes_view> values(begin(v), end(v));
+    void validate(managed_bytes_view v) const {
+        std::vector<managed_bytes_view> values(begin(v), end(v));
        if (AllowPrefixes == allow_prefixes::no && values.size() < _types.size()) {
            throw marshal_exception(fmt::format("compound::validate(): non-prefixable compound cannot be a prefix"));
        }
@@ -237,6 +278,13 @@ public:
            _types[i]->validate(values[i], cql_serialization_format::internal());
        }
    }
+    bool equal(managed_bytes_view v1, managed_bytes_view v2) const {
+        return with_linearized(v1, [&] (bytes_view bv1) {
+            return with_linearized(v2, [&] (bytes_view bv2) {
+                return equal(bv1, bv2);
+            });
+        });
+    }
    bool equal(bytes_view v1, bytes_view v2) const {
        if (_byte_order_equal) {
            return compare_unsigned(v1, v2) == 0;
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -54,14 +54,21 @@ template <typename CompoundType>
 class legacy_compound_view {
    static_assert(!CompoundType::is_prefixable, "Legacy view not defined for prefixes");
    CompoundType& _type;
-    bytes_view _packed;
+    managed_bytes_view _packed;
 public:
-    legacy_compound_view(CompoundType& c, bytes_view packed)
+    legacy_compound_view(CompoundType& c, managed_bytes_view packed)
        : _type(c)
        , _packed(packed)
    { }

-    class iterator : public std::iterator<std::input_iterator_tag, bytes::value_type> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = bytes::value_type;
+        using difference_type = std::ptrdiff_t;
+        using pointer = bytes::value_type*;
+        using reference = bytes::value_type&;
+    private:
        bool _singular;
        // Offset within virtual output space of a component.
        //
@@ -140,18 +147,18 @@ public:
        { }

        // @k1 and @k2 must be serialized using @type, which was passed to the constructor.
-        int operator()(bytes_view k1, bytes_view k2) const {
+        int operator()(managed_bytes_view k1, managed_bytes_view k2) const {
            if (_type.is_singular()) {
                return compare_unsigned(*_type.begin(k1), *_type.begin(k2));
            }
            return lexicographical_tri_compare(
                _type.begin(k1), _type.end(k1),
                _type.begin(k2), _type.end(k2),
-                [] (const bytes_view& c1, const bytes_view& c2) -> int {
+                [] (const managed_bytes_view& c1, const managed_bytes_view& c2) -> int {
                    if (c1.size() != c2.size() || !c1.size()) {
                        return c1.size() < c2.size() ? -1 : c1.size() ? 1 : 0;
                    }
-                    return memcmp(c1.begin(), c2.begin(), c1.size());
+                    return compare_unsigned(c1, c2);
                });
        }
    };
@@ -181,7 +188,7 @@ public:
 // @packed is assumed to be serialized using supplied @type.
 template <typename CompoundType>
 static inline
-bytes to_legacy(CompoundType& type, bytes_view packed) {
+bytes to_legacy(CompoundType& type, managed_bytes_view packed) {
    legacy_compound_view<CompoundType> lv(type, packed);
    bytes legacy_form(bytes::initialized_later(), lv.size());
    std::copy(lv.begin(), lv.end(), legacy_form.begin());
@@ -257,6 +264,12 @@ private:
    static void write_value(Value&& val, CharOutputIterator& out) {
        out = std::copy(val.begin(), val.end(), out);
    }
+    template<typename CharOutputIterator>
+    static void write_value(managed_bytes_view val, CharOutputIterator& out) {
+        for (bytes_view frag : fragment_range(val)) {
+            out = std::copy(frag.begin(), frag.end(), out);
+        }
+    }
    template <typename CharOutputIterator>
    static void write_value(const data_value& val, CharOutputIterator& out) {
        val.serialize(out);
@@ -339,7 +352,14 @@ public:
        return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
    }

-    class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
+    class iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = const component_view;
+        using difference_type = std::ptrdiff_t;
+        using pointer = const component_view*;
+        using reference = const component_view&;
+    private:
        bytes_view _v;
        component_view _current;
        bool _strict_mode = true;
@@ -391,6 +411,7 @@ public:
        iterator(end_iterator_tag) : _v(nullptr, 0) {}

    public:
+        iterator() : iterator(end_iterator_tag()) {}
        iterator& operator++() {
            read_current();
            return *this;
--- a/conf/scylla.yaml
+++ b/conf/scylla.yaml
@@ -99,8 +99,8 @@ listen_address: localhost
 # listen_on_broadcast_address: false

 # port for the CQL native transport to listen for clients on
-# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
-# To disable the CQL native transport, set this option to 0.
+# For security reasons, you should not expose this port to the internet. Firewall it if needed.
+# To disable the CQL native transport, remove this option and configure native_transport_port_ssl.
 native_transport_port: 9042

 # Like native_transport_port, but clients are forwarded to specific shards, based on the
@@ -230,6 +230,9 @@ batch_size_fail_threshold_in_kb: 50
 # - PasswordAuthenticator relies on username/password pairs to authenticate
 #   users. It keeps usernames and hashed passwords in system_auth.credentials table.
 #   Please increase system_auth keyspace replication factor if you use this authenticator.
+# - com.scylladb.auth.TransitionalAuthenticator requires username/password pair
+#   to authenticate in the same manner as PasswordAuthenticator, but improper credentials
+#   result in being logged in as an anonymous user. Use for upgrading clusters' auth.
 # authenticator: AllowAllAuthenticator

 # Authorization backend, implementing IAuthorizer; used to limit access/provide permissions
@@ -239,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50
 # - AllowAllAuthorizer allows any action to any user - set it to disable authorization.
 # - CassandraAuthorizer stores permissions in system_auth.permissions table. Please
 #   increase system_auth keyspace replication factor if you use this authorizer.
+# - com.scylladb.auth.TransitionalAuthorizer wraps around the CassandraAuthorizer, using it for
+#   authorizing permission management. Otherwise, it allows all. Use for upgrading
+#   clusters' auth.
 # authorizer: AllowAllAuthorizer

 # initial_token allows you to specify tokens manually.  While you can use # it with
--- a/configure.py
+++ b/configure.py
@@ -59,6 +59,9 @@ i18n_xlat = {
 }

 python3_dependencies = subprocess.run('./install-dependencies.sh --print-python3-runtime-packages', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_filename = subprocess.run('./install-dependencies.sh --print-node-exporter-filename', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_dirname = re.sub(r'\.tar\.gz$', '', os.path.basename(node_exporter_filename))  # drop only the literal suffix; rstrip() would strip a character set
+

 def pkgname(name):
    if name in i18n_xlat:
@@ -257,27 +260,28 @@ modes = {
        'stack-usage-threshold': 1024*40,
    },
    'release': {
-        'cxxflags': '',
-        'cxx_ld_flags': '-O3 -ffunction-sections -fdata-sections -Wl,--gc-sections',
+        'cxxflags': '-O3 -ffunction-sections -fdata-sections ',
+        'cxx_ld_flags': '-Wl,--gc-sections',
        'stack-usage-threshold': 1024*13,
    },
    'dev': {
-        'cxxflags': '-DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-O1',
+        'cxxflags': '-O1 -DDEVEL -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
        'stack-usage-threshold': 1024*21,
    },
    'sanitize': {
-        'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
-        'cxx_ld_flags': '-Os',
+        'cxxflags': '-Os -DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxx_ld_flags': '',
        'stack-usage-threshold': 1024*50,
    }
 }

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/anchorless_list_test',
    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
@@ -314,6 +318,7 @@ scylla_tests = set([
    'test/boost/crc_test',
    'test/boost/data_listeners_test',
    'test/boost/database_test',
+    'test/boost/double_decker_test',
    'test/boost/duration_test',
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
@@ -328,7 +333,9 @@ scylla_tests = set([
    'test/boost/gossip_test',
    'test/boost/gossiping_property_file_snitch_test',
    'test/boost/hash_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
+    'test/boost/imr_test',
    'test/boost/input_stream_test',
    'test/boost/json_cql_query_test',
    'test/boost/json_test',
@@ -342,6 +349,7 @@ scylla_tests = set([
    'test/boost/estimated_histogram_test',
    'test/boost/logalloc_test',
    'test/boost/managed_vector_test',
+    'test/boost/managed_bytes_test',
    'test/boost/intrusive_array_test',
    'test/boost/map_difference_test',
    'test/boost/memtable_test',
@@ -383,6 +391,7 @@ scylla_tests = set([
    'test/boost/sstable_resharding_test',
    'test/boost/sstable_directory_test',
    'test/boost/sstable_test',
+    'test/boost/sstable_move_test',
    'test/boost/storage_proxy_test',
    'test/boost/top_k_test',
    'test/boost/transport_test',
@@ -417,7 +426,7 @@ scylla_tests = set([
    'test/perf/perf_fast_forward',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
-    'test/perf/perf_bptree',
+    'test/perf/perf_collection',
    'test/perf/perf_row_cache_update',
    'test/perf/perf_simple_query',
    'test/perf/perf_sstable',
@@ -447,6 +456,7 @@ apps = set([
    'scylla',
    'test/tools/cql_repl',
    'tools/scylla-types',
+    'tools/scylla-sstable-index',
 ])

 tests = scylla_tests | perf_tests | raft_tests
@@ -476,9 +486,9 @@ arg_parser.add_argument('--ldflags', action='store', dest='user_ldflags', defaul
                        help='Extra flags for the linker')
 arg_parser.add_argument('--target', action='store', dest='target', default=default_target_arch(),
                        help='Target architecture (-march)')
-arg_parser.add_argument('--compiler', action='store', dest='cxx', default='g++',
+arg_parser.add_argument('--compiler', action='store', dest='cxx', default='clang++',
                        help='C++ compiler path')
-arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='gcc',
+arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clang',
                        help='C compiler path')
 add_tristate(arg_parser, name='dpdk', dest='dpdk',
                        help='Use dpdk (from seastar dpdk sources) (default=True for release builds)')
@@ -518,17 +528,6 @@ arg_parser.add_argument('--test-repeat', dest='test_repeat', action='store', typ
 arg_parser.add_argument('--test-timeout', dest='test_timeout', action='store', type=str, default='7200')
 args = arg_parser.parse_args()

-coroutines_test_src = '''
-#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#if GCC_VERSION < 100201
-    #error "Coroutines support requires at leat gcc 10.2.1"
-#endif
-'''
-compiler_supports_coroutines = try_compile(compiler=args.cxx, source=coroutines_test_src)
-
-if args.build_raft and not compiler_supports_coroutines:
-    raise Exception("--build-raft is requested, while the used compiler does not support coroutines")
-
 if not args.build_raft:
    all_artifacts.difference_update(raft_tests)
    tests.difference_update(raft_tests)
@@ -583,6 +582,7 @@ scylla_core = (['database.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
+                'sstables/sstable_set.cc',
                'sstables/mx/writer.cc',
                'sstables/kl/writer.cc',
                'sstables/sstable_version.cc',
@@ -726,6 +726,7 @@ scylla_core = (['database.cc',
                'db/data_listeners.cc',
                'db/hints/manager.cc',
                'db/hints/resource_manager.cc',
+                'db/hints/host_filter.cc',
                'db/config.cc',
                'db/extensions.cc',
                'db/heat_load_balance.cc',
@@ -854,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
@@ -989,6 +991,7 @@ deps = {
    'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
    #FIXME: we don't need all of scylla_core here, only the types module, need to modularize scylla_core.
    'tools/scylla-types': idls + ['tools/scylla-types.cc'] + scylla_core,
+    'tools/scylla-sstable-index': idls + ['tools/scylla-sstable-index.cc'] + scylla_core,
 }

 pure_boost_tests = set([
@@ -1008,6 +1011,7 @@ pure_boost_tests = set([
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
    'test/boost/json_test',
    'test/boost/keys_test',
@@ -1024,11 +1028,12 @@ pure_boost_tests = set([
    'test/boost/top_k_test',
    'test/boost/vint_serialization_test',
    'test/boost/bptree_test',
+    'test/boost/utf8_test',
    'test/manual/streaming_histogram_test',
 ])

 tests_not_using_seastar_test_framework = set([
-    'test/boost/alternator_base64_test',
+    'test/boost/alternator_unit_test',
    'test/boost/small_vector_test',
    'test/manual/gossip',
    'test/manual/message',
@@ -1037,7 +1042,7 @@ tests_not_using_seastar_test_framework = set([
    'test/perf/perf_cql_parser',
    'test/perf/perf_hash',
    'test/perf/perf_mutation',
-    'test/perf/perf_bptree',
+    'test/perf/perf_collection',
    'test/perf/perf_row_cache_update',
    'test/unit/lsa_async_eviction_test',
    'test/unit/lsa_sync_eviction_test',
@@ -1102,7 +1107,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
 ]

 deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
-deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
+deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']

 deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
 deps['test/boost/raft_fsm_test'] =  ['test/boost/raft_fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
@@ -1146,12 +1151,13 @@ warnings = [
    '-Wno-delete-non-abstract-non-virtual-dtor',
    '-Wno-unknown-attributes',
    '-Wno-braced-scalar-init',
-    '-Wno-unused-value',
    '-Wno-range-loop-construct',
    '-Wno-unused-function',
    '-Wno-implicit-int-float-conversion',
    '-Wno-delete-abstract-non-virtual-dtor',
    '-Wno-uninitialized-const-reference',
+    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+    '-Wno-psabi',
 ]

 warnings = [w
@@ -1167,11 +1173,11 @@ optimization_flags = [
 optimization_flags = [o
                      for o in optimization_flags
                      if flag_supported(flag=o, compiler=args.cxx)]
-modes['release']['cxx_ld_flags'] += ' ' + ' '.join(optimization_flags)
+modes['release']['cxxflags'] += ' ' + ' '.join(optimization_flags)

 if flag_supported(flag='-Wstack-usage=4096', compiler=args.cxx):
    for mode in modes:
-        modes[mode]['cxx_ld_flags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='
+        modes[mode]['cxxflags'] += f' -Wstack-usage={modes[mode]["stack-usage-threshold"]} -Wno-error=stack-usage='

 linker_flags = linker_flags(compiler=args.cxx)

@@ -1286,6 +1292,8 @@ file = open(f'{outdir}/SCYLLA-VERSION-FILE', 'r')
 scylla_version = file.read().strip()
 file = open(f'{outdir}/SCYLLA-RELEASE-FILE', 'r')
 scylla_release = file.read().strip()
+file = open(f'{outdir}/SCYLLA-PRODUCT-FILE', 'r')
+scylla_product = file.read().strip()

 extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""

@@ -1327,9 +1335,6 @@ args.user_cflags += f" -ffile-prefix-map={curdir}=."

 seastar_cflags = args.user_cflags

-if build_raft:
-    seastar_cflags += ' -fcoroutines'
-
 if args.target != '':
    seastar_cflags += ' -march=' + args.target
 seastar_ldflags = args.user_ldflags
@@ -1338,6 +1343,13 @@ libdeflate_cflags = seastar_cflags

 MODE_TO_CMAKE_BUILD_TYPE = {'release' : 'RelWithDebInfo', 'debug' : 'Debug', 'dev' : 'Dev', 'sanitize' : 'Sanitize' }

+# CMake list arguments are semicolon-separated.
+def semicolon_separated(*flags):
+    # Each argument may itself hold several space-separated flags (or be
+    # empty); split on whitespace so no empty list elements are produced.
+    f = ' '.join(flags)
+    return ';'.join(f.split())
+
 def configure_seastar(build_dir, mode):
    seastar_build_dir = os.path.join(build_dir, mode, 'seastar')

@@ -1346,8 +1358,8 @@ def configure_seastar(build_dir, mode):
        '-DCMAKE_C_COMPILER={}'.format(args.cc),
        '-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
        '-DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON',
-        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']).replace(' ', ';')),
-        '-DSeastar_LD_FLAGS={}'.format(seastar_ldflags),
+        '-DSeastar_CXX_FLAGS={}'.format((seastar_cflags).replace(' ', ';')),
+        '-DSeastar_LD_FLAGS={}'.format(semicolon_separated(seastar_ldflags, modes[mode]['cxx_ld_flags'])),
        '-DSeastar_CXX_DIALECT=gnu++20',
        '-DSeastar_API_LEVEL=6',
        '-DSeastar_UNUSED_RESULT_ERROR=ON',
@@ -1458,7 +1470,7 @@ if not args.staticboost:
    args.user_cflags += ' -DBOOST_TEST_DYN_LINK'

 if build_raft:
-    args.user_cflags += ' -DENABLE_SCYLLA_RAFT -fcoroutines'
+    args.user_cflags += ' -DENABLE_SCYLLA_RAFT'

 # thrift version detection, see #4538
 proc_res = subprocess.run(["thrift", "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
@@ -1797,24 +1809,18 @@ with open(buildfile_tmp, 'w') as f:
        f.write(textwrap.dedent('''\
            build $builddir/{mode}/iotune: copy $builddir/{mode}/seastar/apps/iotune/iotune
            ''').format(**locals()))
-        f.write('build $builddir/{mode}/dist/tar/scylla-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian | always\n'.format(**locals()))
-        f.write('  pool = submodule_pool\n')
+        f.write('build $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian $builddir/node_exporter | always\n'.format(**locals()))
        f.write('  mode = {mode}\n'.format(**locals()))
-        f.write(f'build $builddir/{mode}/scylla-package.tar.gz: copy $builddir/{mode}/dist/tar/scylla-package.tar.gz\n')
-        f.write(f'build $builddir/dist/{mode}/redhat: rpmbuild $builddir/{mode}/scylla-package.tar.gz\n')
-        f.write(f'  pool = submodule_pool\n')
+        f.write(f'build $builddir/dist/{mode}/redhat: rpmbuild $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
        f.write(f'  mode = {mode}\n')
-        f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/scylla-package.tar.gz\n')
-        f.write(f'  pool = submodule_pool\n')
+        f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
        f.write(f'  mode = {mode}\n')
        f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian\n')
-        f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz dist-jmx-rpm dist-jmx-deb\n')
-        f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz dist-tools-rpm dist-tools-deb\n')
+        f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz dist-jmx-rpm dist-jmx-deb\n')
+        f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz dist-tools-rpm dist-tools-deb\n')
        f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb compat-python3-rpm compat-python3-deb\n')
-        f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz\n')
-        f.write(f'build $builddir/{mode}/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/scylla-unified-package.tar.gz\n')
-        f.write(f'build $builddir/{mode}/dist/tar/scylla-unified-package-{scylla_version}.{scylla_release}.tar.gz: unified $builddir/{mode}/dist/tar/scylla-package.tar.gz $builddir/{mode}/dist/tar/scylla-python3-package.tar.gz $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz | always\n')
-        f.write(f'  pool = submodule_pool\n')
+        f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz\n')
+        f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz: unified $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz | always\n')
        f.write(f'  mode = {mode}\n')
        f.write('rule libdeflate.{mode}\n'.format(**locals()))
        f.write('  command = make -C libdeflate BUILD_DIR=../$builddir/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../$builddir/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
@@ -1841,12 +1847,12 @@ with open(buildfile_tmp, 'w') as f:
    )

    f.write(textwrap.dedent(f'''\
-        build dist-unified-tar: phony {' '.join(['$builddir/{mode}/scylla-unified-package-$scylla_version.$scylla_release.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}.{scylla_release}.tar.gz' for mode in build_modes])}
        build dist-unified: phony dist-unified-tar

        build dist-server-deb: phony {' '.join(['$builddir/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
        build dist-server-rpm: phony {' '.join(['$builddir/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
-        build dist-server-tar: phony {' '.join(['$builddir/{mode}/scylla-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+        build dist-server-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
        build dist-server: phony dist-server-tar dist-server-rpm dist-server-deb

        rule build-submodule-reloc
@@ -1856,26 +1862,26 @@ with open(buildfile_tmp, 'w') as f:
        rule build-submodule-deb
          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact

-        build tools/jmx/build/scylla-jmx-package.tar.gz: build-submodule-reloc
+        build tools/jmx/build/{scylla_product}-jmx-package.tar.gz: build-submodule-reloc
          reloc_dir = tools/jmx
-        build dist-jmx-rpm: build-submodule-rpm tools/jmx/build/scylla-jmx-package.tar.gz
+        build dist-jmx-rpm: build-submodule-rpm tools/jmx/build/{scylla_product}-jmx-package.tar.gz
          dir = tools/jmx
-          artifact = $builddir/scylla-jmx-package.tar.gz
-        build dist-jmx-deb: build-submodule-deb tools/jmx/build/scylla-jmx-package.tar.gz
+          artifact = $builddir/{scylla_product}-jmx-package.tar.gz
+        build dist-jmx-deb: build-submodule-deb tools/jmx/build/{scylla_product}-jmx-package.tar.gz
          dir = tools/jmx
-          artifact = $builddir/scylla-jmx-package.tar.gz
-        build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+          artifact = $builddir/{scylla_product}-jmx-package.tar.gz
+        build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
        build dist-jmx: phony dist-jmx-tar dist-jmx-rpm dist-jmx-deb

-        build tools/java/build/scylla-tools-package.tar.gz: build-submodule-reloc
+        build tools/java/build/{scylla_product}-tools-package.tar.gz: build-submodule-reloc
          reloc_dir = tools/java
-        build dist-tools-rpm: build-submodule-rpm tools/java/build/scylla-tools-package.tar.gz
+        build dist-tools-rpm: build-submodule-rpm tools/java/build/{scylla_product}-tools-package.tar.gz
          dir = tools/java
-          artifact = $builddir/scylla-tools-package.tar.gz
-        build dist-tools-deb: build-submodule-deb tools/java/build/scylla-tools-package.tar.gz
+          artifact = $builddir/{scylla_product}-tools-package.tar.gz
+        build dist-tools-deb: build-submodule-deb tools/java/build/{scylla_product}-tools-package.tar.gz
          dir = tools/java
-          artifact = $builddir/scylla-tools-package.tar.gz
-        build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-tools-package.tar.gz'.format(mode=mode) for mode in build_modes])}
+          artifact = $builddir/{scylla_product}-tools-package.tar.gz
+        build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
        build dist-tools: phony dist-tools-tar dist-tools-rpm dist-tools-deb

        rule compat-python3-reloc
@@ -1884,27 +1890,27 @@ with open(buildfile_tmp, 'w') as f:
          command = cd $dir && ./reloc/build_rpm.sh --reloc-pkg $artifact --builddir ../../build/redhat
        rule compat-python3-deb
          command = cd $dir && ./reloc/build_deb.sh --reloc-pkg $artifact --builddir ../../build/debian
-        build $builddir/release/scylla-python3-package.tar.gz: compat-python3-reloc tools/python3/build/scylla-python3-package.tar.gz
+        build $builddir/release/{scylla_product}-python3-package.tar.gz: compat-python3-reloc tools/python3/build/{scylla_product}-python3-package.tar.gz
          dir = tools/python3
-          artifact = $builddir/scylla-python3-package.tar.gz
-        build compat-python3-rpm: compat-python3-rpm tools/python3/build/scylla-python3-package.tar.gz
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build compat-python3-rpm: compat-python3-rpm tools/python3/build/{scylla_product}-python3-package.tar.gz
          dir = tools/python3
-          artifact = $builddir/scylla-python3-package.tar.gz
-        build compat-python3-deb: compat-python3-deb tools/python3/build/scylla-python3-package.tar.gz
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build compat-python3-deb: compat-python3-deb tools/python3/build/{scylla_product}-python3-package.tar.gz
          dir = tools/python3
-          artifact = $builddir/scylla-python3-package.tar.gz
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz

-        build tools/python3/build/scylla-python3-package.tar.gz: build-submodule-reloc
+        build tools/python3/build/{scylla_product}-python3-package.tar.gz: build-submodule-reloc
          reloc_dir = tools/python3
          args = --packages "{python3_dependencies}"
-        build dist-python3-rpm: build-submodule-rpm tools/python3/build/scylla-python3-package.tar.gz
+        build dist-python3-rpm: build-submodule-rpm tools/python3/build/{scylla_product}-python3-package.tar.gz
          dir = tools/python3
-          artifact = $builddir/scylla-python3-package.tar.gz
-        build dist-python3-deb: build-submodule-deb tools/python3/build/scylla-python3-package.tar.gz
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build dist-python3-deb: build-submodule-deb tools/python3/build/{scylla_product}-python3-package.tar.gz
          dir = tools/python3
-          artifact = $builddir/scylla-python3-package.tar.gz
-        build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/scylla-python3-package.tar.gz'.format(mode=mode) for mode in build_modes])}
-        build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb $builddir/release/scylla-python3-package.tar.gz compat-python3-rpm compat-python3-deb
+          artifact = $builddir/{scylla_product}-python3-package.tar.gz
+        build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz'.format(mode=mode, scylla_product=scylla_product) for mode in build_modes])}
+        build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb $builddir/release/{scylla_product}-python3-package.tar.gz compat-python3-rpm compat-python3-deb
        build dist-deb: phony dist-server-deb dist-python3-deb dist-jmx-deb dist-tools-deb
        build dist-rpm: phony dist-server-rpm dist-python3-rpm dist-jmx-rpm dist-tools-rpm
        build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-jmx-tar dist-tools-tar
@@ -1919,9 +1925,9 @@ with open(buildfile_tmp, 'w') as f:
        '''))
    for mode in build_modes:
        f.write(textwrap.dedent(f'''\
-        build $builddir/{mode}/dist/tar/scylla-python3-package.tar.gz: copy tools/python3/build/scylla-python3-package.tar.gz
-        build $builddir/{mode}/dist/tar/scylla-tools-package.tar.gz: copy tools/java/build/scylla-tools-package.tar.gz
-        build $builddir/{mode}/dist/tar/scylla-jmx-package.tar.gz: copy tools/jmx/build/scylla-jmx-package.tar.gz
+        build $builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz: copy tools/python3/build/{scylla_product}-python3-package.tar.gz
+        build $builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz: copy tools/java/build/{scylla_product}-tools-package.tar.gz
+        build $builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz: copy tools/jmx/build/{scylla_product}-jmx-package.tar.gz

        build dist-{mode}: phony dist-server-{mode} dist-python3-{mode} dist-tools-{mode} dist-jmx-{mode} dist-unified-{mode}
        build dist-check-{mode}: dist-check
@@ -1947,6 +1953,13 @@ with open(buildfile_tmp, 'w') as f:
        build mode_list: mode_list
        default {modes_list}
        ''').format(modes_list=' '.join(default_modes), **globals()))
+    unit_test_list = sorted(set(build_artifacts) & set(tests))  # sorted so the generated rule text is stable across runs
+    f.write(textwrap.dedent('''\
+        rule unit_test_list
+            command = /usr/bin/env echo -e '{unit_test_list}'
+            description = List configured unit tests
+        build unit_test_list: unit_test_list
+        ''').format(unit_test_list="\\n".join(unit_test_list)))
    f.write(textwrap.dedent('''\
        build always: phony
        rule scylla_version_gen
@@ -1955,6 +1968,9 @@ with open(buildfile_tmp, 'w') as f:
        rule debian_files_gen
            command = ./dist/debian/debian_files_gen.py
        build $builddir/debian/debian: debian_files_gen | always
-        ''').format(modes_list=' '.join(build_modes), **globals()))
+        rule extract_node_exporter
+            command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
+        build $builddir/node_exporter: extract_node_exporter | always
+        ''').format(**globals()))

 os.rename(buildfile_tmp, buildfile)
--- a/connection_notifier.cc
+++ b/connection_notifier.cc
@@ -20,44 +20,47 @@
 */

 #include "connection_notifier.hh"
-#include "db/query_context.hh"
 #include "cql3/constants.hh"
 #include "database.hh"
-#include "service/storage_proxy.hh"

 #include <stdexcept>

-namespace db::system_keyspace {
-extern const char *const CLIENTS;
-}
-
-static sstring to_string(client_type ct) {
+sstring to_string(client_type ct) {
    switch (ct) {
        case client_type::cql: return "cql";
        case client_type::thrift: return "thrift";
        case client_type::alternator: return "alternator";
-        default: throw std::runtime_error("Invalid client_type");
    }
+    throw std::runtime_error("Invalid client_type");
+}
+
+static sstring to_string(client_connection_stage ccs) {
+    switch (ccs) {
+        case client_connection_stage::established: return connection_stage_literal<client_connection_stage::established>;
+        case client_connection_stage::authenticating: return connection_stage_literal<client_connection_stage::authenticating>;
+        case client_connection_stage::ready: return connection_stage_literal<client_connection_stage::ready>;
+    }
+    throw std::runtime_error("Invalid client_connection_stage");
 }

 future<> notify_new_client(client_data cd) {
    // FIXME: consider prepared statement
    const static sstring req
-            = format("INSERT INTO system.{} (address, port, client_type, shard_id, protocol_version, username) "
-                     "VALUES (?, ?, ?, ?, ?, ?);", db::system_keyspace::CLIENTS);
+            = format("INSERT INTO system.{} (address, port, client_type, connection_stage, shard_id, protocol_version, username) "
+                     "VALUES (?, ?, ?, ?, ?, ?, ?);", db::system_keyspace::CLIENTS);
    
-    return db::execute_cql(req,
-            std::move(cd.ip), cd.port, to_string(cd.ct), cd.shard_id,
+    return db::qctx->execute_cql(req,
+            std::move(cd.ip), cd.port, to_string(cd.ct), to_string(cd.connection_stage), cd.shard_id,
            cd.protocol_version.has_value() ? data_value(*cd.protocol_version) : data_value::make_null(int32_type),
            cd.username.value_or("anonymous")).discard_result();
 }

-future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port) {
+future<> notify_disconnected_client(net::inet_address addr, int port, client_type ct) {
    // FIXME: consider prepared statement
    const static sstring req
            = format("DELETE FROM system.{} where address=? AND port=? AND client_type=?;",
                     db::system_keyspace::CLIENTS);
-    return db::execute_cql(req, addr.addr(), port, to_string(ct)).discard_result();
+    return db::qctx->execute_cql(req, std::move(addr), port, to_string(ct)).discard_result();
 }

 future<> clear_clientlist() {
--- a/connection_notifier.hh
+++ b/connection_notifier.hh
@@ -20,27 +20,65 @@
 */
 #pragma once

-#include "gms/inet_address.hh"
+#include "db/query_context.hh"
+
+#include <seastar/net/inet_address.hh>
 #include <seastar/core/sstring.hh>
+#include "seastarx.hh"
+
 #include <optional>

+namespace db::system_keyspace {
+extern const char *const CLIENTS;
+}
+
 enum class client_type {
    cql = 0,
    thrift,
    alternator,
 };

+sstring to_string(client_type ct);
+
+enum class changed_column {
+    username = 0,
+    connection_stage,
+    driver_name,
+    driver_version,
+    hostname,
+    protocol_version,
+};
+
+template <changed_column column> constexpr const char* column_literal = "";
+template <> inline constexpr const char* column_literal<changed_column::username> = "username";
+template <> inline constexpr const char* column_literal<changed_column::connection_stage> = "connection_stage";
+template <> inline constexpr const char* column_literal<changed_column::driver_name> = "driver_name";
+template <> inline constexpr const char* column_literal<changed_column::driver_version> = "driver_version";
+template <> inline constexpr const char* column_literal<changed_column::hostname> = "hostname";
+template <> inline constexpr const char* column_literal<changed_column::protocol_version> = "protocol_version";
+
+enum class client_connection_stage {
+    established = 0,
+    authenticating,
+    ready,
+};
+
+template <client_connection_stage ccs> constexpr const char* connection_stage_literal = "";
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::established> = "ESTABLISHED";
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::authenticating> = "AUTHENTICATING";
+template <> inline constexpr const char* connection_stage_literal<client_connection_stage::ready> = "READY";
+
 // Representation of a row in `system.clients'. std::optionals are for nullable cells.
 struct client_data {
-    gms::inet_address ip;
+    net::inet_address ip;
    int32_t port;
    client_type ct;
+    client_connection_stage connection_stage = client_connection_stage::established;
    int32_t shard_id;  /// ID of server-side shard which is processing the connection.

    // `optional' column means that it's nullable (possibly because it's
    // unimplemented yet). If you want to fill ("implement") any of them,
    // remember to update the query in `notify_new_client()'.
-    std::optional<sstring> connection_stage;
    std::optional<sstring> driver_name;
    std::optional<sstring> driver_version;
    std::optional<sstring> hostname;
@@ -52,6 +90,17 @@ struct client_data {
 };

 future<> notify_new_client(client_data cd);
-future<> notify_disconnected_client(gms::inet_address addr, client_type ct, int port);
-
+future<> notify_disconnected_client(net::inet_address addr, int port, client_type ct);
 future<> clear_clientlist();
+
+template <changed_column column_enum_val>
+struct notify_client_change { // functor: sets the column named by column_literal<column_enum_val> for the matching client row(s)
+    template <typename T>
+    future<> operator()(net::inet_address addr, int port, client_type ct, T&& value) {
+        const static sstring req // function-local static: the statement text is formatted once per template instantiation
+                = format("UPDATE system.{} SET {}=? WHERE address=? AND port=? AND client_type=?;",
+                        db::system_keyspace::CLIENTS, column_literal<column_enum_val>);
+        // Bind order follows the '?' placeholders: new value, address, port, client type.
+        return db::qctx->execute_cql(req, std::forward<T>(value), std::move(addr), port, to_string(ct)).discard_result();
+    }
+};
--- a/counters.cc
+++ b/counters.cc
@@ -19,16 +19,10 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "service/storage_service.hh"
 #include "counters.hh"
 #include "mutation.hh"
 #include "combine.hh"

-counter_id counter_id::local()
-{
-    return counter_id(service::get_local_storage_service().get_local_id());
-}
-
 std::ostream& operator<<(std::ostream& os, const counter_id& id) {
    return os << id.to_uuid();
 }
@@ -197,10 +191,10 @@ std::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, ato
 }


-void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
+void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id) {
    // FIXME: allow current_state to be frozen_mutation

-    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& cells) {
+    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& cells) {
        cells.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
            auto acv = ac_o_c.as_atomic_cell(cdef);
@@ -208,7 +202,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
            auto delta = acv.counter_update_value();
-            auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+            auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
            ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
        });
    };
@@ -223,7 +217,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st

    clustering_key::less_compare cmp(*m.schema());

-    auto transform_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& transformee, auto& state) {
+    auto transform_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& transformee, auto& state) {
        std::deque<std::pair<column_id, counter_shard>> shards;
        state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
@@ -232,7 +226,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
          counter_cell_view::with_linearized(acv, [&] (counter_cell_view ccv) {
-            auto cs = ccv.local_shard();
+            auto cs = ccv.get_shard(counter_id(local_id));
            if (!cs) {
                return; // continue
            }
@@ -253,7 +247,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
            auto delta = acv.counter_update_value();

            if (shards.empty() || shards.front().first > id) {
-                auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+                auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
                ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
            } else {
                auto& cs = shards.front().second;
--- a/counters.hh
+++ b/counters.hh
@@ -61,8 +61,6 @@ public:
        return !(*this == other);
    }
 public:
-    static counter_id local();
-
    // For tests.
    static counter_id generate_random() {
        return counter_id(utils::make_random_uuid());
@@ -277,7 +275,14 @@ public:
        return ac;
    }

-    class inserter_iterator : public std::iterator<std::output_iterator_tag, counter_shard> {
+    class inserter_iterator {
+    public:
+        using iterator_category = std::output_iterator_tag;
+        using value_type = counter_shard;
+        using difference_type = std::ptrdiff_t;
+        using pointer = counter_shard*;
+        using reference = counter_shard&;
+    private:
        counter_cell_builder* _builder;
    public:
        explicit inserter_iterator(counter_cell_builder& b) : _builder(&b) { }
@@ -311,7 +316,14 @@ protected:
    basic_atomic_cell_view<is_mutable> _cell;
    linearized_value_view _value;
 private:
-    class shard_iterator : public std::iterator<std::input_iterator_tag, basic_counter_shard_view<is_mutable>> {
+    class shard_iterator {
+    public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = basic_counter_shard_view<is_mutable>;
+        using difference_type = std::ptrdiff_t;
+        using pointer = basic_counter_shard_view<is_mutable>*;
+        using reference = basic_counter_shard_view<is_mutable>&;
+    private:
        pointer_type _current;
        basic_counter_shard_view<is_mutable> _current_view;
    public:
@@ -391,11 +403,6 @@ public:
        return *it;
    }

-    std::optional<counter_shard_view> local_shard() const {
-        // TODO: consider caching local shard position
-        return get_shard(counter_id::local());
-    }
-
    bool operator==(const basic_counter_cell_view& other) const {
        return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
    }
@@ -437,7 +444,7 @@ struct counter_cell_mutable_view : basic_counter_cell_view<mutable_view::yes> {
 // Transforms mutation dst from counter updates to counter shards using state
 // stored in current_state.
 // If current_state is present it has to be in the same schema as dst.
-void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset);
+void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id);

 template<>
 struct appending_hash<counter_shard_view> {
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -394,6 +394,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool allow_filtering = false;
        bool is_json = false;
        bool bypass_cache = false;
+        auto attrs = std::make_unique<cql3::attributes::raw>();
    }
    : K_SELECT (
                ( K_JSON { is_json = true; } )?
@@ -408,11 +409,12 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
      ( K_LIMIT rows=intValue { limit = rows; } )?
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
      ( K_BYPASS K_CACHE { bypass_cache = true; })?
+      ( usingClause[attrs] )?
      {
          auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
          $expr = std::make_unique<raw::select_statement>(std::move(cf), std::move(params),
            std::move(sclause), std::move(wclause), std::move(limit), std::move(per_partition_limit),
-            std::move(gbcolumns));
+            std::move(gbcolumns), std::move(attrs));
      }
    ;

@@ -521,6 +523,7 @@ usingClause[std::unique_ptr<cql3::attributes::raw>& attrs]
 usingClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_TIMESTAMP ts=intValue { attrs->timestamp = ts; }
    | K_TTL t=intValue { attrs->time_to_live = t; }
+    | K_TIMEOUT to=term { attrs->timeout = to; }
    ;

 /**
@@ -1761,6 +1764,7 @@ basic_unreserved_keyword returns [sstring str]
        | K_PER
        | K_PARTITION
        | K_GROUP
+        | K_TIMEOUT
        ) { $str = $k.text; }
    ;

@@ -1916,6 +1920,8 @@ K_GROUP:       G R O U P;

 K_LIKE:        L I K E;

+K_TIMEOUT:     T I M E O U T;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -70,11 +70,11 @@ abstract_marker::raw::raw(int32_t bind_index)
 ::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
 {
    if (receiver->type->is_collection()) {
-        if (receiver->type->get_kind() == abstract_type::kind::list) {
+        if (receiver->type->without_reversed().is_list()) {
            return ::make_shared<lists::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::set) {
+        } else if (receiver->type->without_reversed().is_set()) {
            return ::make_shared<sets::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::map) {
+        } else if (receiver->type->without_reversed().is_map()) {
            return ::make_shared<maps::marker>(_bind_index, receiver);
        }
        assert(0);
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -44,12 +44,13 @@
 namespace cql3 {

 std::unique_ptr<attributes> attributes::none() {
-    return std::unique_ptr<attributes>{new attributes{{}, {}}};
+    return std::unique_ptr<attributes>{new attributes{{}, {}, {}}};
 }

-attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live)
+attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout)
    : _timestamp{std::move(timestamp)}
    , _time_to_live{std::move(time_to_live)}
+    , _timeout{std::move(timeout)}
 { }

 bool attributes::is_timestamp_set() const {
@@ -60,6 +61,10 @@ bool attributes::is_time_to_live_set() const {
    return bool(_time_to_live);
 }

+bool attributes::is_timeout_set() const {
+    return bool(_timeout); // true when a timeout term was supplied at construction
+}
+
 int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (!_timestamp) {
        return now;
@@ -72,14 +77,12 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_unset_value()) {
        return now;
    }
-  return with_linearized(*tval, [&] (bytes_view val) {
    try {
-        data_type_for<int64_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int64_t>()->validate(*tval, options.get_cql_serialization_format());
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
-    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
-  });
+    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
 }

 int32_t attributes::get_time_to_live(const query_options& options) {
@@ -93,16 +96,15 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    if (tval.is_unset_value()) {
        return 0;
    }
-  auto ttl = with_linearized(*tval, [&] (bytes_view val) {
+
    try {
-        data_type_for<int32_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int32_t>()->validate(*tval, options.get_cql_serialization_format());
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }
+    auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));

-    return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
-  });
    if (ttl < 0) {
        throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
    }
@@ -115,6 +117,25 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    return ttl;
 }

+// Binds the timeout term and converts it to a timeout_clock duration; _timeout must be set (it is dereferenced unconditionally).
+db::timeout_clock::duration attributes::get_timeout(const query_options& options) const {
+    auto timeout = _timeout->bind_and_get(options);
+    if (timeout.is_null() || timeout.is_unset_value()) {
+        throw exceptions::invalid_request_exception("Timeout value cannot be unset/null");
+    }
+    cql_duration duration = value_cast<cql_duration>(duration_type->deserialize(*timeout));
+    if (duration.months || duration.days) { // only the nanoseconds component is converted below
+        throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
+    }
+    if (duration.nanoseconds % 1'000'000 != 0) { // 1'000'000 ns == 1 ms
+        throw exceptions::invalid_request_exception("Timeout values cannot have granularity finer than milliseconds");
+    }
+    if (duration.nanoseconds < 0) {
+        throw exceptions::invalid_request_exception("Timeout values must be non-negative");
+    }
+    return std::chrono::duration_cast<db::timeout_clock::duration>(std::chrono::nanoseconds(duration.nanoseconds));
+}
+
 void attributes::collect_marker_specification(variable_specifications& bound_names) const {
    if (_timestamp) {
        _timestamp->collect_marker_specification(bound_names);
@@ -122,12 +143,16 @@ void attributes::collect_marker_specification(variable_specifications& bound_nam
    if (_time_to_live) {
        _time_to_live->collect_marker_specification(bound_names);
    }
+    if (_timeout) {
+        _timeout->collect_marker_specification(bound_names);
+    }
 }

 std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring& ks_name, const sstring& cf_name) const {
    auto ts = !timestamp ? ::shared_ptr<term>{} : timestamp->prepare(db, ks_name, timestamp_receiver(ks_name, cf_name));
    auto ttl = !time_to_live ? ::shared_ptr<term>{} : time_to_live->prepare(db, ks_name, time_to_live_receiver(ks_name, cf_name));
-    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl)}};
+    auto to = !timeout ? ::shared_ptr<term>{} : timeout->prepare(db, ks_name, timeout_receiver(ks_name, cf_name));
+    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl), std::move(to)}};
 }

 lw_shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
@@ -138,4 +163,8 @@ lw_shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const
    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
 }

+lw_shared_ptr<column_specification> attributes::raw::timeout_receiver(const sstring& ks_name, const sstring& cf_name) const {
+    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timeout]", true), duration_type); // synthetic "[timeout]" receiver of duration type, used by prepare() for the timeout term
+}
+
 }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -54,31 +54,39 @@ class attributes final {
 private:
    const ::shared_ptr<term> _timestamp;
    const ::shared_ptr<term> _time_to_live;
+    const ::shared_ptr<term> _timeout;
 public:
    static std::unique_ptr<attributes> none();
 private:
-    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live);
+    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout);
 public:
    bool is_timestamp_set() const;

    bool is_time_to_live_set() const;

+    bool is_timeout_set() const;
+
    int64_t get_timestamp(int64_t now, const query_options& options);

    int32_t get_time_to_live(const query_options& options);

+    db::timeout_clock::duration get_timeout(const query_options& options) const;
+
    void collect_marker_specification(variable_specifications& bound_names) const;

    class raw final {
    public:
        ::shared_ptr<term::raw> timestamp;
        ::shared_ptr<term::raw> time_to_live;
+        ::shared_ptr<term::raw> timeout;

        std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name) const;
    private:
        lw_shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;

        lw_shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
+
+        lw_shared_ptr<column_specification> timeout_receiver(const sstring& ks_name, const sstring& cf_name) const;
    };
 };

--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -192,9 +192,12 @@ public:

        virtual ::shared_ptr<terminal> bind(const query_options& options) override {
            auto bytes = bind_and_get(options);
-            if (!bytes) {
+            if (bytes.is_null()) {
                return ::shared_ptr<terminal>{};
            }
+            if (bytes.is_unset_value()) {
+                return UNSET_VALUE;
+            }
            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };
@@ -227,9 +230,7 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = with_linearized(*value, [] (bytes_view value_view) {
-                return value_cast<int64_t>(long_type->deserialize_value(value_view));
-            });
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
            m.set_cell(prefix, column, make_counter_update_cell(increment, params));
        }
    };
@@ -244,9 +245,7 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = with_linearized(*value, [] (bytes_view value_view) {
-                return value_cast<int64_t>(long_type->deserialize_value(value_view));
-            });
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
            if (increment == std::numeric_limits<int64_t>::min()) {
                throw exceptions::invalid_request_exception(format("The negation of {:d} overflows supported counter precision (signed 8 bytes integer)", increment));
            }
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -27,7 +27,9 @@
 #include <fmt/ostream.h>
 #include <unordered_map>

+#include "cql3/constants.hh"
 #include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
 #include "cql3/tuples.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/list.hh"
@@ -43,7 +45,8 @@ using boost::adaptors::transformed;

 namespace {

-std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
+static
+bytes_opt do_get_value(const schema& schema,
        const column_definition& cdef,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -51,9 +54,9 @@ std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
        gc_clock::time_point now) {
    switch (cdef.kind) {
        case column_kind::partition_key:
-            return atomic_cell_value_view(key.get_component(schema, cdef.component_index()));
+            return to_bytes(key.get_component(schema, cdef.component_index()));
        case column_kind::clustering_key:
-            return atomic_cell_value_view(ckey.get_component(schema, cdef.component_index()));
+            return to_bytes(ckey.get_component(schema, cdef.component_index()));
        default:
            auto cell = cells.find_cell(cdef.id);
            if (!cell) {
@@ -61,7 +64,7 @@ std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
            }
            assert(cdef.is_atomic());
            auto c = cell->as_atomic_cell(cdef);
-            return c.is_dead(now) ? std::nullopt : std::optional<atomic_cell_value_view>(c.value());
+            return c.is_dead(now) ? std::nullopt : bytes_opt(c.value().linearize());
    }
 }

@@ -138,9 +141,8 @@ bytes_opt get_value_from_partition_slice(

 /// Returns col's value from a mutation.
 bytes_opt get_value_from_mutation(const column_value& col, row_data_from_mutation data) {
-    const auto v = do_get_value(
+    return do_get_value(
            data.schema_, *col.col, data.partition_key_, data.clustering_key_, data.other_columns, data.now);
-    return v ? v->linearize() : bytes_opt();
 }

 /// Returns col's value from the fetched data.
@@ -154,7 +156,7 @@ bytes_opt get_value(const column_value& col, const column_value_eval_bag& bag) {

 /// Type for comparing results of get_value().
 const abstract_type* get_value_comparator(const column_definition* cdef) {
-    return cdef->type->is_reversed() ? cdef->type->underlying_type().get() : cdef->type.get();
+    return &cdef->type->without_reversed();
 }

 /// Type for comparing results of get_value().
@@ -355,16 +357,12 @@ bytes_opt next_value(query::result_row_view::iterator_type& iter, const column_d
    if (cdef->type->is_multi_cell()) {
        auto cell = iter.next_collection_cell();
        if (cell) {
-            return cell->with_linearized([] (bytes_view data) {
-                return bytes(data.cbegin(), data.cend());
-            });
+            return linearized(*cell);
        }
    } else {
        auto cell = iter.next_atomic_cell();
        if (cell) {
-            return cell->value().with_linearized([] (bytes_view data) {
-                return bytes(data.cbegin(), data.cend());
-            });
+            return linearized(cell->value());
        }
    }
    return std::nullopt;
@@ -417,6 +415,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
                return equal(b, col, bag);
            });
@@ -568,7 +568,8 @@ const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return

 /// Returns possible values from t, which must be RHS of IN.
 value_list get_IN_values(
-        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator) {
+        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
+        sstring_view column_name) {
    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
        // Case `a IN (1,2,3)`.
@@ -578,8 +579,12 @@ value_list get_IN_values(
        return to_sorted_vector(std::move(result_range), comparator);
    } else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
        // Case `a IN ?`.  Collect all list-element values.
-        const auto val = static_pointer_cast<lists::value>(mkr->bind(options));
-        return to_sorted_vector(val->get_elements() | non_null | deref, comparator);
+        const auto val = mkr->bind(options);
+        if (val == constants::UNSET_VALUE) {
+            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
+        }
+        statements::request_validations::check_not_null(val, "Invalid null value for column %s", column_name);
+        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
    }
    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
 }
@@ -610,13 +615,13 @@ static constexpr bool inclusive = true, exclusive = false;
 nonwrapping_range<bytes> to_range(oper_t op, const bytes& val) {
    switch (op) {
    case oper_t::GT:
-        return nonwrapping_range<bytes>::make_starting_with(range_bound(val, exclusive));
+        return nonwrapping_range<bytes>::make_starting_with(interval_bound(val, exclusive));
    case oper_t::GTE:
-        return nonwrapping_range<bytes>::make_starting_with(range_bound(val, inclusive));
+        return nonwrapping_range<bytes>::make_starting_with(interval_bound(val, inclusive));
    case oper_t::LT:
-        return nonwrapping_range<bytes>::make_ending_with(range_bound(val, exclusive));
+        return nonwrapping_range<bytes>::make_ending_with(interval_bound(val, exclusive));
    case oper_t::LTE:
-        return nonwrapping_range<bytes>::make_ending_with(range_bound(val, inclusive));
+        return nonwrapping_range<bytes>::make_ending_with(interval_bound(val, inclusive));
    default:
        throw std::logic_error(format("to_range: unknown comparison operator {}", op));
    }
@@ -686,7 +691,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                                return oper.op == oper_t::EQ ? value_set(value_list{*val})
                                        : to_range(oper.op, *val);
                            } else if (oper.op == oper_t::IN) {
-                                return get_IN_values(oper.rhs, options, type->as_less_comparator());
+                                return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
                            }
                            throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
                        },
@@ -731,9 +736,9 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                            if (oper.op == oper_t::EQ) {
                                return value_list{*val};
                            } else if (oper.op == oper_t::GT) {
-                                return nonwrapping_range<bytes>::make_starting_with(range_bound(*val, exclusive));
+                                return nonwrapping_range<bytes>::make_starting_with(interval_bound(*val, exclusive));
                            } else if (oper.op == oper_t::GTE) {
-                                return nonwrapping_range<bytes>::make_starting_with(range_bound(*val, inclusive));
+                                return nonwrapping_range<bytes>::make_starting_with(interval_bound(*val, inclusive));
                            }
                            static const bytes MININT = serialized(std::numeric_limits<int64_t>::min()),
                                    MAXINT = serialized(std::numeric_limits<int64_t>::max());
@@ -741,9 +746,9 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                            // that as MAXINT for some reason.
                            const auto adjusted_val = (*val == MININT) ? serialized(MAXINT) : *val;
                            if (oper.op == oper_t::LT) {
-                                return nonwrapping_range<bytes>::make_ending_with(range_bound(adjusted_val, exclusive));
+                                return nonwrapping_range<bytes>::make_ending_with(interval_bound(adjusted_val, exclusive));
                            } else if (oper.op == oper_t::LTE) {
-                                return nonwrapping_range<bytes>::make_ending_with(range_bound(adjusted_val, inclusive));
+                                return nonwrapping_range<bytes>::make_ending_with(interval_bound(adjusted_val, inclusive));
                            }
                            throw std::logic_error(format("get_token_interval invalid operator {}", oper.op));
                        },
@@ -776,9 +781,11 @@ bool is_supported_by(const expression& expr, const secondary_index::index& idx)
                            return idx.supports_expression(*col.col, oper.op);
                        },
                        [&] (const std::vector<column_value>& cvs) {
-                            return boost::algorithm::any_of(cvs, [&] (const column_value& c) {
-                                return idx.supports_expression(*c.col, oper.op);
-                            });
+                            if (cvs.size() == 1) {
+                                return idx.supports_expression(*cvs[0].col, oper.op);
+                            }
+                            // We don't use index table for multi-column restrictions, as it cannot avoid filtering.
+                            return false;
                        },
                        [&] (const token&) { return false; },
                    }, oper.lhs);
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -219,7 +219,7 @@ struct aggregate_type_for<simple_date_native_type> {

 template<>
 struct aggregate_type_for<timeuuid_native_type> {
-    using type = timeuuid_native_type::primary_type;
+    using type = timeuuid_native_type;
 };

 template<>
@@ -227,6 +227,7 @@ struct aggregate_type_for<time_native_type> {
    using type = time_native_type::primary_type;
 };

+// WARNING: never invoke this on temporary values; it will return a dangling reference.
 template <typename Type>
 const Type& max_wrapper(const Type& t1, const Type& t2) {
    using std::max;
@@ -241,6 +242,10 @@ inline const net::inet_address& max_wrapper(const net::inet_address& t1, const n
    return std::memcmp(t1.data(), t2.data(), len) >= 0 ? t1 : t2;
 }

+inline const timeuuid_native_type& max_wrapper(const timeuuid_native_type& t1, const timeuuid_native_type& t2) {
+    return t1.uuid.timestamp() > t2.uuid.timestamp() ? t1 : t2; // ordered by UUID timestamp only; equal timestamps yield t2. Result aliases an argument.
+}
+
 template <typename Type>
 class impl_max_function_for final : public aggregate_function::aggregate {
   std::optional<typename aggregate_type_for<Type>::type> _max{};
@@ -323,6 +328,7 @@ make_max_function() {
    return make_shared<max_function_for<Type>>();
 }

+// WARNING: never invoke this on temporary values; it will return a dangling reference.
 template <typename Type>
 const Type& min_wrapper(const Type& t1, const Type& t2) {
    using std::min;
@@ -337,6 +343,10 @@ inline const net::inet_address& min_wrapper(const net::inet_address& t1, const n
    return std::memcmp(t1.data(), t2.data(), len) <= 0 ? t1 : t2;
 }

+inline const timeuuid_native_type& min_wrapper(const timeuuid_native_type& t1, const timeuuid_native_type& t2) {
+    return t1.uuid.timestamp() < t2.uuid.timestamp() ? t1 : t2; // ordered by UUID timestamp only; equal timestamps yield t2. Result aliases an argument, so never pass temporaries that die before the result is used.
+}
+
 template <typename Type>
 class impl_min_function_for final : public aggregate_function::aggregate {
   std::optional<typename aggregate_type_for<Type>::type> _min{};
--- a/cql3/functions/functions.cc
+++ b/cql3/functions/functions.cc
@@ -76,7 +76,7 @@ functions::init() noexcept {
    // that has less information in it. Given how unlikely it is that
    // we will run out of memory this early, having a better core dump
    // if we do seems like a good trade-off.
-    memory::disable_failure_guard dfg;
+    memory::scoped_critical_alloc_section dfg;

    std::unordered_multimap<function_name, shared_ptr<function>> ret;
    auto declare = [&ret] (shared_ptr<function> f) { ret.emplace(f->name(), f); };
@@ -181,13 +181,18 @@ inline
 shared_ptr<function>
 make_from_json_function(database& db, const sstring& keyspace, data_type t) {
    return make_native_scalar_function<true>("fromjson", t, {utf8_type},
-            [&db, &keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
-        rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
-        bytes_opt parsed_json_value;
-        if (!json_value.IsNull()) {
-            parsed_json_value.emplace(from_json_object(*t, json_value, sf));
+            [&db, keyspace, t](cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
+        try {
+            rjson::value json_value = rjson::parse(utf8_type->to_string(parameters[0].value()));
+            bytes_opt parsed_json_value;
+            if (!json_value.IsNull()) {
+                parsed_json_value.emplace(from_json_object(*t, json_value, sf));
+            }
+            return parsed_json_value;
+        } catch(rjson::error& e) {
+            throw exceptions::function_execution_exception("fromJson",
+                format("Failed parsing fromJson parameter: {}", e.what()), keyspace, {t->name()});
        }
-        return parsed_json_value;
    });
 }

--- a/cql3/functions/native_scalar_function.hh
+++ b/cql3/functions/native_scalar_function.hh
@@ -78,7 +78,22 @@ public:
        return Pure;
    }
    virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
-        return _func(sf, parameters);
+        try {
+            return _func(sf, parameters);
+        } catch(exceptions::cassandra_exception&) {
+            // If the function's code took the time to produce an official
+            // cassandra_exception, pass it through. Otherwise, below we will
+            // wrap the unknown exception in a function_execution_exception.
+            throw;
+        } catch(...) {
+            std::vector<sstring> args;
+            args.reserve(arg_types().size());
+            for (const data_type& a : arg_types()) {
+                args.push_back(a->name());
+            }
+            throw exceptions::function_execution_exception(name().name,
+                format("Failed execution of function {}: {}", name(), std::current_exception()), name().keyspace, std::move(args));
+        }
    }
 };

--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -40,7 +40,7 @@ lw_shared_ptr<column_specification>
 lists::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
            ::make_shared<column_identifier>(format("value({})", *column.name), true),
-                dynamic_pointer_cast<const list_type_impl>(column.type)->get_elements_type());
+                dynamic_cast<const list_type_impl&>(column.type->without_reversed()).get_elements_type());
 }

 lw_shared_ptr<column_specification>
@@ -87,7 +87,7 @@ lists::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<col

 void
 lists::literal::validate_assignable_to(database& db, const sstring keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const list_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_list()) {
        throw exceptions::invalid_request_exception(format("Invalid list literal for {} of type {}",
                *receiver.name, receiver.type->as_cql3_type()));
    }
@@ -125,18 +125,11 @@ lists::literal::to_string() const {

 lists::value
 lists::value::from_serialized(const fragmented_temporary_buffer::view& val, const list_type_impl& type, cql_serialization_format sf) {
-    return with_linearized(val, [&] (bytes_view v) {
-        return from_serialized(v, type, sf);
-    });
-}
-
-lists::value
-lists::value::from_serialized(bytes_view v, const list_type_impl& type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol()?!
-        auto l = value_cast<list_type_impl::native_type>(type.deserialize(v, sf));
+        auto l = value_cast<list_type_impl::native_type>(type.deserialize(val, sf));
        std::vector<bytes_opt> elements;
        elements.reserve(l.size());
        for (auto&& element : l) {
@@ -227,17 +220,15 @@ lists::delayed_value::bind(const query_options& options) {
 ::shared_ptr<terminal>
 lists::marker::bind(const query_options& options) {
    const auto& value = options.get_value_at(_bind_index);
-    auto& ltype = static_cast<const list_type_impl&>(*_receiver->type);
+    auto& ltype = dynamic_cast<const list_type_impl&>(_receiver->type->without_reversed());
    if (value.is_null()) {
        return nullptr;
    } else if (value.is_unset_value()) {
        return constants::UNSET_VALUE;
    } else {
        try {
-            return with_linearized(*value, [&] (bytes_view v) {
-                ltype.validate(v, options.get_cql_serialization_format());
-                return make_shared<lists::value>(value::from_serialized(v, ltype, options.get_cql_serialization_format()));
-            });
+            ltype.validate(*value, options.get_cql_serialization_format());
+            return make_shared<lists::value>(value::from_serialized(*value, ltype, options.get_cql_serialization_format()));
        } catch (marshal_exception& e) {
            throw exceptions::invalid_request_exception(
                    format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
@@ -308,9 +299,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
        return;
    }

-    auto idx = with_linearized(*index, [] (bytes_view v) {
-        return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(v));
-    });
+    auto idx = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*index));
    auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
    if (!existing_list_opt) {
        throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -73,7 +73,6 @@ public:
    };

    class value : public multi_item_terminal, collection_terminal {
-        static value from_serialized(bytes_view v, const list_type_impl& type, cql_serialization_format sf);
    public:
        std::vector<bytes_opt> _elements;
    public:
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -55,14 +55,14 @@ lw_shared_ptr<column_specification>
 maps::key_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
                ::make_shared<column_identifier>(format("key({})", *column.name), true),
-                 dynamic_pointer_cast<const map_type_impl>(column.type)->get_keys_type());
+                dynamic_cast<const map_type_impl&>(column.type->without_reversed()).get_keys_type());
 }

 lw_shared_ptr<column_specification>
 maps::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
                ::make_shared<column_identifier>(format("value({})", *column.name), true),
-                 dynamic_pointer_cast<const map_type_impl>(column.type)->get_values_type());
+                 dynamic_cast<const map_type_impl&>(column.type->without_reversed()).get_values_type());
 }

 ::shared_ptr<term>
@@ -88,7 +88,9 @@ maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

        values.emplace(k, v);
    }
-    delayed_value value(static_pointer_cast<const map_type_impl>(receiver->type)->get_keys_type()->as_less_comparator(), values);
+    delayed_value value(
+            dynamic_cast<const map_type_impl&>(receiver->type->without_reversed()).get_keys_type()->as_less_comparator(),
+            values);
    if (all_terminal) {
        return value.bind(query_options::DEFAULT);
    } else {
@@ -98,7 +100,7 @@ maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

 void
 maps::literal::validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const map_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_map()) {
        throw exceptions::invalid_request_exception(format("Invalid map literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    auto&& key_spec = maps::key_spec_of(receiver);
@@ -158,15 +160,13 @@ maps::value::from_serialized(const fragmented_temporary_buffer::view& fragmented
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserialize_for_native_protocol?!
-      return with_linearized(fragmented_value, [&] (bytes_view value) {
-        auto m = value_cast<map_type_impl::native_type>(type.deserialize(value, sf));
+        auto m = value_cast<map_type_impl::native_type>(type.deserialize(fragmented_value, sf));
        std::map<bytes, bytes, serialized_compare> map(type.get_keys_type()->as_less_comparator());
        for (auto&& e : m) {
            map.emplace(type.get_keys_type()->decompose(e.first),
                        type.get_values_type()->decompose(e.second));
        }
        return maps::value { std::move(map) };
-      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -263,14 +263,16 @@ maps::marker::bind(const query_options& options) {
        return constants::UNSET_VALUE;
    }
    try {
-        with_linearized(*val, [&] (bytes_view value) {
-            _receiver->type->validate(value, options.get_cql_serialization_format());
-        });
+        _receiver->type->validate(*val, options.get_cql_serialization_format());
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(
                format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
    }
-    return ::make_shared<maps::value>(maps::value::from_serialized(*val, static_cast<const map_type_impl&>(*_receiver->type), options.get_cql_serialization_format()));
+    return ::make_shared<maps::value>(
+            maps::value::from_serialized(
+                    *val,
+                    dynamic_cast<const map_type_impl&>(_receiver->type->without_reversed()),
+                    options.get_cql_serialization_format()));
 }

 void
@@ -305,6 +307,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = _k->bind_and_get(params._options);
    auto value = _t->bind_and_get(params._options);
+    if (value.is_unset_value()) { // unset value: return before anything is written to the mutation
+        return;
+    }
+    if (key.is_unset_value()) { // value is known to be set here, so only the key can still be unset
+        throw invalid_request_exception("Invalid unset map key");
+    }
    if (!key) {
        throw invalid_request_exception("Invalid null map key");
    }
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -42,12 +42,14 @@
 #include "cql3/cql_config.hh"
 #include "query_options.hh"
 #include "version.hh"
+#include "db/consistency_level_type.hh"

 namespace cql3 {

 const cql_config default_cql_config;

-thread_local const query_options::specific_options query_options::specific_options::DEFAULT{-1, {}, {}, api::missing_timestamp};
+thread_local const query_options::specific_options query_options::specific_options::DEFAULT{
+    -1, {}, db::consistency_level::SERIAL, api::missing_timestamp};

 thread_local query_options query_options::DEFAULT{default_cql_config,
    db::consistency_level::ONE, infinite_timeout_config, std::nullopt,
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -61,8 +61,6 @@ logging::logger log("query_processor");
 logging::logger prep_cache_log("prepared_statements_cache");
 logging::logger authorized_prepared_statements_cache_log("authorized_prepared_statements_cache");

-distributed<query_processor> _the_query_processor;
-
 const sstring query_processor::CQL_VERSION = "3.3.1";

 const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
@@ -670,10 +668,14 @@ struct internal_query_state {
    bool more_results = true;
 };

-::shared_ptr<internal_query_state> query_processor::create_paged_state(const sstring& query_string,
-        const std::initializer_list<data_value>& values, int32_t page_size) {
+::shared_ptr<internal_query_state> query_processor::create_paged_state(
+        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
+        const std::initializer_list<data_value>& values,
+        int32_t page_size) {
    auto p = prepare_internal(query_string);
-    auto opts = make_internal_options(p, values, db::consistency_level::ONE, infinite_timeout_config, page_size);
+    auto opts = make_internal_options(p, values, cl, timeout_config, page_size);
    ::shared_ptr<internal_query_state> res = ::make_shared<internal_query_state>(
            internal_query_state{
                    query_string,
@@ -937,17 +939,20 @@ bool query_processor::migration_subscriber::should_invalidate(
    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
        const std::initializer_list<data_value>& values,
+        int32_t page_size,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, values), std::move(f));
+    return for_each_cql_result(create_paged_state(query_string, cl, timeout_config, values, page_size), std::move(f));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, {}), std::move(f));
+    return query_internal(query_string, db::consistency_level::ONE, infinite_timeout_config, {}, 1000, std::move(f));
 }

 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -224,75 +224,52 @@ public:
    /*!
     * \brief iterate over all cql results using paging
     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
+     * You create a statement with optional parameters and pass
+     * a function that goes over the result rows.
     *
-     * The passed function would be called for all the results, return stop_iteration::yes
-     * to stop during iteration.
+     * The passed function is called for each row; it returns a future resolving to
+     * stop_iteration::yes to stop the iteration.
     *
     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
-                ....
-                ....
-                return stop_iteration::no;
-            });
-
-     * You can use place holder in the query, the prepared statement will only be done once.
-     *
-     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return false the iteration would stop
-     * args - arbitrary number of query parameters
-     */
-    template<typename... Args>
-    future<> query(
-            const sstring& query_string,
-            std::function<stop_iteration(const cql3::untyped_result_set_row&)>&& f,
-            Args&&... args) {
-        return for_each_cql_result(
-                create_paged_state(query_string, { data_value(std::forward<Args>(args))... }), std::move(f));
-    }
-
-    /*!
-     * \brief iterate over all cql results using paging
-     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
-     *
-     * The passed function would be called for all the results, return future<stop_iteration::yes>
-     * to stop during iteration.
-     *
-     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
+            return query_internal(
+                    "SELECT * from system.compaction_history",
+                    db::consistency_level::ONE,
+                    infinite_timeout_config,
+                    {}, 1000, // values, page_size
+                    [&history] (const cql3::untyped_result_set::row& row) mutable {
                ....
                ....
                return make_ready_future<stop_iteration>(stop_iteration::no);
            });

-     * You can use place holder in the query, the prepared statement will only be done once.
+     * You can use placeholders in the query, the statement will only be prepared once.
     *
-     *
-     * query_string - the cql string, can contain place holder
-     * values - query parameters value
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * cl - consistency level of the query
+     * timeout_config - timeout configuration
+     * values - values to be substituted for the placeholders in the query
+     * page_size - maximum page size
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
+            db::consistency_level cl,
+            const timeout_config& timeout_config,
            const std::initializer_list<data_value>& values,
+            int32_t page_size,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

    /*
     * \brief iterate over all cql results using paging
-     * An overload of the query with future function without query parameters.
+     * An overload of query_internal without query parameters
+     * using CL = ONE, no timeout, and page size = 1000.
     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

@@ -354,8 +331,10 @@ private:
     */
    ::shared_ptr<internal_query_state> create_paged_state(
            const sstring& query_string,
-            const std::initializer_list<data_value>& = { },
-            int32_t page_size = 1000);
+            db::consistency_level,
+            const timeout_config&,
+            const std::initializer_list<data_value>&,
+            int32_t page_size);

    /*!
     * \brief run a query using paging
@@ -464,14 +443,4 @@ private:
            ::shared_ptr<cql_statement> statement);
 };

-extern seastar::sharded<query_processor> _the_query_processor;
-
-inline seastar::sharded<query_processor>& get_query_processor() {
-    return _the_query_processor;
-}
-
-inline query_processor& get_local_query_processor() {
-    return _the_query_processor.local();
-}
-
 }
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -171,8 +171,7 @@ public:

    virtual void merge_with(::shared_ptr<restriction> restriction) override {
        if (find_atom(restriction->expression, [] (const expr::binary_operator& b) {
-                    return std::holds_alternative<std::vector<expr::column_value>>(b.lhs)
-                            && std::get<std::vector<expr::column_value>>(b.lhs).size() > 1;
+                    return std::holds_alternative<std::vector<expr::column_value>>(b.lhs);
                })) {
            throw exceptions::invalid_request_exception(
                "Mixing single column relations and multi column relations on clustering columns is not allowed");
@@ -213,30 +212,22 @@ private:
    std::vector<range_type> compute_bounds(const query_options& options) const {
        std::vector<range_type> ranges;

-        static constexpr auto invalid_null_msg = std::is_same<ValueType, partition_key>::value
-            ? "Invalid null value for partition key part %s" : "Invalid null value for clustering key part %s";
-
        // TODO: rewrite this to simply invoke possible_lhs_values on each clustering column, find the first
        // non-list, and take Cartesian product of that prefix.  No need for to_range() and std::get() here.
        if (_restrictions->is_all_eq()) {
-            if (_restrictions->size() == 1) {
-                auto&& e = *restrictions().begin();
-                const auto b = std::get<expr::binary_operator>(e.second->expression).rhs->bind_and_get(options);
-                if (!b) {
-                    throw exceptions::invalid_request_exception(sprint(invalid_null_msg, e.first->name_as_text()));
-                }
-                return {range_type::make_singular(ValueType::from_single_value(*_schema, to_bytes(b)))};
-            }
            std::vector<bytes> components;
            components.reserve(_restrictions->size());
            for (auto&& e : restrictions()) {
                const column_definition* def = e.first;
                assert(components.size() == _schema->position(*def));
-                const auto b = std::get<expr::binary_operator>(e.second->expression).rhs->bind_and_get(options);
-                if (!b) {
-                    throw exceptions::invalid_request_exception(sprint(invalid_null_msg, e.first->name_as_text()));
+                // Because _restrictions is all EQ, possible_lhs_values must return a list, not a range.
+                const auto b = std::get<expr::value_list>(possible_lhs_values(e.first, e.second->expression, options));
+                // Furthermore, this list is either a single element (when all RHSs are the same) or empty (when at
+                // least two are different, so the restrictions cannot hold simultaneously -- ie, c=1 AND c=2).
+                if (b.empty()) {
+                    return {};
                }
-                components.emplace_back(to_bytes(b));
+                components.emplace_back(b.front());
            }
            return {range_type::make_singular(ValueType::from_exploded(*_schema, std::move(components)))};
        }
@@ -324,7 +315,7 @@ public:
        std::vector<bytes_opt> res;
        for (const ValueType& r : src) {
            for (const auto& component : r.components()) {
-                res.emplace_back(component);
+                res.emplace_back(to_bytes(component));
            }
        }
        return res;
--- a/cql3/restrictions/single_column_restrictions.hh
+++ b/cql3/restrictions/single_column_restrictions.hh
@@ -108,6 +108,9 @@ public:
            return bytes_opt{};
        } else {
            const auto values = std::get<expr::value_list>(possible_lhs_values(&cdef, it->second->expression, options));
+            if (values.empty()) {
+                return bytes_opt{};
+            }
            assert(values.size() == 1);
            return values.front();
        }
@@ -119,7 +122,7 @@ public:
     * @param column_def the column definition
     * @return the restriction associated to the specified column
     */
-    ::shared_ptr<restriction> get_restriction(const column_definition& column_def) const {
+    ::shared_ptr<single_column_restriction> get_restriction(const column_definition& column_def) const {
        auto i = _restrictions.find(&column_def);
        if (i == _restrictions.end()) {
            return {};
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -147,7 +147,6 @@ statement_restrictions::statement_restrictions(database& db,
        const std::vector<::shared_ptr<relation>>& where_clause,
        variable_specifications& bound_names,
        bool selects_only_static_columns,
-        bool select_a_collection,
        bool for_view,
        bool allow_filtering)
    : statement_restrictions(schema, allow_filtering)
@@ -193,12 +192,12 @@ statement_restrictions::statement_restrictions(database& db,
    const expr::allow_local_index allow_local(
            !_partition_key_restrictions->has_unrestricted_components(*_schema)
            && _partition_key_restrictions->is_all_eq());
-    const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
-    const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
-    const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);
+    _has_queriable_ck_index = _clustering_columns_restrictions->has_supporting_index(sim, allow_local);
+    _has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim, allow_local);
+    _has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim, allow_local);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
-    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
+    process_partition_key_restrictions(for_view, allow_filtering);

    // Some but not all of the partition key columns have been specified;
    // hence we need turn these restrictions into index expressions.
@@ -227,10 +226,11 @@ statement_restrictions::statement_restrictions(database& db,
        }
    }

-    process_clustering_columns_restrictions(has_queriable_clustering_column_index, select_a_collection, for_view, allow_filtering);
+    process_clustering_columns_restrictions(for_view, allow_filtering);

    // Covers indexes on the first clustering column (among others).
-    if (_is_key_range && has_queriable_clustering_column_index) {
+    if (_is_key_range && _has_queriable_ck_index &&
+        !dynamic_pointer_cast<multi_column_restriction>(_clustering_columns_restrictions)) {
        _uses_secondary_indexing = true;
    }

@@ -265,7 +265,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-        if (has_queriable_regular_index) {
+        if (_has_queriable_regular_index) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
@@ -329,20 +329,39 @@ int statement_restrictions::score(const secondary_index::index& index) const {
    return 1;
 }

+namespace {
+
+using namespace cql3::restrictions;
+
+/// If rs contains a restrictions_map of individual columns to their restrictions, returns it.  Otherwise, returns null.
+const single_column_restrictions::restrictions_map* get_individual_restrictions_map(const restrictions* rs) {
+    if (auto regular = dynamic_cast<const single_column_restrictions*>(rs)) {
+        return &regular->restrictions();
+    } else if (auto partition = dynamic_cast<const single_column_partition_key_restrictions*>(rs)) {
+        return &partition->restrictions();
+    } else if (auto clustering = dynamic_cast<const single_column_clustering_key_restrictions*>(rs)) {
+        return &clustering->restrictions();
+    }
+    return nullptr;
+}
+
+} // anonymous namespace
+
 std::pair<std::optional<secondary_index::index>, ::shared_ptr<cql3::restrictions::restrictions>> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
    std::optional<secondary_index::index> chosen_index;
    int chosen_index_score = 0;
    ::shared_ptr<cql3::restrictions::restrictions> chosen_index_restrictions;

    for (const auto& index : sim.list_indexes()) {
+        auto cdef = _schema->get_column_definition(to_bytes(index.target_column()));
        for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
-            for (const auto& cdef : restriction->get_column_defs()) {
-                if (index.depends_on(*cdef)) {
-                    if (score(index) > chosen_index_score) {
-                        chosen_index = index;
-                        chosen_index_score = score(index);
-                        chosen_index_restrictions = restriction;
-                    }
+            if (auto rmap = get_individual_restrictions_map(restriction.get())) {
+                const auto found = rmap->find(cdef);
+                if (found != rmap->end() && is_supported_by(found->second->expression, index)
+                    && score(index) > chosen_index_score) {
+                    chosen_index = index;
+                    chosen_index_score = score(index);
+                    chosen_index_restrictions = restriction;
                }
            }
        }
@@ -401,7 +420,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
    return column_defs_for_filtering;
 }

-void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
+void statement_restrictions::process_partition_key_restrictions(bool for_view, bool allow_filtering) {
    // If there is a queriable index, no special condition are required on the other restrictions.
    // But we still need to know 2 things:
    // - If we don't have a queriable index, is the query ok
@@ -412,17 +431,17 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
        _is_key_range = true;
    } else if (_partition_key_restrictions->empty()) {
        _is_key_range = true;
-        _uses_secondary_indexing = has_queriable_index;
+        _uses_secondary_indexing = _has_queriable_pk_index;
    }

    if (_partition_key_restrictions->needs_filtering(*_schema)) {
-        if (!allow_filtering && !for_view && !has_queriable_index) {
+        if (!allow_filtering && !for_view && !_has_queriable_pk_index) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
                "thus may have unpredictable performance. If you want to execute "
                "this query despite the performance unpredictability, use ALLOW FILTERING");
        }
        _is_key_range = true;
-        _uses_secondary_indexing = has_queriable_index;
+        _uses_secondary_indexing = _has_queriable_pk_index;
    }

 }
@@ -435,23 +454,19 @@ bool statement_restrictions::has_unrestricted_clustering_columns() const {
    return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
 }

-void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering) {
+void statement_restrictions::process_clustering_columns_restrictions(bool for_view, bool allow_filtering) {
    if (!has_clustering_columns_restriction()) {
        return;
    }

-    if (clustering_key_restrictions_has_IN() && select_a_collection) {
-        throw exceptions::invalid_request_exception(
-            "Cannot restrict clustering columns by IN relations when a collection is selected by the query");
-    }
    if (find_atom(_clustering_columns_restrictions->expression, expr::is_on_collection)
-        && !has_queriable_index && !allow_filtering) {
+        && !_has_queriable_ck_index && !allow_filtering) {
        throw exceptions::invalid_request_exception(
            "Cannot restrict clustering columns by a CONTAINS relation without a secondary index or filtering");
    }

    if (has_clustering_columns_restriction() && _clustering_columns_restrictions->needs_filtering(*_schema)) {
-        if (has_queriable_index) {
+        if (_has_queriable_ck_index) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering && !for_view) {
            auto clustering_columns_iter = _schema->clustering_key_columns().begin();
@@ -490,24 +505,73 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
    return _clustering_columns_restrictions->bounds_ranges(options);
 }

+namespace {
+
+/// True iff get_partition_slice_for_global_index_posting_list() can calculate the token value from r, i.e. no
+/// partition-key component is unrestricted and is_all_eq() holds for the partition key.  Keep in sync with that function.
+bool token_known(const statement_restrictions& r) {
+    return !r.has_partition_key_unrestricted_components() && r.get_partition_key_restrictions()->is_all_eq();
+}
+
+} // anonymous namespace
+
 bool statement_restrictions::need_filtering() const {
-    uint32_t number_of_restricted_columns_for_indexing = 0;
-    for (auto&& restrictions : _index_restrictions) {
-        number_of_restricted_columns_for_indexing += restrictions->size();
+    using namespace expr;
+
+    const auto npart = _partition_key_restrictions->size();
+    if (npart > 0 && npart < _schema->partition_key_size()) {
+        // Can't calculate the token value, so a naive base-table query must be filtered.  Same for any index tables,
+        // except when the sole restriction is on one partition-key column that is supported by an index.
+        return !(npart == 1 && _has_queriable_pk_index &&
+                 _clustering_columns_restrictions->empty() && _nonprimary_key_restrictions->empty());
+    }
+    if (_partition_key_restrictions->needs_filtering(*_schema)) {
+        // We most likely cannot calculate token(s).  Neither base-table nor index-table queries can avoid filtering.
+        return true;
+    }
+    // Now we know the partition key is either unrestricted or fully restricted.
+
+    const auto nreg = _nonprimary_key_restrictions->size();
+    if (nreg > 1 || (nreg == 1 && !_has_queriable_regular_index)) {
+        return true; // Regular columns are unsorted in storage and no single index suffices.
+    }
+    if (nreg == 1) { // Single non-key restriction supported by an index.
+        // Will the index-table query require filtering?  That depends on whether its clustering key is restricted to a
+        // continuous range.  Recall that this clustering key is (token, pk, ck) of the base table.
+        if (npart == 0 && _clustering_columns_restrictions->empty()) {
+            return false; // No clustering key restrictions => whole partitions.
+        }
+        return !token_known(*this) || _clustering_columns_restrictions->needs_filtering(*_schema);
+    }
+    // Now we know there are no non-key restrictions.
+
+    if (dynamic_pointer_cast<multi_column_restriction>(_clustering_columns_restrictions)) {
+        // Multicolumn bounds mean lexicographic order, implying a continuous clustering range.  Multicolumn IN means a
+        // finite set of continuous ranges.  Multicolumn restrictions cannot currently be combined with single-column
+        // clustering restrictions.  Therefore, a continuous clustering range is guaranteed.
+        return false;
    }

-    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
-    // If the whole partition key is restricted, it does not imply filtering
-    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
-        number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
-    } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
-        number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
+    if (_has_queriable_ck_index && _uses_secondary_indexing) {
+        // In cases where we use an index, clustering column restrictions might cause the need for filtering.
+        // TODO: This is overly conservative: there are some cases when this returns true but filtering
+        // is not needed, so the database will sometimes perform filtering when it's not actually needed.
+        // Query performance shouldn't be affected much; at most we will filter rows that are all correct.
+        // Here are some cases to consider:
+        // On a table with primary key (p, c1, c2, c3) with an index on c3
+        // WHERE c3 = ? - doesn't require filtering
+        // WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
+        // WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
+        // WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
+        // WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
+        return _clustering_columns_restrictions->size() > 1;
    }
-    return number_of_restricted_columns_for_indexing > 1
-            || (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
-            || (number_of_restricted_columns_for_indexing != 0 && _nonprimary_key_restrictions->has_multiple_contains())
-            || (number_of_restricted_columns_for_indexing != 0 && !_uses_secondary_indexing)
-            || (_uses_secondary_indexing && number_of_filtering_restrictions > 1);
+    // Now we know that the query doesn't use an index.
+
+    // The only restrictions that can still cause filtering are those on the clustering columns.
+    return _clustering_columns_restrictions->needs_filtering(*_schema);
 }

 void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -102,6 +102,8 @@ private:
     */
    bool _is_key_range = false;

+    bool _has_queriable_regular_index = false, _has_queriable_pk_index = false, _has_queriable_ck_index = false;
+
 public:
    /**
     * Creates a new empty <code>StatementRestrictions</code>.
@@ -117,7 +119,6 @@ public:
        const std::vector<::shared_ptr<relation>>& where_clause,
        variable_specifications& bound_names,
        bool selects_only_static_columns,
-        bool select_a_collection,
        bool for_view = false,
        bool allow_filtering = false);

@@ -209,16 +210,15 @@ public:
     */
    bool has_unrestricted_clustering_columns() const;
 private:
-    void process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering);
+    void process_partition_key_restrictions(bool for_view, bool allow_filtering);

    /**
     * Processes the clustering column restrictions.
     *
     * @param has_queriable_index <code>true</code> if some of the queried data are indexed, <code>false</code> otherwise
-     * @param select_a_collection <code>true</code> if the query should return a collection column
     * @throws InvalidRequestException if the request is invalid
     */
-    void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering);
+    void process_clustering_columns_restrictions(bool for_view, bool allow_filtering);

    /**
     * Returns the <code>Restrictions</code> for the specified type of columns.
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -140,21 +140,6 @@ public:
        return true;
    }

-    /**
-     * Checks if this selection contains a collection.
-     *
-     * @return <code>true</code> if this selection contains a collection, <code>false</code> otherwise.
-     */
-    bool contains_a_collection() const {
-        if (!_schema->has_multi_cell_collections()) {
-            return false;
-        }
-
-        return std::any_of(_columns.begin(), _columns.end(), [] (auto&& def) {
-           return def->type->is_collection() && def->type->is_multi_cell();
-        });
-    }
-
    /**
     * Returns the index of the specified column.
     *
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -31,7 +31,7 @@ lw_shared_ptr<column_specification>
 sets::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
            ::make_shared<column_identifier>(format("value({})", *column.name), true),
-            dynamic_pointer_cast<const set_type_impl>(column.type)->get_elements_type());
+            dynamic_cast<const set_type_impl&>(column.type->without_reversed()).get_elements_type());
 }

 shared_ptr<term>
@@ -74,7 +74,8 @@ sets::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

        values.push_back(std::move(t));
    }
-    auto compare = dynamic_pointer_cast<const set_type_impl>(receiver->type)->get_elements_type()->as_less_comparator();
+    auto compare = dynamic_cast<const set_type_impl&>(receiver->type->without_reversed())
+            .get_elements_type()->as_less_comparator();

    auto value = ::make_shared<delayed_value>(compare, std::move(values));
    if (all_terminal) {
@@ -86,7 +87,7 @@ sets::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

 void
 sets::literal::validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const set_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_set()) {
        // We've parsed empty maps as a set literal to break the ambiguity so
        // handle that case now
        if (dynamic_pointer_cast<const map_type_impl>(receiver.type) && _elements.empty()) {
@@ -106,7 +107,7 @@ sets::literal::validate_assignable_to(database& db, const sstring& keyspace, con

 assignment_testable::test_result
 sets::literal::test_assignment(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const set_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_set()) {
        // We've parsed empty maps as a set literal to break the ambiguity so handle that case now
        if (dynamic_pointer_cast<const map_type_impl>(receiver.type) && _elements.empty()) {
            return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
@@ -137,14 +138,12 @@ sets::value::from_serialized(const fragmented_temporary_buffer::view& val, const
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol?!
-      return with_linearized(val, [&] (bytes_view v) {
-        auto s = value_cast<set_type_impl::native_type>(type.deserialize(v, sf));
+        auto s = value_cast<set_type_impl::native_type>(type.deserialize(val, sf));
        std::set<bytes, serialized_compare> elements(type.get_elements_type()->as_less_comparator());
        for (auto&& element : s) {
            elements.insert(elements.end(), type.get_elements_type()->decompose(element));
        }
        return value(std::move(elements));
-      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -226,8 +225,11 @@ sets::delayed_value::bind(const query_options& options) {

 sets::marker::marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
    : abstract_marker{bind_index, std::move(receiver)} {
-        assert(dynamic_cast<const set_type_impl*>(_receiver->type.get()));
+    if (!_receiver->type->without_reversed().is_set()) {
+        throw std::runtime_error(format("Receiver {} for set marker has wrong type: {}",
+                                        _receiver->cf_name, _receiver->type->name()));
    }
+}

 ::shared_ptr<terminal>
 sets::marker::bind(const query_options& options) {
@@ -237,11 +239,9 @@ sets::marker::bind(const query_options& options) {
    } else if (value.is_unset_value()) {
        return constants::UNSET_VALUE;
    } else {
-        auto& type = static_cast<const set_type_impl&>(*_receiver->type);
+        auto& type = dynamic_cast<const set_type_impl&>(_receiver->type->without_reversed());
        try {
-            with_linearized(*value, [&] (bytes_view v) {
-                type.validate(v, options.get_cql_serialization_format());
-            });
+            type.validate(*value, options.get_cql_serialization_format());
        } catch (marshal_exception& e) {
            throw exceptions::invalid_request_exception(
                    format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
@@ -284,8 +284,7 @@ void
 sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params,
        shared_ptr<term> value, const column_definition& column) {
    auto set_value = dynamic_pointer_cast<sets::value>(std::move(value));
-    auto set_type = dynamic_cast<const set_type_impl*>(column.type.get());
-    assert(set_type);
+    auto& set_type = dynamic_cast<const set_type_impl&>(column.type->without_reversed());
    if (column.type->is_multi_cell()) {
        if (!set_value || set_value->_elements.empty()) {
            return;
@@ -295,10 +294,10 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        collection_mutation_description mut;

        for (auto&& e : set_value->_elements) {
-            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e, params.make_cell(*set_type.value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
        }

-        m.set_cell(row_key, column, mut.serialize(*set_type));
+        m.set_cell(row_key, column, mut.serialize(set_type));
    } else if (set_value != nullptr) {
        // for frozen sets, we're overwriting the whole cell
        auto v = set_type_impl::serialize_partially_deserialized_form(
@@ -315,7 +314,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
    assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";

    auto&& value = _t->bind(params._options);
-    if (!value) {
+    if (!value || value == constants::UNSET_VALUE) {
        return;
    }

--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -45,7 +45,7 @@
 #include "db/system_keyspace.hh"
 #include "database.hh"

-bool is_system_keyspace(const sstring& keyspace);
+bool is_system_keyspace(std::string_view keyspace);

 cql3::statements::alter_keyspace_statement::alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs)
    : _name(name)
@@ -91,10 +91,10 @@ void cql3::statements::alter_keyspace_statement::validate(service::storage_proxy
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> cql3::statements::alter_keyspace_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> cql3::statements::alter_keyspace_statement::announce_migration(service::storage_proxy& proxy) const {
    auto old_ksm = proxy.get_db().local().find_keyspace(_name).metadata();
-    const auto& tm = proxy.get_token_metadata();
-    return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm, tm), is_local_only).then([this] {
+    const auto& tm = *proxy.get_token_metadata_ptr();
+    return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm, tm)).then([this] {
        using namespace cql_transport;
        return ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
--- a/cql3/statements/alter_keyspace_statement.hh
+++ b/cql3/statements/alter_keyspace_statement.hh
@@ -61,7 +61,7 @@ public:

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    void validate(service::storage_proxy& proxy, const service::client_state& state) const override;
-    future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -70,7 +70,9 @@ alter_table_statement::alter_table_statement(shared_ptr<cf_name> name,
 }

 future<> alter_table_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    return state.has_column_family_access(keyspace(), column_family(), auth::permission::ALTER);
+    using cdt = auth::command_desc::type;
+    return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::ALTER,
+                                          _type == type::opts ? cdt::ALTER_WITH_OPTS : cdt::OTHER);
 }

 void alter_table_statement::validate(service::storage_proxy& proxy, const service::client_state& state) const
@@ -286,7 +288,7 @@ void alter_table_statement::drop_column(const schema& schema, const table& cf, s
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto& db = proxy.get_db().local();
    auto s = validation::validate_column_family(db, keyspace(), column_family());
@@ -394,7 +396,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        break;
    }

-    return service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, std::move(view_updates), is_local_only)
+    return service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, std::move(view_updates))
        .then([this] {
            using namespace cql_transport;
            return ::make_shared<event::schema_change>(
--- a/cql3/statements/alter_table_statement.hh
+++ b/cql3/statements/alter_table_statement.hh
@@ -80,7 +80,7 @@ public:

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy& proxy, const service::client_state& state) const override;
-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 private:
    void add_column(const schema& schema, const table& cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -78,7 +78,7 @@ const sstring& alter_type_statement::keyspace() const
    return _name.get_keyspace();
 }

-void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only) const
+void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks) const
 {
    auto&& all_types = ks.metadata()->user_types().get_all_types();
    auto to_update = all_types.find(_name.get_user_type_name());
@@ -100,7 +100,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b

    // Now, we need to announce the type update to basically change it for new tables using this type,
    // but we also need to find all existing user types and CF using it and change them.
-    service::get_local_migration_manager().announce_type_update(updated, is_local_only).get();
+    service::get_local_migration_manager().announce_type_update(updated).get();

    for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
        auto cfm = schema_builder(schema);
@@ -115,21 +115,21 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
        }
        if (modified) {
            if (schema->is_view()) {
-                service::get_local_migration_manager().announce_view_update(view_ptr(cfm.build()), is_local_only).get();
+                service::get_local_migration_manager().announce_view_update(view_ptr(cfm.build())).get();
            } else {
-                service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, {}, is_local_only).get();
+                service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, {}).get();
            }
        }
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_type_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_type_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return seastar::async([this, &proxy, is_local_only] {
+    return seastar::async([this, &proxy] {
        auto&& db = proxy.get_db().local();
        try {
            auto&& ks = db.find_keyspace(keyspace());
-            do_announce_migration(db, ks, is_local_only);
+            do_announce_migration(db, ks);
            using namespace cql_transport;
            return ::make_shared<event::schema_change>(
                    event::schema_change::change_type::UPDATED,
--- a/cql3/statements/alter_type_statement.hh
+++ b/cql3/statements/alter_type_statement.hh
@@ -63,14 +63,14 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    class add_or_alter;
    class renames;
 protected:
    virtual user_type make_updated_type(database& db, user_type to_update) const = 0;
 private:
-    void do_announce_migration(database& db, ::keyspace& ks, bool is_local_only) const;
+    void do_announce_migration(database& db, ::keyspace& ks) const;
 };

 class alter_type_statement::add_or_alter : public alter_type_statement {
--- a/cql3/statements/alter_view_statement.cc
+++ b/cql3/statements/alter_view_statement.cc
@@ -60,9 +60,10 @@ alter_view_statement::alter_view_statement(::shared_ptr<cf_name> view_name, ::sh
 future<> alter_view_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const
 {
    try {
-        auto&& s = proxy.get_db().local().find_schema(keyspace(), column_family());
+        const database& db = proxy.local_db();
+        auto&& s = db.find_schema(keyspace(), column_family());
        if (s->is_view())  {
-            return state.has_column_family_access(keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
+            return state.has_column_family_access(db, keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
        }
    } catch (const no_such_column_family& e) {
        // Will be validated afterwards.
@@ -75,7 +76,7 @@ void alter_view_statement::validate(service::storage_proxy&, const service::clie
    // validated in announce_migration()
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto&& db = proxy.get_db().local();
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
@@ -107,7 +108,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::an
                "the corresponding data in the parent table.");
    }

-    return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build()), is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build())).then([this] {
        using namespace cql_transport;

        return ::make_shared<event::schema_change>(
--- a/cql3/statements/alter_view_statement.hh
+++ b/cql3/statements/alter_view_statement.hh
@@ -63,7 +63,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -38,6 +38,7 @@
 */

 #include "batch_statement.hh"
+#include "cql3/util.hh"
 #include "raw/batch_statement.hh"
 #include "db/config.hh"
 #include "db/consistency_level_validations.hh"
@@ -58,6 +59,10 @@ timeout_for_type(batch_statement::type t) {
            : &timeout_config::write_timeout;
 }

+db::timeout_clock::duration batch_statement::get_timeout(const query_options& options) const {
+    return _attrs->is_timeout_set() ? _attrs->get_timeout(options) : options.get_timeout_config().*get_timeout_config_selector();
+}
+
 batch_statement::batch_statement(int bound_terms, type type_,
                                 std::vector<single_statement> statements,
                                 std::unique_ptr<attributes> attrs,
@@ -259,6 +264,7 @@ static thread_local inheriting_concrete_execution_stage<

 future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute(
        service::storage_proxy& storage, service::query_state& state, const query_options& options) const {
+    cql3::util::validate_timestamp(options, _attrs);
    return batch_stage(this, seastar::ref(storage), seastar::ref(state),
                       seastar::cref(options), false, options.get_timestamp(state));
 }
@@ -284,7 +290,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    ++_stats.batches;
    _stats.statements_in_batches += _statements.size();

-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    return get_mutations(storage, options, timeout, local, now, query_state).then([this, &storage, &options, timeout, tr_state = query_state.get_trace_state(),
                                                                                                                               permit = query_state.get_permit()] (std::vector<mutation> ms) mutable {
        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), timeout, std::move(tr_state), std::move(permit));
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -170,6 +170,8 @@ private:
            service::storage_proxy& storage,
            const query_options& options,
            service::query_state& state) const;
+
+    db::timeout_clock::duration get_timeout(const query_options& options) const;
 public:
    // FIXME: no cql_statement::to_string() yet
 #if 0
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -157,6 +157,7 @@ void cf_prop_defs::validate(const database& db, const schema::extensions_map& sc
    }

    validate_minimum_int(KW_DEFAULT_TIME_TO_LIVE, 0, DEFAULT_DEFAULT_TIME_TO_LIVE);
+    validate_minimum_int(KW_PAXOSGRACESECONDS, 0, DEFAULT_GC_GRACE_SECONDS);

    auto min_index_interval = get_int(KW_MIN_INDEX_INTERVAL, DEFAULT_MIN_INDEX_INTERVAL);
    auto max_index_interval = get_int(KW_MAX_INDEX_INTERVAL, DEFAULT_MAX_INDEX_INTERVAL);
--- a/cql3/statements/create_function_statement.cc
+++ b/cql3/statements/create_function_statement.cc
@@ -59,11 +59,11 @@ std::unique_ptr<prepared_statement> create_function_statement::prepare(database&
 }

 future<shared_ptr<cql_transport::event::schema_change>> create_function_statement::announce_migration(
-        service::storage_proxy& proxy, bool is_local_only) const {
+        service::storage_proxy& proxy) const {
    if (!_func) {
        return make_ready_future<::shared_ptr<cql_transport::event::schema_change>>();
    }
-    return service::get_local_migration_manager().announce_new_function(_func, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_new_function(_func).then([this] {
        return create_schema_change(*_func, true);
    });
 }
--- a/cql3/statements/create_function_statement.hh
+++ b/cql3/statements/create_function_statement.hh
@@ -29,7 +29,7 @@ namespace statements {
 class create_function_statement final : public create_function_statement_base {
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(
-            service::storage_proxy& proxy, bool is_local_only) const override;
+            service::storage_proxy& proxy) const override;
    virtual void create(service::storage_proxy& proxy, functions::function* old) const override;
    sstring _language;
    sstring _body;
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -73,7 +73,7 @@ create_index_statement::create_index_statement(::shared_ptr<cf_name> name,

 future<>
 create_index_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    return state.has_column_family_access(keyspace(), column_family(), auth::permission::ALTER);
+    return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::ALTER);
 }

 void
@@ -271,7 +271,7 @@ void create_index_statement::validate_targets_for_multi_column_index(std::vector
 }

 future<::shared_ptr<cql_transport::event::schema_change>>
-create_index_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+create_index_statement::announce_migration(service::storage_proxy& proxy) const {
    auto& db = proxy.get_db().local();
    auto schema = db.find_schema(keyspace(), column_family());
    std::vector<::shared_ptr<index_target>> targets;
@@ -306,11 +306,18 @@ create_index_statement::announce_migration(service::storage_proxy& proxy, bool i
                    format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
        }
    }
+    auto index_table_name = secondary_index::index_table_name(accepted_name);
+    if (db.has_schema(keyspace(), index_table_name)) {
+        return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
+            exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
+                accepted_name, index_table_name))
+        );
+    }
    ++_cql_stats->secondary_index_creates;
    schema_builder builder{schema};
    builder.with_index(index);
    return service::get_local_migration_manager().announce_column_family_update(
-            builder.build(), false, {}, is_local_only).then([this]() {
+            builder.build(), false, {}).then([this]() {
        using namespace cql_transport;
        return ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
--- a/cql3/statements/create_index_statement.hh
+++ b/cql3/statements/create_index_statement.hh
@@ -79,7 +79,7 @@ public:

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    void validate(service::storage_proxy&, const service::client_state& state) const override;
-    future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy&, bool is_local_only) const override;
+    future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy&) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 private:
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -47,7 +47,7 @@

 #include <regex>

-bool is_system_keyspace(const sstring& keyspace);
+bool is_system_keyspace(std::string_view keyspace);

 namespace cql3 {

@@ -106,11 +106,11 @@ void create_keyspace_statement::validate(service::storage_proxy&, const service:
 #endif
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_keyspace_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> create_keyspace_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return make_ready_future<>().then([this, p = proxy.shared_from_this(), is_local_only] {
-        const auto& tm = p->get_token_metadata();
-        return service::get_local_migration_manager().announce_new_keyspace(_attrs->as_ks_metadata(_name, tm), is_local_only);
+    return make_ready_future<>().then([this, p = proxy.shared_from_this()] {
+        const auto& tm = *p->get_token_metadata_ptr();
+        return service::get_local_migration_manager().announce_new_keyspace(_attrs->as_ks_metadata(_name, tm));
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
@@ -147,7 +147,7 @@ future<> cql3::statements::create_keyspace_statement::grant_permissions_to_creat
 future<::shared_ptr<messages::result_message>>
 create_keyspace_statement::execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const {
    return schema_altering_statement::execute(proxy, state, options).then([this, p = proxy.shared_from_this()] (::shared_ptr<messages::result_message> msg) {
-        bool multidc = p->get_token_metadata().get_topology().get_datacenter_endpoints().size() > 1;
+        bool multidc = p->get_token_metadata_ptr()->get_topology().get_datacenter_endpoints().size() > 1;
        bool simple = _attrs->get_replication_strategy_class() == "SimpleStrategy";

        if (multidc && simple) {
--- a/cql3/statements/create_keyspace_statement.hh
+++ b/cql3/statements/create_keyspace_statement.hh
@@ -84,7 +84,7 @@ public:
     */
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -97,10 +97,10 @@ std::vector<column_definition> create_table_statement::get_columns() const
    return column_defs;
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_table_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> create_table_statement::announce_migration(service::storage_proxy& proxy) const {
    auto schema = get_cf_meta_data(proxy.get_db().local());
-    return make_ready_future<>().then([this, is_local_only, schema = std::move(schema)] {
-        return service::get_local_migration_manager().announce_new_column_family(std::move(schema), is_local_only);
+    return make_ready_future<>().then([this, schema = std::move(schema)] {
+        return service::get_local_migration_manager().announce_new_column_family(std::move(schema));
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
@@ -204,6 +204,7 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    }

    _properties.validate(db, _properties.properties()->make_schema_extensions(db.extensions()));
+    const bool has_default_ttl = _properties.properties()->get_default_time_to_live() > 0;

    auto stmt = ::make_shared<create_table_statement>(_cf_name, _properties.properties(), _if_not_exists, _static_columns, _properties.properties()->get_id());

@@ -211,6 +212,11 @@ std::unique_ptr<prepared_statement> create_table_statement::raw_statement::prepa
    for (auto&& entry : _definitions) {
        ::shared_ptr<column_identifier> id = entry.first;
        cql3_type pt = entry.second->prepare(db, keyspace());
+
+        if (has_default_ttl && pt.is_counter()) {
+            throw exceptions::invalid_request_exception("Cannot set default_time_to_live on a table with counters");
+        }
+
        if (pt.get_type()->is_multi_cell()) {
            if (pt.get_type()->is_user_type()) {
                // check for multi-cell types (non-frozen UDTs or collections) inside a non-frozen UDT
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -102,7 +102,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_type_statement.cc
+++ b/cql3/statements/create_type_statement.cc
@@ -138,7 +138,7 @@ inline user_type create_type_statement::create_type(database& db) const
        std::move(field_names), std::move(field_types), true /* multi cell */);
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto&& db = proxy.get_db().local();

@@ -152,7 +152,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::a

    auto type = create_type(db);
    check_for_duplicate_names(type);
-    return service::get_local_migration_manager().announce_new_type(type, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_new_type(type).then([this] {
        using namespace cql_transport;

        return ::make_shared<event::schema_change>(
--- a/cql3/statements/create_type_statement.hh
+++ b/cql3/statements/create_type_statement.hh
@@ -65,7 +65,7 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -89,7 +89,7 @@ create_view_statement::create_view_statement(
 }

 future<> create_view_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    return state.has_column_family_access(keyspace(), _base_name->get_column_family(), auth::permission::ALTER);
+    return state.has_column_family_access(proxy.local_db(), keyspace(), _base_name->get_column_family(), auth::permission::ALTER);
 }

 void create_view_statement::validate(service::storage_proxy& proxy, const service::client_state& state) const {
@@ -140,7 +140,7 @@ static bool validate_primary_key(
    return new_non_pk_column;
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::announce_migration(service::storage_proxy& proxy) const {
    // We need to make sure that:
    //  - primary key includes all columns in base table's primary key
    //  - make sure that the select statement does not have anything other than columns
@@ -225,7 +225,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    }

    auto parameters = make_lw_shared<raw::select_statement::parameters>(raw::select_statement::parameters::orderings_type(), false, true);
-    raw::select_statement raw_select(_base_name, std::move(parameters), _select_clause, _where_clause, nullptr, nullptr, {});
+    raw::select_statement raw_select(_base_name, std::move(parameters), _select_clause, _where_clause, nullptr, nullptr, {}, std::make_unique<cql3::attributes::raw>());
    raw_select.prepare_keyspace(keyspace());
    raw_select.set_bound_variables({});

@@ -350,8 +350,8 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    auto where_clause_text = util::relations_to_where_clause(_where_clause);
    builder.with_view_info(schema->id(), schema->cf_name(), included.empty(), std::move(where_clause_text));

-    return make_ready_future<>().then([definition = view_ptr(builder.build()), is_local_only]() mutable {
-        return service::get_local_migration_manager().announce_new_view(definition, is_local_only);
+    return make_ready_future<>().then([definition = view_ptr(builder.build())]() mutable {
+        return service::get_local_migration_manager().announce_new_view(definition);
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/create_view_statement.hh
+++ b/cql3/statements/create_view_statement.hh
@@ -68,7 +68,7 @@ public:
    // Functions we need to override to subclass schema_altering_statement
    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

    // FIXME: continue here. See create_table_statement.hh and CreateViewStatement.java
--- a/cql3/statements/drop_function_statement.cc
+++ b/cql3/statements/drop_function_statement.cc
@@ -33,7 +33,7 @@ std::unique_ptr<prepared_statement> drop_function_statement::prepare(database& d
 }

 future<shared_ptr<cql_transport::event::schema_change>> drop_function_statement::announce_migration(
-        service::storage_proxy& proxy, bool is_local_only) const {
+        service::storage_proxy& proxy) const {
    if (!_func) {
        return make_ready_future<shared_ptr<cql_transport::event::schema_change>>();
    }
@@ -41,7 +41,7 @@ future<shared_ptr<cql_transport::event::schema_change>> drop_function_statement:
    if (!user_func) {
        throw exceptions::invalid_request_exception(format("'{}' is not a user defined function", _func));
    }
-    return service::get_local_migration_manager().announce_function_drop(user_func, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_function_drop(user_func).then([this] {
        return create_schema_change(*_func, false);
    });
 }
--- a/Show More
+++ b/Show More