Merge "mutation_writer: explicitly close writers" from Benny

" _consumer_fut is expected to return an exception on the abort path. Wait for it and drop any exception so it won't be abandoned as seen in #7904. A future<> close() method was added to return _consumer_fut. It is called both after abort() in the error path, and after consume_end_of_stream, on the success path. With that, consume_end_of_stream was made void as it doesn't return a future<> anymore. Fixes #7904 Test: unit(release) " * tag 'close-bucket-writer-v5' of github.com:bhalevy/scylla: mutation_writer: bucket_writer: add close mutation_writer/feed_writers: refactor bucket/shard writers mutation_writer: update bucket/shard writers consume_end_of_stream (cherry picked from commit f11a0700a8)
Merge 'cdc: Limit size of topology description' from Piotr Jastrzębski
2021-03-21 18:09:45 +02:00 · 2021-03-21 14:05:36 +02:00 · 2021-03-21 12:19:24 +02:00 · 2021-03-21 10:51:04 +02:00 · 2021-03-19 16:42:30 +02:00 · 2021-03-19 00:08:27 +02:00
1107 changed files with 16733 additions and 4630 deletions
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -0,0 +1,33 @@
+name: "CI Docs"
+
+on:
+  push:
+    branches:
+    - master
+    paths:
+    - 'docs/**'
+jobs:
+  release:
+    name: Build
+    runs-on: ubuntu-latest
+    env:
+      LATEST_VERSION: master
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Build docs
+      run: |
+        export PATH=$PATH:~/.local/bin
+        cd docs
+        make multiversion
+    - name: Deploy
+      run : ./docs/_utils/deploy.sh
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,5 @@ tags
 testlog
 test/*/*.reject
 .vscode
+docs/_build
+docs/poetry.lock
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -498,6 +498,7 @@ set(scylla_sources
    mutation_writer/multishard_writer.cc
    mutation_writer/shard_based_splitting_writer.cc
    mutation_writer/timestamp_based_splitting_writer.cc
+    mutation_writer/feed_writers.cc
    partition_slice_builder.cc
    partition_version.cc
    querier.cc
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,11 +1,13 @@
-# Asking questions or requesting help
+# Contributing
+
+## Asking questions or requesting help

 Use the [ScyllaDB user mailing list](https://groups.google.com/forum/#!forum/scylladb-users) or the [Slack workspace](http://slack.scylladb.com) for general questions and help.

-# Reporting an issue
+## Reporting an issue

 Please use the [Issue Tracker](https://github.com/scylladb/scylla/issues/) to report issues.  Fill in as much information as you can in the issue template, especially for performance problems.

-# Contributing Code to Scylla
+## Contributing Code to Scylla

 To contribute code to Scylla, you need to sign the [Contributor License Agreement](https://www.scylladb.com/open-source/contributor-agreement/) and send your changes as [patches](https://github.com/scylladb/scylla/wiki/Formatting-and-sending-patches) to the [mailing list](https://groups.google.com/forum/#!forum/scylladb-dev). We don't accept pull requests on GitHub.
--- a/README.md
+++ b/README.md
@@ -78,10 +78,7 @@ and the current compatibility of this feature as well as Scylla-specific extensi

 ## Documentation

-Documentation can be found in [./docs](./docs) and on the
-[wiki](https://github.com/scylladb/scylla/wiki). There is currently no clear
-definition of what goes where, so when looking for something be sure to check
-both.
+Documentation can be found [here](https://scylla.docs.scylladb.com).
 Seastar documentation can be found [here](http://docs.seastar.io/master/index.html).
 User documentation can be found [here](https://docs.scylladb.com/).

--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 #!/bin/sh

 PRODUCT=scylla
-VERSION=4.4.dev
+VERSION=4.4.0

 if test -f version
 then
--- a/alternator/conditions.cc
+++ b/alternator/conditions.cc
@@ -159,23 +159,40 @@ static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
 }

 // Check if two JSON-encoded values match with the BEGINS_WITH relation
-static bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2) {
-    // BEGINS_WITH requires that its single operand (v2) be a string or
-    // binary - otherwise it's a validation error. However, problems with
-    // the stored attribute (v1) will just return false (no match).
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(format("BEGINS_WITH operator encountered malformed AttributeValue: {}", v2));
-    }
-    auto it2 = v2.MemberBegin();
-    if (it2->name != "S" && it2->name != "B") {
-        throw api_error::validation(format("BEGINS_WITH operator requires String or Binary type in AttributeValue, got {}", it2->name));
-    }
-
-
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2,
+                       bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+        if (v1_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v1->MemberBegin()->name != "S" && v1->MemberBegin()->name != "B") {
+        if (v1_from_query) {
+            throw api_error::validation(format("begins_with supports only string or binary type, got: {}", *v1));
+        } else {
+            bad = true;
+        }
+    }
+    if (!v2.IsObject() || v2.MemberCount() != 1) {
+        if (v2_from_query) {
+            throw api_error::validation("begins_with() encountered malformed argument");
+        } else {
+            bad = true;
+        }
+    } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
+        if (v2_from_query) {
+            throw api_error::validation(format("begins_with() supports only string or binary type, got: {}", v2));
+        } else {
+            bad = true;
+        }
+    }
+    if (bad) {
        return false;
    }
    auto it1 = v1->MemberBegin();
+    auto it2 = v2.MemberBegin();
    if (it1->name != it2->name) {
        return false;
    }
@@ -279,24 +296,38 @@ static bool check_NOT_NULL(const rjson::value* val) {
    return val != nullptr;
 }

+// Only types S, N or B (string, number or bytes) may be compared by the
+// various comparison operators - lt, le, gt, ge, and between.
+static bool check_comparable_type(const rjson::value& v) {
+    if (!v.IsObject() || v.MemberCount() != 1) {
+        return false;
+    }
+    const rjson::value& type = v.MemberBegin()->name;
+    return type == "S" || type == "N" || type == "B";
+}
+
 // Check if two JSON-encoded values match with cmp.
 template <typename Comparator>
-bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp) {
-    if (!v2.IsObject() || v2.MemberCount() != 1) {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+bool check_compare(const rjson::value* v1, const rjson::value& v2, const Comparator& cmp,
+                   bool v1_from_query, bool v2_from_query) {
+    bool bad = false;
+    if (!v1 || !check_comparable_type(*v1)) {
+        if (v1_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    const auto& kv2 = *v2.MemberBegin();
-    if (kv2.name != "S" && kv2.name != "N" && kv2.name != "B") {
-        throw api_error::validation(
-                        format("{} requires a single AttributeValue of type String, Number, or Binary",
-                               cmp.diagnostic));
+    if (!check_comparable_type(v2)) {
+        if (v2_from_query) {
+            throw api_error::validation(format("{} allow only the types String, Number, or Binary", cmp.diagnostic));
+        }
+        bad = true;
    }
-    if (!v1 || !v1->IsObject() || v1->MemberCount() != 1) {
+    if (bad) {
        return false;
    }
    const auto& kv1 = *v1->MemberBegin();
+    const auto& kv2 = *v2.MemberBegin();
    if (kv1.name != kv2.name) {
        return false;
    }
@@ -310,7 +341,8 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
    if (kv1.name == "B") {
        return cmp(base64_decode(kv1.value), base64_decode(kv2.value));
    }
-    clogger.error("check_compare panic: LHS type equals RHS type, but one is in {N,S,B} while the other isn't");
+    // cannot reach here, as check_comparable_type() verifies the type is one
+    // of the above options.
    return false;
 }

@@ -341,56 +373,71 @@ struct cmp_gt {
    static constexpr const char* diagnostic = "GT operator";
 };

-// True if v is between lb and ub, inclusive.  Throws if lb > ub.
+// True if v is between lb and ub, inclusive.  Throws or returns false
+// (depending on bounds_from_query parameter) if lb > ub.
 template <typename T>
-static bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
+static bool check_BETWEEN(const T& v, const T& lb, const T& ub, bool bounds_from_query) {
    if (cmp_lt()(ub, lb)) {
-        throw api_error::validation(
-                        format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        if (bounds_from_query) {
+            throw api_error::validation(
+                format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
+        } else {
+            return false;
+        }
    }
    return cmp_ge()(v, lb) && cmp_le()(v, ub);
 }

-static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub) {
-    if (!v) {
+static bool check_BETWEEN(const rjson::value* v, const rjson::value& lb, const rjson::value& ub,
+                          bool v_from_query, bool lb_from_query, bool ub_from_query) {
+    if ((v && v_from_query && !check_comparable_type(*v)) ||
+        (lb_from_query && !check_comparable_type(lb)) ||
+        (ub_from_query && !check_comparable_type(ub))) {
+        throw api_error::validation("between allow only the types String, Number, or Binary");
+
+    }
+    if (!v || !v->IsObject() || v->MemberCount() != 1 ||
+        !lb.IsObject() || lb.MemberCount() != 1 ||
+        !ub.IsObject() || ub.MemberCount() != 1) {
        return false;
    }
-    if (!v->IsObject() || v->MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", *v));
-    }
-    if (!lb.IsObject() || lb.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", lb));
-    }
-    if (!ub.IsObject() || ub.MemberCount() != 1) {
-        throw api_error::validation(format("BETWEEN operator encountered malformed AttributeValue: {}", ub));
-    }

    const auto& kv_v = *v->MemberBegin();
    const auto& kv_lb = *lb.MemberBegin();
    const auto& kv_ub = *ub.MemberBegin();
+    bool bounds_from_query = lb_from_query && ub_from_query;
    if (kv_lb.name != kv_ub.name) {
-        throw api_error::validation(
+        if (bounds_from_query) {
+           throw api_error::validation(
                format("BETWEEN operator requires the same type for lower and upper bound; instead got {} and {}",
                       kv_lb.name, kv_ub.name));
+        } else {
+            return false;
+        }
    }
    if (kv_v.name != kv_lb.name) { // Cannot compare different types, so v is NOT between lb and ub.
        return false;
    }
    if (kv_v.name == "N") {
        const char* diag = "BETWEEN operator";
-        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag));
+        return check_BETWEEN(unwrap_number(*v, diag), unwrap_number(lb, diag), unwrap_number(ub, diag), bounds_from_query);
    }
    if (kv_v.name == "S") {
        return check_BETWEEN(std::string_view(kv_v.value.GetString(), kv_v.value.GetStringLength()),
                             std::string_view(kv_lb.value.GetString(), kv_lb.value.GetStringLength()),
-                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()));
+                             std::string_view(kv_ub.value.GetString(), kv_ub.value.GetStringLength()),
+                             bounds_from_query);
    }
    if (kv_v.name == "B") {
-        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value));
+        return check_BETWEEN(base64_decode(kv_v.value), base64_decode(kv_lb.value), base64_decode(kv_ub.value), bounds_from_query);
    }
-    throw api_error::validation(
-        format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
+    if (v_from_query) {
+        throw api_error::validation(
+            format("BETWEEN operator requires AttributeValueList elements to be of type String, Number, or Binary; instead got {}",
               kv_lb.name));
+    } else {
+        return false;
+    }
 }

 // Verify one Expect condition on one attribute (whose content is "got")
@@ -437,19 +484,19 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NE(got, (*attribute_value_list)[0]);
        case comparison_operator_type::LT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_lt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_lt{}, false, true);
        case comparison_operator_type::LE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_le{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_le{}, false, true);
        case comparison_operator_type::GT:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_gt{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_gt{}, false, true);
        case comparison_operator_type::GE:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_compare(got, (*attribute_value_list)[0], cmp_ge{});
+            return check_compare(got, (*attribute_value_list)[0], cmp_ge{}, false, true);
        case comparison_operator_type::BEGINS_WITH:
            verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
-            return check_BEGINS_WITH(got, (*attribute_value_list)[0]);
+            return check_BEGINS_WITH(got, (*attribute_value_list)[0], false, true);
        case comparison_operator_type::IN:
            verify_operand_count(attribute_value_list, nonempty(), *comparison_operator);
            return check_IN(got, *attribute_value_list);
@@ -461,7 +508,8 @@ static bool verify_expected_one(const rjson::value& condition, const rjson::valu
            return check_NOT_NULL(got);
        case comparison_operator_type::BETWEEN:
            verify_operand_count(attribute_value_list, exact_size(2), *comparison_operator);
-            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1]);
+            return check_BETWEEN(got, (*attribute_value_list)[0], (*attribute_value_list)[1],
+                                 false, true, true);
        case comparison_operator_type::CONTAINS:
            {
                verify_operand_count(attribute_value_list, exact_size(1), *comparison_operator);
@@ -573,7 +621,8 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in BETWEEN primitive_condition", cond._values.size()));
        }
-        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2]);
+        return check_BETWEEN(&calculated_values[0], calculated_values[1], calculated_values[2],
+                             cond._values[0].is_constant(), cond._values[1].is_constant(), cond._values[2].is_constant());
    case parsed::primitive_condition::type::IN:
        return check_IN(calculated_values);
    case parsed::primitive_condition::type::VALUE:
@@ -604,13 +653,17 @@ static bool calculate_primitive_condition(const parsed::primitive_condition& con
    case parsed::primitive_condition::type::NE:
        return check_NE(&calculated_values[0], calculated_values[1]);
    case parsed::primitive_condition::type::GT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_gt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::GE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_ge{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LT:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_lt{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    case parsed::primitive_condition::type::LE:
-        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{});
+        return check_compare(&calculated_values[0], calculated_values[1], cmp_le{},
+            cond._values[0].is_constant(), cond._values[1].is_constant());
    default:
        // Shouldn't happen unless we have a bug in the parser
        throw std::logic_error(format("Unknown type {} in primitive_condition object", (int)(cond._op)));
--- a/alternator/conditions.hh
+++ b/alternator/conditions.hh
@@ -52,6 +52,7 @@ bool verify_expected(const rjson::value& req, const rjson::value* previous_item)
 bool verify_condition(const rjson::value& condition, bool require_all, const rjson::value* previous_item);

 bool check_CONTAINS(const rjson::value* v1, const rjson::value& v2);
+bool check_BEGINS_WITH(const rjson::value* v1, const rjson::value& v2, bool v1_from_query, bool v2_from_query);

 bool verify_condition_expression(
        const parsed::condition_expression& condition_expression,
--- a/alternator/error.hh
+++ b/alternator/error.hh
@@ -59,6 +59,9 @@ public:
    static api_error invalid_signature(std::string msg) {
        return api_error("InvalidSignatureException", std::move(msg));
    }
+    static api_error missing_authentication_token(std::string msg) {
+        return api_error("MissingAuthenticationTokenException", std::move(msg));
+    }
    static api_error unrecognized_client(std::string msg) {
        return api_error("UnrecognizedClientException", std::move(msg));
    }
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -55,7 +55,7 @@
 #include "schema.hh"
 #include "alternator/tags_extension.hh"
 #include "alternator/rmw_operation.hh"
-
+#include <seastar/core/coroutine.hh>
 #include <boost/range/adaptors.hpp>

 logging::logger elogger("alternator-executor");
@@ -202,7 +202,7 @@ static schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& r
    if (!schema) {
        // if we get here then the name was missing, since syntax or missing actual CF 
        // checks throw. Slow path, but just call get_table_name to generate exception. 
-        get_table_name(request);        
+        get_table_name(request);
    }
    return schema;
 }
@@ -220,7 +220,7 @@ static std::tuple<bool, std::string_view, std::string_view> try_get_internal_tab
    std::string_view ks_name = table_name.substr(0, delim);
    table_name.remove_prefix(ks_name.size() + 1);
    // Only internal keyspaces can be accessed to avoid leakage
-    if (!is_internal_keyspace(sstring(ks_name))) {
+    if (!is_internal_keyspace(ks_name)) {
        return {false, "", ""};
    }
    return {true, ks_name, table_name};
@@ -476,8 +476,8 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
        return make_ready_future<request_return_type>(api_error::resource_not_found(
                format("Requested resource not found: Table: {} not found", table_name)));
    }
-    return _mm.announce_column_family_drop(keyspace_name, table_name, false, service::migration_manager::drop_views::yes).then([this, keyspace_name] {
-        return _mm.announce_keyspace_drop(keyspace_name, false);
+    return _mm.announce_column_family_drop(keyspace_name, table_name, service::migration_manager::drop_views::yes).then([this, keyspace_name] {
+        return _mm.announce_keyspace_drop(keyspace_name);
    }).then([table_name = std::move(table_name)] {
        // FIXME: need more attributes?
        rjson::value table_description = rjson::empty_object();
@@ -704,52 +704,48 @@ static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>
 static future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
    schema_builder builder(schema);
    builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(std::move(tags_map)));
-    return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
+    return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>());
 }

 future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.tag_resource++;

-    return seastar::async([this, &client_state, request = std::move(request)] () mutable -> request_return_type {
-        const rjson::value* arn = rjson::find(request, "ResourceArn");
-        if (!arn || !arn->IsString()) {
-            return api_error::access_denied("Incorrect resource identifier");
-        }
-        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
-        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-        const rjson::value* tags = rjson::find(request, "Tags");
-        if (!tags || !tags->IsArray()) {
-            return api_error::validation("Cannot parse tags");
-        }
-        if (tags->Size() < 1) {
-            return api_error::validation("The number of tags must be at least 1") ;
-        }
-        update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
-        update_tags(_mm, schema, std::move(tags_map)).get();
-        return json_string("");
-    });
+    const rjson::value* arn = rjson::find(request, "ResourceArn");
+    if (!arn || !arn->IsString()) {
+        co_return api_error::access_denied("Incorrect resource identifier");
+    }
+    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
+    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
+    const rjson::value* tags = rjson::find(request, "Tags");
+    if (!tags || !tags->IsArray()) {
+        co_return api_error::validation("Cannot parse tags");
+    }
+    if (tags->Size() < 1) {
+        co_return api_error::validation("The number of tags must be at least 1") ;
+    }
+    update_tags_map(*tags, tags_map,  update_tags_action::add_tags);
+    co_await update_tags(_mm, schema, std::move(tags_map));
+    co_return json_string("");
 }

 future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.untag_resource++;

-    return seastar::async([this, &client_state, request = std::move(request)] () -> request_return_type {
-        const rjson::value* arn = rjson::find(request, "ResourceArn");
-        if (!arn || !arn->IsString()) {
-            return api_error::access_denied("Incorrect resource identifier");
-        }
-        const rjson::value* tags = rjson::find(request, "TagKeys");
-        if (!tags || !tags->IsArray()) {
-            return api_error::validation(format("Cannot parse tag keys"));
-        }
+    const rjson::value* arn = rjson::find(request, "ResourceArn");
+    if (!arn || !arn->IsString()) {
+        co_return api_error::access_denied("Incorrect resource identifier");
+    }
+    const rjson::value* tags = rjson::find(request, "TagKeys");
+    if (!tags || !tags->IsArray()) {
+        co_return api_error::validation(format("Cannot parse tag keys"));
+    }

-        schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
+    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

-        std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
-        update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
-        update_tags(_mm, schema, std::move(tags_map)).get();
-        return json_string("");
-    });
+    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
+    update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
+    co_await update_tags(_mm, schema, std::move(tags_map));
+    co_return json_string("");
 }

 future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -985,7 +981,7 @@ future<executor::request_return_type> executor::create_table(client_state& clien
    return create_keyspace(keyspace_name).handle_exception_type([] (exceptions::already_exists_exception&) {
            // Ignore the fact that the keyspace may already exist. See discussion in #6340
        }).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
-        return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
+        return futurize_invoke([&] { return _mm.announce_new_column_family(schema); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
            return parallel_for_each(std::move(view_builders), [this, schema] (schema_builder builder) {
                return _mm.announce_new_view(view_ptr(builder.build()));
            }).then([this, table_info = std::move(table_info), schema, tags_map = std::move(tags_map)] () mutable {
@@ -1241,10 +1237,16 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) co
    return m;
 }

-// The DynamoDB API doesn't let the client control the server's timeout.
-// Let's pick something reasonable:
+// The DynamoDB API doesn't let the client control the server's timeout, so
+// we have a global default_timeout() for Alternator requests. The value of
+// default_timeout is overwritten by main.cc based on the
+// "alternator_timeout_in_ms" configuration parameter.
+db::timeout_clock::duration executor::s_default_timeout = 10s;
+void executor::set_default_timeout(db::timeout_clock::duration timeout) {
+    s_default_timeout = timeout;
+}
 db::timeout_clock::time_point executor::default_timeout() {
-    return db::timeout_clock::now() + 10s;
+    return db::timeout_clock::now() + s_default_timeout;
 }
        
 static future<std::unique_ptr<rjson::value>> get_previous_item(
@@ -1880,18 +1882,182 @@ static std::string get_item_type_string(const rjson::value& v) {
    return it->name.GetString();
 }

+// attrs_to_get saves for each top-level attribute an attribute_path_map_node,
+// a hierarchy of subparts that need to be kept. The following function
+// takes a given JSON value and drops its parts which weren't asked to be
+// kept. It modifies the given JSON value, or returns false to signify that
+// the entire object should be dropped.
+// Note that the JSON value is assumed to be encoded using the DynamoDB
+// conventions - i.e., it is really a map with a single member, whose key
+// is a type string and whose value is the real object.
+template<typename T>
+static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>& h) {
+    if (!val.IsObject() || val.MemberCount() != 1) {
+        // This shouldn't happen. We shouldn't have stored malformed objects.
+        // But today Alternator does not validate the structure of nested
+        // documents before storing them, so this can happen on read.
+        throw api_error::internal(format("Malformed value object read: {}", val));
+    }
+    const char* type = val.MemberBegin()->name.GetString();
+    rjson::value& v = val.MemberBegin()->value;
+    if (h.has_members()) {
+        const auto& members = h.get_members();
+        if (type[0] != 'M' || !v.IsObject()) {
+            // If v is not an object (dictionary, map), none of the members
+            // can match, so the entire value is dropped.
+            return false;
+        }
+        rjson::value newv = rjson::empty_object();
+        for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
+            std::string attr = it->name.GetString();
+            auto x = members.find(attr);
+            if (x != members.end()) {
+                if (x->second) {
+                    // Only a part of this member may be wanted; filter it recursively.
+                    if (hierarchy_filter(it->value, *x->second)) {
+                        rjson::set_with_string_name(newv, attr, std::move(it->value));
+                    }
+                } else {
+                    // The entire member is to be kept
+                    rjson::set_with_string_name(newv, attr, std::move(it->value));
+                }
+            }
+        }
+        if (newv.MemberCount() == 0) {
+            return false;
+        }
+        v = newv;
+    } else if (h.has_indexes()) {
+        const auto& indexes = h.get_indexes();
+        if (type[0] != 'L' || !v.IsArray()) {
+            return false;
+        }
+        rjson::value newv = rjson::empty_array();
+        const auto& a = v.GetArray();
+        for (unsigned i = 0; i < v.Size(); i++) {
+            auto x = indexes.find(i);
+            if (x != indexes.end()) {
+                if (x->second) {
+                    if (hierarchy_filter(a[i], *x->second)) {
+                        rjson::push_back(newv, std::move(a[i]));
+                    }
+                } else {
+                    // The entire list element is to be kept
+                    rjson::push_back(newv, std::move(a[i]));
+                }
+            }
+        }
+        if (newv.Size() == 0) {
+            return false;
+        }
+        v = newv;
+    }
+    return true;
+}
+
+// Add a path to an attribute_path_map. Throws a validation error if the path
+// "overlaps" with one already in the map (one is a sub-path of the other)
+// or "conflicts" with it (both a member and an index are requested).
+template<typename T>
+void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const parsed::path& p, T value = {}) {
+   using node = attribute_path_map_node<T>;
+    // The first step is to look for the top-level attribute (p.root()):
+    auto it = map.find(p.root());
+    if (it == map.end()) {
+        if (p.has_operators()) {
+            it = map.emplace(p.root(), node {std::nullopt}).first;
+        } else {
+            (void) map.emplace(p.root(), node {std::move(value)}).first;
+            // Value inserted for top-level node. We're done.
+            return;
+        }
+    } else if(!p.has_operators()) {
+        // If p is top-level and we already have it or a part of it
+        // in map, it's a forbidden overlapping path.
+        throw api_error::validation(format(
+            "Invalid {}: two document paths overlap at {}", source, p.root()));
+    } else if (it->second.has_value()) {
+        // If we're here, it != map.end() && p.has_operators() && it->second.has_value().
+        // This means the top-level attribute already has a value, and we're
+        // trying to add a non-top-level value. It's an overlap.
+        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p.root()));
+    }
+    node* h = &it->second;
+    // The second step is to walk h from the top-level node to the inner node
+    // where we're supposed to insert the value:
+    for (const auto& op : p.operators()) {
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (h->is_empty()) {
+                    *h = node {typename node::members_t()};
+                } else if (h->has_indexes()) {
+                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
+                } else if (h->has_value()) {
+                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+                }
+                typename node::members_t& members = h->get_members();
+                auto it = members.find(member);
+                if (it == members.end()) {
+                    it = members.insert({member, make_shared<node>()}).first;
+                }
+                h = it->second.get();
+            },
+            [&] (unsigned index) {
+                if (h->is_empty()) {
+                    *h = node {typename node::indexes_t()};
+                } else if (h->has_members()) {
+                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
+                } else if (h->has_value()) {
+                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+                }
+                typename node::indexes_t& indexes = h->get_indexes();
+                auto it = indexes.find(index);
+                if (it == indexes.end()) {
+                    it = indexes.insert({index, make_shared<node>()}).first;
+                }
+                h = it->second.get();
+            }
+        }, op);
+    }
+    // Finally, insert the value in the node h. A non-empty h means a path
+    // ending at, or passing through, this node was already added.
+    if (h->is_empty()) {
+        *h = node {std::move(value)};
+    } else {
+        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
+    }
+}
+
+// A very simplified version of the above function for the special case of
+// adding only a top-level attribute. It's not only simpler, we also use a
+// different error message, referring to a "duplicate attribute" instead of
+// "overlapping paths". DynamoDB also has this distinction (errors in
+// AttributesToGet refer to duplicates, not overlaps, but errors in
+// ProjectionExpression refer to overlap - even if it's an exact duplicate).
+template<typename T>
+void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const std::string& attr, T value = {}) {
+   using node = attribute_path_map_node<T>;
+    auto it = map.find(attr);
+    if (it == map.end()) {
+        map.emplace(attr, node {std::move(value)});
+    } else {
+        throw api_error::validation(format(
+            "Invalid {}: Duplicate attribute: {}", source, attr));
+    }
+}
+
 // calculate_attrs_to_get() takes either AttributesToGet or
 // ProjectionExpression parameters (having both is *not* allowed),
 // and returns the list of cells we need to read, or an empty set when
 // *all* attributes are to be returned.
-// In our current implementation, only top-level attributes are stored
-// as cells, and nested documents are stored serialized as JSON.
-// So this function currently returns only the the top-level attributes
-// but we also need to add, after the query, filtering to keep only
-// the parts of the JSON attributes that were chosen in the paths'
-// operators. Because we don't have such filtering yet (FIXME), we fail here
-// if the requested paths are anything but top-level attributes.
-std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req, std::unordered_set<std::string>& used_attribute_names) {
+// However, in our current implementation, only top-level attributes are
+// stored as separate cells - a nested document is stored serialized together
+// (as JSON) in the same cell. So this function returns a map - each key is the
+// top-level attribute we will need to read, and the value for each
+// top-level attribute is the partial hierarchy (an attribute_path_map_node)
+// that we will need to extract from that serialized JSON.
+// For example, if ProjectionExpression lists a.b and a.c[2], we
+// return one top-level attribute name, "a", with the value "{b, c[2]}".
+static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unordered_set<std::string>& used_attribute_names) {
    const bool has_attributes_to_get = req.HasMember("AttributesToGet");
    const bool has_projection_expression = req.HasMember("ProjectionExpression");
    if (has_attributes_to_get && has_projection_expression) {
@@ -1900,9 +2066,9 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
    }
    if (has_attributes_to_get) {
        const rjson::value& attributes_to_get = req["AttributesToGet"];
-        std::unordered_set<std::string> ret;
+        attrs_to_get ret;
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
-            ret.insert(it->GetString());
+            attribute_path_map_add("AttributesToGet", ret, it->GetString());
        }
        return ret;
    } else if (has_projection_expression) {
@@ -1915,24 +2081,13 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
            throw api_error::validation(e.what());
        }
        resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names);
-        std::unordered_set<std::string> seen_column_names;
-        auto ret = boost::copy_range<std::unordered_set<std::string>>(paths_to_get |
-            boost::adaptors::transformed([&] (const parsed::path& p) {
-                if (p.has_operators()) {
-                    // FIXME: this check will need to change when we support non-toplevel attributes
-                    throw api_error::validation("Non-toplevel attributes in ProjectionExpression not yet implemented");
-                }
-                if (!seen_column_names.insert(p.root()).second) {
-                    // FIXME: this check will need to change when we support non-toplevel attributes
-                    throw api_error::validation(
-                            format("Invalid ProjectionExpression: two document paths overlap with each other: {} and {}.",
-                                    p.root(), p.root()));
-                }
-                return p.root();
-            }));
+        attrs_to_get ret;
+        for (const parsed::path& p : paths_to_get) {
+            attribute_path_map_add("ProjectionExpression", ret, p);
+        }
        return ret;
    }
-    // An empty set asks to read everything
+    // An empty map asks to read everything
    return {};
 }

@@ -1953,7 +2108,7 @@ std::unordered_set<std::string> calculate_attrs_to_get(const rjson::value& req,
 */ 
 void executor::describe_single_item(const cql3::selection::selection& selection,
    const std::vector<bytes_opt>& result_row,
-    const std::unordered_set<std::string>& attrs_to_get,
+    const attrs_to_get& attrs_to_get,
    rjson::value& item,
    bool include_all_embedded_attributes) 
 {
@@ -1974,7 +2129,16 @@ void executor::describe_single_item(const cql3::selection::selection& selection,
                std::string attr_name = value_cast<sstring>(entry.first);
                if (include_all_embedded_attributes || attrs_to_get.empty() || attrs_to_get.contains(attr_name)) {
                    bytes value = value_cast<bytes>(entry.second);
-                    rjson::set_with_string_name(item, attr_name, deserialize_item(value));
+                    rjson::value v = deserialize_item(value);
+                    auto it = attrs_to_get.find(attr_name);
+                    if (it != attrs_to_get.end()) {
+                        // attrs_to_get may have asked for only part of this attribute:
+                        if (hierarchy_filter(v, it->second)) {
+                            rjson::set_with_string_name(item, attr_name, std::move(v));
+                        }
+                    } else {
+                        rjson::set_with_string_name(item, attr_name, std::move(v));
+                    }
                }
            }
        }
@@ -1986,7 +2150,7 @@ std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
-        const std::unordered_set<std::string>& attrs_to_get) {
+        const attrs_to_get& attrs_to_get) {
    rjson::value item = rjson::empty_object();

    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
@@ -2022,8 +2186,16 @@ static bool check_needs_read_before_write(const parsed::value& v) {
    }, v._value);
 }

-static bool check_needs_read_before_write(const parsed::update_expression& update_expression) {
-    return boost::algorithm::any_of(update_expression.actions(), [](const parsed::update_expression::action& action) {
+static bool check_needs_read_before_write(const attribute_path_map<parsed::update_expression::action>& update_expression) {
+    return boost::algorithm::any_of(update_expression, [](const auto& p) {
+        if (!p.second.has_value()) {
+            // If the action is not on the top-level attribute, we need to
+            // read the old item: we change only a part of the top-level
+            // attribute, and write the full top-level attribute back.
+            return true;
+        }
+        // Otherwise, the action p.second.get_value() is just on top-level
+        // attribute. Check if it needs read-before-write:
        return std::visit(overloaded_functor {
            [&] (const parsed::update_expression::action::set& a) -> bool {
                return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2));
@@ -2037,7 +2209,7 @@ static bool check_needs_read_before_write(const parsed::update_expression& updat
            [&] (const parsed::update_expression::action::del& a) -> bool {
                return true;
            }
-        }, action._action);
+        }, p.second.get_value()._action);
    });
 }

@@ -2046,7 +2218,11 @@ public:
    // Some information parsed during the constructor to check for input
    // errors, and cached to be used again during apply().
    rjson::value* _attribute_updates;
-    parsed::update_expression _update_expression;
+    // Instead of keeping a parsed::update_expression with an unsorted
+    // list of actions, we keep them in an attribute_path_map which groups
+    // them by top-level attribute, and detects forbidden overlaps/conflicts.
+    attribute_path_map<parsed::update_expression::action> _update_expression;
+
    parsed::condition_expression _condition_expression;

    update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
@@ -2077,16 +2253,22 @@ update_item_operation::update_item_operation(service::storage_proxy& proxy, rjso
            throw api_error::validation("UpdateExpression must be a string");
        }
        try {
-            _update_expression = parse_update_expression(update_expression->GetString());
-            resolve_update_expression(_update_expression,
+            parsed::update_expression expr = parse_update_expression(update_expression->GetString());
+            resolve_update_expression(expr,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
+            if (expr.empty()) {
+                throw api_error::validation("Empty expression in UpdateExpression is not allowed");
+            }
+            for (auto& action : expr.actions()) {
+                // Unfortunately we need to copy the action's path, because
+                // we std::move the action object.
+                auto p = action._path;
+                attribute_path_map_add("UpdateExpression", _update_expression, p, std::move(action));
+            }
        } catch(expressions_syntax_error& e) {
            throw api_error::validation(e.what());
        }
-        if (_update_expression.empty()) {
-            throw api_error::validation("Empty expression in UpdateExpression is not allowed");
-        }
    }
    _attribute_updates = rjson::find(_request, "AttributeUpdates");
    if (_attribute_updates) {
@@ -2128,6 +2310,187 @@ update_item_operation::needs_read_before_write() const {
           (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
 }

+// action_result() returns the result of applying an UpdateItem action -
+// this result is either a JSON object or an unset optional which indicates
+// the action was a deletion. The caller (update_item_operation::apply()
+// below) will either write this JSON as the content of a column, or
+// use it as a piece in a bigger top-level attribute.
+static std::optional<rjson::value> action_result(
+        const parsed::update_expression::action& action,
+        const rjson::value* previous_item) {
+    return std::visit(overloaded_functor {
+        [&] (const parsed::update_expression::action::set& a) -> std::optional<rjson::value> {
+            return calculate_value(a._rhs, previous_item);
+        },
+        [&] (const parsed::update_expression::action::remove& a) -> std::optional<rjson::value> {
+            return std::nullopt;
+        },
+        [&] (const parsed::update_expression::action::add& a) -> std::optional<rjson::value> {
+            parsed::value base;
+            parsed::value addition;
+            base.set_path(action._path);
+            addition.set_constant(a._valref);
+            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value result;
+            // An ADD can be used to create a new attribute (when
+            // v1.IsNull()) or to add to a pre-existing attribute:
+            if (v1.IsNull()) {
+                std::string v2_type = get_item_type_string(v2);
+                if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
+                    result = v2;
+                } else {
+                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
+                }
+            } else {
+                std::string v1_type = get_item_type_string(v1);
+                if (v1_type == "N") {
+                    if (get_item_type_string(v2) != "N") {
+                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    }
+                    result = number_add(v1, v2);
+                } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
+                    if (get_item_type_string(v2) != v1_type) {
+                        throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
+                    }
+                    result = set_sum(v1, v2);
+                } else {
+                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
+                }
+            }
+            return result;
+        },
+        [&] (const parsed::update_expression::action::del& a) -> std::optional<rjson::value> {
+            parsed::value base;
+            parsed::value subset;
+            base.set_path(action._path);
+            subset.set_constant(a._valref);
+            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
+            rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item);
+            if (!v1.IsNull()) {
+                return set_diff(v1, v2);
+            }
+            // When we return nullopt here, we ask to *delete* this attribute,
+            // which is unnecessary because we know the attribute does not
+            // exist anyway. This is a waste, but a small one. Note that also
+            // for the "remove" action above we don't bother to check if the
+            // previous_item had anything to remove.
+            return std::nullopt;
+        }
+    }, action._action);
+}
+
+// Print an attribute_path_map_node<action> as the list of paths it contains, each preceded by a space:
+static std::ostream& operator<<(std::ostream& out, const attribute_path_map_node<parsed::update_expression::action>& h) {
+    if (h.has_value()) {
+        out << " " << h.get_value()._path;
+    } else if (h.has_members()) {
+        for (auto& member : h.get_members()) {
+            out << *member.second;
+        }
+    } else if (h.has_indexes()) {
+        for (auto& index : h.get_indexes()) {
+            out << *index.second;
+        }
+    }
+    return out;
+}
+
+// Apply the hierarchy of actions in an attribute_path_map_node<action> to a JSON object
+// which uses DynamoDB's serialization conventions. The complete, unmodified, previous_item
+// is also necessary for the right-hand sides of the actions. Modifies obj in-place or returns
+// false if it is to be removed. Throws api_error::validation if a path doesn't fit obj.
+static bool hierarchy_actions(
+        rjson::value& obj,
+        const attribute_path_map_node<parsed::update_expression::action>& h,
+        const rjson::value* previous_item)
+{
+    if (!obj.IsObject() || obj.MemberCount() != 1) {
+        // This shouldn't happen. We shouldn't have stored malformed objects.
+        // But today Alternator does not validate the structure of nested
+        // documents before storing them, so this can happen on read.
+        throw api_error::validation(format("Malformed value object read: {}", obj));
+    }
+    const char* type = obj.MemberBegin()->name.GetString();
+    rjson::value& v = obj.MemberBegin()->value;
+    if (h.has_value()) {
+        // Action replacing everything in this position in the hierarchy
+        std::optional<rjson::value> newv = action_result(h.get_value(), previous_item);
+        if (newv) {
+            obj = std::move(*newv);
+        } else {
+            return false;
+        }
+    } else if (h.has_members()) {
+        if (type[0] != 'M' || !v.IsObject()) {
+            // A .something on a non-map doesn't work.
+            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+        }
+        for (const auto& member : h.get_members()) {
+            std::string attr = member.first;
+            const attribute_path_map_node<parsed::update_expression::action>& subh = *member.second;
+            rjson::value *subobj = rjson::find(v, attr);
+            if (subobj) {
+                if (!hierarchy_actions(*subobj, subh, previous_item)) {
+                    rjson::remove_member(v, attr);
+                }
+            } else {
+                // When a.b does not exist, setting a.b itself (i.e.
+                // subh.has_value()) is fine, but setting a.b.c is not.
+                if (subh.has_value()) {
+                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
+                    if (newv) {
+                        rjson::set_with_string_name(v, attr, std::move(*newv));
+                    } else {
+                        throw api_error::validation(format("Can't remove document path {} - not present in item",
+                            subh.get_value()._path));
+                    }
+                } else {
+                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+                }
+            }
+        }
+    } else if (h.has_indexes()) {
+        if (type[0] != 'L' || !v.IsArray()) {
+            // A [i] on a non-list doesn't work.
+            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+        }
+        unsigned nremoved = 0;
+        for (const auto& index : h.get_indexes()) {
+            unsigned i = index.first - nremoved;
+            const attribute_path_map_node<parsed::update_expression::action>& subh = *index.second;
+            if (i < v.Size()) {
+                if (!hierarchy_actions(v[i], subh, previous_item)) {
+                    v.Erase(v.Begin() + i);
+                    // If we have the actions "REMOVE a[1] SET a[3] = :val",
+                    // the index 3 refers to the original indexes, before any
+                    // items were removed. So we offset the next indexes
+                    // (which are guaranteed to be higher than i - indexes is
+                    // a sorted map) by an increased "nremoved".
+                    nremoved++;
+                }
+            } else {
+                // If a[7] does not exist, setting a[7] itself (i.e.
+                // subh.has_value()) is fine - and appends an item, though
+                // not necessarily with index 7. But setting a[7].b will
+                // not work.
+                if (subh.has_value()) {
+                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
+                    if (newv) {
+                        rjson::push_back(v, std::move(*newv));
+                    } else {
+                        // Removing a[7] when the list has fewer elements is
+                        // silently ignored. It's not considered an error.
+                    }
+                } else {
+                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
+                }
+            }
+        }
+    }
+    return true;
+}
+
 std::optional<mutation>
 update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
    if (!verify_expected(_request, previous_item.get()) ||
@@ -2142,17 +2505,37 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    auto& row = m.partition().clustered_row(*_schema, _ck);
    attribute_collector attrs_collector;
    bool any_updates = false;
-    auto do_update = [&] (bytes&& column_name, const rjson::value& json_value) {
+    auto do_update = [&] (bytes&& column_name, const rjson::value& json_value,
+                          const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
        any_updates = true;
-        if (_returnvalues == returnvalues::ALL_NEW ||
-            _returnvalues == returnvalues::UPDATED_NEW) {
+        if (_returnvalues == returnvalues::ALL_NEW) {
            rjson::set_with_string_name(_return_attributes,
-                    to_sstring_view(column_name), rjson::copy(json_value));
+                to_sstring_view(column_name), rjson::copy(json_value));
+        } else if (_returnvalues == returnvalues::UPDATED_NEW) {
+            rjson::value&& v = rjson::copy(json_value);
+            if (h) {
+                // If the operation was only on specific attribute paths,
+                // leave only them in _return_attributes.
+                if (hierarchy_filter(v, *h)) {
+                    rjson::set_with_string_name(_return_attributes,
+                        to_sstring_view(column_name), std::move(v));
+                }
+            } else {
+                rjson::set_with_string_name(_return_attributes,
+                    to_sstring_view(column_name), std::move(v));
+            }
        } else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
            std::string_view cn =  to_sstring_view(column_name);
            const rjson::value* col = rjson::find(*previous_item, cn);
            if (col) {
-                rjson::set_with_string_name(_return_attributes, cn, rjson::copy(*col));
+                rjson::value&& v = rjson::copy(*col);
+                if (h) {
+                    if (hierarchy_filter(v, *h)) {
+                        rjson::set_with_string_name(_return_attributes, cn, std::move(v));
+                    }
+                } else {
+                    rjson::set_with_string_name(_return_attributes, cn, std::move(v));
+                }
            }
        }
        const column_definition* cdef = _schema->get_column_definition(column_name);
@@ -2194,7 +2577,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    // can just move previous_item later, when we don't need it any more.
    if (_returnvalues == returnvalues::ALL_NEW) {
        if (previous_item) {
-            _return_attributes = std::move(*previous_item);
+            _return_attributes = rjson::copy(*previous_item);
        } else {
            // If there is no previous item, usually a new item is created
            // and contains they given key. This may be cancelled at the end
@@ -2207,77 +2590,44 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
    }

    if (!_update_expression.empty()) {
-        std::unordered_set<std::string> seen_column_names;
-        for (auto& action : _update_expression.actions()) {
-            if (action._path.has_operators()) {
-                // FIXME: implement this case
-                throw api_error::validation("UpdateItem support for nested updates not yet implemented");
-            }
-            std::string column_name = action._path.root();
+        for (auto& actions : _update_expression) {
+            // The actions of _update_expression are grouped by top-level
+            // attributes. Here, all actions in actions.second share the same
+            // top-level attribute actions.first.
+            std::string column_name = actions.first;
            const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
            if (cdef && cdef->is_primary_key()) {
-                throw api_error::validation(
-                        format("UpdateItem cannot update key column {}", column_name));
+                throw api_error::validation(format("UpdateItem cannot update key column {}", column_name));
            }
-            // DynamoDB forbids multiple updates in the same expression to
-            // modify overlapping document paths. Updates of one expression
-            // have the same timestamp, so it's unclear which would "win".
-            // FIXME: currently, without full support for document paths,
-            // we only check if the paths' roots are the same.
-            if (!seen_column_names.insert(column_name).second) {
-                throw api_error::validation(
-                        format("Invalid UpdateExpression: two document paths overlap with each other: {} and {}.",
-                                column_name, column_name));
-            }
-            std::visit(overloaded_functor {
-                [&] (const parsed::update_expression::action::set& a) {
-                    auto value = calculate_value(a._rhs, previous_item.get());
-                    do_update(to_bytes(column_name), value);
-                },
-                [&] (const parsed::update_expression::action::remove& a) {
+            if (actions.second.has_value()) {
+                // An action on a top-level attribute column_name. The single
+                // action is actions.second.get_value(). We can simply invoke
+                // the action and replace the attribute with its result:
+                std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
+                if (result) {
+                    do_update(to_bytes(column_name), *result);
+                } else {
                    do_delete(to_bytes(column_name));
-                },
-                [&] (const parsed::update_expression::action::add& a) {
-                    parsed::value base;
-                    parsed::value addition;
-                    base.set_path(action._path);
-                    addition.set_constant(a._valref);
-                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value result;
-                    std::string v1_type = get_item_type_string(v1);
-                    if (v1_type == "N") {
-                        if (get_item_type_string(v2) != "N") {
-                            throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                        }
-                        result = number_add(v1, v2);
-                    } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
-                        if (get_item_type_string(v2) != v1_type) {
-                            throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
-                        }
-                        result = set_sum(v1, v2);
-                    } else {
-                        throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
-                    }
-                    do_update(to_bytes(column_name), result);
-                },
-                [&] (const parsed::update_expression::action::del& a) {
-                    parsed::value base;
-                    parsed::value subset;
-                    base.set_path(action._path);
-                    subset.set_constant(a._valref);
-                    rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item.get());
-                    rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item.get());
-                    if (!v1.IsNull()) {
-                        std::optional<rjson::value> result  = set_diff(v1, v2);
-                        if (result) {
-                            do_update(to_bytes(column_name), *result);
-                        } else {
-                            do_delete(to_bytes(column_name));
-                        }
-                    }
                }
-            }, action._action);
+            } else {
+                // We have actions on a path or more than one path in the same
+                // top-level attribute column_name - but not on the top-level
+                // attribute as a whole. We already read the full top-level
+                // attribute (see check_needs_read_before_write()), and now we
+                // need to modify pieces of it and write back the entire
+                // top-level attribute.
+                if (!previous_item) {
+                    throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item"));
+                }
+                const rjson::value *toplevel = rjson::find(*previous_item, column_name);
+                if (!toplevel) {
+                    throw api_error::validation(format("UpdateItem cannot update document path: missing attribute {}",
+                        column_name));
+                }
+                rjson::value result = rjson::copy(*toplevel);
+                hierarchy_actions(result, actions.second, previous_item.get());
+                do_update(to_bytes(column_name), std::move(result), &actions.second);
+            }
        }
    }
    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
@@ -2395,7 +2745,7 @@ static rjson::value describe_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
-        const std::unordered_set<std::string>& attrs_to_get) {
+        const attrs_to_get& attrs_to_get) {
    std::optional<rjson::value> opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get);
    if (!opt_item) {
        // If there is no matching item, we're supposed to return an empty
@@ -2467,7 +2817,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
    struct table_requests {
        schema_ptr schema;
        db::consistency_level cl;
-        std::unordered_set<std::string> attrs_to_get;
+        attrs_to_get attrs_to_get;
        struct single_request {
            partition_key pk;
            clustering_key ck;
@@ -2681,7 +3031,7 @@ void filter::for_filters_on(const noncopyable_function<void(std::string_view)>&
 class describe_items_visitor {
    typedef std::vector<const column_definition*> columns_t;
    const columns_t& _columns;
-    const std::unordered_set<std::string>& _attrs_to_get;
+    const attrs_to_get& _attrs_to_get;
    std::unordered_set<std::string> _extra_filter_attrs;
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
@@ -2690,7 +3040,7 @@ class describe_items_visitor {
    size_t _scanned_count;

 public:
-    describe_items_visitor(const columns_t& columns, const std::unordered_set<std::string>& attrs_to_get, filter& filter)
+    describe_items_visitor(const columns_t& columns, const attrs_to_get& attrs_to_get, filter& filter)
            : _columns(columns)
            , _attrs_to_get(attrs_to_get)
            , _filter(filter)
@@ -2739,6 +3089,12 @@ public:
                    std::string attr_name = value_cast<sstring>(entry.first);
                    if (_attrs_to_get.empty() || _attrs_to_get.contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
                        bytes value = value_cast<bytes>(entry.second);
+                        // Even if _attrs_to_get asked to keep only a part of a
+                        // top-level attribute, we keep the entire attribute
+                        // at this stage, because the item filter might still
+                        // need the other parts (it was easier for us to keep
+                        // extra_filter_attrs at top-level granularity). We'll
+                        // filter the unneeded parts after item filtering.
                        rjson::set_with_string_name(_item, attr_name, deserialize_item(value));
                    }
                }
@@ -2749,11 +3105,24 @@ public:

    void end_row() {
        if (_filter.check(_item)) {
+            // As noted above, we kept entire top-level attributes listed in
+            // _attrs_to_get. We may need to only keep parts of them.
+            for (const auto& attr: _attrs_to_get) {
+                // If !attr.has_value() it means we were asked not to keep
+                // attr entirely, but just parts of it.
+                if (!attr.second.has_value()) {
+                    rjson::value* toplevel= rjson::find(_item, attr.first);
+                    if (toplevel && !hierarchy_filter(*toplevel, attr.second)) {
+                        rjson::remove_member(_item, attr.first);
+                    }
+                }
+            }
            // Remove the extra attributes _extra_filter_attrs which we had
            // to add just for the filter, and not requested to be returned:
            for (const auto& attr : _extra_filter_attrs) {
                rjson::remove_member(_item, attr);
            }
+
            rjson::push_back(_items, std::move(_item));
        }
        _item = rjson::empty_object();
@@ -2769,7 +3138,7 @@ public:
    }
 };

-static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::unordered_set<std::string>&& attrs_to_get, filter&& filter) {
+static rjson::value describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, attrs_to_get&& attrs_to_get, filter&& filter) {
    describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter);
    result_set->visit(visitor);
    auto scanned_count = visitor.get_scanned_count();
@@ -2788,7 +3157,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
        rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_pk_it)));
+        rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef));
        ++exploded_pk_it;
    }
    auto ck = paging_state.get_clustering_key();
@@ -2798,7 +3167,7 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
        for (const column_definition& cdef : schema.clustering_key_columns()) {
            rjson::set_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
-            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), rjson::parse(to_json_string(*cdef.type, *exploded_ck_it)));
+            rjson::set_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
            ++exploded_ck_it;
        }
    }
@@ -2810,7 +3179,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
        const rjson::value* exclusive_start_key,
        dht::partition_range_vector&& partition_ranges,
        std::vector<query::clustering_range>&& ck_bounds,
-        std::unordered_set<std::string>&& attrs_to_get,
+        attrs_to_get&& attrs_to_get,
        uint32_t limit,
        db::consistency_level cl,
        filter&& filter,
@@ -2850,7 +3219,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
    auto p = service::pager::query_pagers::pager(schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr);

    return p->fetch_page(limit, gc_clock::now(), executor::default_timeout()).then(
-            [p, schema, cql_stats, partition_slice = std::move(partition_slice),
+            [p = std::move(p), schema, cql_stats, partition_slice = std::move(partition_slice),
             selection = std::move(selection), query_state_ptr = std::move(query_state_ptr),
             attrs_to_get = std::move(attrs_to_get),
             query_options = std::move(query_options),
@@ -3536,7 +3905,7 @@ future<> executor::create_keyspace(std::string_view keyspace_name) {
        }
        auto opts = get_network_topology_options(rf);
        auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
-        return _mm.announce_new_keyspace(ksm, api::new_timestamp(), false);
+        return _mm.announce_new_keyspace(ksm, api::new_timestamp());
    });
 }

--- a/alternator/executor.hh
+++ b/alternator/executor.hh
@@ -70,6 +70,76 @@ public:
    std::string to_json() const override;
 };

+namespace parsed {
+class path;
+};
+
+// An attribute_path_map object is used to hold data for various attributes
+// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path
+// has a root attribute, and then modified by member and index operators -
+// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then
+// "[2]" index, and finally ".c" member.
+// Data can be added to an attribute_path_map using the add() function, but
+// requires that attributes with data not be *overlapping* or *conflicting*:
+//
+// 1. Two attribute paths which are identical or an ancestor of one another
+//    are considered *overlapping* and not allowed. If a.b.c has data,
+//    we can't add more data in a.b.c or any of its descendants like a.b.c.d.
+//
+// 2. Two attribute paths which need the same parent to have both a member and
+//    an index are considered *conflicting* and not allowed. E.g., if a.b has
+//    data, you can't add a[1]. The meaning of adding both would be that the
+//    attribute a is both a map and an array, which isn't sensible.
+//
+// These two requirements are common to the two places where Alternator uses
+// this abstraction to describe how a hierarchical item is to be transformed:
+//
+// 1. In ProjectionExpression: for filtering from a full top-level attribute
+//    only the parts the user asked for in ProjectionExpression.
+//
+// 2. In UpdateExpression: for taking the previous value of a top-level
+//    attribute, and modifying it based on the instructions the user
+//    wrote in UpdateExpression.
+
+template<typename T>
+class attribute_path_map_node {
+public:
+    using data_t = T;
+    // We need the extra shared_ptr<> here because libstdc++ unordered_map
+    // doesn't work with incomplete types :-( We couldn't use lw_shared_ptr<>
+    // because it doesn't work for incomplete types either. We couldn't use
+    // std::unique_ptr<> because it makes the entire object uncopyable. We
+    // don't often need to copy such a map, but we do have some code that
+    // copies an attrs_to_get object, and is hard to find and remove.
+    // The shared_ptr should never be null.
+    using members_t =  std::unordered_map<std::string, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The indexes list is sorted because DynamoDB requires handling writes
+    // beyond the end of a list in index order.
+    using indexes_t = std::map<unsigned, seastar::shared_ptr<attribute_path_map_node<T>>>;
+    // The prohibition on "overlap" and "conflict" explained above means
+    // that at most one of data, members or indexes is held by a node.
+    std::optional<std::variant<data_t, members_t, indexes_t>> _content;
+
+    bool is_empty() const { return !_content; } // nothing has been stored in this node
+    bool has_value() const { return _content && std::holds_alternative<data_t>(*_content); } // node holds a data_t
+    bool has_members() const { return _content && std::holds_alternative<members_t>(*_content); } // node holds a members_t
+    bool has_indexes() const { return _content && std::holds_alternative<indexes_t>(*_content); } // node holds an indexes_t
+    // Each get_*() accessor dereferences _content unchecked, so it assumes the matching has_*() is true.
+    members_t& get_members() { return std::get<members_t>(*_content); }
+    const members_t& get_members() const { return std::get<members_t>(*_content); }
+    indexes_t& get_indexes() { return std::get<indexes_t>(*_content); }
+    const indexes_t& get_indexes() const { return std::get<indexes_t>(*_content); }
+    T& get_value() { return std::get<T>(*_content); }
+    const T& get_value() const { return std::get<T>(*_content); }
+};
+
+template<typename T>
+using attribute_path_map = std::unordered_map<std::string, attribute_path_map_node<T>>;
+
+using attrs_to_get_node = attribute_path_map_node<std::monostate>;
+using attrs_to_get = attribute_path_map<std::monostate>;
+
+
 class executor : public peering_sharded_service<executor> {
    service::storage_proxy& _proxy;
    service::migration_manager& _mm;
@@ -121,6 +191,10 @@ public:

    static sstring table_name(const schema&);
    static db::timeout_clock::time_point default_timeout();
+    static void set_default_timeout(db::timeout_clock::duration timeout);
+private:
+    static db::timeout_clock::duration s_default_timeout;
+public:
    static schema_ptr find_table(service::storage_proxy&, const rjson::value& request);

 private:
@@ -136,16 +210,14 @@ public:
        const query::partition_slice&,
        const cql3::selection::selection&,
        const query::result&,
-        const std::unordered_set<std::string>&);
+        const attrs_to_get&);

    static void describe_single_item(const cql3::selection::selection&,
        const std::vector<bytes_opt>&,
-        const std::unordered_set<std::string>&,
+        const attrs_to_get&,
        rjson::value&,
        bool = false);

-
-
    void add_stream_options(const rjson::value& stream_spec, schema_builder&) const;
    void supplement_table_info(rjson::value& descr, const schema& schema) const;
    void supplement_table_stream_info(rjson::value& descr, const schema& schema) const;
--- a/alternator/expressions.cc
+++ b/alternator/expressions.cc
@@ -130,6 +130,27 @@ void condition_expression::append(condition_expression&& a, char op) {
    }, _expression);
 }

+void path::check_depth_limit() { // throws expressions_syntax_error once the path has more than depth_limit components
+    if (1 + _operators.size() > depth_limit) { // the "1 +" counts the root as a path component
+        throw expressions_syntax_error(format("Document path exceeded {} nesting levels", depth_limit));
+    }
+}
+
+std::ostream& operator<<(std::ostream& os, const path& p) { // prints a path in "root.member[index]" notation
+    os << p.root(); // the root attribute name comes first
+    for (const auto& op : p.operators()) { // then every operator, in the order stored in the path
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                os << '.' << member; // a member operator is printed as ".name"
+            },
+            [&] (unsigned index) {
+                os << '[' << index << ']'; // an index operator is printed as "[N]"
+            }
+        }, op);
+    }
+    return os;
+}
+
 } // namespace parsed

 // The following resolve_*() functions resolve references in parsed
@@ -151,10 +172,9 @@ void condition_expression::append(condition_expression&& a, char op) {
 // we need to resolve the expression just once but then use it many times
 // (once for each item to be filtered).

-static void resolve_path(parsed::path& p,
+static std::optional<std::string> resolve_path_component(const std::string& column_name,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names) {
-    const std::string& column_name = p.root();
    if (column_name.size() > 0 && column_name.front() == '#') {
        if (!expression_attribute_names) {
            throw api_error::validation(
@@ -166,7 +186,30 @@ static void resolve_path(parsed::path& p,
                    format("ExpressionAttributeNames missing entry '{}' required by expression", column_name));
        }
        used_attribute_names.emplace(column_name);
-        p.set_root(std::string(rjson::to_string_view(*value)));
+        return std::string(rjson::to_string_view(*value));
+    }
+    return std::nullopt;
+}
+
+static void resolve_path(parsed::path& p, // resolves, in place, the root and every member name of p
+        const rjson::value* expression_attribute_names,
+        std::unordered_set<std::string>& used_attribute_names) {
+    std::optional<std::string> r = resolve_path_component(p.root(), expression_attribute_names, used_attribute_names);
+    if (r) { // an empty optional means the root needed no replacement and is kept as is
+        p.set_root(std::move(*r));
+    }
+    for (auto& op : p.operators()) { // non-const operators(): member names are rewritten in place
+        std::visit(overloaded_functor {
+            [&] (std::string& s) {
+                r = resolve_path_component(s, expression_attribute_names, used_attribute_names); // member names are resolved like the root
+                if (r) {
+                    s = std::move(*r);
+                }
+            },
+            [&] (unsigned index) {
+                // nothing to resolve
+            }
+        }, op);
    }
 }

@@ -603,52 +646,8 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
            }
            rjson::value v1 = calculate_value(f._parameters[0], caller, previous_item);
            rjson::value v2 = calculate_value(f._parameters[1], caller, previous_item);
-            // TODO: There's duplication here with check_BEGINS_WITH().
-            // But unfortunately, the two functions differ a bit.
-
-            // If one of v1 or v2 is malformed or has an unsupported type
-            // (not B or S), what we do depends on whether it came from
-            // the user's query (is_constant()), or the item. Unsupported
-            // values in the query result in an error, but if they are in
-            // the item, we silently return false (no match).
-            bool bad = false;
-            if (!v1.IsObject() || v1.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v1));
-                }
-            } else if (v1.MemberBegin()->name != "S" && v1.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[0].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v1));
-                }
-            }
-            if (!v2.IsObject() || v2.MemberCount() != 1) {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() encountered malformed AttributeValue: {}", caller, v2));
-                }
-            } else if (v2.MemberBegin()->name != "S" && v2.MemberBegin()->name != "B") {
-                bad = true;
-                if (f._parameters[1].is_constant()) {
-                    throw api_error::validation(format("{}: begins_with() supports only string or binary in AttributeValue: {}", caller, v2));
-                }
-            }
-            bool ret = false;
-            if (!bad) {
-                auto it1 = v1.MemberBegin();
-                auto it2 = v2.MemberBegin();
-                if (it1->name == it2->name) {
-                    if (it2->name == "S") {
-                        std::string_view val1 = rjson::to_string_view(it1->value);
-                        std::string_view val2 = rjson::to_string_view(it2->value);
-                        ret = val1.starts_with(val2);
-                    } else /* it2->name == "B" */ {
-                        ret = base64_begins_with(rjson::to_string_view(it1->value), rjson::to_string_view(it2->value));
-                    }
-                }
-            }
-            return to_bool_json(ret);
+            return to_bool_json(check_BEGINS_WITH(v1.IsNull() ? nullptr : &v1,  v2,
+                                    f._parameters[0].is_constant(), f._parameters[1].is_constant()));
        }
    },
    {"contains", [] (calculate_value_caller caller, const rjson::value* previous_item, const parsed::value::function_call& f) {
@@ -667,6 +666,55 @@ std::unordered_map<std::string_view, function_handler_type*> function_handlers {
    },
 };

+// Given a parsed::path and an item read from the table, extract the value
+// of a certain attribute path, such as "a" or "a.b.c[3]". Returns a null
+// value if the item or the requested attribute does not exist, and throws
+// api_error::validation if a malformed nested value is met along the path.
+// The item is assumed to be encoded in JSON using DynamoDB conventions -
+// each level of a nested document is a map with one key - a type (e.g.,
+// "M" for map) - and its value is the representation of that value.
+static rjson::value extract_path(const rjson::value* item,
+        const parsed::path& p, calculate_value_caller caller) {
+    if (!item) {
+        return rjson::null_value();
+    }
+    const rjson::value* v = rjson::find(*item, p.root());
+    if (!v) {
+        return rjson::null_value();
+    }
+    for (const auto& op : p.operators()) {
+        if (!v->IsObject() || v->MemberCount() != 1) {
+            // This shouldn't happen. We shouldn't have stored malformed
+            // objects. But today Alternator does not validate the structure
+            // of nested documents before storing them, so this can happen on
+            // read.
+            throw api_error::validation(format("{}: malformed item read: {}", caller, *item));
+        }
+        const char* type = v->MemberBegin()->name.GetString(); // the single key is the type tag
+        v = &(v->MemberBegin()->value); // step into the value stored under the type tag
+        std::visit(overloaded_functor {
+            [&] (const std::string& member) {
+                if (type[0] == 'M' && v->IsObject()) { // a member operator only applies to a map
+                    v = rjson::find(*v, member);
+                } else {
+                    v = nullptr;
+                }
+            },
+            [&] (unsigned index) {
+                if (type[0] == 'L' && v->IsArray() && index < v->Size()) { // an index operator only applies to a list, in range
+                    v = &(v->GetArray()[index]);
+                } else {
+                    v = nullptr;
+                }
+            }
+        }, op);
+        if (!v) { // wrong type, missing member or out-of-range index: no such attribute
+            return rjson::null_value();
+        }
+    }
+    return rjson::copy(*v);
+}
+
 // Given a parsed::value, which can refer either to a constant value from
 // ExpressionAttributeValues, to the value of some attribute, or to a function
 // of other values, this function calculates the resulting value.
@@ -684,21 +732,12 @@ rjson::value calculate_value(const parsed::value& v,
            auto function_it = function_handlers.find(std::string_view(f._function_name));
            if (function_it == function_handlers.end()) {
                throw api_error::validation(
-                        format("UpdateExpression: unknown function '{}' called.", f._function_name));
+                        format("{}: unknown function '{}' called.", caller, f._function_name));
            }
            return function_it->second(caller, previous_item, f);
        },
        [&] (const parsed::path& p) -> rjson::value {
-            if (!previous_item) {
-                return rjson::null_value();
-            }
-            std::string update_path = p.root();
-            if (p.has_operators()) {
-                // FIXME: support this
-                throw api_error::validation("Reading attribute paths not yet implemented");
-            }
-            const rjson::value* previous_value = rjson::find(*previous_item, update_path);
-            return previous_value ? rjson::copy(*previous_value) : rjson::null_value();
+            return extract_path(previous_item, p, caller);
        }
    }, v._value);
 }
--- a/alternator/expressions_types.hh
+++ b/alternator/expressions_types.hh
@@ -49,15 +49,23 @@ class path {
    // dot (e.g., ".xyz").
    std::string _root;
    std::vector<std::variant<std::string, unsigned>> _operators;
+    // It is useful to limit the depth of a user-specified path, because it
+    // allows us to use recursive algorithms without worrying about recursion
+    // depth. DynamoDB officially limits the length of paths to 32 components
+    // (including the root) so let's use the same limit.
+    static constexpr unsigned depth_limit = 32;
+    void check_depth_limit();
 public:
    void set_root(std::string root) {
        _root = std::move(root);
    }
    void add_index(unsigned i) {
        _operators.emplace_back(i);
+        check_depth_limit();
    }
    void add_dot(std::string(name)) {
        _operators.emplace_back(std::move(name));
+        check_depth_limit();
    }
    const std::string& root() const {
        return _root;
@@ -65,6 +73,13 @@ public:
    bool has_operators() const {
        return !_operators.empty();
    }
+    const std::vector<std::variant<std::string, unsigned>>& operators() const {
+        return _operators;
+    }
+    std::vector<std::variant<std::string, unsigned>>& operators() {
+        return _operators;
+    }
+    friend std::ostream& operator<<(std::ostream&, const path&);
 };

 // When an expression is first parsed, all constants are references, like
--- a/alternator/server.cc
+++ b/alternator/server.cc
@@ -189,7 +189,7 @@ future<> server::verify_signature(const request& req) {
    }
    auto authorization_it = req._headers.find("Authorization");
    if (authorization_it == req._headers.end()) {
-        throw api_error::invalid_signature("Authorization header is mandatory for signature verification");
+        throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
    }
    std::string host = host_it->second;
    std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
--- a/alternator/streams.cc
+++ b/alternator/streams.cc
@@ -499,19 +499,11 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
    // TODO: creation time

    auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
-    // cannot really "resume" query, must iterate all data. because we cannot query neither "time" (pk) > something,
-    // or on expired...
-    // TODO: maybe add secondary index to topology table to enable this?
-    return _sdks.cdc_get_versioned_streams({ normal_token_owners }).then([this, &db, schema, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc), ttl](std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

-        // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
-        auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);
+    // filter out cdc generations older than the table or now() - cdc::ttl (typically dynamodb_streams_max_window - 24h)
+    auto low_ts = std::max(as_timepoint(schema->id()), db_clock::now() - ttl);

-        auto i = topologies.lower_bound(low_ts);
-        // need first gen _intersecting_ the timestamp.
-        if (i != topologies.begin()) {
-            i = std::prev(i);
-        }
+    return _sdks.cdc_get_versioned_streams(low_ts, { normal_token_owners }).then([this, &db, shard_start, limit, ret = std::move(ret), stream_desc = std::move(stream_desc)] (std::map<db_clock::time_point, cdc::streams_version> topologies) mutable {

        auto e = topologies.end();
        auto prev = e;
@@ -519,9 +511,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl

        std::optional<shard_id> last;

-        // i is now at the youngest generation we include. make a mark of it.
-        auto first = i;
-
+        auto i = topologies.begin();
        // if we're a paged query, skip to the generation where we left of.
        if (shard_start) {
            i = topologies.find(shard_start->time);
@@ -547,7 +537,7 @@ future<executor::request_return_type> executor::describe_stream(client_state& cl
        };

        // need a prev even if we are skipping stuff
-        if (i != first) {
+        if (i != topologies.begin()) {
            prev = std::prev(i);
        }

@@ -855,16 +845,18 @@ future<executor::request_return_type> executor::get_records(client_state& client
    static const bytes op_column_name = cdc::log_meta_column_name_bytes("operation");
    static const bytes eor_column_name = cdc::log_meta_column_name_bytes("end_of_batch");

-    auto key_names = boost::copy_range<std::unordered_set<std::string>>(
+    auto key_names = boost::copy_range<attrs_to_get>(
        boost::range::join(std::move(base->partition_key_columns()), std::move(base->clustering_key_columns()))
-        | boost::adaptors::transformed([&] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([&] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );
    // Include all base table columns as values (in case pre or post is enabled).
    // This will include attributes not stored in the frozen map column
-    auto attr_names = boost::copy_range<std::unordered_set<std::string>>(base->regular_columns()
+    auto attr_names = boost::copy_range<attrs_to_get>(base->regular_columns()
        // this will include the :attrs column, which we will also force evaluating. 
        // But not having this set empty forces out any cdc columns from actual result 
-        | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.name_as_text(); })
+        | boost::adaptors::transformed([] (const column_definition& cdef) {
+            return std::make_pair<std::string, attrs_to_get_node>(cdef.name_as_text(), {}); })
    );

    std::vector<const column_definition*> columns;
@@ -1028,7 +1020,9 @@ future<executor::request_return_type> executor::get_records(client_state& client
        }

        // ugh. figure out if we are and end-of-shard
-        return cdc::get_local_streams_timestamp().then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
+        auto normal_token_owners = _proxy.get_token_metadata_ptr()->count_normal_token_owners();
+        
+        return _sdks.cdc_current_generation_timestamp({ normal_token_owners }).then([this, iter, high_ts, start_time, ret = std::move(ret)](db_clock::time_point ts) mutable {
            auto& shard = iter.shard;            

            if (shard.time < ts && ts < high_ts) {
--- a/api/api-doc/column_family.json
+++ b/api/api-doc/column_family.json
@@ -2925,6 +2925,10 @@
         "id":"toppartitions_query_results",
         "description":"nodetool toppartitions query results",
         "properties":{
+            "read_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "read":{
               "type":"array",
               "items":{
@@ -2932,6 +2936,10 @@
               },
               "description":"Read results"
            },
+            "write_cardinality":{
+               "type":"long",
+               "description":"Number of the unique operations in the sample set"
+            },
            "write":{
               "type":"array",
               "items":{
--- a/api/api-doc/gossiper.json
+++ b/api/api-doc/gossiper.json
@@ -148,6 +148,30 @@
               ]
            }
         ]
+      },
+      {
+         "path":"/gossiper/force_remove_endpoint/{addr}",
+         "operations":[
+            {
+               "method":"POST",
+               "summary":"Force remove an endpoint from gossip",
+               "type":"void",
+               "nickname":"force_remove_endpoint",
+               "produces":[
+                  "application/json"
+               ],
+               "parameters":[
+                  {
+                     "name":"addr",
+                     "description":"The endpoint address",
+                     "required":true,
+                     "allowMultiple":false,
+                     "type":"string",
+                     "paramType":"path"
+                  }
+               ]
+            }
+         ]
      }
   ]
 }
--- a/api/column_family.cc
+++ b/api/column_family.cc
@@ -310,7 +310,7 @@ void set_column_family(http_context& ctx, routes& r) {
        return res;
    });

-    cf::get_column_family.set(r, [&ctx] (const_req req){
+    cf::get_column_family.set(r, [&ctx] (std::unique_ptr<request> req){
            vector<cf::column_family_info> res;
            for (auto i: ctx.db.local().get_column_families_mapping()) {
                cf::column_family_info info;
@@ -319,7 +319,7 @@ void set_column_family(http_context& ctx, routes& r) {
                info.type = "ColumnFamilies";
                res.push_back(info);
            }
-            return res;
+            return make_ready_future<json::json_return_type>(json::stream_object(std::move(res)));
        });

    cf::get_column_family_name_keyspace.set(r, [&ctx] (const_req req){
@@ -656,7 +656,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -664,7 +664,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_size();
+                return s + sst->filter_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -672,7 +672,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -680,7 +680,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_bloom_filter_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->filter_memory_size();
+                return s + sst->filter_memory_size();
            });
        }, std::plus<uint64_t>());
    });
@@ -688,7 +688,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, req->param["name"], uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -696,7 +696,7 @@ void set_column_family(http_context& ctx, routes& r) {
    cf::get_all_index_summary_off_heap_memory_used.set(r, [&ctx] (std::unique_ptr<request> req) {
        return map_reduce_cf(ctx, uint64_t(0), [] (column_family& cf) {
            return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), uint64_t(0), [](uint64_t s, auto& sst) {
-                return sst->get_summary().memory_footprint();
+                return s + sst->get_summary().memory_footprint();
            });
        }, std::plus<uint64_t>());
    });
@@ -991,6 +991,9 @@ void set_column_family(http_context& ctx, routes& r) {
                        apilog.debug("toppartitions query: processing results");
                        cf::toppartitions_query_results results;

+                        results.read_cardinality = topk_results.read.size();
+                        results.write_cardinality = topk_results.write.size();
+
                        for (auto& d: topk_results.read.top(q.list_size())) {
                            cf::toppartitions_record r;
                            r.partition = sstring(d.item);
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -66,6 +66,13 @@ void set_gossiper(http_context& ctx, routes& r) {
            return make_ready_future<json::json_return_type>(json_void());
        });
    });
+
+    httpd::gossiper_json::force_remove_endpoint.set(r, [](std::unique_ptr<request> req) {
+        gms::inet_address ep(req->param["addr"]);
+        return gms::get_local_gossiper().force_remove_endpoint(ep).then([] {
+            return make_ready_future<json::json_return_type>(json_void());
+        });
+    });
 }

 }
--- a/auth/common.cc
+++ b/auth/common.cc
@@ -82,7 +82,7 @@ static future<> create_metadata_table_if_missing_impl(
    b.set_uuid(uuid);
    schema_ptr table = b.build();
    return ignore_existing([&mm, table = std::move(table)] () {
-        return mm.announce_new_column_family(table, false);
+        return mm.announce_new_column_family(table);
    });
 }

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -154,7 +154,7 @@ future<> service::create_keyspace_if_missing(::service::migration_manager& mm) c

        // We use min_timestamp so that default keyspace metadata will loose with any manual adjustments.
        // See issue #2129.
-        return mm.announce_new_keyspace(ksm, api::min_timestamp, false);
+        return mm.announce_new_keyspace(ksm, api::min_timestamp);
    }

    return make_ready_future<>();
--- a/bytes.hh
+++ b/bytes.hh
@@ -28,6 +28,7 @@
 #include <iosfwd>
 #include <functional>
 #include "utils/mutable_view.hh"
+#include <xxhash.h>

 using bytes = basic_sstring<int8_t, uint32_t, 31, false>;
 using bytes_view = std::basic_string_view<int8_t>;
@@ -35,6 +36,10 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
 using bytes_opt = std::optional<bytes>;
 using sstring_view = std::string_view;

+inline bytes to_bytes(bytes&& b) { // rvalue overload: the argument already is `bytes`, so nothing is converted
+    return std::move(b); // `b` is a named rvalue reference; std::move lets the result be move-constructed
+}
+
 inline sstring_view to_sstring_view(bytes_view view) {
    return {reinterpret_cast<const char*>(view.data()), view.size()};
 }
@@ -43,17 +48,6 @@ inline bytes_view to_bytes_view(sstring_view view) {
    return {reinterpret_cast<const int8_t*>(view.data()), view.size()};
 }

-namespace std {
-
-template <>
-struct hash<bytes_view> {
-    size_t operator()(bytes_view v) const {
-        return hash<sstring_view>()({reinterpret_cast<const char*>(v.begin()), v.size()});
-    }
-};
-
-}
-
 struct fmt_hex {
    bytes_view& v;
    fmt_hex(bytes_view& v) noexcept : v(v) {}
@@ -94,6 +88,30 @@ struct appending_hash<bytes_view> {
    }
 };

+struct bytes_view_hasher : public hasher { // incremental hasher built on the xxHash XXH64 streaming calls
+    XXH64_state_t _state; // running XXH64 state, reset by the constructor
+    bytes_view_hasher(uint64_t seed = 0) noexcept { // `seed` is passed straight to XXH64_reset
+        XXH64_reset(&_state, seed);
+    }
+    void update(const char* ptr, size_t length) noexcept { // feeds `length` bytes starting at `ptr` into the state
+        XXH64_update(&_state, ptr, length);
+    }
+    size_t finalize() { // digest of everything fed so far, cast from the 64-bit XXH64 result to size_t
+        return static_cast<size_t>(XXH64_digest(&_state));
+    }
+};
+
+namespace std {
+template <> // std::hash specialization for bytes_view, computed with bytes_view_hasher
+struct hash<bytes_view> {
+    size_t operator()(bytes_view v) const {
+        bytes_view_hasher h; // default-constructed, i.e. seed 0
+        appending_hash<bytes_view>{}(h, v); // feed `v` into the hasher through its appending_hash specialization
+        return h.finalize();
+    }
+};
+} // namespace std
+
 inline int32_t compare_unsigned(bytes_view v1, bytes_view v2) {
  auto size = std::min(v1.size(), v2.size());
  if (size) {
--- a/cache_flat_mutation_reader.hh
+++ b/cache_flat_mutation_reader.hh
@@ -461,7 +461,7 @@ void cache_flat_mutation_reader::maybe_add_to_cache(const clustering_row& cr) {
            cr.cells().prepare_hash(*_schema, column_kind::regular_column);
        }
        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
-            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.tomb(), cr.marker(), cr.cells()));
+            current_allocator().construct<rows_entry>(*_schema, cr.key(), cr.as_deletable_row()));
        new_entry->set_continuous(false);
        auto it = _next_row.iterators_valid() ? _next_row.get_iterator_in_latest_version()
                                              : mp.clustered_rows().lower_bound(cr.key(), less);
--- a/cdc/generation.cc
+++ b/cdc/generation.cc
@@ -22,10 +22,14 @@
 #include <boost/type.hpp>
 #include <random>
 #include <unordered_set>
+#include <algorithm>
 #include <seastar/core/sleep.hh>
+#include <algorithm>
+#include <seastar/core/coroutine.hh>

 #include "keys.hh"
 #include "schema_builder.hh"
+#include "database.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"
@@ -36,6 +40,7 @@
 #include "gms/gossiper.hh"

 #include "cdc/generation.hh"
+#include "cdc/cdc_options.hh"

 extern logging::logger cdc_log;

@@ -174,10 +179,29 @@ bool topology_description::operator==(const topology_description& o) const {
    return _entries == o._entries;
 }

-const std::vector<token_range_description>& topology_description::entries() const {
+const std::vector<token_range_description>& topology_description::entries() const& {
    return _entries;
 }

+std::vector<token_range_description>&& topology_description::entries() && { // overload selected for rvalue topology_description objects
+    return std::move(_entries); // exposes _entries as an rvalue so the caller can move from it
+}
+
+static std::vector<stream_id> create_stream_ids(
+        size_t index, dht::token start, dht::token end, size_t shard_count, uint8_t ignore_msb) {
+    // Builds one stream id for every shard index in [0, shard_count).
+    dht::sharder shard_map(shard_count, ignore_msb);
+    std::vector<stream_id> ids;
+    ids.reserve(shard_count);
+    for (size_t shard = 0; shard != shard_count; ++shard) {
+        // The id is composed of the first token of this shard within the range
+        // and the "index" of the range end owning vnode (as defined by token
+        // sort order), which groups the ids of this shard set together.
+        ids.emplace_back(dht::find_first_token_for_shard(shard_map, start, end, shard), index);
+    }
+    return ids;
+}
+
 class topology_description_generator final {
    const db::config& _cfg;
    const std::unordered_set<dht::token>& _bootstrap_tokens;
@@ -217,18 +241,9 @@ class topology_description_generator final {
        desc.token_range_end = end;

        auto [shard_count, ignore_msb] = get_sharding_info(end);
-        desc.streams.reserve(shard_count);
+        desc.streams = create_stream_ids(index, start, end, shard_count, ignore_msb);
        desc.sharding_ignore_msb = ignore_msb;

-        dht::sharder sharder(shard_count, ignore_msb);
-        for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
-            auto t = dht::find_first_token_for_shard(sharder, start, end, shard_idx);
-            // compose the id from token and the "index" of the range end owning vnode
-            // as defined by token sort order. Basically grouping within this
-            // shard set.
-            desc.streams.emplace_back(stream_id(t, index));
-        }
-
        return desc;
    }
 public:
@@ -294,6 +309,38 @@ future<db_clock::time_point> get_local_streams_timestamp() {
    });
 }

+// Non-static for testing. 16B per stream within a 4MB budget gives at most 262144 streams (the caller still keeps at least 1 per vnode).
+size_t limit_of_streams_in_topology_description() {
+    constexpr size_t max_description_bytes = 4 * 1024 * 1024;
+    constexpr size_t bytes_per_stream = 16;
+    return max_description_bytes / bytes_per_stream;
+}
+
+// Caps the total number of streams in `desc`; returns `desc` unchanged when it already fits. Non-static for testing.
+topology_description limit_number_of_streams_if_needed(topology_description&& desc) {
+    size_t streams_count = 0; // unsigned, so the comparison with `limit` below does not mix signedness
+    for (const auto& tr_desc : desc.entries()) {
+        streams_count += tr_desc.streams.size();
+    }
+    // The limit is never smaller than the number of entries, so each entry keeps at least one stream.
+    size_t limit = std::max(limit_of_streams_in_topology_description(), desc.entries().size());
+    if (limit >= streams_count) {
+        return std::move(desc);
+    }
+    size_t streams_per_vnode_limit = limit / desc.entries().size(); // non-zero size: an empty description returned above
+    auto entries = std::move(desc).entries();
+    auto start = entries.back().token_range_end; // entry 0 starts where the last entry ends
+    for (size_t idx = 0; idx < entries.size(); ++idx) {
+        auto end = entries[idx].token_range_end;
+        if (entries[idx].streams.size() > streams_per_vnode_limit) {
+            entries[idx].streams =
+                create_stream_ids(idx, start, end, streams_per_vnode_limit, entries[idx].sharding_ignore_msb);
+        }
+        start = end;
+    }
+    return topology_description(std::move(entries));
+}
+
 // Run inside seastar::async context.
 db_clock::time_point make_new_cdc_generation(
        const db::config& cfg,
@@ -306,6 +353,18 @@ db_clock::time_point make_new_cdc_generation(
    using namespace std::chrono;
    auto gen = topology_description_generator(cfg, bootstrap_tokens, tmptr, g).generate();

+    // If the cluster is large we may end up with a generation that contains
+    // a large number of streams. This is problematic because we store the
+    // generation in a single row. For a generation with a large number of streams
+    // this will lead to a row that can be as big as 32MB. This is much more
+    // than the limit imposed by commitlog_segment_size_in_mb. If the size of
+    // the row that describes a new generation grows above
+    // commitlog_segment_size_in_mb, the write will fail and the new node won't
+    // be able to join. To avoid such a problem we make sure that such a row is
+    // always smaller than 4MB. We do that by removing some CDC streams from
+    // each vnode if the total number of streams is too large.
+    gen = limit_number_of_streams_if_needed(std::move(gen));
+
    // Begin the race.
    auto ts = db_clock::now() + (
            (!add_delay || ring_delay == milliseconds(0)) ? milliseconds(0) : (
@@ -321,31 +380,23 @@ std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_ad
    return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
 }

-// Run inside seastar::async context.
-static void do_update_streams_description(
+static future<> do_update_streams_description(
        db_clock::time_point streams_ts,
        db::system_distributed_keyspace& sys_dist_ks,
        db::system_distributed_keyspace::context ctx) {
-    if (sys_dist_ks.cdc_desc_exists(streams_ts, ctx).get0()) {
-        cdc_log.debug("update_streams_description: description of generation {} already inserted", streams_ts);
-        return;
+    if (co_await sys_dist_ks.cdc_desc_exists(streams_ts, ctx)) {
+        cdc_log.info("Generation {}: streams description table already updated.", streams_ts);
+        co_return;
    }

    // We might race with another node also inserting the description, but that's ok. It's an idempotent operation.

-    auto topo = sys_dist_ks.read_cdc_topology_description(streams_ts, ctx).get0();
+    auto topo = co_await sys_dist_ks.read_cdc_topology_description(streams_ts, ctx);
    if (!topo) {
-        throw std::runtime_error(format("could not find streams data for timestamp {}", streams_ts));
+        throw no_generation_data_exception(streams_ts);
    }

-    std::set<cdc::stream_id> streams_set;
-    for (auto& entry: topo->entries()) {
-        streams_set.insert(entry.streams.begin(), entry.streams.end());
-    }
-
-    std::vector<cdc::stream_id> streams_vec(streams_set.begin(), streams_set.end());
-
-    sys_dist_ks.create_cdc_desc(streams_ts, streams_vec, ctx).get();
+    co_await sys_dist_ks.create_cdc_desc(streams_ts, *topo, ctx);
    cdc_log.info("CDC description table successfully updated with generation {}.", streams_ts);
 }

@@ -355,7 +406,7 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source& abort_src) {
    try {
-        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+        do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
    } catch(...) {
        cdc_log.warn(
            "Could not update CDC description table with generation {}: {}. Will retry in the background.",
@@ -368,7 +419,7 @@ void update_streams_description(
            while (true) {
                sleep_abortable(std::chrono::seconds(60), abort_src).get();
                try {
-                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() });
+                    do_update_streams_description(streams_ts, *sys_dist_ks, { get_num_token_owners() }).get();
                    return;
                } catch (...) {
                    cdc_log.warn(
@@ -380,4 +431,176 @@ void update_streams_description(
    }
 }

+static db_clock::time_point as_timepoint(const utils::UUID& uuid) { // treats the UUID's adjusted timestamp as milliseconds on db_clock
+    return db_clock::time_point{std::chrono::milliseconds(utils::UUID_gen::get_adjusted_timestamp(uuid))};
+}
+
+static future<std::vector<db_clock::time_point>> get_cdc_desc_v1_timestamps(
+        db::system_distributed_keyspace& sys_dist_ks,
+        abort_source& abort_src,
+        const noncopyable_function<unsigned()>& get_num_token_owners) {
+    while (true) { // keep retrying until the read succeeds
+        try {
+            co_return co_await sys_dist_ks.get_cdc_desc_v1_timestamps({ get_num_token_owners() });
+        } catch (...) {
+            cdc_log.warn(
+                    "Failed to retrieve generation timestamps for rewriting: {}. Retrying in 60s.",
+                    std::current_exception());
+        }
+        co_await sleep_abortable(std::chrono::seconds(60), abort_src); // outside the handler (co_await is not allowed in a catch block); an exception thrown here leaves the loop
+    }
+}
+
+// Contains a CDC log table's creation time (extracted from its schema's id)
+// and its CDC TTL setting.
+struct time_and_ttl {
+    db_clock::time_point creation_time; // filled from as_timepoint(schema id) by maybe_rewrite_streams_descriptions
+    int ttl; // used as a number of seconds; 0 means entries never expire
+};
+
+/*
+ * See `maybe_rewrite_streams_descriptions`.
+ * This is the long-running-in-the-background part of that function.
+ * It returns the latest retrieved generation timestamp, or std::nullopt if there was nothing to rewrite.
+ */
+static future<std::optional<db_clock::time_point>> rewrite_streams_descriptions(
+        std::vector<time_and_ttl> times_and_ttls,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    cdc_log.info("Retrieving generation timestamps for rewriting...");
+    auto tss = co_await get_cdc_desc_v1_timestamps(*sys_dist_ks, abort_src, get_num_token_owners);
+    cdc_log.info("Generation timestamps retrieved.");
+
+    // Find first generation timestamp such that some CDC log table may contain data before this timestamp.
+    // This predicate is monotonic w.r.t the timestamps.
+    auto now = db_clock::now();
+    std::sort(tss.begin(), tss.end()); // partition_point below needs the timestamps in order
+    auto first = std::partition_point(tss.begin(), tss.end(), [&] (db_clock::time_point ts) {
+        // partition_point finds first element that does *not* satisfy the predicate.
+        return std::none_of(times_and_ttls.begin(), times_and_ttls.end(),
+                [&] (const time_and_ttl& tat) {
+            // In this CDC log table there are no entries older than the table's creation time
+            // or (now - the table's ttl). We subtract 10s to account for some possible clock drift.
+            // If ttl is set to 0 then entries in this table never expire. In that case we look
+            // only at the table's creation time.
+            auto no_entries_older_than =
+                (tat.ttl == 0 ? tat.creation_time : std::max(tat.creation_time, now - std::chrono::seconds(tat.ttl)))
+                    - std::chrono::seconds(10);
+            return no_entries_older_than < ts;
+        });
+    });
+
+    // Find first generation timestamp such that some CDC log table may contain data in this generation.
+    // This and all later generations need to be written to the new streams table.
+    if (first != tss.begin()) {
+        --first; // step back to the generation that was still in effect at the found timestamp
+    }
+
+    if (first == tss.end()) { // only possible when tss is empty
+        cdc_log.info("No generations to rewrite.");
+        co_return std::nullopt;
+    }
+
+    cdc_log.info("First generation to rewrite: {}", *first);
+
+    bool each_success = true; // cleared when a generation is given up on
+    co_await max_concurrent_for_each(first, tss.end(), 10, [&] (db_clock::time_point ts) -> future<> {
+        while (true) { // retry this generation until it succeeds or its data turns out to be missing
+            try {
+                co_return co_await do_update_streams_description(ts, *sys_dist_ks, { get_num_token_owners() });
+            } catch (const no_generation_data_exception& e) {
+                cdc_log.error("Failed to rewrite streams for generation {}: {}. Giving up.", ts, e);
+                each_success = false;
+                co_return;
+            } catch (...) {
+                cdc_log.warn("Failed to rewrite streams for generation {}: {}. Retrying in 60s.", ts, std::current_exception());
+            }
+            co_await sleep_abortable(std::chrono::seconds(60), abort_src); // kept outside the handlers: co_await is not allowed in a catch block
+        }
+    });
+
+    if (each_success) {
+        cdc_log.info("Rewriting stream tables finished successfully.");
+    } else {
+        cdc_log.info("Rewriting stream tables finished, but some generations could not be rewritten (check the logs).");
+    }
+
+    if (first != tss.end()) { // always true here: the `first == tss.end()` case returned early above
+        co_return *std::prev(tss.end()); // tss is sorted, so this is the latest timestamp
+    }
+
+    co_return std::nullopt; // not reached, see the check above
+}
+
+future<> maybe_rewrite_streams_descriptions(
+        const database& db,
+        shared_ptr<db::system_distributed_keyspace> sys_dist_ks,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source& abort_src) {
+    if (!db.has_schema(sys_dist_ks->NAME, sys_dist_ks->CDC_DESC_V1)) {
+        // This cluster never went through a Scylla version which used this table
+        // or the user deleted the table. Nothing to do.
+        co_return;
+    }
+
+    if (co_await db::system_keyspace::cdc_is_rewritten()) { // presumably recorded by cdc_set_rewritten on an earlier run
+        co_return;
+    }
+
+    if (db.get_config().cdc_dont_rewrite_streams()) {
+        cdc_log.warn("Stream rewriting disabled. Manual administrator intervention may be required...");
+        co_return;
+    }
+
+    // For each CDC log table get the TTL setting (from CDC options) and the table's creation time
+    std::vector<time_and_ttl> times_and_ttls;
+    for (auto& [_, cf] : db.get_column_families()) {
+        auto& s = *cf->schema();
+        auto base = cdc::get_base_table(db, s.ks_name(), s.cf_name());
+        if (!base) {
+            // Not a CDC log table.
+            continue;
+        }
+        auto& cdc_opts = base->cdc_options();
+        if (!cdc_opts.enabled()) {
+            // This table is named like a CDC log table but it's not one.
+            continue;
+        }
+
+        times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id()), cdc_opts.ttl()});
+    }
+
+    if (times_and_ttls.empty()) {
+        // There's no point in rewriting old generations' streams (they don't contain any data).
+        cdc_log.info("No CDC log tables present, not rewriting stream tables.");
+        co_return co_await db::system_keyspace::cdc_set_rewritten(std::nullopt);
+    }
+
+    // It's safe to discard this future: the coroutine keeps system_distributed_keyspace alive
+    // and the abort source's lifetime extends the lifetime of any other service.
+    (void)(([_times_and_ttls = std::move(times_and_ttls), _sys_dist_ks = std::move(sys_dist_ks),
+                _get_num_token_owners = std::move(get_num_token_owners), &_abort_src = abort_src] () mutable -> future<> {
+        auto times_and_ttls = std::move(_times_and_ttls); // move the captures into coroutine locals before the first suspension:
+        auto sys_dist_ks = std::move(_sys_dist_ks); // the closure object is a temporary that is destroyed once the call
+        auto get_num_token_owners = std::move(_get_num_token_owners); // expression ends, while these locals live in the coroutine frame
+        auto& abort_src = _abort_src;
+
+        // This code is racing with node startup. At this point, we're most likely still waiting for gossip to settle
+        // and some nodes that are UP may still be marked as DOWN by us.
+        // Let's sleep a bit to increase the chance that the first attempt at rewriting succeeds (it's still ok if
+        // it doesn't - we'll retry - but it's nice if we succeed without any warnings).
+        co_await sleep_abortable(std::chrono::seconds(10), abort_src);
+
+        cdc_log.info("Rewriting stream tables in the background...");
+        auto last_rewritten = co_await rewrite_streams_descriptions(
+                std::move(times_and_ttls),
+                std::move(sys_dist_ks),
+                std::move(get_num_token_owners),
+                abort_src);
+
+        co_await db::system_keyspace::cdc_set_rewritten(last_rewritten); // store the timestamp returned by the rewrite (may be nullopt)
+    })());
+}
+
 } // namespace cdc
--- a/cdc/generation.hh
+++ b/cdc/generation.hh
@@ -41,6 +41,7 @@
 #include "db_clock.hh"
 #include "dht/token.hh"
 #include "locator/token_metadata.hh"
+#include "utils/chunked_vector.hh"

 namespace seastar {
    class abort_source;
@@ -65,6 +66,7 @@ public:

    stream_id() = default;
    stream_id(bytes);
+    stream_id(dht::token, size_t);

    bool is_set() const;
    bool operator==(const stream_id&) const;
@@ -78,9 +80,6 @@ public:

    partition_key to_partition_key(const schema& log_schema) const;
    static int64_t token_from_bytes(bytes_view);
-private:
-    friend class topology_description_generator;
-    stream_id(dht::token, size_t);
 };

 /* Describes a mapping of tokens to CDC streams in a token range.
@@ -113,7 +112,8 @@ public:
    topology_description(std::vector<token_range_description> entries);
    bool operator==(const topology_description&) const;

-    const std::vector<token_range_description>& entries() const;
+    const std::vector<token_range_description>& entries() const&;
+    std::vector<token_range_description>&& entries() &&;
 };

 /**
@@ -122,14 +122,19 @@ public:
 */ 
 class streams_version {
 public:
-    std::vector<stream_id> streams;
+    utils::chunked_vector<stream_id> streams;
    db_clock::time_point timestamp;
-    std::optional<db_clock::time_point> expired;

-    streams_version(std::vector<stream_id> s, db_clock::time_point ts, std::optional<db_clock::time_point> exp)
+    streams_version(utils::chunked_vector<stream_id> s, db_clock::time_point ts)
        : streams(std::move(s))
        , timestamp(ts)
-        , expired(std::move(exp))
+    {}
+};
+
+class no_generation_data_exception : public std::runtime_error {
+public:
+    no_generation_data_exception(db_clock::time_point generation_ts)
+        : std::runtime_error(format("could not find generation data for timestamp {}", generation_ts))
    {}
 };

@@ -194,4 +199,15 @@ void update_streams_description(
        noncopyable_function<unsigned()> get_num_token_owners,
        abort_source&);

+/* Part of the upgrade procedure. Useful in case where the version of Scylla that we're upgrading from
+ * used the "cdc_streams_descriptions" table. This procedure ensures that the new "cdc_streams_descriptions_v2"
+ * table contains streams of all generations that were present in the old table and may still contain data
+ * (i.e. there exist CDC log tables that may contain rows with partition keys being the stream IDs from
+ * these generations). */
+future<> maybe_rewrite_streams_descriptions(
+        const database&,
+        shared_ptr<db::system_distributed_keyspace>,
+        noncopyable_function<unsigned()> get_num_token_owners,
+        abort_source&);
+
 } // namespace cdc
--- a/cdc/log.cc
+++ b/cdc/log.cc
@@ -220,7 +220,7 @@ public:
            auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);

            auto log_mut = log_schema 
-                ? db::schema_tables::make_update_table_mutations(keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
+                ? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
                : db::schema_tables::make_create_table_mutations(keyspace.metadata(), new_log_schema, timestamp)
                ;

@@ -579,11 +579,6 @@ db_context::builder& db_context::builder::with_migration_notifier(service::migra
    return *this;
 }

-db_context::builder& db_context::builder::with_token_metadata(const locator::token_metadata& token_metadata) {
-    _token_metadata = token_metadata;
-    return *this;
-}
-
 db_context::builder& db_context::builder::with_cdc_metadata(cdc::metadata& cdc_metadata) {
    _cdc_metadata = cdc_metadata;
    return *this;
@@ -593,7 +588,6 @@ db_context db_context::builder::build() {
    return db_context{
        _proxy,
        _migration_notifier ? _migration_notifier->get() : service::get_local_storage_service().get_migration_notifier(),
-        _token_metadata ? _token_metadata->get() : service::get_local_storage_service().get_token_metadata(),
        _cdc_metadata ? _cdc_metadata->get() : service::get_local_storage_service().get_cdc_metadata(),
    };
 }
@@ -1297,6 +1291,13 @@ struct process_change_visitor {
                _clustering_row_states, _generate_delta_values);
        visit_row_cells(v);

+        if (_enable_updating_state) {
+            // #7716: if there are no regular columns, our visitor would not have visited any cells,
+            // hence it would not have created a row_state for this row. In effect, postimage wouldn't be produced.
+            // Ensure that the row state exists.
+            _clustering_row_states.try_emplace(ckey);
+        }
+
        _builder.set_operation(log_ck, v._cdc_op);
        _builder.set_ttl(log_ck, v._ttl_column);
    }
--- a/cdc/log.hh
+++ b/cdc/log.hh
@@ -100,19 +100,16 @@ public:
 struct db_context final {
    service::storage_proxy& _proxy;
    service::migration_notifier& _migration_notifier;
-    const locator::token_metadata& _token_metadata;
    cdc::metadata& _cdc_metadata;

    class builder final {
        service::storage_proxy& _proxy;
        std::optional<std::reference_wrapper<service::migration_notifier>> _migration_notifier;
-        std::optional<std::reference_wrapper<const locator::token_metadata>> _token_metadata;
        std::optional<std::reference_wrapper<cdc::metadata>> _cdc_metadata;
    public:
        builder(service::storage_proxy& proxy);

        builder& with_migration_notifier(service::migration_notifier& migration_notifier);
-        builder& with_token_metadata(const locator::token_metadata& token_metadata);
        builder& with_cdc_metadata(cdc::metadata&);

        db_context build();
--- a/cdc/metadata.cc
+++ b/cdc/metadata.cc
@@ -51,7 +51,8 @@ static cdc::stream_id get_stream(
    return entry.streams[shard_id];
 }

-static cdc::stream_id get_stream(
+// non-static for testing
+cdc::stream_id get_stream(
        const std::vector<cdc::token_range_description>& entries,
        dht::token tok) {
    if (entries.empty()) {
--- a/clustering_bounds_comparator.hh
+++ b/clustering_bounds_comparator.hh
@@ -67,8 +67,8 @@ public:
        int operator()(const clustering_key_prefix& p1, int32_t w1, const clustering_key_prefix& p2, int32_t w2) const {
            auto type = _s.get().clustering_key_prefix_type();
            auto res = prefix_equality_tri_compare(type->types().begin(),
-                type->begin(p1), type->end(p1),
-                type->begin(p2), type->end(p2),
+                type->begin(p1.representation()), type->end(p1.representation()),
+                type->begin(p2.representation()), type->end(p2.representation()),
                ::tri_compare);
            if (res) {
                return res;
--- a/collection_mutation.hh
+++ b/collection_mutation.hh
@@ -136,4 +136,4 @@ collection_mutation merge(const abstract_type&, collection_mutation_view, collec
 collection_mutation difference(const abstract_type&, collection_mutation_view, collection_mutation_view);

 // Serializes the given collection of cells to a sequence of bytes ready to be sent over the CQL protocol.
-bytes serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
+bytes_ostream serialize_for_cql(const abstract_type&, collection_mutation_view, cql_serialization_format);
--- a/compound.hh
+++ b/compound.hh
@@ -73,12 +73,19 @@ private:
     *   <len(value1)><value1><len(value2)><value2>...<len(value_n)><value_n>
     *
     */
-    template<typename RangeOfSerializedComponents, typename CharOutputIterator>
-    static void serialize_value(RangeOfSerializedComponents&& values, CharOutputIterator& out) {
+    template<typename RangeOfSerializedComponents, FragmentedMutableView Out>
+    static void serialize_value(RangeOfSerializedComponents&& values, Out out) {
        for (auto&& val : values) {
            assert(val.size() <= std::numeric_limits<size_type>::max());
            write<size_type>(out, size_type(val.size()));
-            out = std::copy(val.begin(), val.end(), out);
+            using val_type = std::remove_cvref_t<decltype(val)>;
+            if constexpr (FragmentedView<val_type>) {
+                write_fragmented(out, val);
+            } else if constexpr (std::same_as<val_type, managed_bytes>) {
+                write_fragmented(out, managed_bytes_view(val));
+            } else {
+                write_fragmented(out, single_fragmented_view(val));
+            }
        }
    }
    template <typename RangeOfSerializedComponents>
@@ -90,25 +97,27 @@ private:
        return len;
    }
 public:
-    bytes serialize_single(bytes&& v) const {
+    managed_bytes serialize_single(managed_bytes&& v) const {
+        return serialize_value({std::move(v)});
+    }
+    managed_bytes serialize_single(bytes&& v) const {
        return serialize_value({std::move(v)});
    }
    template<typename RangeOfSerializedComponents>
-    static bytes serialize_value(RangeOfSerializedComponents&& values) {
+    static managed_bytes serialize_value(RangeOfSerializedComponents&& values) {
        auto size = serialized_size(values);
        if (size > std::numeric_limits<size_type>::max()) {
            throw std::runtime_error(format("Key size too large: {:d} > {:d}", size, std::numeric_limits<size_type>::max()));
        }
-        bytes b(bytes::initialized_later(), size);
-        auto i = b.begin();
-        serialize_value(values, i);
+        managed_bytes b(managed_bytes::initialized_later(), size);
+        serialize_value(values, managed_bytes_mutable_view(b));
        return b;
    }
    template<typename T>
-    static bytes serialize_value(std::initializer_list<T> values) {
+    static managed_bytes serialize_value(std::initializer_list<T> values) {
        return serialize_value(boost::make_iterator_range(values.begin(), values.end()));
    }
-    bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
+    managed_bytes serialize_optionals(const std::vector<bytes_opt>& values) const {
        return serialize_value(values | boost::adaptors::transformed([] (const bytes_opt& bo) -> bytes_view {
            if (!bo) {
                throw std::logic_error("attempted to create key component from empty optional");
@@ -116,7 +125,7 @@ public:
            return *bo;
        }));
    }
-    bytes serialize_value_deep(const std::vector<data_value>& values) const {
+    managed_bytes serialize_value_deep(const std::vector<data_value>& values) const {
        // TODO: Optimize
        std::vector<bytes> partial;
        partial.reserve(values.size());
@@ -127,25 +136,26 @@ public:
        }
        return serialize_value(partial);
    }
-    bytes decompose_value(const value_type& values) const {
+    managed_bytes decompose_value(const value_type& values) const {
        return serialize_value(values);
    }
    class iterator {
    public:
        using iterator_category = std::input_iterator_tag;
-        using value_type = const bytes_view;
+        using value_type = const managed_bytes_view;
        using difference_type = std::ptrdiff_t;
-        using pointer = const bytes_view*;
-        using reference = const bytes_view&;
+        using pointer = const value_type*;
+        using reference = const value_type&;
    private:
-        bytes_view _v;
-        bytes_view _current;
+        managed_bytes_view _v;
+        managed_bytes_view _current;
+        size_t _remaining = 0;
    private:
        void read_current() {
+            _remaining = _v.size_bytes();
            size_type len;
            {
                if (_v.empty()) {
-                    _v = bytes_view(nullptr, 0);
                    return;
                }
                len = read_simple<size_type>(_v);
@@ -153,15 +163,16 @@ public:
                    throw_with_backtrace<marshal_exception>(format("compound_type iterator - not enough bytes, expected {:d}, got {:d}", len, _v.size()));
                }
            }
-            _current = bytes_view(_v.begin(), len);
-            _v.remove_prefix(len);
+            _current = _v.prefix(len);
+            _v.remove_prefix(_current.size_bytes());
        }
    public:
        struct end_iterator_tag {};
-        iterator(const bytes_view& v) : _v(v) {
+        iterator(const managed_bytes_view& v) : _v(v) {
            read_current();
        }
-        iterator(end_iterator_tag, const bytes_view& v) : _v(nullptr, 0) {}
+        iterator(end_iterator_tag, const managed_bytes_view& v) : _v() {}
+        iterator() {}
        iterator& operator++() {
            read_current();
            return *this;
@@ -173,29 +184,40 @@ public:
        }
        const value_type& operator*() const { return _current; }
        const value_type* operator->() const { return &_current; }
-        bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
-        bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
+        bool operator==(const iterator& i) const { return _remaining == i._remaining; }
    };
-    static iterator begin(const bytes_view& v) {
+    static iterator begin(managed_bytes_view v) {
        return iterator(v);
    }
-    static iterator end(const bytes_view& v) {
+    static iterator end(managed_bytes_view v) {
        return iterator(typename iterator::end_iterator_tag(), v);
    }
-    static boost::iterator_range<iterator> components(const bytes_view& v) {
+    static boost::iterator_range<iterator> components(managed_bytes_view v) {
        return { begin(v), end(v) };
    }
-    value_type deserialize_value(bytes_view v) const {
+    value_type deserialize_value(managed_bytes_view v) const {
        std::vector<bytes> result;
        result.reserve(_types.size());
        std::transform(begin(v), end(v), std::back_inserter(result), [] (auto&& v) {
-            return bytes(v.begin(), v.end());
+            return to_bytes(v);
        });
        return result;
    }
+    bool less(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return less(bv1, bv2);
+            });
+        });
+    }
    bool less(bytes_view b1, bytes_view b2) const {
        return compare(b1, b2) < 0;
    }
+    size_t hash(managed_bytes_view v) const { // hands v to with_linearized, then defers to the bytes_view overload
+        return with_linearized(v, [&] (bytes_view bv) {
+            return hash(bv);
+        });
+    }
    size_t hash(bytes_view v) const {
        if (_byte_order_equal) {
            return std::hash<bytes_view>()(v);
@@ -208,6 +230,13 @@ public:
        }
        return h;
    }
+    int compare(managed_bytes_view b1, managed_bytes_view b2) const {
+        return with_linearized(b1, [&] (bytes_view bv1) {
+            return with_linearized(b2, [&] (bytes_view bv2) {
+                return compare(bv1, bv2);
+            });
+        });
+    }
    int compare(bytes_view b1, bytes_view b2) const {
        if (_byte_order_comparable) {
            if (_is_reversed) {
@@ -222,15 +251,21 @@ public:
            });
    }
    // Retruns true iff given prefix has no missing components
-    bool is_full(bytes_view v) const {
+    bool is_full(managed_bytes_view v) const {
        assert(AllowPrefixes == allow_prefixes::yes);
        return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
    }
+    bool is_empty(managed_bytes_view v) const {
+        return v.empty();
+    }
+    bool is_empty(const managed_bytes& v) const {
+        return v.empty();
+    }
    bool is_empty(bytes_view v) const {
        return begin(v) == end(v);
    }
-    void validate(bytes_view v) const {
-        std::vector<bytes_view> values(begin(v), end(v));
+    void validate(managed_bytes_view v) const {
+        std::vector<managed_bytes_view> values(begin(v), end(v));
        if (AllowPrefixes == allow_prefixes::no && values.size() < _types.size()) {
            throw marshal_exception(fmt::format("compound::validate(): non-prefixable compound cannot be a prefix"));
        }
@@ -243,6 +278,13 @@ public:
            _types[i]->validate(values[i], cql_serialization_format::internal());
        }
    }
+    bool equal(managed_bytes_view v1, managed_bytes_view v2) const {
+        return with_linearized(v1, [&] (bytes_view bv1) {
+            return with_linearized(v2, [&] (bytes_view bv2) {
+                return equal(bv1, bv2);
+            });
+        });
+    }
    bool equal(bytes_view v1, bytes_view v2) const {
        if (_byte_order_equal) {
            return compare_unsigned(v1, v2) == 0;
--- a/compound_compat.hh
+++ b/compound_compat.hh
@@ -54,9 +54,9 @@ template <typename CompoundType>
 class legacy_compound_view {
    static_assert(!CompoundType::is_prefixable, "Legacy view not defined for prefixes");
    CompoundType& _type;
-    bytes_view _packed;
+    managed_bytes_view _packed;
 public:
-    legacy_compound_view(CompoundType& c, bytes_view packed)
+    legacy_compound_view(CompoundType& c, managed_bytes_view packed)
        : _type(c)
        , _packed(packed)
    { }
@@ -147,18 +147,18 @@ public:
        { }

        // @k1 and @k2 must be serialized using @type, which was passed to the constructor.
-        int operator()(bytes_view k1, bytes_view k2) const {
+        int operator()(managed_bytes_view k1, managed_bytes_view k2) const {
            if (_type.is_singular()) {
                return compare_unsigned(*_type.begin(k1), *_type.begin(k2));
            }
            return lexicographical_tri_compare(
                _type.begin(k1), _type.end(k1),
                _type.begin(k2), _type.end(k2),
-                [] (const bytes_view& c1, const bytes_view& c2) -> int {
+                [] (const managed_bytes_view& c1, const managed_bytes_view& c2) -> int {
                    if (c1.size() != c2.size() || !c1.size()) {
                        return c1.size() < c2.size() ? -1 : c1.size() ? 1 : 0;
                    }
-                    return memcmp(c1.begin(), c2.begin(), c1.size());
+                    return compare_unsigned(c1, c2);
                });
        }
    };
@@ -188,7 +188,7 @@ public:
 // @packed is assumed to be serialized using supplied @type.
 template <typename CompoundType>
 static inline
-bytes to_legacy(CompoundType& type, bytes_view packed) {
+bytes to_legacy(CompoundType& type, managed_bytes_view packed) {
    legacy_compound_view<CompoundType> lv(type, packed);
    bytes legacy_form(bytes::initialized_later(), lv.size());
    std::copy(lv.begin(), lv.end(), legacy_form.begin());
@@ -264,6 +264,12 @@ private:
    static void write_value(Value&& val, CharOutputIterator& out) {
        out = std::copy(val.begin(), val.end(), out);
    }
+    template<typename CharOutputIterator>
+    static void write_value(managed_bytes_view val, CharOutputIterator& out) {
+        for (bytes_view frag : fragment_range(val)) {
+            out = std::copy(frag.begin(), frag.end(), out);
+        }
+    }
    template <typename CharOutputIterator>
    static void write_value(const data_value& val, CharOutputIterator& out) {
        val.serialize(out);
@@ -405,6 +411,7 @@ public:
        iterator(end_iterator_tag) : _v(nullptr, 0) {}

    public:
+        iterator() : iterator(end_iterator_tag()) {}
        iterator& operator++() {
            read_current();
            return *this;
--- a/configure.py
+++ b/configure.py
@@ -59,6 +59,9 @@ i18n_xlat = {
 }

 python3_dependencies = subprocess.run('./install-dependencies.sh --print-python3-runtime-packages', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_filename = subprocess.run('./install-dependencies.sh --print-node-exporter-filename', shell=True, capture_output=True, encoding='utf-8').stdout.strip()
+node_exporter_dirname = os.path.basename(node_exporter_filename).rsplit('.tar.gz', 1)[0]  # cut the '.tar.gz' suffix; str.rstrip() would strip a character *set* and can eat into the name
+

 def pkgname(name):
    if name in i18n_xlat:
@@ -262,7 +265,7 @@ modes = {
        'stack-usage-threshold': 1024*13,
    },
    'dev': {
-        'cxxflags': '-O1 -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
+        'cxxflags': '-O1 -DDEVEL -DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
        'cxx_ld_flags': '',
        'stack-usage-threshold': 1024*21,
    },
@@ -275,6 +278,7 @@ modes = {

 scylla_tests = set([
    'test/boost/UUID_test',
+    'test/boost/cdc_generation_test',
    'test/boost/aggregate_fcts_test',
    'test/boost/allocation_strategy_test',
    'test/boost/alternator_base64_test',
@@ -329,6 +333,7 @@ scylla_tests = set([
    'test/boost/gossip_test',
    'test/boost/gossiping_property_file_snitch_test',
    'test/boost/hash_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
    'test/boost/imr_test',
    'test/boost/input_stream_test',
@@ -344,6 +349,7 @@ scylla_tests = set([
    'test/boost/estimated_histogram_test',
    'test/boost/logalloc_test',
    'test/boost/managed_vector_test',
+    'test/boost/managed_bytes_test',
    'test/boost/intrusive_array_test',
    'test/boost/map_difference_test',
    'test/boost/memtable_test',
@@ -450,6 +456,7 @@ apps = set([
    'scylla',
    'test/tools/cql_repl',
    'tools/scylla-types',
+    'tools/scylla-sstable-index',
 ])

 tests = scylla_tests | perf_tests | raft_tests
@@ -575,6 +582,7 @@ scylla_core = (['database.cc',
                'sstables/mp_row_consumer.cc',
                'sstables/sstables.cc',
                'sstables/sstables_manager.cc',
+                'sstables/sstable_set.cc',
                'sstables/mx/writer.cc',
                'sstables/kl/writer.cc',
                'sstables/sstable_version.cc',
@@ -847,6 +855,7 @@ scylla_core = (['database.cc',
                'utils/error_injection.cc',
                'mutation_writer/timestamp_based_splitting_writer.cc',
                'mutation_writer/shard_based_splitting_writer.cc',
+                'mutation_writer/feed_writers.cc',
                'lua.cc',
                ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
               )
@@ -982,6 +991,7 @@ deps = {
    'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
    #FIXME: we don't need all of scylla_core here, only the types module, need to modularize scylla_core.
    'tools/scylla-types': idls + ['tools/scylla-types.cc'] + scylla_core,
+    'tools/scylla-sstable-index': idls + ['tools/scylla-sstable-index.cc'] + scylla_core,
 }

 pure_boost_tests = set([
@@ -1001,6 +1011,7 @@ pure_boost_tests = set([
    'test/boost/dynamic_bitset_test',
    'test/boost/enum_option_test',
    'test/boost/enum_set_test',
+    'test/boost/hashers_test',
    'test/boost/idl_test',
    'test/boost/json_test',
    'test/boost/keys_test',
@@ -1017,6 +1028,7 @@ pure_boost_tests = set([
    'test/boost/top_k_test',
    'test/boost/vint_serialization_test',
    'test/boost/bptree_test',
+    'test/boost/utf8_test',
    'test/manual/streaming_histogram_test',
 ])

@@ -1139,7 +1151,6 @@ warnings = [
    '-Wno-delete-non-abstract-non-virtual-dtor',
    '-Wno-unknown-attributes',
    '-Wno-braced-scalar-init',
-    '-Wno-unused-value',
    '-Wno-range-loop-construct',
    '-Wno-unused-function',
    '-Wno-implicit-int-float-conversion',
@@ -1798,7 +1809,7 @@ with open(buildfile_tmp, 'w') as f:
        f.write(textwrap.dedent('''\
            build $builddir/{mode}/iotune: copy $builddir/{mode}/seastar/apps/iotune/iotune
            ''').format(**locals()))
-        f.write('build $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian | always\n'.format(**locals()))
+        f.write('build $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz: package $builddir/{mode}/scylla $builddir/{mode}/iotune $builddir/SCYLLA-RELEASE-FILE $builddir/SCYLLA-VERSION-FILE $builddir/debian/debian $builddir/node_exporter | always\n'.format(**locals()))
        f.write('  mode = {mode}\n'.format(**locals()))
        f.write(f'build $builddir/dist/{mode}/redhat: rpmbuild $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
        f.write(f'  mode = {mode}\n')
@@ -1957,6 +1968,9 @@ with open(buildfile_tmp, 'w') as f:
        rule debian_files_gen
            command = ./dist/debian/debian_files_gen.py
        build $builddir/debian/debian: debian_files_gen | always
+        rule extract_node_exporter
+            command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
+        build $builddir/node_exporter: extract_node_exporter | always
        ''').format(**globals()))

 os.rename(buildfile_tmp, buildfile)
--- a/counters.cc
+++ b/counters.cc
@@ -19,16 +19,10 @@
 * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "service/storage_service.hh"
 #include "counters.hh"
 #include "mutation.hh"
 #include "combine.hh"

-counter_id counter_id::local()
-{
-    return counter_id(service::get_local_storage_service().get_local_id());
-}
-
 std::ostream& operator<<(std::ostream& os, const counter_id& id) {
    return os << id.to_uuid();
 }
@@ -197,10 +191,10 @@ std::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, ato
 }


-void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
+void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id) {
    // FIXME: allow current_state to be frozen_mutation

-    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& cells) {
+    auto transform_new_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& cells) {
        cells.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
            auto acv = ac_o_c.as_atomic_cell(cdef);
@@ -208,7 +202,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
            auto delta = acv.counter_update_value();
-            auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+            auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
            ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
        });
    };
@@ -223,7 +217,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st

    clustering_key::less_compare cmp(*m.schema());

-    auto transform_row_to_shards = [&s = *m.schema(), clock_offset] (column_kind kind, auto& transformee, auto& state) {
+    auto transform_row_to_shards = [&s = *m.schema(), clock_offset, local_id] (column_kind kind, auto& transformee, auto& state) {
        std::deque<std::pair<column_id, counter_shard>> shards;
        state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
            auto& cdef = s.column_at(kind, id);
@@ -232,7 +226,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
                return; // continue -- we are in lambda
            }
          counter_cell_view::with_linearized(acv, [&] (counter_cell_view ccv) {
-            auto cs = ccv.local_shard();
+            auto cs = ccv.get_shard(counter_id(local_id));
            if (!cs) {
                return; // continue
            }
@@ -253,7 +247,7 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
            auto delta = acv.counter_update_value();

            if (shards.empty() || shards.front().first > id) {
-                auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
+                auto cs = counter_shard(counter_id(local_id), delta, clock_offset + 1);
                ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
            } else {
                auto& cs = shards.front().second;
--- a/counters.hh
+++ b/counters.hh
@@ -61,8 +61,6 @@ public:
        return !(*this == other);
    }
 public:
-    static counter_id local();
-
    // For tests.
    static counter_id generate_random() {
        return counter_id(utils::make_random_uuid());
@@ -405,11 +403,6 @@ public:
        return *it;
    }

-    std::optional<counter_shard_view> local_shard() const {
-        // TODO: consider caching local shard position
-        return get_shard(counter_id::local());
-    }
-
    bool operator==(const basic_counter_cell_view& other) const {
        return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
    }
@@ -451,7 +444,7 @@ struct counter_cell_mutable_view : basic_counter_cell_view<mutable_view::yes> {
 // Transforms mutation dst from counter updates to counter shards using state
 // stored in current_state.
 // If current_state is present it has to be in the same schema as dst.
-void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset);
+void transform_counter_updates_to_shards(mutation& dst, const mutation* current_state, uint64_t clock_offset, utils::UUID local_id);

 template<>
 struct appending_hash<counter_shard_view> {
--- a/cql3/Cql.g
+++ b/cql3/Cql.g
@@ -394,6 +394,7 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
        bool allow_filtering = false;
        bool is_json = false;
        bool bypass_cache = false;
+        auto attrs = std::make_unique<cql3::attributes::raw>();
    }
    : K_SELECT (
                ( K_JSON { is_json = true; } )?
@@ -408,11 +409,12 @@ selectStatement returns [std::unique_ptr<raw::select_statement> expr]
      ( K_LIMIT rows=intValue { limit = rows; } )?
      ( K_ALLOW K_FILTERING  { allow_filtering = true; } )?
      ( K_BYPASS K_CACHE { bypass_cache = true; })?
+      ( usingClause[attrs] )?
      {
          auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
          $expr = std::make_unique<raw::select_statement>(std::move(cf), std::move(params),
            std::move(sclause), std::move(wclause), std::move(limit), std::move(per_partition_limit),
-            std::move(gbcolumns));
+            std::move(gbcolumns), std::move(attrs));
      }
    ;

@@ -521,6 +523,7 @@ usingClause[std::unique_ptr<cql3::attributes::raw>& attrs]
 usingClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs]
    : K_TIMESTAMP ts=intValue { attrs->timestamp = ts; }
    | K_TTL t=intValue { attrs->time_to_live = t; }
+    | K_TIMEOUT to=term { attrs->timeout = to; }
    ;

 /**
@@ -1761,6 +1764,7 @@ basic_unreserved_keyword returns [sstring str]
        | K_PER
        | K_PARTITION
        | K_GROUP
+        | K_TIMEOUT
        ) { $str = $k.text; }
    ;

@@ -1916,6 +1920,8 @@ K_GROUP:       G R O U P;

 K_LIKE:        L I K E;

+K_TIMEOUT:     T I M E O U T;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
--- a/cql3/abstract_marker.cc
+++ b/cql3/abstract_marker.cc
@@ -70,11 +70,11 @@ abstract_marker::raw::raw(int32_t bind_index)
 ::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
 {
    if (receiver->type->is_collection()) {
-        if (receiver->type->get_kind() == abstract_type::kind::list) {
+        if (receiver->type->without_reversed().is_list()) {
            return ::make_shared<lists::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::set) {
+        } else if (receiver->type->without_reversed().is_set()) {
            return ::make_shared<sets::marker>(_bind_index, receiver);
-        } else if (receiver->type->get_kind() == abstract_type::kind::map) {
+        } else if (receiver->type->without_reversed().is_map()) {
            return ::make_shared<maps::marker>(_bind_index, receiver);
        }
        assert(0);
--- a/cql3/attributes.cc
+++ b/cql3/attributes.cc
@@ -44,12 +44,13 @@
 namespace cql3 {

 std::unique_ptr<attributes> attributes::none() {
-    return std::unique_ptr<attributes>{new attributes{{}, {}}};
+    return std::unique_ptr<attributes>{new attributes{{}, {}, {}}};
 }

-attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live)
+attributes::attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout)
    : _timestamp{std::move(timestamp)}
    , _time_to_live{std::move(time_to_live)}
+    , _timeout{std::move(timeout)}
 { }

 bool attributes::is_timestamp_set() const {
@@ -60,6 +61,10 @@ bool attributes::is_time_to_live_set() const {
    return bool(_time_to_live);
 }

+bool attributes::is_timeout_set() const {
+    return bool(_timeout);
+}
+
 int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (!_timestamp) {
        return now;
@@ -72,14 +77,12 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
    if (tval.is_unset_value()) {
        return now;
    }
-  return with_linearized(*tval, [&] (bytes_view val) {
    try {
-        data_type_for<int64_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int64_t>()->validate(*tval, options.get_cql_serialization_format());
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid timestamp value");
    }
-    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
-  });
+    return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
 }

 int32_t attributes::get_time_to_live(const query_options& options) {
@@ -93,16 +96,15 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    if (tval.is_unset_value()) {
        return 0;
    }
-  auto ttl = with_linearized(*tval, [&] (bytes_view val) {
+
    try {
-        data_type_for<int32_t>()->validate(val, options.get_cql_serialization_format());
+        data_type_for<int32_t>()->validate(*tval, options.get_cql_serialization_format());
    }
    catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid TTL value");
    }
+    auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));

-    return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
-  });
    if (ttl < 0) {
        throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
    }
@@ -115,6 +117,25 @@ int32_t attributes::get_time_to_live(const query_options& options) {
    return ttl;
 }

+
+db::timeout_clock::duration attributes::get_timeout(const query_options& options) const {
+    auto timeout = _timeout->bind_and_get(options);
+    if (timeout.is_null() || timeout.is_unset_value()) {
+        throw exceptions::invalid_request_exception("Timeout value cannot be unset/null");
+    }
+    cql_duration duration = value_cast<cql_duration>(duration_type->deserialize(*timeout));
+    if (duration.months || duration.days) {
+        throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
+    }
+    if (duration.nanoseconds % 1'000'000 != 0) {
+        throw exceptions::invalid_request_exception("Timeout values cannot have granularity finer than milliseconds");
+    }
+    if (duration.nanoseconds < 0) {
+        throw exceptions::invalid_request_exception("Timeout values must be non-negative");
+    }
+    return std::chrono::duration_cast<db::timeout_clock::duration>(std::chrono::nanoseconds(duration.nanoseconds));
+}
+
 void attributes::collect_marker_specification(variable_specifications& bound_names) const {
    if (_timestamp) {
        _timestamp->collect_marker_specification(bound_names);
@@ -122,12 +143,16 @@ void attributes::collect_marker_specification(variable_specifications& bound_nam
    if (_time_to_live) {
        _time_to_live->collect_marker_specification(bound_names);
    }
+    if (_timeout) {
+        _timeout->collect_marker_specification(bound_names);
+    }
 }

 std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring& ks_name, const sstring& cf_name) const {
    auto ts = !timestamp ? ::shared_ptr<term>{} : timestamp->prepare(db, ks_name, timestamp_receiver(ks_name, cf_name));
    auto ttl = !time_to_live ? ::shared_ptr<term>{} : time_to_live->prepare(db, ks_name, time_to_live_receiver(ks_name, cf_name));
-    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl)}};
+    auto to = !timeout ? ::shared_ptr<term>{} : timeout->prepare(db, ks_name, timeout_receiver(ks_name, cf_name));
+    return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl), std::move(to)}};
 }

 lw_shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
@@ -138,4 +163,8 @@ lw_shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const
    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
 }

+lw_shared_ptr<column_specification> attributes::raw::timeout_receiver(const sstring& ks_name, const sstring& cf_name) const {
+    return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timeout]", true), duration_type);
+}
+
 }
--- a/cql3/attributes.hh
+++ b/cql3/attributes.hh
@@ -54,31 +54,39 @@ class attributes final {
 private:
    const ::shared_ptr<term> _timestamp;
    const ::shared_ptr<term> _time_to_live;
+    const ::shared_ptr<term> _timeout;
 public:
    static std::unique_ptr<attributes> none();
 private:
-    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live);
+    attributes(::shared_ptr<term>&& timestamp, ::shared_ptr<term>&& time_to_live, ::shared_ptr<term>&& timeout);
 public:
    bool is_timestamp_set() const;

    bool is_time_to_live_set() const;

+    bool is_timeout_set() const;
+
    int64_t get_timestamp(int64_t now, const query_options& options);

    int32_t get_time_to_live(const query_options& options);

+    db::timeout_clock::duration get_timeout(const query_options& options) const;
+
    void collect_marker_specification(variable_specifications& bound_names) const;

    class raw final {
    public:
        ::shared_ptr<term::raw> timestamp;
        ::shared_ptr<term::raw> time_to_live;
+        ::shared_ptr<term::raw> timeout;

        std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name) const;
    private:
        lw_shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;

        lw_shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
+
+        lw_shared_ptr<column_specification> timeout_receiver(const sstring& ks_name, const sstring& cf_name) const;
    };
 };

--- a/cql3/constants.hh
+++ b/cql3/constants.hh
@@ -192,9 +192,12 @@ public:

        virtual ::shared_ptr<terminal> bind(const query_options& options) override {
            auto bytes = bind_and_get(options);
-            if (!bytes) {
+            if (bytes.is_null()) {
                return ::shared_ptr<terminal>{};
            }
+            if (bytes.is_unset_value()) {
+                return UNSET_VALUE;
+            }
            return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
        }
    };
@@ -227,9 +230,7 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = with_linearized(*value, [] (bytes_view value_view) {
-                return value_cast<int64_t>(long_type->deserialize_value(value_view));
-            });
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
            m.set_cell(prefix, column, make_counter_update_cell(increment, params));
        }
    };
@@ -244,9 +245,7 @@ public:
            } else if (value.is_unset_value()) {
                return;
            }
-            auto increment = with_linearized(*value, [] (bytes_view value_view) {
-                return value_cast<int64_t>(long_type->deserialize_value(value_view));
-            });
+            auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
            if (increment == std::numeric_limits<int64_t>::min()) {
                throw exceptions::invalid_request_exception(format("The negation of {:d} overflows supported counter precision (signed 8 bytes integer)", increment));
            }
--- a/cql3/expr/expression.cc
+++ b/cql3/expr/expression.cc
@@ -27,7 +27,9 @@
 #include <fmt/ostream.h>
 #include <unordered_map>

+#include "cql3/constants.hh"
 #include "cql3/lists.hh"
+#include "cql3/statements/request_validations.hh"
 #include "cql3/tuples.hh"
 #include "index/secondary_index_manager.hh"
 #include "types/list.hh"
@@ -43,7 +45,8 @@ using boost::adaptors::transformed;

 namespace {

-std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
+static
+bytes_opt do_get_value(const schema& schema,
        const column_definition& cdef,
        const partition_key& key,
        const clustering_key_prefix& ckey,
@@ -51,9 +54,9 @@ std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
        gc_clock::time_point now) {
    switch (cdef.kind) {
        case column_kind::partition_key:
-            return atomic_cell_value_view(key.get_component(schema, cdef.component_index()));
+            return to_bytes(key.get_component(schema, cdef.component_index()));
        case column_kind::clustering_key:
-            return atomic_cell_value_view(ckey.get_component(schema, cdef.component_index()));
+            return to_bytes(ckey.get_component(schema, cdef.component_index()));
        default:
            auto cell = cells.find_cell(cdef.id);
            if (!cell) {
@@ -61,7 +64,7 @@ std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
            }
            assert(cdef.is_atomic());
            auto c = cell->as_atomic_cell(cdef);
-            return c.is_dead(now) ? std::nullopt : std::optional<atomic_cell_value_view>(c.value());
+            return c.is_dead(now) ? std::nullopt : bytes_opt(c.value().linearize());
    }
 }

@@ -138,9 +141,8 @@ bytes_opt get_value_from_partition_slice(

 /// Returns col's value from a mutation.
 bytes_opt get_value_from_mutation(const column_value& col, row_data_from_mutation data) {
-    const auto v = do_get_value(
+    return do_get_value(
            data.schema_, *col.col, data.partition_key_, data.clustering_key_, data.other_columns, data.now);
-    return v ? v->linearize() : bytes_opt();
 }

 /// Returns col's value from the fetched data.
@@ -154,7 +156,7 @@ bytes_opt get_value(const column_value& col, const column_value_eval_bag& bag) {

 /// Type for comparing results of get_value().
 const abstract_type* get_value_comparator(const column_definition* cdef) {
-    return cdef->type->is_reversed() ? cdef->type->underlying_type().get() : cdef->type.get();
+    return &cdef->type->without_reversed();
 }

 /// Type for comparing results of get_value().
@@ -355,16 +357,12 @@ bytes_opt next_value(query::result_row_view::iterator_type& iter, const column_d
    if (cdef->type->is_multi_cell()) {
        auto cell = iter.next_collection_cell();
        if (cell) {
-            return cell->with_linearized([] (bytes_view data) {
-                return bytes(data.cbegin(), data.cend());
-            });
+            return linearized(*cell);
        }
    } else {
        auto cell = iter.next_atomic_cell();
        if (cell) {
-            return cell->value().with_linearized([] (bytes_view data) {
-                return bytes(data.cbegin(), data.cend());
-            });
+            return linearized(cell->value());
        }
    }
    return std::nullopt;
@@ -417,6 +415,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
    } else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
        // This is `a IN ?`.  RHS elements are values representable as bytes_opt.
        const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
+        statements::request_validations::check_not_null(
+                values, "Invalid null value for column %s", col.col->name_as_text());
        return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
                return equal(b, col, bag);
            });
@@ -568,7 +568,8 @@ const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return

 /// Returns possible values from t, which must be RHS of IN.
 value_list get_IN_values(
-        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator) {
+        const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
+        sstring_view column_name) {
    // RHS is prepared differently for different CQL cases.  Cast it dynamically to discern which case this is.
    if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
        // Case `a IN (1,2,3)`.
@@ -578,8 +579,12 @@ value_list get_IN_values(
        return to_sorted_vector(std::move(result_range), comparator);
    } else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
        // Case `a IN ?`.  Collect all list-element values.
-        const auto val = static_pointer_cast<lists::value>(mkr->bind(options));
-        return to_sorted_vector(val->get_elements() | non_null | deref, comparator);
+        const auto val = mkr->bind(options);
+        if (val == constants::UNSET_VALUE) {
+            throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
+        }
+        statements::request_validations::check_not_null(val, "Invalid null value for column %s", column_name);
+        return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
    }
    throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
 }
@@ -686,7 +691,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
                                return oper.op == oper_t::EQ ? value_set(value_list{*val})
                                        : to_range(oper.op, *val);
                            } else if (oper.op == oper_t::IN) {
-                                return get_IN_values(oper.rhs, options, type->as_less_comparator());
+                                return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
                            }
                            throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
                        },
@@ -776,9 +781,11 @@ bool is_supported_by(const expression& expr, const secondary_index::index& idx)
                            return idx.supports_expression(*col.col, oper.op);
                        },
                        [&] (const std::vector<column_value>& cvs) {
-                            return boost::algorithm::any_of(cvs, [&] (const column_value& c) {
-                                return idx.supports_expression(*c.col, oper.op);
-                            });
+                            if (cvs.size() == 1) {
+                                return idx.supports_expression(*cvs[0].col, oper.op);
+                            }
+                            // We don't use index table for multi-column restrictions, as it cannot avoid filtering.
+                            return false;
                        },
                        [&] (const token&) { return false; },
                    }, oper.lhs);
--- a/cql3/functions/aggregate_fcts.cc
+++ b/cql3/functions/aggregate_fcts.cc
@@ -219,7 +219,7 @@ struct aggregate_type_for<simple_date_native_type> {

 template<>
 struct aggregate_type_for<timeuuid_native_type> {
-    using type = timeuuid_native_type::primary_type;
+    using type = timeuuid_native_type;
 };

 template<>
@@ -227,6 +227,7 @@ struct aggregate_type_for<time_native_type> {
    using type = time_native_type::primary_type;
 };

+// WARNING: never invoke this on temporary values; it will return a dangling reference.
 template <typename Type>
 const Type& max_wrapper(const Type& t1, const Type& t2) {
    using std::max;
@@ -241,6 +242,10 @@ inline const net::inet_address& max_wrapper(const net::inet_address& t1, const n
    return std::memcmp(t1.data(), t2.data(), len) >= 0 ? t1 : t2;
 }

+inline const timeuuid_native_type& max_wrapper(const timeuuid_native_type& t1, const timeuuid_native_type& t2) {
+    return t1.uuid.timestamp() > t2.uuid.timestamp() ? t1 : t2;
+}
+
 template <typename Type>
 class impl_max_function_for final : public aggregate_function::aggregate {
   std::optional<typename aggregate_type_for<Type>::type> _max{};
@@ -323,6 +328,7 @@ make_max_function() {
    return make_shared<max_function_for<Type>>();
 }

+// WARNING: never invoke this on temporary values; it will return a dangling reference.
 template <typename Type>
 const Type& min_wrapper(const Type& t1, const Type& t2) {
    using std::min;
@@ -337,6 +343,10 @@ inline const net::inet_address& min_wrapper(const net::inet_address& t1, const n
    return std::memcmp(t1.data(), t2.data(), len) <= 0 ? t1 : t2;
 }

+inline timeuuid_native_type min_wrapper(timeuuid_native_type t1, timeuuid_native_type t2) {
+    return t1.uuid.timestamp() < t2.uuid.timestamp() ? t1 : t2;
+}
+
 template <typename Type>
 class impl_min_function_for final : public aggregate_function::aggregate {
   std::optional<typename aggregate_type_for<Type>::type> _min{};
--- a/cql3/lists.cc
+++ b/cql3/lists.cc
@@ -40,7 +40,7 @@ lw_shared_ptr<column_specification>
 lists::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
            ::make_shared<column_identifier>(format("value({})", *column.name), true),
-                dynamic_pointer_cast<const list_type_impl>(column.type)->get_elements_type());
+                dynamic_cast<const list_type_impl&>(column.type->without_reversed()).get_elements_type());
 }

 lw_shared_ptr<column_specification>
@@ -87,7 +87,7 @@ lists::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<col

 void
 lists::literal::validate_assignable_to(database& db, const sstring keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const list_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_list()) {
        throw exceptions::invalid_request_exception(format("Invalid list literal for {} of type {}",
                *receiver.name, receiver.type->as_cql3_type()));
    }
@@ -125,18 +125,11 @@ lists::literal::to_string() const {

 lists::value
 lists::value::from_serialized(const fragmented_temporary_buffer::view& val, const list_type_impl& type, cql_serialization_format sf) {
-    return with_linearized(val, [&] (bytes_view v) {
-        return from_serialized(v, type, sf);
-    });
-}
-
-lists::value
-lists::value::from_serialized(bytes_view v, const list_type_impl& type, cql_serialization_format sf) {
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol()?!
-        auto l = value_cast<list_type_impl::native_type>(type.deserialize(v, sf));
+        auto l = value_cast<list_type_impl::native_type>(type.deserialize(val, sf));
        std::vector<bytes_opt> elements;
        elements.reserve(l.size());
        for (auto&& element : l) {
@@ -227,17 +220,15 @@ lists::delayed_value::bind(const query_options& options) {
 ::shared_ptr<terminal>
 lists::marker::bind(const query_options& options) {
    const auto& value = options.get_value_at(_bind_index);
-    auto& ltype = static_cast<const list_type_impl&>(*_receiver->type);
+    auto& ltype = dynamic_cast<const list_type_impl&>(_receiver->type->without_reversed());
    if (value.is_null()) {
        return nullptr;
    } else if (value.is_unset_value()) {
        return constants::UNSET_VALUE;
    } else {
        try {
-            return with_linearized(*value, [&] (bytes_view v) {
-                ltype.validate(v, options.get_cql_serialization_format());
-                return make_shared<lists::value>(value::from_serialized(v, ltype, options.get_cql_serialization_format()));
-            });
+            ltype.validate(*value, options.get_cql_serialization_format());
+            return make_shared<lists::value>(value::from_serialized(*value, ltype, options.get_cql_serialization_format()));
        } catch (marshal_exception& e) {
            throw exceptions::invalid_request_exception(
                    format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
@@ -308,9 +299,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
        return;
    }

-    auto idx = with_linearized(*index, [] (bytes_view v) {
-        return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(v));
-    });
+    auto idx = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*index));
    auto&& existing_list_opt = params.get_prefetched_list(m.key(), prefix, column);
    if (!existing_list_opt) {
        throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
--- a/cql3/lists.hh
+++ b/cql3/lists.hh
@@ -73,7 +73,6 @@ public:
    };

    class value : public multi_item_terminal, collection_terminal {
-        static value from_serialized(bytes_view v, const list_type_impl& type, cql_serialization_format sf);
    public:
        std::vector<bytes_opt> _elements;
    public:
--- a/cql3/maps.cc
+++ b/cql3/maps.cc
@@ -55,14 +55,14 @@ lw_shared_ptr<column_specification>
 maps::key_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
                ::make_shared<column_identifier>(format("key({})", *column.name), true),
-                 dynamic_pointer_cast<const map_type_impl>(column.type)->get_keys_type());
+                dynamic_cast<const map_type_impl&>(column.type->without_reversed()).get_keys_type());
 }

 lw_shared_ptr<column_specification>
 maps::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
                ::make_shared<column_identifier>(format("value({})", *column.name), true),
-                 dynamic_pointer_cast<const map_type_impl>(column.type)->get_values_type());
+                 dynamic_cast<const map_type_impl&>(column.type->without_reversed()).get_values_type());
 }

 ::shared_ptr<term>
@@ -88,7 +88,9 @@ maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

        values.emplace(k, v);
    }
-    delayed_value value(static_pointer_cast<const map_type_impl>(receiver->type)->get_keys_type()->as_less_comparator(), values);
+    delayed_value value(
+            dynamic_cast<const map_type_impl&>(receiver->type->without_reversed()).get_keys_type()->as_less_comparator(),
+            values);
    if (all_terminal) {
        return value.bind(query_options::DEFAULT);
    } else {
@@ -98,7 +100,7 @@ maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

 void
 maps::literal::validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const map_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_map()) {
        throw exceptions::invalid_request_exception(format("Invalid map literal for {} of type {}", *receiver.name, receiver.type->as_cql3_type()));
    }
    auto&& key_spec = maps::key_spec_of(receiver);
@@ -158,15 +160,13 @@ maps::value::from_serialized(const fragmented_temporary_buffer::view& fragmented
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserialize_for_native_protocol?!
-      return with_linearized(fragmented_value, [&] (bytes_view value) {
-        auto m = value_cast<map_type_impl::native_type>(type.deserialize(value, sf));
+        auto m = value_cast<map_type_impl::native_type>(type.deserialize(fragmented_value, sf));
        std::map<bytes, bytes, serialized_compare> map(type.get_keys_type()->as_less_comparator());
        for (auto&& e : m) {
            map.emplace(type.get_keys_type()->decompose(e.first),
                        type.get_values_type()->decompose(e.second));
        }
        return maps::value { std::move(map) };
-      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -263,14 +263,16 @@ maps::marker::bind(const query_options& options) {
        return constants::UNSET_VALUE;
    }
    try {
-        with_linearized(*val, [&] (bytes_view value) {
-            _receiver->type->validate(value, options.get_cql_serialization_format());
-        });
+        _receiver->type->validate(*val, options.get_cql_serialization_format());
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(
                format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
    }
-    return ::make_shared<maps::value>(maps::value::from_serialized(*val, static_cast<const map_type_impl&>(*_receiver->type), options.get_cql_serialization_format()));
+    return ::make_shared<maps::value>(
+            maps::value::from_serialized(
+                    *val,
+                    dynamic_cast<const map_type_impl&>(_receiver->type->without_reversed()),
+                    options.get_cql_serialization_format()));
 }

 void
@@ -305,6 +307,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
    assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
    auto key = _k->bind_and_get(params._options);
    auto value = _t->bind_and_get(params._options);
+    if (value.is_unset_value()) {
+        return; // nothing to write when the bound value is unset
+    }
+    if (key.is_unset_value()) { // value is known to be set here, so only the key needs checking
+        throw invalid_request_exception("Invalid unset map key");
+    }
    if (!key) {
        throw invalid_request_exception("Invalid null map key");
    }
--- a/cql3/query_options.cc
+++ b/cql3/query_options.cc
@@ -42,12 +42,14 @@
 #include "cql3/cql_config.hh"
 #include "query_options.hh"
 #include "version.hh"
+#include "db/consistency_level_type.hh"

 namespace cql3 {

 const cql_config default_cql_config;

-thread_local const query_options::specific_options query_options::specific_options::DEFAULT{-1, {}, {}, api::missing_timestamp};
+thread_local const query_options::specific_options query_options::specific_options::DEFAULT{
+    -1, {}, db::consistency_level::SERIAL, api::missing_timestamp};

 thread_local query_options query_options::DEFAULT{default_cql_config,
    db::consistency_level::ONE, infinite_timeout_config, std::nullopt,
--- a/cql3/query_processor.cc
+++ b/cql3/query_processor.cc
@@ -668,10 +668,14 @@ struct internal_query_state {
    bool more_results = true;
 };

-::shared_ptr<internal_query_state> query_processor::create_paged_state(const sstring& query_string,
-        const std::initializer_list<data_value>& values, int32_t page_size) {
+::shared_ptr<internal_query_state> query_processor::create_paged_state(
+        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
+        const std::initializer_list<data_value>& values,
+        int32_t page_size) {
    auto p = prepare_internal(query_string);
-    auto opts = make_internal_options(p, values, db::consistency_level::ONE, infinite_timeout_config, page_size);
+    auto opts = make_internal_options(p, values, cl, timeout_config, page_size);
    ::shared_ptr<internal_query_state> res = ::make_shared<internal_query_state>(
            internal_query_state{
                    query_string,
@@ -935,17 +939,20 @@ bool query_processor::migration_subscriber::should_invalidate(
    return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
+        db::consistency_level cl,
+        const timeout_config& timeout_config,
        const std::initializer_list<data_value>& values,
+        int32_t page_size,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, values), std::move(f));
+    return for_each_cql_result(create_paged_state(query_string, cl, timeout_config, values, page_size), std::move(f));
 }

-future<> query_processor::query(
+future<> query_processor::query_internal(
        const sstring& query_string,
        noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f) {
-    return for_each_cql_result(create_paged_state(query_string, {}), std::move(f));
+    return query_internal(query_string, db::consistency_level::ONE, infinite_timeout_config, {}, 1000, std::move(f));
 }

 }
--- a/cql3/query_processor.hh
+++ b/cql3/query_processor.hh
@@ -224,75 +224,52 @@ public:
    /*!
     * \brief iterate over all cql results using paging
     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
+     * You create a statement with optional parameters and pass
+     * a function that goes over the result rows.
     *
-     * The passed function would be called for all the results, return stop_iteration::yes
-     * to stop during iteration.
+     * The passed function is called for every row; return a future resolving to
+     * stop_iteration::yes to stop the iteration early.
     *
     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
-                ....
-                ....
-                return stop_iteration::no;
-            });
-
-     * You can use place holder in the query, the prepared statement will only be done once.
-     *
-     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return false the iteration would stop
-     * args - arbitrary number of query parameters
-     */
-    template<typename... Args>
-    future<> query(
-            const sstring& query_string,
-            std::function<stop_iteration(const cql3::untyped_result_set_row&)>&& f,
-            Args&&... args) {
-        return for_each_cql_result(
-                create_paged_state(query_string, { data_value(std::forward<Args>(args))... }), std::move(f));
-    }
-
-    /*!
-     * \brief iterate over all cql results using paging
-     *
-     * You Create a statement with optional paraemter and pass
-     * a function that goes over the results.
-     *
-     * The passed function would be called for all the results, return future<stop_iteration::yes>
-     * to stop during iteration.
-     *
-     * For example:
-            return query("SELECT * from system.compaction_history",
-                         [&history] (const cql3::untyped_result_set::row& row) mutable {
+            return query_internal(
+                    "SELECT * from system.compaction_history",
+                    db::consistency_level::ONE,
+                    infinite_timeout_config,
+                    {}, 1000,  // values, page_size
+                    [&history] (const cql3::untyped_result_set::row& row) mutable {
                ....
                ....
                return make_ready_future<stop_iteration>(stop_iteration::no);
            });

-     * You can use place holder in the query, the prepared statement will only be done once.
+     * You can use placeholders in the query, the statement will only be prepared once.
     *
-     *
-     * query_string - the cql string, can contain place holder
-     * values - query parameters value
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * cl - consistency level of the query
+     * timeout_config - timeout configuration
+     * values - values to be substituted for the placeholders in the query
+     * page_size - maximum page size
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
+            db::consistency_level cl,
+            const timeout_config& timeout_config,
            const std::initializer_list<data_value>& values,
+            int32_t page_size,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

    /*
     * \brief iterate over all cql results using paging
-     * An overload of the query with future function without query parameters.
+     * An overload of query_internal without query parameters
+     * using CL = ONE, no timeout, and page size = 1000.
     *
-     * query_string - the cql string, can contain place holder
-     * f - a function to be run on each of the query result, if the function return stop_iteration::no the iteration
-     * would stop
+     * query_string - the cql string, can contain placeholders
+     * f - a function to be run on each row of the query result,
+     *     if the function returns stop_iteration::yes the iteration will stop
     */
-    future<> query(
+    future<> query_internal(
            const sstring& query_string,
            noncopyable_function<future<stop_iteration>(const cql3::untyped_result_set_row&)>&& f);

@@ -354,8 +331,10 @@ private:
     */
    ::shared_ptr<internal_query_state> create_paged_state(
            const sstring& query_string,
-            const std::initializer_list<data_value>& = { },
-            int32_t page_size = 1000);
+            db::consistency_level,
+            const timeout_config&,
+            const std::initializer_list<data_value>&,
+            int32_t page_size);

    /*!
     * \brief run a query using paging
--- a/cql3/restrictions/single_column_primary_key_restrictions.hh
+++ b/cql3/restrictions/single_column_primary_key_restrictions.hh
@@ -171,8 +171,7 @@ public:

    virtual void merge_with(::shared_ptr<restriction> restriction) override {
        if (find_atom(restriction->expression, [] (const expr::binary_operator& b) {
-                    return std::holds_alternative<std::vector<expr::column_value>>(b.lhs)
-                            && std::get<std::vector<expr::column_value>>(b.lhs).size() > 1;
+                    return std::holds_alternative<std::vector<expr::column_value>>(b.lhs);
                })) {
            throw exceptions::invalid_request_exception(
                "Mixing single column relations and multi column relations on clustering columns is not allowed");
@@ -213,30 +212,22 @@ private:
    std::vector<range_type> compute_bounds(const query_options& options) const {
        std::vector<range_type> ranges;

-        static constexpr auto invalid_null_msg = std::is_same<ValueType, partition_key>::value
-            ? "Invalid null value for partition key part %s" : "Invalid null value for clustering key part %s";
-
        // TODO: rewrite this to simply invoke possible_lhs_values on each clustering column, find the first
        // non-list, and take Cartesian product of that prefix.  No need for to_range() and std::get() here.
        if (_restrictions->is_all_eq()) {
-            if (_restrictions->size() == 1) {
-                auto&& e = *restrictions().begin();
-                const auto b = std::get<expr::binary_operator>(e.second->expression).rhs->bind_and_get(options);
-                if (!b) {
-                    throw exceptions::invalid_request_exception(sprint(invalid_null_msg, e.first->name_as_text()));
-                }
-                return {range_type::make_singular(ValueType::from_single_value(*_schema, to_bytes(b)))};
-            }
            std::vector<bytes> components;
            components.reserve(_restrictions->size());
            for (auto&& e : restrictions()) {
                const column_definition* def = e.first;
                assert(components.size() == _schema->position(*def));
-                const auto b = std::get<expr::binary_operator>(e.second->expression).rhs->bind_and_get(options);
-                if (!b) {
-                    throw exceptions::invalid_request_exception(sprint(invalid_null_msg, e.first->name_as_text()));
+                // Because _restrictions is all EQ, possible_lhs_values must return a list, not a range.
+                const auto b = std::get<expr::value_list>(possible_lhs_values(e.first, e.second->expression, options));
+                // Furthermore, this list is either a single element (when all RHSs are the same) or empty (when at
+                // least two are different, so the restrictions cannot hold simultaneously -- e.g., c=1 AND c=2).
+                if (b.empty()) {
+                    return {};
                }
-                components.emplace_back(to_bytes(b));
+                components.emplace_back(b.front());
            }
            return {range_type::make_singular(ValueType::from_exploded(*_schema, std::move(components)))};
        }
@@ -324,7 +315,7 @@ public:
        std::vector<bytes_opt> res;
        for (const ValueType& r : src) {
            for (const auto& component : r.components()) {
-                res.emplace_back(component);
+                res.emplace_back(to_bytes(component));
            }
        }
        return res;
--- a/cql3/restrictions/single_column_restrictions.hh
+++ b/cql3/restrictions/single_column_restrictions.hh
@@ -108,6 +108,9 @@ public:
            return bytes_opt{};
        } else {
            const auto values = std::get<expr::value_list>(possible_lhs_values(&cdef, it->second->expression, options));
+            if (values.empty()) {
+                return bytes_opt{};
+            }
            assert(values.size() == 1);
            return values.front();
        }
@@ -119,7 +122,7 @@ public:
     * @param column_def the column definition
     * @return the restriction associated to the specified column
     */
-    ::shared_ptr<restriction> get_restriction(const column_definition& column_def) const {
+    ::shared_ptr<single_column_restriction> get_restriction(const column_definition& column_def) const {
        auto i = _restrictions.find(&column_def);
        if (i == _restrictions.end()) {
            return {};
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -147,7 +147,6 @@ statement_restrictions::statement_restrictions(database& db,
        const std::vector<::shared_ptr<relation>>& where_clause,
        variable_specifications& bound_names,
        bool selects_only_static_columns,
-        bool select_a_collection,
        bool for_view,
        bool allow_filtering)
    : statement_restrictions(schema, allow_filtering)
@@ -227,10 +226,11 @@ statement_restrictions::statement_restrictions(database& db,
        }
    }

-    process_clustering_columns_restrictions(select_a_collection, for_view, allow_filtering);
+    process_clustering_columns_restrictions(for_view, allow_filtering);

    // Covers indexes on the first clustering column (among others).
-    if (_is_key_range && _has_queriable_ck_index) {
+    if (_is_key_range && _has_queriable_ck_index &&
+        !dynamic_pointer_cast<multi_column_restriction>(_clustering_columns_restrictions)) {
        _uses_secondary_indexing = true;
    }

@@ -329,20 +329,39 @@ int statement_restrictions::score(const secondary_index::index& index) const {
    return 1;
 }

+namespace {
+
+using namespace cql3::restrictions;
+
+/// If rs contains a restrictions_map of individual columns to their restrictions, returns it.  Otherwise, returns null.
+const single_column_restrictions::restrictions_map* get_individual_restrictions_map(const restrictions* rs) {
+    if (auto regular = dynamic_cast<const single_column_restrictions*>(rs)) {
+        return &regular->restrictions();
+    } else if (auto partition = dynamic_cast<const single_column_partition_key_restrictions*>(rs)) {
+        return &partition->restrictions();
+    } else if (auto clustering = dynamic_cast<const single_column_clustering_key_restrictions*>(rs)) {
+        return &clustering->restrictions();
+    }
+    return nullptr;
+}
+
+} // anonymous namespace
+
 std::pair<std::optional<secondary_index::index>, ::shared_ptr<cql3::restrictions::restrictions>> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
    std::optional<secondary_index::index> chosen_index;
    int chosen_index_score = 0;
    ::shared_ptr<cql3::restrictions::restrictions> chosen_index_restrictions;

    for (const auto& index : sim.list_indexes()) {
+        auto cdef = _schema->get_column_definition(to_bytes(index.target_column()));
        for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
-            for (const auto& cdef : restriction->get_column_defs()) {
-                if (index.depends_on(*cdef)) {
-                    if (score(index) > chosen_index_score) {
-                        chosen_index = index;
-                        chosen_index_score = score(index);
-                        chosen_index_restrictions = restriction;
-                    }
+            if (auto rmap = get_individual_restrictions_map(restriction.get())) {
+                const auto found = rmap->find(cdef);
+                if (found != rmap->end() && is_supported_by(found->second->expression, index)
+                    && score(index) > chosen_index_score) {
+                    chosen_index = index;
+                    chosen_index_score = score(index);
+                    chosen_index_restrictions = restriction;
                }
            }
        }
@@ -435,15 +454,11 @@ bool statement_restrictions::has_unrestricted_clustering_columns() const {
    return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
 }

-void statement_restrictions::process_clustering_columns_restrictions(bool select_a_collection, bool for_view, bool allow_filtering) {
+void statement_restrictions::process_clustering_columns_restrictions(bool for_view, bool allow_filtering) {
    if (!has_clustering_columns_restriction()) {
        return;
    }

-    if (clustering_key_restrictions_has_IN() && select_a_collection) {
-        throw exceptions::invalid_request_exception(
-            "Cannot restrict clustering columns by IN relations when a collection is selected by the query");
-    }
    if (find_atom(_clustering_columns_restrictions->expression, expr::is_on_collection)
        && !_has_queriable_ck_index && !allow_filtering) {
        throw exceptions::invalid_request_exception(
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -119,7 +119,6 @@ public:
        const std::vector<::shared_ptr<relation>>& where_clause,
        variable_specifications& bound_names,
        bool selects_only_static_columns,
-        bool select_a_collection,
        bool for_view = false,
        bool allow_filtering = false);

@@ -217,10 +216,9 @@ private:
     * Processes the clustering column restrictions.
     *
     * @param has_queriable_index <code>true</code> if some of the queried data are indexed, <code>false</code> otherwise
-     * @param select_a_collection <code>true</code> if the query should return a collection column
     * @throws InvalidRequestException if the request is invalid
     */
-    void process_clustering_columns_restrictions(bool select_a_collection, bool for_view, bool allow_filtering);
+    void process_clustering_columns_restrictions(bool for_view, bool allow_filtering);

    /**
     * Returns the <code>Restrictions</code> for the specified type of columns.
--- a/cql3/selection/selection.hh
+++ b/cql3/selection/selection.hh
@@ -140,21 +140,6 @@ public:
        return true;
    }

-    /**
-     * Checks if this selection contains a collection.
-     *
-     * @return <code>true</code> if this selection contains a collection, <code>false</code> otherwise.
-     */
-    bool contains_a_collection() const {
-        if (!_schema->has_multi_cell_collections()) {
-            return false;
-        }
-
-        return std::any_of(_columns.begin(), _columns.end(), [] (auto&& def) {
-           return def->type->is_collection() && def->type->is_multi_cell();
-        });
-    }
-
    /**
     * Returns the index of the specified column.
     *
--- a/cql3/sets.cc
+++ b/cql3/sets.cc
@@ -31,7 +31,7 @@ lw_shared_ptr<column_specification>
 sets::value_spec_of(const column_specification& column) {
    return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
            ::make_shared<column_identifier>(format("value({})", *column.name), true),
-            dynamic_pointer_cast<const set_type_impl>(column.type)->get_elements_type());
+            dynamic_cast<const set_type_impl&>(column.type->without_reversed()).get_elements_type());
 }

 shared_ptr<term>
@@ -74,7 +74,8 @@ sets::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

        values.push_back(std::move(t));
    }
-    auto compare = dynamic_pointer_cast<const set_type_impl>(receiver->type)->get_elements_type()->as_less_comparator();
+    auto compare = dynamic_cast<const set_type_impl&>(receiver->type->without_reversed())
+            .get_elements_type()->as_less_comparator();

    auto value = ::make_shared<delayed_value>(compare, std::move(values));
    if (all_terminal) {
@@ -86,7 +87,7 @@ sets::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<colu

 void
 sets::literal::validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const set_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_set()) {
        // We've parsed empty maps as a set literal to break the ambiguity so
        // handle that case now
        if (dynamic_pointer_cast<const map_type_impl>(receiver.type) && _elements.empty()) {
@@ -106,7 +107,7 @@ sets::literal::validate_assignable_to(database& db, const sstring& keyspace, con

 assignment_testable::test_result
 sets::literal::test_assignment(database& db, const sstring& keyspace, const column_specification& receiver) const {
-    if (!dynamic_pointer_cast<const set_type_impl>(receiver.type)) {
+    if (!receiver.type->without_reversed().is_set()) {
        // We've parsed empty maps as a set literal to break the ambiguity so handle that case now
        if (dynamic_pointer_cast<const map_type_impl>(receiver.type) && _elements.empty()) {
            return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
@@ -137,14 +138,12 @@ sets::value::from_serialized(const fragmented_temporary_buffer::view& val, const
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but compose does the validation (so we're fine).
        // FIXME: deserializeForNativeProtocol?!
-      return with_linearized(val, [&] (bytes_view v) {
-        auto s = value_cast<set_type_impl::native_type>(type.deserialize(v, sf));
+        auto s = value_cast<set_type_impl::native_type>(type.deserialize(val, sf));
        std::set<bytes, serialized_compare> elements(type.get_elements_type()->as_less_comparator());
        for (auto&& element : s) {
            elements.insert(elements.end(), type.get_elements_type()->decompose(element));
        }
        return value(std::move(elements));
-      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -226,8 +225,11 @@ sets::delayed_value::bind(const query_options& options) {

 sets::marker::marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
    : abstract_marker{bind_index, std::move(receiver)} {
-        assert(dynamic_cast<const set_type_impl*>(_receiver->type.get()));
+    if (!_receiver->type->without_reversed().is_set()) {
+        throw std::runtime_error(format("Receiver {} for set marker has wrong type: {}",
+                                        _receiver->cf_name, _receiver->type->name()));
    }
+}

 ::shared_ptr<terminal>
 sets::marker::bind(const query_options& options) {
@@ -237,11 +239,9 @@ sets::marker::bind(const query_options& options) {
    } else if (value.is_unset_value()) {
        return constants::UNSET_VALUE;
    } else {
-        auto& type = static_cast<const set_type_impl&>(*_receiver->type);
+        auto& type = dynamic_cast<const set_type_impl&>(_receiver->type->without_reversed());
        try {
-            with_linearized(*value, [&] (bytes_view v) {
-                type.validate(v, options.get_cql_serialization_format());
-            });
+            type.validate(*value, options.get_cql_serialization_format());
        } catch (marshal_exception& e) {
            throw exceptions::invalid_request_exception(
                    format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
@@ -284,8 +284,7 @@ void
 sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params,
        shared_ptr<term> value, const column_definition& column) {
    auto set_value = dynamic_pointer_cast<sets::value>(std::move(value));
-    auto set_type = dynamic_cast<const set_type_impl*>(column.type.get());
-    assert(set_type);
+    auto& set_type = dynamic_cast<const set_type_impl&>(column.type->without_reversed());
    if (column.type->is_multi_cell()) {
        if (!set_value || set_value->_elements.empty()) {
            return;
@@ -295,10 +294,10 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
        collection_mutation_description mut;

        for (auto&& e : set_value->_elements) {
-            mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
+            mut.cells.emplace_back(e, params.make_cell(*set_type.value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
        }

-        m.set_cell(row_key, column, mut.serialize(*set_type));
+        m.set_cell(row_key, column, mut.serialize(set_type));
    } else if (set_value != nullptr) {
        // for frozen sets, we're overwriting the whole cell
        auto v = set_type_impl::serialize_partially_deserialized_form(
@@ -315,7 +314,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
    assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";

    auto&& value = _t->bind(params._options);
-    if (!value) {
+    if (!value || value == constants::UNSET_VALUE) {
        return;
    }

--- a/cql3/statements/alter_keyspace_statement.cc
+++ b/cql3/statements/alter_keyspace_statement.cc
@@ -45,7 +45,7 @@
 #include "db/system_keyspace.hh"
 #include "database.hh"

-bool is_system_keyspace(const sstring& keyspace);
+bool is_system_keyspace(std::string_view keyspace);

 cql3::statements::alter_keyspace_statement::alter_keyspace_statement(sstring name, ::shared_ptr<ks_prop_defs> attrs)
    : _name(name)
@@ -91,10 +91,10 @@ void cql3::statements::alter_keyspace_statement::validate(service::storage_proxy
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> cql3::statements::alter_keyspace_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> cql3::statements::alter_keyspace_statement::announce_migration(service::storage_proxy& proxy) const {
    auto old_ksm = proxy.get_db().local().find_keyspace(_name).metadata();
    const auto& tm = *proxy.get_token_metadata_ptr();
-    return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm, tm), is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm, tm)).then([this] {
        using namespace cql_transport;
        return ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
--- a/cql3/statements/alter_keyspace_statement.hh
+++ b/cql3/statements/alter_keyspace_statement.hh
@@ -61,7 +61,7 @@ public:

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    void validate(service::storage_proxy& proxy, const service::client_state& state) const override;
-    future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };

--- a/cql3/statements/alter_table_statement.cc
+++ b/cql3/statements/alter_table_statement.cc
@@ -71,7 +71,7 @@ alter_table_statement::alter_table_statement(shared_ptr<cf_name> name,

 future<> alter_table_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
    using cdt = auth::command_desc::type;
-    return state.has_column_family_access(keyspace(), column_family(), auth::permission::ALTER,
+    return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::ALTER,
                                          _type == type::opts ? cdt::ALTER_WITH_OPTS : cdt::OTHER);
 }

@@ -288,7 +288,7 @@ void alter_table_statement::drop_column(const schema& schema, const table& cf, s
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto& db = proxy.get_db().local();
    auto s = validation::validate_column_family(db, keyspace(), column_family());
@@ -396,7 +396,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
        break;
    }

-    return service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, std::move(view_updates), is_local_only)
+    return service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, std::move(view_updates))
        .then([this] {
            using namespace cql_transport;
            return ::make_shared<event::schema_change>(
--- a/cql3/statements/alter_table_statement.hh
+++ b/cql3/statements/alter_table_statement.hh
@@ -80,7 +80,7 @@ public:

    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy& proxy, const service::client_state& state) const override;
-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 private:
    void add_column(const schema& schema, const table& cf, schema_builder& cfm, std::vector<view_ptr>& view_updates, const column_identifier& column_name, const cql3_type validator, const column_definition* def, bool is_static) const;
--- a/cql3/statements/alter_type_statement.cc
+++ b/cql3/statements/alter_type_statement.cc
@@ -78,7 +78,7 @@ const sstring& alter_type_statement::keyspace() const
    return _name.get_keyspace();
 }

-void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only) const
+void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks) const
 {
    auto&& all_types = ks.metadata()->user_types().get_all_types();
    auto to_update = all_types.find(_name.get_user_type_name());
@@ -100,7 +100,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b

    // Now, we need to announce the type update to basically change it for new tables using this type,
    // but we also need to find all existing user types and CF using it and change them.
-    service::get_local_migration_manager().announce_type_update(updated, is_local_only).get();
+    service::get_local_migration_manager().announce_type_update(updated).get();

    for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
        auto cfm = schema_builder(schema);
@@ -115,21 +115,21 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
        }
        if (modified) {
            if (schema->is_view()) {
-                service::get_local_migration_manager().announce_view_update(view_ptr(cfm.build()), is_local_only).get();
+                service::get_local_migration_manager().announce_view_update(view_ptr(cfm.build())).get();
            } else {
-                service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, {}, is_local_only).get();
+                service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, {}).get();
            }
        }
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_type_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_type_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return seastar::async([this, &proxy, is_local_only] {
+    return seastar::async([this, &proxy] {
        auto&& db = proxy.get_db().local();
        try {
            auto&& ks = db.find_keyspace(keyspace());
-            do_announce_migration(db, ks, is_local_only);
+            do_announce_migration(db, ks);
            using namespace cql_transport;
            return ::make_shared<event::schema_change>(
                    event::schema_change::change_type::UPDATED,
--- a/cql3/statements/alter_type_statement.hh
+++ b/cql3/statements/alter_type_statement.hh
@@ -63,14 +63,14 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    class add_or_alter;
    class renames;
 protected:
    virtual user_type make_updated_type(database& db, user_type to_update) const = 0;
 private:
-    void do_announce_migration(database& db, ::keyspace& ks, bool is_local_only) const;
+    void do_announce_migration(database& db, ::keyspace& ks) const;
 };

 class alter_type_statement::add_or_alter : public alter_type_statement {
--- a/cql3/statements/alter_view_statement.cc
+++ b/cql3/statements/alter_view_statement.cc
@@ -60,9 +60,10 @@ alter_view_statement::alter_view_statement(::shared_ptr<cf_name> view_name, ::sh
 future<> alter_view_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const
 {
    try {
-        auto&& s = proxy.get_db().local().find_schema(keyspace(), column_family());
+        const database& db = proxy.local_db();
+        auto&& s = db.find_schema(keyspace(), column_family());
        if (s->is_view())  {
-            return state.has_column_family_access(keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
+            return state.has_column_family_access(db, keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
        }
    } catch (const no_such_column_family& e) {
        // Will be validated afterwards.
@@ -75,7 +76,7 @@ void alter_view_statement::validate(service::storage_proxy&, const service::clie
    // validated in announce_migration()
 }

-future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto&& db = proxy.get_db().local();
    schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
@@ -107,7 +108,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::an
                "the corresponding data in the parent table.");
    }

-    return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build()), is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build())).then([this] {
        using namespace cql_transport;

        return ::make_shared<event::schema_change>(
--- a/cql3/statements/alter_view_statement.hh
+++ b/cql3/statements/alter_view_statement.hh
@@ -63,7 +63,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/batch_statement.cc
+++ b/cql3/statements/batch_statement.cc
@@ -59,6 +59,10 @@ timeout_for_type(batch_statement::type t) {
            : &timeout_config::write_timeout;
 }

+db::timeout_clock::duration batch_statement::get_timeout(const query_options& options) const {
+    return _attrs->is_timeout_set() ? _attrs->get_timeout(options) : options.get_timeout_config().*get_timeout_config_selector();
+}
+
 batch_statement::batch_statement(int bound_terms, type type_,
                                 std::vector<single_statement> statements,
                                 std::unique_ptr<attributes> attrs,
@@ -286,7 +290,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
    ++_stats.batches;
    _stats.statements_in_batches += _statements.size();

-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    return get_mutations(storage, options, timeout, local, now, query_state).then([this, &storage, &options, timeout, tr_state = query_state.get_trace_state(),
                                                                                                                               permit = query_state.get_permit()] (std::vector<mutation> ms) mutable {
        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), timeout, std::move(tr_state), std::move(permit));
--- a/cql3/statements/batch_statement.hh
+++ b/cql3/statements/batch_statement.hh
@@ -170,6 +170,8 @@ private:
            service::storage_proxy& storage,
            const query_options& options,
            service::query_state& state) const;
+
+    db::timeout_clock::duration get_timeout(const query_options& options) const;
 public:
    // FIXME: no cql_statement::to_string() yet
 #if 0
--- a/cql3/statements/cf_prop_defs.cc
+++ b/cql3/statements/cf_prop_defs.cc
@@ -157,6 +157,7 @@ void cf_prop_defs::validate(const database& db, const schema::extensions_map& sc
    }

    validate_minimum_int(KW_DEFAULT_TIME_TO_LIVE, 0, DEFAULT_DEFAULT_TIME_TO_LIVE);
+    validate_minimum_int(KW_PAXOSGRACESECONDS, 0, DEFAULT_GC_GRACE_SECONDS);

    auto min_index_interval = get_int(KW_MIN_INDEX_INTERVAL, DEFAULT_MIN_INDEX_INTERVAL);
    auto max_index_interval = get_int(KW_MAX_INDEX_INTERVAL, DEFAULT_MAX_INDEX_INTERVAL);
--- a/cql3/statements/create_function_statement.cc
+++ b/cql3/statements/create_function_statement.cc
@@ -59,11 +59,11 @@ std::unique_ptr<prepared_statement> create_function_statement::prepare(database&
 }

 future<shared_ptr<cql_transport::event::schema_change>> create_function_statement::announce_migration(
-        service::storage_proxy& proxy, bool is_local_only) const {
+        service::storage_proxy& proxy) const {
    if (!_func) {
        return make_ready_future<::shared_ptr<cql_transport::event::schema_change>>();
    }
-    return service::get_local_migration_manager().announce_new_function(_func, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_new_function(_func).then([this] {
        return create_schema_change(*_func, true);
    });
 }
--- a/cql3/statements/create_function_statement.hh
+++ b/cql3/statements/create_function_statement.hh
@@ -29,7 +29,7 @@ namespace statements {
 class create_function_statement final : public create_function_statement_base {
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(
-            service::storage_proxy& proxy, bool is_local_only) const override;
+            service::storage_proxy& proxy) const override;
    virtual void create(service::storage_proxy& proxy, functions::function* old) const override;
    sstring _language;
    sstring _body;
--- a/cql3/statements/create_index_statement.cc
+++ b/cql3/statements/create_index_statement.cc
@@ -73,7 +73,7 @@ create_index_statement::create_index_statement(::shared_ptr<cf_name> name,

 future<>
 create_index_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    return state.has_column_family_access(keyspace(), column_family(), auth::permission::ALTER);
+    return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::ALTER);
 }

 void
@@ -271,7 +271,7 @@ void create_index_statement::validate_targets_for_multi_column_index(std::vector
 }

 future<::shared_ptr<cql_transport::event::schema_change>>
-create_index_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+create_index_statement::announce_migration(service::storage_proxy& proxy) const {
    auto& db = proxy.get_db().local();
    auto schema = db.find_schema(keyspace(), column_family());
    std::vector<::shared_ptr<index_target>> targets;
@@ -310,7 +310,7 @@ create_index_statement::announce_migration(service::storage_proxy& proxy, bool i
    schema_builder builder{schema};
    builder.with_index(index);
    return service::get_local_migration_manager().announce_column_family_update(
-            builder.build(), false, {}, is_local_only).then([this]() {
+            builder.build(), false, {}).then([this]() {
        using namespace cql_transport;
        return ::make_shared<event::schema_change>(
                event::schema_change::change_type::UPDATED,
--- a/cql3/statements/create_index_statement.hh
+++ b/cql3/statements/create_index_statement.hh
@@ -79,7 +79,7 @@ public:

    future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    void validate(service::storage_proxy&, const service::client_state& state) const override;
-    future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy&, bool is_local_only) const override;
+    future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy&) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 private:
--- a/cql3/statements/create_keyspace_statement.cc
+++ b/cql3/statements/create_keyspace_statement.cc
@@ -47,7 +47,7 @@

 #include <regex>

-bool is_system_keyspace(const sstring& keyspace);
+bool is_system_keyspace(std::string_view keyspace);

 namespace cql3 {

@@ -106,11 +106,11 @@ void create_keyspace_statement::validate(service::storage_proxy&, const service:
 #endif
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_keyspace_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> create_keyspace_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return make_ready_future<>().then([this, p = proxy.shared_from_this(), is_local_only] {
+    return make_ready_future<>().then([this, p = proxy.shared_from_this()] {
        const auto& tm = *p->get_token_metadata_ptr();
-        return service::get_local_migration_manager().announce_new_keyspace(_attrs->as_ks_metadata(_name, tm), is_local_only);
+        return service::get_local_migration_manager().announce_new_keyspace(_attrs->as_ks_metadata(_name, tm));
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/create_keyspace_statement.hh
+++ b/cql3/statements/create_keyspace_statement.hh
@@ -84,7 +84,7 @@ public:
     */
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_table_statement.cc
+++ b/cql3/statements/create_table_statement.cc
@@ -97,10 +97,10 @@ std::vector<column_definition> create_table_statement::get_columns() const
    return column_defs;
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_table_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> create_table_statement::announce_migration(service::storage_proxy& proxy) const {
    auto schema = get_cf_meta_data(proxy.get_db().local());
-    return make_ready_future<>().then([this, is_local_only, schema = std::move(schema)] {
-        return service::get_local_migration_manager().announce_new_column_family(std::move(schema), is_local_only);
+    return make_ready_future<>().then([this, schema = std::move(schema)] {
+        return service::get_local_migration_manager().announce_new_column_family(std::move(schema));
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/create_table_statement.hh
+++ b/cql3/statements/create_table_statement.hh
@@ -102,7 +102,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_type_statement.cc
+++ b/cql3/statements/create_type_statement.cc
@@ -138,7 +138,7 @@ inline user_type create_type_statement::create_type(database& db) const
        std::move(field_names), std::move(field_types), true /* multi cell */);
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto&& db = proxy.get_db().local();

@@ -152,7 +152,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_type_statement::a

    auto type = create_type(db);
    check_for_duplicate_names(type);
-    return service::get_local_migration_manager().announce_new_type(type, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_new_type(type).then([this] {
        using namespace cql_transport;

        return ::make_shared<event::schema_change>(
--- a/cql3/statements/create_type_statement.hh
+++ b/cql3/statements/create_type_statement.hh
@@ -65,7 +65,7 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

--- a/cql3/statements/create_view_statement.cc
+++ b/cql3/statements/create_view_statement.cc
@@ -89,7 +89,7 @@ create_view_statement::create_view_statement(
 }

 future<> create_view_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    return state.has_column_family_access(keyspace(), _base_name->get_column_family(), auth::permission::ALTER);
+    return state.has_column_family_access(proxy.local_db(), keyspace(), _base_name->get_column_family(), auth::permission::ALTER);
 }

 void create_view_statement::validate(service::storage_proxy& proxy, const service::client_state& state) const {
@@ -140,7 +140,7 @@ static bool validate_primary_key(
    return new_non_pk_column;
 }

-future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const {
+future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::announce_migration(service::storage_proxy& proxy) const {
    // We need to make sure that:
    //  - primary key includes all columns in base table's primary key
    //  - make sure that the select statement does not have anything other than columns
@@ -225,7 +225,7 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    }

    auto parameters = make_lw_shared<raw::select_statement::parameters>(raw::select_statement::parameters::orderings_type(), false, true);
-    raw::select_statement raw_select(_base_name, std::move(parameters), _select_clause, _where_clause, nullptr, nullptr, {});
+    raw::select_statement raw_select(_base_name, std::move(parameters), _select_clause, _where_clause, nullptr, nullptr, {}, std::make_unique<cql3::attributes::raw>());
    raw_select.prepare_keyspace(keyspace());
    raw_select.set_bound_variables({});

@@ -350,8 +350,8 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
    auto where_clause_text = util::relations_to_where_clause(_where_clause);
    builder.with_view_info(schema->id(), schema->cf_name(), included.empty(), std::move(where_clause_text));

-    return make_ready_future<>().then([definition = view_ptr(builder.build()), is_local_only]() mutable {
-        return service::get_local_migration_manager().announce_new_view(definition, is_local_only);
+    return make_ready_future<>().then([definition = view_ptr(builder.build())]() mutable {
+        return service::get_local_migration_manager().announce_new_view(definition);
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/create_view_statement.hh
+++ b/cql3/statements/create_view_statement.hh
@@ -68,7 +68,7 @@ public:
    // Functions we need to override to subclass schema_altering_statement
    virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;

    // FIXME: continue here. See create_table_statement.hh and CreateViewStatement.java
--- a/cql3/statements/drop_function_statement.cc
+++ b/cql3/statements/drop_function_statement.cc
@@ -33,7 +33,7 @@ std::unique_ptr<prepared_statement> drop_function_statement::prepare(database& d
 }

 future<shared_ptr<cql_transport::event::schema_change>> drop_function_statement::announce_migration(
-        service::storage_proxy& proxy, bool is_local_only) const {
+        service::storage_proxy& proxy) const {
    if (!_func) {
        return make_ready_future<shared_ptr<cql_transport::event::schema_change>>();
    }
@@ -41,7 +41,7 @@ future<shared_ptr<cql_transport::event::schema_change>> drop_function_statement:
    if (!user_func) {
        throw exceptions::invalid_request_exception(format("'{}' is not a user defined function", _func));
    }
-    return service::get_local_migration_manager().announce_function_drop(user_func, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_function_drop(user_func).then([this] {
        return create_schema_change(*_func, false);
    });
 }
--- a/cql3/statements/drop_function_statement.hh
+++ b/cql3/statements/drop_function_statement.hh
@@ -28,7 +28,7 @@ namespace statements {
 class drop_function_statement final : public drop_function_statement_base {
    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(
-            service::storage_proxy& proxy, bool is_local_only) const override;
+            service::storage_proxy& proxy) const override;

 public:
    drop_function_statement(functions::function_name name, std::vector<shared_ptr<cql3_type::raw>> arg_types,
--- a/cql3/statements/drop_index_statement.cc
+++ b/cql3/statements/drop_index_statement.cc
@@ -70,7 +70,7 @@ future<> drop_index_statement::check_access(service::storage_proxy& proxy, const
    if (!cfm) {
        return make_ready_future<>();
    }
-    return state.has_column_family_access(cfm->ks_name(), cfm->cf_name(), auth::permission::ALTER);
+    return state.has_column_family_access(proxy.local_db(), cfm->ks_name(), cfm->cf_name(), auth::permission::ALTER);
 }

 void drop_index_statement::validate(service::storage_proxy& proxy, const service::client_state& state) const
@@ -86,7 +86,7 @@ void drop_index_statement::validate(service::storage_proxy& proxy, const service
    }
 }

-future<shared_ptr<cql_transport::event::schema_change>> drop_index_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> drop_index_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto cfm = lookup_indexed_table(proxy);
    if (!cfm) {
@@ -95,7 +95,7 @@ future<shared_ptr<cql_transport::event::schema_change>> drop_index_statement::an
    ++_cql_stats->secondary_index_drops;
    auto builder = schema_builder(cfm);
    builder.without_index(_index_name);
-    return service::get_local_migration_manager().announce_column_family_update(builder.build(), false, {}, is_local_only).then([cfm] {
+    return service::get_local_migration_manager().announce_column_family_update(builder.build(), false, {}).then([cfm] {
        // Dropping an index is akin to updating the CF
        // Note that we shouldn't call columnFamily() at this point because the index has been dropped and the call to lookupIndexedTable()
        // in that method would now throw.
--- a/cql3/statements/drop_index_statement.hh
+++ b/cql3/statements/drop_index_statement.hh
@@ -72,7 +72,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 private:
--- a/cql3/statements/drop_keyspace_statement.cc
+++ b/cql3/statements/drop_keyspace_statement.cc
@@ -74,10 +74,10 @@ const sstring& drop_keyspace_statement::keyspace() const
    return _keyspace;
 }

-future<shared_ptr<cql_transport::event::schema_change>> drop_keyspace_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> drop_keyspace_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return make_ready_future<>().then([this, is_local_only] {
-        return service::get_local_migration_manager().announce_keyspace_drop(_keyspace, is_local_only);
+    return make_ready_future<>().then([this] {
+        return service::get_local_migration_manager().announce_keyspace_drop(_keyspace);
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/drop_keyspace_statement.hh
+++ b/cql3/statements/drop_keyspace_statement.hh
@@ -59,7 +59,7 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/drop_table_statement.cc
+++ b/cql3/statements/drop_table_statement.cc
@@ -58,7 +58,7 @@ future<> drop_table_statement::check_access(service::storage_proxy& proxy, const
 {
    // invalid_request_exception is only thrown synchronously.
    try {
-        return state.has_column_family_access(keyspace(), column_family(), auth::permission::DROP);
+        return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::DROP);
    } catch (exceptions::invalid_request_exception&) {
        if (!_if_exists) {
            throw;
@@ -72,10 +72,10 @@ void drop_table_statement::validate(service::storage_proxy&, const service::clie
    // validated in announce_migration()
 }

-future<shared_ptr<cql_transport::event::schema_change>> drop_table_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> drop_table_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return make_ready_future<>().then([this, is_local_only] {
-        return service::get_local_migration_manager().announce_column_family_drop(keyspace(), column_family(), is_local_only);
+    return make_ready_future<>().then([this] {
+        return service::get_local_migration_manager().announce_column_family_drop(keyspace(), column_family());
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/drop_table_statement.hh
+++ b/cql3/statements/drop_table_statement.hh
@@ -58,7 +58,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/drop_type_statement.cc
+++ b/cql3/statements/drop_type_statement.cc
@@ -142,7 +142,7 @@ const sstring& drop_type_statement::keyspace() const
    return _name.get_keyspace();
 }

-future<shared_ptr<cql_transport::event::schema_change>> drop_type_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> drop_type_statement::announce_migration(service::storage_proxy& proxy) const
 {
    auto&& db = proxy.get_db().local();

@@ -157,7 +157,7 @@ future<shared_ptr<cql_transport::event::schema_change>> drop_type_statement::ann
        return make_ready_future<::shared_ptr<cql_transport::event::schema_change>>();
    }

-    return service::get_local_migration_manager().announce_type_drop(to_drop->second, is_local_only).then([this] {
+    return service::get_local_migration_manager().announce_type_drop(to_drop->second).then([this] {
        using namespace cql_transport;

        return ::make_shared<event::schema_change>(
--- a/cql3/statements/drop_type_statement.hh
+++ b/cql3/statements/drop_type_statement.hh
@@ -61,7 +61,7 @@ public:

    virtual const sstring& keyspace() const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/drop_view_statement.cc
+++ b/cql3/statements/drop_view_statement.cc
@@ -58,9 +58,10 @@ drop_view_statement::drop_view_statement(::shared_ptr<cf_name> view_name, bool i
 future<> drop_view_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const
 {
    try {
-        auto&& s = proxy.get_db().local().find_schema(keyspace(), column_family());
+        const database& db = proxy.local_db();
+        auto&& s = db.find_schema(keyspace(), column_family());
        if (s->is_view()) {
-            return state.has_column_family_access(keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
+            return state.has_column_family_access(db, keyspace(), s->view_info()->base_name(), auth::permission::ALTER);
        }
    } catch (const no_such_column_family& e) {
        // Will be validated afterwards.
@@ -73,10 +74,10 @@ void drop_view_statement::validate(service::storage_proxy&, const service::clien
    // validated in migration_manager::announce_view_drop()
 }

-future<shared_ptr<cql_transport::event::schema_change>> drop_view_statement::announce_migration(service::storage_proxy& proxy, bool is_local_only) const
+future<shared_ptr<cql_transport::event::schema_change>> drop_view_statement::announce_migration(service::storage_proxy& proxy) const
 {
-    return make_ready_future<>().then([this, is_local_only] {
-        return service::get_local_migration_manager().announce_view_drop(keyspace(), column_family(), is_local_only);
+    return make_ready_future<>().then([this] {
+        return service::get_local_migration_manager().announce_view_drop(keyspace(), column_family());
    }).then_wrapped([this] (auto&& f) {
        try {
            f.get();
--- a/cql3/statements/drop_view_statement.hh
+++ b/cql3/statements/drop_view_statement.hh
@@ -63,7 +63,7 @@ public:

    virtual void validate(service::storage_proxy&, const service::client_state& state) const override;

-    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const override;
+    virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const override;

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
 };
--- a/cql3/statements/modification_statement.cc
+++ b/cql3/statements/modification_statement.cc
@@ -59,7 +59,7 @@
 #include "partition_slice_builder.hh"
 #include "cas_request.hh"

-bool is_system_keyspace(const sstring& name);
+bool is_system_keyspace(std::string_view name);

 namespace cql3 {

@@ -74,6 +74,10 @@ modification_statement_timeout(const schema& s) {
    }
 }

+db::timeout_clock::duration modification_statement::get_timeout(const query_options& options) const {
+    return attrs->is_timeout_set() ? attrs->get_timeout(options) : options.get_timeout_config().*get_timeout_config_selector();
+}
+
 modification_statement::modification_statement(statement_type type_, uint32_t bound_terms, schema_ptr schema_, std::unique_ptr<attributes> attrs_, cql_stats& stats_)
    : cql_statement_opt_metadata(modification_statement_timeout(*schema_))
    , type{type_}
@@ -120,10 +124,11 @@ gc_clock::duration modification_statement::get_time_to_live(const query_options&
 }

 future<> modification_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
-    auto f = state.has_column_family_access(keyspace(), column_family(), auth::permission::MODIFY);
+    const database& db = proxy.local_db();
+    auto f = state.has_column_family_access(db, keyspace(), column_family(), auth::permission::MODIFY);
    if (has_conditions()) {
-        f = f.then([this, &state] {
-           return state.has_column_family_access(keyspace(), column_family(), auth::permission::SELECT);
+        f = f.then([this, &state, &db] {
+           return state.has_column_family_access(db, keyspace(), column_family(), auth::permission::SELECT);
        });
    }
    return f;
@@ -286,7 +291,7 @@ modification_statement::do_execute(service::storage_proxy& proxy, service::query
 future<>
 modification_statement::execute_without_condition(service::storage_proxy& proxy, service::query_state& qs, const query_options& options) const {
    auto cl = options.get_consistency();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    return get_mutations(proxy, options, timeout, false, options.get_timestamp(qs), qs).then([this, cl, timeout, &proxy, &qs] (auto mutations) {
        if (mutations.empty()) {
            return now();
--- a/cql3/statements/modification_statement.hh
+++ b/cql3/statements/modification_statement.hh
@@ -298,6 +298,9 @@ protected:
     * @throws InvalidRequestException
     */
    virtual void validate_where_clause_for_conditions() const;
+
+    db::timeout_clock::duration get_timeout(const query_options& options) const;
+
    friend class raw::modification_statement;
 };

--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -48,6 +48,7 @@
 #include "cql3/selection/raw_selector.hh"
 #include "cql3/restrictions/statement_restrictions.hh"
 #include "cql3/result_set.hh"
+#include "cql3/attributes.hh"
 #include "exceptions/unrecognized_entity_exception.hh"
 #include "service/client_state.hh"
 #include <seastar/core/shared_ptr.hh>
@@ -105,6 +106,7 @@ private:
    ::shared_ptr<term::raw> _limit;
    ::shared_ptr<term::raw> _per_partition_limit;
    std::vector<::shared_ptr<cql3::column_identifier::raw>> _group_by_columns;
+    std::unique_ptr<cql3::attributes::raw> _attrs;
 public:
    select_statement(::shared_ptr<cf_name> cf_name,
            lw_shared_ptr<const parameters> parameters,
@@ -112,7 +114,8 @@ public:
            std::vector<::shared_ptr<relation>> where_clause,
            ::shared_ptr<term::raw> limit,
            ::shared_ptr<term::raw> per_partition_limit,
-            std::vector<::shared_ptr<cql3::column_identifier::raw>> group_by_columns);
+            std::vector<::shared_ptr<cql3::column_identifier::raw>> group_by_columns,
+            std::unique_ptr<cql3::attributes::raw> attrs);

    virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override {
        return prepare(db, stats, false);
--- a/cql3/statements/schema_altering_statement.cc
+++ b/cql3/statements/schema_altering_statement.cc
@@ -90,10 +90,10 @@ void schema_altering_statement::prepare_keyspace(const service::client_state& st
 }

 future<::shared_ptr<messages::result_message>>
-schema_altering_statement::execute0(service::storage_proxy& proxy, service::query_state& state, const query_options& options, bool is_local_only) const {
+schema_altering_statement::execute0(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const {
    // If an IF [NOT] EXISTS clause was used, this may not result in an actual schema change.  To avoid doing
    // extra work in the drivers to handle schema changes, we return an empty message in this case. (CASSANDRA-7600)
-    return announce_migration(proxy, is_local_only).then([this] (auto ce) {
+    return announce_migration(proxy).then([this] (auto ce) {
        ::shared_ptr<messages::result_message> result;
        if (!ce) {
            result = ::make_shared<messages::result_message::void_message>();
@@ -120,7 +120,7 @@ schema_altering_statement::execute(service::storage_proxy& proxy, service::query
        }
    }

-    return execute0(proxy, state, options, internal).then([this, &state, internal](::shared_ptr<messages::result_message> result) {
+    return execute0(proxy, state, options).then([this, &state, internal](::shared_ptr<messages::result_message> result) {
        auto permissions_granted_fut = internal
                ? make_ready_future<>()
                : grant_permissions_to_creator(state.get_client_state());
--- a/cql3/statements/schema_altering_statement.hh
+++ b/cql3/statements/schema_altering_statement.hh
@@ -65,7 +65,7 @@ private:
    const bool _is_column_family_level;

    future<::shared_ptr<messages::result_message>>
-    execute0(service::storage_proxy& proxy, service::query_state& state, const query_options& options, bool) const;
+    execute0(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const;
 protected:
    explicit schema_altering_statement(timeout_config_selector timeout_selector = &timeout_config::other_timeout);

@@ -87,7 +87,7 @@ protected:

    virtual void prepare_keyspace(const service::client_state& state) override;

-    virtual future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy, bool is_local_only) const = 0;
+    virtual future<::shared_ptr<cql_transport::event::schema_change>> announce_migration(service::storage_proxy& proxy) const = 0;

    virtual future<::shared_ptr<messages::result_message>>
    execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const override;
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -62,7 +62,7 @@
 #include "test/lib/select_statement_utils.hh"
 #include <boost/algorithm/cxx11/any_of.hpp>

-bool is_system_keyspace(const sstring& name);
+bool is_system_keyspace(std::string_view name);

 namespace cql3 {

@@ -138,7 +138,8 @@ select_statement::select_statement(schema_ptr schema,
                                   ordering_comparator_type ordering_comparator,
                                   ::shared_ptr<term> limit,
                                   ::shared_ptr<term> per_partition_limit,
-                                   cql_stats& stats)
+                                   cql_stats& stats,
+                                   std::unique_ptr<attributes> attrs)
    : cql_statement(select_timeout(*restrictions))
    , _schema(schema)
    , _bound_terms(bound_terms)
@@ -152,6 +153,7 @@ select_statement::select_statement(schema_ptr schema,
    , _ordering_comparator(std::move(ordering_comparator))
    , _stats(stats)
    , _ks_sel(::is_system_keyspace(schema->ks_name()) ? ks_selector::SYSTEM : ks_selector::NONSYSTEM)
+    , _attrs(std::move(attrs))
 {
    _opts = _selection->get_query_options();
    _opts.set_if<query::partition_slice::option::bypass_cache>(_parameters->bypass_cache());
@@ -159,6 +161,10 @@ select_statement::select_statement(schema_ptr schema,
    _opts.set_if<query::partition_slice::option::reversed>(_is_reversed);
 }

+db::timeout_clock::duration select_statement::get_timeout(const query_options& options) const {
+    return _attrs->is_timeout_set() ? _attrs->get_timeout(options) : options.get_timeout_config().*get_timeout_config_selector();
+}
+
 ::shared_ptr<const cql3::metadata> select_statement::get_result_metadata() const {
    // FIXME: COUNT needs special result metadata handling.
    return _selection->get_result_metadata();
@@ -170,9 +176,10 @@ uint32_t select_statement::get_bound_terms() const {

 future<> select_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const {
    try {
-        auto&& s = proxy.get_db().local().find_schema(keyspace(), column_family());
+        const database& db = proxy.local_db();
+        auto&& s = db.find_schema(keyspace(), column_family());
        auto& cf_name = s->is_view() ? s->view_info()->base_name() : column_family();
-        return state.has_column_family_access(keyspace(), cf_name, auth::permission::SELECT);
+        return state.has_column_family_access(db, keyspace(), cf_name, auth::permission::SELECT);
    } catch (const no_such_column_family& e) {
        // Will be validated afterwards.
        return make_ready_future<>();
@@ -252,10 +259,9 @@ uint64_t select_statement::do_get_limit(const query_options& options, ::shared_p
    if (val.is_unset_value()) {
        return default_limit;
    }
-  return with_linearized(*val, [&] (bytes_view bv) {
    try {
-        int32_type->validate(bv, options.get_cql_serialization_format());
-        auto l = value_cast<int32_t>(int32_type->deserialize(bv));
+        int32_type->validate(*val, options.get_cql_serialization_format());
+        auto l = value_cast<int32_t>(int32_type->deserialize(*val));
        if (l <= 0) {
            throw exceptions::invalid_request_exception("LIMIT must be strictly positive");
        }
@@ -263,7 +269,6 @@ uint64_t select_statement::do_get_limit(const query_options& options, ::shared_p
    } catch (const marshal_exception& e) {
        throw exceptions::invalid_request_exception("Invalid limit value");
    }
-  });
 }

 bool select_statement::needs_post_query_ordering() const {
@@ -366,7 +371,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    command->slice.options.set<query::partition_slice::option::allow_short_read>();
-    auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout_duration = get_timeout(options);
    auto timeout = db::timeout_clock::now() + timeout_duration;
    auto p = service::pager::query_pagers::pager(_schema, _selection,
            state, options, command, std::move(key_ranges), restrictions_need_filtering ? _restrictions : nullptr);
@@ -374,14 +379,14 @@ select_statement::do_execute(service::storage_proxy& proxy,
    if (aggregate || nonpaged_filtering) {
        return do_with(
                cql3::selection::result_set_builder(*_selection, now,
-                        options.get_cql_serialization_format(), *_group_by_cell_indices),
-                [this, p, page_size, now, timeout, restrictions_need_filtering](auto& builder) {
-                    return do_until([p] {return p->is_exhausted();},
-                            [p, &builder, page_size, now, timeout] {
+                        options.get_cql_serialization_format(), *_group_by_cell_indices), std::move(p),
+                [this, page_size, now, timeout, restrictions_need_filtering](auto& builder, std::unique_ptr<service::pager::query_pager>& p) {
+                    return do_until([&p] {return p->is_exhausted();},
+                            [&p, &builder, page_size, now, timeout] {
                                return p->fetch_page(builder, page_size, now, timeout);
                            }
-                    ).then([this, p, &builder, restrictions_need_filtering] {
-                        return builder.with_thread_if_needed([this, p, &builder, restrictions_need_filtering] {
+                    ).then([this, &p, &builder, restrictions_need_filtering] {
+                        return builder.with_thread_if_needed([this, &p, &builder, restrictions_need_filtering] {
                            auto rs = builder.build();
                            if (restrictions_need_filtering) {
                                _stats.filtered_rows_read_total += p->stats().rows_read_total;
@@ -402,7 +407,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    if (_selection->is_trivial() && !restrictions_need_filtering && !_per_partition_limit) {
-        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p] (result_generator generator) {
+        return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p = std::move(p)] (result_generator generator) {
            auto meta = [&] () -> shared_ptr<const cql3::metadata> {
                if (!p->is_exhausted()) {
                    auto meta = make_shared<metadata>(*_selection->get_result_metadata());
@@ -420,7 +425,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
    }

    return p->fetch_page(page_size, now, timeout).then(
-            [this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {
+            [this, p = std::move(p), &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

                if (!p->is_exhausted()) {
                    rs->get_metadata().set_paging_state(p->state());
@@ -447,7 +452,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
        return KeyType::make_empty();
    }

-    std::vector<bytes_view> exploded_base_key;
+    std::vector<managed_bytes_view> exploded_base_key;
    exploded_base_key.reserve(base_columns.size());

    for (const column_definition& base_col : base_columns) {
@@ -513,7 +518,7 @@ indexed_table_select_statement::do_execute_base_query(
        lw_shared_ptr<const service::pager::paging_state> paging_state) const {
    using value_type = std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>;
    auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state));
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    uint32_t queried_ranges_count = partition_ranges.size();
    service::query_ranges_to_vnodes_generator ranges_to_vnodes(proxy.get_token_metadata_ptr(), _schema, std::move(partition_ranges));

@@ -607,7 +612,7 @@ indexed_table_select_statement::do_execute_base_query(
        lw_shared_ptr<const service::pager::paging_state> paging_state) const {
    using value_type = std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>;
    auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state));
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);

    struct base_query_state {
        query::result_merger merger;
@@ -689,7 +694,7 @@ select_statement::execute(service::storage_proxy& proxy,
    // is specified we need to get "limit" rows from each partition since there
    // is no way to tell which of these rows belong to the query result before
    // doing post-query ordering.
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    if (needs_post_query_ordering() && _limit) {
        return do_with(std::forward<dht::partition_range_vector>(partition_ranges), [this, &proxy, &state, &options, cmd, timeout](auto& prs) {
            assert(cmd->partition_limit == query::max_partitions);
@@ -792,8 +797,9 @@ primary_key_select_statement::primary_key_select_statement(schema_ptr schema, ui
                                                           ordering_comparator_type ordering_comparator,
                                                           ::shared_ptr<term> limit,
                                                           ::shared_ptr<term> per_partition_limit,
-                                                           cql_stats &stats)
-    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit, per_partition_limit, stats}
+                                                           cql_stats &stats,
+                                                           std::unique_ptr<attributes> attrs)
+    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit, per_partition_limit, stats, std::move(attrs)}
 {
    if (_ks_sel == ks_selector::NONSYSTEM) {
        if (_restrictions->need_filtering() ||
@@ -819,7 +825,8 @@ indexed_table_select_statement::prepare(database& db,
                                        ordering_comparator_type ordering_comparator,
                                        ::shared_ptr<term> limit,
                                         ::shared_ptr<term> per_partition_limit,
-                                         cql_stats &stats)
+                                         cql_stats &stats,
+                                         std::unique_ptr<attributes> attrs)
 {
    auto& sim = db.find_column_family(schema).get_index_manager();
    auto [index_opt, used_index_restrictions] = restrictions->find_idx(sim);
@@ -845,7 +852,8 @@ indexed_table_select_statement::prepare(database& db,
            stats,
            *index_opt,
            std::move(used_index_restrictions),
-            view_schema);
+            view_schema,
+            std::move(attrs));

 }

@@ -861,8 +869,9 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema
                                                           cql_stats &stats,
                                                           const secondary_index::index& index,
                                                           ::shared_ptr<restrictions::restrictions> used_index_restrictions,
-                                                           schema_ptr view_schema)
-    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit, per_partition_limit, stats}
+                                                           schema_ptr view_schema,
+                                                           std::unique_ptr<attributes> attrs)
+    : select_statement{schema, bound_terms, parameters, selection, restrictions, group_by_cell_indices, is_reversed, ordering_comparator, limit, per_partition_limit, stats, std::move(attrs)}
    , _index{index}
    , _used_index_restrictions(used_index_restrictions)
    , _view_schema(view_schema)
@@ -878,7 +887,7 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema

 template<typename KeyType>
 requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
-static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_ck, const KeyType& base_key, const column_definition& index_cdef) {
+static void append_base_key_to_index_ck(std::vector<managed_bytes_view>& exploded_index_ck, const KeyType& base_key, const column_definition& index_cdef) {
    auto key_view = base_key.view();
    auto begin = key_view.begin();
    if ((std::is_same_v<KeyType, partition_key> && index_cdef.is_partition_key())
@@ -933,7 +942,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
        }
    }();

-    std::vector<bytes_view> exploded_index_ck;
+    std::vector<managed_bytes_view> exploded_index_ck;
    exploded_index_ck.reserve(_view_schema->clustering_key_size());

    bytes token_bytes;
@@ -1235,7 +1244,7 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy,

    auto p = service::pager::query_pagers::pager(_view_schema, selection,
            state, options, cmd, std::move(partition_ranges), nullptr);
-    return p->fetch_page(options.get_page_size(), now, timeout).then([p, &options, limit, now] (std::unique_ptr<cql3::result_set> rs) {
+    return p->fetch_page(options.get_page_size(), now, timeout).then([p = std::move(p), &options, limit, now] (std::unique_ptr<cql3::result_set> rs) {
        rs->get_metadata().set_paging_state(p->state());
        return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
    });
@@ -1250,7 +1259,7 @@ indexed_table_select_statement::find_index_partition_ranges(service::storage_pro
 {
    using value_type = std::tuple<dht::partition_range_vector, lw_shared_ptr<const service::pager::paging_state>>;
    auto now = gc_clock::now();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    return read_posting_list(proxy, options, get_limit(options), state, now, timeout, false).then(
            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
        auto rs = cql3::untyped_result_set(rows);
@@ -1291,7 +1300,7 @@ indexed_table_select_statement::find_index_clustering_rows(service::storage_prox
 {
    using value_type = std::tuple<std::vector<indexed_table_select_statement::primary_key>, lw_shared_ptr<const service::pager::paging_state>>;
    auto now = gc_clock::now();
-    auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
+    auto timeout = db::timeout_clock::now() + get_timeout(options);
    return read_posting_list(proxy, options, get_limit(options), state, now, timeout, true).then(
            [this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {

@@ -1318,13 +1327,23 @@ indexed_table_select_statement::find_index_clustering_rows(service::storage_prox

 namespace raw {

+static void validate_attrs(const cql3::attributes::raw& attrs) { // Rejects raw attributes that are not legal on a SELECT statement; throws exceptions::invalid_request_exception on the first offending attribute.
+    if (attrs.timestamp) { // a timestamp attribute is present on the raw attributes
+        throw exceptions::invalid_request_exception("Specifying TIMESTAMP is not legal for SELECT statement");
+    }
+    if (attrs.time_to_live) { // a time-to-live attribute is present on the raw attributes
+        throw exceptions::invalid_request_exception("Specifying TTL is not legal for SELECT statement");
+    }
+}
+
 select_statement::select_statement(::shared_ptr<cf_name> cf_name,
                                   lw_shared_ptr<const parameters> parameters,
                                   std::vector<::shared_ptr<selection::raw_selector>> select_clause,
                                   std::vector<::shared_ptr<relation>> where_clause,
                                   ::shared_ptr<term::raw> limit,
                                   ::shared_ptr<term::raw> per_partition_limit,
-                                   std::vector<::shared_ptr<cql3::column_identifier::raw>> group_by_columns)
+                                   std::vector<::shared_ptr<cql3::column_identifier::raw>> group_by_columns,
+                                   std::unique_ptr<attributes::raw> attrs)
    : cf_statement(std::move(cf_name))
    , _parameters(std::move(parameters))
    , _select_clause(std::move(select_clause))
@@ -1332,7 +1351,10 @@ select_statement::select_statement(::shared_ptr<cf_name> cf_name,
    , _limit(std::move(limit))
    , _per_partition_limit(std::move(per_partition_limit))
    , _group_by_columns(std::move(group_by_columns))
-{ }
+    , _attrs(std::move(attrs))
+{
+    validate_attrs(*_attrs);
+}

 void select_statement::maybe_jsonize_select_clause(database& db, schema_ptr schema) {
    // Fill wildcard clause with explicit column identifiers for as_json function
@@ -1403,6 +1425,8 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
    auto group_by_cell_indices = ::make_shared<std::vector<size_t>>(prepare_group_by(*schema, *selection));

    ::shared_ptr<cql3::statements::select_statement> stmt;
+    auto prepared_attrs = _attrs->prepare(db, keyspace(), column_family());
+    prepared_attrs->collect_marker_specification(bound_names);
    if (restrictions->uses_secondary_indexing()) {
        stmt = indexed_table_select_statement::prepare(
                db,
@@ -1416,7 +1440,8 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
                std::move(ordering_comparator),
                prepare_limit(db, bound_names, _limit),
                prepare_limit(db, bound_names, _per_partition_limit),
-                stats);
+                stats,
+                std::move(prepared_attrs));
    } else {
        stmt = ::make_shared<cql3::statements::primary_key_select_statement>(
                schema,
@@ -1429,7 +1454,8 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
                std::move(ordering_comparator),
                prepare_limit(db, bound_names, _limit),
                prepare_limit(db, bound_names, _per_partition_limit),
-                stats);
+                stats,
+                std::move(prepared_attrs));
    }

    auto partition_key_bind_indices = bound_names.get_partition_key_bind_indexes(*schema);
@@ -1447,7 +1473,7 @@ select_statement::prepare_restrictions(database& db,
 {
    try {
        return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
-            selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
+            selection->contains_only_static_columns(), for_view, allow_filtering);
    } catch (const exceptions::unrecognized_entity_exception& e) {
        if (contains_alias(e.entity)) {
            throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the where clause ('{}')", e.relation_str));
--- a/cql3/statements/select_statement.hh
+++ b/cql3/statements/select_statement.hh
@@ -96,6 +96,7 @@ protected:
    const ks_selector _ks_sel;
    bool _range_scan = false;
    bool _range_scan_no_bypass_cache = false;
+    std::unique_ptr<cql3::attributes> _attrs;
 protected :
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
        service::query_state& state, const query_options& options) const;
@@ -111,7 +112,8 @@ public:
            ordering_comparator_type ordering_comparator,
            ::shared_ptr<term> limit,
            ::shared_ptr<term> per_partition_limit,
-            cql_stats& stats);
+            cql_stats& stats,
+            std::unique_ptr<cql3::attributes> attrs);

    virtual ::shared_ptr<const cql3::metadata> get_result_metadata() const override;
    virtual uint32_t get_bound_terms() const override;
@@ -145,6 +147,8 @@ public:

    bool has_group_by() const { return _group_by_cell_indices && !_group_by_cell_indices->empty(); }

+    db::timeout_clock::duration get_timeout(const query_options& options) const; // Timeout duration to use for this statement with the given options: the attributes' timeout when one is set, otherwise the value taken from the options' timeout config.
+
 protected:
    uint64_t do_get_limit(const query_options& options, ::shared_ptr<term> limit, uint64_t default_limit) const;
    uint64_t get_limit(const query_options& options) const {
@@ -171,7 +175,8 @@ public:
                     ordering_comparator_type ordering_comparator,
                     ::shared_ptr<term> limit,
                     ::shared_ptr<term> per_partition_limit,
-                     cql_stats &stats);
+                     cql_stats &stats,
+                     std::unique_ptr<cql3::attributes> attrs);
 };

 class indexed_table_select_statement : public select_statement {
@@ -192,7 +197,8 @@ public:
                                                                    ordering_comparator_type ordering_comparator,
                                                                    ::shared_ptr<term> limit,
                                                                     ::shared_ptr<term> per_partition_limit,
-                                                                    cql_stats &stats);
+                                                                    cql_stats &stats,
+                                                                    std::unique_ptr<cql3::attributes> attrs);

    indexed_table_select_statement(schema_ptr schema,
                                   uint32_t bound_terms,
@@ -207,7 +213,8 @@ public:
                                   cql_stats &stats,
                                   const secondary_index::index& index,
                                   ::shared_ptr<restrictions::restrictions> used_index_restrictions,
-                                   schema_ptr view_schema);
+                                   schema_ptr view_schema,
+                                   std::unique_ptr<cql3::attributes> attrs);

 private:
    virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
--- a/cql3/statements/truncate_statement.cc
+++ b/cql3/statements/truncate_statement.cc
@@ -78,7 +78,7 @@ bool truncate_statement::depends_on_column_family(const sstring& cf_name) const

 future<> truncate_statement::check_access(service::storage_proxy& proxy, const service::client_state& state) const
 {
-    return state.has_column_family_access(keyspace(), column_family(), auth::permission::MODIFY);
+    return state.has_column_family_access(proxy.local_db(), keyspace(), column_family(), auth::permission::MODIFY); // Checks auth::permission::MODIFY on this statement's keyspace/table; the proxy's local database is now passed to the access check.
 }

 void truncate_statement::validate(service::storage_proxy&, const service::client_state& state) const
--- a/cql3/tuples.cc
+++ b/cql3/tuples.cc
@@ -85,8 +85,7 @@ tuples::in_value::from_serialized(const fragmented_temporary_buffer::view& value
    try {
        // Collections have this small hack that validate cannot be called on a serialized object,
        // but the deserialization does the validation (so we're fine).
-      return with_linearized(value_view, [&] (bytes_view value) {
-        auto l = value_cast<list_type_impl::native_type>(type.deserialize(value, options.get_cql_serialization_format()));
+        auto l = value_cast<list_type_impl::native_type>(type.deserialize(value_view, options.get_cql_serialization_format()));
        auto ttype = dynamic_pointer_cast<const tuple_type_impl>(type.get_elements_type());
        assert(ttype);

@@ -96,7 +95,6 @@ tuples::in_value::from_serialized(const fragmented_temporary_buffer::view& value
            elements.emplace_back(to_bytes_opt_vec(ttype->split(ttype->decompose(e))));
        }
        return tuples::in_value(elements);
-      });
    } catch (marshal_exception& e) {
        throw exceptions::invalid_request_exception(e.what());
    }
@@ -142,14 +140,12 @@ shared_ptr<terminal> tuples::in_marker::bind(const query_options& options) {
        auto& type = static_cast<const list_type_impl&>(*_receiver->type);
        auto& elem_type = static_cast<const tuple_type_impl&>(*type.get_elements_type());
        try {
-            with_linearized(*value, [&] (bytes_view v) {
-                type.validate(v, options.get_cql_serialization_format());
-                auto l = value_cast<list_type_impl::native_type>(type.deserialize(v, options.get_cql_serialization_format()));
+            type.validate(*value, options.get_cql_serialization_format());
+            auto l = value_cast<list_type_impl::native_type>(type.deserialize(*value, options.get_cql_serialization_format()));

-                for (auto&& element : l) {
-                    elem_type.validate(elem_type.decompose(element), options.get_cql_serialization_format());
-                }
-            });
+            for (auto&& element : l) {
+                elem_type.validate(elem_type.decompose(element), options.get_cql_serialization_format());
+            }
        } catch (marshal_exception& e) {
            throw exceptions::invalid_request_exception(e.what());
        }
--- a/cql3/tuples.hh
+++ b/cql3/tuples.hh
@@ -317,9 +317,7 @@ public:
            } else {
                auto& type = static_cast<const tuple_type_impl&>(*_receiver->type);
                try {
-                    with_linearized(*value, [&] (bytes_view v) {
-                        type.validate(v, options.get_cql_serialization_format());
-                    });
+                    type.validate(*value, options.get_cql_serialization_format());
                } catch (marshal_exception& e) {
                    throw exceptions::invalid_request_exception(
                            format("Exception while binding column {:s}: {:s}", _receiver->name->to_cql_string(), e.what()));
--- a/Show More
+++ b/Show More