Compare commits

...

72 Commits

Author SHA1 Message Date
Yaron Kaikov
6c0825e2a6 release: prepare for 4.6.11 2022-11-28 15:45:26 +02:00
Nadav Har'El
db3dd3bdf6 Merge 'cql3: don't ignore other restrictions when a multi column restriction is present during filtering' from Jan Ciołek
When filtering with a multi column restriction present, all other restrictions were ignored.
So a query like:
`SELECT * FROM tbl WHERE pk = 0 AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING;`
would ignore the restriction `regular_col = 0`.

This was caused by a bug in the filtering code:
2779a171fc/cql3/selection/selection.cc (L433-L449)

When multi column restrictions were detected, the code checked whether they were satisfied and returned immediately.
This is fixed by returning early only when these restrictions are not satisfied. When they are satisfied, the other restrictions are checked as well, to ensure all of them hold.

This code was introduced back in 2019, when fixing #3574.
Perhaps back then it was impossible to mix multi column and regular columns and this approach was correct.
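The corrected control flow can be illustrated with a small Python sketch (the names are illustrative only; Scylla's actual filtering code is the C++ in cql3/selection/selection.cc linked above):

```python
def row_matches(row, multi_column_restriction, other_restrictions):
    """Return True only when the row satisfies ALL restrictions.

    The buggy version returned as soon as the multi column restriction
    was satisfied, silently skipping other_restrictions.
    """
    if multi_column_restriction is not None and not multi_column_restriction(row):
        return False  # the fix: return early only on *failure*
    # Fall through: the remaining restrictions must also hold.
    return all(r(row) for r in other_restrictions)

# Mimics: ... AND (ck1, ck2) < (0, 0) AND regular_col = 0 ALLOW FILTERING
multi = lambda row: (row["ck1"], row["ck2"]) < (0, 0)
others = [lambda row: row["regular_col"] == 0]
```

With the fix, a row passing the multi column restriction but failing `regular_col = 0` is correctly rejected.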

Fixes: #6200
Fixes: #12014

Closes #12031

* github.com:scylladb/scylladb:
  cql-pytest: add a reproducer for #12014, verify that filtering multi column and regular restrictions works
  boost/restrictions-test: uncomment part of the test that passes now
  cql-pytest: enable test for filtering combined multi column and regular column restrictions
  cql3: don't ignore other restrictions when a multi column restriction is present during filtering

(cherry picked from commit 2d2034ea28)

Closes #12086
2022-11-27 00:15:04 +02:00
Pavel Emelyanov
4ad24180f5 Merge '[branch-4.6] multishard_mutation_query: don't unpop partition header of spent partition ' from Botond Dénes
When stopping the read, the multishard reader will dismantle the
compaction state, pushing back (unpopping) the currently processed
partition's header to its originating reader. This ensures that if the
reader stops in the middle of a partition, on the next page the
partition-header is re-emitted as the compactor (and everything
downstream from it) expects.
It can happen, however, that there is nothing more for the current
partition in the reader and the next fragment belongs to another
partition. Since we only push back the partition header (without a
partition-end), this can result in two partitions being emitted without
a partition-end separating them.
We could add the missing partition-end when needed, but that is
pointless: if the partition has no more data, we can simply drop the
header, since we won't need it on the next page.

The missing partition-end can generate an "IDL frame truncated" message
as it ends up causing the query result writer to create a corrupt
partition entry.

Fixes: https://github.com/scylladb/scylladb/issues/9482

Closes #11914

* github.com:scylladb/scylladb:
  test/cql-pytest: add regression test for "IDL frame truncated" error
  mutation_compactor: detach_state(): make it no-op if partition was exhausted
  treewide: fix headers
2022-11-16 11:52:51 +03:00
Anna Mikhlin
755c7eeb6a release: prepare for 4.6.10 2022-11-14 10:30:20 +02:00
Eliran Sinvani
8914ca8c58 cql: Fix crash upon use of the word empty for service level name
Wrong access to an uninitialized token instead of the actual
generated string caused the parser to crash. This wasn't
detected by the ANTLR3 compiler because all the temporary
variables defined in the ANTLR3 statements are global in the
generated code. This essentially caused a null dereference.

Tests: 1. The fixed issue scenario from github.
       2. Unit tests in release mode.

Fixes #11774

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190612133151.20609-1-eliransin@scylladb.com>

Closes #11777

(cherry picked from commit ab7429b77d)
2022-11-10 20:43:44 +02:00
Botond Dénes
e82e4bbed3 test/cql-pytest: add regression test for "IDL frame truncated" error
(cherry picked from commit 11af489e84)
2022-11-07 16:51:14 +02:00
Botond Dénes
f9c457778e mutation_compactor: detach_state(): make it no-op if partition was exhausted
detach_state() allows the user to resume a compaction process later,
without having to keep the compactor object alive. This happens by
generating and returning the mutation fragments the user has to re-feed
to a newly constructed compactor to bring it into the exact same state
the current compactor was at the point of stopping the compaction.
This state includes the partition-header (partition-start and static-row
if any) and the currently active range tombstone.
Detaching the state is pointless however when the compaction was stopped
such that the currently compacted partition was completely exhausted.
Allowing the state to be detached in this case seems benign but it
caused a subtle bug in the main user of this feature: the partition
range scan algorithm, where the fragments included in the detached state
were pushed back into the reader which produced them. If the partition
happened to be exhausted -- meaning the next fragment in the reader was
a partition-start or EOS -- this resulted in the partition being
re-emitted later without a partition-end, resulting in corrupt
query-result being generated, in turn resulting in an obscure "IDL frame
truncated" error.

This patch solves this seemingly benign but sinister bug by making the
return value of `detach_state()` an std::optional and returning a
disengaged optional when the partition was exhausted.
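The shape of the fix can be sketched in Python (hypothetical names and simplified fragment types; the real `detach_state()` is C++ returning an std::optional):

```python
from typing import List, Optional

def detach_state(partition_header: List[str],
                 active_range_tombstone: Optional[str],
                 partition_exhausted: bool) -> Optional[List[str]]:
    """Return the fragments a newly constructed compactor must be fed to
    resume from the same state, or None (a "disengaged optional") when
    the current partition was fully exhausted and nothing may be re-fed."""
    if partition_exhausted:
        return None  # the fix: detach no state for a spent partition
    state = list(partition_header)  # partition-start (+ static row, if any)
    if active_range_tombstone is not None:
        state.append(active_range_tombstone)
    return state
```

The caller (the partition range scan) then pushes nothing back into the reader when `None` is returned, so no header is re-emitted without its partition-end.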

(cherry picked from commit 70b4158ce0)
2022-11-07 16:51:14 +02:00
Botond Dénes
8315a7b164 treewide: fix headers
To fix CI.
2022-11-07 16:51:14 +02:00
Nadav Har'El
291ca8db60 cql3: fix cql3::util::maybe_quote() for keywords
cql3::util::maybe_quote() is a utility function formatting an identifier
name (table name, column name, etc.) that needs to be embedded in a CQL
statement - and might require quoting if it contains non-alphanumeric
characters, uppercase characters, or a CQL keyword.

maybe_quote() made an effort to only quote the identifier name if necessary,
e.g., a lowercase name usually does not need quoting. But lowercase names
that are CQL keywords - e.g., "to" or "where" - cannot be used as identifiers
without quoting. This can cause problems for code that wants to generate
CQL statements, such as the materialized-view problem in issue #9450 - where
a user had a column called "to" and wanted to create a materialized view
for it.

So in this patch we fix maybe_quote() to recognize invalid identifiers by
using the CQL parser, and quote them. This will quote reserved keywords,
but not so-called unreserved keywords, which *are* allowed as identifiers
and don't need quoting. This addition slows down maybe_quote(), but
maybe_quote() is anyway only used in heavy operations which need to
generate CQL.
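The quoting rule can be sketched in Python (the keyword sets below are tiny illustrative subsets, not Scylla's real keyword lists, and the real check uses the CQL parser rather than a regex):

```python
import re

# Illustrative subset of *reserved* keywords, which are not valid identifiers.
RESERVED_KEYWORDS = {"to", "where", "select", "from"}
# Unreserved keywords ("int", "ttl", ...) ARE allowed as identifiers,
# so they are deliberately absent from the set above.

def maybe_quote(identifier: str) -> str:
    """Quote an identifier only when CQL requires it: non-lowercase or
    non-alphanumeric characters, or a reserved keyword."""
    if (re.fullmatch(r"[a-z][a-z0-9_]*", identifier)
            and identifier not in RESERVED_KEYWORDS):
        return identifier
    return '"' + identifier.replace('"', '""') + '"'
```

So `maybe_quote("to")` yields a quoted name while `maybe_quote("int")` and plain lowercase names pass through unchanged.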

This patch also adds two tests that reproduce the bug and verify its
fix:

1. Add to the low-level maybe_quote() test (a C++ unit test) also tests
   that maybe_quote() quotes reserved keywords like "to", but doesn't
   quote unreserved keywords like "int".

2. Add a test reproducing issue #9450 - creating a materialized view
   whose key column is a keyword. This new test passes on Cassandra,
   failed on Scylla before this patch, and passes after this patch.

It is worth noting that maybe_quote() now has a "forward compatibility"
problem: If we save CQL statements generated by maybe_quote(), and a
future version introduces a new reserved keyword, the parser of the
future version may not be able to parse the saved CQL statement that
was generated with the old maybe_quote() and didn't quote what is now
a keyword. This problem can be solved in two ways:

1. Try hard not to introduce new reserved keywords. Instead, introduce
   unreserved keywords. We've been doing this even before recognizing
   this maybe_quote() future-compatibility problem.

2. In the next patch we will introduce quote() - which unconditionally
   quotes identifier names, even if lowercase. These quoted names will
   be uglier for lowercase names - but will be safe from future
   introduction of new keywords. So we can consider switching some or
   all uses of maybe_quote() to quote().

Fixes #9450

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220118161217.231811-1-nyh@scylladb.com>
(cherry picked from commit 5d2f694a90)
2022-11-07 10:38:10 +02:00
Jadw1
4da5fbaa24 CQL3: fromJson accepts string as bool
The problem was an incompatibility with Cassandra, which accepts a bool
as a string in the `fromJson()` function. The remaining difference between
Cassandra and Scylla is that Scylla accepts whitespace around the word
in the string, while Cassandra does not. Both are case insensitive.
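The accepted behavior can be sketched in Python (a simplified model of the parsing logic, not the actual C++ code):

```python
def json_string_to_bool(s: str) -> bool:
    """Parse "true"/"false" case-insensitively, tolerating surrounding
    whitespace (which Scylla accepts and Cassandra rejects)."""
    word = s.strip().lower()
    if word == "true":
        return True
    if word == "false":
        return False
    raise ValueError(f"cannot parse {s!r} as a boolean")
```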

Fixes: #7915
(cherry picked from commit 1902dbc9ff)
2022-11-07 10:38:10 +02:00
Takuya ASADA
fc16664d81 locator::ec2_snitch: Retry HTTP request to EC2 instance metadata service
The EC2 instance metadata service can be busy; let's retry the
connection with an interval, just like we do in scylla-machine-image.
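A retry loop of this shape can be sketched in Python (attempt count and interval are illustrative; the real snitch code is C++):

```python
import time

def fetch_with_retry(fetch, attempts=5, interval=1.0):
    """Call fetch() up to `attempts` times, sleeping `interval` seconds
    between tries, since the metadata service may be transiently busy."""
    last_error = None
    for i in range(attempts):
        try:
            return fetch()
        except ConnectionError as e:
            last_error = e
            if i + 1 < attempts:
                time.sleep(interval)
    raise last_error
```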

Fixes #10250

Signed-off-by: Takuya ASADA <syuu@scylladb.com>

Closes #11688

(cherry picked from commit 6b246dc119)
(cherry picked from commit e2809674d2)
2022-11-06 15:43:58 +02:00
Botond Dénes
80bea5341e Merge 'Alternator, MV: fix bug in some view updates which set the view key to its existing value' from Nadav Har'El
As described in issue #11801, we saw cases in Alternator where, when a GSI has both partition and sort keys which were non-key attributes in the base table, updating the GSI-sort-key attribute to the same value it already had caused the entire GSI row to be deleted.

In this series we fix this bug (it was a bug in our materialized views implementation) and add a reproducing test (plus a few more tests for similar situations, which worked before the patch and continue to work after it).

Fixes #11801

Closes #11808

* github.com:scylladb/scylladb:
  test/alternator: add test for issue 11801
  MV: fix handling of view update which reassign the same key value
  materialized views: inline used-once and confusing function, replace_entry()

(cherry picked from commit e981bd4f21)
2022-11-01 13:31:51 +02:00
Botond Dénes
6ecc772b56 mutation_partition: deletable_row::apply(shadowable_tombstone): remove redundant maybe_shadow()
Shadowing is already checked by the underlying row_tombstone::apply().
This redundant check was introduced by a previous fix to #9483
(6a76e12768). The rest of that patch is
good.

Refs: #9483
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20211115091513.181233-1-bdenes@scylladb.com>
(cherry picked from commit b136746040)
2022-10-16 11:53:04 +03:00
Benny Halevy
0b2e951954 range_tombstone_list: insert_from: correct rev.update range_tombstone in not overlapping case
The 2nd std::move(start) looks like a typo
in fe2fa3f20d.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220404124741.1775076-1-bhalevy@scylladb.com>
(cherry picked from commit 2d80057617)
2022-10-14 12:29:56 +02:00
Pavel Emelyanov
f2a738497f compaction_manager: Swallow ENOSPCs in ::stop()
When being stopped, the compaction manager may run into ENOSPC. This is
not a reason to fail the stopping process with an abort; it is better to
warn about it in the logs and proceed as if nothing happened.

refs: #11245

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 16:02:33 +03:00
Pavel Emelyanov
badf7c816f exceptions: Mark storage_io_error::code() with noexcept
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 16:02:32 +03:00
Pavel Emelyanov
bfb86f2c78 compaction_manager: Shuffle really_do_stop()
Make it a future-returning method and set up the _stop_future in its
only caller. This makes the next patch much simpler.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
2022-10-13 16:02:31 +03:00
Beni Peled
18e7a46038 release: prepare for 4.6.9 2022-10-09 08:54:33 +03:00
Nadav Har'El
cbcfa31e51 cql: validate bloom_filter_fp_chance up-front
Scylla's Bloom filter implementation has a minimal false-positive rate
that it can support (6.71e-5). When setting bloom_filter_fp_chance any
lower than that, the compute_bloom_spec() function, which writes the bloom
filter, throws an exception. However, this is too late - it only happens
while flushing the memtable to disk, and a failure at that point causes
Scylla to crash.

Instead, we should refuse the table creation with the unsupported
bloom_filter_fp_chance. This is also what Cassandra did six years ago -
see CASSANDRA-11920.
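The up-front check can be sketched in Python (the exact bounds and error text are an assumption for illustration; only the 6.71e-5 minimum comes from the commit message):

```python
# Minimum false-positive rate Scylla's bloom filter implementation supports.
MIN_SUPPORTED_FP_CHANCE = 6.71e-5

def validate_bloom_filter_fp_chance(fp_chance: float) -> None:
    """Reject unsupported values at CREATE/ALTER TABLE time instead of
    crashing later, during the memtable flush."""
    if not (MIN_SUPPORTED_FP_CHANCE <= fp_chance <= 1.0):
        raise ValueError(
            f"bloom_filter_fp_chance {fp_chance} is out of the supported "
            f"range [{MIN_SUPPORTED_FP_CHANCE}, 1.0]")
```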

This patch also includes a regression test, which crashes Scylla before
this patch but passes after the patch (and also passes on Cassandra).

Fixes #11524.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11576

(cherry picked from commit 4c93a694b7)
2022-10-04 16:23:25 +03:00
Nadav Har'El
5ee69ff3a9 alternator: return ProvisionedThroughput in DescribeTable
DescribeTable is currently hard-coded to return PAY_PER_REQUEST billing
mode. Nevertheless, even in PAY_PER_REQUEST mode, the DescribeTable
operation must return a ProvisionedThroughput structure, listing both
ReadCapacityUnits and WriteCapacityUnits as 0. This requirement is not
stated in some DynamoDB documentation but is explicitly mentioned in
https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_ProvisionedThroughput.html
Empirically, DynamoDB also returns ProvisionedThroughput with zeros
even in PAY_PER_REQUEST mode. We even had an xfailing test to confirm this.

The ProvisionedThroughput structure being missing was a problem for
applications like DynamoDB connectors for Spark, if they implicitly
assume that ProvisionedThroughput is returned by DescribeTable, and
fail (as described in issue #11222) if it's outright missing.

So this patch adds the missing ProvisionedThroughput structure, and
the xfailing test starts to pass.

Note that this patch doesn't change the fact that attempting to set
a table to PROVISIONED billing mode is ignored: DescribeTable continues
to always return PAY_PER_REQUEST as the billing mode and zero as the
provisioned capacities.
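The relevant piece of the DescribeTable response, after the fix, looks roughly like this (a minimal sketch of the JSON shape; the full response contains many more fields):

```python
def describe_table_extras() -> dict:
    """DescribeTable always reports PAY_PER_REQUEST, and must still
    include ProvisionedThroughput with both capacities at 0."""
    return {
        "BillingModeSummary": {"BillingMode": "PAY_PER_REQUEST"},
        "ProvisionedThroughput": {
            "ReadCapacityUnits": 0,
            "WriteCapacityUnits": 0,
        },
    }
```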

Fixes #11222

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes #11298

(cherry picked from commit 941c719a23)
2022-10-03 14:29:22 +03:00
Tomasz Grabiec
949103d22a test: lib: random_mutation_generator: Don't generate mutations with marker uncompacted with shadowable tombstone
The generator first set the marker and then applied tombstones.

The marker was set like this:

  row.marker() = random_row_marker();

Later, when shadowable tombstones were applied, they were compacted
with the marker as expected.

However, the key for the row was chosen randomly in each iteration
from a set of multiple keys, so there was a possibility of a key clash
with an earlier row. This could overwrite the marker without applying
any tombstones (tombstone application is conditional on a random
choice).

This could generate rows with markers uncompacted with shadowable tombstones.

This broke row_cache_test::test_concurrent_reads_and_eviction on
comparison between the expected and the read mutations. The latter were
compacted because they went through an extra merge path, which compacts
the row.

Fix by making sure there are no key clashes.

Closes #11663

(cherry picked from commit 5268f0f837)
2022-10-03 09:00:28 +03:00
Botond Dénes
549cb60f4c sstables: crawling mx-reader: make on_out_of_clustering_range() no-op
Said method currently emits a partition-end. This method is only called
when the last fragment in the stream is a range tombstone change with a
position after all clustered rows. The problem is that
consume_partition_end() is also called unconditionally, resulting in two
partition-end fragments being emitted. The fix is simple: make this
method a no-op, there is nothing to do there.

Also add two tests: one targeted to this bug and another one testing the
crawling reader with random mutations generated for random schema.

Fixes: #11421

Closes #11422

(cherry picked from commit be9d1c4df4)
2022-09-30 17:56:58 +03:00
Botond Dénes
37633c5576 test/lib/random_schema: add a simpler overload for fixed partition count
Some tests want to generate a fixed number of random partitions; make
their life easier.

(cherry picked from commit 98f3d516a2)

Ref #11421 (prerequisite)
2022-09-30 17:56:10 +03:00
Michael Livshin
abd9f43fa7 batchlog_manager: warn when a batch fails to replay
Warn only for reasons other than "no such KS", i.e. when the failure is
presumed transient and the batch in question is not deleted from the
batchlog and will be retried in the future.

(Would info be more appropriate here than warning?)

Signed-off-by: Michael Livshin <michael.livshin@scylladb.com>

Closes #10556

Fixes #10636

(cherry picked from commit 00ed4ac74c)
2022-09-29 12:13:21 +03:00
Raphael S. Carvalho
d41d4db5c0 compaction: Make cleanup withstand better disk pressure scenario
It's not uncommon for cleanup to be issued against an entire keyspace,
which may be composed of tons of tables. To increase the chances of
success when low on space, cleanup will now start from the smaller
tables first, so that the bigger tables will have more space available,
once they're reached, to satisfy their space requirement.

parallel_for_each() is dropped; it wasn't needed, given that the manager
performs per-shard serialization of cleanup jobs.
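The ordering idea can be sketched in Python (hypothetical table records; the real code sorts sstable-backed tables in C++):

```python
def cleanup_order(tables):
    """Order cleanup candidates smallest-first, so that by the time the
    big tables are reached, more disk space has been reclaimed for them."""
    return sorted(tables, key=lambda t: t["size"])
```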

Refs #9504.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20211130133712.64517-1-raphaelsc@scylladb.com>
(cherry picked from commit 0d5ac845e1)
2022-09-29 10:15:29 +03:00
Michał Radwański
c500043a78 flat_mutation_reader: allow destructing readers which are not closed and didn't initiate any IO.
In functions such as upgrade_to_v2 (excerpt below), if the constructor
of transforming_reader throws, r needs to be destroyed even though it
hasn't been closed. However, if a reader didn't initiate any operations,
it is safe to destruct it. This issue can potentially manifest itself in
many more readers and might be hard to track down. This commit adds a
bool indicating whether a close is anticipated, thus avoiding errors in
the destructor.

Code excerpt:
flat_mutation_reader_v2 upgrade_to_v2(flat_mutation_reader r) {
    class transforming_reader : public flat_mutation_reader_v2::impl {
        // ...
    };
    return make_flat_mutation_reader_v2<transforming_reader>(std::move(r));
}
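The destructor-safety rule can be sketched in Python (illustrative class and method names; the real readers are C++ and the check lives in their destructor):

```python
class SketchReader:
    """A reader may be destroyed without close() only if it never
    initiated any operation (e.g. its owner's constructor threw)."""
    def __init__(self):
        self._io_started = False
        self._closed = False

    def fill_buffer(self):
        self._io_started = True  # any operation makes close() mandatory

    def close(self):
        self._closed = True

    def safe_to_destroy(self) -> bool:
        return self._closed or not self._io_started
```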

Fixes #9065.

(cherry picked from commit 9ada63a9cb)
2022-09-29 09:40:07 +03:00
Pavel Emelyanov
af4752a526 messaging_service: Fix gossiper verb group
When configuring tcp-nodelay unconditionally, the messaging service
assumes the gossiper uses group index 1, though that changed some time
ago and those verbs now belong to group 0.

fixes: #11465

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit 2c74062962)
2022-09-19 10:32:49 +03:00
Anna Mikhlin
0aa9a8c266 release: prepare for 4.6.8 2022-09-19 09:30:09 +03:00
Michał Chojnowski
85fd6ab377 sstables: add a flag for disabling long-term index caching
Long-term index caching in the global cache, as introduced in 4.6, is a major
pessimization for workloads where accesses to the index are (spatially) sparse.
We want a way to disable it for the affected workloads.

There is already infrastructure in place for disabling it for BYPASS CACHE
queries. One way of solving the issue is hijacking that infrastructure.

This patch adds a global flag (and a corresponding CLI option) which controls
index caching. Setting the flag to `false` causes all index reads to behave
like they would in BYPASS CACHE queries.

Consequences of this choice:

- The per-SSTable partition_index_cache is unused. Every index_reader has
  its own, and they die together. Independent reads can no longer reuse the
  work of other reads which hit the same index pages. This is not crucial,
  since partition accesses have no (natural) spatial locality. Note that
  the original reason for partition_index_cache -- the ability to share
  reads for the lower and upper bound of the query -- is unaffected.
- The per-SSTable cached_file is unused. Every index_reader has its own
  (uncached) input stream from the index file, and every
  bsearch_clustered_cursor has its own cached_file, which dies together with
  the cursor. Note that the cursor still can perform its binary search with
  caching. However, it won't be able to reuse the file pages read by
  index_reader. In particular, if the promoted index is small, and fits inside
  the same file page as its index_entry, that page will be re-read.
  It can also happen that index_reader will read the same index file page
  multiple times. When the summary is so dense that multiple index pages fit in
  one index file page, advancing the upper bound, which reads the next index
  page, will read the same index file page. Since summary:disk ratio is 1:2000,
  this is expected to happen for partitions with size greater than 2000
  partition keys.

Fixes #11202

(cherry picked from commit cdb3e71045)
2022-09-18 13:30:28 +03:00
Beni Peled
7c79c513d1 release: prepare for 4.6.7 2022-09-07 11:17:55 +03:00
Karol Baryła
9a8e73f0c3 transport/server.cc: Return correct size of decompressed lz4 buffer
An incorrect size is returned from the function, which could lead to
crashes or undefined behavior. Fix by erroring out in these cases.

Fixes #11476

(cherry picked from commit 1c2eef384d)
2022-09-07 10:58:54 +03:00
Benny Halevy
fac0443200 snapshot-ctl: run_snapshot_modify_operation: reject views and secondary index using the schema
Detecting a secondary index by checking for a dot
in the table name is wrong, as tables generated by Alternator
may contain a dot in their name.

Instead, detect both materialized views and secondary indexes
using the schema()->is_view() method.

Fixes #10526

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit aa127a2dbb)
2022-09-06 17:56:30 +03:00
Piotr Sarna
6bcfef2cfa cql3: fix misleading error message for service level timeouts
The error message incorrectly stated that the timeout value cannot
be longer than 24h, but it can - the actual restriction is that the
value cannot be expressed in units like days or months. This was done
in order to significantly simplify the parsing routines (and because
timeouts counted in days are not expected to be common).
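The actual restriction can be sketched in Python (an illustrative subset of CQL duration units, not Scylla's real parser):

```python
import re

# Days and larger units are deliberately absent to keep parsing simple.
UNIT_MS = {"ms": 1, "s": 1_000, "m": 60_000, "h": 3_600_000}

def parse_timeout_ms(text: str) -> int:
    """Parse a duration like '30s' or '25h' into milliseconds, rejecting
    units such as days or months."""
    match = re.fullmatch(r"(\d+)(ms|s|m|h)", text)
    if match is None:
        raise ValueError(
            f"{text!r}: timeout must use ms/s/m/h "
            "(units like days or months are not supported)")
    return int(match.group(1)) * UNIT_MS[match.group(2)]
```

Note that a value longer than 24h is fine, as long as it is expressed in hours or smaller units.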

Fixes #10286

Closes #10294

(cherry picked from commit 85e95a8cc3)
2022-09-01 20:34:22 +03:00
Juliusz Stasiewicz
d2c67a2429 cdc/check_and_repair_cdc_streams: ignore LEFT endpoints
When `check_and_repair_cdc_streams` encountered a node with status LEFT, Scylla
would throw. This behavior is fixed so that LEFT nodes are simply ignored.

Fixes #9771

Closes #9778

(cherry picked from commit 351f142791)
2022-09-01 15:44:35 +03:00
Avi Kivity
d6c2f228e7 Merge 'row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy' from Tomasz Grabiec
Scenario:

cache = [
    row(pos=2, continuous=false),
    row(pos=after(2), dummy=true)
]

Scanning read starts, starts populating [-inf, before(2)] from sstables.

row(pos=2) is evicted.

cache = [
    row(pos=after(2), dummy=true)
]

Scanning read finishes reading from sstables.

Refreshes cache cursor via
partition_snapshot_row_cursor::maybe_refresh(), which calls
partition_snapshot_row_cursor::advance_to() because iterators are
invalidated. This advances the cursor to
after(2). no_clustering_row_between(2, after(2)) returns true, so
advance_to() returns true, and maybe_refresh() returns true. This is
interpreted by the cache reader as "the cursor has not moved forward",
so it marks the range as complete, without emitting the row with
pos=2. Also, it marks row(pos=after(2)) as continuous, so later reads
will also miss the row.

The bug is in advance_to(), which is using
no_clustering_row_between(a, b) to determine its result, which by
definition excludes the starting key.

Discovered by row_cache_test.cc::test_concurrent_reads_and_eviction
with reduced key range in the random_mutation_generator (1024 -> 16).

Fixes #11239

Closes #11240

* github.com:scylladb/scylladb:
  test: mvcc: Fix illegal use of maybe_refresh()
  tests: row_cache_test: Add test_eviction_of_upper_bound_of_population_range()
  tests: row_cache_test: Introduce one_shot mode to throttle
  row_cache: Fix missing row if upper bound of population range is evicted and has adjacent dummy
2022-08-11 19:19:30 +02:00
Yaron Kaikov
a1b1df2074 release: prepare for 4.6.6 2022-08-07 16:24:51 +03:00
Avi Kivity
14e13ecbd4 Merge 'Backport: Fix map subscript crashes when map or subscript is null' from Nadav Har'El
This is a backport of https://github.com/scylladb/scylla/pull/10420 to branch 5.0.
Branch 5.0 had somewhat different code in this expression area, so the backport was not automatic, but it was nevertheless fairly straightforward - just copy the exact same checking code to its right place, and keep the exact same tests to see that we indeed fixed the bug.

Refs #10535.

The original cover letter from https://github.com/scylladb/scylla/pull/10420:

In the filtering expression "WHERE m[?] = 2", our implementation was buggy when either the map, or the subscript, was NULL (and also when the latter was an UNSET_VALUE). Our code ended up dereferencing null objects, yielding bizarre errors when we were lucky, or crashes when we were less lucky - see examples of both in issues https://github.com/scylladb/scylla/issues/10361, https://github.com/scylladb/scylla/issues/10399, https://github.com/scylladb/scylla/pull/10401. The existing test test_null.py::test_map_subscript_null reproduced all these bugs sporadically.

In this series we improve the test to reproduce the separate bugs separately, and also reproduce additional problems (like the UNSET_VALUE). We then define both m[NULL] and NULL[2] to result in NULL instead of the existing undefined (and buggy, and crashing) behavior. This new definition is consistent with our usual SQL-inspired tradition that NULL "wins" in expressions - e.g., NULL < 2 is also defined as resulting in NULL.

However, this decision differs from Cassandra, where m[NULL] is considered an error but NULL[2] is allowed. We believe that making m[NULL] be a NULL instead of an error is more consistent, and moreover - necessary if we ever want to support more complicated expressions like m[a], where the column a can be NULL for some rows and non-NULL for others, and it doesn't make sense to return an "invalid query" error in the middle of the scan.
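The chosen semantics can be sketched in Python, modeling CQL NULL as None (a simplified model, not the expression-evaluation C++ code):

```python
NULL = None  # stand-in for CQL NULL

def subscript(map_value, key):
    """NULL "wins": m[NULL] and NULL[k] both evaluate to NULL instead of
    raising an error, like NULL < 2 evaluating to NULL."""
    if map_value is NULL or key is NULL:
        return NULL
    return map_value.get(key, NULL)  # a missing key also yields NULL
```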

Fixes https://github.com/scylladb/scylla/issues/10361
Fixes https://github.com/scylladb/scylla/issues/10399
Fixes https://github.com/scylladb/scylla/pull/10401

Closes #11142

* github.com:scylladb/scylla:
  test/cql-pytest: reproducer for CONTAINS NULL bug
  expressions: don't dereference invalid map subscript in filter
  expressions: fix invalid dereference in map subscript evaluation
  test/cql-pytest: improve tests for map subscripts and nulls

(cherry picked from commit 23a34d7e42)
2022-07-31 15:44:00 +03:00
Benny Halevy
b8740bde6e multishard_mutation_query: do_query: stop ctx if lookup_readers fails
lookup_readers might fail after populating some readers,
and those had better be closed before returning the exception.

Fixes #10351

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10425

(cherry picked from commit 055141fc2e)
2022-07-25 14:52:58 +03:00
Benny Halevy
1b23f8d038 sstables: time_series_sstable_set: insert: make exception safe
Need to erase the shared sstable from _sstables
if insertion to _sstables_reversed fails.
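The exception-safety pattern can be sketched in Python (hypothetical container names mirroring the commit message; the real code manipulates C++ sstable sets):

```python
def insert_into_both(sstables, sstables_reversed, sst):
    """Keep the two containers consistent: if the second insertion
    throws, roll back the first one before propagating."""
    sstables.append(sst)
    try:
        sstables_reversed.insert(0, sst)  # may throw, e.g. on allocation
    except Exception:
        sstables.remove(sst)  # undo the first insertion
        raise
```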

Fixes #10787

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit cd68b04fbf)
2022-07-25 14:22:08 +03:00
Tomasz Grabiec
05a228e4c5 memtable: Fix missing range tombstones during reads under certain rare conditions
There is a bug introduced in e74c3c8 (4.6.0) which makes the memtable
reader skip a range tombstone for a certain pattern of deletions
and under a certain sequence of events.

_rt_stream contains the result of deoverlapping range tombstones which
had the same position, which were sipped from all the versions. The
result of deoverlapping may produce a range tombstone which starts
later, at the same position as a more recent tombstone which has not
been sipped from the partition version yet. If we consume the old
range tombstone from _rt_stream and then refresh the iterators, the
refresh will skip over the newer tombstone.

The fix is to drop the logic which drains _rt_stream so that
_rt_stream is always merged with partition versions.

For the problem to trigger, there have to be multiple MVCC versions
(at least 2) which contain deletions of the following form:

[a, c] @ t0
[a, b) @ t1, [b, d] @ t2

c > b

The proper sequence for such versions is (assuming d > c):

[a, b) @ t1,
[b, d] @ t2

Due to the bug, the reader will produce:

[a, b) @ t1,
[b, c] @ t0

The reader also needs to be preempted right before processing [b, d] @
t2 and iterators need to get invalidated so that
lsa_partition_reader::do_refresh_state() is called and it skips over
[b, d] @ t2. Otherwise, the reader will emit [b, d] @ t2 later. If it
does emit the proper range tombstone, it's possible that it will violate
fragment order in the stream if _rt_stream accumulated remainders
(possible with 3 MVCC versions).

The problem goes away once MVCC versions merge.

Fixes #10913
Fixes #10830

Closes #10914

(cherry picked from commit a6aef60b93)

[avi: backport prerequisite position_range_to_clustering_range() too]
2022-07-19 19:27:15 +03:00
Yaron Kaikov
2ec293ab0e release: prepare for 4.6.5 2022-07-19 16:02:46 +03:00
Pavel Emelyanov
b60f14601e azure_snitch: Do nothing on non-io-cpu
All snitch drivers are supposed to snitch info on some shard and
replicate the dc/rack info across the others. All but the Azure one
really do so. The Azure one gets dc/rack on all shards, which is
excessive but not terrible; but when all shards start to replicate
their data to all the others, this may lead to use-after-frees.

fixes: #10494

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
(cherry picked from commit c6d0bc87d0)
2022-07-17 14:22:29 +03:00
Raphael S. Carvalho
284dd21ef7 compaction_manager: Fix race when selecting sstables for rewrite operations
Rewrite operations are scrub, cleanup and upgrade.

A race can happen because 'selection of sstables' and 'mark sstables as
compacting' are decoupled, so any deferring point in between can lead
to a parallel compaction picking the same files. After commit 2cf0c4bbf,
files are marked as compacting before rewrite starts, but that didn't
take into account commit c84217ad, which moved retrieval of
candidates to a deferring thread, before rewrite_sstables() is even
called.

Scrub isn't affected by this because it uses a coarse grained approach
where whole operation is run with compaction disabled, which isn't good
because regular compaction cannot run until its completion.

From now on, selection of files and marking them as compacting will
be serialized by running them with compaction disabled.

Now cleanup will also retrieve sstables with compaction disabled,
meaning it will no longer leave uncleaned files behind, which is
important to avoid data resurrection if node regains ownership of
data in uncleaned files.

Fixes #8168.
Refs #8155.

[backport notes:
- minor conflict around run_with_compaction_disabled()
- bumped into our old friend
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95111,
so I had to use std::ref() on local copy of lambda
- with the yielding part of candidate retrieval now happening in
rewrite_sstables(), task registration is moved to after run_with_
compaction_disabled() call, so the latter won't incorrectly try
to stop the task that called it, which triggers an assert in
debug mode.
]

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20211129133107.53011-1-raphaelsc@scylladb.com>
(cherry picked from commit 80a1ebf0f3)

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #10963
2022-07-13 18:45:36 +03:00
Pavel Emelyanov
8b52f1d6e7 view: Fix trace-state pointer use after move
It's moved into .mutate_locally(), but it is captured and used in its
continuation. It happens to work only because a moved-from pointer looks
like nullptr, and all the tracing code checks that it is non-null.

tests: https://jenkins.scylladb.com/job/releng/job/Scylla-CI/1266/
       (CI job failed on post-actions thus it's red)

Fixes #11015

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20220711134152.30346-1-xemul@scylladb.com>
(cherry picked from commit 5526738794)
2022-07-12 14:21:11 +03:00
Piotr Sarna
157951f756 view: exclude using static columns in the view filter
The code that applied view filtering (i.e. a condition placed
on a view column, e.g. "WHERE v = 42") erroneously used a wildcard
selection, which also assumes that static columns are needed
if the base table contains any such columns.
The filtering code currently assumes that no such columns are fetched,
so the selection is amended to ask only for regular columns
(primary key columns are sent anyway, because they are enabled
via slice options, so there is no need to ask for them explicitly).

Fixes #10851

Closes #10855

(cherry picked from commit bc3a635c42)
2022-07-11 17:07:22 +03:00
Juliusz Stasiewicz
4f643ed4a5 cdc: check_and_repair_cdc_streams: regenerate if too many streams are present
If the number of streams exceeds the number of token ranges,
it indicates that spurious streams from decommissioned
nodes are present.

In such a situation, simply regenerate.

Fixes #9772

Closes #9780

(cherry picked from commit ea46439858)
2022-07-07 18:53:14 +02:00
Avi Kivity
b598629b7f messaging: do isolate default tenants
In 10dd08c9 ("messaging_service: supply and interpret rpc isolation_cookies",
4.2), we added a mechanism to perform rpc calls in remote scheduling groups
based on the connection identity (rather than the verb), so that
connection processing itself can run in the correct group (not just
verb processing), and so that one verb can run in different groups according
to need.

In 16d8cdadc ("messaging_service: introduce the tenant concept", 4.2), we
changed the way isolation cookies are sent:

 scheduling_group
 messaging_service::scheduling_group_for_verb(messaging_verb verb) const {
     return _scheduling_info_for_connection_index[get_rpc_client_idx(verb)].sched_group;
@@ -665,11 +694,14 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
     if (must_compress) {
         opts.compressor_factory = &compressor_factory;
     }
     opts.tcp_nodelay = must_tcp_nodelay;
     opts.reuseaddr = true;
-    opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
+    // We send cookies only for non-default statement tenant clients.
+    if (idx > 3) {
+        opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
+    }

This effectively disables the mechanism for the default tenant. As a
result, some verbs will be executed in whatever group the messaging
service listener was started in. This used to be the main group,
but in 554ab03 ("main: Run init_server and join_cluster inside
maintenance scheduling group", 4.5), it was changed to the maintenance
group. As a result, normal reads/writes now compete with maintenance
operations, raising their latency significantly.

Fix by sending the isolation cookie for all connections. With this,
a 2-node cassandra-stress load sees its 99th percentile latency increase
by just 3ms during repair, compared to 10ms+ before.

Fixes #9505.

Closes #10673

(cherry picked from commit c83393e819)
2022-07-05 13:42:10 +03:00
Nadav Har'El
43f82047b9 Merge 'types: fix is_string for reversed types' from Piotr Sarna
Checking whether a type is a string is subtly broken for reversed types:
such types are not recognized as strings, even though they are.
As a result, if somebody creates a column with DESC order and then
tries to use the LIKE operator on it, the query fails because the type
is not recognized as a string.

Fixes #10183

Closes #10181

* github.com:scylladb/scylla:
  test: add a case for LIKE operator on a descending order column
  types: fix is_string for reversed types

(cherry picked from commit 733672fc54)
2022-07-03 17:59:56 +03:00
Benny Halevy
ec3c07de6e compaction_manager: perform_offstrategy: run_offstrategy_compaction in maintenance scheduling group
It was assumed that offstrategy compaction is always triggered by streaming/repair,
where it would inherit the caller's scheduling group.

However, offstrategy compaction is also triggered by a timer via table::_off_strategy_trigger,
and the expiration of this timer does not inherit anything from streaming/repair.

Also, since d309a86, offstrategy compaction
may be triggered by the API, where it runs in the default scheduling group.

The bottom line is that the compaction manager needs to explicitly perform offstrategy compaction
in the maintenance scheduling group similar to `perform_sstable_scrub_validate_mode`.

Fixes #10151

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20220302084821.2239706-1-bhalevy@scylladb.com>
(cherry picked from commit 0764e511bb)
2022-07-03 14:30:54 +03:00
Takuya ASADA
82572e8cfe scylla_coredump_setup: support new format of Storage field
The Storage field of "coredumpctl info" changed in systemd v248: it
appends "(present)" at the end of the line when the coredump file is
available.

Fixes #10669

Closes #10714

(cherry picked from commit ad2344a864)
2022-07-03 13:55:25 +03:00
Nadav Har'El
2b9ed79c6f alternator: forbid empty AttributesToGet
In DynamoDB one can retrieve only a subset of the attributes using the
AttributesToGet or ProjectionExpression parameters to read requests.
Neither allows an empty list of attributes; if you don't want any
attributes, you should use Select=COUNT instead.

Currently we correctly refuse an empty ProjectionExpression - and have
a test for it:
test_projection_expression.py::test_projection_expression_toplevel_syntax

However, Alternator is missing the same empty-forbidding logic for
AttributesToGet. An empty AttributesToGet is currently allowed and
effectively says "retrieve everything", which is unexpected.

So this patch adds the missing logic, and the missing test (actually
two tests for the same thing - one using GetItem and the other Query).

Fixes #10332

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20220405113700.9768-1-nyh@scylladb.com>
(cherry picked from commit 9c1ebdceea)
2022-07-03 13:36:02 +03:00
Avi Kivity
ab0b6fd372 Update seastar submodule (json crash in describe_ring)
* seastar 7a430a0830...8b2c13b346 (1):
  > Merge 'stream_range_as_array: always close output stream' from Benny Halevy

Fixes #10592.
2022-06-08 16:49:53 +03:00
Nadav Har'El
12f1718ef4 alternator: allow DescribeTimeToLive even without TTL enabled
We still consider the TTL support in Alternator to be experimental, so we
don't want to allow a user to enable TTL on a table without turning on a
"--experimental-features" flag. However, there is no reason not to allow
the DescribeTimeToLive call when this experimental flag is off - this call
would simply reply with the truth - that the TTL feature is disabled for
the table!

This is important for client code (such as the Terraform module
described in issue #10660) which uses DescribeTimeToLive for
information, even when it never intends to actually enable TTL.

The patch is trivial - we simply remove the flag check in
DescribeTimeToLive, the code works just as before.

After this patch, the following test now works on Scylla without
experimental flags turned on:

    test/alternator/run test_ttl.py::test_describe_ttl_without_ttl

Refs #10660

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit 8ecf1e306f)
2022-05-30 20:40:34 +03:00
Tomasz Grabiec
322dfe8403 sstable: partition_index_cache: Fix abort on bad_alloc during page loading
When entry loading fails and there is another request blocked on the
same page, an attempt to erase the failed entry will abort, because
that would violate entry_ptr guarantees, which are supposed to keep
the entry alive.

The fix in 92727ac36c was incomplete: it only helped in the case of a
single loader. This patch takes a more general approach by relaxing
the assert.

The assert manifested like this:

scylla: ./sstables/partition_index_cache.hh:71: sstables::partition_index_cache::entry::~entry(): Assertion `!is_referenced()' failed.

Fixes #10617

Closes #10653

(cherry picked from commit f87274f66a)
2022-05-30 13:00:46 +03:00
Beni Peled
11f008e8fd release: prepare for 4.6.4 2022-05-16 15:20:35 +03:00
Benny Halevy
fd7314a362 table: clear: serialize with ongoing flush
Take all flush permits to serialize with any
ongoing flushes and to prevent further flushes
during table::clear, in particular while calling
discard_completed_segments for every table and
clearing the memtables in clear_and_add.

Fixes #10423

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit aae532a96b)
2022-05-15 13:43:43 +03:00
Raphael S. Carvalho
d27468f078 compaction: LCS: don't write to disengaged optional on compaction completion
Dtest triggers the problem by:
1) creating table with LCS
2) disabling regular compaction
3) writing a few sstables
4) running maintenance compaction, e.g. cleanup

Once the maintenance compaction completes, the disengaged optional
_last_compacted_keys triggers an exception in notify_completion().

_last_compacted_keys is used by regular compaction for its round-robin
file-picking policy. It stores the last compacted key for each level,
so it's irrelevant for any other compaction type.

Regular compaction is responsible for initializing it when it first
runs to pick files. But with regular compaction disabled,
notify_completion() finds it uninitialized, resulting in
bad_optional_access.

To fix this, the procedure is skipped if _last_compacted_keys is
disengaged. Regular compaction, once re-enabled, will be able to
fill _last_compacted_keys by looking at metadata of the files.

compaction_test.py::TestCompaction::test_disable_autocompaction_doesnt_
block_user_initiated_compactions[CLEANUP-LeveledCompactionStrategy]
now passes.

Fixes #10378.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes #10508

(cherry picked from commit 8e99d3912e)
2022-05-15 13:20:30 +03:00
Juliusz Stasiewicz
74ef1ee961 CQL: Replace assert by exception on invalid auth opcode
One user observed this assertion failure, but it's an extremely rare
event. The root cause - the interleaving of processing of STARTUP and
OPTIONS messages - is still there, but it's now harmless enough to
leave as is.

Fixes #10487

Closes #10503

(cherry picked from commit 603dd72f9e)
2022-05-10 14:03:03 +02:00
Benny Halevy
07549d159c compaction: time_window_compaction_strategy: reset estimated_remaining_tasks when running out of candidates
_estimated_remaining_tasks gets updated via get_next_non_expired_sstables ->
get_compaction_candidates, but otherwise, if we return early from
get_sstables_for_compaction, it does not get updated and may go out of sync.

Refs #10418
(to be closed when the fix reaches branch-4.6)

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes #10419

(cherry picked from commit 01f41630a5)
2022-05-09 09:36:22 +03:00
Eliran Sinvani
189bbcd82d prepared_statements: Invalidate batch statement too
It seems that batch prepared statements always return false from
depends_on; this in turn makes the removal criterion for the
prepared statements cache always false, so the queries are
never evicted.
Here we change the function to return the true state, meaning it
returns true if one of the sub-queries depends on the keyspace
and/or column family.

Fixes #10129

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
(cherry picked from commit 4eb0398457)
2022-05-08 12:33:00 +03:00
Eliran Sinvani
70e6921125 cql3 statements: Change dependency test API to better express its purpose

CQL statements used to have two API functions, depends_on_keyspace and
depends_on_column_family. The former took only a table name as a
parameter, which makes no sense: there can be multiple tables with the
same name, each in a different keyspace, and it doesn't make sense to
generalize the test, i.e. to ask "does a statement depend on any table
named XXX?"
In this change we unify the two calls into one, depends_on, which takes
a keyspace name and optionally also a table name. That way, every
logical dependency test that makes sense is supported by a single API
call.

(cherry picked from commit bf50dbd35b)

Ref #10129
2022-05-08 12:32:41 +03:00
Calle Wilund
e314158708 cdc: Ensure columns removed from log table are registered as dropped
If we are redefining the log table, we need to ensure that any dropped
columns are registered in the "dropped_columns" table; otherwise,
clients will not be able to read data older than now.
Includes a unit test.

Should probably be backported to all CDC enabled versions.

Fixes #10473
Closes #10474

(cherry picked from commit 78350a7e1b)
2022-05-05 11:34:56 +02:00
Tomasz Grabiec
46586532c9 loading_cache: Make invalidation take immediate effect
There are two issues with current implementation of remove/remove_if:

  1) If it happens concurrently with get_ptr(), the latter may still
  populate the cache using a value obtained from before remove() was
  called. remove() is used to invalidate caches, e.g. the prepared
  statements cache, and the expected semantic is that values
  calculated before remove() should not be present in the cache
  after invalidation.

  2) As long as there is any active pointer to the cached value
  (obtained by get_ptr()), the old value from before remove() will
  still be accessible and returned by get_ptr(). This can make remove()
  have no effect indefinitely if there is persistent use of the cache.

One of the user-perceived effects of this bug is that some prepared
statements may not get invalidated after a schema change and may still
use the old schema (until the next invalidation). If the schema change
modified a UDT, this can cause statement execution failures: the CQL
coordinator will try to interpret bound values using the old set of
fields. If the driver uses the new schema, the coordinator will fail
to process the value with the following exception:

  User Defined Type value contained too many fields (expected 5, got 6)

The patch fixes the problem by making remove()/remove_if() erase old
entries from _loading_values immediately.

The predicate-based remove_if() variant also has to invalidate values
which are concurrently loading to be safe, since the predicate cannot
be evaluated on values which are not ready. This may invalidate some
values unnecessarily, but I think that's fine.

Fixes #10117

Message-Id: <20220309135902.261734-1-tgrabiec@scylladb.com>
(cherry picked from commit 8fa704972f)
2022-05-04 15:38:11 +03:00
Avi Kivity
0114244363 Merge 'replica/database: drop_column_family(): properly cleanup stale querier cache entries' from Botond Dénes
Said method has to evict all querier cache entries belonging to the to-be-dropped table. This was already the case, but there was a window in which new entries could sneak in, causing a stale reference to the table to be dereferenced later when they were evicted due to TTL. This window is now closed: the entries are evicted after the method has waited for all ongoing operations on the table to stop.

Fixes: #10450

Closes #10451

* github.com:scylladb/scylla:
  replica/database: drop_column_family(): drop querier cache entries after waiting for ops
  replica/database: finish coroutinizing drop_column_family()
  replica/database: make remove(const column_family&) private

(cherry picked from commit 7f1e368e92)
2022-05-01 17:11:52 +03:00
Avi Kivity
f154c8b719 Update tools/java submodule (bad IPv6 addresses in nodetool)
* tools/java 05ec511bbb...46744a92ff (1):
  > CASSANDRA-17581 fix NodeProbe: Malformed IPv6 address at index

Fixes #10442
2022-04-28 11:35:09 +03:00
Beni Peled
8bf149fdd6 release: prepare for 4.6.3 2022-04-14 14:16:52 +03:00
Tomasz Grabiec
0265d56173 utils/chunked_managed_vector: Fix sigsegv during reserve()
Fixes the case of make_room() invoked with last_chunk_capacity_deficit
but _size not in the last reserved chunk.

Found during code review, no user impact.

Fixes #10364.

Message-Id: <20220411224741.644113-1-tgrabiec@scylladb.com>
(cherry picked from commit 0c365818c3)
2022-04-13 10:29:30 +03:00
Tomasz Grabiec
e50452ba43 utils/chunked_vector: Fix sigsegv during reserve()
Fixes the case of make_room() invoked with last_chunk_capacity_deficit
but _size not in the last reserved chunk.

Found during code review, no known user impact.

Fixes #10363.

Message-Id: <20220411222605.641614-1-tgrabiec@scylladb.com>
(cherry picked from commit 01eeb33c6e)

[avi: make max_chunk_capacity() public for backport]
2022-04-13 10:29:03 +03:00
Avi Kivity
a205f644cb transport: return correct error codes when downgrading v4 {WRITE,READ}_FAILURE to {WRITE,READ}_TIMEOUT
Protocol v4 added WRITE_FAILURE and READ_FAILURE. When running under v3
we downgrade these exceptions to WRITE_TIMEOUT and READ_TIMEOUT (since
the client won't understand the v4 errors), but we still send the new
error codes. This causes the client to become confused.

Fix by updating the error codes.

A better fix is to move the error code from the constructor parameter
list and hard-code it in the constructor, but that is left for a follow-up
after this minimal fix.

Fixes #5610.

Closes #10362

(cherry picked from commit 987e6533d2)
2022-04-13 09:49:02 +03:00
Tomasz Grabiec
f136b5b950 utils/chunked_managed_vector: Fix corruption in case there is more than one chunk
If reserve() allocates more than one chunk, push_back() should not
work with the last chunk. This can result in items being pushed to the
wrong chunk, breaking internal invariants.

Also, pop_back() should not work with the last chunk. This breaks when
there is more than one chunk.

Currently, the container is only used in the sstable partition index
cache.

Manifests as crashes in the sstable reader when touching sstables that
have partition index pages with more than 1638 partition entries.
Introduced in 78e5b9fd85 (4.6.0)

Fixes #10290

Message-Id: <20220407174023.527059-1-tgrabiec@scylladb.com>
(cherry picked from commit 41fe01ecff)
2022-04-08 10:53:52 +03:00
Takuya ASADA
69a1325884 docker: enable --log-to-stdout, which was mistakenly disabled
Since our Docker image moved to Ubuntu, we have mistakenly copied
dist/docker/etc/sysconfig/scylla-server to /etc/sysconfig, which is not
used on Ubuntu (it should be /etc/default).
So /etc/default/scylla-server is just the default configuration of the
scylla-server .deb package, with --log-to-stdout set to 0, same as a
normal installation.

We don't want to keep the duplicated configuration file anyway,
so let's drop dist/docker/etc/sysconfig/scylla-server and configure
/etc/default/scylla-server in build_docker.sh.

Fixes #10270

Closes #10280

(cherry picked from commit bdefea7c82)
2022-04-07 12:13:35 +03:00
Avi Kivity
ab153c9b94 Update seastar submodule (logger deadlock with large messages)
* seastar 34e58f9995...94a462d94b (2):
  > log: Fix silencer to be shard-local and logger-global
  > log: Silence logger when logging

Fixes #10336.
2022-04-05 19:43:49 +03:00
117 changed files with 1715 additions and 312 deletions

View File

@@ -60,7 +60,7 @@ fi
# Default scylla product/version tags
PRODUCT=scylla
VERSION=4.6.2
VERSION=4.6.11
if test -f version
then

View File

@@ -415,6 +415,11 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
// In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", 0);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
std::unordered_map<std::string,std::string> key_attribute_types;
// Add base table's KeySchema and collect types for AttributeDefinitions:
@@ -2078,6 +2083,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
attribute_path_map_add("AttributesToGet", ret, it->GetString());
}
if (ret.empty()) {
throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
}
return ret;
} else if (has_projection_expression) {
const rjson::value& projection_expression = req["ProjectionExpression"];

View File

@@ -94,10 +94,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
}
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
_stats.api_operations.update_time_to_live++;
if (!_proxy.get_db().local().features().cluster_supports_alternator_ttl()) {
co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
}
_stats.api_operations.describe_time_to_live++;
schema_ptr schema = get_table(_proxy, request);
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
rjson::value desc = rjson::empty_object();

View File

@@ -604,15 +604,21 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
return make_exception_future<json::json_return_type>(
std::runtime_error("Can not perform cleanup operation when topology changes"));
}
return ctx.db.invoke_on_all([keyspace, column_families] (database& db) {
std::vector<column_family*> column_families_vec;
auto& cm = db.get_compaction_manager();
for (auto cf : column_families) {
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
}
return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
return cm.perform_cleanup(db, cf);
return ctx.db.invoke_on_all([keyspace, column_families] (database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<utils::UUID>>(column_families | boost::adaptors::transformed([&] (auto& table_name) {
return db.find_uuid(keyspace, table_name);
}));
// cleanup smaller tables first, to increase chances of success if low on space.
std::ranges::sort(table_ids, std::less<>(), [&] (const utils::UUID& id) {
return db.find_column_family(id).get_stats().live_disk_space_used;
});
auto& cm = db.get_compaction_manager();
// as a table can be dropped during loop below, let's find it before issuing the cleanup request.
for (auto& id : table_ids) {
table& t = db.find_column_family(id);
co_await cm.perform_cleanup(db, &t);
}
co_return;
}).then([]{
return make_ready_future<json::json_return_type>(0);
});

View File

@@ -765,8 +765,12 @@ future<> generation_service::check_and_repair_cdc_streams() {
std::optional<cdc::generation_id> latest = _gen_id;
const auto& endpoint_states = _gossiper.get_endpoint_states();
for (const auto& [addr, state] : endpoint_states) {
if (!_gossiper.is_normal(addr)) {
throw std::runtime_error(format("All nodes must be in NORMAL state while performing check_and_repair_cdc_streams"
if (_gossiper.is_left(addr)) {
cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
continue;
}
if (!_gossiper.is_normal(addr)) {
throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
" ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
}
@@ -830,6 +834,11 @@ future<> generation_service::check_and_repair_cdc_streams() {
latest, db_clock::now());
should_regenerate = true;
} else {
if (tmptr->sorted_tokens().size() != gen->entries().size()) {
// We probably have garbage streams from old generations
cdc_log.info("Generation size does not match the token ring, regenerating");
should_regenerate = true;
} else {
std::unordered_set<dht::token> gen_ends;
for (const auto& entry : gen->entries()) {
gen_ends.insert(entry.token_range_end);
@@ -841,6 +850,7 @@ future<> generation_service::check_and_repair_cdc_streams() {
break;
}
}
}
}
}

View File

@@ -73,7 +73,7 @@ using namespace std::chrono_literals;
logging::logger cdc_log("cdc");
namespace cdc {
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
}
static constexpr auto cdc_group_name = "cdc";
@@ -220,7 +220,7 @@ public:
return;
}
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
auto log_mut = log_schema
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
@@ -503,7 +503,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
}
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner("com.scylladb.dht.CDCPartitioner");
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
@@ -590,6 +590,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
b.set_uuid(*uuid);
}
/**
* #10473 - if we are redefining the log table, we need to ensure any dropped
* columns are registered in "dropped_columns" table, otherwise clients will not
* be able to read data older than now.
*/
if (old) {
// not super efficient, but we don't do this often.
for (auto& col : old->all_columns()) {
if (!b.has_column({col.name(), col.name_as_text() })) {
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
}
}
}
return b.build();
}

View File

@@ -527,16 +527,11 @@ future<> compaction_manager::stop() {
}
}
void compaction_manager::really_do_stop() {
if (_state == state::none || _state == state::stopped) {
return;
}
_state = state::stopped;
future<> compaction_manager::really_do_stop() {
cmlog.info("Asked to stop");
// Reset the metrics registry
_metrics.clear();
_stop_future.emplace(stop_ongoing_compactions("shutdown").then([this] () mutable {
return stop_ongoing_compactions("shutdown").then([this] () mutable {
reevaluate_postponed_compactions();
return std::move(_waiting_reevalution);
}).then([this] {
@@ -544,12 +539,34 @@ void compaction_manager::really_do_stop() {
_compaction_submission_timer.cancel();
cmlog.info("Stopped");
return _compaction_controller.shutdown();
}));
});
}
template <typename Ex>
requires std::is_base_of_v<std::exception, Ex> &&
requires (const Ex& ex) {
{ ex.code() } noexcept -> std::same_as<const std::error_code&>;
}
auto swallow_enospc(const Ex& ex) noexcept {
if (ex.code().value() != ENOSPC) {
return make_exception_future<>(std::make_exception_ptr(ex));
}
cmlog.warn("Got ENOSPC on stop, ignoring...");
return make_ready_future<>();
}
void compaction_manager::do_stop() noexcept {
if (_state == state::none || _state == state::stopped) {
return;
}
try {
really_do_stop();
_state = state::stopped;
_stop_future = really_do_stop()
.handle_exception_type([] (const std::system_error& ex) { return swallow_enospc(ex); })
.handle_exception_type([] (const storage_io_error& ex) { return swallow_enospc(ex); })
;
} catch (...) {
try {
cmlog.error("Failed to stop the manager: {}", std::current_exception());
@@ -681,6 +698,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
_stats.active_tasks++;
task->setup_new_compaction();
return with_scheduling_group(_maintenance_sg.cpu, [this, task, cf] {
return cf->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task] (future<> f) mutable {
_stats.active_tasks--;
task->finish_compaction();
@@ -703,6 +721,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
_tasks.remove(task);
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
});
});
});
});
@@ -719,9 +738,20 @@ inline bool compaction_manager::check_for_cleanup(column_family* cf) {
future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compaction_type_options options, get_candidates_func get_func, can_purge_tombstones can_purge) {
auto task = make_lw_shared<compaction_manager::task>(cf, options.type());
_tasks.push_back(task);
auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
std::unique_ptr<std::vector<sstables::shared_sstable>> sstables;
lw_shared_ptr<compacting_sstable_registration> compacting;
// since we might potentially have ongoing compactions, and we
// must ensure that all sstables created before we run are included
// in the re-write, we need to barrier out any previously running
// compaction.
auto get_and_register_candidates_func = [this, &sstables, &compacting, &get_func] () mutable -> future<> {
sstables = std::make_unique<std::vector<sstables::shared_sstable>>(co_await get_func());
compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
};
co_await cf->run_with_compaction_disabled(std::ref(get_and_register_candidates_func));
// sort sstables by size in descending order, such that the smallest files will be rewritten first
// (as sstable to be rewritten is popped off from the back of container), so rewrite will have higher
// chance to succeed when the biggest files are reached.
@@ -729,10 +759,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
return a->data_size() > b->data_size();
});
auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
auto sstables_ptr = sstables.get();
_stats.pending_tasks += sstables->size();
_tasks.push_back(task);
task->compaction_done = do_until([this, sstables_ptr, task] { return sstables_ptr->empty() || !can_proceed(task); },
[this, task, options, sstables_ptr, compacting, can_purge] () mutable {
auto sst = sstables_ptr->back();
@@ -789,7 +820,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
_tasks.remove(task);
});
return task->compaction_done.get_future().then([task] {});
co_return co_await task->compaction_done.get_future();
}
future<> compaction_manager::perform_sstable_scrub_validate_mode(column_family* cf) {
@@ -871,31 +902,29 @@ future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
return make_exception_future<>(std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
cf->schema()->ks_name(), cf->schema()->cf_name())));
}
return seastar::async([this, cf, &db] {
// FIXME: indentation
auto sorted_owned_ranges = db.get_keyspace_local_ranges(cf->schema()->ks_name());
auto get_sstables = [this, &db, cf, sorted_owned_ranges] () -> future<std::vector<sstables::shared_sstable>> {
return seastar::async([this, &db, cf, sorted_owned_ranges = std::move(sorted_owned_ranges)] {
auto schema = cf->schema();
auto sorted_owned_ranges = db.get_keyspace_local_ranges(schema->ks_name());
auto sstables = std::vector<sstables::shared_sstable>{};
const auto candidates = get_candidates(*cf);
std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
seastar::thread::maybe_yield();
return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
});
return std::tuple<dht::token_range_vector, std::vector<sstables::shared_sstable>>(sorted_owned_ranges, sstables);
}).then_unpack([this, cf, &db] (dht::token_range_vector owned_ranges, std::vector<sstables::shared_sstable> sstables) {
return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(owned_ranges)),
[sstables = std::move(sstables)] (const table&) { return sstables; });
return sstables;
});
};
return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(sorted_owned_ranges)), std::move(get_sstables));
}
// Submit a column family to be upgraded and wait for its termination.
future<> compaction_manager::perform_sstable_upgrade(database& db, column_family* cf, bool exclude_current_version) {
using shared_sstables = std::vector<sstables::shared_sstable>;
return do_with(shared_sstables{}, [this, &db, cf, exclude_current_version](shared_sstables& tables) {
// since we might potentially have ongoing compactions, and we
// must ensure that all sstables created before we run are included
// in the re-write, we need to barrier out any previously running
// compaction.
return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
auto get_sstables = [this, &db, cf, exclude_current_version] {
// FIXME: indentation
std::vector<sstables::shared_sstable> tables;
auto last_version = cf->get_sstables_manager().get_highest_supported_format();
for (auto& sst : get_candidates(*cf)) {
@@ -906,21 +935,17 @@ future<> compaction_manager::perform_sstable_upgrade(database& db, column_family
tables.emplace_back(sst);
}
}
return make_ready_future<>();
}).then([&db, cf] {
return db.get_keyspace_local_ranges(cf->schema()->ks_name());
}).then([this, &db, cf, &tables] (dht::token_range_vector owned_ranges) {
// doing a "cleanup" is about as compacting as we need
// to be, provided we get to decide the tables to process,
// and ignoring any existing operations.
// Note that we potentially could be doing multiple
// upgrades here in parallel, but that is really the users
// problem.
return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(std::move(owned_ranges)), [&](auto&) mutable {
return std::exchange(tables, {});
});
});
});
return make_ready_future<std::vector<sstables::shared_sstable>>(tables);
};
// doing a "cleanup" is about as compacting as we need
// to be, provided we get to decide the tables to process,
// and ignoring any existing operations.
// Note that we potentially could be doing multiple
// upgrades here in parallel, but that is really the users
// problem.
return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(db.get_keyspace_local_ranges(cf->schema()->ks_name())), std::move(get_sstables));
}
// Submit a column family to be scrubbed and wait for its termination.
@@ -928,14 +953,10 @@ future<> compaction_manager::perform_sstable_scrub(column_family* cf, sstables::
if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
return perform_sstable_scrub_validate_mode(cf);
}
// since we might potentially have ongoing compactions, and we
// must ensure that all sstables created before we run are scrubbed,
// we need to barrier out any previously running compaction.
return cf->run_with_compaction_disabled([this, cf, scrub_mode] {
return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this] (const table& cf) {
return get_candidates(cf);
// FIXME: indentation
return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this, cf] {
return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(*cf));
}, can_purge_tombstones::no);
});
}
future<> compaction_manager::remove(column_family* cf) {

View File

@@ -178,7 +178,7 @@ private:
maintenance_scheduling_group _maintenance_sg;
size_t _available_memory;
using get_candidates_func = std::function<std::vector<sstables::shared_sstable>(const column_family&)>;
using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
class can_purge_tombstones_tag;
using can_purge_tombstones = bool_class<can_purge_tombstones_tag>;
@@ -209,7 +209,7 @@ public:
// Stop all fibers, without waiting. Safe to be called multiple times.
void do_stop() noexcept;
void really_do_stop();
future<> really_do_stop();
// Submit a column family to be compacted.
void submit(column_family* cf);

View File

@@ -80,7 +80,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(colu
}
void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
if (removed.empty() || added.empty()) {
// All the update here is only relevant for regular compaction's round-robin picking policy, and if
// last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
// therefore we can skip the updates here until regular runs for the first time. Once it runs,
// it will be able to generate last_compacted_keys correctly by looking at metadata of files.
if (removed.empty() || added.empty() || !_last_compacted_keys) {
return;
}
auto min_level = std::numeric_limits<uint32_t>::max();

View File

@@ -225,6 +225,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(column_family& cf,
auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();
if (candidates.empty()) {
_estimated_remaining_tasks = 0;
return compaction_descriptor();
}

View File

@@ -1403,7 +1403,7 @@ serviceLevelOrRoleName returns [sstring name]
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
| t=STRING_LITERAL { $name = sstring($t.text); }
| t=QUOTED_NAME { $name = sstring($t.text); }
| k=unreserved_keyword { $name = sstring($t.text);
| k=unreserved_keyword { $name = k;
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
;

View File

@@ -25,6 +25,7 @@
#include "cql3_type.hh"
#include "cql3/util.hh"
#include "exceptions/exceptions.hh"
#include "ut_name.hh"
#include "database.hh"
#include "user_types_metadata.hh"
@@ -448,7 +449,20 @@ sstring maybe_quote(const sstring& identifier) {
}
if (!need_quotes) {
return identifier;
// A seemingly valid identifier matching [a-z][a-z0-9_]* may still
// need quoting if it is a CQL keyword, e.g., "to" (see issue #9450).
// While our parser Cql.g has different production rules for different
// types of identifiers (column names, table names, etc.), all of
// these behave identically for alphanumeric strings: they exclude
// many keywords but allow keywords listed as "unreserved keywords".
// So we can use any of them, for example cident.
try {
cql3::util::do_with_parser(identifier, std::mem_fn(&cql3_parser::CqlParser::cident));
return identifier;
} catch(exceptions::syntax_exception&) {
// This alphanumeric string is not a valid identifier, so fall
// through to have it quoted:
}
}
if (num_quotes == 0) {
return make_sstring("\"", identifier, "\"");

View File

@@ -109,9 +109,7 @@ public:
virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;
virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;

View File

@@ -123,10 +123,38 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
format("Column definition {} does not match any column in the query selection",
cdef->name_as_text()));
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[index]));
const managed_bytes_opt& serialized = data.other_columns[index];
if (!serialized) {
// For null[i] we return null.
return std::nullopt;
}
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
const auto key = evaluate_to_raw_view(col.sub, options);
auto&& key_type = col_type->name_comparator();
if (key.is_null()) {
// For m[null] return null.
// This is different from Cassandra - which treats m[null]
// as an invalid request error. But m[null] -> null is more
// consistent with our usual null treatment (e.g., both
// null[2] and null < 2 return null). It will also allow us
// to support non-constant subscripts (e.g., m[a]) where "a"
// may be null in some rows and non-null in others, and it's
// not an error.
return std::nullopt;
}
if (key.is_unset_value()) {
// An m[?] with ? bound to UNSET_VALUE is an invalid query.
// We could have detected it earlier while binding, but since
// we currently don't, we must protect the following code
// which can't work with an UNSET_VALUE. Note that the
// placement of this check here means that in an empty table,
// where we never need to evaluate the filter expression, this
// error will not be detected.
throw exceptions::invalid_request_exception(
format("Unsupported unset map key for column {}",
cdef->name_as_text()));
}
const auto found = key.with_linearized([&] (bytes_view key_bv) {
using entry = std::pair<data_value, data_value>;
return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
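The null/unset handling added in this hunk can be sketched as a plain map-subscript function; `UNSET` below is an illustrative stand-in for CQL's UNSET_VALUE marker, not Scylla's actual representation:

```python
UNSET = object()  # illustrative stand-in for CQL's UNSET_VALUE

def subscript(map_value, key):
    # null[i] -> null: subscripting a null map yields null.
    if map_value is None:
        return None
    # m[null] -> null. Cassandra raises an invalid-request error here,
    # but returning null is more consistent with the usual null treatment
    # (null < 2 is also null), per the comment in the hunk above.
    if key is None:
        return None
    # m[?] with ? bound to UNSET_VALUE is an invalid request.
    if key is UNSET:
        raise ValueError("Unsupported unset map key")
    # Otherwise look the key up; a missing key yields null.
    return map_value.get(key)
```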

View File

@@ -970,7 +970,7 @@ bool query_processor::migration_subscriber::should_invalidate(
sstring ks_name,
std::optional<sstring> cf_name,
::shared_ptr<cql_statement> statement) {
return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
return statement->depends_on(ks_name, cf_name);
}
future<> query_processor::query_internal(

View File

@@ -450,11 +450,16 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
}
auto clustering_columns_restrictions = _restrictions->get_clustering_columns_restrictions();
if (dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions)) {
bool has_multi_col_clustering_restrictions =
dynamic_pointer_cast<cql3::restrictions::multi_column_restriction>(clustering_columns_restrictions) != nullptr;
if (has_multi_col_clustering_restrictions) {
clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
return expr::is_satisfied_by(
bool multi_col_clustering_satisfied = expr::is_satisfied_by(
clustering_columns_restrictions->expression,
partition_key, clustering_key, static_row, row, selection, _options);
if (!multi_col_clustering_satisfied) {
return false;
}
}
auto static_row_iterator = static_row.iterator();
@@ -502,6 +507,13 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
if (_skip_ck_restrictions) {
continue;
}
if (has_multi_col_clustering_restrictions) {
// Mixing multi column and single column restrictions on clustering
// key columns is forbidden.
// Since there are multi column restrictions we have to skip
// evaluating single column restrictions or we will get an error.
continue;
}
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
auto restr_it = clustering_key_restrictions_map.find(cdef);
if (restr_it == clustering_key_restrictions_map.end()) {
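The filtering fix in this hunk changes the early return: a multi-column clustering restriction now short-circuits only when it is *not* satisfied, and single-column clustering restrictions are skipped when a multi-column one is present (mixing the two is forbidden). A control-flow sketch, with restrictions reduced to plain predicates (this is an illustration of the fixed logic, not Scylla's actual filtering code):

```python
def do_filter(row, multi_col_restriction, other_restrictions):
    # Before the fix, a satisfied multi-column restriction returned True
    # immediately, ignoring other_restrictions (issues #6200, #12014).
    if multi_col_restriction is not None:
        if not multi_col_restriction(row):
            return False  # short-circuit only on failure
    # When satisfied, the remaining (e.g. regular-column) restrictions
    # must also hold.
    return all(r(row) for r in other_restrictions)
```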

View File

@@ -46,13 +46,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
return 0;
}
bool cql3::statements::authentication_statement::depends_on_keyspace(
const sstring& ks_name) const {
return false;
}
bool cql3::statements::authentication_statement::depends_on_column_family(
const sstring& cf_name) const {
bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -55,9 +55,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

View File

@@ -48,13 +48,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
return 0;
}
bool cql3::statements::authorization_statement::depends_on_keyspace(
const sstring& ks_name) const {
return false;
}
bool cql3::statements::authorization_statement::depends_on_column_family(
const sstring& cf_name) const {
bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -59,9 +59,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

View File

@@ -98,14 +98,9 @@ batch_statement::batch_statement(type type_,
{
}
bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}
bool batch_statement::depends_on_column_family(const sstring& cf_name) const
{
return false;
return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
}
uint32_t batch_statement::get_bound_terms() const

View File

@@ -115,9 +115,7 @@ public:
std::unique_ptr<attributes> attrs,
cql_stats& stats);
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual uint32_t get_bound_terms() const override;

View File

@@ -46,6 +46,7 @@
#include "cdc/cdc_extension.hh"
#include "gms/feature.hh"
#include "gms/feature_service.hh"
#include "utils/bloom_calculations.hh"
#include <boost/algorithm/string/predicate.hpp>
@@ -168,6 +169,16 @@ void cf_prop_defs::validate(const database& db, const schema::extensions_map& sc
throw exceptions::configuration_exception(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
}
if (get_simple(KW_BF_FP_CHANCE)) {
double bloom_filter_fp_chance = get_double(KW_BF_FP_CHANCE, 0/*not used*/);
double min_bloom_filter_fp_chance = utils::bloom_calculations::min_supported_bloom_filter_fp_chance();
if (bloom_filter_fp_chance <= min_bloom_filter_fp_chance || bloom_filter_fp_chance > 1.0) {
throw exceptions::configuration_exception(format(
"{} must be larger than {} and less than or equal to 1.0 (got {})",
KW_BF_FP_CHANCE, min_bloom_filter_fp_chance, bloom_filter_fp_chance));
}
}
speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring()));
}
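The bounds check added above can be sketched in a few lines; `MIN_FP_CHANCE` below is an illustrative placeholder for `utils::bloom_calculations::min_supported_bloom_filter_fp_chance()`, not its real value:

```python
# Illustrative placeholder for min_supported_bloom_filter_fp_chance().
MIN_FP_CHANCE = 0.01

def validate_bf_fp_chance(fp_chance: float) -> None:
    # Must be strictly greater than the minimum and at most 1.0,
    # mirroring: fp_chance <= min || fp_chance > 1.0 -> reject.
    if fp_chance <= MIN_FP_CHANCE or fp_chance > 1.0:
        raise ValueError(
            f"bloom_filter_fp_chance must be larger than {MIN_FP_CHANCE} "
            f"and less than or equal to 1.0 (got {fp_chance})")
```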

View File

@@ -571,12 +571,8 @@ modification_statement::validate(service::storage_proxy&, const service::client_
}
}
bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
return keyspace() == ks_name;
}
bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
return column_family() == cf_name;
bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
}
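The two old predicates are merged into one: a statement depends on (ks, cf) when the keyspace matches and, if a table name is given, it matches too. A sketch of the combined predicate:

```python
from typing import Optional

def depends_on(stmt_ks: str, stmt_cf: str,
               ks_name: str, cf_name: Optional[str]) -> bool:
    # Mirrors: keyspace() == ks_name && (!cf_name || column_family() == *cf_name)
    # A missing cf_name means "invalidate everything in this keyspace".
    return stmt_ks == ks_name and (cf_name is None or stmt_cf == cf_name)
```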
void modification_statement::add_operation(::shared_ptr<operation> op) {

View File

@@ -165,9 +165,7 @@ public:
// Validate before execute, using client state and current schema
void validate(service::storage_proxy&, const service::client_state& state) const override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
void add_operation(::shared_ptr<operation> op);

View File

@@ -67,12 +67,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
return make_ready_future<>();
}
bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -79,9 +79,7 @@ protected:
*/
virtual future<> grant_permissions_to_creator(const service::client_state&) const;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual uint32_t get_bound_terms() const override;

View File

@@ -194,12 +194,8 @@ void select_statement::validate(service::storage_proxy&, const service::client_s
// Nothing to do, all validation has been done by raw_statement::prepare()
}
bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
return keyspace() == ks_name;
}
bool select_statement::depends_on_column_family(const sstring& cf_name) const {
return column_family() == cf_name;
bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
}
const sstring& select_statement::keyspace() const {

View File

@@ -127,8 +127,7 @@ public:
virtual uint32_t get_bound_terms() const override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
service::query_state& state, const query_options& options) const override;

View File

@@ -30,13 +30,7 @@ uint32_t service_level_statement::get_bound_terms() const {
return 0;
}
bool service_level_statement::depends_on_keyspace(
const sstring &ks_name) const {
return false;
}
bool service_level_statement::depends_on_column_family(
const sstring &cf_name) const {
bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
return false;
}

View File

@@ -56,9 +56,7 @@ public:
uint32_t get_bound_terms() const override;
bool depends_on_keyspace(const sstring& ks_name) const override;
bool depends_on_column_family(const sstring& cf_name) const override;
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
future<> check_access(service::storage_proxy& sp, const service::client_state& state) const override;

View File

@@ -43,7 +43,7 @@ void sl_prop_defs::validate() {
data_value v = duration_type->deserialize(duration_type->from_string(*repr));
cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
if (duration.months || duration.days) {
throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
}
if (duration.nanoseconds % 1'000'000 != 0) {
throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
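The validation rejects durations that carry a days or months component and requires whole milliseconds. A sketch over a (months, days, nanoseconds) triple standing in for `cql_duration`:

```python
def validate_timeout(months: int, days: int, nanoseconds: int) -> None:
    # Days and months have variable length, so they are rejected outright.
    if months or days:
        raise ValueError("Timeout values cannot be expressed in days/months")
    # 1 ms = 1'000'000 ns; any sub-millisecond remainder is rejected.
    if nanoseconds % 1_000_000 != 0:
        raise ValueError(
            "Timeout values must be expressed in millisecond granularity")
```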

View File

@@ -67,12 +67,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(database& db,cql
return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
}
bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -58,9 +58,7 @@ public:
virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
virtual bool depends_on_column_family(const sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

View File

@@ -74,12 +74,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(database& db, cql_sta
}
bool use_statement::depends_on_keyspace(const sstring& ks_name) const
{
return false;
}
bool use_statement::depends_on_column_family(const sstring& cf_name) const
bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
{
return false;
}

View File

@@ -59,9 +59,7 @@ public:
virtual uint32_t get_bound_terms() const override;
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
virtual seastar::future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;

View File

@@ -31,6 +31,8 @@
#include "types/listlike_partial_deserializing_iterator.hh"
#include "utils/managed_bytes.hh"
#include "exceptions/exceptions.hh"
#include <boost/algorithm/string/trim_all.hpp>
#include <boost/algorithm/string.hpp>
static inline bool is_control_char(char c) {
return c >= 0 && c <= 0x1F;
@@ -212,6 +214,17 @@ struct from_json_object_visitor {
}
bytes operator()(const boolean_type_impl& t) {
if (!value.IsBool()) {
if (value.IsString()) {
std::string str(rjson::to_string_view(value));
boost::trim_all(str);
boost::to_lower(str);
if (str == "true") {
return t.decompose(true);
} else if (str == "false") {
return t.decompose(false);
}
}
throw marshal_exception(format("Invalid JSON object {}", value));
}
return t.decompose(value.GetBool());
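The new branch accepts JSON string values for boolean columns after trimming and lower-casing. A sketch (Python's `strip()` stands in for `boost::trim_all`, which also collapses inner whitespace; that difference doesn't matter for "true"/"false"):

```python
def bool_from_json(value):
    # Native JSON booleans are accepted directly.
    if isinstance(value, bool):
        return value
    # New behavior: accept "true"/"false" strings, trimmed and
    # case-insensitive, mirroring the boost::trim_all + to_lower path.
    if isinstance(value, str):
        s = value.strip().lower()
        if s == "true":
            return True
        if s == "false":
            return False
    raise ValueError(f"Invalid JSON object {value!r}")
```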

View File

@@ -87,6 +87,13 @@ std::unique_ptr<cql3::statements::raw::select_statement> build_select_statement(
/// forbids non-alpha-numeric characters in identifier names.
/// Quoting involves wrapping the string in double-quotes ("). A double-quote
/// character itself is quoted by doubling it.
/// maybe_quote() also quotes reserved CQL keywords (e.g., "to", "where")
/// but doesn't quote *unreserved* keywords (like ttl, int or as).
/// Note that this means that if new reserved keywords are added to the
/// parser, a saved output of maybe_quote() may no longer be parsable by
/// the parser. To avoid this forward-compatibility issue, use quote() instead
/// of maybe_quote() - to unconditionally quote an identifier even if it is
/// lowercase and not (yet) a keyword.
sstring maybe_quote(const sstring& s);
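The quoting rule described in the comment can be sketched as follows; the keyword check is reduced to a small illustrative set, whereas the real implementation consults the CQL parser (`CqlParser::cident`) rather than a fixed list:

```python
import re

# Illustrative stand-in for the reserved-keyword check done via the parser.
RESERVED_KEYWORDS = {"to", "where", "select"}

def maybe_quote(identifier: str) -> str:
    # Valid unquoted identifiers match [a-z][a-z0-9_]* and are not
    # reserved keywords (unreserved keywords like "ttl" stay unquoted).
    if re.fullmatch(r"[a-z][a-z0-9_]*", identifier) and \
            identifier not in RESERVED_KEYWORDS:
        return identifier
    # Quote: wrap in double-quotes, doubling any embedded double-quote.
    return '"' + identifier.replace('"', '""') + '"'
```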
// Check whether timestamp is not too far in the future as this probably

View File

@@ -926,10 +926,9 @@ bool database::update_column_family(schema_ptr new_schema) {
return columns_changed;
}
future<> database::remove(const column_family& cf) noexcept {
void database::remove(const table& cf) noexcept {
auto s = cf.schema();
auto& ks = find_keyspace(s->ks_name());
co_await _querier_cache.evict_all_for_table(s->id());
_column_families.erase(s->id());
ks.metadata()->remove_column_family(s);
_ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
@@ -946,13 +945,20 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
auto& ks = find_keyspace(ks_name);
auto uuid = find_uuid(ks_name, cf_name);
auto cf = _column_families.at(uuid);
co_await remove(*cf);
remove(*cf);
cf->clear_views();
co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
return cf->stop();
});
}).finally([cf] {});
co_await cf->await_pending_ops();
co_await _querier_cache.evict_all_for_table(cf->schema()->id());
std::exception_ptr ex;
try {
co_await truncate(ks, *cf, std::move(tsf), snapshot);
} catch (...) {
ex = std::current_exception();
}
co_await cf->stop();
if (ex) {
std::rethrow_exception(std::move(ex));
}
}
const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {

View File

@@ -1384,6 +1384,7 @@ private:
Future update_write_metrics(Future&& f);
void update_write_metrics_for_timed_out_write();
future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, bool is_bootstrap, system_keyspace system);
void remove(const table&) noexcept;
public:
static utils::UUID empty_version;
@@ -1582,7 +1583,6 @@ public:
bool update_column_family(schema_ptr s);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
future<> remove(const column_family&) noexcept;
const logalloc::region_group& dirty_memory_region_group() const {
return _dirty_memory_manager.region_group();

View File

@@ -39,6 +39,7 @@
*/
#include <chrono>
#include <exception>
#include <seastar/core/future-util.hh>
#include <seastar/core/do_with.hh>
#include <seastar/core/semaphore.hh>
@@ -306,6 +307,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
} catch (no_such_keyspace& ex) {
// should probably ignore and drop the batch
} catch (...) {
blogger.warn("Replay failed (will retry): {}", std::current_exception());
// timeout, overload etc.
// Do _not_ remove the batch, assuming we got a node write error.
// Since we don't have hints (which origin is satisfied with),

View File

@@ -860,6 +860,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Flush tables in the system_schema keyspace after schema modification. This is required for crash recovery, but slows down tests and can be disabled for them")
, restrict_replication_simplestrategy(this, "restrict_replication_simplestrategy", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::FALSE, "Controls whether to disable SimpleStrategy replication. Can be true, false, or warn.")
, restrict_dtcs(this, "restrict_dtcs", liveness::LiveUpdate, value_status::Used, db::tri_mode_restriction_t::mode::WARN, "Controls whether to prevent setting DateTieredCompactionStrategy. Can be true, false, or warn.")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
, default_log_level(this, "default_log_level", value_status::Used)
, logger_log_level(this, "logger_log_level", value_status::Used)
, log_to_stdout(this, "log_to_stdout", value_status::Used)

View File

@@ -372,6 +372,8 @@ public:
named_value<tri_mode_restriction> restrict_replication_simplestrategy;
named_value<tri_mode_restriction> restrict_dtcs;
named_value<bool> cache_index_pages;
seastar::logging_settings logging_settings(const boost::program_options::variables_map&) const;
const db::extensions& extensions() const;

View File

@@ -119,8 +119,9 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
if (table_name.find(".") != sstring::npos) {
throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
auto& cf = _db.local().find_column_family(ks_name, table_name);
if (cf.schema()->is_view()) {
throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
}
return _db.invoke_on_all([ks_name, table_name, tag, sf] (database &db) {
auto& cf = db.find_column_family(ks_name, table_name);

View File

@@ -350,7 +350,11 @@ public:
view_filter_checking_visitor(const schema& base, const view_info& view)
: _base(base)
, _view(view)
, _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
, _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
boost::copy_range<std::vector<const column_definition*>>(
_base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
)
)
{}
void accept_new_partition(const partition_key& key, uint64_t row_count) {
@@ -887,13 +891,18 @@ void view_updates::generate_update(
bool same_row = true;
for (auto col_id : col_ids) {
auto* after = update.cells().find_cell(col_id);
// Note: multi-cell columns can't be part of the primary key.
auto& cdef = _base->regular_column_at(col_id);
if (existing) {
auto* before = existing->cells().find_cell(col_id);
// Note that this cell is necessarily atomic, because col_ids are
// view key columns, and keys must be atomic.
if (before && before->as_atomic_cell(cdef).is_live()) {
if (after && after->as_atomic_cell(cdef).is_live()) {
auto cmp = compare_atomic_cell_for_merge(before->as_atomic_cell(cdef), after->as_atomic_cell(cdef));
// We need to compare just the values of the keys, not
// metadata like the timestamp. This is because below,
// if the old and new view row have the same key, we need
// to be sure to reach the update_entry() case.
auto cmp = compare_unsigned(before->as_atomic_cell(cdef).value(), after->as_atomic_cell(cdef).value());
if (cmp != 0) {
same_row = false;
}
@@ -913,7 +922,13 @@ void view_updates::generate_update(
if (same_row) {
update_entry(base_key, update, *existing, now);
} else {
replace_entry(base_key, update, *existing, now);
// This code doesn't work if the old and new view row have the
// same key, because if they do we get both data and tombstone
// for the same timestamp (now) and the tombstone wins. This
// is why we need the "same_row" case above - it's not just a
// performance optimization.
delete_old_entry(base_key, *existing, update, now);
create_entry(base_key, update, now);
}
} else {
delete_old_entry(base_key, *existing, update, now);
@@ -1320,7 +1335,7 @@ future<> mutate_MV(
auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
units = sem_units.split(sem_units.count())] (future<>&& f) {
--stats.writes;

View File

@@ -164,10 +164,7 @@ private:
void delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now);
void update_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now);
void replace_entry(const partition_key& base_key, const clustering_row& update, const clustering_row& existing, gc_clock::time_point now) {
create_entry(base_key, update, now);
delete_old_entry(base_key, existing, update, now);
}
void update_entry_for_computed_column(const partition_key& base_key, const clustering_row& update, const std::optional<clustering_row>& existing, gc_clock::time_point now);
};
class view_update_builder {


@@ -215,6 +215,12 @@ public:
});
}
future<flush_permit> get_all_flush_permits() {
return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
return this->get_flush_permit(std::move(units));
});
}
bool has_extraneous_flushes_requested() const {
return _extraneous_flushes > 0;
}


@@ -127,10 +127,14 @@ WantedBy=multi-user.target
# - Storage: /path/to/file (inaccessible)
# - Storage: /path/to/file
#
# After systemd-v248, available coredump file output changed like this:
# - Storage: /path/to/file (present)
# We need to support both versions.
#
# reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913
corefail = False
res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
# v232 or later
if res:
corepath = res[0]
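To illustrate the regex change above, here is a minimal standalone sketch (the sample `Storage:` lines are made up for illustration) showing that the old pattern swallows the `(present)` suffix that systemd v248+ appends, while the new pattern captures only the path in both output formats:

```python
import re

# Made-up sample coredumpctl output lines for both systemd versions.
v232_line = "Storage: /var/lib/systemd/coredump/core.scylla.1000.zst"
v248_line = "Storage: /var/lib/systemd/coredump/core.scylla.1000.zst (present)"

old_pattern = r'Storage: (.*)$'                 # greedy: keeps any "(...)" suffix
new_pattern = r'Storage: (\S+)(?: \(.+\))?$'    # path only; suffix optional

# Old pattern on v248 output yields a bogus path ending in " (present)".
old_result = re.findall(old_pattern, v248_line, flags=re.MULTILINE)
# New pattern extracts just the path from either format.
new_v232 = re.findall(new_pattern, v232_line, flags=re.MULTILINE)
new_v248 = re.findall(new_pattern, v248_line, flags=re.MULTILINE)
```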


@@ -101,6 +101,7 @@ run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
run mkdir -p /etc/supervisor.conf.d
run mkdir -p /var/log/scylla
run chown -R scylla:scylla /var/lib/scylla
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
run mkdir -p /opt/scylladb/supervisor
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE


@@ -1,41 +0,0 @@
# choose following mode: virtio, dpdk, posix
NETWORK_MODE=posix
# tap device name(virtio)
TAP=tap0
# bridge device name (virtio)
BRIDGE=virbr0
# ethernet device name
IFNAME=eth0
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
SET_NIC_AND_DISKS=no
# ethernet device driver (dpdk)
ETHDRV=
# ethernet device PCI ID (dpdk)
ETHPCIID=
# number of hugepages
NR_HUGEPAGES=64
# user for process (must be root for dpdk)
USER=scylla
# group for process
GROUP=scylla
# scylla home dir
SCYLLA_HOME=/var/lib/scylla
# scylla config dir
SCYLLA_CONF=/etc/scylla
# scylla arguments
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
# setup as AMI instance
AMI=no


@@ -45,7 +45,7 @@
logging::logger fmr_logger("flat_mutation_reader");
flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -58,7 +58,7 @@ flat_mutation_reader& flat_mutation_reader::operator=(flat_mutation_reader&& o)
}
flat_mutation_reader::~flat_mutation_reader() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1344,7 +1344,7 @@ void mutation_fragment_stream_validating_filter::on_end_of_stream() {
}
flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader_v2&& o) noexcept {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background
@@ -1357,7 +1357,7 @@ flat_mutation_reader_v2& flat_mutation_reader_v2::operator=(flat_mutation_reader
}
flat_mutation_reader_v2::~flat_mutation_reader_v2() {
if (_impl) {
if (_impl && _impl->is_close_required()) {
impl* ip = _impl.get();
// Abort to enforce calling close() before readers are closed
// to prevent leaks and potential use-after-free due to background


@@ -142,6 +142,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
bool _end_of_stream = false;
@@ -175,6 +176,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment pop_mutation_fragment() {
@@ -506,9 +509,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previously used range.
@@ -517,6 +526,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -546,6 +556,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.


@@ -177,6 +177,7 @@ public:
private:
tracked_buffer _buffer;
size_t _buffer_size = 0;
bool _close_required = false;
protected:
size_t max_buffer_size_in_bytes = default_max_buffer_size_in_bytes();
@@ -216,6 +217,8 @@ public:
bool is_end_of_stream() const { return _end_of_stream; }
bool is_buffer_empty() const { return _buffer.empty(); }
bool is_buffer_full() const { return _buffer_size >= max_buffer_size_in_bytes; }
bool is_close_required() const { return _close_required; }
void set_close_required() { _close_required = true; }
static constexpr size_t default_max_buffer_size_in_bytes() { return 8 * 1024; }
mutation_fragment_v2 pop_mutation_fragment() {
@@ -547,9 +550,15 @@ public:
//
// Can be used to skip over entire partitions if interleaved with
// `operator()()` calls.
future<> next_partition() { return _impl->next_partition(); }
future<> next_partition() {
_impl->set_close_required();
return _impl->next_partition();
}
future<> fill_buffer() { return _impl->fill_buffer(); }
future<> fill_buffer() {
_impl->set_close_required();
return _impl->fill_buffer();
}
// Changes the range of partitions to pr. The range can only be moved
// forwards. pr.begin() needs to be larger than pr.end() of the previously used range.
@@ -558,6 +567,7 @@ public:
// pr needs to be valid until the reader is destroyed or fast_forward_to()
// is called again.
future<> fast_forward_to(const dht::partition_range& pr) {
_impl->set_close_required();
return _impl->fast_forward_to(pr);
}
// Skips to a later range of rows.
@@ -587,6 +597,7 @@ public:
// In particular one must first enter a partition by fetching a `partition_start`
// fragment before calling `fast_forward_to`.
future<> fast_forward_to(position_range cr) {
_impl->set_close_required();
return _impl->fast_forward_to(std::move(cr));
}
// Closes the reader.


@@ -1672,6 +1672,10 @@ bool gossiper::is_normal(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
}
bool gossiper::is_left(const inet_address& endpoint) const {
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_LEFT);
}
bool gossiper::is_normal_ring_member(const inet_address& endpoint) const {
auto status = get_gossip_status(endpoint);
return status == sstring(versioned_value::STATUS_NORMAL) || status == sstring(versioned_value::SHUTDOWN);


@@ -571,6 +571,7 @@ public:
bool is_seed(const inet_address& endpoint) const;
bool is_shutdown(const inet_address& endpoint) const;
bool is_normal(const inet_address& endpoint) const;
bool is_left(const inet_address& endpoint) const;
// Check if a node is in NORMAL or SHUTDOWN status which means the node is
// part of the token ring from the gossip point of view and operates in
// normal status or was in normal status but is shutdown.


@@ -61,6 +61,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
}
future<> azure_snitch::load_config() {
if (this_shard_id() != io_cpu_id()) {
co_return;
}
sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);


@@ -1,5 +1,7 @@
#include "locator/ec2_snitch.hh"
#include <seastar/core/seastar.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/do_with.hh>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
@@ -67,6 +69,30 @@ future<> ec2_snitch::start() {
}
future<sstring> ec2_snitch::aws_api_call(sstring addr, uint16_t port, sstring cmd) {
return do_with(int(0), [this, addr, port, cmd] (int& i) {
return repeat_until_value([this, addr, port, cmd, &i]() -> future<std::optional<sstring>> {
++i;
return aws_api_call_once(addr, port, cmd).then([] (auto res) {
return make_ready_future<std::optional<sstring>>(std::move(res));
}).handle_exception([&i] (auto ep) {
try {
std::rethrow_exception(ep);
} catch (const std::system_error &e) {
logger().error(e.what());
if (i >= AWS_API_CALL_RETRIES - 1) {
logger().error("Maximum number of retries exceeded");
throw e;
}
}
return sleep(AWS_API_CALL_RETRY_INTERVAL).then([] {
return make_ready_future<std::optional<sstring>>(std::nullopt);
});
});
});
});
}
future<sstring> ec2_snitch::aws_api_call_once(sstring addr, uint16_t port, sstring cmd) {
return connect(socket_address(inet_address{addr}, port))
.then([this, addr, cmd] (connected_socket fd) {
_sd = std::move(fd);
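The diff above wraps the EC2 metadata call in a bounded retry loop: up to `AWS_API_CALL_RETRIES` attempts, sleeping `AWS_API_CALL_RETRY_INTERVAL` between them, and re-raising the last error once the budget is spent. A minimal Python sketch of the same policy (the function and parameter names here are ours, not Scylla's):

```python
import time

def call_with_retries(call, retries=5, interval=5.0):
    """Invoke `call` until it succeeds, sleeping `interval` seconds
    between attempts; re-raise the last error once `retries` attempts
    have been made (mirroring AWS_API_CALL_RETRIES)."""
    for attempt in range(1, retries + 1):
        try:
            return call()
        except OSError:
            if attempt >= retries:
                raise  # maximum number of retries exceeded
            time.sleep(interval)
```

As in the C++ version, only the final failure propagates; transient errors merely delay the next attempt.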


@@ -29,6 +29,8 @@ public:
static constexpr const char* ZONE_NAME_QUERY_REQ = "/latest/meta-data/placement/availability-zone";
static constexpr const char* AWS_QUERY_SERVER_ADDR = "169.254.169.254";
static constexpr uint16_t AWS_QUERY_SERVER_PORT = 80;
static constexpr int AWS_API_CALL_RETRIES = 5;
static constexpr auto AWS_API_CALL_RETRY_INTERVAL = std::chrono::seconds{5};
ec2_snitch(const sstring& fname = "", unsigned io_cpu_id = 0);
virtual future<> start() override;
@@ -45,5 +47,6 @@ private:
output_stream<char> _out;
http_response_parser _parser;
sstring _zone_req;
future<sstring> aws_api_call_once(sstring addr, uint16_t port, const sstring cmd);
};
} // namespace locator


@@ -562,6 +562,12 @@ int main(int ac, char** av) {
cfg->broadcast_to_all_shards().get();
// We pass this piece of config through a global as a temporary hack.
// See the comment at the definition of sstables::global_cache_index_pages.
smp::invoke_on_all([&cfg] {
sstables::global_cache_index_pages = cfg->cache_index_pages.operator utils::updateable_value<bool>();
}).get();
::sighup_handler sighup_handler(opts, *cfg);
auto stop_sighup_handler = defer_verbose_shutdown("sighup", [&] {
sighup_handler.stop().get();


@@ -30,6 +30,7 @@
#include <seastar/core/io_priority_class.hh>
class memtable;
class reader_permit;
class flat_mutation_reader;
namespace sstables {


@@ -442,6 +442,8 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
case messaging_verb::GOSSIP_ECHO:
case messaging_verb::GOSSIP_GET_ENDPOINT_STATES:
case messaging_verb::GET_SCHEMA_VERSION:
// ATTN -- if moving GOSSIP_ verbs elsewhere, mind updating the tcp_nodelay
// setting in get_rpc_client(), which assumes gossiper verbs live in idx 0
return 0;
case messaging_verb::PREPARE_MESSAGE:
case messaging_verb::PREPARE_DONE_MESSAGE:
@@ -689,7 +691,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}();
auto must_tcp_nodelay = [&] {
if (idx == 1) {
if (idx == 0) {
return true; // gossip
}
if (_cfg.tcp_nodelay == tcp_nodelay_what::local) {
@@ -710,10 +712,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
}
opts.tcp_nodelay = must_tcp_nodelay;
opts.reuseaddr = true;
// We send cookies only for non-default statement tenant clients.
if (idx > 3) {
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
}
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),


@@ -283,8 +283,8 @@ public:
future<> lookup_readers(db::timeout_clock::time_point timeout);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey);
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey);
future<> stop();
};
@@ -583,19 +583,22 @@ future<> read_context::lookup_readers(db::timeout_clock::time_point timeout) {
});
}
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
std::optional<clustering_key_prefix> last_ckey) {
future<> read_context::save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, std::optional<detached_compaction_state> compaction_state,
dht::decorated_key last_pkey, std::optional<clustering_key_prefix> last_ckey) {
if (_cmd.query_uuid == utils::UUID{}) {
return make_ready_future<>();
}
auto last_pkey = compaction_state.partition_start.key();
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
tracing::trace(_trace_state, "Dismantled combined buffer: {}", cb_stats);
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
auto cs_stats = dismantle_buffer_stats{};
if (compaction_state) {
cs_stats = dismantle_compaction_state(std::move(*compaction_state));
tracing::trace(_trace_state, "Dismantled compaction state: {}", cs_stats);
} else {
tracing::trace(_trace_state, "No compaction state to dismantle, partition exhausted", cs_stats);
}
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
const std::optional<clustering_key_prefix>& last_ckey) {
@@ -694,16 +697,18 @@ future<typename ResultBuilder::result_type> do_query(
ResultBuilder&& result_builder) {
auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);
co_await ctx->lookup_readers(timeout);
std::exception_ptr ex;
try {
co_await ctx->lookup_readers(timeout);
auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
std::move(result_builder));
if (compaction_state->are_limits_reached() || result.is_short_read()) {
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_ckey));
// Must happen before the call to `detach_state()` below.
auto last_pkey = *compaction_state->current_partition();
co_await ctx->save_readers(std::move(unconsumed_buffer), std::move(*compaction_state).detach_state(), std::move(last_pkey), std::move(last_ckey));
}
co_await ctx->stop();


@@ -175,6 +175,9 @@ class compact_mutation_state {
std::unique_ptr<mutation_compactor_garbage_collector> _collector;
compaction_stats _stats;
// Remember if we requested to stop mid-partition.
stop_iteration _stop = stop_iteration::no;
private:
static constexpr bool only_live() {
return OnlyLive == emit_only_live_rows::yes;
@@ -270,6 +273,7 @@ public:
}
void consume_new_partition(const dht::decorated_key& dk) {
_stop = stop_iteration::no;
auto& pk = dk.key();
_dk = &dk;
_return_static_content_on_partition_with_no_rows =
@@ -323,9 +327,9 @@ public:
_static_row_live = is_live;
if (is_live || (!only_live() && !sr.empty())) {
partition_is_not_empty(consumer);
return consumer.consume(std::move(sr), current_tombstone, is_live);
_stop = consumer.consume(std::move(sr), current_tombstone, is_live);
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -370,23 +374,22 @@ public:
if (only_live() && is_live) {
partition_is_not_empty(consumer);
auto stop = consumer.consume(std::move(cr), t, true);
_stop = consumer.consume(std::move(cr), t, true);
if (++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
} else if (!only_live()) {
auto stop = stop_iteration::no;
if (!cr.empty()) {
partition_is_not_empty(consumer);
stop = consumer.consume(std::move(cr), t, is_live);
_stop = consumer.consume(std::move(cr), t, is_live);
}
if (!sstable_compaction() && is_live && ++_rows_in_current_partition == _current_partition_limit) {
return stop_iteration::yes;
_stop = stop_iteration::yes;
}
return stop;
return _stop;
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -398,13 +401,13 @@ public:
if (rt.tomb > _range_tombstones.get_partition_tombstone()) {
if (can_purge_tombstone(rt.tomb)) {
partition_is_not_empty_for_gc_consumer(gc_consumer);
return gc_consumer.consume(std::move(rt));
_stop = gc_consumer.consume(std::move(rt));
} else {
partition_is_not_empty(consumer);
return consumer.consume(std::move(rt));
_stop = consumer.consume(std::move(rt));
}
}
return stop_iteration::no;
return _stop;
}
template <typename Consumer, typename GCConsumer>
@@ -492,9 +495,24 @@ public:
/// compactor will result in the new compactor being in the same state *this
/// is (given the same outside parameters of course). Practically this
/// allows the compaction state to be stored in the compacted reader.
detached_compaction_state detach_state() && {
/// If the currently compacted partition is exhausted a disengaged optional
/// is returned -- in this case there is no state to detach.
std::optional<detached_compaction_state> detach_state() && {
// If we exhausted the partition, there is no need to detach-restore the
// compaction state.
// We exhausted the partition if `consume_partition_end()` was called
// without us requesting the consumption to stop (remembered in _stop)
// from one of the consume() overloads.
// The consume algorithm calls `consume_partition_end()` in two cases:
// * on a partition-end fragment
// * consume() requested to stop
// In the latter case, the partition is not exhausted. Even if the next
// fragment to process is a partition-end, it will not be consumed.
if (!_stop) {
return {};
}
partition_start ps(std::move(_last_dk), _range_tombstones.get_partition_tombstone());
return {std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
return detached_compaction_state{std::move(ps), std::move(_last_static_row), std::move(_range_tombstones).range_tombstones()};
}
const compaction_stats& stats() const { return _stats; }


@@ -843,7 +843,6 @@ public:
void apply(shadowable_tombstone deleted_at) {
_deleted_at.apply(deleted_at, _marker);
maybe_shadow();
}
void apply(row_tombstone deleted_at) {


@@ -305,14 +305,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
const std::optional<position_in_partition>& last_row,
const std::optional<position_in_partition>& last_rts,
position_in_partition_view pos) {
if (!_rt_stream.empty()) {
return _rt_stream.get_next(std::move(pos));
}
return in_alloc_section([&] () -> mutation_fragment_opt {
maybe_refresh_state(ck_range_snapshot, last_row, last_rts);
position_in_partition::less_compare rt_less(_query_schema);
// The while below moves range tombstones from partition versions
// into _rt_stream, just enough to produce the next range tombstone
// The main goal behind moving to _rt_stream is to deoverlap range tombstones
// which have the same starting position. This is not in order to satisfy
// flat_mutation_reader stream requirements, the reader can emit range tombstones
// which have the same position incrementally. This is to guarantee forward
// progress in the case iterators get invalidated and maybe_refresh_state()
// above needs to restore them. It does so using last_rts, which tracks
// the position of the last emitted range tombstone. All range tombstones
// with positions <= than last_rts are skipped on refresh. To make progress,
// we need to make sure that all range tombstones with duplicated positions
// are emitted before maybe_refresh_state().
while (has_more_range_tombstones()
&& !rt_less(pos, peek_range_tombstone().position())
&& (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {


@@ -325,7 +325,7 @@ public:
// When throws, the cursor is invalidated and its position is not changed.
bool advance_to(position_in_partition_view lower_bound) {
prepare_heap(lower_bound);
bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position());
bool found = no_clustering_row_between_weak(_schema, lower_bound, _heap[0].it->position());
recreate_current_row();
return found;
}


@@ -575,6 +575,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
}
}
// Returns true if and only if there can't be any clustering_row with position >= a and < b.
// It is assumed that a <= b.
inline
bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
clustering_key_prefix::equality eq(s);
if (a.has_key() && b.has_key()) {
return eq(a.key(), b.key())
&& (a.get_bound_weight() == bound_weight::after_all_prefixed
|| b.get_bound_weight() != bound_weight::after_all_prefixed);
} else {
return !a.has_key() && !b.has_key();
}
}
// Includes all position_in_partition objects "p" for which: start <= p < end
// And only those.
class position_range {
@@ -659,3 +673,9 @@ inline
bool position_range::is_all_clustered_rows(const schema& s) const {
return _start.is_before_all_clustered_rows(s) && _end.is_after_all_clustered_rows(s);
}
// Assumes that the bounds of `r` are of 'clustered' type
// and that `r` is non-empty (the left bound is smaller than the right bound).
//
// If `r` does not contain any keys, returns nullopt.
std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema&);


@@ -379,3 +379,52 @@ foreign_ptr<lw_shared_ptr<query::result>> result_merger::get() {
}
}
std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema& s) {
assert(r.start().get_type() == partition_region::clustered);
assert(r.end().get_type() == partition_region::clustered);
if (r.start().has_key() && r.end().has_key()
&& clustering_key_prefix::equality(s)(r.start().key(), r.end().key())) {
assert(r.start().get_bound_weight() != r.end().get_bound_weight());
if (r.end().get_bound_weight() == bound_weight::after_all_prefixed
&& r.start().get_bound_weight() != bound_weight::after_all_prefixed) {
// [before x, after x) and [for x, after x) get converted to [x, x].
return query::clustering_range::make_singular(r.start().key());
}
// [before x, for x) does not contain any keys.
return std::nullopt;
}
// position_range -> clustering_range
// (recall that position_ranges are always left-closed, right opened):
// [before x, ...), [for x, ...) -> [x, ...
// [after x, ...) -> (x, ...
// [..., before x), [..., for x) -> ..., x)
// [..., after x) -> ..., x]
auto to_bound = [&s] (const position_in_partition& p, bool left) -> std::optional<query::clustering_range::bound> {
if (p.is_before_all_clustered_rows(s)) {
assert(left);
return {};
}
if (p.is_after_all_clustered_rows(s)) {
assert(!left);
return {};
}
assert(p.has_key());
auto bw = p.get_bound_weight();
bool inclusive = left
? bw != bound_weight::after_all_prefixed
: bw == bound_weight::after_all_prefixed;
return query::clustering_range::bound{p.key(), inclusive};
};
return query::clustering_range{to_bound(r.start(), true), to_bound(r.end(), false)};
}
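The bound-weight-to-inclusivity rules inside `to_bound` can be modeled in isolation; the enum below is a hypothetical Python stand-in for Scylla's `bound_weight`, with the same left/right asymmetry that follows from position_ranges being left-closed and right-open:

```python
from enum import Enum

class BoundWeight(Enum):
    BEFORE_ALL_PREFIXED = -1   # position "before x"
    EQUAL = 0                  # position "for x"
    AFTER_ALL_PREFIXED = 1     # position "after x"

def bound_inclusive(weight, left):
    """Whether a clustering-range bound derived from a position with the
    given weight includes the key; mirrors the ternary in to_bound()."""
    if left:
        # [before x, ...) and [for x, ...) start at x; [after x, ...) excludes it.
        return weight != BoundWeight.AFTER_ALL_PREFIXED
    # [..., after x) covers x; [..., before x) and [..., for x) stop short of it.
    return weight == BoundWeight.AFTER_ALL_PREFIXED
```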


@@ -109,7 +109,7 @@ void range_tombstone_list::insert_from(const schema& s,
if (cmp(end, it->position()) < 0) {
// not overlapping
if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
rev.update(it, {std::move(start), std::move(start), tomb});
rev.update(it, {std::move(start), std::move(end), tomb});
} else {
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
rev.insert(it, *rt);


@@ -22,6 +22,7 @@
#pragma once
#include "mutation_fragment.hh"
#include "mutation_fragment_v2.hh"
#include "converting_mutation_partition_applier.hh"
// A StreamedMutationTransformer which transforms the stream to a different schema

Submodule seastar updated: 34e58f9995...6217d6ff4e


@@ -1155,7 +1155,7 @@ private:
}
index_reader& get_index_reader() {
if (!_index_reader) {
auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
_index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
_consumer.trace_state(), caching);
}


@@ -1308,7 +1308,7 @@ private:
}
index_reader& get_index_reader() {
if (!_index_reader) {
auto caching = use_caching(!_slice.options.contains(query::partition_slice::option::bypass_cache));
auto caching = use_caching(global_cache_index_pages && !_slice.options.contains(query::partition_slice::option::bypass_cache));
_index_reader = std::make_unique<index_reader>(_sst, _consumer.permit(), _consumer.io_priority(),
_consumer.trace_state(), caching);
}
@@ -1745,9 +1745,7 @@ public:
_monitor.on_read_started(_context->reader_position());
}
public:
void on_out_of_clustering_range() override {
push_mutation_fragment(mutation_fragment_v2(*_schema, _permit, partition_end()));
}
void on_out_of_clustering_range() override { }
virtual future<> fast_forward_to(const dht::partition_range& pr) override {
on_internal_error(sstlog, "mx_crawling_sstable_mutation_reader: doesn't support fast_forward_to(const dht::partition_range&)");
}


@@ -68,7 +68,12 @@ private:
entry(entry&&) noexcept = default;
~entry() {
assert(!is_referenced());
if (is_referenced()) {
// Live entry_ptr should keep the entry alive, except when the entry failed on loading.
// In that case, entry_ptr holders are not supposed to use the pointer, so it's safe
// to nullify those entry_ptrs.
assert(!ready());
}
}
void on_evicted() noexcept override;


@@ -400,10 +400,15 @@ void time_series_sstable_set::for_each_sstable(std::function<void(const shared_s
// O(log n)
void time_series_sstable_set::insert(shared_sstable sst) {
try {
auto min_pos = sst->min_position();
auto max_pos_reversed = sst->max_position().reversed();
_sstables->emplace(std::move(min_pos), sst);
_sstables_reversed->emplace(std::move(max_pos_reversed), std::move(sst));
} catch (...) {
erase(sst);
throw;
}
}
// O(n) worst case, but should be close to O(log n) most of the time


@@ -94,6 +94,18 @@ thread_local disk_error_signal_type sstable_write_error;
namespace sstables {
// The below flag governs the mode of index file page caching used by the index
// reader.
//
// If set to true, the reader will read and/or populate a common global cache,
// which shares its capacity with the row cache. If false, the reader will use
// BYPASS CACHE semantics for index caching.
//
// This flag is intended to be a temporary hack. The goal is to eventually
// solve index caching problems via a smart cache replacement policy.
//
thread_local utils::updateable_value<bool> global_cache_index_pages(false);
logging::logger sstlog("sstable");
// Because this is a noop and won't hold any state, it is better to use a global than a


@@ -61,6 +61,7 @@
#include "sstables/open_info.hh"
#include "query-request.hh"
#include "mutation_fragment_stream_validator.hh"
#include "utils/updateable_value.hh"
#include <seastar/util/optimized_optional.hh>
@@ -70,6 +71,8 @@ class cached_file;
namespace sstables {
extern thread_local utils::updateable_value<bool> global_cache_index_pages;
namespace mc {
class writer;
}


@@ -1493,13 +1493,14 @@ bool table::can_flush() const {
}
future<> table::clear() {
auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
if (_commitlog) {
for (auto& t : *_memtables) {
_commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
}
}
_memtables->clear_and_add();
return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
}
// NOTE: does not need to be futurized, but might eventually, depending on


@@ -107,9 +107,9 @@ def test_describe_table_size(test_table):
# Test the ProvisionedThroughput attribute returned by DescribeTable.
# This is a very partial test: Our test table is configured without
# provisioned throughput, so obviously it will not have interesting settings
# for it. DynamoDB returns zeros for some of the attributes, even though
# the documentation suggests missing values should have been fine too.
@pytest.mark.xfail(reason="DescribeTable does not return provisioned throughput")
# for it. But DynamoDB documents that zeros be returned for WriteCapacityUnits
# and ReadCapacityUnits, and does this in practice as well - and some
# applications assume these numbers are always there (even if 0).
def test_describe_table_provisioned_throughput(test_table):
got = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
assert got['ProvisionedThroughput']['NumberOfDecreasesToday'] == 0


@@ -438,6 +438,126 @@ def test_gsi_update_second_regular_base_column(test_table_gsi_3):
KeyConditions={'a': {'AttributeValueList': [items[3]['a']], 'ComparisonOperator': 'EQ'},
'b': {'AttributeValueList': [items[3]['b']], 'ComparisonOperator': 'EQ'}})
# Test reproducing issue #11801: In issue #5006 we noticed that in the special
# case of a GSI with two non-key attributes as keys (test_table_gsi_3),
# an update of the second attribute forgot to delete the old row. We fixed
# that bug, but a bug remained for updates which update the value to the *same*
# value - in that case the old row shouldn't be deleted, but we did - as
# noticed in issue #11801.
def test_11801(test_table_gsi_3):
    p = random_string()
    a = random_string()
    b = random_string()
    item = {'p': p, 'a': a, 'b': b, 'd': random_string()}
    test_table_gsi_3.put_item(Item=item)
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    # Update the attribute 'b' to the same value b that it already had.
    # This shouldn't change anything in the base table or in the GSI.
    test_table_gsi_3.update_item(Key={'p': p}, AttributeUpdates={'b': {'Value': b, 'Action': 'PUT'}})
    assert item == test_table_gsi_3.get_item(Key={'p': p}, ConsistentRead=True)['Item']
    # In issue #11801, the following assertion failed (the view row was
    # deleted and nothing matched the query).
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    # Above we checked that setting 'b' to the same value didn't remove
    # the old GSI row. But the same update may actually modify the GSI row
    # (e.g., an unrelated attribute d) - check this modification took place:
    item['d'] = random_string()
    test_table_gsi_3.update_item(Key={'p': p},
        AttributeUpdates={'b': {'Value': b, 'Action': 'PUT'},
                          'd': {'Value': item['d'], 'Action': 'PUT'}})
    assert item == test_table_gsi_3.get_item(Key={'p': p}, ConsistentRead=True)['Item']
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
# This test is the same as test_11801, but updating the first attribute (a)
# instead of the second (b). This test didn't fail, showing that issue #11801
# is - like #5006 - specific to the case of updating the second attribute.
def test_11801_variant1(test_table_gsi_3):
    p = random_string()
    a = random_string()
    b = random_string()
    d = random_string()
    item = {'p': p, 'a': a, 'b': b, 'd': d}
    test_table_gsi_3.put_item(Item=item)
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    test_table_gsi_3.update_item(Key={'p': p}, AttributeUpdates={'a': {'Value': a, 'Action': 'PUT'}})
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
# This test is the same as test_11801, but updates b to a different value
# (newb) instead of to the same one. This test didn't fail, showing that
# issue #11801 is specific to updates to the same value. This test basically
# reproduces the already-fixed #5006 (we also have another test above which
# reproduces that issue - test_gsi_update_second_regular_base_column())
def test_11801_variant2(test_table_gsi_3):
    p = random_string()
    a = random_string()
    b = random_string()
    item = {'p': p, 'a': a, 'b': b, 'd': random_string()}
    test_table_gsi_3.put_item(Item=item)
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    newb = random_string()
    item['b'] = newb
    test_table_gsi_3.update_item(Key={'p': p}, AttributeUpdates={'b': {'Value': newb, 'Action': 'PUT'}})
    assert_index_query(test_table_gsi_3, 'hello', [],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [newb], 'ComparisonOperator': 'EQ'}})
# This test is the same as test_11801, but uses a different table schema
# (test_table_gsi_5) where there is only one new key column in the view (x).
# This test passed, showing that issue #11801 was specific to the special
# case of a view with two new key columns (test_table_gsi_3).
def test_11801_variant3(test_table_gsi_5):
    p = random_string()
    c = random_string()
    x = random_string()
    item = {'p': p, 'c': c, 'x': x, 'd': random_string()}
    test_table_gsi_5.put_item(Item=item)
    assert_index_query(test_table_gsi_5, 'hello', [item],
        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
                       'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
    test_table_gsi_5.update_item(Key={'p': p, 'c': c}, AttributeUpdates={'x': {'Value': x, 'Action': 'PUT'}})
    assert item == test_table_gsi_5.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
    assert_index_query(test_table_gsi_5, 'hello', [item],
        KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'},
                       'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}})
# Another test similar to test_11801, but instead of updating a view key
# column to the same value it already has, simply don't update it at all
# (and just modify some other regular column). This test passed, showing
# that issue #11801 is specific to the case of updating a view key column
# to the same value it already had.
def test_11801_variant4(test_table_gsi_3):
    p = random_string()
    a = random_string()
    b = random_string()
    item = {'p': p, 'a': a, 'b': b, 'd': random_string()}
    test_table_gsi_3.put_item(Item=item)
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
    # An update that doesn't change the GSI keys (a or b), just a regular
    # column d.
    item['d'] = random_string()
    test_table_gsi_3.update_item(Key={'p': p}, AttributeUpdates={'d': {'Value': item['d'], 'Action': 'PUT'}})
    assert item == test_table_gsi_3.get_item(Key={'p': p}, ConsistentRead=True)['Item']
    assert_index_query(test_table_gsi_3, 'hello', [item],
        KeyConditions={'a': {'AttributeValueList': [a], 'ComparisonOperator': 'EQ'},
                       'b': {'AttributeValueList': [b], 'ComparisonOperator': 'EQ'}})
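The rule these tests probe can be sketched in plain Python. This is an illustrative model only - the helper `view_row_ops`, its arguments, and the op tuples are hypothetical, not Scylla's actual materialized-view code: the old view row is deleted only when the update really changes a view key, which is exactly the behavior issue #11801 broke for same-value overwrites.

```python
def view_row_ops(old_item, new_item, view_keys):
    """Hypothetical model of view maintenance for one base-row update.

    Returns the operations to apply to the view: put the new view row
    (when all view keys are present), and delete the old view row only
    when the view key actually changed. Issue #11801 was a spurious
    delete in the "key unchanged" case.
    """
    old_key = tuple(old_item.get(k) for k in view_keys)
    new_key = tuple(new_item.get(k) for k in view_keys)
    ops = []
    if None not in new_key:
        ops.append(('put', new_key))
    if None not in old_key and old_key != new_key:
        ops.append(('delete', old_key))
    return ops

# Same-value overwrite of 'b': only a put, no delete of the old view row,
# matching test_11801's expectation.
ops = view_row_ops({'a': 'x', 'b': 'y', 'd': 1},
                   {'a': 'x', 'b': 'y', 'd': 2}, ('a', 'b'))
```

With a genuinely changed key (as in test_11801_variant2), the same function emits both a put for the new key and a delete for the old one.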
# Test that when a table has a GSI, if the indexed attribute is missing, the
# item is added to the base table but not the index.
# This is the same feature we already tested in test_gsi_missing_attribute()


@@ -374,6 +374,14 @@ def test_getitem_attributes_to_get_duplicate(dynamodb, test_table):
    with pytest.raises(ClientError, match='ValidationException.*Duplicate'):
        test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=['a', 'a'], ConsistentRead=True)
# Verify that it is forbidden to ask for an empty AttributesToGet
# Reproduces issue #10332.
def test_getitem_attributes_to_get_empty(dynamodb, test_table):
    p = random_string()
    c = random_string()
    with pytest.raises(ClientError, match='ValidationException'):
        test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=[], ConsistentRead=True)
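The server-side check these tests expect can be sketched as follows - a minimal illustration, assuming the behavior the tests encode (the function name and exception are hypothetical, not Alternator's real validation code):

```python
def validate_attributes_to_get(attrs):
    """Illustrative check: AttributesToGet, when present, must be a
    non-empty list of unique attribute names (the behavior exercised by
    the empty-list test above, issue #10332, and the duplicate test)."""
    if attrs is None:
        return  # parameter omitted - nothing to validate
    if len(attrs) == 0:
        raise ValueError('ValidationException: AttributesToGet must not be empty')
    if len(set(attrs)) != len(attrs):
        raise ValueError('ValidationException: duplicate attribute name')
```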
# Basic test for DeleteItem, with hash key only
def test_delete_item_hash(test_table_s):
    p = random_string()


@@ -170,6 +170,13 @@ def test_query_attributes_to_get(dynamodb, test_table):
    expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
    assert multiset(expected_items) == multiset(got_items)
# Verify that it is forbidden to ask for an empty AttributesToGet
# Reproduces issue #10332.
def test_query_attributes_to_get_empty(dynamodb, test_table):
    p = random_string()
    with pytest.raises(ClientError, match='ValidationException'):
        full_query(test_table, KeyConditions={'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}}, AttributesToGet=[])
# Test that in a table with both hash key and sort key, which keys we can
# Query by: We can Query by the hash key, by a combination of both hash and
# sort keys, but *cannot* query by just the sort key, and obviously not


@@ -25,6 +25,8 @@
#include <deque>
#include <random>
#include "utils/lsa/chunked_managed_vector.hh"
#include "utils/managed_ref.hh"
#include "test/lib/log.hh"
#include <boost/range/algorithm/sort.hpp>
#include <boost/range/algorithm/equal.hpp>
@@ -216,3 +218,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
    });
    return make_ready_future<>();
}
SEASTAR_TEST_CASE(test_clear_and_release) {
    region region;
    allocating_section as;
    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
        for (uint64_t i = 1; i < 4000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }
        v.clear_and_release();
    });
    return make_ready_future<>();
}
SEASTAR_TEST_CASE(test_chunk_reserve) {
    region region;
    allocating_section as;
    for (auto conf :
        { // std::make_pair(reserve size, push count)
            std::make_pair(0, 4000),
            std::make_pair(100, 4000),
            std::make_pair(200, 4000),
            std::make_pair(1000, 4000),
            std::make_pair(2000, 4000),
            std::make_pair(3000, 4000),
            std::make_pair(5000, 4000),
            std::make_pair(500, 8000),
            std::make_pair(1000, 8000),
            std::make_pair(2000, 8000),
            std::make_pair(8000, 500),
        })
    {
        with_allocator(region.allocator(), [&] {
            auto [reserve_size, push_count] = conf;
            testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
            lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
            v.reserve(reserve_size);
            uint64_t seed = rand();
            for (uint64_t i = 0; i < push_count; ++i) {
                as(region, [&] {
                    v.emplace_back(make_managed<uint64_t>(seed + i));
                    BOOST_REQUIRE(**v.begin() == seed);
                });
            }
            auto v_it = v.begin();
            for (uint64_t i = 0; i < push_count; ++i) {
                BOOST_REQUIRE(**v_it++ == seed + i);
            }
            v.clear_and_release();
        });
    }
    return make_ready_future<>();
}
// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    region region;
    allocating_section as;
    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
        // Fill two chunks
        v.reserve(2000);
        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }
        // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
        v.shrink_to_fit();
        // Leave the last chunk reserved but empty
        for (uint64_t i = 0; i < 1000; ++i) {
            v.pop_back();
        }
        // Try to reserve more than the currently reserved capacity and trigger the last_chunk_capacity_deficit path
        // with _size not in the last chunk. Should not sigsegv.
        v.reserve(8000);
        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }
        v.clear_and_release();
    });
    return make_ready_future<>();
}


@@ -191,3 +191,32 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
        BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
    }
}
// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
BOOST_AUTO_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    using vector_type = utils::chunked_vector<std::unique_ptr<uint64_t>>;
    vector_type v;
    // Fill two chunks
    v.reserve(vector_type::max_chunk_capacity() * 3 / 2);
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 3 / 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }
    // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
    v.shrink_to_fit();
    // Leave the last chunk reserved but empty
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity(); ++i) {
        v.pop_back();
    }
    // Try to reserve more than the currently reserved capacity and trigger the last_chunk_capacity_deficit path
    // with _size not in the last chunk. Should not sigsegv.
    v.reserve(vector_type::max_chunk_capacity() * 4);
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }
}
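The boundary condition the two tests above exercise comes down to simple chunk arithmetic, sketched here in Python. `CHUNK` stands in for `max_chunk_capacity()`; this illustrates the geometry of the scenario, not the C++ implementation:

```python
CHUNK = 8  # stand-in for vector_type::max_chunk_capacity()

def chunks_for(capacity):
    # number of chunks backing a reservation of `capacity` elements
    return (capacity + CHUNK - 1) // CHUNK

def last_chunk_capacity(capacity):
    # capacity of the final chunk; shrink_to_fit() can leave it partial,
    # i.e. with a "capacity deficit" relative to CHUNK
    rem = capacity % CHUNK
    return rem if rem else CHUNK

# The tests' scenario: reserve 1.5 chunks, so the last chunk is partial;
# after popping a full chunk's worth of elements, _size lands in chunk 0,
# not in the deficient last chunk - the case make_room() mishandled.
capacity = CHUNK * 3 // 2   # two chunks, the second only half-capacity
size = capacity - CHUNK     # element count after the pop_back() loop
```

A later `reserve()` larger than `capacity` must then expand the partial last chunk even though `_size` is not inside it.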


@@ -32,7 +32,7 @@
#include "cql3/util.hh"
//
-// Test basic CQL string quoting
+// Test basic CQL identifier quoting
//
BOOST_AUTO_TEST_CASE(maybe_quote) {
    std::string s(65536, 'x');
@@ -67,6 +67,16 @@ BOOST_AUTO_TEST_CASE(maybe_quote) {
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("\"\""), "\"\"\"\"\"\"");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("\"hell0\""), "\"\"\"hell0\"\"\"");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("hello \"my\" world"), "\"hello \"\"my\"\" world\"");
    // Reproducer for issue #9450. Reserved keywords like "to" or "where"
    // need quoting, but unreserved keywords like "ttl", "int" or "as"
    // do not.
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("to"), "\"to\"");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("where"), "\"where\"");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("ttl"), "ttl");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("int"), "int");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("as"), "as");
    BOOST_REQUIRE_EQUAL(cql3::util::maybe_quote("ttl hi"), "\"ttl hi\"");
}
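The quoting rule being tested can be sketched as follows. The keyword set here is a small illustrative subset of the CQL grammar's reserved-keyword list, not the full table, and the function is a model of `cql3::util::maybe_quote`, not its real implementation:

```python
import re

RESERVED = {'to', 'where', 'select', 'from'}  # illustrative subset only

def maybe_quote(ident):
    """Sketch of the rule: a plain lower-case identifier that is not a
    reserved keyword needs no quoting; anything else (reserved keywords,
    upper case, spaces, embedded quotes) is wrapped in double quotes,
    with embedded double quotes doubled."""
    if re.fullmatch(r'[a-z][a-z0-9_]*', ident) and ident not in RESERVED:
        return ident
    return '"' + ident.replace('"', '""') + '"'
```

Unreserved keywords like `ttl`, `int`, or `as` match the plain-identifier pattern and pass through unquoted, which is exactly what the assertions above check.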
//


@@ -784,3 +784,38 @@ SEASTAR_TEST_CASE(upgrade_sstables) {
        }).get();
    });
}
SEASTAR_TEST_CASE(database_drop_column_family_clears_querier_cache) {
    return do_with_cql_env_thread([] (cql_test_env& e) {
        e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
        auto& db = e.local_db();
        const auto ts = db_clock::now();
        auto& tbl = db.find_column_family("ks", "cf");
        auto op = std::optional(tbl.read_in_progress());
        auto s = tbl.schema();
        auto q = query::data_querier(
                tbl.as_mutation_source(),
                tbl.schema(),
                database_test(db).get_user_read_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout),
                query::full_partition_range,
                s->full_slice(),
                default_priority_class(),
                nullptr);
        auto f = e.db().invoke_on_all([ts] (database& db) {
            return db.drop_column_family("ks", "cf", [ts] { return make_ready_future<db_clock::time_point>(ts); });
        });
        // we add a querier to the querier cache while the drop is ongoing
        auto& qc = db.get_querier_cache();
        qc.insert(utils::make_random_uuid(), std::move(q), nullptr);
        BOOST_REQUIRE_EQUAL(qc.get_stats().population, 1);
        op.reset(); // this should allow the drop to finish
        f.get();
        // the drop should have cleaned up all entries belonging to that table
        BOOST_REQUIRE_EQUAL(qc.get_stats().population, 0);
    });
}


@@ -941,3 +941,28 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_is_mutation_source) {
    };
    run_mutation_source_tests(populate);
}
SEASTAR_THREAD_TEST_CASE(test_allow_reader_early_destruction) {
    struct test_reader_impl : public flat_mutation_reader::impl {
        using flat_mutation_reader::impl::impl;
        virtual future<> fill_buffer() override { return make_ready_future<>(); }
        virtual future<> next_partition() override { return make_ready_future<>(); }
        virtual future<> fast_forward_to(const dht::partition_range&) override { return make_ready_future<>(); }
        virtual future<> fast_forward_to(position_range) override { return make_ready_future<>(); }
        virtual future<> close() noexcept override { return make_ready_future<>(); };
    };
    struct test_reader_v2_impl : public flat_mutation_reader_v2::impl {
        using flat_mutation_reader_v2::impl::impl;
        virtual future<> fill_buffer() override { return make_ready_future<>(); }
        virtual future<> next_partition() override { return make_ready_future<>(); }
        virtual future<> fast_forward_to(const dht::partition_range&) override { return make_ready_future<>(); }
        virtual future<> fast_forward_to(position_range) override { return make_ready_future<>(); }
        virtual future<> close() noexcept override { return make_ready_future<>(); };
    };
    simple_schema s;
    tests::reader_concurrency_semaphore_wrapper semaphore;
    // These readers are not closed, but didn't start any operations, so it's safe for them to be destroyed.
    auto reader = make_flat_mutation_reader<test_reader_impl>(s.schema(), semaphore.make_permit());
    auto reader_v2 = make_flat_mutation_reader_v2<test_reader_v2_impl>(s.schema(), semaphore.make_permit());
}


@@ -391,3 +391,87 @@ SEASTAR_TEST_CASE(test_loading_cache_reload_during_eviction) {
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
    });
}
SEASTAR_THREAD_TEST_CASE(test_loading_cache_remove_leaves_no_old_entries_behind) {
    using namespace std::chrono;
    load_count = 0;
    auto load_v1 = [] (auto key) { return make_ready_future<sstring>("v1"); };
    auto load_v2 = [] (auto key) { return make_ready_future<sstring>("v2"); };
    auto load_v3 = [] (auto key) { return make_ready_future<sstring>("v3"); };
    {
        utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
        auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
        //
        // Test remove() concurrent with loading
        //
        auto f = loading_cache.get_ptr(0, [&](auto key) {
            return later().then([&] {
                return load_v1(key);
            });
        });
        loading_cache.remove(0);
        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
        auto ptr1 = f.get0();
        BOOST_REQUIRE_EQUAL(*ptr1, "v1");
        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
        ptr1 = loading_cache.get_ptr(0, load_v2).get0();
        loading_cache.remove(0);
        BOOST_REQUIRE_EQUAL(*ptr1, "v2");
        //
        // Test that live ptr1, removed from cache, does not prevent reload of new value
        //
        auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
        ptr1 = nullptr;
        BOOST_REQUIRE_EQUAL(*ptr2, "v3");
    }
    // Test remove_if()
    {
        utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
        auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
        //
        // Test remove_if() concurrent with loading
        //
        auto f = loading_cache.get_ptr(0, [&](auto key) {
            return later().then([&] {
                return load_v1(key);
            });
        });
        loading_cache.remove_if([] (auto&& v) { return v == "v1"; });
        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
        auto ptr1 = f.get0();
        BOOST_REQUIRE_EQUAL(*ptr1, "v1");
        BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
        BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
        ptr1 = loading_cache.get_ptr(0, load_v2).get0();
        loading_cache.remove_if([] (auto&& v) { return v == "v2"; });
        BOOST_REQUIRE_EQUAL(*ptr1, "v2");
        //
        // Test that live ptr1, removed from cache, does not prevent reload of new value
        //
        auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
        ptr1 = nullptr;
        BOOST_REQUIRE_EQUAL(*ptr2, "v3");
        ptr2 = nullptr;
    }
}
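The invariant this test checks - a remove() racing with an in-flight load must hand the loaded value to the waiting caller without letting it repopulate the cache - can be modeled with a per-key generation counter. This is a toy sketch; the class, counter, and method names are illustrative, not the `loading_cache` implementation:

```python
class GenerationCache:
    """Toy model: get_or_load inserts the loaded value only if no
    remove() happened for that key while the load was in flight."""
    def __init__(self):
        self._map = {}
        self._gen = {}

    def get_or_load(self, key, loader):
        if key in self._map:
            return self._map[key]
        gen = self._gen.get(key, 0)
        value = loader(key)               # the "in flight" load
        if self._gen.get(key, 0) == gen:  # no remove() raced us
            self._map[key] = value
        return value                      # caller gets the value either way

    def remove(self, key):
        self._map.pop(key, None)
        self._gen[key] = self._gen.get(key, 0) + 1

cache = GenerationCache()
# Simulate a remove() arriving while the load is still running:
v = cache.get_or_load(0, lambda k: (cache.remove(k), 'v1')[1])
```

The caller still receives "v1", but the cache stays empty for key 0, mirroring the `find(0) == nullptr` assertions above.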


@@ -39,6 +39,9 @@
#include "test/lib/random_utils.hh"
#include "test/lib/log.hh"
#include "test/lib/reader_concurrency_semaphore.hh"
#include "test/lib/simple_schema.hh"
#include "test/lib/make_random_string.hh"
#include "utils/error_injection.hh"
static api::timestamp_type next_timestamp() {
    static thread_local api::timestamp_type next_timestamp = 1;
@@ -528,6 +531,74 @@ SEASTAR_TEST_CASE(test_exception_safety_of_single_partition_reads) {
    });
}
SEASTAR_THREAD_TEST_CASE(test_tombstone_merging_with_multiple_versions) {
    tests::reader_concurrency_semaphore_wrapper semaphore;
    simple_schema ss;
    auto s = ss.schema();
    auto mt = make_lw_shared<memtable>(ss.schema());
    auto pk = ss.make_pkey(0);
    auto pr = dht::partition_range::make_singular(pk);
    auto t0 = ss.new_tombstone();
    auto t1 = ss.new_tombstone();
    auto t2 = ss.new_tombstone();
    auto t3 = ss.new_tombstone();
    mutation m1(s, pk);
    ss.delete_range(m1, *position_range_to_clustering_range(position_range(
            position_in_partition::before_key(ss.make_ckey(0)),
            position_in_partition::for_key(ss.make_ckey(3))), *s), t1);
    ss.add_row(m1, ss.make_ckey(0), "v");
    ss.add_row(m1, ss.make_ckey(1), "v");
    // Fill so that rd1 stays in the partition snapshot
    int n_rows = 1000;
    auto v = make_random_string(512);
    for (int i = 0; i < n_rows; ++i) {
        ss.add_row(m1, ss.make_ckey(i), v);
    }
    mutation m2(s, pk);
    ss.delete_range(m2, *position_range_to_clustering_range(position_range(
            position_in_partition::before_key(ss.make_ckey(0)),
            position_in_partition::before_key(ss.make_ckey(1))), *s), t2);
    ss.delete_range(m2, *position_range_to_clustering_range(position_range(
            position_in_partition::before_key(ss.make_ckey(1)),
            position_in_partition::for_key(ss.make_ckey(3))), *s), t3);
    mutation m3(s, pk);
    ss.delete_range(m3, *position_range_to_clustering_range(position_range(
            position_in_partition::before_key(ss.make_ckey(0)),
            position_in_partition::for_key(ss.make_ckey(4))), *s), t0);
    mt->apply(m1);
    auto rd1 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
            nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
    auto close_rd1 = defer([&] { rd1.close().get(); });
    rd1.fill_buffer().get();
    BOOST_REQUIRE(!rd1.is_end_of_stream()); // rd1 must keep the m1 version alive
    mt->apply(m2);
    auto rd2 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
            nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
    auto close_r2 = defer([&] { rd2.close().get(); });
    rd2.fill_buffer().get();
    BOOST_REQUIRE(!rd2.is_end_of_stream()); // rd2 must keep the m2 version alive
    mt->apply(m3);
    assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
        .has_monotonic_positions();
    assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
        .produces(m1 + m2 + m3);
}
SEASTAR_TEST_CASE(test_hash_is_cached) {
    return seastar::async([] {
        auto s = schema_builder("ks", "cf")


@@ -560,7 +560,7 @@ SEASTAR_TEST_CASE(test_apply_to_incomplete_respects_continuity) {
static mutation_partition read_using_cursor(partition_snapshot& snap) {
    tests::reader_concurrency_semaphore_wrapper semaphore;
    partition_snapshot_row_cursor cur(*snap.schema(), snap);
-    cur.maybe_refresh();
+    cur.advance_to(position_in_partition::before_all_clustered_rows());
    auto mp = read_partition_from(*snap.schema(), cur);
    for (auto&& rt : snap.range_tombstones()) {
        mp.apply_delete(*snap.schema(), rt);


@@ -763,9 +763,8 @@ SEASTAR_THREAD_TEST_CASE(multi_col_in) {
    cquery_nofail(e, "insert into t(pk,ck1,ck2,r) values (4,13,23,'a')");
    require_rows(e, "select pk from t where (ck1,ck2) in ((13,23)) allow filtering", {{I(3)}, {I(4)}});
    require_rows(e, "select pk from t where (ck1) in ((13),(33),(44)) allow filtering", {{I(3)}, {I(4)}});
-    // TODO: uncomment when #6200 is fixed.
-    // require_rows(e, "select pk from t where (ck1,ck2) in ((13,23)) and r='a' allow filtering",
-    //              {{I(4), I(13), F(23), T("a")}});
+    require_rows(e, "select pk from t where (ck1,ck2) in ((13,23)) and r='a' allow filtering",
+                 {{I(4), I(13), F(23), T("a")}});
    cquery_nofail(e, "delete from t where pk=4");
    require_rows(e, "select pk from t where (ck1,ck2) in ((13,23)) allow filtering", {{I(3)}});
    auto stmt = e.prepare("select ck1 from t where (ck1,ck2) in ? allow filtering").get0();


@@ -1242,9 +1242,13 @@ SEASTAR_TEST_CASE(test_update_failure) {
class throttle {
    unsigned _block_counter = 0;
    promise<> _p; // valid when _block_counter != 0, resolves when it goes down to 0
    std::optional<promise<>> _entered;
    bool _one_shot;
public:
    // one_shot means that only the first enter() after block() will block.
    throttle(bool one_shot = false) : _one_shot(one_shot) {}
    future<> enter() {
-        if (_block_counter) {
+        if (_block_counter && (!_one_shot || _entered)) {
            promise<> p1;
            promise<> p2;
@@ -1256,16 +1260,21 @@ public:
                p3.set_value();
            });
            _p = std::move(p2);
            if (_entered) {
                _entered->set_value();
                _entered.reset();
            }
            return f1;
        } else {
            return make_ready_future<>();
        }
    }
-    void block() {
+    future<> block() {
        ++_block_counter;
        _p = promise<>();
+        _entered = promise<>();
+        return _entered->get_future();
    }
    void unblock() {
void unblock() {
@@ -1410,7 +1419,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
        mt2->apply(m);
    }
-    thr.block();
+    auto f = thr.block();
    auto m0_range = dht::partition_range::make_singular(ring[0].ring_position());
    auto rd1 = cache.make_reader(s, semaphore.make_permit(), m0_range);
@@ -1421,6 +1430,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
    rd2.set_max_buffer_size(1);
    auto rd2_fill_buffer = rd2.fill_buffer();
+    f.get();
    sleep(10ms).get();
    // This update should miss on all partitions
@@ -1548,12 +1558,13 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) {
        mt2->apply(m);
    }
-    thr.block();
+    auto f = thr.block();
    auto rd1 = cache.make_reader(s, semaphore.make_permit());
    rd1.set_max_buffer_size(1);
    auto rd1_fill_buffer = rd1.fill_buffer();
+    f.get();
    sleep(10ms).get();
    // This update should miss on all partitions
@@ -3777,3 +3788,81 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
        BOOST_REQUIRE_EQUAL(tracker.get_stats().rows, 2);
    });
}
SEASTAR_TEST_CASE(test_eviction_of_upper_bound_of_population_range) {
    return seastar::async([] {
        simple_schema s;
        tests::reader_concurrency_semaphore_wrapper semaphore;
        auto cache_mt = make_lw_shared<memtable>(s.schema());
        auto pkey = s.make_pkey("pk");
        mutation m1(s.schema(), pkey);
        s.add_row(m1, s.make_ckey(1), "v1");
        s.add_row(m1, s.make_ckey(2), "v2");
        cache_mt->apply(m1);
        cache_tracker tracker;
        throttle thr(true);
        auto cache_source = make_decorated_snapshot_source(snapshot_source([&] { return cache_mt->as_data_source(); }),
                [&] (mutation_source src) {
                    return throttled_mutation_source(thr, std::move(src));
                });
        row_cache cache(s.schema(), cache_source, tracker);
        auto pr = dht::partition_range::make_singular(pkey);
        auto read = [&] (int start, int end) {
            auto slice = partition_slice_builder(*s.schema())
                    .with_range(query::clustering_range::make(s.make_ckey(start), s.make_ckey(end)))
                    .build();
            auto rd = cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice);
            auto close_rd = deferred_close(rd);
            auto m_cache = read_mutation_from_flat_mutation_reader(rd).get0();
            close_rd.close_now();
            rd = cache_mt->make_flat_reader(s.schema(), semaphore.make_permit(), pr, slice);
            auto close_rd2 = deferred_close(rd);
            auto m_mt = read_mutation_from_flat_mutation_reader(rd).get0();
            BOOST_REQUIRE(m_mt);
            assert_that(m_cache).has_mutation().is_equal_to(*m_mt);
        };
        // populate [2]
        {
            auto slice = partition_slice_builder(*s.schema())
                    .with_range(query::clustering_range::make_singular(s.make_ckey(2)))
                    .build();
            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice))
                .has_monotonic_positions();
        }
        auto arrived = thr.block();
        // Read [0, 2]
        auto f = seastar::async([&] {
            read(0, 2);
        });
        arrived.get();
        // populate (2, 3]
        {
            auto slice = partition_slice_builder(*s.schema())
                    .with_range(query::clustering_range::make(query::clustering_range::bound(s.make_ckey(2), false),
                                                              query::clustering_range::bound(s.make_ckey(3), true)))
                    .build();
            assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr, slice))
                .has_monotonic_positions();
        }
        testlog.trace("Evicting");
        evict_one_row(tracker); // Evicts before(0)
        evict_one_row(tracker); // Evicts ck(2)
        testlog.trace("Unblocking");
        thr.unblock();
        f.get();
        read(0, 3);
    });
}
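The one_shot throttle this test relies on can be modeled synchronously in Python. This is a sketch of the control flow only - Seastar's version exchanges futures, while here `enter()` just reports whether the caller would block, and `arrived` stands in for the future `block()` returns:

```python
class Throttle:
    """Synchronous model of the test's one_shot throttle: block() arms
    the throttle and exposes an 'arrived' flag that the first enter()
    sets; with one_shot=True only that first enter() blocks, later ones
    pass straight through."""
    def __init__(self, one_shot=False):
        self.blocked = False
        self.one_shot = one_shot
        self.waiting_for_enter = False
        self.arrived = False

    def block(self):
        self.blocked = True
        self.waiting_for_enter = True
        self.arrived = False

    def enter(self):
        # Returns True if the caller would have to wait.
        if self.blocked and (not self.one_shot or self.waiting_for_enter):
            if self.waiting_for_enter:
                self.arrived = True          # resolves block()'s "future"
                self.waiting_for_enter = False
            return True
        return False

    def unblock(self):
        self.blocked = False
```

The test uses exactly this handshake: it waits on `block()`'s result to know the first cache miss has reached the underlying source, evicts rows while that read is parked, then unblocks.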


@@ -72,6 +72,7 @@
#include "test/lib/reader_concurrency_semaphore.hh"
#include "test/lib/sstable_utils.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/random_schema.hh"
namespace fs = std::filesystem;
@@ -3003,3 +3004,58 @@ SEASTAR_TEST_CASE(sstable_reader_with_timeout) {
        });
    });
}
SEASTAR_TEST_CASE(test_crawling_reader_out_of_range_last_range_tombstone_change) {
    return test_env::do_with_async([] (test_env& env) {
        simple_schema table;
        auto mut = table.new_mutation("pk0");
        auto ckeys = table.make_ckeys(4);
        table.add_row(mut, ckeys[0], "v0");
        table.add_row(mut, ckeys[1], "v1");
        table.add_row(mut, ckeys[2], "v2");
        using bound = query::clustering_range::bound;
        table.delete_range(mut, query::clustering_range::make(bound{ckeys[3], true}, bound{clustering_key::make_empty(), true}), tombstone(1, gc_clock::now()));
        auto tmp = tmpdir();
        auto sst_gen = [&env, &table, &tmp] () {
            return env.make_sstable(table.schema(), tmp.path().string(), 1, sstables::get_highest_sstable_version(), big);
        };
        auto sst = make_sstable_containing(sst_gen, {mut});
        assert_that(sst->make_crawling_reader(table.schema(), env.make_reader_permit())).has_monotonic_positions();
    });
}
SEASTAR_TEST_CASE(test_crawling_reader_random_schema_random_mutations) {
    return test_env::do_with_async([this] (test_env& env) {
        auto random_spec = tests::make_random_schema_specification(
                get_name(),
                std::uniform_int_distribution<size_t>(1, 4),
                std::uniform_int_distribution<size_t>(2, 4),
                std::uniform_int_distribution<size_t>(2, 8),
                std::uniform_int_distribution<size_t>(2, 8));
        auto random_schema = tests::random_schema{tests::random::get_int<uint32_t>(), *random_spec};
        auto schema = random_schema.schema();
        testlog.info("Random schema:\n{}", random_schema.cql());
        const auto muts = tests::generate_random_mutations(random_schema, 20).get();
        auto tmp = tmpdir();
        auto sst_gen = [&env, schema, &tmp] () {
            return env.make_sstable(schema, tmp.path().string(), 1, sstables::get_highest_sstable_version(), big);
        };
        auto sst = make_sstable_containing(sst_gen, muts);
        {
            auto rd = assert_that(sst->make_crawling_reader(schema, env.make_reader_permit()));
            for (const auto& mut : muts) {
                rd.produces(mut);
            }
        }
        assert_that(sst->make_crawling_reader(schema, env.make_reader_permit())).has_monotonic_positions();
    });
}


@@ -49,10 +49,18 @@ static void add_entry(logalloc::region& r,
static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
    partition_index_page page;
    auto destroy_page = defer([&] {
        with_allocator(r.allocator(), [&] {
            auto p = std::move(page);
        });
    });
    add_entry(r, *s.schema(), page, s.make_pkey(0).key(), 0);
    add_entry(r, *s.schema(), page, s.make_pkey(1).key(), 1);
    add_entry(r, *s.schema(), page, s.make_pkey(2).key(), 2);
    add_entry(r, *s.schema(), page, s.make_pkey(3).key(), 3);
    destroy_page.cancel();
    return page;
}
@@ -143,6 +151,47 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
    }
}
template <typename T>
static future<> ignore_result(future<T>&& f) {
    return f.then_wrapped([] (auto&& f) {
        try {
            f.get();
        } catch (...) {
            // expected, silence warnings about ignored failed futures
        }
    });
}
SEASTAR_THREAD_TEST_CASE(test_exception_while_loading) {
    ::lru lru;
    simple_schema s;
    logalloc::region r;
    partition_index_cache cache(lru, r);
    auto clear_lru = defer([&] {
        with_allocator(r.allocator(), [&] {
            lru.evict_all();
        });
    });
    auto page0_loader = [&] (partition_index_cache::key_type k) {
        return later().then([&] {
            return make_page0(r, s);
        });
    };
    memory::with_allocation_failures([&] {
        cache.evict_gently().get();
        auto f0 = ignore_result(cache.get_or_load(0, page0_loader));
        auto f1 = ignore_result(cache.get_or_load(0, page0_loader));
        f0.get();
        f1.get();
    });
    auto ptr = cache.get_or_load(0, page0_loader).get0();
    has_page0(ptr);
}
SEASTAR_THREAD_TEST_CASE(test_auto_clear) {
    ::lru lru;
    simple_schema s;


@@ -19,6 +19,7 @@ from cassandra.cluster import ConsistencyLevel
from cassandra.query import SimpleStatement
from util import new_test_table
from nodetool import flush
def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):
    '''Test that the stream IDs chosen for CDC log entries come from the CDC generation
@@ -44,3 +45,16 @@ def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):
    assert(log_stream_ids.issubset(stream_ids))
# Test for #10473 - reading logs (from sstable) after dropping
# column in base.
def test_cdc_alter_table_drop_column(scylla_only, cql, test_keyspace):
    schema = "pk int primary key, v int"
    extra = " with cdc = {'enabled': true}"
    with new_test_table(cql, test_keyspace, schema, extra) as table:
        cql.execute(f"insert into {table} (pk, v) values (0, 0)")
        cql.execute(f"insert into {table} (pk, v) values (1, null)")
        flush(cql, table)
        flush(cql, table + "_scylla_cdc_log")
        cql.execute(f"alter table {table} drop v")
        cql.execute(f"select * from {table}_scylla_cdc_log")


@@ -24,9 +24,11 @@
# is or isn't necessary.
import pytest
import re
from util import new_test_table
from cassandra.protocol import InvalidRequest
from cassandra.connection import DRIVER_NAME, DRIVER_VERSION
from cassandra.query import UNSET_VALUE
# When filtering for "x > 0" or "x < 0", rows with an unset value for x
# should not match the filter.
@@ -141,3 +143,118 @@ def test_index_with_in_relation(scylla_only, cql, test_keyspace):
            cql.execute(f"insert into {table} (p,c,v) values ({p}, {c}, {v})")
        res = cql.execute(f"select * from {table} where p in (0,1) and v = False ALLOW FILTERING")
        assert set(res) == set([(0,1,False),(0,3,False),(1,1,False),(1,3,False)])
# Test that LIKE operator works fine as a filter when the filtered column
# has descending order. Regression test for issue #10183, when it was incorrectly
# rejected as a "non-string" column.
def test_filter_like_on_desc_column(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "a int, b text, primary key(a, b)",
                        extra="with clustering order by (b desc)") as table:
        cql.execute(f"INSERT INTO {table} (a, b) VALUES (1, 'one')")
        res = cql.execute(f"SELECT b FROM {table} WHERE b LIKE '%%%' ALLOW FILTERING")
        assert res.one().b == "one"
# Test that IN restrictions are supported with filtering and return the
# correct results.
# We mark this test "cassandra_bug" because Cassandra could support this
# feature but doesn't yet: It reports "IN predicates on non-primary-key
# columns (v) is not yet supported" when v is a regular column, or "IN
# restrictions are not supported when the query involves filtering" on
# partition-key columns p1 or p2. By the way, it does support IN restrictions
# on a clustering-key column.
def test_filtering_with_in_relation(cql, test_keyspace, cassandra_bug):
    schema = 'p1 int, p2 int, c int, v int, primary key ((p1, p2),c)'
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f"INSERT INTO {table} (p1, p2, c, v) VALUES (1, 2, 3, 4)")
        cql.execute(f"INSERT INTO {table} (p1, p2, c, v) VALUES (2, 3, 4, 5)")
        cql.execute(f"INSERT INTO {table} (p1, p2, c, v) VALUES (3, 4, 5, 6)")
        cql.execute(f"INSERT INTO {table} (p1, p2, c, v) VALUES (4, 5, 6, 7)")
        res = cql.execute(f"select * from {table} where p1 in (2,4) ALLOW FILTERING")
        assert set(res) == set([(2,3,4,5), (4,5,6,7)])
        res = cql.execute(f"select * from {table} where p2 in (2,4) ALLOW FILTERING")
        assert set(res) == set([(1,2,3,4), (3,4,5,6)])
        res = cql.execute(f"select * from {table} where c in (3,5) ALLOW FILTERING")
        assert set(res) == set([(1,2,3,4), (3,4,5,6)])
        res = cql.execute(f"select * from {table} where v in (5,7) ALLOW FILTERING")
        assert set(res) == set([(2,3,4,5), (4,5,6,7)])
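The filtering semantics being verified here reduce to a simple set comprehension: keep rows whose value in the restricted column is among the listed values. A minimal Python model of this (illustrative only, using the same four rows the test inserts):

```python
# The four rows inserted by the test above, as (p1, p2, c, v) tuples:
rows = [(1, 2, 3, 4), (2, 3, 4, 5), (3, 4, 5, 6), (4, 5, 6, 7)]

def filter_in(rows, col_index, values):
    # Model of ALLOW FILTERING with an IN restriction: keep rows whose
    # value in the given column is among the listed values.
    return {r for r in rows if r[col_index] in values}

# Same expectations as the CQL queries above:
assert filter_in(rows, 0, (2, 4)) == {(2, 3, 4, 5), (4, 5, 6, 7)}   # p1 in (2,4)
assert filter_in(rows, 1, (2, 4)) == {(1, 2, 3, 4), (3, 4, 5, 6)}   # p2 in (2,4)
assert filter_in(rows, 3, (5, 7)) == {(2, 3, 4, 5), (4, 5, 6, 7)}   # v in (5,7)
```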
# Test that subscripts in expressions work as expected. They should only work
# on map columns, and must have the correct type. Test that they also work
# as expected for null or unset subscripts.
# Cassandra considers the null subscript 'm[null]' to be an invalid request.
# In Scylla we decided to do it differently (we think better): m[null] is simply
# a null, so the filter 'WHERE m[null] = 2' is not an error - it just doesn't
# match anything. This is more consistent with our usual null handling
# (null[2] and null < 2 are both defined as returning null), and will also
# allow us in the future to support non-constant subscript - for example m[a]
# where the column a can be null for some rows and non-null for other rows.
# Because we decided that our behavior is better than Cassandra's, this test
# fails on Cassandra and is marked with cassandra_bug.
# This test is a superset of test_null.py::test_map_subscript_null which
# tests only the special case of a null subscript.
# Reproduces #10361
def test_filtering_with_subscript(cql, test_keyspace, cassandra_bug):
    with new_test_table(cql, test_keyspace,
            "p int, m1 map<int, int>, m2 map<text, text>, s set<int>, PRIMARY KEY (p)") as table:
        # Check for *errors* in subscript expressions - such as wrong type or
        # null - with an empty table. This will force the implementation to
        # check for these errors before actually evaluating the filter
        # expression - because there will be no rows to filter.
        # A subscript is not allowed on a non-map column (in this case, a set)
        with pytest.raises(InvalidRequest, match='cannot be used as a map'):
            cql.execute(f"SELECT p FROM {table} WHERE s[2] = 3 ALLOW FILTERING")
        # Passing a value of the wrong type as the subscript is not allowed
        with pytest.raises(InvalidRequest, match=re.escape('key(m1)')):
            cql.execute(f"select p from {table} where m1['black'] = 2 ALLOW FILTERING")
        with pytest.raises(InvalidRequest, match=re.escape('key(m2)')):
            cql.execute(f"select p from {table} where m2[1] = 2 ALLOW FILTERING")
        # See discussion of m1[null] above. Reproduces #10361, and fails
        # on Cassandra (Cassandra deliberately returns an error here -
        # an InvalidRequest with "Unsupported null map key for column m1").
        assert list(cql.execute(f"select p from {table} where m1[null] = 2 ALLOW FILTERING")) == []
        assert list(cql.execute(f"select p from {table} where m2[null] = 'hi' ALLOW FILTERING")) == []
        # Similar to the above checks, but using a prepared statement. We can't
        # cause the driver to send the wrong type to a bound variable, so we
        # can't check that case unfortunately, but we have a new UNSET_VALUE
        # case.
        stmt = cql.prepare(f"select p from {table} where m1[?] = 2 ALLOW FILTERING")
        assert list(cql.execute(stmt, [None])) == []
        # The expression m1[UNSET_VALUE] should be an error, but because the
        # table is empty, we do not actually need to evaluate the expression
        # and the error might not be caught. So this check is commented out
        # here; we repeat it below, after we add some data, to ensure that the
        # expression does need to be evaluated.
        #with pytest.raises(InvalidRequest, match='Unsupported unset map key for column m1'):
        #    cql.execute(stmt, [UNSET_VALUE])
        # Finally, check for successful filtering with subscripts. For that we
        # need to add some data:
        cql.execute("INSERT INTO "+table+" (p, m1, m2) VALUES (1, {1:2, 3:4}, {'dog':'cat', 'hi':'hello'})")
        cql.execute("INSERT INTO "+table+" (p, m1, m2) VALUES (2, {2:3, 4:5}, {'man':'woman', 'black':'white'})")
        res = cql.execute(f"select p from {table} where m1[1] = 2 ALLOW FILTERING")
        assert list(res) == [(1,)]
        res = cql.execute(f"select p from {table} where m2['black'] = 'white' ALLOW FILTERING")
        assert list(res) == [(2,)]
        res = cql.execute(stmt, [1])
        assert list(res) == [(1,)]
        # Try again the null-key request (reproduces #10361) that we did
        # earlier when there was no data in the table. Now there is, and
        # the scan brings up several rows, so it may exercise different code
        # paths.
        assert list(cql.execute(f"select p from {table} where m1[null] = 2 ALLOW FILTERING")) == []
        with pytest.raises(InvalidRequest, match='Unsupported unset map key for column m1'):
            cql.execute(stmt, [UNSET_VALUE])
# Beyond the tests of map subscript expressions above, also test what happens
# when the expression is fine (e.g., m[2] = 3) but the *data* itself is null.
# We used to have a bug there where we attempted to incorrectly deserialize
# this null and got marshaling errors or even crashes - see issue #10417.
# This test reproduces #10417, but not always - run with "--count" to
# reproduce failures.
def test_filtering_null_map_with_subscript(cql, test_keyspace):
    schema = 'p text primary key, m map<int, int>'
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f"INSERT INTO {table} (p) VALUES ('dog')")
        assert list(cql.execute(f"SELECT p FROM {table} WHERE m[2] = 3 ALLOW FILTERING")) == []
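The null-subscript semantics these tests describe (m[null] is null, a null map is null, and a null never matches a filter) can be sketched in a few lines of Python. This is only an illustrative model of the behavior described in the comments above, not ScyllaDB code:

```python
def subscript(m, key):
    # If the map or the key is null (None here), the whole subscript
    # expression evaluates to null instead of raising an error.
    if m is None or key is None:
        return None
    return m.get(key)

def matches(m, key, value):
    # A filter 'WHERE m[key] = value' only matches when the subscript is
    # non-null and equal; null never compares equal to anything.
    v = subscript(m, key)
    return v is not None and v == value

assert matches({1: 2}, 1, 2)           # m1[1] = 2 matches
assert not matches({1: 2}, None, 2)    # m1[null] matches nothing
assert not matches(None, 1, 2)         # null map matches nothing (cf. #10417)
```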


@@ -98,3 +98,51 @@ def test_mv_empty_string_partition_key(cql, test_keyspace):
            # because Cassandra forbids an empty partition key on select
            with pytest.raises(InvalidRequest, match='Key may not be empty'):
                cql.execute(f"SELECT * FROM {mv} WHERE v=''")
# Refs #10851. The code used to create a wildcard selection for all columns,
# which erroneously also includes static columns if such are present in the
# base table. Currently views only operate on regular columns and the filtering
# code assumes that. TODO: once we implement static column support for materialized
# views, this test case will be a nice regression test to ensure that everything still
# works if the static columns are *not* used in the view.
def test_filter_with_unused_static_column(cql, test_keyspace):
    schema = 'p int, c int, v int, s int static, primary key (p,c)'
    with new_test_table(cql, test_keyspace, schema) as table:
        with new_materialized_view(cql, table, select='p,c,v', pk='p,c,v', where='p IS NOT NULL and c IS NOT NULL and v = 44') as mv:
            cql.execute(f"INSERT INTO {table} (p,c,v) VALUES (42,43,44)")
            cql.execute(f"INSERT INTO {table} (p,c,v) VALUES (1,2,3)")
            assert list(cql.execute(f"SELECT * FROM {mv}")) == [(42, 43, 44)]
# Reproducer for issue #9450 - when a view's key column name is a (quoted)
# keyword, writes used to fail because they generated internally broken CQL
# with the column name not quoted.
def test_mv_quoted_column_names(cql, test_keyspace):
    for colname in ['"dog"', '"Dog"', 'DOG', '"to"', 'int']:
        with new_test_table(cql, test_keyspace, f'p int primary key, {colname} int') as table:
            with new_materialized_view(cql, table, '*', f'{colname}, p', f'{colname} is not null and p is not null') as mv:
                cql.execute(f'INSERT INTO {table} (p, {colname}) values (1, 2)')
                # Validate that not only did the write not fail, it actually
                # wrote the right thing to the view. NOTE: on a single-node
                # Scylla, view update is synchronous so we can just read and
                # don't need to wait or retry.
                assert list(cql.execute(f'SELECT * from {mv}')) == [(2, 1)]
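The underlying bug class in #9450 - generating CQL with an unquoted column name - comes down to identifier quoting rules: names that are not plain lower-case identifiers, or that collide with keywords, must be double-quoted. A hypothetical sketch of such a quoting helper (the `RESERVED` set here is a tiny illustrative subset, not the real CQL keyword list, and `maybe_quote` is not a function from this codebase):

```python
import re

# Tiny illustrative subset of CQL reserved words - a real implementation
# would use the full keyword list from the CQL grammar.
RESERVED = {'to', 'select', 'from', 'where'}

def maybe_quote(name):
    # A plain lower-case identifier that is not a reserved word can be
    # emitted as-is; anything else (mixed case, keywords, odd characters)
    # must be double-quoted, with embedded double quotes doubled.
    if re.fullmatch('[a-z][a-z0-9_]*', name) and name not in RESERVED:
        return name
    return '"' + name.replace('"', '""') + '"'

assert maybe_quote('dog') == 'dog'
assert maybe_quote('Dog') == '"Dog"'
assert maybe_quote('to') == '"to"'
```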
# Same as test_mv_quoted_column_names above (reproducing issue #9450), just
# check *view building* - i.e., pre-existing data in the base table that
# needs to be copied to the view. The view building cannot return an error
# to the user, but can fail to write the desired data into the view.
def test_mv_quoted_column_names_build(cql, test_keyspace):
    for colname in ['"dog"', '"Dog"', 'DOG', '"to"', 'int']:
        with new_test_table(cql, test_keyspace, f'p int primary key, {colname} int') as table:
            cql.execute(f'INSERT INTO {table} (p, {colname}) values (1, 2)')
            with new_materialized_view(cql, table, '*', f'{colname}, p', f'{colname} is not null and p is not null') as mv:
                # When Scylla's view builder fails as it did in issue #9450,
                # there is no way to tell this state apart from a view build
                # that simply hasn't completed (besides looking at the logs,
                # which we don't). This means, unfortunately, that a failure
                # of this test is slow - it needs to wait for a timeout.
                start_time = time.time()
                while time.time() < start_time + 30:
                    if list(cql.execute(f'SELECT * from {mv}')) == [(2, 1)]:
                        break
                assert list(cql.execute(f'SELECT * from {mv}')) == [(2, 1)]
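The poll-until-timeout loop above is a common shape in eventually-consistent tests and can be factored into a reusable helper. A minimal sketch - `eventually` is a hypothetical name, not a helper that exists in this test suite's util.py:

```python
import time

def eventually(predicate, timeout=30, interval=0.1):
    # Poll until the predicate holds or the timeout expires, then return
    # the final truth value so the caller can simply assert on it.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return bool(predicate())

assert eventually(lambda: True, timeout=1)
assert not eventually(lambda: False, timeout=0.2)
```

With such a helper the view-build wait would read as a single assertion, keeping the 30-second timeout in one place.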


@@ -28,7 +28,7 @@ from util import unique_name, random_string, new_test_table
 @pytest.fixture(scope="module")
 def table1(cql, test_keyspace):
     table = test_keyspace + "." + unique_name()
-    cql.execute(f"CREATE TABLE {table} (p text, c text, v text, primary key (p, c))")
+    cql.execute(f"CREATE TABLE {table} (p text, c text, v text, i int, s set<int>, m map<int, int>, primary key (p, c))")
     yield table
     cql.execute("DROP TABLE " + table)
@@ -188,3 +188,65 @@ def test_empty_string_key2(cql, test_keyspace):
        cql.execute(f"INSERT INTO {table} (p1,p2,c,v) VALUES ('', '', '', 'cat')")
        cql.execute(f"INSERT INTO {table} (p1,p2,c,v) VALUES ('x', 'y', 'z', 'dog')")
        assert list(cql.execute(f"SELECT v FROM {table} WHERE p1='' AND p2='' AND c=''")) == [('cat',)]
# Cassandra considers the null subscript 'm[null]' to be an invalid request.
# In Scylla we decided to do it differently (we think better): m[null] is simply
# a null, so the filter 'WHERE m[null] = 3' is not an error - it just doesn't
# match anything. This is more consistent with our usual null handling (null[2]
# and null < 2 are both defined as returning null), and will also allow us
# in the future to support non-constant subscripts - for example m[a] where
# the column a can be null for some rows and non-null for other rows.
# Before we implemented the above decision, we had multiple bugs in this case,
# resulting in bizarre errors and even crashes (see #10361, #10399 and #10417).
#
# Because this test uses a shared table (table1), depending on how it's
# run it sometimes sees an empty table and sometimes a table with data
# (and null values for the map m...), so this test mixes several different
# concerns and problems. The same problems are better covered separately
# by test_filtering.py::test_filtering_with_subscript and
# test_filtering.py::test_filtering_null_map_with_subscript, so this test
# should eventually be deleted.
def test_map_subscript_null(cql, table1, cassandra_bug):
    assert list(cql.execute(f"SELECT p FROM {table1} WHERE m[null] = 3 ALLOW FILTERING")) == []
    assert list(cql.execute(cql.prepare(f"SELECT p FROM {table1} WHERE m[?] = 3 ALLOW FILTERING"), [None])) == []
# Similarly, CONTAINS restriction with NULL should also match nothing.
# Reproduces #10359.
@pytest.mark.xfail(reason="Issue #10359")
def test_filtering_contains_null(cassandra_bug, cql, table1):
    p = unique_key_string()
    cql.execute(f"INSERT INTO {table1} (p,c,s) VALUES ('{p}', '1', {{1, 2}})")
    cql.execute(f"INSERT INTO {table1} (p,c,s) VALUES ('{p}', '2', {{3, 4}})")
    cql.execute(f"INSERT INTO {table1} (p,c) VALUES ('{p}', '3')")
    assert list(cql.execute(f"SELECT c FROM {table1} WHERE p='{p}' AND s CONTAINS NULL ALLOW FILTERING")) == []
# Similarly, CONTAINS KEY restriction with NULL should also match nothing.
# Reproduces #10359.
@pytest.mark.xfail(reason="Issue #10359")
def test_filtering_contains_key_null(cassandra_bug, cql, table1):
    p = unique_key_string()
    cql.execute(f"INSERT INTO {table1} (p,c,m) VALUES ('{p}', '1', {{1: 2}})")
    cql.execute(f"INSERT INTO {table1} (p,c,m) VALUES ('{p}', '2', {{3: 4}})")
    cql.execute(f"INSERT INTO {table1} (p,c) VALUES ('{p}', '3')")
    assert list(cql.execute(f"SELECT c FROM {table1} WHERE p='{p}' AND m CONTAINS KEY NULL ALLOW FILTERING")) == []
# The above tests test_filtering_eq_null and test_filtering_inequality_null
# have WHERE x=NULL or x>NULL where "x" is a regular column. Such a
# comparison requires ALLOW FILTERING for non-NULL parameters, so we also
# require it for NULL. Unlike the previous tests, this one also passes on
# Cassandra.
def test_filtering_null_comparison_no_filtering(cql, table1):
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND i=NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND i>NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND i>=NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND i<NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND i<=NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND s CONTAINS NULL")
    with pytest.raises(InvalidRequest, match='ALLOW FILTERING'):
        cql.execute(f"SELECT c FROM {table1} WHERE p='x' AND m CONTAINS KEY NULL")
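The null-comparison semantics these tests rely on follow three-valued logic: any comparison with null evaluates to null, and a filter keeps a row only when its condition is strictly true. A small Python model of that rule (illustrative only, with None standing in for null):

```python
def cql_compare(op, x, y):
    # Any comparison involving null (None here) evaluates to null; a
    # filter keeps a row only when the comparison is strictly True.
    if x is None or y is None:
        return None
    return {'=': x == y, '>': x > y, '>=': x >= y,
            '<': x < y, '<=': x <= y}[op]

assert cql_compare('=', 1, 1) is True
assert cql_compare('>', 1, None) is None
# Filtering with a null operand therefore matches no rows:
rows = [1, 2, 3]
assert [r for r in rows if cql_compare('>', r, None) is True] == []
```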


@@ -0,0 +1,64 @@
# Copyright 2022-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#############################################################################
# Tests for scanning SELECT requests (which read many rows and/or many
# partitions).
# We have a separate test file test_filtering.py for scans which also involve
# filtering, and test_allow_filtering.py for checking when "ALLOW FILTERING"
# is needed in a scan. test_secondary_index.py also contains tests for scanning
# using a secondary index.
#############################################################################
import pytest
from util import new_test_table
from cassandra.query import SimpleStatement
# Regression test for #9482
def test_scan_ending_with_static_row(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "pk int, ck int, s int STATIC, v int, PRIMARY KEY (pk, ck)") as table:
        stmt = cql.prepare(f"UPDATE {table} SET s = ? WHERE pk = ?")
        for pk in range(100):
            cql.execute(stmt, (0, pk))
        statement = SimpleStatement(f"SELECT * FROM {table}", fetch_size=10)
        # This will trigger an error in either processing or building the query
        # results. The success criterion for this test is the query finishing
        # without errors.
        res = list(cql.execute(statement))
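The fetch_size=10 in the statement above makes the scan paged: the server returns rows ten at a time, and #9482 was triggered at a page boundary. A toy model of paged fetching (illustrative only, not driver code):

```python
def paged(rows, fetch_size):
    # The server returns rows in pages of at most fetch_size; the client
    # keeps requesting pages until the result set is exhausted. A page
    # boundary falling on a partition with only a static row was the
    # trigger for #9482.
    for i in range(0, len(rows), fetch_size):
        yield rows[i:i + fetch_size]

pages = list(paged(list(range(25)), 10))
assert [len(p) for p in pages] == [10, 10, 5]
assert [x for p in pages for x in p] == list(range(25))
```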
# Test that if we have multi-column restrictions on the clustering key
# and additional filtering on regular columns, both restrictions are obeyed.
# Reproduces #6200.
def test_multi_column_restrictions_and_filtering(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int, c1 int, c2 int, r int, PRIMARY KEY (p, c1, c2)") as table:
        stmt = cql.prepare(f"INSERT INTO {table} (p, c1, c2, r) VALUES (1, ?, ?, ?)")
        for i in range(2):
            for j in range(2):
                cql.execute(stmt, [i, j, j])
        assert list(cql.execute(f"SELECT c1,c2,r FROM {table} WHERE p=1 AND (c1, c2) = (0,1)")) == [(0,1,1)]
        # Since in that result r=1, adding "AND r=1" should return the same
        # result, and adding "AND r=0" should return nothing.
        assert list(cql.execute(f"SELECT c1,c2,r FROM {table} WHERE p=1 AND (c1, c2) = (0,1) AND r=1 ALLOW FILTERING")) == [(0,1,1)]
        # Reproduces #6200:
        assert list(cql.execute(f"SELECT c1,c2,r FROM {table} WHERE p=1 AND (c1, c2) = (0,1) AND r=0 ALLOW FILTERING")) == []
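The essence of the #6200/#12014 fix described in the merge message is that a row must satisfy the multi-column clustering restriction *and* every other restriction, whereas the buggy filtering code returned a match as soon as the multi-column check alone passed. A Python model of the corrected logic (illustrative only, using (c1, c2, r) rows like the test above):

```python
def row_matches(row, ck_restriction, other_restrictions):
    # Correct logic: the multi-column clustering restriction AND every
    # other restriction must hold. The buggy code effectively returned a
    # match as soon as the multi-column check alone passed.
    c1, c2, r = row
    if not ck_restriction((c1, c2)):
        return False
    return all(pred(row) for pred in other_restrictions)

rows = [(0, 0, 0), (0, 1, 1), (1, 0, 0), (1, 1, 1)]
ck = lambda t: t == (0, 1)
assert [r for r in rows if row_matches(r, ck, [])] == [(0, 1, 1)]
# Adding "AND r=0" must now filter that row out:
assert [r for r in rows if row_matches(r, ck, [lambda row: row[2] == 0])] == []
```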
# Test that if we have range multi-column restrictions on the clustering key
# and additional filtering on regular columns, both restrictions are obeyed.
# Similar to test_multi_column_restrictions_and_filtering, but uses a range
# restriction on the clustering key columns.
# Reproduces #12014, the code is taken from a reproducer provided by a user.
def test_multi_column_range_restrictions_and_filtering(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "pk int, ts timestamp, id int, processed boolean, PRIMARY KEY (pk, ts, id)") as table:
        cql.execute(f"INSERT INTO {table} (pk, ts, id, processed) VALUES (0, currentTimestamp(), 0, true)")
        cql.execute(f"INSERT INTO {table} (pk, ts, id, processed) VALUES (0, currentTimestamp(), 1, true)")
        cql.execute(f"INSERT INTO {table} (pk, ts, id, processed) VALUES (0, currentTimestamp(), 2, false)")
        cql.execute(f"INSERT INTO {table} (pk, ts, id, processed) VALUES (0, currentTimestamp(), 3, false)")
        # This select doesn't use multi-column restrictions; the result shouldn't change when it does.
        rows1 = list(cql.execute(f"SELECT id, processed FROM {table} WHERE pk = 0 AND ts >= 0 AND processed = false ALLOW FILTERING"))
        assert rows1 == [(2, False), (3, False)]
        # Reproduces #12014
        rows2 = list(cql.execute(f"SELECT id, processed FROM {table} WHERE pk = 0 AND (ts, id) >= (0, 0) AND processed = false ALLOW FILTERING"))
        assert rows1 == rows2


@@ -102,6 +102,19 @@ def new_materialized_view(cql, table, select, pk, where):
    finally:
        cql.execute(f"DROP MATERIALIZED VIEW {mv}")
# A utility function for creating a new temporary secondary index on
# an existing table.
@contextmanager
def new_secondary_index(cql, table, column, name='', extra=''):
    keyspace = table.split('.')[0]
    if not name:
        name = unique_name()
    cql.execute(f"CREATE INDEX {name} ON {table} ({column}) {extra}")
    try:
        yield f"{keyspace}.{name}"
    finally:
        cql.execute(f"DROP INDEX {keyspace}.{name}")

def project(column_name_string, rows):
    """Returns a list of column values from each of the rows."""
    return [getattr(r, column_name_string) for r in rows]
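The create / try / yield / finally / drop shape used by new_secondary_index (and the other helpers in this file) guarantees cleanup even when the test body raises. A self-contained demonstration of that pattern, with a hypothetical `new_resource` standing in for a helper that talks to a database:

```python
from contextlib import contextmanager

executed = []

@contextmanager
def new_resource(name):
    # Same create / try / yield / finally / drop shape as the helper above:
    # the drop runs even if the body of the with-statement raises.
    executed.append(f"CREATE {name}")
    try:
        yield name
    finally:
        executed.append(f"DROP {name}")

with new_resource("idx1") as r:
    executed.append(f"USE {r}")

assert executed == ["CREATE idx1", "USE idx1", "DROP idx1"]
```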
