Compare commits


47 Commits

Author SHA1 Message Date
Hagit Segev
0f3a21f0bb release: prepare for 3.0-rc1 2018-10-31 12:08:43 +02:00
Tomasz Grabiec
976db7e9e0 Merge "Proper support for static rows in SSTables 3.x" from Vladimir
This patchset addresses two issues with static row support in SSTables
3.x ('mc' format):

1. Since collections are allowed in static rows, we need to check for
complex deletion, set corresponding flag and write tombstones, if any.
2. Column indices need to be partitioned for static columns the same way
they are partitioned for regular ones.

 * github.com/argenet/scylla.git projects/sstables-30/columns-proper-order-followup/v1:
  sstables: Partition static columns by atomicity when reading/writing
    SSTables 3.x.
  sstables: Use std::reference_wrapper<> instead of a helper structure.
  sstables: Check for complex deletion when writing static rows.
  tests: Add/fix comments to
    test_write_interleaved_atomic_and_collection_columns.
  tests: Add test covering interleaved atomic and collection cells in
    static row.

(cherry picked from commit 62c7685b0d)
2018-10-30 14:51:21 +01:00
Nadav Har'El
996b86b804 Materialized views: fix race condition in resharding while view building
When a node reshards (i.e., restarts with a different number of CPUs), and
is in the middle of building a view for a pre-existing table, the view
building needs to find the right token from which to start building on all
shards. We ran the same code on all shards, hoping they would all make
the same decision on which token to continue. But in some cases, one
shard might make the decision, start building, and make progress -
all before a second shard goes to make the decision, which will now
be different.

This resulted, in some rare cases, in the new materialized view missing
a few rows when the build was interrupted with a resharding.

The fix is to add the missing synchronization: All shards should make
the same decision on whether and how to reshard - and only then should
start building the view.
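The shape of the fix can be sketched as follows. This is a hypothetical illustration, not Scylla's actual code, and the "resume from the minimum saved token" policy is an assumption made for the sketch:

```python
def choose_build_start(shard_saved_tokens):
    # The resume decision is made once, from a consistent snapshot of every
    # shard's saved build position, *before* any shard starts building --
    # so no shard can advance its position while another is still deciding.
    # Resuming from the smallest saved token re-reads a few rows but cannot
    # miss any.
    return min(shard_saved_tokens)

start = choose_build_start([100, 250, 175])
# every shard then begins building from the same agreed token
```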

Fixes #3890
Fixes #3452

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20181028140549.21200-1-nyh@scylladb.com>
(cherry picked from commit b8337f8c9d)
2018-10-29 09:52:25 +00:00
Avi Kivity
b7b217cc43 Merge "Re-order columns when reading/writing SSTables 3.x" from Vladimir
"
In Cassandra, row columns are stored in a BTree that uses the following
ordering on them:
    - all atomic columns go first, then all multi-cell ones
    - columns of both types (atomic and multi-cell) are
      lexicographically ordered by name regarding each other

Scylla needs to store columns and their respective indices using the
same ordering, and to apply it when reading them back.
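The ordering rules above can be modelled as a simple sort key (an illustrative model, not Scylla's implementation; columns are represented here as `(name, is_multi_cell)` tuples):

```python
def mc_column_sort_key(column):
    # Atomic columns (is_multi_cell=False) sort before multi-cell ones;
    # within each group, columns are ordered lexicographically by name.
    name, is_multi_cell = column
    return (is_multi_cell, name)

cols = [("rc4", True), ("rc1", False), ("rc6", True),
        ("rc5", False), ("rc2", True), ("rc3", False)]
ordered = sorted(cols, key=mc_column_sort_key)
# atomic rc1, rc3, rc5 first, then multi-cell rc2, rc4, rc6
```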

Fixes #3853

Tests: unit {release}

+

Checked that the following SSTables are dumped fine using Cassandra's
sstabledump:

cqlsh:sst3> CREATE TABLE atomic_and_collection3 ( pk int, ck int, rc1 text, rc2 list<text>, rc3 text, rc4 list<text>, rc5 text, rc6 list<text>, PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
cqlsh:sst3> INSERT INTO atomic_and_collection3 (pk, ck, rc1, rc4, rc5) VALUES (0, 0, 'hello', ['beautiful','world'], 'here');
<< flush >>

sstabledump:

[
  {
    "partition" : {
      "key" : [ "0" ],
      "position" : 0
    },
    "rows" : [
      {
        "type" : "row",
        "position" : 96,
        "clustering" : [ 0 ],
        "liveness_info" : { "tstamp" : "1540599270139464" },
        "cells" : [
          { "name" : "rc1", "value" : "hello" },
          { "name" : "rc5", "value" : "here" },
          { "name" : "rc4", "deletion_info" : { "marked_deleted" : "1540599270139463", "local_delete_time" : "1540599270" } },
          { "name" : "rc4", "path" : [ "45e22cb0-d97d-11e8-9f07-000000000000" ], "value" : "beautiful" },
          { "name" : "rc4", "path" : [ "45e22cb1-d97d-11e8-9f07-000000000000" ], "value" : "world" }
        ]
      }
    ]
  }
]
"

* 'projects/sstables-30/columns-proper-order/v1' of https://github.com/argenet/scylla:
  tests: Test interleaved atomic and multi-cell columns written to SSTables 3.x.
  sstables: Re-order columns (atomic first, then collections) for SSTables 3.x.
  sstables: Use a compound structure for storing information used for reading columns.

(cherry picked from commit 75dbff984c)
2018-10-28 15:51:47 +02:00
Tomasz Grabiec
c274430933 Merge "Properly write static rows missing columns for SSTables 3.x." from Vladimir
Before this fix, write_missing_columns() helper would always deal with
regular columns even when writing static rows.

This would cause errors on reading those files.

Now, the missing columns are written correctly for regular and static
rows alike.

* github.com/argenet/scylla.git projects/sstables-30/fix-writing-static-missing-columns/v1:
  schema: Add helper method returning the count of columns of specified
    kind.
  sstables: Honour the column kind when writing missing columns in 'mc'
    format.
  tests: Add test for a static row with missing columns (SStables 3.x.).

(cherry picked from commit cf2d5c19fb)
2018-10-26 13:30:12 +03:00
Avi Kivity
893a18a7c4 Merge "Properly writing/reading shadowable deletions with SSTables 3.x." from Vladimir
"
This patchset addresses two problems with shadowable deletion handling
in SSTables 3.x ('mc' format).

Firstly, we previously did not set a flag indicating the presence of the
extended flags byte with the HAS_SHADOWABLE_DELETION bitmask on writing.
This would break subsequent reading and cause all kinds of failures, up
to a crash.

Secondly, when reading rows with this extended flag set, we need to
preserve that information and create a shadowable_tombstone for the row.

Tests: unit {release}
+

Verified manually with 'hexdump' and a modified 'sstabledump' that a
second (shadowable) tombstone is written for MV tables by Scylla.

+
DTest (materialized_views_test.py:TestMaterializedViews.hundred_mv_concurrent_test)
that originally failed due to this issue has successfully passed locally.
"

* 'projects/sstables-30/shadowable-deletion/v4' of https://github.com/argenet/scylla:
  tests: Add tests writing both regular and shadowable tombstones to SSTables 3.x.
  tests: Add test covering writing and reading a shadowable tombstone with SSTables 3.x.
  sstables: Support Scylla-specific extension for writing shadowable tombstones.
  sstables: Introduce a feature for shadowable tombstones in Scylla.db.
  memtable: Track regular and shadowable tombstones separately in encoding_stats_collector.
  sstables: Error out when reading SSTables 3.x with Cassandra shadowable deletion.
  sstables: Support checking row extension flags for Cassandra shadowable deletion.

(cherry picked from commit 8210f4c982)
2018-10-24 19:32:57 +03:00
Tomasz Grabiec
39b39058fc sstable_mutation_reader: Do not read partition index when scanning
Even when we're using a full clustering range, need_skip() will return
true when we start a new partition and advance_context() will be
called with position_in_partition::before_all_clustered_rows(). We
should detect that there is no need to skip to that position before
the call to advance_to(*_current_partition_key), which will read the
index page.

Fixes #3868.

Message-Id: <1539881775-8578-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 9e756d3863)
2018-10-24 19:32:40 +03:00
Avi Kivity
6bf4a73d88 thrift: limit message size
Limit message size according to the configuration, to avoid a huge message from
allocating all of the server's memory.

We also need to limit memory used in aggregate by thrift, but that is left to
another patch.
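The idea of the limit can be sketched as follows (a hypothetical helper, not Scylla's Thrift server code): validate a framed message's declared size before allocating a buffer for it.

```python
import io
import struct

def read_frame(stream, max_message_size):
    # Reject an oversized frame by its 4-byte length prefix, before
    # allocating its payload -- so one huge message cannot claim all
    # of the server's memory.
    (size,) = struct.unpack(">i", stream.read(4))
    if size < 0 or size > max_message_size:
        raise ValueError("frame of %d bytes exceeds limit %d"
                         % (size, max_message_size))
    return stream.read(size)

frame = struct.pack(">i", 5) + b"hello"
payload = read_frame(io.BytesIO(frame), max_message_size=1024)
```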

Fixes #3878.
Message-Id: <20181024081042.13067-1-avi@scylladb.com>

(cherry picked from commit a9836ad758)
2018-10-24 19:32:25 +03:00
Gleb Natapov
ca4846dd63 stream_session: remove unused capture
The 'consumer function' parameter for distribute_reader_and_consume_on_shards()
captures a schema_ptr (which is a seastar::shared_ptr), but the function
is later copied to another shard, at which point the schema_ptr is also
copied and its counter is incremented by the wrong shard. The capture is
not even used, so let's just drop it.

Fixes #3838

Message-Id: <20181011075500.GN14449@scylladb.com>
(cherry picked from commit ceb361544a)
2018-10-24 09:47:02 +03:00
Takuya ASADA
2663ff7bc1 dist/common/sysctl.d: add new conf file to set fs.aio-max-nr
We need to raise fs.aio-max-nr to a larger value since Seastar may
allocate more than 65535 AIO events (the kernel default value).
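The resulting drop-in file looks roughly like this; the path and the chosen value are assumptions for illustration, the commit defines the canonical ones:

```
# /etc/sysctl.d/99-scylla-aio.conf (hypothetical path)
# Raise the AIO event limit well above the kernel default.
fs.aio-max-nr = 1048576
```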

Fixes #3842

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20181023030449.15445-1-syuu@scylladb.com>
(cherry picked from commit 950dbdb466)
2018-10-24 09:45:51 +03:00
Tomasz Grabiec
043a575fcd Merge "Correctly handle dropped columns in SSTable 3" from Piotr J.
Previously we made assumptions about missing columns (the size of their
values, whether they are collections or counters) that were not always
true. Now the column type from the serialization header is used to pick
the right values.

Fixes #3859

* seastar-dev.git haaawk/projects/sstables-30/handling-dropped-columns/v4:
  sstables 3: Correctly handle dropped columns in column_translation
  sstables 3: Add test for dropped columns handling

(cherry picked from commit fc37b80d24)
2018-10-24 09:45:25 +03:00
Vlad Zolotarov
00dc400993 storage_proxy::query_result_local: create a single tracing span on a replica shard
Every use of a tracing::global_trace_state_ptr object in place of a
tracing::tracing_state_ptr, or call to tracing::global_trace_state_ptr::get(),
creates a new tracing session (span) object.

This should never be done unless query handling moves to a different shard.

Fixes #3862

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <20181018003500.10030-1-vladz@scylladb.com>
(cherry picked from commit a87c11bad2)
2018-10-24 09:45:14 +03:00
Duarte Nunes
522a48a244 Merge 'Fix for a select statement with filtered columns' from Eliran
"
This patchset fixes #3803. When a select statement with filtering
is executed and the column that is needed for the filtering is not
present in the select clause, rows that should have been filtered out
according to this column will still be present in the result set.

Tests:
 1. The testcase from the issue.
 2. Unit tests (release) including the
 newly added test from this patchset.
"

* 'issues/3803/v10' of https://github.com/eliransin/scylla:
  unit test: add test for filtering queries without the filtered column
  cql3 unit test: add assertion for the number of serialized columns
  cql3: ensure retrieval of columns for filtering
  cql3: refactor find_idx to be part of statement restrictions object
  cql3: add prefix size common functionality to all clustering restrictions
  cql3: rename selection metadata manipulation functions

(cherry picked from commit 3fe92663d4)
2018-10-24 09:44:46 +03:00
Paweł Dziepak
5faa28ce45 cql3: restore original timeout behaviour for aggregate queries
Commit 1d34ef38a8 "cql3: make pagers use
time_point instead of duration" has unintentionally altered the timeout
semantics for aggregate queries. Such requests fetch multiple pages before
sending a response to the client. Originally, each of those fetches had
a timeout-duration to finish, after the problematic commit the whole
request needs to complete in a single timeout-duration. This,
unsurprisingly, makes some queries that were successful before fail with
a timeout. This patch restores the original behaviour.
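The restored semantics can be sketched like this (an illustration of the timeout accounting, not Scylla's pager code):

```python
import time

def run_aggregate_query(fetch_page, timeout):
    # Each page fetch receives a *fresh* deadline of `timeout` seconds,
    # instead of the whole multi-page request sharing a single deadline
    # (the regression's behaviour).
    rows = []
    while True:
        deadline = time.monotonic() + timeout  # per-fetch, not per-request
        page = fetch_page(deadline)
        if not page:
            return rows
        rows.extend(page)

pages = iter([[1, 2], [3], []])
result = run_aggregate_query(lambda deadline: next(pages), timeout=5.0)
```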

Fixes #3877.

Message-Id: <20181022125318.4384-1-pdziepak@scylladb.com>
(cherry picked from commit c94d2b6aa6)
2018-10-24 09:43:59 +03:00
Avi Kivity
52be02558e config: mark range_request_timeout_in_ms and request_timeout_in_ms as Used
This makes them available in scylla --help.

Fixes #3884.
Message-Id: <20181023101150.29856-1-avi@scylladb.com>

(cherry picked from commit d9e0ea6bb0)
2018-10-24 09:43:54 +03:00
Avi Kivity
a7cbfbe63f Merge "hinted handoff: give a sender a low priority" from Vlad
"
Hinted handoff should not overpower regular flows like READs, WRITEs or
background activities like memtable flushes or compactions.

In order to achieve this, put its sending in the STREAMING CPU scheduling
group and its commitlog object into the STREAMING I/O scheduling group.

Fixes #3817
"

* 'hinted_handoff_scheduling_groups-v2' of https://github.com/vladzcloudius/scylla:
  db::hints::manager: use "streaming" I/O scheduling class for reads
  commitlog::read_log_file(): set the a read I/O priority class explicitly
  db::hints::manager: add hints sender to the "streaming" CPU scheduling group

(cherry picked from commit 1533487ba8)
2018-10-24 09:43:39 +03:00
Duarte Nunes
28fd2044d2 Merge 'hinted handoff: add manager::state and split storing and replaying enablement' from Vlad
"
Refs #3828
(Probably fixes it)

We found a few flaws in the way we enable hint replaying.
First of all, it was allowed before manager::start() completes.
Also, since manager::start() is called after messaging_service is
initialized, there was a time window when hints were rejected, which
creates an issue for MV.

Both issues above were found in the context of #3828.

This series fixes them both.

Tested {release}:
dtest: materialized_views_test.py:TestMaterializedViews.write_to_hinted_handoff_for_views_test
dtest: hintedhandoff_additional_test.py
"

* 'hinted_handoff_dont_create_hints_until_started-v1' of https://github.com/vladzcloudius/scylla:
  hinted handoff: enable storing hints before starting messaging_service
  db::hints::manager: add a "started" state
  db::hints::manager: introduce a _state

(cherry picked from commit 3a53b3cebc)
2018-10-24 09:43:03 +03:00
Calle Wilund
76ff2e5c3d messaging_service: Make rpc streaming sink respect tls connection
Fixes #3787

The messaging service streaming sink was created using a direct call to
rpc::client::make_sink. This in turn needs a new socket, which it
creates while completely ignoring which underlying transport is active
for the client in question.

Fix by retaining the tls credential pointer in the client wrapper, and
using it in the sink method to determine whether to create a new tls
socket, or just go ahead with a plain one.

Message-Id: <20181010003249.30526-1-calle@scylladb.com>
(cherry picked from commit 3cb50c861d)
2018-10-23 07:36:21 +00:00
Avi Kivity
7b34d54a96 locator: fix abstract_replication_strategy::get_ranges() and friends violating sort order
get_ranges() is supposed to return ranges in sorted order. However, a35136533d
broke this and returned the range that was supposed to be last in the second
position (e.g. [0, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9]). This broke cleanup, which
relied on the sort order to perform a binary search. Other users of the
get_ranges() family did not rely on the sort order.
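Why the sort order matters can be shown with a toy binary search over range start tokens (an illustration of the reliance described above, not Scylla's code):

```python
from bisect import bisect_left

def owning_range(range_starts, token):
    # Cleanup locates a token's range with a binary search over range
    # start tokens -- which is only correct when the list is sorted.
    return bisect_left(range_starts, token)

broken = [0, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # misordered, as in the bug
fixed = sorted(broken)                        # restore the sort order
idx = owning_range(fixed, 7)                  # binary search is reliable again
```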

Fixes #3872.
Message-Id: <20181019113613.1895-1-avi@scylladb.com>

(cherry picked from commit 1ce52d5432)
2018-10-23 07:36:21 +00:00
Duarte Nunes
26c31f6798 Merge "db/hints: Expose current backlog" from Duarte
"
Hints are stored on disk by a hints::manager, ensuring they are
eventually sent. A hints::resource_manager ensures the hints::managers
it tracks don't consume more than their allocated resources by
monitoring disk space and disabling new hints if needed. This series
fixes some bugs related to the backlog calculation, but mainly exposes
the backlog through a hints::manager so upper layers can apply flow
control.

Refs #2538
"

* 'hh-manager-backlog/v3' of https://github.com/duarten/scylla:
  db/hints/manager: Expose current backlog
  db/hints/manager: Move decision about blocking hints to the manager
  db/hints/resource_manager: Correctly account resources in space_watchdog
  db/hints/resource_manager: Replace timer with seastar::thread
  db/hints/resource_manager: Ensure managers are correctly registered
  db/hints/resource_manager: Fix formatting
  db/hints: Disallow moving or copying the managers
2018-10-23 07:36:21 +00:00
Glauber Costa
28fa66591a sstables: print sstable path in case of an exception
Without that, we don't know where to look for the problem.

Before:

 compaction failed: sstables::malformed_sstable_exception (Too big ttl: 3163676957)

After:

 compaction_manager - compaction failed: sstables::malformed_sstable_exception (Too big ttl: 4294967295 in sstable /var/lib/scylla/data/system_traces/events-8826e8e9e16a372887533bc1fc713c25/mc-832-big-Data.db)

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20181016181004.17838-1-glauber@scylladb.com>
(cherry picked from commit 7edae5421d)
2018-10-23 07:36:14 +00:00
Piotr Sarna
0fee1d9e43 cql3: add asking for pk/ck in the base query
Base query partition and clustering keys are used to generate
paging state for an index query, so they always need to be present
when a paged base query is processed.
Message-Id: <f3bf69453a6fd2bc842c8bdbd602d62c91cf9218.1538568953.git.sarna@scylladb.com>

Fixes #3855.
(cherry picked from commit 4a23297117)
2018-10-16 19:59:42 +03:00
Piotr Sarna
76e72e28f4 cql3: add checking for may_need_paging when executing base query
It's not sufficient to check for positive page_size when preparing
a base query for indexed select statement - may_need_paging() should
be called as well.
Message-Id: <d435820019e4082a64ca9807541f0c9ad334e6a8.1538568953.git.sarna@scylladb.com>

(cherry picked from commit 50d3de0693)
2018-10-16 19:58:58 +03:00
Piotr Sarna
f969e80965 cql3: move base query command creation to a separate function
Message-Id: <6b48b8cbd6312da4a17bfd3c85af628b4215e9f4.1538568953.git.sarna@scylladb.com>
(cherry picked from commit 11b8831c04)
2018-10-16 19:58:56 +03:00
Vladimir Krivopalov
2029134063 sstables: Reset opened range tombstone when moving to another partition.
Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <f6dc6b0bd88ca44f2ef84c2a8bee43fde82c89cc.1539396572.git.vladimir@scylladb.com>
(cherry picked from commit 092276b13d)
2018-10-15 13:26:22 +03:00
Vladimir Krivopalov
f30fe7bd17 sstables: Factor out code resetting values for a new partition.
Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <83a3a4ce6942b036be447bcfeb66142828e75293.1539396572.git.vladimir@scylladb.com>
(cherry picked from commit 926b6430fd)
2018-10-15 13:26:20 +03:00
Piotr Sarna
aeb418af9e service/pager: avoid dereferencing null partition key
The pager::state() function returns a valid paging object even
if the pager itself is exhausted. It may also not contain the partition
key, so using it unconditionally was a bug - now, in case there is no
partition key present, paging state will contain an empty partition key.

Fixes #3829

Message-Id: <28401eb21ab8f12645c0a33d9e92ada9de83e96b.1539074813.git.sarna@scylladb.com>
(cherry picked from commit b3685342a6)
2018-10-15 12:47:25 +03:00
Glauber Costa
714e6d741f api: use longs instead of ints for snapshot sizes
Int types in json will be serialized to int types in C++. They will then
only be able to handle 4GB, and we tend to store more data than that.

Without this patch, listsnapshots is broken in all versions.
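The failure mode is plain integer truncation, illustrated here (not Scylla's code):

```python
# A 5 GiB snapshot cannot be represented in a 32-bit field; pushing it
# through one silently wraps the value.
snapshot_size = 5 * 1024**3          # 5 GiB
truncated = snapshot_size & 0xFFFFFFFF
# the reported size has silently lost 4 GiB
```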

Fixes: #3845

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20181012155902.7573-1-glauber@scylladb.com>
(cherry picked from commit 98332de268)
2018-10-12 22:01:59 +03:00
Tomasz Grabiec
95c5872450 Merge "Enable sstable_mutation_test with SSTables 3.x." from Vladimir
Introduce uppermost_bound() method instead of upper_bound() in mutation_fragment_filter and clustering_ranges_walker.

For now, this has only been used to produce the final range tombstone
for sliced reads inside consume_partition_end().

Usage of the upper bound of the current range causes problems of two
kinds:
    1. If not all the slicing ranges have been traversed with the
    clustering range walker, which is possible when the last read
    mutation fragment was before some of the ranges and reading was limited
    to a specific range of positions taken from index, the emitted range
    tombstone will not cover the untraversed slices.

    2. At the same time, if all ranges have been walked past, the end
    bound is set to after_all_clustered_rows and the emitted RT may span
    more data than it should.

To avoid both situations, the uppermost bound is used instead, which
refers to the upper bound of the last range in the sequence.
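In a toy model where slicing ranges are `(start, end)` tuples (the real code deals in position_in_partition_view), the uppermost bound is simply:

```python
def uppermost_bound(ranges):
    # Upper bound of the *last* slicing range, independent of how far the
    # walker has advanced: never narrower than the untraversed slices,
    # and never wider than the requested ranges.
    return ranges[-1][1]

bound = uppermost_bound([(0, 5), (8, 12), (20, 25)])
```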

* github.com/scylladb/seastar-dev.git haaawk/projects/sstables-30/enable-mc-with-sstable-mutation-test/v2
  sstables: Use uppermost_bound() instead of upper_bound() in
    mutation_fragment_filter.
  tests: Enable sstable_mutation_test for SSTables 'mc' format.

Rebased by Piotr J.

(cherry picked from commit b89556512a)
2018-10-12 17:46:49 +03:00
Tomasz Grabiec
87f8968553 Merge "Make SST3 pass test_clustering_slices test" from Piotr
* seastar-dev.git haaawk/sst3/test_clustering_slices/v8:
  sstables: Extract on_end_of_stream from consume_partition_end
  sstables: Don't call consume_range_tombstone_end in
    consume_partition_end
  sstables: Change the way fragments are returned from consumer

(cherry picked from commit 193efef950)
2018-10-12 17:46:46 +03:00
Tomasz Grabiec
2895428d44 Merge "Handle dead row markers when writing to SSTables 3.x" from Vladimir
There is a mismatch between row markers used in SSTables 2.x (ka/la) and
liveness_info used by SSTables 3.x (mc) in that a row marker can be
written as a deleted cell but liveness_info cannot.

To handle this, for a dead row marker the corresponding liveness_info is
written as expiring liveness_info with a fake TTL set to 1.
This approach is adapted from the solution for CASSANDRA-13395, which
addressed a similar issue during SSTable upgrades.
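The translation described above can be modelled in miniature (a hypothetical sketch, not Scylla's writer code):

```python
EXPIRED_LIVENESS_TTL = 1  # the fake TTL standing in for a dead row marker

def mc_liveness_info(marker_timestamp, marker_is_dead, ttl=None):
    # 'mc' liveness_info cannot express a deleted cell, so a dead row
    # marker is emitted as an *expiring* liveness_info with TTL=1, as in
    # the CASSANDRA-13395 workaround.
    if marker_is_dead:
        return {"timestamp": marker_timestamp, "ttl": EXPIRED_LIVENESS_TTL}
    return {"timestamp": marker_timestamp, "ttl": ttl}

info = mc_liveness_info(marker_timestamp=1000, marker_is_dead=True)
```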

* github.com/argenet/scylla.git projects/sstables-30/dead-row-marker/v7:
  sstables: Introduce TTL limitation and special 'expired TTL' value.
  sstables: Write dead row marker as expired liveness info.
  tests: Add test covering dead row marker writing to SSTables 3.x.

(cherry picked from commit a7a14e3af2)
2018-10-11 15:03:58 +03:00
Botond Dénes
e18f182cfc multishard_mutation_query(): don't attempt to stop broken readers
Currently, when stopping a reader fails, it simply won't be attempted to
be saved, and it will be left in the `_readers` array as-is. This can
lead to an assertion failure as the reader state will contain futures
that were already waited upon, and that the cleanup code will attempt to
wait on again. To prevent this, when stopping a reader fails, reset it
to nonexistent state, so that the cleanup code doesn't attempt to do
anything with it.

Refs: #3830

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <a1afc1d3d74f196b772e6c218999c57c15ca05be.1539088164.git.bdenes@scylladb.com>
(cherry picked from commit d467b518bc)
2018-10-10 10:12:00 +03:00
Vladimir Krivopalov
cf8cdbf87d sstables: Add missing 'mc' format into format strings map in sstable::filename().
Fixes #3832.

Signed-off-by: Vladimir Krivopalov <vladimir@scylladb.com>
Message-Id: <269421fb2ac8ab389231cbe9ed501da7e7ff936a.1539048008.git.vladimir@scylladb.com>
(cherry picked from commit e9aba6a9c3)
2018-10-10 05:53:28 +03:00
Avi Kivity
eb2814067d Update seastar submodule
* seastar 5712816...39b89de (1):
  > prometheus: Fix histogram text representation

Fixes #3827.
2018-10-09 16:35:54 +03:00
Avi Kivity
0c722d4547 Point seastar submodule at scylla-seastar.git
This allows us to freeze this branch's Seastar and only backport selected fixes.
2018-10-09 16:29:53 +03:00
Nadav Har'El
54cf463430 materialized views: refuse to filter by non-key column
A materialized view can provide a filter so as to pick up only a subset
of the rows from the base table. Usually, the filter operates on columns
from the base table's primary key. If we use a filter on regular (non-key)
columns, things get hairy, and as issue #3430 showed, wrong: merely updating
this column in the base table may require us to delete, or resurrect, the
view row. But normally we need to do the above when the "new view key column"
was updated, when there is one. We use shadowable tombstones with one
timestamp to do this, so it cannot take into account the two timestamps from
those two columns (the filtered column and the new key column).

So in the current code, filtering by a non-key column does not work correctly.
In this patch we provide two test cases (one involving TTLs, and one
involving only normal updates), which demonstrate vividly that it does
*not* work correctly. With normal updates, trying to resurrect a view
row that has previously disappeared fails.
row fails to disappear when the filtered column is TTLed.

In Cassandra, the same thing doesn't work correctly as well (see
CASSANDRA-13798 and CASSANDRA-13832) so they decided to refuse creating
a materialized view filtering a non-key column. In this patch we also
do this - fail the creation of such an unsupported view. For this reason,
the two tests mentioned above are commented out in a "#if", with, instead,
a trivial test verifying a failure to create such a view.

Note that as explained above, when the filtered column and new view key
column are *different* we have a problem. But when they are the *same* - namely
we filter by a non-key base column which actually *is* a key in the view -
we are actually fine. This patch includes additional test cases verifying
that this case is really fine and provides correct results. Accordingly,
this case is *not* forbidden in the view creation code.

Fixes #3430.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20181008185633.24616-1-nyh@scylladb.com>
(cherry picked from commit b8668dc0f8)
2018-10-09 10:18:58 +01:00
Nadav Har'El
d2a0622edd materialized views: enable two tests in view_schema_test
We had two commented out tests based on Cassandra's MV unit tests, for
the case that the view's filter (the "SELECT" clause used to define the
view) filtered by a non-primary-key column. These tests used to fail
because of problems we had in the filtering code, but they now succeed,
so we can enable them. This patch also adds some comments about what
the tests do, and adds a few more cases to one of the tests.

Refs #3430.

However, note that the success of these tests does not really prove that
the non-PK-column filtering feature works fully correctly; Cassandra even
decided to forbid it, as explained in
https://issues.apache.org/jira/browse/CASSANDRA-13798. We can probably
fix this feature with our "virtual cells" mechanism, but will need to add
a test to confirm the possible problem and its (probably needed) fix.
We do not add such a test in this patch.

In the meantime, issue #3430 should remain open: we still *allow* users
to create MV with such a filter, and, as the tests in this patch show,
this "mostly" works correctly. We just need to prove and/or fix what happens
with the complex row liveness issues a la issue #3362.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20181004213637.32330-1-nyh@scylladb.com>
(cherry picked from commit e4ef7fc40a)
2018-10-09 10:18:54 +01:00
Duarte Nunes
60edaec757 Merge 'Fix issues with endpoint state replication to other shards' from Tomasz
Fixes #3798
Fixes #3694

Tests:

  unit(release), dtest([new] cql_tests.py:TruncateTester.truncate_after_restart_test)

* tag 'fix-gossip-shard-replication-v1' of github.com:tgrabiec/scylla:
  gms/gossiper: Replicate endpoint states in add_saved_endpoint()
  gms/gossiper: Make reset_endpoint_state_map() have effect on all shards
  gms/gossiper: Replicate STATUS change from mark_as_shutdown() to other shards
  gms/gossiper: Always override states from older generations

(cherry picked from commit 48ebe6552c)
2018-10-09 10:14:30 +03:00
Avi Kivity
5802532cb3 Merge "Fix mutation fragments clobbering on fast_forward" from Vladimir
"
This patchset fixes a bug in SSTables 3.x reading when fast-forwarding
is enabled. It is possible that a mutation fragment, row or RT marker,
is read and then stored because it falls outside the current
fast-forwarding range.

If the reader is then fast-forwarded again but the row still falls
outside the new range, it would continue reading and fetch the next
fragment, if any, which would clobber the currently stored one. With
this fix, the reader does not attempt to read on after storing the
current fragment.

Tests: unit {release}
"

* 'projects/sstables-30/row-skipped-on-double-ff/v2' of https://github.com/argenet/scylla:
  tests: Add test for reading rows after multiple fast-forwarding with SSTables 3.x.
  sstables: mp_row_consumer_m to notify reader on end of stream when storing a mutation fragment.
  sstables: In mp_row_consumer_m::push_mutation_fragments(), return the called helper's value.

(cherry picked from commit 0fa60660b8)
2018-10-09 09:35:51 +03:00
Eliran Sinvani
83ea91055e cql3 : add workaround to antlr3 null dereference bug
The Antlr3 exception class has a null dereference bug that crashes
the system when trying to extract the exception message using the
ANTLR_Exception<...>::displayRecognitionError(...) function. When
a parsing error occurs, the CqlParser throws an exception which is in
turn processed for some special cases in Scylla to generate a custom
message. The default case, however, creates the message using
displayRecognitionError, causing the system to crash.
The fix is a simple workaround: making sure the pointer is not null
before the call to the function. A "proper" fix can't be implemented
because the exception class itself is implemented outside Scylla,
in antlr headers that reside in the host machine's OS.
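The workaround's shape, modelled in miniature (a hypothetical stand-in, not the actual C++ fix): check the pointer before asking the third-party exception object for a message, since patching the class itself is not an option.

```python
def recognition_error_message(exception, fallback="syntax error"):
    # Guard against the null (here, None) member before dereferencing it
    # to build the error message; fall back to a generic message instead
    # of crashing.
    token = getattr(exception, "token", None)
    if token is None:
        return fallback
    return "syntax error near '%s'" % token

class FakeParseError:
    token = None  # the condition that used to crash the server

msg = recognition_error_message(FakeParseError())
```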

Manually tested two cases: a typo that used to crash Scylla, and a CQL
comment without a newline at the end, which also crashed Scylla.
Ran unit tests (release).

Fixes #3740
Fixes #3764

Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <cfc7e0d758d7a855d113bb7c8191b0fd7d2e8921.1538566542.git.eliransin@scylladb.com>
(cherry picked from commit 20f49566a2)
2018-10-04 14:07:25 +03:00
Piotr Sarna
e7863d3d54 tests: add missing get() calls in threaded context
One test case missed a few get() calls needed to wait for
continuations; it only worked accidentally because it was followed by
'eventually()' blocks.
Message-Id: <69c145575ac81154c4b5f500d01c6b045a267088.1536839959.git.sarna@scylladb.com>

(cherry picked from commit a5570cb288)
2018-10-04 14:06:50 +03:00
Piotr Sarna
57f124b905 tests: add collections test for secondary indexing
Test case regarding creating indexes on collection columns
is added to the suite.

Refs #3654
Refs #2962
Message-Id: <1b6844634b6e9a353028545813571647c92fb330.1536839959.git.sarna@scylladb.com>

(cherry picked from commit 8a2abd45fb)
2018-10-04 14:06:48 +03:00
Piotr Sarna
40d8de5784 cql3: prevent creation of indexes on non-frozen collections
Until indexes on non-frozen collections are implemented,
creating such indexes should be disallowed to prevent unnecessary
errors on insertions/selections.

Fixes #3653
Refs #2962
Message-Id: <218cf96d5e38340806fb9446b8282d2296ba5f43.1536839959.git.sarna@scylladb.com>

(cherry picked from commit 2d355bdf47)
2018-10-04 14:06:47 +03:00
Avi Kivity
1468ec62de Merge "Handle simple column type schema changes in SST3" from Piotr
"
This patchset enables very simple column type conversions.
It covers only handling variable and fixed size type differences.
Two types still have to be compatiple on bits level to be able to convert a field from one to the other.
"

* 'haaawk/sst3/column_type_schema_change/v4' of github.com:scylladb/seastar-dev:
  Fix check_multi_schema to actually check the column type change
  Handle very basic column type conversions in SST3
  Enable check_multi_schema for SST3

(cherry picked from commit b9702222f8)
2018-10-03 17:44:26 +03:00
Avi Kivity
c6ef56ae1e Revert "compaction: demote compaction start/end messages to DEBUG level"
This reverts commit b443a9b930. The compaction
history table doesn't have enough information to be a replacement for this
log message yet.

(cherry picked from commit 7c8143c3c4)
2018-10-03 17:44:21 +03:00
Avi Kivity
ad62313b86 utils: crc32: mark power crc32 assembly as not requiring an executable stack
The linker uses an opt-in system for non-executable stack: if all object files
opt into a non-executable stack, the binary will have a non-executable stack,
which is very desirable for security. The compiler cooperates by opting into
a non-executable stack whenever possible (always for our code).

However, we also have an assembly file (for fast power crc32 computations).
Since it doesn't opt into a non-executable stack, we get a binary with
executable stack, which Gentoo's build system rightly complains about.

Fix by adding the correct incantation to the file.

Fixes #3799.

Reported-by: Alexys Jacob <ultrabug@gmail.com>
Message-Id: <20181002151251.26383-1-avi@scylladb.com>
(cherry picked from commit aaab8a3f46)
2018-10-02 23:22:56 +03:00
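For reference, the "correct incantation" on GNU toolchains is conventionally an empty `.note.GNU-stack` section appended to the assembly source — a generic sketch of the opt-in, not the verbatim contents of the commit:

```asm
/* Opt this object file into a non-executable stack, so the linker can
 * keep the final binary's GNU_STACK program header marked RW rather
 * than RWE. */
.section .note.GNU-stack,"",%progbits
```

Whether a built binary still requests an executable stack can be checked with `readelf -lW <binary> | grep GNU_STACK`.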
Avi Kivity
de87f798e1 release: prepare for 3.0-rc0 2018-10-02 12:00:50 +03:00
124 changed files with 1820 additions and 576 deletions

.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=3.0.rc1
if test -f version
then

View File

@@ -2228,11 +2228,11 @@
"description":"The column family"
},
"total":{
"type":"int",
"type":"long",
"description":"The total snapshot size"
},
"live":{
"type":"int",
"type":"long",
"description":"The live snapshot size"
}
}

View File

@@ -200,8 +200,9 @@ public:
return _current_start;
}
position_in_partition_view upper_bound() const {
return _current_end;
// Returns the upper bound of the last range in the provided ranges set
position_in_partition_view uppermost_bound() const {
return position_in_partition_view::for_range_end(_ranges.back());
}
// When lower_bound() changes, this also does

View File

@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
*/
const sstring_view _query;
/**
* An empty bitset to be used as a workaround for an AntLR null
* dereference bug.
*/
static typename ExceptionBaseType::BitsetListType _empty_bit_list;
public:
/**
@@ -144,6 +150,14 @@ private:
break;
}
default:
// AntLR Exception class has a bug of dereferencing a null
// pointer in the displayRecognitionError. The following
// if statement makes sure it will not be null before the
// call to that function (displayRecognitionError).
// bug reference: https://github.com/antlr/antlr3/issues/191
if (!ex->get_expectingSet()) {
ex->set_expectingSet(&_empty_bit_list);
}
ex->displayRecognitionError(token_names, msg);
}
return msg.str();
@@ -345,4 +359,8 @@ private:
#endif
};
template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
typename ExceptionBaseType::BitsetListType
error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
}
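The guard-and-substitute pattern used above — keeping one statically allocated empty object and installing it wherever a third-party API may hand back a null pointer — can be sketched in isolation. All names below are hypothetical stand-ins, not ScyllaDB's or AntLR's actual types:

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical stand-in for AntLR's bitset list type.
struct bitset_list {
    std::size_t size = 0;
};

// Hypothetical stand-in for the AntLR exception object.
struct recognition_exception {
    const bitset_list* expecting = nullptr;  // the library may leave this null
    const bitset_list* get_expectingSet() const { return expecting; }
    void set_expectingSet(const bitset_list* b) { expecting = b; }
};

// Shared fallback object; it lives for the whole program, so handing out
// a pointer to it is always safe.
static bitset_list empty_bit_list;

// Mirrors the guard above: make sure the set is non-null before calling
// anything that dereferences it unconditionally.
std::size_t expected_token_count(recognition_exception& ex) {
    if (!ex.get_expectingSet()) {
        ex.set_expectingSet(&empty_bit_list);
    }
    return ex.get_expectingSet()->size;
}
```

The key property is that the fallback has static storage duration: it cannot dangle, and sharing one instance across all exceptions costs nothing because it is never mutated.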

View File

@@ -106,6 +106,11 @@ public:
virtual size_t prefix_size() const {
return 0;
}
size_t prefix_size(const schema_ptr schema) const {
return 0;
}
};
template<>
@@ -129,5 +134,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
return false;
}
template<>
inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
size_t count = 0;
if (schema->clustering_key_columns().empty()) {
return count;
}
auto column_defs = get_column_defs();
column_id expected_column_id = schema->clustering_key_columns().begin()->id;
for (auto&& cdef : column_defs) {
if (schema->position(*cdef) != expected_column_id) {
return count;
}
expected_column_id++;
count++;
}
return count;
}
}
}
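The prefix-length computation above walks the restricted columns in order and stops at the first one that is not the next expected clustering-key position. A standalone sketch of that loop, with column positions reduced to plain integers (hypothetical signature, not the actual Scylla API):

```cpp
#include <cstddef>
#include <vector>

// Count how many restricted columns form a contiguous prefix of the
// clustering key, stopping at the first gap. Positions are the columns'
// ordinal positions within the schema.
std::size_t prefix_size(const std::vector<int>& clustering_key_positions,
                        const std::vector<int>& restricted_positions) {
    std::size_t count = 0;
    if (clustering_key_positions.empty()) {
        return count;
    }
    int expected = clustering_key_positions.front();
    for (int pos : restricted_positions) {
        if (pos != expected) {
            return count;  // gap found: the prefix ends here
        }
        ++expected;
        ++count;
    }
    return count;
}
```

For example, with clustering key positions {0, 1, 2, 3} and restrictions on {0, 1, 3}, the prefix is 2: column 3 breaks the run, so it (and anything after it) must be handled by filtering rather than by the slice.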

View File

@@ -166,19 +166,7 @@ public:
}
virtual size_t prefix_size() const override {
size_t count = 0;
if (_schema->clustering_key_columns().empty()) {
return count;
}
column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
for (const auto& restriction_entry : _restrictions->restrictions()) {
if (_schema->position(*restriction_entry.first) != expected_column_id) {
return count;
}
expected_column_id++;
count++;
}
return count;
return primary_key_restrictions<ValueType>::prefix_size(_schema);
}
::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {

View File

@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
return _index_restrictions;
}
std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
for (const auto& cdef : restriction->get_column_defs()) {
for (auto index : sim.list_indexes()) {
if (index.depends_on(*cdef)) {
return std::make_optional<secondary_index::index>(std::move(index));
}
}
}
}
return std::nullopt;
}
std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
std::vector<const column_definition*> column_defs_for_filtering;
if (need_filtering()) {
auto& sim = db.find_column_family(_schema).get_index_manager();
std::optional<secondary_index::index> opt_idx = find_idx(sim);
auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
return opt_idx && opt_idx->depends_on(*cdef);
};
if (_partition_key_restrictions->needs_filtering(*_schema)) {
for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
if (!column_uses_indexing(cdef)) {
column_defs_for_filtering.emplace_back(cdef);
}
}
}
if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
column_id first_non_prefix_id = _schema->clustering_key_columns().begin()->id +
_clustering_columns_restrictions->prefix_size(_schema);
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
if ((cdef->id >= first_non_prefix_id) && (!column_uses_indexing(cdef))) {
column_defs_for_filtering.emplace_back(cdef);
}
}
}
for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
if (!column_uses_indexing(cdef)) {
column_defs_for_filtering.emplace_back(cdef);
}
}
}
return column_defs_for_filtering;
}
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
// If there is a queriable index, no special conditions are required on the other restrictions.
// But we still need to know 2 things:

View File

@@ -163,6 +163,20 @@ public:
return _clustering_columns_restrictions;
}
/**
* Builds a possibly empty collection of column definitions that will be used for filtering
* @param db - the database context
* @return A list with the column definitions needed for filtering.
*/
std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
/**
* Determines the index to be used with the restriction.
* @param sim - the secondary index manager used to find a matching index
* @return If an index can be used, an optional containing this index, otherwise an empty optional.
*/
std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
/**
* Checks if the partition key has some unrestricted components.
* @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.

View File

@@ -156,9 +156,9 @@ public:
return _factories->uses_function(ks_name, function_name);
}
virtual uint32_t add_column_for_ordering(const column_definition& c) override {
uint32_t index = selection::add_column_for_ordering(c);
_factories->add_selector_for_ordering(c, index);
virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
uint32_t index = selection::add_column_for_post_processing(c);
_factories->add_selector_for_post_processing(c, index);
return index;
}
@@ -227,7 +227,7 @@ protected:
return simple_selection::make(schema, std::move(columns), false);
}
uint32_t selection::add_column_for_ordering(const column_definition& c) {
uint32_t selection::add_column_for_post_processing(const column_definition& c) {
_columns.push_back(&c);
_metadata->add_non_serialized_column(c.column_specification);
return _columns.size() - 1;

View File

@@ -176,7 +176,7 @@ public:
static ::shared_ptr<selection> wildcard(schema_ptr schema);
static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
virtual uint32_t add_column_for_ordering(const column_definition& c);
virtual uint32_t add_column_for_post_processing(const column_definition& c);
virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
return false;

View File

@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
: _contains_write_time_factory(false)
, _contains_ttl_factory(false)
, _number_of_aggregate_factories(0)
, _number_of_factories_for_post_processing(0)
{
_factories.reserve(selectables.size());
@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
return false;
}
void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
_factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
++_number_of_factories_for_post_processing;
}
std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {

View File

@@ -74,6 +74,11 @@ private:
*/
uint32_t _number_of_aggregate_factories;
/**
* The number of factories that are only for post processing.
*/
uint32_t _number_of_factories_for_post_processing;
public:
/**
* Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
/**
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
* processing purposes.
* @param def the column that is needed for ordering
* @param index the index of the column definition in the Selection's list of columns
*/
void add_selector_for_ordering(const column_definition& def, uint32_t index);
void add_selector_for_post_processing(const column_definition& def, uint32_t index);
/**
* Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
*/
bool contains_only_aggregate_functions() const {
auto size = _factories.size();
return size != 0 && _number_of_aggregate_factories == size;
return size != 0 && _number_of_aggregate_factories == (size - _number_of_factories_for_post_processing);
}
/**

View File

@@ -137,10 +137,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
&& dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
bool is_collection = cd->type->is_collection();
bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();
if (is_frozen_collection) {
validate_for_frozen_collection(target);
} else if (is_collection) {
// NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
throw exceptions::invalid_request_exception(
sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
} else {
validate_not_full_index(target);
validate_is_values_index_if_target_column_not_collection(cd, target);

View File

@@ -315,6 +315,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
}
// The unique feature of a filter by a non-key column is that the
// value of such a column can be updated - and also be expired with TTL
// and cause the view row to appear and disappear. We don't currently
// support this case - see issue #3430, and neither does
// Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
// Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
// view row is now depending on multiple base columns (multiple filtered
// non-pk base column + base column used in view pk)". When the filtered
// column *is* the base column added to the view pk, we don't have this
// problem. And this case actually works correctly.
auto non_pk_restrictions = restrictions->get_non_pk_restriction();
if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
// This case (filter by new PK column of the view) works, as explained above
} else if (!non_pk_restrictions.empty()) {
auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
throw exceptions::invalid_request_exception(sprint(
"Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
column_family(), column_names));
}
schema_builder builder{keyspace(), column_family()};
auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
for (auto* def : defs) {

View File

@@ -141,6 +141,10 @@ private:
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */
void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
void ensure_filtering_columns_retrieval(database& db,
::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions);
bool contains_alias(::shared_ptr<column_identifier> name);
::shared_ptr<column_specification> limit_receiver();

View File

@@ -410,7 +410,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
}
command->slice.options.set<query::partition_slice::option::allow_short_read>();
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
auto p = service::pager::query_pagers::pager(_schema, _selection,
state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
@@ -418,9 +418,10 @@ select_statement::do_execute(service::storage_proxy& proxy,
return do_with(
cql3::selection::result_set_builder(*_selection, now,
options.get_cql_serialization_format()),
[this, p, page_size, now, timeout](auto& builder) {
[this, p, page_size, now, timeout_duration](auto& builder) {
return do_until([p] {return p->is_exhausted();},
[p, &builder, page_size, now, timeout] {
[p, &builder, page_size, now, timeout_duration] {
auto timeout = db::timeout_clock::now() + timeout_duration;
return p->fetch_page(builder, page_size, now, timeout);
}
).then([this, &builder] {
@@ -439,6 +440,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
}
auto timeout = db::timeout_clock::now() + timeout_duration;
if (_selection->is_trivial() && !_restrictions->need_filtering()) {
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
@@ -492,15 +494,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
return KeyType::from_range(exploded_base_key);
}
future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = ::make_lw_shared<query::read_command>(
lw_shared_ptr<query::read_command>
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
make_partition_slice(options),
@@ -510,9 +506,25 @@ indexed_table_select_statement::execute_base_query(
query::max_partitions,
utils::UUID(),
options.get_timestamp(state));
if (options.get_page_size() > 0) {
if (use_paging) {
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
if (_schema->clustering_key_size() > 0) {
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
}
}
return cmd;
}
future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
dht::partition_range_vector per_vnode_ranges;
per_vnode_ranges.reserve(partition_ranges.size());
@@ -586,19 +598,7 @@ indexed_table_select_statement::execute_base_query(
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
make_partition_slice(options),
get_limit(options),
now,
tracing::make_trace_info(state.get_trace_state()),
query::max_partitions,
utils::UUID(),
options.get_timestamp(state));
if (options.get_page_size() > 0) {
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
}
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
struct base_query_state {
@@ -774,7 +774,8 @@ indexed_table_select_statement::prepare(database& db,
ordering_comparator_type ordering_comparator,
::shared_ptr<term> limit, cql_stats &stats)
{
auto index_opt = find_idx(db, schema, restrictions);
auto& sim = db.find_column_family(schema).get_index_manager();
auto index_opt = restrictions->find_idx(sim);
if (!index_opt) {
throw std::runtime_error("No index found.");
}
@@ -798,24 +799,6 @@ indexed_table_select_statement::prepare(database& db,
}
stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
schema_ptr schema,
::shared_ptr<restrictions::statement_restrictions> restrictions)
{
auto& sim = db.find_column_family(schema).get_index_manager();
for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
for (const auto& cdef : restriction->get_column_defs()) {
for (auto index : sim.list_indexes()) {
if (index.depends_on(*cdef)) {
return stdx::make_optional<secondary_index::index>(std::move(index));
}
}
}
}
return stdx::nullopt;
}
indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
::shared_ptr<parameters> parameters,
::shared_ptr<selection::selection> selection,
@@ -1219,6 +1202,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
}
check_needs_filtering(restrictions);
ensure_filtering_columns_retrieval(db, selection, restrictions);
::shared_ptr<cql3::statements::select_statement> stmt;
if (restrictions->uses_secondary_indexing()) {
@@ -1357,7 +1341,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
}
auto index = selection->index_of(*def);
if (index < 0) {
index = selection->add_column_for_ordering(*def);
index = selection->add_column_for_post_processing(*def);
}
sorters.emplace_back(index, def->type);
@@ -1444,6 +1428,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
}
}
/**
* Adds columns that are needed for the purpose of filtering to the selection.
* The columns that are added to the selection are columns that
* are needed for filtering on the coordinator but are not part of the selection.
* The columns are added with metadata indicating they are not to be returned
* to the user.
*/
void select_statement::ensure_filtering_columns_retrieval(database& db,
::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions) {
for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
if (!selection->has_column(*cdef)) {
selection->add_column_for_post_processing(*cdef);
}
}
}
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
return raw->alias && *name == *raw->alias;

View File

@@ -186,10 +186,6 @@ public:
schema_ptr view_schema);
private:
static stdx::optional<secondary_index::index> find_idx(database& db,
schema_ptr schema,
::shared_ptr<restrictions::statement_restrictions> restrictions);
virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
service::query_state& state, const query_options& options) override;
@@ -214,6 +210,9 @@ private:
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state);
lw_shared_ptr<query::read_command>
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
future<shared_ptr<cql_transport::messages::result_message>>
execute_base_query(
service::storage_proxy& proxy,

View File

@@ -1673,14 +1673,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
// No commit_io_check needed in the log reader since the database will fail
// on error at startup if required
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
struct work {
private:
file_input_stream_options make_file_input_stream_options() {
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
file_input_stream_options fo;
fo.buffer_size = db::commitlog::segment::default_size;
fo.read_ahead = 10;
fo.io_priority_class = service::get_local_commitlog_priority();
fo.io_priority_class = read_io_prio_class;
return fo;
}
public:
@@ -1699,8 +1699,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
bool header = true;
bool failed = false;
work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
}
work(work&&) = default;
@@ -1918,9 +1918,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
return fut;
});
return fut.then([off, next](file f) {
return fut.then([off, next, read_io_prio_class] (file f) {
f = make_checked_file(commit_error_handler, std::move(f));
auto w = make_lw_shared<work>(std::move(f), off);
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
auto ret = w->s.listen(next);
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {

View File

@@ -355,7 +355,7 @@ public:
};
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
private:
commitlog(config);

View File

@@ -58,6 +58,7 @@
#include "converting_mutation_partition_applier.hh"
#include "schema_registry.hh"
#include "commitlog_entry.hh"
#include "service/priority_manager.hh"
static logging::logger rlogger("commitlog_replayer");
@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
auto s = make_lw_shared<stats>();
auto& exts = _qp.local().db().local().get_config().extensions();
return db::commitlog::read_log_file(file,
return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
std::placeholders::_2), p, &exts).then([](auto s) {
auto f = s->done();

View File

@@ -453,7 +453,7 @@ public:
"The maximum number of tombstones a query can scan before aborting." \
) \
/* Network timeout settings */ \
val(range_request_timeout_in_ms, uint32_t, 10000, Unused, \
val(range_request_timeout_in_ms, uint32_t, 10000, Used, \
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
) \
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
@@ -472,7 +472,7 @@ public:
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
"Related information: About hinted handoff writes" \
) \
val(request_timeout_in_ms, uint32_t, 10000, Unused, \
val(request_timeout_in_ms, uint32_t, 10000, Used, \
"The default timeout for other, miscellaneous operations.\n" \
"Related information: About hinted handoff writes" \
) \
@@ -621,7 +621,7 @@ public:
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
) \
val(thrift_max_message_length_in_mb, uint32_t, 16, Unused, \
val(thrift_max_message_length_in_mb, uint32_t, 16, Used, \
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
) \
/* Security properties */ \

View File

@@ -35,6 +35,7 @@
#include "disk-error-handler.hh"
#include "lister.hh"
#include "db/timeout_clock.hh"
#include "service/priority_manager.hh"
using namespace std::literals::chrono_literals;
@@ -95,6 +96,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
return compute_hints_dir_device_id();
}).then([this] {
_strorage_service_anchor->register_subscriber(this);
set_started();
});
}
@@ -105,7 +107,7 @@ future<> manager::stop() {
_strorage_service_anchor->unregister_subscriber(this);
}
_stopping = true;
set_stopping();
return _draining_eps_gate.close().finally([this] {
return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -277,7 +279,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
}
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
if (_stopping || !can_hint_for(ep)) {
if (stopping() || !started() || !can_hint_for(ep)) {
manager_logger.trace("Can't store a hint to {}", ep);
++_stats.dropped;
return false;
@@ -502,7 +504,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
}
void manager::drain_for(gms::inet_address endpoint) {
if (_stopping) {
if (stopping()) {
return;
}
@@ -543,6 +545,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
, _resource_manager(_shard_manager._resource_manager)
, _proxy(local_storage_proxy)
, _db(local_db)
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
, _gossiper(local_gossiper)
, _file_update_mutex(_ep_manager.file_update_mutex())
{}
@@ -555,6 +558,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
, _resource_manager(_shard_manager._resource_manager)
, _proxy(other._proxy)
, _db(other._db)
, _hints_cpu_sched_group(other._hints_cpu_sched_group)
, _gossiper(other._gossiper)
, _file_update_mutex(_ep_manager.file_update_mutex())
{}
@@ -610,7 +614,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
}
void manager::end_point_hints_manager::sender::start() {
_stopped = seastar::async([this] {
seastar::thread_attributes attr;
attr.sched_group = _hints_cpu_sched_group;
_stopped = seastar::async(std::move(attr), [this] {
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
while (!stopping()) {
try {
@@ -693,7 +700,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
try {
auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
// Check that we can still send the next hint. Don't try to send it if the destination host
// is DOWN or if we have already failed to send some of the previous hints.
if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -759,7 +766,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
int replayed_segments_count = 0;
try {
while (have_segments()) {
while (replay_allowed() && have_segments()) {
if (!send_one_file(*_segments_to_replay.begin())) {
break;
}
@@ -936,5 +943,15 @@ future<> manager::rebalance(sstring hints_directory) {
});
}
void manager::update_backlog(size_t backlog, size_t max_backlog) {
_backlog_size = backlog;
_max_backlog_size = max_backlog;
if (backlog < max_backlog) {
allow_hints();
} else {
forbid_hints_for_eps_with_pending_hints();
}
}
}
}
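The rule encoded in update_backlog() above is a plain threshold: keep accepting new hints while the measured backlog is below the configured maximum, and forbid them otherwise. A minimal standalone sketch with hypothetical names (the real manager also scopes the ban to endpoints with pending hints):

```cpp
#include <cstddef>

// Simplified stand-in for the hints manager's backlog bookkeeping.
struct hints_backpressure {
    std::size_t backlog = 0;
    std::size_t max_backlog = 0;
    bool hints_allowed = true;

    // Mirrors update_backlog(): record the sizes and gate hint storage
    // on whether the backlog is still under the limit.
    void update(std::size_t b, std::size_t max_b) {
        backlog = b;
        max_backlog = max_b;
        hints_allowed = (b < max_b);
    }
};
```

Note the strict comparison: reaching the limit exactly already disables new hints, so the backlog cannot be pushed past max_backlog by further writes.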

View File

@@ -69,6 +69,8 @@ private:
class drain_tag {};
using drain = seastar::bool_class<drain_tag>;
friend class space_watchdog;
public:
class end_point_hints_manager {
public:
@@ -119,6 +121,7 @@ public:
resource_manager& _resource_manager;
service::storage_proxy& _proxy;
database& _db;
seastar::scheduling_group _hints_cpu_sched_group;
gms::gossiper& _gossiper;
seastar::shared_mutex& _file_update_mutex;
@@ -179,6 +182,10 @@ public:
return _state.contains(state::stopping);
}
bool replay_allowed() const noexcept {
return _ep_manager.replay_allowed();
}
/// \brief Try to send one hint read from the file.
/// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
/// - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -328,6 +335,10 @@ public:
return _hints_in_progress;
}
bool replay_allowed() const noexcept {
return _shard_manager.replay_allowed();
}
bool can_hint() const noexcept {
return _state.contains(state::can_hint);
}
@@ -393,6 +404,17 @@ public:
}
};
enum class state {
started, // hinting is currently allowed (start() call is complete)
replay_allowed, // replaying (hints sending) is allowed
stopping // hinting is not allowed - stopping is in progress (stop() method has been called)
};
using state_set = enum_set<super_enum<state,
state::started,
state::replay_allowed,
state::stopping>>;
private:
using ep_key_type = typename end_point_hints_manager::key_type;
using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -403,6 +425,7 @@ public:
static const std::chrono::seconds hint_file_write_timeout;
private:
state_set _state;
const boost::filesystem::path _hints_dir;
dev_t _hints_dir_device_id = 0;
@@ -414,7 +437,7 @@ private:
locator::snitch_ptr& _local_snitch_ptr;
int64_t _max_hint_window_us = 0;
database& _local_db;
bool _stopping = false;
seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call
resource_manager& _resource_manager;
@@ -424,9 +447,14 @@ private:
seastar::metrics::metric_groups _metrics;
std::unordered_set<ep_key_type> _eps_with_pending_hints;
size_t _max_backlog_size;
size_t _backlog_size;
public:
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager& res_manager, distributed<database>& db);
virtual ~manager();
manager(manager&&) = delete;
manager& operator=(manager&&) = delete;
void register_metrics(const sstring& group_name);
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
future<> stop();
@@ -503,6 +531,18 @@ public:
void forbid_hints();
void forbid_hints_for_eps_with_pending_hints();
size_t max_backlog_size() const {
return _max_backlog_size;
}
size_t backlog_size() const {
return _backlog_size;
}
void allow_replaying() noexcept {
_state.set(state::replay_allowed);
}
/// \brief Rebalance hints segments among all present shards.
///
/// The difference between the numbers of segments on any two shards will not be greater than 1 after the
@@ -616,6 +656,28 @@ private:
/// \param endpoint node that left the cluster
void drain_for(gms::inet_address endpoint);
void update_backlog(size_t backlog, size_t max_backlog);
bool stopping() const noexcept {
return _state.contains(state::stopping);
}
void set_stopping() noexcept {
_state.set(state::stopping);
}
bool started() const noexcept {
return _state.contains(state::started);
}
void set_started() noexcept {
_state.set(state::started);
}
bool replay_allowed() const noexcept {
return _state.contains(state::replay_allowed);
}
public:
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
return _ep_managers.find(ep_key);


@@ -27,6 +27,7 @@
#include "lister.hh"
#include "disk-error-handler.hh"
#include "seastarx.hh"
#include <seastar/core/sleep.hh>
namespace db {
namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
: _shard_managers(managers)
, _per_device_limits_map(per_device_limits_map)
, _timer([this] { on_timer(); })
{}
void space_watchdog::start() {
_timer.arm(timer_clock_type::now());
_started = seastar::async([this] {
while (!_as.abort_requested()) {
try {
on_timer();
} catch (...) {
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
// Stop all hint generators if space_watchdog callback failed
for (manager& shard_manager : _shard_managers) {
shard_manager.forbid_hints();
}
}
seastar::sleep_abortable(_watchdog_period, _as).get();
}
}).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
}
future<> space_watchdog::stop() noexcept {
try {
return _gate.close().finally([this] { _timer.cancel(); });
} catch (...) {
return make_exception_future<>(std::current_exception());
}
_as.request_abort();
return std::move(_started);
}
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
});
}
// Called from the context of a seastar::thread.
void space_watchdog::on_timer() {
with_gate(_gate, [this] {
return futurize_apply([this] {
_total_size = 0;
// The hints directories are organized as follows:
// <hints root>
// |- <shard1 ID>
// | |- <EP1 address>
// | |- <hints file1>
// | |- <hints file2>
// | |- ...
// | |- <EP2 address>
// | |- ...
// | |-...
// |- <shard2 ID>
// | |- ...
// ...
// |- <shardN ID>
// | |- ...
//
return do_for_each(_shard_managers, [this] (manager& shard_manager) {
shard_manager.clear_eps_with_pending_hints();
// The hints directories are organized as follows:
// <hints root>
// |- <shard1 ID>
// | |- <EP1 address>
// | |- <hints file1>
// | |- <hints file2>
// | |- ...
// | |- <EP2 address>
// | |- ...
// | |-...
// |- <shard2 ID>
// | |- ...
// ...
// |- <shardN ID>
// | |- ...
for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
_total_size = 0;
for (manager& shard_manager : per_device_limits.managers) {
shard_manager.clear_eps_with_pending_hints();
lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
_files_count = 0;
// Let's scan per-end-point directories and enumerate hints files...
//
return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
_files_count = 0;
// Let's scan per-end-point directories and enumerate hints files...
//
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
// not hintable).
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
// continue to enumeration - there is no one to change them.
auto it = shard_manager.find_ep_manager(de.name);
if (it != shard_manager.ep_managers_end()) {
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
});
} else {
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
}
});
}).then([this] {
return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
size_t adjusted_quota = 0;
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
// not hintable).
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
// continue to enumeration - there is no one to change them.
auto it = shard_manager.find_ep_manager(de.name);
if (it != shard_manager.ep_managers_end()) {
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
});
if (per_device_limits.max_shard_disk_space_size > delta) {
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
}
} else {
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
}
}).get();
}
bool can_hint = _total_size < adjusted_quota;
resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
if (!can_hint) {
for (manager& shard_manager : per_device_limits.managers) {
shard_manager.forbid_hints_for_eps_with_pending_hints();
}
} else {
for (manager& shard_manager : per_device_limits.managers) {
shard_manager.allow_hints();
}
}
});
});
}).handle_exception([this] (auto eptr) {
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
// Stop all hint generators if space_watchdog callback failed
for (manager& shard_manager : _shard_managers) {
shard_manager.forbid_hints();
}
}).finally([this] {
_timer.arm(_watchdog_period);
// Adjust the quota to take into account the space we guarantee to every end point manager
size_t adjusted_quota = 0;
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
});
});
if (per_device_limits.max_shard_disk_space_size > delta) {
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
}
resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
for (manager& shard_manager : per_device_limits.managers) {
shard_manager.update_backlog(_total_size, adjusted_quota);
}
}
}
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
});
}
void resource_manager::allow_replaying() noexcept {
boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
}
future<> resource_manager::stop() noexcept {
return parallel_for_each(_shard_managers, [](manager& m) {
return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
auto it = _per_device_limits_map.find(device_id);
if (it == _per_device_limits_map.end()) {
return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
// Then, reserve 90% of all space instead of 10% above.
if (is_mountpoint) {
max_size *= 9;
auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
// Since we possibly deferred, we need to recheck the _per_device_limits_map.
if (inserted) {
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
// Then, reserve 90% of all space instead of 10% above.
if (is_mountpoint) {
it->second.max_shard_disk_space_size *= 9;
}
}
_per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
it->second.managers.emplace_back(std::ref(shard_manager));
});
} else {
it->second.managers.emplace_back(std::ref(shard_manager));


@@ -22,6 +22,7 @@
#pragma once
#include <cstdint>
#include <seastar/core/abort_source.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
shard_managers_set& _shard_managers;
per_device_limits_map& _per_device_limits_map;
seastar::gate _gate;
seastar::timer<timer_clock_type> _timer;
future<> _started = make_ready_future<>();
seastar::abort_source _as;
int _files_count = 0;
public:
@@ -137,6 +138,9 @@ public:
, _space_watchdog(_shard_managers, _per_device_limits_map)
{}
resource_manager(resource_manager&&) = delete;
resource_manager& operator=(resource_manager&&) = delete;
future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);
bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
}
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
void allow_replaying() noexcept;
future<> stop() noexcept;
void register_manager(manager& m);
future<> prepare_per_device_limits();


@@ -1226,6 +1226,20 @@ future<> view_builder::calculate_shard_build_step(
}
}
// All shards need to arrive at the same decisions on whether or not to
// restart a view build at some common token (reshard), and which token
// to restart at. So we need to wait until all shards have read the view
// build statuses before they can all proceed to make the (same) decision.
// If we don't synchronize here, a fast shard may make a decision, start
// building and finish a build step - before the slowest shard has even read
// the view build information.
container().invoke_on(0, [] (view_builder& builder) {
if (++builder._shards_finished_read == smp::count) {
builder._shards_finished_read_promise.set_value();
}
return builder._shards_finished_read_promise.get_shared_future();
}).get();
std::unordered_set<utils::UUID> loaded_views;
if (view_build_status_per_shard.size() != smp::count) {
reshard(std::move(view_build_status_per_shard), loaded_views);


@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
future<> _started = make_ready_future<>();
// Used to coordinate between shards the conclusion of the build process for a particular view.
std::unordered_set<utils::UUID> _built_views;
// Counter and promise (both on shard 0 only!) allowing us to wait for
// all shards to have read the view build statuses.
unsigned _shards_finished_read = 0;
seastar::shared_promise<> _shards_finished_read_promise;
// Used for testing.
std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;


@@ -0,0 +1,2 @@
# Raise max AIO events
fs.aio-max-nr = 1048576


@@ -1 +1,2 @@
dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d


@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
else
# expect failures in virtualized environments
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
fi
#DEBHELPER#


@@ -283,6 +283,7 @@ if Scylla is the main application on your server and you wish to optimize its la
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
# following is a "manual" expansion
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
%files kernel-conf
%defattr(-,root,root)


@@ -129,26 +129,8 @@ public:
update_is_normal();
}
void apply_application_state(application_state key, versioned_value&& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = std::move(value);
}
update_is_normal();
}
void apply_application_state(application_state key, const versioned_value& value) {
auto&& e = _application_state[key];
if (e.version < value.version) {
e = value;
}
update_is_normal();
}
void apply_application_state(const endpoint_state& es) {
for (auto&& e : es._application_state) {
apply_application_state(e.first, e.second);
}
void add_application_state(const endpoint_state& es) {
_application_state = es._application_state;
update_is_normal();
}


@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].apply_application_state(es);
g.endpoint_state_map[ep].add_application_state(es);
}
});
}
@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
for (auto&& key : changed) {
g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
g.endpoint_state_map[ep].add_application_state(key, src.at(key));
}
}
});
@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
if (engine().cpu_id() != orig) {
g.endpoint_state_map[ep].apply_application_state(key, value);
g.endpoint_state_map[ep].add_application_state(key, value);
}
});
}
@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
}
}
void gossiper::reset_endpoint_state_map() {
endpoint_state_map.clear();
future<> gossiper::reset_endpoint_state_map() {
_unreachable_endpoints.clear();
_live_endpoints.clear();
_live_endpoints_just_added.clear();
return container().invoke_on_all([] (gossiper& g) {
g.endpoint_state_map.clear();
});
}
std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
@@ -1662,6 +1664,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
}
}
// Runs inside seastar::async context
void gossiper::add_saved_endpoint(inet_address ep) {
if (ep == get_broadcast_address()) {
logger.debug("Attempt to add self as saved endpoint");
@@ -1687,6 +1690,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
}
ep_state.mark_dead();
endpoint_state_map[ep] = ep_state;
replicate(ep, ep_state).get();
_unreachable_endpoints[ep] = now();
logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
}
@@ -1924,6 +1928,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
auto& ep_state = *es;
ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
replicate(endpoint, ep_state).get();
mark_dead(endpoint, ep_state);
get_local_failure_detector().force_conviction(endpoint);
}


@@ -417,7 +417,7 @@ public:
stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
// removes ALL endpoint states; should only be called after shadow gossip
void reset_endpoint_state_map();
future<> reset_endpoint_state_map();
std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();


@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
const dht::token& tok,
dht::token_range_vector& ret) {
if (prev_tok < tok) {
ret.emplace_back(
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true));
auto pos = ret.end();
if (!ret.empty() && !std::prev(pos)->end()) {
// We inserted a wrapped range (a, b] previously as
// (-inf, b], (a, +inf). So now we insert in the next-to-last
// position to keep the last range (a, +inf) at the end.
pos = std::prev(pos);
}
ret.insert(pos,
dht::token_range{
dht::token_range::bound(prev_tok, false),
dht::token_range::bound(tok, true)});
} else {
ret.emplace_back(
dht::token_range::bound(prev_tok, false),

main.cc

@@ -703,6 +703,17 @@ int main(int ac, char** av) {
supervisor::notify("starting streaming service");
streaming::stream_session::init_streaming_service(db).get();
api::set_server_stream_manager(ctx).get();
supervisor::notify("starting hinted handoff manager");
if (hinted_handoff_enabled) {
db::hints::manager::rebalance(cfg->hints_directory()).get();
}
db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
}).get();
supervisor::notify("starting messaging service");
// Start handling REPAIR_CHECKSUM_RANGE messages
netw::get_messaging_service().invoke_on_all([&db] (auto& ms) {
@@ -739,14 +750,9 @@ int main(int ac, char** av) {
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
api::set_server_gossip_settle(ctx).get();
supervisor::notify("starting hinted handoff manager");
if (hinted_handoff_enabled) {
db::hints::manager::rebalance(cfg->hints_directory()).get();
}
db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
supervisor::notify("allow replaying hints");
proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
local_proxy.allow_replaying_hints();
}).get();
static sharded<db::view::view_builder> view_builder;


@@ -214,7 +214,9 @@ private:
void update(const schema& s, const deletable_row& dr) {
update(dr.marker());
update(dr.deleted_at().tomb());
row_tombstone row_tomb = dr.deleted_at();
update(row_tomb.regular());
update(row_tomb.tomb());
update(s, dr.cells(), column_kind::regular_column);
}


@@ -135,12 +135,14 @@ struct messaging_service::rpc_protocol_wrapper : public rpc_protocol { using rpc
// This should be integrated into messaging_service proper.
class messaging_service::rpc_protocol_client_wrapper {
std::unique_ptr<rpc_protocol::client> _p;
::shared_ptr<seastar::tls::server_credentials> _credentials;
public:
rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local = ipv4_addr())
: _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), addr, local)) {
}
rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local, ::shared_ptr<seastar::tls::server_credentials> c)
: _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), seastar::tls::socket(c), addr, local))
, _credentials(c)
{}
auto get_stats() const { return _p->get_stats(); }
future<> stop() { return _p->stop(); }
@@ -148,6 +150,19 @@ public:
return _p->error();
}
operator rpc_protocol::client&() { return *_p; }
/**
* #3787 Must ensure we use the right type of socket. I.e. tls or not.
* See above: we retain the credentials object so that here we can know
* whether we are tls or not.
*/
template<typename Serializer, typename... Out>
future<rpc::sink<Out...>> make_stream_sink() {
if (_credentials) {
return _p->make_stream_sink<Serializer, Out...>(seastar::tls::socket(_credentials));
}
return _p->make_stream_sink<Serializer, Out...>();
}
};
struct messaging_service::rpc_protocol_server_wrapper : public rpc_protocol::server { using rpc_protocol::server::server; };
@@ -639,8 +654,9 @@ rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rp
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id) {
rpc_protocol::client& rpc_client = *get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
return rpc_client.make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
rpc_protocol::client& rpc_client = *wrapper;
return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, sink).then([sink] (rpc::source<int32_t> source) mutable {
return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));


@@ -470,6 +470,9 @@ read_context::ready_to_save_state* read_context::prepare_reader_for_saving(
if (stopped_reader_fut.failed()) {
mmq_log.debug("Failed to stop reader on shard {}: {}", shard, stopped_reader_fut.get_exception());
++_db.local().get_stats().multishard_query_failed_reader_stops;
// We don't want to leave the reader in the dismantling state, lest stop()
// try to wait on the reader_fut again and crash the application.
rs = {};
return nullptr;
}
@@ -609,9 +612,17 @@ future<> read_context::save_readers(circular_buffer<mutation_fragment> unconsume
}
if (auto* maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
return maybe_future_dismantling_state->fut.then([this, &rs,
finish_saving = std::move(finish_saving)] (dismantling_state&& next_state) mutable {
rs = std::move(next_state);
return maybe_future_dismantling_state->fut.then_wrapped([this, &rs,
finish_saving = std::move(finish_saving)] (future<dismantling_state>&& next_state_fut) mutable {
if (next_state_fut.failed()) {
mmq_log.debug("Failed to stop reader: {}", next_state_fut.get_exception());
++_db.local().get_stats().multishard_query_failed_reader_stops;
// We don't want to leave the reader in the future dismantling state, lest
// stop() try to wait on the fut again and crash the application.
rs = {};
return make_ready_future<>();
}
rs = next_state_fut.get0();
return finish_saving(std::get<dismantling_state>(rs));
});
}


@@ -1121,6 +1121,21 @@ schema::has_static_columns() const {
return !static_columns().empty();
}
column_count_type
schema::columns_count(column_kind kind) const {
switch (kind) {
case column_kind::partition_key:
return partition_key_size();
case column_kind::clustering_key:
return clustering_key_size();
case column_kind::static_column:
return static_columns_count();
case column_kind::regular_column:
return regular_columns_count();
default:
std::abort();
}
}
column_count_type
schema::partition_key_size() const {
return column_offset(column_kind::clustering_key);


@@ -701,6 +701,7 @@ public:
bool is_last_partition_key(const column_definition& def) const;
bool has_multi_cell_collections() const;
bool has_static_columns() const;
column_count_type columns_count(column_kind kind) const;
column_count_type partition_key_size() const;
column_count_type clustering_key_size() const;
column_count_type static_columns_count() const;

Submodule seastar updated: 57128167aa...39b89de259


@@ -379,7 +379,7 @@ public:
}
::shared_ptr<const paging_state> query_pager::state() const {
return ::make_shared<paging_state>(*_last_pkey, _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision);
return ::make_shared<paging_state>(_last_pkey.value_or(partition_key::make_empty()), _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision);
}
}


@@ -3074,8 +3074,9 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr<query::read_comman
unsigned shard = _db.local().shard_of(pr.start()->value().token());
_stats.replica_cross_shard_ops += shard != engine().cpu_id();
return _db.invoke_on(shard, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
tracing::trace(gt, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
return db.query(gs, *cmd, opts, prv, gt, max_size, timeout).then([trace_state = gt.get()](auto&& f, cache_temperature ht) {
auto trace_state = gt.get();
tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](auto&& f, cache_temperature ht) {
tracing::trace(trace_state, "Querying is done");
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>(make_foreign(std::move(f)), ht);
});
@@ -4218,6 +4219,10 @@ future<> storage_proxy::start_hints_manager(shared_ptr<gms::gossiper> gossiper_p
return _hints_resource_manager.start(shared_from_this(), gossiper_ptr, ss_ptr);
}
void storage_proxy::allow_replaying_hints() noexcept {
return _hints_resource_manager.allow_replaying();
}
future<> storage_proxy::stop_hints_manager() {
return _hints_resource_manager.stop();
}


@@ -390,6 +390,7 @@ public:
future<> stop();
future<> stop_hints_manager();
future<> start_hints_manager(shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
void allow_replaying_hints() noexcept;
const stats& get_stats() const {
return _stats;


@@ -353,7 +353,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
gossiper.check_knows_remote_features(local_features, peer_features);
}
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
for (auto ep : loaded_endpoints) {
gossiper.add_saved_endpoint(ep);
}
@@ -367,7 +367,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
slogger.info("Checking remote features with gossip");
gossiper.do_shadow_round().get();
gossiper.check_knows_remote_features(local_features);
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
for (auto ep : loaded_endpoints) {
gossiper.add_saved_endpoint(ep);
}
@@ -1570,7 +1570,7 @@ future<> storage_service::check_for_endpoint_collision() {
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while consistent_rangemovement is true (check_for_endpoint_collision)");
} else {
gossiper.goto_shadow_round();
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
found_bootstrapping_node = true;
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
slogger.info("Checking bootstrapping/leaving/moving nodes: node={}, status={}, sleep 1 second and check again ({} seconds elapsed) (check_for_endpoint_collision)", addr, state, elapsed);
@@ -1582,7 +1582,7 @@ future<> storage_service::check_for_endpoint_collision() {
}
} while (found_bootstrapping_node);
slogger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
gossiper.reset_endpoint_state_map();
gossiper.reset_endpoint_state_map().get();
});
}
@@ -1632,8 +1632,9 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
auto tokens = get_tokens_for(replace_address);
// use the replacee's host Id as our own so we receive hints, etc
return db::system_keyspace::set_local_host_id(host_id).discard_result().then([replace_address, tokens = std::move(tokens)] {
gms::get_local_gossiper().reset_endpoint_state_map(); // clean up since we have what we need
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
return gms::get_local_gossiper().reset_endpoint_state_map().then([tokens = std::move(tokens)] { // clean up since we have what we need
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
});
});
});
}
@@ -2046,6 +2047,7 @@ future<> storage_service::start_rpc_server() {
auto keepalive = cfg.rpc_keepalive();
thrift_server_config tsc;
tsc.timeout_config = make_timeout_config(cfg);
tsc.max_request_size = cfg.thrift_max_message_length_in_mb() * (uint64_t(1) << 20);
return seastar::net::dns::resolve_name(addr).then([&ss, tserver, addr, port, keepalive, tsc] (seastar::net::inet_address ip) {
return tserver->start(std::ref(ss._db), std::ref(cql3::get_query_processor()), std::ref(ss._auth_service), tsc).then([tserver, port, addr, ip, keepalive] {
// #293 - do not stop anything


@@ -53,63 +53,76 @@ inline column_values_fixed_lengths get_clustering_values_fixed_lengths(const ser
* This way we don't need to look them up by column name every time.
*/
class column_translation {
public:
struct column_info {
// Disengaged 'id' means the column is missing from the current schema
std::optional<column_id> id;
std::optional<uint32_t> value_length;
bool is_collection;
bool is_counter;
};
private:
struct state {
static std::tuple<std::vector<std::optional<column_id>>,
std::vector<std::optional<uint32_t>>,
std::vector<bool>,
std::vector<bool>> build(
static std::vector<column_info> build(
const schema& s,
const utils::chunked_vector<serialization_header::column_desc>& src,
bool is_static) {
std::vector<std::optional<column_id>> ids;
std::vector<std::optional<uint32_t>> lens;
std::vector<bool> is_collection;
std::vector<bool> is_counter;
std::vector<column_info> cols;
if (s.is_dense()) {
if (is_static) {
ids.push_back(s.static_begin()->id);
lens.push_back(s.static_begin()->type->value_length_if_fixed());
is_collection.push_back(s.static_begin()->is_multi_cell());
is_counter.push_back(s.static_begin()->is_counter());
cols.push_back(column_info{
s.static_begin()->id,
s.static_begin()->type->value_length_if_fixed(),
s.static_begin()->is_multi_cell(),
s.static_begin()->is_counter()
});
} else {
ids.push_back(s.regular_begin()->id);
lens.push_back(s.regular_begin()->type->value_length_if_fixed());
is_collection.push_back(s.regular_begin()->is_multi_cell());
is_counter.push_back(s.regular_begin()->is_counter());
cols.push_back(column_info{
s.regular_begin()->id,
s.regular_begin()->type->value_length_if_fixed(),
s.regular_begin()->is_multi_cell(),
s.regular_begin()->is_counter()
});
}
} else {
ids.reserve(src.size());
lens.reserve(src.size());
cols.reserve(src.size());
for (auto&& desc : src) {
const bytes& type_name = desc.type_name.value;
data_type type = db::marshal::type_parser::parse(to_sstring_view(type_name));
const column_definition* def = s.get_column_definition(desc.name.value);
std::optional<column_id> id;
if (def) {
ids.push_back(def->id);
lens.push_back(def->type->value_length_if_fixed());
is_collection.push_back(def->is_multi_cell());
is_counter.push_back(def->is_counter());
} else {
ids.push_back(std::nullopt);
lens.push_back(std::nullopt);
is_collection.push_back(false);
is_counter.push_back(false);
if (def->is_multi_cell() != type->is_multi_cell() || def->is_counter() != type->is_counter()) {
throw malformed_sstable_exception(format(
"{} definition in serialization header does not match schema. "
"Schema collection = {}, counter = {}. Header collection = {}, counter = {}",
def->name(),
def->is_multi_cell(),
def->is_counter(),
type->is_multi_cell(),
type->is_counter()));
}
id = def->id;
}
cols.push_back(column_info{
id,
type->value_length_if_fixed(),
type->is_multi_cell(),
type->is_counter()
});
}
boost::range::stable_partition(cols, [](const column_info& column) { return !column.is_collection; });
}
return std::make_tuple(std::move(ids), std::move(lens), std::move(is_collection), std::move(is_counter));
return cols;
}
utils::UUID schema_uuid;
std::vector<std::optional<column_id>> regular_schema_column_id_from_sstable;
std::vector<std::optional<column_id>> static_schema_column_id_from_sstable;
column_values_fixed_lengths regular_column_value_fix_lengths;
column_values_fixed_lengths static_column_value_fix_lengths;
std::vector<column_info> regular_schema_columns_from_sstable;
std::vector<column_info> static_schema_columns_from_sstable;
column_values_fixed_lengths clustering_column_value_fix_lengths;
std::vector<bool> static_column_is_collection;
std::vector<bool> regular_column_is_collection;
std::vector<bool> static_column_is_counter;
std::vector<bool> regular_column_is_counter;
state() = default;
state(const state&) = delete;
@@ -118,19 +131,11 @@ class column_translation {
state& operator=(state&&) = default;
state(const schema& s, const serialization_header& header)
: schema_uuid(s.version()) {
std::tie(regular_schema_column_id_from_sstable,
regular_column_value_fix_lengths,
regular_column_is_collection,
regular_column_is_counter) =
build(s, header.regular_columns.elements, false);
std::tie(static_schema_column_id_from_sstable,
static_column_value_fix_lengths,
static_column_is_collection,
static_column_is_counter) =
build(s, header.static_columns.elements, true);
clustering_column_value_fix_lengths = get_clustering_values_fixed_lengths(header);
}
: schema_uuid(s.version())
, regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, false))
, static_schema_columns_from_sstable(build(s, header.static_columns.elements, true))
, clustering_column_value_fix_lengths (get_clustering_values_fixed_lengths(header))
{}
};
lw_shared_ptr<const state> _state = make_lw_shared<const state>();
@@ -143,33 +148,15 @@ public:
return *this;
}
const std::vector<std::optional<column_id>>& regular_columns() const {
return _state->regular_schema_column_id_from_sstable;
const std::vector<column_info>& regular_columns() const {
return _state->regular_schema_columns_from_sstable;
}
const std::vector<std::optional<column_id>>& static_columns() const {
return _state->static_schema_column_id_from_sstable;
}
const std::vector<std::optional<uint32_t>>& regular_column_value_fix_legths() const {
return _state->regular_column_value_fix_lengths;
}
const std::vector<std::optional<uint32_t>>& static_column_value_fix_legths() const {
return _state->static_column_value_fix_lengths;
const std::vector<column_info>& static_columns() const {
return _state->static_schema_columns_from_sstable;
}
const std::vector<std::optional<uint32_t>>& clustering_column_value_fix_legths() const {
return _state->clustering_column_value_fix_lengths;
}
const std::vector<bool>& static_column_is_collection() const {
return _state->static_column_is_collection;
}
const std::vector<bool>& regular_column_is_collection() const {
return _state->regular_column_is_collection;
}
const std::vector<bool>& static_column_is_counter() const {
return _state->static_column_is_counter;
}
const std::vector<bool>& regular_column_is_counter() const {
return _state->regular_column_is_counter;
}
};
}; // namespace sstables

View File

@@ -531,11 +531,11 @@ public:
}
void report_start(const sstring& formatted_msg) const override {
clogger.debug("Compacting {}", formatted_msg);
clogger.info("Compacting {}", formatted_msg);
}
void report_finish(const sstring& formatted_msg, std::chrono::time_point<db_clock> ended_at) const override {
clogger.debug("Compacted {}", formatted_msg);
clogger.info("Compacted {}", formatted_msg);
}
void backlog_tracker_adjust_charges() override {

View File

@@ -65,9 +65,13 @@ public:
if (!is_set()) {
return row_marker();
}
return _ttl != gc_clock::duration::zero() || _local_deletion_time != gc_clock::time_point::max()
? row_marker(_timestamp, _ttl, _local_deletion_time)
: row_marker(_timestamp);
if (is_expired_liveness_ttl(_ttl.count())) {
return row_marker{tombstone{_timestamp, _local_deletion_time}};
} else if (_ttl != gc_clock::duration::zero() || _local_deletion_time != gc_clock::time_point::max()) {
return row_marker{_timestamp, _ttl, _local_deletion_time};
}
return row_marker{_timestamp};
}
};
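
The hunk above distinguishes three kinds of row markers depending on the TTL. A minimal stand-alone sketch of that three-way decision follows; `expired_liveness_ttl` is a stand-in sentinel here (the assumption, following Cassandra's `LivenessInfo.EXPIRED_LIVENESS_TTL`, is `Integer.MAX_VALUE`), not the exact value checked by `is_expired_liveness_ttl()`:

```cpp
#include <cstdint>
#include <limits>

// Stand-in sentinel; the real is_expired_liveness_ttl() check may differ.
constexpr int64_t expired_liveness_ttl = std::numeric_limits<int32_t>::max();

enum class marker_kind { dead, expiring, live };

// Mirrors the branch order in to_row_marker(): expired-liveness TTL turns the
// marker into a tombstone; a non-zero TTL or an explicit expiry makes it an
// expiring marker; otherwise it carries only a timestamp.
marker_kind classify(int64_t ttl, bool has_expiry) {
    if (ttl == expired_liveness_ttl) {
        return marker_kind::dead;      // row_marker{tombstone{...}}
    }
    if (ttl != 0 || has_expiry) {
        return marker_kind::expiring;  // row_marker{ts, ttl, expiry}
    }
    return marker_kind::live;          // row_marker{ts}
}
```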

View File

@@ -131,7 +131,7 @@ inline api::timestamp_type parse_timestamp(const serialization_header& header,
}
inline gc_clock::duration parse_ttl(uint32_t value) {
if (value > std::numeric_limits<gc_clock::duration::rep>::max()) {
if (value > max_ttl.count() && ! is_expired_liveness_ttl(value)) {
throw malformed_sstable_exception(format("Too big ttl: {}", value));
}
return gc_clock::duration(value);

View File

@@ -203,10 +203,10 @@ void write_clustering_prefix(file_writer& out, const schema& s,
class missing_columns_input_range
: public input_range_base<missing_columns_input_range, uint64_t> {
private:
const schema& _schema;
const indexed_columns& _columns;
const row& _row;
mutable uint64_t _current_value = 0;
mutable column_id _current_id = 0;
mutable size_t _current_index = 0;
mutable bool _large_mode_produced_size = false;
enum class encoding_mode {
@@ -216,35 +216,35 @@ private:
} _mode;
public:
missing_columns_input_range(const schema& s, const row& row)
: _schema(s)
missing_columns_input_range(const indexed_columns& columns, const row& row)
: _columns(columns)
, _row(row) {
auto row_size = _row.size();
auto total_size = _schema.regular_columns_count();
auto total_size = _columns.size();
_current_id = row_size < total_size ? 0 : total_size;
_current_index = row_size < total_size ? 0 : total_size;
_mode = (total_size < 64) ? encoding_mode::small :
(row_size < total_size / 2) ? encoding_mode::large_encode_present :
encoding_mode::large_encode_missing;
}
bool next() const {
auto total_size = _schema.regular_columns_count();
if (_current_id == total_size) {
auto total_size = _columns.size();
if (_current_index == total_size) {
// No more values to encode
return false;
}
if (_mode == encoding_mode::small) {
// Set bit for every missing column
for (column_id id = 0; id < total_size; ++id) {
auto cell = _row.find_cell(id);
for (const auto& element: _columns | boost::adaptors::indexed()) {
auto cell = _row.find_cell(element.value().get().id);
if (!cell) {
_current_value |= (uint64_t(1) << id);
_current_value |= (uint64_t(1) << element.index());
}
}
_current_id = total_size;
_current_index = total_size;
return true;
} else {
// For either of the large modes, output the difference between total size and row size first
@@ -255,25 +255,25 @@ public:
}
if (_mode == encoding_mode::large_encode_present) {
while (_current_id < total_size) {
auto cell = _row.find_cell(_current_id);
while (_current_index < total_size) {
auto cell = _row.find_cell(_columns[_current_index].get().id);
if (cell) {
_current_value = _current_id;
++_current_id;
_current_value = _current_index;
++_current_index;
return true;
}
++_current_id;
++_current_index;
}
} else {
assert(_mode == encoding_mode::large_encode_missing);
while (_current_id < total_size) {
auto cell = _row.find_cell(_current_id);
while (_current_index < total_size) {
auto cell = _row.find_cell(_columns[_current_index].get().id);
if (!cell) {
_current_value = _current_id;
++_current_id;
_current_value = _current_index;
++_current_index;
return true;
}
++_current_id;
++_current_index;
}
}
}
@@ -285,12 +285,12 @@ public:
explicit operator bool() const
{
return (_current_id < _schema.regular_columns_count());
return (_current_index < _columns.size());
}
};
void write_missing_columns(file_writer& out, const schema& s, const row& row) {
for (const auto value: missing_columns_input_range{s, row}) {
void write_missing_columns(file_writer& out, const indexed_columns& columns, const row& row) {
for (const auto value: missing_columns_input_range{columns, row}) {
write_vint(out, value);
}
}
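
The `missing_columns_input_range` above picks one of three encodings: with fewer than 64 columns, a single bitmap word with a bit set per missing column; otherwise the count of missing columns followed by the indices of whichever set (present or missing) is smaller. A self-contained sketch of that decision, with a hypothetical `encode_missing` helper standing in for the range machinery:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical flat encoder mirroring the mode selection above:
// small mode -> one bitmap word of missing columns;
// large modes -> missing count, then present or missing indices,
// whichever list is shorter.
std::vector<uint64_t> encode_missing(const std::vector<bool>& present) {
    std::vector<uint64_t> out;
    const std::size_t total = present.size();
    std::size_t present_count = 0;
    for (bool p : present) {
        present_count += p;
    }
    if (total < 64) {
        uint64_t bitmap = 0;
        for (std::size_t i = 0; i < total; ++i) {
            if (!present[i]) {
                bitmap |= uint64_t(1) << i;  // set bit for every missing column
            }
        }
        out.push_back(bitmap);
        return out;
    }
    out.push_back(total - present_count);  // number of missing columns first
    const bool encode_present = present_count < total / 2;
    for (std::size_t i = 0; i < total; ++i) {
        if (present[i] == encode_present) {
            out.push_back(i);
        }
    }
    return out;
}
```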

View File

@@ -37,6 +37,8 @@ namespace sstables {
class file_writer;
using indexed_columns = std::vector<std::reference_wrapper<const column_definition>>;
// Utilities for writing integral values in variable-length format
// See vint-serialization.hh for more details
void write_unsigned_vint(file_writer& out, uint64_t value);
@@ -75,7 +77,7 @@ void write_clustering_prefix(file_writer& out, const schema& s,
const clustering_key_prefix& prefix, ephemerally_full_prefix is_ephemerally_full);
// Writes encoded information about missing columns in the given row
void write_missing_columns(file_writer& out, const schema& s, const row& row);
void write_missing_columns(file_writer& out, const indexed_columns& columns, const row& row);
// Helper functions for writing delta-encoded time-related values
void write_delta_timestamp(file_writer& out, api::timestamp_type timestamp, const encoding_stats& enc_stats);

View File

@@ -810,7 +810,8 @@ class mp_row_consumer_m : public consumer_m {
streamed_mutation::forwarding _fwd;
std::optional<clustering_row> _in_progress_row;
std::variant<std::monostate, clustering_row, range_tombstone> _stored;
std::optional<clustering_row> _stored_row;
std::optional<range_tombstone> _stored_tombstone;
static_row _in_progress_static_row;
bool _inside_static_row = false;
@@ -825,6 +826,10 @@ class mp_row_consumer_m : public consumer_m {
clustering_key_prefix ck;
bound_kind kind;
tombstone tomb;
position_in_partition_view position() {
return position_in_partition_view(position_in_partition_view::range_tag_t{}, bound_view(ck, kind));
}
};
inline friend std::ostream& operator<<(std::ostream& o, const sstables::mp_row_consumer_m::range_tombstone_start& rt_start) {
@@ -872,49 +877,6 @@ class mp_row_consumer_m : public consumer_m {
return _schema->column_at(column_type, *column_id);
}
inline proceed maybe_push_row(clustering_row&& cr) {
sstlog.trace("mp_row_consumer_m {}: maybe_push_row({})", this, cr);
auto action = _mf_filter->apply(cr);
switch (action) {
case mutation_fragment_filter::result::emit:
if (_opened_range_tombstone) {
/* We have an opened range tombstone which means that the current row is spanned by that RT.
* Since the row is to be emitted, so is the range tombstone that we form from the opened start
* and the end built from the row position because it also overlaps with query ranges.
*/
auto ck = cr.key();
bool was_non_full_key = clustering_key::make_full(*_schema, ck);
auto end_kind = was_non_full_key ? bound_kind::excl_end : bound_kind::incl_end;
_reader->push_mutation_fragment(range_tombstone(std::move(_opened_range_tombstone->ck),
_opened_range_tombstone->kind,
ck,
end_kind,
_opened_range_tombstone->tomb));
_opened_range_tombstone->ck = std::move(ck);
_opened_range_tombstone->kind = was_non_full_key ? bound_kind::incl_start : bound_kind::excl_start;
}
_reader->push_mutation_fragment(std::move(cr));
break;
case mutation_fragment_filter::result::ignore:
if (_opened_range_tombstone) {
// Trim the opened range up to the clustering key of the current row
auto& ck = cr.key();
bool was_non_full_key = clustering_key::make_full(*_schema, ck);
_opened_range_tombstone->ck = std::move(ck);
_opened_range_tombstone->kind = was_non_full_key ? bound_kind::incl_start : bound_kind::excl_start;
}
if (_mf_filter->is_current_range_changed()) {
return proceed::no;
}
break;
case mutation_fragment_filter::result::store_and_finish:
_stored.emplace<clustering_row>(std::move(cr));
return proceed::no;
}
return proceed(!_reader->is_buffer_full());
}
inline proceed maybe_push_range_tombstone(range_tombstone&& rt) {
const auto action = _mf_filter->apply(rt);
switch (action) {
@@ -927,13 +889,23 @@ class mp_row_consumer_m : public consumer_m {
}
break;
case mutation_fragment_filter::result::store_and_finish:
_stored.emplace<range_tombstone>(std::move(rt));
_stored_tombstone = std::move(rt);
_reader->on_end_of_stream();
return proceed::no;
}
return proceed(!_reader->is_buffer_full());
}
inline void reset_for_new_partition() {
_is_mutation_end = true;
_in_progress_row.reset();
_stored_row.reset();
_stored_tombstone.reset();
_mf_filter.reset();
_opened_range_tombstone.reset();
}
public:
/*
@@ -979,16 +951,32 @@ public:
return proceed::no;
}
std::visit(overloaded_functor{
[this] (clustering_row&& cr) {
maybe_push_row(std::move(cr));
},
[this] (range_tombstone&& rt) {
maybe_push_range_tombstone(std::move(rt));
},
[] (std::monostate) {
auto maybe_push = [this] (auto&& mfopt) {
if (mfopt) {
switch (_mf_filter->apply(*mfopt)) {
case mutation_fragment_filter::result::emit:
_reader->push_mutation_fragment(*std::exchange(mfopt, {}));
break;
case mutation_fragment_filter::result::ignore:
mfopt.reset();
if (_mf_filter->is_current_range_changed()) {
return true;
}
break;
case mutation_fragment_filter::result::store_and_finish:
_reader->on_end_of_stream();
return true;
}
}
}, std::exchange(_stored, std::monostate{}));
return false;
};
if (maybe_push(_stored_tombstone)) {
return proceed::no;
}
if (maybe_push(_stored_row)) {
return proceed::no;
}
return proceed::yes;
}
@@ -1057,11 +1045,15 @@ public:
return proceed::yes;
}
virtual proceed consume_row_marker_and_tombstone(const liveness_info& info, tombstone t) override {
sstlog.trace("mp_row_consumer_m {}: consume_row_marker_and_tombstone({}, {}), key={}",
this, info.to_row_marker(), t, _in_progress_row->position());
_in_progress_row->apply(t);
virtual proceed consume_row_marker_and_tombstone(
const liveness_info& info, tombstone tomb, tombstone shadowable_tomb) override {
sstlog.trace("mp_row_consumer_m {}: consume_row_marker_and_tombstone({}, {}, {}), key={}",
this, info.to_row_marker(), tomb, shadowable_tomb, _in_progress_row->position());
_in_progress_row->apply(info.to_row_marker());
_in_progress_row->apply(tomb);
if (shadowable_tomb) {
_in_progress_row->apply(shadowable_tombstone{shadowable_tomb});
}
return proceed::yes;
}
@@ -1212,41 +1204,61 @@ public:
if (!_cells.empty()) {
fill_cells(column_kind::regular_column, _in_progress_row->cells());
}
return maybe_push_row(*std::exchange(_in_progress_row, {}));
if (_opened_range_tombstone) {
/* We have an opened range tombstone which means that the current row is spanned by that RT.
*/
auto ck = _in_progress_row->key();
bool was_non_full_key = clustering_key::make_full(*_schema, ck);
auto end_kind = was_non_full_key ? bound_kind::excl_end : bound_kind::incl_end;
assert(!_stored_tombstone);
_stored_tombstone = range_tombstone(std::move(_opened_range_tombstone->ck),
_opened_range_tombstone->kind,
ck,
end_kind,
_opened_range_tombstone->tomb);
_opened_range_tombstone->ck = std::move(ck);
_opened_range_tombstone->kind = was_non_full_key ? bound_kind::incl_start : bound_kind::excl_start;
}
_stored_row = *std::exchange(_in_progress_row, {});
return proceed(push_ready_fragments() == proceed::yes && !_reader->is_buffer_full());
}
return proceed(!_reader->is_buffer_full());
}
virtual proceed consume_partition_end() override {
sstlog.trace("mp_row_consumer_m {}: consume_partition_end()", this);
virtual void on_end_of_stream() override {
if (_opened_range_tombstone) {
if (!_mf_filter || _mf_filter->out_of_range()) {
throw sstables::malformed_sstable_exception("Unclosed range tombstone.");
}
auto range_end = _mf_filter->upper_bound();
auto range_end = _mf_filter->uppermost_bound();
position_in_partition::less_compare less(*_schema);
auto start_pos = position_in_partition_view(position_in_partition_view::range_tag_t{},
bound_view(_opened_range_tombstone->ck, _opened_range_tombstone->kind));
if (!less(range_end, start_pos)) {
auto end_bound = range_end.as_end_bound_view();
consume_range_tombstone_end(end_bound.prefix(), end_bound.kind(), _opened_range_tombstone->tomb);
auto rt = range_tombstone {std::move(_opened_range_tombstone->ck),
_opened_range_tombstone->kind,
end_bound.prefix(),
end_bound.kind(),
_opened_range_tombstone->tomb};
_opened_range_tombstone.reset();
_reader->push_mutation_fragment(std::move(rt));
}
}
_is_mutation_end = true;
_in_progress_row.reset();
_stored.emplace<std::monostate>();
_mf_filter.reset();
consume_partition_end();
}
virtual proceed consume_partition_end() override {
sstlog.trace("mp_row_consumer_m {}: consume_partition_end()", this);
reset_for_new_partition();
return proceed::no;
}
virtual void reset(sstables::indexable_element el) override {
sstlog.trace("mp_row_consumer_m {}: reset({})", this, static_cast<int>(el));
if (el == indexable_element::partition) {
_is_mutation_end = true;
_in_progress_row.reset();
_stored.emplace<std::monostate>();
_mf_filter.reset();
reset_for_new_partition();
} else {
_is_mutation_end = false;
}

View File

@@ -154,8 +154,8 @@ public:
return _walker.lower_bound();
}
position_in_partition_view upper_bound() const {
return _walker.upper_bound();
position_in_partition_view uppermost_bound() const {
return _walker.uppermost_bound();
}
};

View File

@@ -361,7 +361,7 @@ private:
});
}
future<> advance_context(std::optional<position_in_partition_view> pos) {
if (!pos) {
if (!pos || pos->is_before_all_fragments(*_schema)) {
return make_ready_future<>();
}
assert (_current_partition_key);

View File

@@ -162,7 +162,8 @@ public:
virtual proceed consume_row_start(const std::vector<temporary_buffer<char>>& ecp) = 0;
virtual proceed consume_row_marker_and_tombstone(const sstables::liveness_info& info, tombstone t) = 0;
virtual proceed consume_row_marker_and_tombstone(
const sstables::liveness_info& info, tombstone tomb, tombstone shadowable_tomb) = 0;
virtual proceed consume_static_row_start() = 0;
@@ -193,6 +194,8 @@ public:
virtual proceed consume_row_end() = 0;
virtual void on_end_of_stream() = 0;
// Called when the reader is fast-forwarded to the given element.
virtual void reset(sstables::indexable_element) = 0;
@@ -239,6 +242,7 @@ private:
} _state = state::ROW_START;
row_consumer& _consumer;
shared_sstable _sst;
temporary_buffer<char> _key;
temporary_buffer<char> _val;
@@ -268,6 +272,14 @@ public:
// leave only the unprocessed part. The caller must handle calling
// process() again, and/or refilling the buffer, as needed.
data_consumer::processing_result process_state(temporary_buffer<char>& data) {
try {
return do_process_state(data);
} catch (malformed_sstable_exception& exp) {
throw malformed_sstable_exception(exp.what(), _sst->get_filename());
}
}
private:
data_consumer::processing_result do_process_state(temporary_buffer<char>& data) {
#if 0
// Testing hack: call process() for tiny chunks separately, to verify
// that primitive types crossing input buffer are handled correctly.
@@ -506,13 +518,15 @@ public:
return row_consumer::proceed::yes;
}
public:
data_consume_rows_context(const schema&,
const shared_sstable&,
const shared_sstable& sst,
row_consumer& consumer,
input_stream<char>&& input, uint64_t start, uint64_t maxlen)
: continuous_data_consumer(std::move(input), start, maxlen)
, _consumer(consumer) {
, _consumer(consumer)
, _sst(sst) {
}
void verify_end_state() {
@@ -571,6 +585,9 @@ private:
ROW_BODY_DELETION,
ROW_BODY_DELETION_2,
ROW_BODY_DELETION_3,
ROW_BODY_SHADOWABLE_DELETION,
ROW_BODY_SHADOWABLE_DELETION_2,
ROW_BODY_SHADOWABLE_DELETION_3,
ROW_BODY_MARKER,
ROW_BODY_MISSING_COLUMNS,
ROW_BODY_MISSING_COLUMNS_2,
@@ -608,8 +625,10 @@ private:
} _state = state::PARTITION_START;
consumer_m& _consumer;
shared_sstable _sst;
const serialization_header& _header;
column_translation _column_translation;
const bool _has_shadowable_tombstones;
temporary_buffer<char> _pk;
@@ -620,16 +639,14 @@ private:
std::vector<temporary_buffer<char>> _row_key;
boost::iterator_range<std::vector<std::optional<column_id>>::const_iterator> _column_ids;
boost::iterator_range<std::vector<std::optional<uint32_t>>::const_iterator> _column_value_fix_lengths;
boost::iterator_range<std::vector<bool>::const_iterator> _column_is_collection;
boost::iterator_range<std::vector<bool>::const_iterator> _column_is_counter;
boost::iterator_range<std::vector<column_translation::column_info>::const_iterator> _columns;
boost::dynamic_bitset<uint64_t> _columns_selector;
uint64_t _missing_columns_to_read;
boost::iterator_range<std::vector<std::optional<uint32_t>>::const_iterator> _ck_column_value_fix_lengths;
tombstone _row_tombstone;
tombstone _row_shadowable_tombstone;
column_flags_m _column_flags{0};
api::timestamp_type _column_timestamp;
@@ -656,46 +673,34 @@ private:
*/
tombstone _left_range_tombstone;
tombstone _right_range_tombstone;
void setup_columns(const std::vector<std::optional<column_id>>& column_ids,
const std::vector<std::optional<uint32_t>>& column_value_fix_lengths,
const std::vector<bool>& column_is_collection,
const std::vector<bool>& column_is_counter) {
_column_ids = boost::make_iterator_range(column_ids);
_column_value_fix_lengths = boost::make_iterator_range(column_value_fix_lengths);
_column_is_collection = boost::make_iterator_range(column_is_collection);
_column_is_counter = boost::make_iterator_range(column_is_counter);
void setup_columns(const std::vector<column_translation::column_info>& columns) {
_columns = boost::make_iterator_range(columns);
}
bool is_current_column_present() {
return _columns_selector.test(_columns_selector.size() - _column_ids.size());
return _columns_selector.test(_columns_selector.size() - _columns.size());
}
void skip_absent_columns() {
size_t pos = _columns_selector.find_first();
if (pos == boost::dynamic_bitset<uint64_t>::npos) {
pos = _column_ids.size();
pos = _columns.size();
}
_column_ids.advance_begin(pos);
_column_value_fix_lengths.advance_begin(pos);
_column_is_collection.advance_begin(pos);
_column_is_counter.advance_begin(pos);
_columns.advance_begin(pos);
}
bool no_more_columns() { return _column_ids.empty(); }
bool no_more_columns() { return _columns.empty(); }
void move_to_next_column() {
size_t current_pos = _columns_selector.size() - _column_ids.size();
size_t current_pos = _columns_selector.size() - _columns.size();
size_t next_pos = _columns_selector.find_next(current_pos);
size_t jump_to_next = (next_pos == boost::dynamic_bitset<uint64_t>::npos) ? _column_ids.size()
size_t jump_to_next = (next_pos == boost::dynamic_bitset<uint64_t>::npos) ? _columns.size()
: next_pos - current_pos;
_column_ids.advance_begin(jump_to_next);
_column_value_fix_lengths.advance_begin(jump_to_next);
_column_is_collection.advance_begin(jump_to_next);
_column_is_counter.advance_begin(jump_to_next);
_columns.advance_begin(jump_to_next);
}
bool is_column_simple() { return !_column_is_collection.front(); }
bool is_column_counter() { return _column_is_counter.front(); }
bool is_column_simple() { return !_columns.front().is_collection; }
bool is_column_counter() { return _columns.front().is_counter; }
std::optional<column_id> get_column_id() {
return _column_ids.front();
return _columns.front().id;
}
std::optional<uint32_t> get_column_value_length() {
return _column_value_fix_lengths.front();
return _columns.front().value_length;
}
void setup_ck(const std::vector<std::optional<uint32_t>>& column_value_fix_lengths) {
_row_key.clear();
@@ -751,6 +756,14 @@ public:
}
data_consumer::processing_result process_state(temporary_buffer<char>& data) {
try {
return do_process_state(data);
} catch (malformed_sstable_exception& exp) {
throw malformed_sstable_exception(exp.what(), _sst->get_filename());
}
}
private:
data_consumer::processing_result do_process_state(temporary_buffer<char>& data) {
switch (_state) {
case state::PARTITION_START:
partition_start_label:
@@ -786,6 +799,7 @@ public:
flags_label:
_liveness.reset();
_row_tombstone = {};
_row_shadowable_tombstone = {};
if (read_8(data) != read_status::ready) {
_state = state::FLAGS_2;
break;
@@ -804,10 +818,7 @@ public:
} else if (!_flags.has_extended_flags()) {
_extended_flags = unfiltered_extended_flags_m(uint8_t{0u});
_state = state::CLUSTERING_ROW;
setup_columns(_column_translation.regular_columns(),
_column_translation.regular_column_value_fix_legths(),
_column_translation.regular_column_is_collection(),
_column_translation.regular_column_is_counter());
setup_columns(_column_translation.regular_columns());
_ck_size = _column_translation.clustering_column_value_fix_legths().size();
goto clustering_row_label;
}
@@ -817,12 +828,12 @@ public:
}
case state::EXTENDED_FLAGS:
_extended_flags = unfiltered_extended_flags_m(_u8);
if (_extended_flags.has_cassandra_shadowable_deletion()) {
throw std::runtime_error("SSTables with Cassandra-style shadowable deletion cannot be read by Scylla");
}
if (_extended_flags.is_static()) {
if (_is_first_unfiltered) {
setup_columns(_column_translation.static_columns(),
_column_translation.static_column_value_fix_legths(),
_column_translation.static_column_is_collection(),
_column_translation.static_column_is_counter());
setup_columns(_column_translation.static_columns());
_is_first_unfiltered = false;
_consumer.consume_static_row_start();
goto row_body_label;
@@ -830,10 +841,7 @@ public:
throw malformed_sstable_exception("static row should be the first unfiltered in a partition");
}
}
setup_columns(_column_translation.regular_columns(),
_column_translation.regular_column_value_fix_legths(),
_column_translation.regular_column_is_collection(),
_column_translation.regular_column_is_counter());
setup_columns(_column_translation.regular_columns());
_ck_size = _column_translation.clustering_column_value_fix_legths().size();
case state::CLUSTERING_ROW:
clustering_row_label:
@@ -943,8 +951,8 @@ public:
case state::ROW_BODY_DELETION:
row_body_deletion_label:
if (!_flags.has_deletion()) {
_state = state::ROW_BODY_MARKER;
goto row_body_marker_label;
_state = state::ROW_BODY_SHADOWABLE_DELETION;
goto row_body_shadowable_deletion_label;
}
if (read_unsigned_vint(data) != read_status::ready) {
_state = state::ROW_BODY_DELETION_2;
@@ -958,9 +966,32 @@ public:
}
case state::ROW_BODY_DELETION_3:
_row_tombstone.deletion_time = parse_expiry(_header, _u64);
case state::ROW_BODY_SHADOWABLE_DELETION:
row_body_shadowable_deletion_label:
if (_extended_flags.has_scylla_shadowable_deletion()) {
if (!_has_shadowable_tombstones) {
throw malformed_sstable_exception("Scylla shadowable tombstone flag is set but is not supported by this SSTable");
}
} else {
_state = state::ROW_BODY_MARKER;
goto row_body_marker_label;
}
if (read_unsigned_vint(data) != read_status::ready) {
_state = state::ROW_BODY_SHADOWABLE_DELETION_2;
break;
}
case state::ROW_BODY_SHADOWABLE_DELETION_2:
_row_shadowable_tombstone.timestamp = parse_timestamp(_header, _u64);
if (read_unsigned_vint(data) != read_status::ready) {
_state = state::ROW_BODY_SHADOWABLE_DELETION_3;
break;
}
case state::ROW_BODY_SHADOWABLE_DELETION_3:
_row_shadowable_tombstone.deletion_time = parse_expiry(_header, _u64);
case state::ROW_BODY_MARKER:
row_body_marker_label:
if (_consumer.consume_row_marker_and_tombstone(_liveness, std::move(_row_tombstone)) == consumer_m::proceed::no) {
if (_consumer.consume_row_marker_and_tombstone(
_liveness, std::move(_row_tombstone), std::move(_row_shadowable_tombstone)) == consumer_m::proceed::no) {
_state = state::ROW_BODY_MISSING_COLUMNS;
break;
}
@@ -972,7 +1003,7 @@ public:
}
goto row_body_missing_columns_2_label;
} else {
_columns_selector = boost::dynamic_bitset<uint64_t>(_column_ids.size());
_columns_selector = boost::dynamic_bitset<uint64_t>(_columns.size());
_columns_selector.set();
}
case state::COLUMN:
@@ -1110,17 +1141,17 @@ public:
case state::ROW_BODY_MISSING_COLUMNS_2:
row_body_missing_columns_2_label: {
uint64_t missing_column_bitmap_or_count = _u64;
if (_column_ids.size() < 64) {
if (_columns.size() < 64) {
_columns_selector.clear();
_columns_selector.append(missing_column_bitmap_or_count);
_columns_selector.flip();
_columns_selector.resize(_column_ids.size());
_columns_selector.resize(_columns.size());
skip_absent_columns();
goto column_label;
}
_columns_selector.resize(_column_ids.size());
if (_column_ids.size() - missing_column_bitmap_or_count < _column_ids.size() / 2) {
_missing_columns_to_read = _column_ids.size() - missing_column_bitmap_or_count;
_columns_selector.resize(_columns.size());
if (_columns.size() - missing_column_bitmap_or_count < _columns.size() / 2) {
_missing_columns_to_read = _columns.size() - missing_column_bitmap_or_count;
_columns_selector.reset();
} else {
_missing_columns_to_read = missing_column_bitmap_or_count;
@@ -1275,6 +1306,7 @@ public:
return row_consumer::proceed::yes;
}
public:
data_consume_rows_context_m(const schema& s,
const shared_sstable& sst,
@@ -1284,8 +1316,10 @@ public:
uint64_t maxlen)
: continuous_data_consumer(std::move(input), start, maxlen)
, _consumer(consumer)
, _sst(sst)
, _header(sst->get_serialization_header())
, _column_translation(sst->get_column_translation(s, _header))
, _has_shadowable_tombstones(sst->has_shadowable_tombstones())
, _liveness(_header)
{ }
@@ -1294,7 +1328,7 @@ public:
// filter and using a promoted index), we may be in FLAGS or FLAGS_2
// state instead of PARTITION_START.
if (_state == state::FLAGS || _state == state::FLAGS_2) {
_consumer.consume_partition_end();
_consumer.on_end_of_stream();
return;
}
if (_state != state::PARTITION_START || _prestate != prestate::NONE) {

View File

@@ -2632,10 +2632,14 @@ enum class row_extended_flags : uint8_t {
none = 0x00,
// Whether the encoded row is static. If there is no extended flag, the row is assumed not static.
is_static = 0x01,
// Whether the row deletion is shadowable. If there is no extended flag (or no row deletion)
// the deletion is assumed not shadowable.
// Cassandra-specific flag, indicates whether the row deletion is shadowable.
// This flag is deprecated in Origin - see CASSANDRA-11500.
has_shadowable_deletion = 0x02,
// Scylla never sets this flag and fails to read files that have it set.
has_shadowable_deletion_cassandra = 0x02,
// Scylla-specific flag, indicates whether the row deletion is shadowable.
// If set, the shadowable tombstone is written right after the row deletion.
// This is only used by Materialized Views that are not supposed to be exported.
has_shadowable_deletion_scylla = 0x80,
};
// A range tombstone marker (RT marker) represents a bound of a range tombstone
@@ -2680,6 +2684,18 @@ GCC6_CONCEPT(
};
)
static indexed_columns get_indexed_columns_partitioned_by_atomicity(schema::const_iterator_range_type columns) {
indexed_columns result;
result.reserve(columns.size());
for (const auto& col: columns) {
result.emplace_back(col);
}
boost::range::stable_partition(
result,
[](const std::reference_wrapper<const column_definition>& column) { return column.get().is_atomic();});
return result;
}
// Used for writing SSTables in 'mc' format.
class sstable_writer_m : public sstable_writer::writer_impl {
private:
@@ -2704,6 +2720,19 @@ private:
index_sampling_state _index_sampling_state;
range_tombstone_stream _range_tombstones;
// For static and regular columns, we write all simple columns first followed by collections
// These containers have columns partitioned by atomicity
const indexed_columns _static_columns;
const indexed_columns _regular_columns;
struct cdef_and_collection {
const column_definition* cdef;
std::reference_wrapper<const atomic_cell_or_collection> collection;
};
// Used to defer writing collections until all atomic cells are written
std::vector<cdef_and_collection> _collections;
std::optional<rt_marker> _end_open_marker;
struct clustering_info {
@@ -2791,7 +2820,7 @@ private:
void write_collection(file_writer& writer, const column_definition& cdef, collection_mutation_view collection,
const row_time_properties& properties, bool has_complex_deletion);
void write_cells(file_writer& writer, column_kind kind, const row& row_body, const row_time_properties& properties, bool has_complex_deletion = false);
void write_cells(file_writer& writer, column_kind kind, const row& row_body, const row_time_properties& properties, bool has_complex_deletion);
void write_row_body(file_writer& writer, const clustering_row& row, bool has_complex_deletion);
void write_static_row(const row& static_row);
@@ -2828,6 +2857,8 @@ public:
, _enc_stats(enc_stats)
, _shard(shard)
, _range_tombstones(_schema)
, _static_columns(get_indexed_columns_partitioned_by_atomicity(s.static_columns()))
, _regular_columns(get_indexed_columns_partitioned_by_atomicity(s.regular_columns()))
{
_sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
_sst.write_toc(_pc);
@@ -3150,39 +3181,43 @@ void sstable_writer_m::write_liveness_info(file_writer& writer, const row_marker
uint64_t timestamp = marker.timestamp();
_c_stats.update_timestamp(timestamp);
write_delta_timestamp(writer, timestamp);
if (marker.is_expiring()) {
auto ttl = marker.ttl().count();
auto ldt = marker.expiry().time_since_epoch().count();
auto write_expiring_liveness_info = [this, &writer] (uint32_t ttl, uint64_t ldt) {
_c_stats.update_ttl(ttl);
_c_stats.update_local_deletion_time(ldt);
write_delta_ttl(writer, ttl);
write_delta_local_deletion_time(writer, ldt);
};
if (!marker.is_live()) {
write_expiring_liveness_info(expired_liveness_ttl, marker.deletion_time().time_since_epoch().count());
} else if (marker.is_expiring()) {
write_expiring_liveness_info(marker.ttl().count(), marker.expiry().time_since_epoch().count());
}
}
void sstable_writer_m::write_collection(file_writer& writer, const column_definition& cdef,
collection_mutation_view collection, const row_time_properties& properties, bool has_complex_deletion) {
auto& ctype = *static_pointer_cast<const collection_type_impl>(cdef.type);
collection.data.with_linearized([&] (bytes_view collection_bv) {
auto mview = ctype.deserialize_mutation_form(collection_bv);
if (has_complex_deletion) {
auto dt = to_deletion_time(mview.tomb);
write_delta_deletion_time(writer, dt);
if (mview.tomb) {
_c_stats.update_timestamp(dt.marked_for_delete_at);
_c_stats.update_local_deletion_time(dt.local_deletion_time);
collection.data.with_linearized([&] (bytes_view collection_bv) {
auto mview = ctype.deserialize_mutation_form(collection_bv);
if (has_complex_deletion) {
auto dt = to_deletion_time(mview.tomb);
write_delta_deletion_time(writer, dt);
if (mview.tomb) {
_c_stats.update_timestamp(dt.marked_for_delete_at);
_c_stats.update_local_deletion_time(dt.local_deletion_time);
}
}
}
write_vint(writer, mview.cells.size());
if (!mview.cells.empty()) {
++_c_stats.column_count;
}
for (const auto& [cell_path, cell]: mview.cells) {
++_c_stats.cells_count;
write_cell(writer, cell, cdef, properties, cell_path);
}
});
write_vint(writer, mview.cells.size());
if (!mview.cells.empty()) {
++_c_stats.column_count;
}
for (const auto& [cell_path, cell]: mview.cells) {
++_c_stats.cells_count;
write_cell(writer, cell, cdef, properties, cell_path);
}
});
}
void sstable_writer_m::write_cells(file_writer& writer, column_kind kind, const row& row_body,
@@ -3191,11 +3226,11 @@ void sstable_writer_m::write_cells(file_writer& writer, column_kind kind, const
// This differs from Origin where all updated columns are tracked and the set of filled columns of a row
// is compared with the set of all columns filled in the memtable. So our encoding may be less optimal in some cases
// but still valid.
write_missing_columns(writer, _schema, row_body);
write_missing_columns(writer, kind == column_kind::static_column ? _static_columns : _regular_columns, row_body);
row_body.for_each_cell([this, &writer, kind, &properties, has_complex_deletion] (column_id id, const atomic_cell_or_collection& c) {
auto&& column_definition = _schema.column_at(kind, id);
if (!column_definition.is_atomic()) {
write_collection(writer, column_definition, c.as_collection_mutation(), properties, has_complex_deletion);
_collections.push_back({&column_definition, c});
return;
}
atomic_cell_view cell = c.as_atomic_cell(column_definition);
@@ -3203,15 +3238,26 @@ void sstable_writer_m::write_cells(file_writer& writer, column_kind kind, const
++_c_stats.column_count;
write_cell(writer, cell, column_definition, properties);
});
for (const auto& col: _collections) {
write_collection(writer, *col.cdef, col.collection.get().as_collection_mutation(), properties, has_complex_deletion);
}
_collections.clear();
}
void sstable_writer_m::write_row_body(file_writer& writer, const clustering_row& row, bool has_complex_deletion) {
write_liveness_info(writer, row.marker());
if (row.tomb()) {
auto dt = to_deletion_time(row.tomb().tomb());
auto write_tombstone_and_update_stats = [this, &writer] (const tombstone& t) {
auto dt = to_deletion_time(t);
_c_stats.update_timestamp(dt.marked_for_delete_at);
_c_stats.update_local_deletion_time(dt.local_deletion_time);
write_delta_deletion_time(writer, dt);
};
if (row.tomb().regular()) {
write_tombstone_and_update_stats(row.tomb().regular());
}
if (row.tomb().is_shadowable()) {
write_tombstone_and_update_stats(row.tomb().tomb());
}
row_time_properties properties;
if (!row.marker().is_missing()) {
@@ -3237,6 +3283,27 @@ uint64_t calculate_write_size(Func&& func) {
return written_size;
}
// Find if any collection in the row contains a collection-wide tombstone
static bool row_has_complex_deletion(const schema& s, const row& r, column_kind kind) {
bool result = false;
r.for_each_cell_until([&] (column_id id, const atomic_cell_or_collection& c) {
auto&& cdef = s.column_at(kind, id);
if (cdef.is_atomic()) {
return stop_iteration::no;
}
auto t = static_pointer_cast<const collection_type_impl>(cdef.type);
return c.as_collection_mutation().data.with_linearized([&] (bytes_view c_bv) {
auto mview = t->deserialize_mutation_form(c_bv);
if (mview.tomb) {
result = true;
}
return stop_iteration(static_cast<bool>(mview.tomb));
});
});
return result;
}
void sstable_writer_m::write_static_row(const row& static_row) {
assert(_schema.is_compound());
@@ -3246,13 +3313,16 @@ void sstable_writer_m::write_static_row(const row& static_row) {
if (static_row.size() == _schema.static_columns_count()) {
flags |= row_flags::has_all_columns;
}
bool has_complex_deletion = row_has_complex_deletion(_schema, static_row, column_kind::static_column);
if (has_complex_deletion) {
flags |= row_flags::has_complex_deletion;
}
write(_sst.get_version(), *_data_writer, flags);
write(_sst.get_version(), *_data_writer, row_extended_flags::is_static);
// Calculate the size of the row body
auto write_row = [this, &static_row] (file_writer& writer) {
write_cells(writer, column_kind::static_column, static_row, row_time_properties{});
auto write_row = [this, &static_row, has_complex_deletion] (file_writer& writer) {
write_cells(writer, column_kind::static_column, static_row, row_time_properties{}, has_complex_deletion);
};
uint64_t row_body_size = calculate_write_size(write_row) + unsigned_vint::serialized_size(0);
@@ -3273,48 +3343,29 @@ stop_iteration sstable_writer_m::consume(static_row&& sr) {
return stop_iteration::no;
}
// Find if any collection in the row contains a collection-wide tombstone
static bool row_has_complex_deletion(const schema& s, const row& r) {
bool result = false;
r.for_each_cell_until([&] (column_id id, const atomic_cell_or_collection& c) {
auto&& cdef = s.column_at(column_kind::regular_column, id);
if (cdef.is_atomic()) {
return stop_iteration::no;
}
auto t = static_pointer_cast<const collection_type_impl>(cdef.type);
return c.as_collection_mutation().data.with_linearized([&] (bytes_view c_bv) {
auto mview = t->deserialize_mutation_form(c_bv);
if (mview.tomb) {
result = true;
}
return stop_iteration(static_cast<bool>(mview.tomb));
});
});
return result;
}
void sstable_writer_m::write_clustered(const clustering_row& clustered_row, uint64_t prev_row_size) {
row_flags flags = row_flags::none;
row_extended_flags ext_flags = row_extended_flags::none;
if (clustered_row.marker().is_live()) {
const row_marker& marker = clustered_row.marker();
if (!marker.is_missing()) {
flags |= row_flags::has_timestamp;
if (clustered_row.marker().is_expiring()) {
if (!marker.is_live() || marker.is_expiring()) {
flags |= row_flags::has_ttl;
}
}
if (clustered_row.tomb().tomb()) {
if (clustered_row.tomb().regular()) {
flags |= row_flags::has_deletion;
if (clustered_row.tomb().tomb() && clustered_row.tomb().is_shadowable()) {
ext_flags = row_extended_flags::has_shadowable_deletion;
}
}
if (clustered_row.tomb().is_shadowable()) {
flags |= row_flags::extension_flag;
ext_flags = row_extended_flags::has_shadowable_deletion_scylla;
}
if (clustered_row.cells().size() == _schema.regular_columns_count()) {
flags |= row_flags::has_all_columns;
}
bool has_complex_deletion = row_has_complex_deletion(_schema, clustered_row.cells());
bool has_complex_deletion = row_has_complex_deletion(_schema, clustered_row.cells(), column_kind::regular_column);
if (has_complex_deletion) {
flags |= row_flags::has_complex_deletion;
}
@@ -3689,7 +3740,8 @@ const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_typ
format_types format, sstring component) {
static std::unordered_map<version_types, const char*, enum_hash<version_types>> fmtmap = {
{ sstable::version_types::ka, "{0}-{1}-{2}-{3}-{5}" },
{ sstable::version_types::la, "{2}-{3}-{4}-{5}" }
{ sstable::version_types::la, "{2}-{3}-{4}-{5}" },
{ sstable::version_types::mc, "{2}-{3}-{4}-{5}" }
};
return dir + "/" + seastar::format(fmtmap[version], ks, cf, _version_string.at(version), to_sstring(generation), _format_string.at(format), component);


@@ -611,6 +611,10 @@ public:
return _schema->is_compound() || !has_scylla_component() || _components->scylla_metadata->has_feature(sstable_feature::NonCompoundRangeTombstones);
}
bool has_shadowable_tombstones() const {
return has_scylla_component() && _components->scylla_metadata->has_feature(sstable_feature::ShadowableTombstones);
}
bool has_correct_max_deletion_time() const {
return has_scylla_component();
}


@@ -442,7 +442,8 @@ struct sharding_metadata {
enum sstable_feature : uint8_t {
NonCompoundPIEntries = 0, // See #2993
NonCompoundRangeTombstones = 1, // See #2986
End = 2
ShadowableTombstones = 2, // See #3885
End = 4,
};
// Scylla-specific features enabled for a particular sstable.
@@ -529,6 +530,13 @@ struct hash<sstables::metadata_type> : enum_hash<sstables::metadata_type> {};
namespace sstables {
// Special value to represent expired (i.e., 'dead') liveness info
constexpr static uint32_t expired_liveness_ttl = std::numeric_limits<uint32_t>::max();
inline bool is_expired_liveness_ttl(uint32_t ttl) {
return ttl == expired_liveness_ttl;
}
struct statistics {
disk_hash<uint32_t, metadata_type, uint32_t> hash;
std::unordered_map<metadata_type, std::unique_ptr<metadata>> contents;
@@ -595,6 +603,12 @@ public:
class unfiltered_extended_flags_m final {
static const uint8_t IS_STATIC = 0x01u;
// This flag is used by Cassandra but not supported by Scylla because
// Scylla's representation of shadowable tombstones is different.
// We only check it on reading and error out if set but never set ourselves.
static const uint8_t HAS_CASSANDRA_SHADOWABLE_DELETION = 0x02u;
// This flag is Scylla-specific and used for writing shadowable tombstones.
static const uint8_t HAS_SCYLLA_SHADOWABLE_DELETION = 0x80u;
uint8_t _flags;
bool check_flag(const uint8_t flag) const {
return (_flags & flag) != 0u;
@@ -604,6 +618,12 @@ public:
bool is_static() const {
return check_flag(IS_STATIC);
}
bool has_cassandra_shadowable_deletion() const {
return check_flag(HAS_CASSANDRA_SHADOWABLE_DELETION);
}
bool has_scylla_shadowable_deletion() const {
return check_flag(HAS_SCYLLA_SHADOWABLE_DELETION);
}
};
class column_flags_m final {


@@ -205,7 +205,7 @@ void stream_session::init_messaging_service_handler() {
};
distribute_reader_and_consume_on_shards(s, dht::global_partitioner(),
make_flat_mutation_reader<generating_reader>(s, std::move(get_next_mutation_fragment)),
[cf_id, plan_id, s, estimated_partitions] (flat_mutation_reader reader) {
[cf_id, plan_id, estimated_partitions] (flat_mutation_reader reader) {
auto& cf = service::get_local_storage_service().db().local().find_column_family(cf_id);
sstables::sstable_writer_config sst_cfg;
sst_cfg.large_partition_handler = cf.get_large_partition_handler();


@@ -40,6 +40,7 @@
#include "db/commitlog/commitlog.hh"
#include "db/commitlog/rp_set.hh"
#include "log.hh"
#include "service/priority_manager.hh"
using namespace db;
@@ -290,7 +291,7 @@ SEASTAR_TEST_CASE(test_commitlog_delete_when_over_disk_limit) {
SEASTAR_TEST_CASE(test_commitlog_reader){
static auto count_mutations_in_segment = [] (sstring path) -> future<size_t> {
auto count = make_lw_shared<size_t>(0);
return db::commitlog::read_log_file(path, [count](temporary_buffer<char> buf, db::replay_position rp) {
return db::commitlog::read_log_file(path, service::get_local_commitlog_priority(), [count](temporary_buffer<char> buf, db::replay_position rp) {
sstring str(buf.get(), buf.size());
BOOST_CHECK_EQUAL(str, "hej bubba cow");
(*count)++;
@@ -392,7 +393,7 @@ SEASTAR_TEST_CASE(test_commitlog_entry_corruption){
BOOST_REQUIRE(!segments.empty());
auto seg = segments[0];
return corrupt_segment(seg, rps->at(1).pos + 4, 0x451234ab).then([seg, rps, &log] {
return db::commitlog::read_log_file(seg, [rps](temporary_buffer<char> buf, db::replay_position rp) {
return db::commitlog::read_log_file(seg, service::get_local_commitlog_priority(), [rps](temporary_buffer<char> buf, db::replay_position rp) {
BOOST_CHECK_EQUAL(rp, rps->at(0));
return make_ready_future<>();
}).then([](auto s) {
@@ -435,7 +436,7 @@ SEASTAR_TEST_CASE(test_commitlog_chunk_corruption){
BOOST_REQUIRE(!segments.empty());
auto seg = segments[0];
return corrupt_segment(seg, rps->at(0).pos - 4, 0x451234ab).then([seg, rps, &log] {
return db::commitlog::read_log_file(seg, [rps](temporary_buffer<char> buf, db::replay_position rp) {
return db::commitlog::read_log_file(seg, service::get_local_commitlog_priority(), [rps](temporary_buffer<char> buf, db::replay_position rp) {
BOOST_FAIL("Should not reach");
return make_ready_future<>();
}).then([](auto s) {
@@ -477,7 +478,7 @@ SEASTAR_TEST_CASE(test_commitlog_reader_produce_exception){
auto segments = log.get_active_segment_names();
BOOST_REQUIRE(!segments.empty());
auto seg = segments[0];
return db::commitlog::read_log_file(seg, [](temporary_buffer<char> buf, db::replay_position rp) {
return db::commitlog::read_log_file(seg, service::get_local_commitlog_priority(), [](temporary_buffer<char> buf, db::replay_position rp) {
return make_exception_future(std::runtime_error("I am in a throwing mode"));
}).then([](auto s) {
return do_with(std::move(s), [](auto& s) {


@@ -143,3 +143,11 @@ rows_assertions result_msg_assertions::is_rows() {
result_msg_assertions assert_that(shared_ptr<cql_transport::messages::result_message> msg) {
return result_msg_assertions(msg);
}
rows_assertions rows_assertions::with_serialized_columns_count(size_t columns_count) {
size_t serialized_column_count = _rows->rs().get_metadata().column_count();
if (serialized_column_count != columns_count) {
fail(sprint("Expected %d serialized column(s) but got %d", columns_count, serialized_column_count));
}
return {*this};
}


@@ -40,6 +40,7 @@ public:
rows_assertions with_rows(std::initializer_list<std::initializer_list<bytes_opt>> rows);
// Verifies that the result has the following rows and only those rows.
rows_assertions with_rows_ignore_order(std::vector<std::vector<bytes_opt>> rows);
rows_assertions with_serialized_columns_count(size_t columns_count);
};
class result_msg_assertions {


@@ -4040,3 +4040,81 @@ SEASTAR_TEST_CASE(test_select_with_mixed_order_table) {
}
});
}
SEASTAR_TEST_CASE(test_filtering) {
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE cf (k int, v int,m int,n int,o int,p int static, PRIMARY KEY ((k,v),m,n));").get();
e.execute_cql(
"BEGIN UNLOGGED BATCH \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (1, 1, 1, 1, 1 ,1 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (2, 1, 2, 1, 2 ,2 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (3, 1, 3, 1, 3 ,3 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (4, 2, 1, 2, 4 ,4 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (5, 2, 2, 2, 5 ,5 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (6, 2, 3, 2, 6 ,6 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (7, 3, 1, 3, 7 ,7 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (8, 3, 2, 3, 8 ,8 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (9, 3, 3, 3, 9 ,9 ); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (10, 4, 1, 4,10,10); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (11, 4, 2, 4,11,11); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (12, 5, 3, 5,12,12); \n"
"INSERT INTO cf (k, v, m, n, o, p) VALUES (12, 5, 4, 5,13,13); \n"
"APPLY BATCH;"
).get();
// Note the with_serialized_columns_count() check before the set comparison.
// Since we are dealing with the result set before it is serialized to the client,
// it contains an extra column that is used for the filtering. This column will
// not be present in the response to the client, which is exactly what
// with_serialized_columns_count() verifies.
// test filtering on partition keys
{
auto msg = e.execute_cql("SELECT k FROM cf WHERE v=3 ALLOW FILTERING;").get0();
assert_that(msg).is_rows().with_serialized_columns_count(1).with_rows_ignore_order({
{ int32_type->decompose(7), int32_type->decompose(3)},
{ int32_type->decompose(8), int32_type->decompose(3) },
{ int32_type->decompose(9), int32_type->decompose(3) },
});
}
// test filtering on clustering keys
{
auto msg = e.execute_cql("SELECT k FROM cf WHERE n=4 ALLOW FILTERING;").get0();
assert_that(msg).is_rows().with_serialized_columns_count(1).with_rows_ignore_order({
{ int32_type->decompose(10), int32_type->decompose(4) },
{ int32_type->decompose(11), int32_type->decompose(4) },
});
}
//test filtering on regular columns
{
auto msg = e.execute_cql("SELECT k FROM cf WHERE o>7 ALLOW FILTERING;").get0();
assert_that(msg).is_rows().with_serialized_columns_count(1).with_rows_ignore_order({
{ int32_type->decompose(8), int32_type->decompose(8) },
{ int32_type->decompose(9), int32_type->decompose(9) },
{ int32_type->decompose(10), int32_type->decompose(10) },
{ int32_type->decompose(11), int32_type->decompose(11) },
{ int32_type->decompose(12), int32_type->decompose(12) },
{ int32_type->decompose(12), int32_type->decompose(13) },
});
}
//test filtering on static columns
{
auto msg = e.execute_cql("SELECT k FROM cf WHERE p>=10 AND p<=12 ALLOW FILTERING;").get0();
assert_that(msg).is_rows().with_serialized_columns_count(1).with_rows_ignore_order({
{ int32_type->decompose(10), int32_type->decompose(10) },
{ int32_type->decompose(11), int32_type->decompose(11) },
});
}
//test filtering with count
{
auto msg = e.execute_cql("SELECT COUNT(k) FROM cf WHERE n>3 ALLOW FILTERING;").get0();
assert_that(msg).is_rows().with_serialized_columns_count(1).with_size(1).with_rows_ignore_order({
{ long_type->decompose(4L), int32_type->decompose(4) },
});
}
});
}


@@ -700,6 +700,46 @@ static void test_date_tiered_clustering_slicing(populate_fn populate) {
}
}
static void test_dropped_column_handling(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
schema_ptr write_schema = schema_builder("ks", "cf")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("val1", int32_type)
.with_column("val2", int32_type)
.build();
schema_ptr read_schema = schema_builder("ks", "cf")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("val2", int32_type)
.build();
auto val2_cdef = read_schema->get_column_definition(to_bytes("val2"));
auto to_ck = [write_schema] (int ck) {
return clustering_key::from_single_value(*write_schema, int32_type->decompose(ck));
};
auto bytes = int32_type->decompose(int32_t(0));
auto pk = partition_key::from_single_value(*write_schema, bytes);
auto dk = dht::global_partitioner().decorate_key(*write_schema, pk);
mutation partition(write_schema, pk);
auto add_row = [&partition, &to_ck, write_schema] (int ck, int v1, int v2) {
static constexpr api::timestamp_type write_timestamp = 1525385507816568;
clustering_key ckey = to_ck(ck);
partition.partition().apply_insert(*write_schema, ckey, write_timestamp);
partition.set_cell(ckey, "val1", data_value{v1}, write_timestamp);
partition.set_cell(ckey, "val2", data_value{v2}, write_timestamp);
};
add_row(1, 101, 201);
add_row(2, 102, 202);
add_row(3, 103, 203);
assert_that(populate(write_schema, {partition}).make_reader(read_schema))
.produces_partition_start(dk)
.produces_row(to_ck(1), {{val2_cdef, int32_type->decompose(int32_t(201))}})
.produces_row(to_ck(2), {{val2_cdef, int32_type->decompose(int32_t(202))}})
.produces_row(to_ck(3), {{val2_cdef, int32_type->decompose(int32_t(203))}})
.produces_partition_end()
.produces_end_of_stream();
}
static void test_clustering_slices(populate_fn populate) {
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
auto s = schema_builder("ks", "cf")
@@ -807,16 +847,14 @@ static void test_clustering_slices(populate_fn populate) {
.produces_row_with_key(ck2)
.produces_end_of_stream();
}
{
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range::make_singular(make_ck(1)))
.build();
assert_that(ds.make_reader(s, pr, slice))
.produces(row1 + row2 + row3 + row4 + row5 + del_1)
.produces(row1 + row2 + row3 + row4 + row5 + del_1, slice.row_ranges(*s, pk.key()))
.produces_end_of_stream();
}
{
auto slice = partition_slice_builder(*s)
.with_range(query::clustering_range::make_singular(make_ck(2)))
@@ -831,7 +869,7 @@ static void test_clustering_slices(populate_fn populate) {
.with_range(query::clustering_range::make_singular(make_ck(1, 2)))
.build();
assert_that(ds.make_reader(s, pr, slice))
.produces(row3 + row4 + del_1)
.produces(row3 + row4 + del_1, slice.row_ranges(*s, pk.key()))
.produces_end_of_stream();
}
@@ -840,7 +878,7 @@ static void test_clustering_slices(populate_fn populate) {
.with_range(query::clustering_range::make_singular(make_ck(3)))
.build();
assert_that(ds.make_reader(s, pr, slice))
.produces(row8 + del_3)
.produces(row8 + del_3, slice.row_ranges(*s, pk.key()))
.produces_end_of_stream();
}
@@ -1064,6 +1102,7 @@ void run_mutation_reader_tests(populate_fn populate) {
test_range_queries(populate);
test_query_only_static_row(populate);
test_query_no_clustering_ranges_no_static_columns(populate);
test_dropped_column_handling(populate);
}
void test_next_partition(populate_fn populate) {


@@ -30,6 +30,7 @@
#include <map>
#include <iostream>
#include <sstream>
#include <boost/range/algorithm/adjacent_find.hpp>
static logging::logger nlogger("NetworkTopologyStrategyLogger");
@@ -52,6 +53,27 @@ void print_natural_endpoints(double point, const std::vector<inet_address> v) {
nlogger.debug("{}", strm.str());
}
#ifndef SEASTAR_DEBUG
static void verify_sorted(const dht::token_range_vector& trv) {
auto not_strictly_before = [] (const dht::token_range a, const dht::token_range b) {
return !b.start()
|| !a.end()
|| a.end()->value() > b.start()->value()
|| (a.end()->value() == b.start()->value() && a.end()->is_inclusive() && b.start()->is_inclusive());
};
BOOST_CHECK(boost::adjacent_find(trv, not_strictly_before) == trv.end());
}
#endif
static void check_ranges_are_sorted(abstract_replication_strategy* ars, gms::inet_address ep) {
// Too slow in debug mode
#ifndef SEASTAR_DEBUG
verify_sorted(ars->get_ranges(ep));
verify_sorted(ars->get_primary_ranges(ep));
verify_sorted(ars->get_primary_ranges_within_dc(ep));
#endif
}
void strategy_sanity_check(
abstract_replication_strategy* ars_ptr,
const std::map<sstring, sstring>& options) {
@@ -150,6 +172,7 @@ void full_ring_check(const std::vector<ring_point>& ring_points,
auto endpoints2 = ars_ptr->get_natural_endpoints(t2);
endpoints_check(ars_ptr, endpoints2);
check_ranges_are_sorted(ars_ptr, rp.host);
BOOST_CHECK(cache_hit_count + 1 == ars_ptr->get_cache_hits_count());
BOOST_CHECK(endpoints1 == endpoints2);
}


@@ -342,7 +342,7 @@ SEASTAR_TEST_CASE(test_index_with_partition_key) {
// Queries that restrict only a part of the partition key and an index require filtering, because we need to compute token
// in order to create a valid index view query
BOOST_REQUIRE_THROW(e.execute_cql("SELECT * from tab WHERE a = 1 and e = 5"), exceptions::invalid_request_exception);
BOOST_REQUIRE_THROW(e.execute_cql("SELECT * from tab WHERE a = 1 and e = 5").get(), exceptions::invalid_request_exception);
// Indexed queries with full primary key are allowed without filtering as well
eventually([&] {
@@ -362,7 +362,7 @@ SEASTAR_TEST_CASE(test_index_with_partition_key) {
});
// This query needs filtering, because clustering key restrictions do not form a prefix
BOOST_REQUIRE_THROW(e.execute_cql("SELECT * from tab WHERE a = 1 and b = 2 and d = 4 and e = 5"), exceptions::invalid_request_exception);
BOOST_REQUIRE_THROW(e.execute_cql("SELECT * from tab WHERE a = 1 and b = 2 and d = 4 and e = 5").get(), exceptions::invalid_request_exception);
eventually([&] {
auto res = e.execute_cql("SELECT * from tab WHERE a = 1 and b = 2 and d = 4 and e = 5 ALLOW FILTERING").get0();
assert_that(res).is_rows().with_rows({
@@ -461,3 +461,45 @@ SEASTAR_TEST_CASE(test_index_on_pk_ck_with_paging) {
});
});
}
SEASTAR_TEST_CASE(test_secondary_index_collections) {
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("create table t (p int primary key, s1 set<int>, m1 map<int, text>, l1 list<int>, s2 frozen<set<int>>, m2 frozen<map<int, text>>, l2 frozen<list<int>>)").get();
//NOTICE(sarna): should be lifted after issue #2962 is resolved
BOOST_REQUIRE_THROW(e.execute_cql("create index on t(s1)").get(), exceptions::invalid_request_exception);
BOOST_REQUIRE_THROW(e.execute_cql("create index on t(m1)").get(), exceptions::invalid_request_exception);
BOOST_REQUIRE_THROW(e.execute_cql("create index on t(l1)").get(), exceptions::invalid_request_exception);
e.execute_cql("create index on t(FULL(s2))").get();
e.execute_cql("create index on t(FULL(m2))").get();
e.execute_cql("create index on t(FULL(l2))").get();
e.execute_cql("insert into t(p, s2, m2, l2) values (1, {1}, {1: 'one', 2: 'two'}, [2])").get();
e.execute_cql("insert into t(p, s2, m2, l2) values (2, {2}, {3: 'three'}, [3, 4, 5])").get();
e.execute_cql("insert into t(p, s2, m2, l2) values (3, {3}, {5: 'five', 7: 'seven'}, [7, 8, 9])").get();
auto set_type = set_type_impl::get_instance(int32_type, true);
auto map_type = map_type_impl::get_instance(int32_type, utf8_type, true);
auto list_type = list_type_impl::get_instance(int32_type, true);
eventually([&] {
auto res = e.execute_cql("SELECT p from t where s2 = {2}").get0();
assert_that(res).is_rows().with_rows({{{int32_type->decompose(2)}}});
res = e.execute_cql("SELECT p from t where s2 = {}").get0();
assert_that(res).is_rows().with_size(0);
});
eventually([&] {
auto res = e.execute_cql("SELECT p from t where m2 = {5: 'five', 7: 'seven'}").get0();
assert_that(res).is_rows().with_rows({{{int32_type->decompose(3)}}});
res = e.execute_cql("SELECT p from t where m2 = {1: 'one', 2: 'three'}").get0();
assert_that(res).is_rows().with_size(0);
});
eventually([&] {
auto res = e.execute_cql("SELECT p from t where l2 = [2]").get0();
assert_that(res).is_rows().with_rows({{{int32_type->decompose(1)}}});
res = e.execute_cql("SELECT p from t where l2 = [3]").get0();
assert_that(res).is_rows().with_size(0);
});
});
}


@@ -4463,3 +4463,290 @@ SEASTAR_THREAD_TEST_CASE(test_complex_column_zero_subcolumns_read) {
r.produces_end_of_stream();
}
SEASTAR_THREAD_TEST_CASE(test_uncompressed_read_two_rows_fast_forwarding) {
auto abj = defer([] { await_background_jobs().get(); });
// The following tests run on files in tests/sstables/3.x/uncompressed/read_two_rows_fast_forwarding
// They were created using the following CQL statements:
//
// CREATE TABLE two_rows_fast_forwarding (pk int, ck int, rc int, PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
// INSERT INTO two_rows_fast_forwarding (pk, ck, rc) VALUES (0, 7, 7);
// INSERT INTO two_rows_fast_forwarding (pk, ck, rc) VALUES (0, 8, 8);
static const sstring path = "tests/sstables/3.x/uncompressed/read_two_rows_fast_forwarding";
static thread_local const schema_ptr s =
schema_builder("test_ks", "two_rows_fast_forwarding")
.with_column("pk", int32_type, column_kind::partition_key)
.with_column("ck", int32_type, column_kind::clustering_key)
.with_column("rc", int32_type)
.build();
sstable_assertions sst(s, path);
sst.load();
auto to_pkey = [&] (int key) {
auto bytes = int32_type->decompose(int32_t(key));
auto pk = partition_key::from_single_value(*s, bytes);
return dht::global_partitioner().decorate_key(*s, pk);
};
auto to_ckey = [&] (int key) {
auto bytes = int32_type->decompose(int32_t(key));
return clustering_key::from_single_value(*s, bytes);
};
auto rc_cdef = s->get_column_definition(to_bytes("rc"));
BOOST_REQUIRE(rc_cdef);
auto to_expected = [rc_cdef] (int val) {
return std::vector<flat_reader_assertions::expected_column>{{rc_cdef, int32_type->decompose(int32_t(val))}};
};
auto r = assert_that(sst.read_range_rows_flat(query::full_partition_range,
s->full_slice(),
default_priority_class(),
no_resource_tracking(),
streamed_mutation::forwarding::yes));
r.produces_partition_start(to_pkey(0))
.produces_end_of_stream();
r.fast_forward_to(to_ckey(2), to_ckey(3));
r.produces_end_of_stream();
r.fast_forward_to(to_ckey(4), to_ckey(5));
r.produces_end_of_stream();
r.fast_forward_to(to_ckey(6), to_ckey(9));
r.produces_row(to_ckey(7), to_expected(7))
.produces_row(to_ckey(8), to_expected(8))
.produces_end_of_stream();
}
SEASTAR_THREAD_TEST_CASE(test_dead_row_marker) {
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "dead_row_marker";
// CREATE TABLE dead_row_marker (pk int, ck int, st int static, rc int , PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
schema_builder builder("sst3", table_name);
builder.with_column("pk", int32_type, column_kind::partition_key);
builder.with_column("ck", int32_type, column_kind::clustering_key);
builder.with_column("st", int32_type, column_kind::static_column);
builder.with_column("rc", int32_type);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
auto key = partition_key::from_deeply_exploded(*s, { 1 });
mutation mut{s, key};
mut.set_static_cell("st", data_value{1135}, write_timestamp);
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { 2 });
auto& clustered_row = mut.partition().clustered_row(*s, ckey);
clustered_row.apply(row_marker{tombstone{write_timestamp, write_time_point}});
mut.set_cell(ckey, "rc", data_value{777}, write_timestamp);
mt->apply(mut);
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
}
SEASTAR_THREAD_TEST_CASE(test_shadowable_deletion) {
/* The content of the created SSTables should match that of
 * an MV populated with the following queries:
*
* CREATE TABLE cf (p int PRIMARY KEY, v int) WITH compression = {'sstable_compression': ''};
* CREATE MATERIALIZED VIEW mv AS SELECT * FROM cf WHERE p IS NOT NULL AND v IS NOT NULL PRIMARY KEY (v, p);
* INSERT INTO cf (p, v) VALUES (1, 0);
* UPDATE cf SET v = 1 WHERE p = 1;
*/
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "shadowable_deletion";
schema_builder builder("sst3", table_name);
builder.with_column("pk", int32_type, column_kind::partition_key);
builder.with_column("ck", int32_type, column_kind::clustering_key);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { 1 });
mutation mut1{s, partition_key::from_deeply_exploded(*s, {1})};
{
auto& clustered_row = mut1.partition().clustered_row(*s, ckey);
clustered_row.apply(row_marker{api::timestamp_type{1540230880370422}});
mt->apply(mut1);
}
mutation mut2{s, partition_key::from_deeply_exploded(*s, {0})};
{
auto& clustered_row = mut2.partition().clustered_row(*s, ckey);
api::timestamp_type ts {1540230874370065};
gc_clock::time_point tp {gc_clock::duration(1540230880)};
clustered_row.apply(row_marker{api::timestamp_type{ts}});
clustered_row.apply(shadowable_tombstone(ts, tp));
mt->apply(mut2);
}
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
validate_read(s, tmp.path, {mut1, mut2});
}
SEASTAR_THREAD_TEST_CASE(test_regular_and_shadowable_deletion) {
/* The content of the created SSTables should match that of
 * an MV populated with the following queries:
*
* CREATE TABLE cf (p INT, c INT, v INT, PRIMARY KEY (p, c));
* CREATE MATERIALIZED VIEW mvf AS SELECT * FROM cf WHERE p IS NOT NULL AND c IS NOT NULL AND v IS NOT NULL PRIMARY KEY (v, p, c);
* INSERT INTO cf (p, c, v) VALUES (1, 1, 0) USING TIMESTAMP 1540230874370001;
* DELETE FROM cf USING TIMESTAMP 1540230874370001 WHERE p = 1 AND c = 1;
* UPDATE cf USING TIMESTAMP 1540230874370002 SET v = 0 WHERE p = 1 AND c = 1;
* UPDATE cf USING TIMESTAMP 1540230874370003 SET v = 1 WHERE p = 1 AND c = 1;
*/
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "regular_and_shadowable_deletion";
schema_builder builder("sst3", table_name);
builder.with_column("v", int32_type, column_kind::partition_key);
builder.with_column("p", int32_type, column_kind::clustering_key);
builder.with_column("c", int32_type, column_kind::clustering_key);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
auto make_tombstone = [] (int64_t ts, int32_t tp) {
return tombstone{api::timestamp_type{ts}, gc_clock::time_point(gc_clock::duration(tp))};
};
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { {1}, {1} });
mutation mut1{s, partition_key::from_deeply_exploded(*s, {1})};
{
auto& clustered_row = mut1.partition().clustered_row(*s, ckey);
clustered_row.apply(row_marker{api::timestamp_type{1540230874370003}});
clustered_row.apply(make_tombstone(1540230874370001, 1540251167));
mt->apply(mut1);
}
mutation mut2{s, partition_key::from_deeply_exploded(*s, {0})};
{
auto& clustered_row = mut2.partition().clustered_row(*s, ckey);
clustered_row.apply(row_marker{api::timestamp_type{1540230874370002}});
clustered_row.apply(make_tombstone(1540230874370001, 1540251167));
clustered_row.apply(shadowable_tombstone(make_tombstone(1540230874370002, 1540251216)));
mt->apply(mut2);
}
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
validate_read(s, tmp.path, {mut1, mut2});
}
SEASTAR_THREAD_TEST_CASE(test_write_static_row_with_missing_columns) {
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "static_row_with_missing_columns";
// CREATE TABLE static_row (pk int, ck int, st1 int static, st2 int static, rc int, PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
schema_builder builder("sst3", table_name);
builder.with_column("pk", int32_type, column_kind::partition_key);
builder.with_column("ck", int32_type, column_kind::clustering_key);
builder.with_column("st1", int32_type, column_kind::static_column);
builder.with_column("st2", int32_type, column_kind::static_column);
builder.with_column("rc", int32_type);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
// INSERT INTO static_row (pk, ck, st1, rc) VALUES (0, 1, 2, 3);
auto key = partition_key::from_deeply_exploded(*s, {0});
mutation mut{s, key};
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { 1 });
mut.partition().apply_insert(*s, ckey, write_timestamp);
mut.set_static_cell("st1", data_value{2}, write_timestamp);
mut.set_cell(ckey, "rc", data_value{3}, write_timestamp);
mt->apply(mut);
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
validate_read(s, tmp.path, {mut});
}
SEASTAR_THREAD_TEST_CASE(test_write_interleaved_atomic_and_collection_columns) {
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "interleaved_atomic_and_collection_columns";
// CREATE TABLE interleaved_atomic_and_collection_columns ( pk int, ck int, rc1 int, rc2 set<int>, rc3 int, rc4 set<int>,
// rc5 int, rc6 set<int>, PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
auto set_of_ints_type = set_type_impl::get_instance(int32_type, true);
schema_builder builder("sst3", table_name);
builder.with_column("pk", int32_type, column_kind::partition_key);
builder.with_column("ck", int32_type, column_kind::clustering_key);
builder.with_column("rc1", int32_type);
builder.with_column("rc2", set_of_ints_type);
builder.with_column("rc3", int32_type);
builder.with_column("rc4", set_of_ints_type);
builder.with_column("rc5", int32_type);
builder.with_column("rc6", set_of_ints_type);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
// INSERT INTO interleaved_atomic_and_collection_columns (pk, ck, rc1, rc4, rc5)
// VALUES (0, 1, 2, {3, 4}, 5) USING TIMESTAMP 1525385507816568;
auto key = partition_key::from_deeply_exploded(*s, {0});
mutation mut{s, key};
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { 1 });
mut.partition().apply_insert(*s, ckey, write_timestamp);
mut.set_cell(ckey, "rc1", data_value{2}, write_timestamp);
set_type_impl::mutation set_values;
set_values.tomb = tombstone {write_timestamp - 1, write_time_point};
set_values.cells.emplace_back(int32_type->decompose(3), atomic_cell::make_live(*bytes_type, write_timestamp, bytes_view{}));
set_values.cells.emplace_back(int32_type->decompose(4), atomic_cell::make_live(*bytes_type, write_timestamp, bytes_view{}));
mut.set_clustered_cell(ckey, *s->get_column_definition("rc4"), set_of_ints_type->serialize_mutation_form(set_values));
mut.set_cell(ckey, "rc5", data_value{5}, write_timestamp);
mt->apply(mut);
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
validate_read(s, tmp.path, {mut});
}
SEASTAR_THREAD_TEST_CASE(test_write_static_interleaved_atomic_and_collection_columns) {
auto abj = defer([] { await_background_jobs().get(); });
sstring table_name = "static_interleaved_atomic_and_collection_columns";
// CREATE TABLE static_interleaved_atomic_and_collection_columns ( pk int, ck int, st1 int static,
// st2 set<int> static, st3 int static, st4 set<int> static, st5 int static, st6 set<int> static,
// PRIMARY KEY (pk, ck)) WITH compression = {'sstable_compression': ''};
auto set_of_ints_type = set_type_impl::get_instance(int32_type, true);
schema_builder builder("sst3", table_name);
builder.with_column("pk", int32_type, column_kind::partition_key);
builder.with_column("ck", int32_type, column_kind::clustering_key);
builder.with_column("st1", int32_type, column_kind::static_column);
builder.with_column("st2", set_of_ints_type, column_kind::static_column);
builder.with_column("st3", int32_type, column_kind::static_column);
builder.with_column("st4", set_of_ints_type, column_kind::static_column);
builder.with_column("st5", int32_type, column_kind::static_column);
builder.with_column("st6", set_of_ints_type, column_kind::static_column);
builder.set_compressor_params(compression_parameters());
schema_ptr s = builder.build(schema_builder::compact_storage::no);
lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);
// INSERT INTO static_interleaved_atomic_and_collection_columns (pk, ck, st1, st4, st5)
// VALUES (0, 1, 2, {3, 4}, 5) USING TIMESTAMP 1525385507816568;
auto key = partition_key::from_deeply_exploded(*s, {0});
mutation mut{s, key};
clustering_key ckey = clustering_key::from_deeply_exploded(*s, { 1 });
mut.partition().apply_insert(*s, ckey, write_timestamp);
mut.set_static_cell("st1", data_value{2}, write_timestamp);
set_type_impl::mutation set_values;
set_values.tomb = tombstone {write_timestamp - 1, write_time_point};
set_values.cells.emplace_back(int32_type->decompose(3), atomic_cell::make_live(*bytes_type, write_timestamp, bytes_view{}));
set_values.cells.emplace_back(int32_type->decompose(4), atomic_cell::make_live(*bytes_type, write_timestamp, bytes_view{}));
mut.set_static_cell(*s->get_column_definition("st4"), set_of_ints_type->serialize_mutation_form(set_values));
mut.set_static_cell("st5", data_value{5}, write_timestamp);
mt->apply(mut);
tmpdir tmp = write_and_compare_sstables(s, mt, table_name);
validate_read(s, tmp.path, {mut});
}


@@ -2510,38 +2510,34 @@ SEASTAR_TEST_CASE(check_multi_schema) {
// e blob
//);
return for_each_sstable_version([] (const sstables::sstable::version_types version) {
// We prohibit altering types for SSTables in 'mc' format.
// This is compliant with the Origin behaviour - see CASSANDRA-12443
if (version != sstables::sstable::version_types::mc) {
auto set_of_ints_type = set_type_impl::get_instance(int32_type, true);
auto builder = schema_builder("test", "test_multi_schema")
.with_column("a", int32_type, column_kind::partition_key)
.with_column("c", set_of_ints_type)
.with_column("d", int32_type)
.with_column("e", int32_type);
auto s = builder.build();
auto set_of_ints_type = set_type_impl::get_instance(int32_type, true);
auto builder = schema_builder("test", "test_multi_schema")
.with_column("a", int32_type, column_kind::partition_key)
.with_column("c", set_of_ints_type)
.with_column("d", int32_type)
.with_column("e", bytes_type);
auto s = builder.build();
auto sst = make_sstable(s, get_test_dir("multi_schema_test", s), 1, version, big);
auto f = sst->load();
return f.then([sst, s] {
auto reader = make_lw_shared(sstable_reader(sst, s));
return read_mutation_from_flat_mutation_reader(*reader, db::no_timeout).then([reader, s] (mutation_opt m) {
BOOST_REQUIRE(m);
BOOST_REQUIRE(m->key().equal(*s, partition_key::from_singular(*s, 0)));
auto& rows = m->partition().clustered_rows();
BOOST_REQUIRE_EQUAL(rows.calculate_size(), 1);
auto& row = rows.begin()->row();
BOOST_REQUIRE(!row.deleted_at());
auto& cells = row.cells();
BOOST_REQUIRE_EQUAL(cells.size(), 1);
auto& cdef = *s->get_column_definition("e");
BOOST_REQUIRE_EQUAL(cells.cell_at(cdef.id).as_atomic_cell(cdef).value(), int32_type->decompose(5));
return (*reader)(db::no_timeout);
}).then([reader, s] (mutation_fragment_opt m) {
BOOST_REQUIRE(!m);
});
}
});
return make_ready_future<>();
});
}


@@ -410,7 +410,7 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
return seastar::async([] {
auto wait_bg = seastar::defer([] { sstables::await_background_jobs().get(); });
storage_service_for_tests ssft;
for (auto version : {sstables::sstable::version_types::ka, sstables::sstable::version_types::la}) {
for (auto version : all_sstable_versions) {
for (auto index_block_size : {1, 128, 64*1024}) {
sstable_writer_config cfg;
cfg.promoted_index_block_size = index_block_size;


@@ -0,0 +1,8 @@
Data.db
Digest.crc32
Index.db
TOC.txt
Filter.db
Statistics.db
Summary.db
CRC.db


@@ -0,0 +1 @@
2548599407


@@ -0,0 +1,8 @@
CRC.db
Filter.db
Statistics.db
TOC.txt
Digest.crc32
Index.db
Summary.db
Data.db

Some files were not shown because too many files have changed in this diff.