Compare commits

...

1389 Commits

Author SHA1 Message Date
Asias He
f19fbc3058 gossip: Fix tokens assignment in assassinate_endpoint
The tokens vector is defined a few lines above and is needed outside the
if block.

Do not redefine it in the if block; otherwise the tokens will be empty.
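As a general illustration of this bug class (not the actual ScyllaDB code; the names here are hypothetical):

```cpp
#include <cassert>
#include <vector>

// Sketch of the shadowing bug: the inner declaration shadows the outer
// `tokens`, so the outer vector stays empty.
std::vector<int> collect_tokens_buggy(bool has_endpoint) {
    std::vector<int> tokens;                // needed outside the if block
    if (has_endpoint) {
        std::vector<int> tokens{10, 20};    // BUG: shadows the outer vector
        (void)tokens;
    }
    return tokens;                          // always empty
}

std::vector<int> collect_tokens_fixed(bool has_endpoint) {
    std::vector<int> tokens;
    if (has_endpoint) {
        tokens = {10, 20};                  // assign to the outer vector
    }
    return tokens;
}
```

Compiling with -Wshadow would flag the buggy variant.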

Found by code inspection.

Fixes #3551.

Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com>
(cherry picked from commit c3b5a2ecd5)
2018-06-27 12:01:19 +03:00
Vlad Zolotarov
8eddb28954 locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
ec2_snitch::gossiper_starting() calls the base class (default) method,
which sets _gossip_started to TRUE and thereby prevents the following
reconnectable_snitch_helper registration.

Fixes #3454

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1528208520-28046-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 2dde372ae6)
2018-06-12 19:02:48 +03:00
Avi Kivity
5aaa8031a2 Update seastar submodule
* seastar f5162dc...da2e1af (1):
  > net/tls: Wait for output to be sent when shutting down

Fixes #3459.
2018-05-24 12:02:15 +03:00
Avi Kivity
3d50e7077a Merge "Backport fixes for streaming segfault with bogus dst_cpu_id for 2.0" from Asias
"
The minimum changes that make the backport of "streaming: Do send failed
message for uninitialized session" possible without backport conflicts.

Fixes a similar issue we saw:

  https://github.com/scylladb/scylla/issues/3115
"

* tag 'asias/backport_issue_3115_for_2.0/v1' of github.com:scylladb/seastar-dev:
  streaming: Do send failed message for uninitialized session
  streaming: Introduce streaming::abort()
  streaming: Log peer address in on_error
  streaming: Check if _stream_result is valid
  streaming: Introduce received_failed_complete_message
2018-05-24 11:14:20 +03:00
Avi Kivity
4063e92f57 dist: redhat: get rid of raid0.devices_discard_performance
This parameter is not available on recent Red Hat kernels or on
non-Red Hat kernels (it was removed in 3.10.0-772.el7,
RHBZ 1455932). The presence of the parameter on kernels that don't
support it causes the module load to fail, with the result that the
storage is not available.

Fix by removing the parameter. For someone running an older Red Hat
kernel the effect will be that discard is disabled, but they can fix
that by updating the kernel. For someone running a newer kernel, the
effect will be that they can access their data.

Fixes #3437.
Message-Id: <20180516134913.6540-1-avi@scylladb.com>

(cherry picked from commit 3b8118d4e5)
2018-05-24 11:08:13 +03:00
Asias He
b6de30bb87 streaming: Do send failed message for uninitialized session
The uninitialized session has no peer associated with it yet. There is
no point sending the failed message when aborting the session. Sending the
failed message in this case will send it to a peer with an uninitialized
dst_cpu_id, which will cause the receiver to pass a bogus shard id to
smp::submit_to, which causes a segfault.

In addition, to be safe, initialize the dst_cpu_id to zero, so that an
uninitialized session will send messages to shard zero instead of a random
bogus shard id.
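A minimal sketch of the two changes described above, with hypothetical names (the real stream_session is far more involved):

```cpp
#include <cassert>

// Hypothetical simplification: dst_cpu_id is default-initialized to zero,
// and an uninitialized session (no peer yet) never sends the failed message.
struct stream_session_sketch {
    unsigned dst_cpu_id = 0;    // was uninitialized before the fix
    bool initialized = false;   // set once a peer is associated

    bool should_send_failed_message() const {
        // No peer associated yet: nothing to notify when aborting.
        return initialized;
    }
};
```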

Fixes the segfault issue found by
repair_additional_test.py:RepairAdditionalTest.repair_abort_test

Fixes #3115
Message-Id: <9f0f7b44c7d6d8f5c60d6293ab2435dadc3496a9.1515380325.git.asias@scylladb.com>

(cherry picked from commit 774307b3a7)
2018-05-24 15:24:29 +08:00
Asias He
c23e3a1eda streaming: Introduce streaming::abort()
It will be used soon by stream_plan::abort() to abort a stream session.

(cherry picked from commit fad34801bf)
2018-05-24 15:21:54 +08:00
Asias He
2732b6cf1d streaming: Log peer address in on_error
(cherry picked from commit 8a3f6acdd2)
2018-05-24 15:20:43 +08:00
Asias He
49722e74da streaming: Check if _stream_result is valid
If on_error() was called before init() was executed, the
_stream_result can be invalid.

(cherry picked from commit be573bcafb)
2018-05-24 15:20:02 +08:00
Asias He
ba7623ac55 streaming: Introduce received_failed_complete_message
It is the handler for the failed complete message. Add a flag to
remember if we received such a message from the peer; if so, do not send
the failed complete message back to the peer when running
close_session with failed status.
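A sketch of the flag under assumed, simplified names (the actual session state machine is larger):

```cpp
#include <cassert>

// Hypothetical simplification: remember that the peer already reported the
// session as failed, so closing with failed status does not echo the failed
// complete message back to that peer.
struct close_tracker_sketch {
    bool received_failed_complete = false;

    void on_received_failed_complete_message() {
        received_failed_complete = true;
    }

    bool should_send_failed_complete() const {
        return !received_failed_complete;
    }
};
```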

(cherry picked from commit eace5fc6e8)
2018-05-24 15:19:26 +08:00
Avi Kivity
9db2ff36f2 dist: redhat: fix binutils dependency on alternatives
Since /sbin is a symlink, file dependencies on /sbin/alternatives
don't work.

Change to /usr/bin to fix.
2018-04-26 12:00:42 +03:00
Avi Kivity
378029b8da schema_tables: discard [[nodiscard]] attribute
Not supported on gcc 5.3, which is used to build this branch.
2018-04-25 18:37:17 +03:00
Mika Eloranta
2b7644dc36 build: fix rpm build script --jobs N handling
Fixes argument misquoting in the $SRPM_OPTS expansion for the mock commands
and makes the --jobs argument work as intended.

Signed-off-by: Mika Eloranta <mel@aiven.io>
Message-Id: <20180113212904.85907-1-mel@aiven.io>
(cherry picked from commit 7266446227)
2018-04-25 17:47:09 +03:00
Duarte Nunes
4bd931ba59 db/schema_tables: Only drop UDTs after merging tables
Dropping a user type requires that all tables using that type also be
dropped. However, a type may appear to be dropped at the same time as
a table, for instance due to the order in which a node receives schema
notifications, or when dropping a keyspace.

When dropping a table, if we build a schema in a shard through a
global_schema_pointer, then we'll check for the existence of any user
type the schema employs. We thus need to ensure types are only dropped
after tables, similarly to how it's done for keyspaces.

Fixes #3068

Tests: unit-tests (release)

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180129114137.85149-1-duarte@scylladb.com>
(cherry picked from commit 1e3fae5bef)
2018-04-25 01:15:45 +03:00
Avi Kivity
78eebe74c7 loading_cache: adjust code for older compilers
Need this-> qualifier in a generic lambda, and static_assert()s need
a message.
2018-04-23 16:12:25 +03:00
Avi Kivity
30e21afb13 streaming: adjust code for older compilers
Need this-> qualifier in a generic lambda.
2018-04-23 16:12:25 +03:00
Shlomi Livne
e8616b10e5 release: prepare for 2.0.4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-04-23 09:11:35 +03:00
Avi Kivity
0cb842dde1 Update seastar submodule
* seastar e7facd4...f5162dc (1):
  > tls: Ensure we always pass through semaphores on shutdown

Fixes #3358.
2018-04-14 20:52:57 +03:00
Gleb Natapov
7945f5edda cql_server: fix a race between closing of a connection and notifier registration
There is a race between cql connection closure and notifier
registration. If a connection is closed before notification registration
is complete, a stale pointer to the connection will remain in the notification
list, since the attempt to unregister the connection will happen too early.
The fix is to move notifier unregistration to after the connection's gate
is closed, which ensures that there is no outstanding registration
request. But this means that now a connection with a closed gate can be in
the notifier list, so with_gate() may throw and abort the notifier loop. Fix
that by replacing with_gate() with a call to is_closed().

Fixes: #3355
Tests: unit(release)

Message-Id: <20180412134744.GB22593@scylladb.com>
(cherry picked from commit 1a9aaece3e)
2018-04-12 16:57:30 +03:00
Asias He
9c2a328000 gossip: Relax generation max difference check
start node 1 2 3
shutdown node2
shutdown node1 and node3
start node1 and node3
nodetool removenode node2
clean up all scylla data on node2
bootstrap node2 as a new node

I saw node2 could not bootstrap, stuck at waiting for schema information to complete forever:

On node1, node3

    [shard 0] gossip - received an invalid gossip generation for peer 127.0.0.2; local generation = 2, received generation = 1521779704

On node2

    [shard 0] storage_service - JOINING: waiting for schema information to complete

This is because during the nodetool removenode operation, the generation of node2 was increased from 0 to 2.

   gossiper::advertise_removing () calls eps.get_heart_beat_state().force_newer_generation_unsafe();
   gossiper::advertise_token_removed() calls eps.get_heart_beat_state().force_newer_generation_unsafe();

Each force_newer_generation_unsafe increases the generation by 1.

Here is an example,

Before nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
   {
   "addrs": "127.0.0.2",
   "generation": 0,
   "is_alive": false,
   "update_time": 1521778757334,
   "version": 0
   },
```

After nodetool removenode:
```
curl -X GET --header "Accept: application/json" "http://127.0.0.1:10000/failure_detector/endpoints/" | python -mjson.tool
 {
     "addrs": "127.0.0.2",
     "application_state": [
         {
             "application_state": 0,
             "value": "removed,146b52d5-dc94-4e35-b7d4-4f64be0d2672,1522038476246",
             "version": 214
         },
         {
             "application_state": 6,
             "value": "REMOVER,14ecc9b0-4b88-4ff3-9c96-38505fb4968a",
             "version": 153
            }
     ],
     "generation": 2,
     "is_alive": false,
     "update_time": 1521779276246,
     "version": 0
 },
```

In gossiper::apply_state_locally, we have this check:

```
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
    // assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
    logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
                ep, local_generation, remote_generation);
}
```
to skip the gossip update.
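The shape of the problem and of the relaxation can be sketched as follows. The constant value and the relaxed predicate here are assumptions mirroring the commit text, not the exact code in gossiper::apply_state_locally:

```cpp
#include <cassert>

// Assumed one-year bound for the MAX_GENERATION_DIFFERENCE described above.
constexpr long MAX_GENERATION_DIFFERENCE = 86400L * 365;

// Old check: bounded by the local generation, which is tiny (2) for a node
// that went through `nodetool removenode`, so a rebootstrapped node's
// epoch-based generation looks "unbelievable" and the update is skipped.
bool old_check_rejects(long local_generation, long remote_generation) {
    return local_generation != 0 &&
           remote_generation > local_generation + MAX_GENERATION_DIFFERENCE;
}

// Relaxed idea (hypothetical): bound by the current wall-clock epoch instead,
// which a legitimate epoch-based generation will not exceed by a year.
bool relaxed_check_rejects(long now_epoch, long remote_generation) {
    return remote_generation > now_epoch + MAX_GENERATION_DIFFERENCE;
}
```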

To fix, we relax generation max difference check to allow the generation
of a removed node.

After this patch, the removed node bootstraps successfully.

Tests: dtest:update_cluster_layout_tests.py
Fixes #3331

Message-Id: <678fb60f6b370d3ca050c768f705a8f2fd4b1287.1522289822.git.asias@scylladb.com>
(cherry picked from commit f539e993d3)
2018-03-29 12:10:37 +03:00
Tomasz Grabiec
98498c679b test.py: set BOOST_TEST_CATCH_SYSTEM_ERRORS=no
This will make boost UTF abort execution on SIGABRT rather than trying
to continue running other test cases, which doesn't work well with the
seastar integration: the suite will hang.
Message-Id: <1516205469-16378-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit ab6ec571cb)
2018-03-27 22:42:00 +03:00
Duarte Nunes
b147b5854b view_schema_test: Retry failed queries
Due to the asynchronous nature of view update propagation, results
might still be absent from views when we query them. To be able to
deterministically assert on view rows, this patch retries a query a
bounded number of times until it succeeds.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170718212646.2958-1-duarte@scylladb.com>
(cherry picked from commit ab72132cb1)
2018-03-27 22:42:00 +03:00
Duarte Nunes
226095f4db types: Implement hash() for collections
This patch provides a rather trivial implementation of hash() for
collection types.

It is needed for view building, where we hold mutations in a map
indexed by partition keys (and frozen collection types can be part of
the key).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170718192107.13746-1-duarte@scylladb.com>
(cherry picked from commit 3bfcf47cc6)
2018-03-27 22:42:00 +03:00
Avi Kivity
3dd1f68590 tests: fix view_schema_test with clang
Clang is happy to create a vector<data_value> from a {}, a {1, 2}, but not a {1}.
No doubt it is correct, but sheesh.

Make the data_value explicit to humor it.
Message-Id: <20170713074315.9857-1-avi@scylladb.com>

(cherry picked from commit 162d9aa85d)
2018-03-27 22:25:17 +03:00
Avi Kivity
e08e4c75d7 Merge "Fix abort during counter table read-on-delete" from Tomasz
"
This fixes an abort in an sstable reader when querying a partition with no
clustering ranges (happens on counter table mutation with no live rows) which
also doesn't have any static columns. In such case, the
sstable_mutation_reader will setup the data_consume_context such that it only
covers the static row of the partition, knowing that there is no need to read
any clustered rows. See partition.cc::advance_to_upper_bound(). Later when
the reader is done with the range for the static row, it will try to skip to
the first clustering range (missing in this case). If clustering_ranges_walker
tells us to skip to after_all_clustering_rows(), we will hit an assert inside
continuous_data_consumer::fast_forward_to() due to an attempt to skip past the
original data file range. If clustering_ranges_walker returns
before_all_clustering_rows() instead, all is fine because we're still at the
same data file position.

Fixes #3304.
"

* 'tgrabiec/fix-counter-read-no-static-columns' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Test reads with no clustering ranges and no static columns
  tests: simple_schema: Allow creating schema with no static column
  clustering_ranges_walker: Stop after static row in case no clustering ranges

(cherry picked from commit 054854839a)
2018-03-22 18:16:37 +02:00
Vlad Zolotarov
8bcb4e7439 test.py: limit the tests to run on 2 shards with 4GB of memory
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit 57a6ed5aaa)
2018-03-22 12:47:12 +02:00
Avi Kivity
97369adb1c Merge "streaming backport for 2.0 branch" from Asias
"
Backport streaming improvements and bug fixes from the 2.1 branch. With this
series, it is more robust to add a node or decommission a node, because the single
big stream plan is split into multiple smaller stream plans. In
addition, failed stream plans will be retried automatically.
"

Fixes #3285, Fixes #3310, Fixes #3065, Fixes #1743, Fixes #3311.

* 'asias/ticket-352' of github.com:scylladb/seastar-dev:
  range_streamer: Stream 10% of ranges instead of 10 ranges per time
  Revert "streaming: Do not abort session too early in idle detection"
  dht: Fix log in range_streamer
  streaming: One cf per time on sender
  messaging_service: Get rid of timeout and retry logic for streaming verb
  storage_service: Remove rpc client on all shards in on_dead
  Merge "streaming error handling improvement" from Asias
  streaming: Fix streaming not streaming all ranges
  Merge "Use range_streamer everywhere" from Asias
2018-03-22 10:39:08 +02:00
Duarte Nunes
c89ead5e55 gms/gossiper: Synchronize endpoint state destruction
In gossiper::handle_major_state_change() we set the endpoint_state for
a particular endpoint and replicate the changes to other cores.

This is totally unsynchronized with the execution of
gossiper::evict_from_membership(), which can happen concurrently, and
can remove the very same endpoint from the map (in all cores).

Replicating the changes to other cores in handle_major_state_change()
can interleave with replicating the changes to other cores in
evict_from_membership(), and result in an undefined final state.

Another issue happened in debug mode dtests, where a fiber executes
handle_major_state_change(), calls into the subscribers, of which
storage_service is one, and ultimately lands in
storage_service::update_peer_info(), which iterates over the
endpoint's application state with deferring points in between (to
update a system table). gossiper::evict_from_membership() was executed
concurrently by another fiber, which freed the state the first one was
iterating over.

Fixes #3299.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20180318123211.3366-1-duarte@scylladb.com>
(cherry picked from commit 810db425a5)
2018-03-18 14:55:50 +02:00
Asias He
46fd96d877 range_streamer: Stream 10% of ranges instead of 10 ranges per time
If there are a lot of ranges, e.g., num_tokens=2048, 10 ranges per
stream plan will cause tons of stream plans to be created to stream the data,
each carrying very little data. This causes each stream plan to have low
transfer bandwidth, so the total time to complete the streaming increases.

It makes more sense to send a percentage of the total ranges per stream
plan than a fixed number of ranges.

Here is an example to stream a keyspace with 513 ranges in
total, 10 ranges v.s. 10% ranges:

Before:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 51
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 107 seconds

After:
[shard 0] range_streamer - Bootstrap with 127.0.0.1 for
keyspace=system_traces, 510 out of 513 ranges: ranges = 10
[shard 0] range_streamer - Bootstrap with ks for keyspace=127.0.0.1
succeeded, took 22 seconds
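The sizing rule can be sketched as follows; the helper name is hypothetical, not the actual range_streamer API:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Hypothetical helper: stream a fixed percentage of the total ranges per
// stream plan instead of a fixed count of 10, with a floor of one range.
size_t ranges_per_stream_plan(size_t total_ranges, unsigned percent = 10) {
    return std::max<size_t>(1, total_ranges * percent / 100);
}
```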

Message-Id: <a890b84fbac0f3c3cc4021e30dbf4cdf135b93ea.1520992228.git.asias@scylladb.com>
(cherry picked from commit 9b5585ebd5)
2018-03-15 10:11:08 +08:00
Asias He
19806fc056 Revert "streaming: Do not abort session too early in idle detection"
This reverts commit f792c78c96.

With the "Use range_streamer everywhere" (7217b7ab36) series,
all users of streaming now stream relatively small ranges
and can retry streaming at a higher level.

This reduces the time-to-recover from 5 hours to 10 minutes per stream session.

Even if the 10-minute idle detection might cause more false positives,
it is fine, since we can retry the "small" stream session anyway. In the
long term, we should replace the whole idle detection logic with:
whenever the stream initiator goes away, the stream slave goes away.

Message-Id: <75f308baf25a520d42d884c7ef36f1aecb8a64b0.1520992219.git.asias@scylladb.com>
(cherry picked from commit ad7b132188)
2018-03-15 10:10:58 +08:00
Asias He
0b314a745f dht: Fix log in range_streamer
The address and keyspace should be swapped.

Before:
  range_streamer - Bootstrap with ks3 for keyspace=127.0.0.1 succeeded,
  took 56 seconds

After:
  range_streamer - Bootstrap with 127.0.0.1 for keyspace=ks3 succeeded,
  took 56 seconds

Message-Id: <5c49646f1fbe45e3a1e7545b8470e04b166922c4.1520416042.git.asias@scylladb.com>
(cherry picked from commit 73d8e2743f)
2018-03-13 15:07:13 +08:00
Asias He
73870751d9 streaming: One cf per time on sender
In the case there are a large number of column families, the sender will
send all the column families in parallel. We allow 20% of shard memory
for streaming on the receiver, so each column family will have 1/N of that
memory for its memtable, where N is the number of in-flight column families.
A large N causes a lot of small sstables to be generated.

It is possible there are multiple senders to a single receiver, e.g.,
when a new node joins the cluster; then the maximum number of in-flight
column families is the number of peer nodes. The column families are sent
in the order of cf_id. It is not guaranteed that all peers have the same
speed, so they are not necessarily sending the same cf_id at the same time.
We still have a chance that some of the peers are sending the same cf_id.

Fixes #3065

Message-Id: <46961463c2a5e4f1faff232294dc485ac4f1a04e.1513159678.git.asias@scylladb.com>
(cherry picked from commit a9dab60b6c)
2018-03-13 12:20:41 +08:00
Asias He
c8983034c0 messaging_service: Get rid of timeout and retry logic for streaming verb
With the "Use range_streamer everywhere" (7217b7ab36) series, all
users of streaming now stream relatively small ranges and
can retry streaming at a higher level.

There are problems with timeout and retry at RPC verb level in streaming:
1) A timeout can be a false negative.
2) We cannot cancel the send operations which are already called. When
the user aborts the streaming, the retry logic keeps running for a long
time.

This patch removes all the timeout and retry logic for streaming verbs.
After this, the timeout is the job of TCP, the retry is the job of the
upper layer.

Message-Id: <df20303c1fa728dcfdf06430417cf2bd7a843b00.1503994267.git.asias@scylladb.com>
(cherry picked from commit 8fa35d6ddf)
2018-03-13 12:20:41 +08:00
Asias He
77d14a6256 storage_service: Remove rpc client on all shards in on_dead
We should close connections to nodes that are down on all shards instead
of the shard which runs the on_dead gossip callback.

Found by Gleb.
Message-Id: <527a14105a07218066e9f1da943693d9de6993e5.1505894260.git.asias@scylladb.com>

(cherry picked from commit 173cba67ba)
2018-03-13 12:20:41 +08:00
Avi Kivity
21259bcfb3 Merge "streaming error handling improvement" from Asias
"This series improves the streaming error handling so that when one side of the
streaming fails, it will propagate the error to the other side and the peer
will close the failed session accordingly. This removes the unnecessary wait and
timeout time for the peer to discover the failed session and fail eventually.

Fix it by:

- Use the complete message to notify peer node local session is failed
- Listen on the shutdown gossip callback so that we can detect that the peer
  is shut down and close the session with the peer

Fixes #1743"

* tag 'asias/streaming/error_handling_v2' of github.com:cloudius-systems/seastar-dev:
  streaming: Listen on shutdown gossip callback
  gms: Add is_shutdown helper for endpoint_state class
  streaming: Send complete message with failed flag when session is failed
  streaming: Handle failed flag in complete message
  streaming: Do not fail the session when failed to send complete message
  streaming: Introduce send_failed_complete_message
  streaming: Do not send complete message when session is successful
  streaming: Introduce the failed parameter for complete message
  streaming: Remove unused session_failed function
  streaming: Less verbose in logging
  streaming: Better stats

(cherry picked from commit d5aba779d4)
2018-03-13 12:20:41 +08:00
Tomasz Grabiec
9f02b44537 streaming: Fix streaming not streaming all ranges
It skipped one sub-range in each 10-range batch, and
tried to access the range vector using the end() iterator.

Fixes sporadic failures of
update_cluster_layout_tests.py:TestUpdateClusterLayout.simple_add_node_1_test.

Message-Id: <1505848902-16734-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 741ec61269)
2018-03-13 10:36:19 +08:00
Avi Kivity
9dc7a63014 Merge "Use range_streamer everywhere" from Asias
"With this series, all the following cluster operations:

- bootstrap
- rebuild
- decommission
- removenode

will use the same code to do the streaming.

The range_streamer is now extended to support both fetch from and push
to peer node. Another big change is now the range_streamer will stream
less ranges at a time, so less data, per stream_plan and range_streamer
will remember which ranges are failed to stream and can retry later.

The retry policy is very simple at the moment it retries at most 5 times
and sleep 1 minutes, 1.5^2 minutes, 1.5^3 minutes ....

Later, we can introduce api for user to decide when to stop retrying and
the retry interval.

The benefits:

 - All the cluster operations share the same code to stream
 - We can know the operation progress, e.g., we can know the total number of
   ranges that need to be streamed and the number of ranges finished in
   bootstrap, decommission, etc.
 - All the cluster operations can survive a peer node going down during the
   operation, which usually takes a long time to complete. E.g., when adding
   a new node, currently if any of the existing nodes which stream data to
   the new node has an issue sending data to the new node, the whole bootstrap
   process will fail. After this patch, we can fix the problematic node
   and restart it, and the joining node will retry streaming from that node
   again.
 - We can fail streaming early, time out early, and retry less, because
   all the operations that use streaming can survive the failure of a single
   stream_plan. It is not that important for now to make a single
   stream_plan successful. Note that another user of streaming, repair, now
   uses small stream_plans as well and can rerun the repair for the
   failed ranges too.

This is one step closer to supporting resumable add/remove node
operations."
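The retry schedule described in the cover letter above (at most 5 attempts, sleeping 1 minute, then 1.5^2, 1.5^3, ... minutes) can be sketched as follows; the helper name and exact formula are assumptions based on that text, not the actual range_streamer code:

```cpp
#include <cassert>
#include <cmath>

// Hypothetical sketch of the backoff described in the merge cover letter.
double retry_sleep_minutes(int attempt /* 1-based, at most 5 */) {
    return attempt <= 1 ? 1.0 : std::pow(1.5, attempt);
}
```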

* tag 'asias/use_range_streamer_everywhere_v4' of github.com:cloudius-systems/seastar-dev:
  storage_service: Use the new range_streamer interface for removenode
  storage_service: Use the new range_streamer interface for decommission
  storage_service: Use the new range_streamer interface for rebuild
  storage_service: Use the new range_streamer interface for bootstrap
  dht: Extend range_streamer interface

(cherry picked from commit 7217b7ab36)
2018-03-13 10:34:10 +08:00
Asias He
5dcef25f6f storage_service: Add missing return in pieces empty check
If pieces is empty, it is bogus to access pieces[0]:

   sstring move_name = pieces[0];

Fix by adding the missing return.
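The fix pattern, in illustrative form (names are hypothetical; the real handler lives in storage_service):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical sketch: return early on empty input instead of indexing
// pieces[0] on an empty vector.
std::string first_piece_or_empty(const std::vector<std::string>& pieces) {
    if (pieces.empty()) {
        return {};   // the missing early return
    }
    std::string move_name = pieces[0];
    return move_name;
}
```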

Spotted by Vlad Zolotarov <vladz@scylladb.com>

Fixes #3258
Message-Id: <bcb446f34f953bc51c3704d06630b53fda82e8d2.1520297558.git.asias@scylladb.com>

(cherry picked from commit 8900e830a3)
2018-03-06 09:58:39 +02:00
Duarte Nunes
f763bf7f0d Merge 'backport of the "loading_shared_values and size limited and evicting prepared statements cache" series' from Vlad
This backport includes changes from the "loading_shared_values and size limited and evicting prepared statements cache" series,
"missing bits from loading_shared_values series" series and the "cql_transport::cql_server: fix the distributed prepared statements cache population"
patch (the last one is squashed inside the "cql3::query_processor: implement CQL and Thrift prepared statements caches using cql3::prepared_statements_cache"
patch in order to avoid the bisect breakage).

* 'branch-2-0-backport-prepared-cache-fixes-v1' of https://github.com/vladzcloudius/scylla:
  tests: loading_cache_test: initial commit
  utils + cql3: use a functor class instead of std::function
  cql3::query_processor: implement CQL and Thrift prepared statements caches using cql3::prepared_statements_cache
  transport::server::process_prepare() don't ignore errors on other shards
  cql3: prepared statements cache on top of loading_cache
  utils::loading_cache: make the size limitation more strict
  utils::loading_cache: added static_asserts for checking the callbacks signatures
  utils::loading_cache: add a bunch of standard synchronous methods
  utils::loading_cache: add the ability to create a cache that would not reload the values
  utils::loading_cache: add the ability to work with not-copy-constructable values
  utils::loading_cache: add EntrySize template parameter
  utils::loading_cache: rework on top of utils::loading_shared_values
  sstables::shared_index_list: use utils::loading_shared_values
  utils::loading_cache: arm the timer with a period equal to min(_expire, _update)
2018-02-22 10:04:41 +00:00
Tomasz Grabiec
9af9ca0d60 tests: mutation_source_tests: Fix use-after-scope on partition range
Message-Id: <1506096881-3076-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d11d696072)
2018-02-19 14:30:47 +00:00
Avi Kivity
fbc30221b5 row_cache_test: remove unused overload populate_range()
Breaks the build.
2018-02-11 17:01:31 +02:00
Shlomi Livne
d17aa3cd1c release: prepare for 2.0.3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2018-02-08 18:11:35 +02:00
Vlad Zolotarov
f7e79322f1 tests: loading_cache_test: initial commit
Test utils::loading_shared_values and utils::loading_cache.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 15:28:45 -05:00
Vlad Zolotarov
e31331bdb2 utils + cql3: use a functor class instead of std::function
Define value_extractor_fn as a functor class instead of std::function.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 15:28:45 -05:00
Vlad Zolotarov
6873e26060 cql3::query_processor: implement CQL and Thrift prepared statements caches using cql3::prepared_statements_cache
- Transition the prepared statements caches for both CQL and Thrift to the cql3::prepared_statements_cache class.
   - Add the corresponding metrics to the query_processor:
      - Evictions count.
      - Current entries count.
      - Current memory footprint.

Fixes #2474

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 15:28:35 -05:00
Vlad Zolotarov
24bee2c887 transport::server::process_prepare() don't ignore errors on other shards
If storing of the statement fails on any shard we should fail the whole PREPARE
request.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1502325392-31169-13-git-send-email-vladz@scylladb.com>
2018-02-02 13:27:34 -05:00
Vlad Zolotarov
8bba15a709 cql3: prepared statements cache on top of loading_cache
This is a template class that implements caching of prepared statements for a given ID type:
   - Each cache instance is given 1/256 of the total shard memory. If a new entry is going to overflow
     this memory limit, the least recently used entries are evicted so that the new entry can
     be added.
   - The memory consumption of a single prepared statement is defined by a cql3::prepared_cache_entry_size
     functor class that returns the number of bytes for a given prepared statement (currently returns 10000
     bytes for any statement).
   - A cache entry is evicted if not used for 60 minutes or more.
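A minimal sketch of a size-limited LRU cache in the spirit described above. All names, the byte budget, and the fixed 10000-byte per-entry cost are illustrative assumptions, not the actual loading_cache implementation:

```cpp
#include <cassert>
#include <list>
#include <string>
#include <unordered_map>

// Hypothetical sketch: bounded-memory cache that evicts the least recently
// used entries when a new entry would overflow the budget.
class prepared_cache_sketch {
    size_t _max_size;                                     // memory budget, bytes
    size_t _size = 0;
    std::list<std::pair<std::string, std::string>> _lru;  // front = most recent
    std::unordered_map<std::string, decltype(_lru)::iterator> _index;
    static constexpr size_t entry_size = 10000;           // fixed cost per statement
public:
    explicit prepared_cache_sketch(size_t max_size) : _max_size(max_size) {}

    void put(const std::string& id, const std::string& stmt) {
        if (auto it = _index.find(id); it != _index.end()) {
            _lru.erase(it->second);                       // replace existing entry
            _index.erase(it);
            _size -= entry_size;
        }
        while (_size + entry_size > _max_size && !_lru.empty()) {
            _index.erase(_lru.back().first);              // evict least recently used
            _lru.pop_back();
            _size -= entry_size;
        }
        _lru.emplace_front(id, stmt);
        _index[id] = _lru.begin();
        _size += entry_size;
    }

    bool contains(const std::string& id) const { return _index.count(id) != 0; }
    size_t size() const { return _index.size(); }
};
```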

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:37:00 -05:00
Vlad Zolotarov
ad68d3ecfd utils::loading_cache: make the size limitation more strict
Ensure that the size of the cache is never bigger than the "max_size".

Before this patch the size of the cache could have been indefinitely bigger than
the requested value during the refresh time period, which is clearly undesirable
behaviour.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:36:56 -05:00
Vlad Zolotarov
707ac9242e utils::loading_cache: added static_asserts for checking the callbacks signatures
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:36:53 -05:00
Vlad Zolotarov
dae0563ff8 utils::loading_cache: add a bunch of standard synchronous methods
Add a few standard synchronous methods to the cache, e.g. find(), remove_if(), etc.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:36:43 -05:00
Vlad Zolotarov
7ba50b87f1 utils::loading_cache: add the ability to create a cache that would not reload the values
Sometimes we don't want the cached values to be periodically reloaded.
This patch adds the ability to control this using a ReloadEnabled template parameter.

In case the reloading is not needed the "loading" function is not given to the constructor
but rather to the get_ptr(key, loader) method (currently it's the only method that is used, we may add
the corresponding get(key, loader) method in the future when needed).

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:35:25 -05:00
Vlad Zolotarov
1da277c78e utils::loading_cache: add the ability to work with not-copy-constructable values
The current get(...) interface restricts the cache to work only with copy-constructible
values (it returns future<Tp>).
To make it able to work with non-copyable values we need to introduce an interface that
returns something like a reference to the cached value (like regular containers do).

We can't return future<Tp&> since the caller would have to ensure somehow that the underlying
value is still alive. A much safer and easier-to-use way is to return a shared_ptr-like
pointer to that value.

"Luckily" for us, the value we actually store in the cache is already wrapped in an lw_shared_ptr,
so we may simply return an object that impersonates itself as a smart_pointer<Tp> value while
it keeps a "reference" to the object stored in the cache.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:35:21 -05:00
Vlad Zolotarov
36dfd4b990 utils::loading_cache: add EntrySize template parameter
Allow a variable entry size parameter.
Provide an EntrySize functor that would return a size for a
specific entry.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:35:19 -05:00
Vlad Zolotarov
c5ce2765dc utils::loading_cache: rework on top of utils::loading_shared_values
Get rid of the "proprietary" solution for asynchronous values on-demand loading.
Use utils::loading_shared_values instead.

We would still need to maintain intrusive set and list for efficient shrink and invalidate
operations, but their entry is no longer going to contain the actual key and value,
but rather a loading_shared_values::entry_ptr, which is essentially a shared pointer to a key-value
pair.

In general, we added another level of dereferencing in order to get the key value, but since
we use bi::store_hash<true> in the hook and bi::compare_hash<true> in the bi::unordered_set,
this should not translate into additional set lookup latency.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:35:15 -05:00
Vlad Zolotarov
25ffdf527b sstables::shared_index_list: use utils::loading_shared_values
Since the utils::loading_shared_values API is based on the original shared_index_list,
this change is mostly a drop-in replacement of the corresponding parts.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:33:34 -05:00
Vlad Zolotarov
dbc6d9fe01 utils::loading_cache: arm the timer with a period equal to min(_expire, _update)
Arm the timer with a period that is not greater than either permissions_validity_in_ms
or permissions_update_interval_in_ms, in order to ensure that we are not stuck with
values older than permissions_validity_in_ms.

Fixes #2590

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2018-02-02 12:17:41 -05:00
Asias He
915683bddd locator: Get rid of assert in token_metadata
In commit 69c81bcc87 (repair: Do not allow repair until node is in
NORMAL status), we saw a coredump due to an assert in
token_metadata::first_token_index.

Throw an exception instead of aborting the whole scylla process.
Message-Id: <c110645cee1ee3897e30a3ae1b7ab3f49c97412c.1504752890.git.asias@scylladb.com>

(cherry picked from commit 0ec574610d)
2018-01-29 15:28:44 +02:00
Avi Kivity
7ca8988d0e Update seastar submodule
* seastar d37cf28...e7facd4 (11):
  > tls_test: Fix echo test not setting server trust store
  > tls: Actually verify client certificate if requested
  > tls: Do not restrict re-handshake to client
  > Work around GCC 5 bug: scylladb/seastar#338, scylladb/seastar#339
  > tls: Make put/push mechanism operate "opportunisticly"
  > tls: Move handshake logic outside connect/accept
  > tls: Make sure handshake exceptions are futurized
  > tls: Guard non-established sockets in sesrefs + more explicit close + states
  > tls: Make vec_push fully exception safe
  > net/tls: explicitly ignore ready future during shutdown
  > tls: remove unneeded lambda captures

Fixes #3072
2018-01-28 14:46:06 +02:00
Paweł Dziepak
383d7e6c91 Update scylla-ami submodule
* scylla-ami be90a3f...fa2461d (1):
  > Update Amazon kernel packages release stream to 2017.09
2018-01-24 13:31:04 +00:00
Avi Kivity
7bef696ee5 Update seastar submodule
* seastar 66eb33f...d37cf28 (1):
  > Update dpdk submodule

Fixes build with glibc 2.25 (see scylladb/dpdk#3).
2018-01-18 13:52:39 +02:00
Avi Kivity
a603111a85 Merge "Fix memory leak on zone reclaim" from Tomek
"_free_segments_in_zones is not adjusted by
segment_pool::reclaim_segments() for empty zones on reclaim under some
conditions. For instance when some zone becomes empty due to regular
free() and then reclaiming is called from the std allocator, and it is
satisfied from a zone after the one which is empty. This would result
in free memory in such zone to appear as being leaked due to corrupted
free segment count, which may cause a later reclaim to fail. This
could result in bad_allocs.

The fix is to always collect such zones.

Fixes #3129
Refs #3119
Refs #3120"

* 'tgrabiec/fix-free_segments_in_zones-leak' of github.com:scylladb/seastar-dev:
  tests: lsa: Test _free_segments_in_zones is kept correct on reclaim
  lsa: Expose max_zone_segments for tests
  lsa: Expose tracker::non_lsa_used_space()
  lsa: Fix memory leak on zone reclaim

(cherry picked from commit 4ad212dc01)
2018-01-16 15:54:54 +02:00
Asias He
d5884d3c7c storage_service: Do not wait for restore_replica_count in handle_state_removing
The call chain is:

storage_service::on_change() -> storage_service::handle_state_removing()
-> storage_service::restore_replica_count() -> streamer->stream_async()

Listeners run as part of gossip message processing, which is serialized.
This means we won't be processing any gossip messages until streaming
completes.

In fact, there is no need to wait for restore_replica_count to complete,
which can take a long time, since when it completes, this node will send
notification to tell the removal_coordinator that the restore process is
finished on this node. This node will be removed from _replicating_nodes
on the removal_coordinator.

Tested with update_cluster_layout_tests.py

Fixes #2886

Message-Id: <8b4fe637dfea6c56167ddde3ca86fefb8438ce96.1516088237.git.asias@scylladb.com>
(cherry picked from commit 5107b6ad16)
2018-01-16 11:38:08 +02:00
Tomasz Grabiec
e6cb685178 range_tombstone_list: Fix insert_from()
end_bound was not updated in one of the cases in which end and
end_kind were changed; as a result, later merging decisions using
end_bound were incorrect: end_bound was using the new key, but the old
end_kind.

Fixes #3083.
Message-Id: <1513772083-5257-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit dfe48bbbc7)
2017-12-21 09:29:59 +01:00
Vlad Zolotarov
cd19e5885a messaging_service: fix multi-NIC support
Don't enforce the outgoing connections from the 'listen_address'
interface only.

If 'local_address' is given to connect() it will enforce it to use a
particular interface to connect from, even if the destination address
should be accessed from a different interface. If we don't specify the
'local_address' the source interface will be chosen according to the
routing configuration.

Fixes #3066

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1513372688-21595-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit be6f8be9cb)
2017-12-17 10:52:19 +02:00
Takuya ASADA
f367031016 dist/common/systemd: specify correct repo file path for housekeeping service on Ubuntu/Debian
Currently scylla-housekeeping-daily.service/-restart.service hardcode
"--repo-files '/etc/yum.repos.d/scylla*.repo'" to specify the CentOS .repo file,
but we use the same .service files for Ubuntu/Debian.
This doesn't work correctly; we need to specify the .list file for Debian variants.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1513385159-15736-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit c2e87f4677)
2017-12-16 22:05:19 +02:00
Glauber Costa
7ae67331ad database: delete created SSTables if streaming writes fail
We have had an issue recently where failed SSTable writes left the
generated SSTables dangling in a potentially invalid state. If the write
had, for instance, started and generated tmp TOCs but not finished,
those files would be left for dead.

We had fixed this in commit b7e1575ad4,
but streaming memtables still have the same issue.

Note that we can't fix this in the common function
write_memtable_to_sstable because different flushers have different
retry policies.

Fixes #3062

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171213011741.8156-1-glauber@scylladb.com>
(cherry picked from commit 1aabbc75ab)
2017-12-13 10:12:35 +02:00
Jesse Haber-Kucharsky
0e6561169b cql3: Add missing return
Since `return` is missing, the "else" branch is also taken and this
results in a user being created from scratch.

Fixes #3058.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <bf3ca5907b046586d9bfe00f3b61b3ac695ba9c5.1512951084.git.jhaberku@scylladb.com>
(cherry picked from commit 7e3a344460)
2017-12-11 09:55:46 +02:00
Paweł Dziepak
5b3aa8e90d Merge "Fix range tombstone emitting which led to skipping over data" from Tomasz
"Fixes cache reader to not skip over data in some cases involving overlapping
range tombstones in different partition versions and discontinuous cache.

Introduced in 2.0

Fixes #3053."

* tag 'tgrabiec/fix-range-tombstone-slicing-v2' of github.com:scylladb/seastar-dev:
  tests: row_cache: Add reproducer for issue #3053
  tests: mvcc: Add test for partition_snapshot::range_tombstones()
  mvcc: Optimize partition_snapshot::range_tombstones() for single version case
  mvcc: Fix partition_snapshot::range_tombstones()
  tests: random_mutation_generator: Do not emit dummy entries at clustering row positions

(cherry picked from commit 051cbbc9af)
(cherry picked from commit be5127388d)

[tgrabiec: dropped mvcc_test change, because the file does not exist here]
2017-12-08 14:38:24 +01:00
Tomasz Grabiec
db9d502f82 tests: simple_schema: Add new_tombstone() helper
(cherry picked from commit 204ec9c673)
2017-12-08 14:18:42 +01:00
Amos Kong
cde39bffd0 dist/debian: add scylla-tools-core to depends list
Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <db39cbda0e08e501633556ab238d816e357ad327.1512646123.git.amos@scylladb.com>
(cherry picked from commit 8fd5d27508)
2017-12-07 13:42:24 +02:00
Amos Kong
0fbcc852a5 dist/redhat: add scylla-tools-core to requires list
Fixes #3051

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <f7013a4fbc241bb4429d855671fee4b845b255cd.1512646123.git.amos@scylladb.com>
(cherry picked from commit eb3b138ee2)
2017-12-07 13:42:17 +02:00
Duarte Nunes
16d5f68886 thrift/server: Handle exception within gate
The exception handling code inspects server state, which could be
destroyed before the handle_exception() task runs since it runs after
exiting the gate. Move the exception handling inside the gate and
avoid scheduling another accept if the server has been stopped.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171116122921.21273-1-duarte@scylladb.com>
(cherry picked from commit 34a0b85982)
2017-12-07 13:07:11 +02:00
Pekka Enberg
91540c8181 Update seastar submodule
* seastar 0dbedf0...66eb33f (1):
  > core/gate: Add is_closed() function
2017-12-07 13:06:39 +02:00
Raphael S. Carvalho
eaa8ed929f thrift: fix compilation error
thrift/server.cc:237:6:   required from here
thrift/server.cc:236:9: error: cannot call member function ‘void thrift_server::maybe_retry_accept(int, bool, std::__exception_ptr::exception_ptr)’ without object
         maybe_retry_accept(which, keepalive, std::move(ex));

gcc version: gcc (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20171113184537.10472-1-raphaelsc@scylladb.com>
(cherry picked from commit 564046a135)
2017-12-07 12:55:34 +02:00
Avi Kivity
5ba1621716 Merge "thrift/server: Ensure stop() waits for accepts" from Duarte
"Ensure stop() waits for the accept loop to complete to avoid crashes
during shutdown."

* 'thrift-server-stop/v4' of https://github.com/duarten/scylla:
  thrift/server: Restore code format
  thrift/server: Stopping the server waits for connection shutdown
  thrift/server: Abort listeners on stop()
  thrift/server: Avoid manual memory management
  thrift/server: Add move ctor for connection
  thrift/server: Extract retry logic
  thrift/server: Retry with backoff for some error types
  thrift/server: Retry accept in case of error

(cherry picked from commit 061f6830fa)
2017-12-07 12:54:45 +02:00
Avi Kivity
b4f515035a build: disable -fsanitize-address-use-after-scope on CqlParser.o
The parser generator somehow confuses the use-after-scope sanitizer, causing it
to use large amounts of stack space. Disable that sanitizer on that file.
Message-Id: <20170905110628.18047-1-avi@scylladb.com>

(cherry picked from commit 4751402709)
2017-12-04 12:41:05 +02:00
Avi Kivity
d55e3f6a7f build: fix excessive stack usage in CqlParser in debug mode
The state machines generated by antlr allocate many local variables per function.
In release mode, the stack space occupied by the variables is reused, but in debug
build, it is not, due to Address Sanitizer setting -fstack-reuse=none. This causes
a single function to take above 100k of stack space.

Fix by hacking the generated code to use just one variable.

Fixes #2546
Message-Id: <20170704135824.13225-1-avi@scylladb.com>

(cherry picked from commit a6d9cf09a7)
2017-12-04 12:39:27 +02:00
Shlomi Livne
07b039feab release: prepare for 2.0.2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-11-30 21:02:42 +02:00
Avi Kivity
35b7353efd Update seastar submodule
* seastar 0489655...0dbedf0 (1):
  > fstream: do not ignore dma_write return value
2017-11-30 17:36:36 +02:00
Duarte Nunes
200e01cc31 compound_compact: Change universal reference to const reference
The universal reference was introduced so we could bind an rvalue to
the argument, but it would have sufficed to make the argument a const
reference. This is also more consistent with the function's other
overload.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171129132758.19654-1-duarte@scylladb.com>
(cherry picked from commit cda3ddd146)
(cherry picked from commit 106c69ad45)
2017-11-30 16:51:16 +02:00
Tomasz Grabiec
b5c4cf2d87 Merge "compact_storage serialization fixes" from Duarte
Fix two issues with serializing non-compound range tombstones as
compound: convert a non-compound clustering element to compound and
actually advertise the issue to other nodes.

* git@github.com:duarten/scylla.git  rt-compact-fixes/v1:
  compound_compact: Allow rvalues in size()
  sstables/sstables: Convert non-compound clustering element to compound
  tests/sstable_mutation_test: Verify we can write/read non-correct RTs
  service/storage_service: Export non-compound RT feature

(cherry picked from commit e9cce59b85)
(cherry picked from commit 740fcc73b8)

Undid changes to size_estimates_virtual_reader
Changed test from memtable::make_flat_reader to memtable::make_reader
2017-11-30 16:50:15 +02:00
Duarte Nunes
f96cb361aa tests: Initialize storage service for some tests
These tests now require having the storage service initialized, which
is needed to decide whether correct non-compound range tombstones
should be emitted or not.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171126152921.5199-1-duarte@scylladb.com>
(cherry picked from commit 922f095f22)
(cherry picked from commit 8567723a7b)
2017-11-30 16:27:43 +02:00
Duarte Nunes
bd59d7c968 cql3/delete_statement: Allow non-range deletions on non-compound schemas
This patch fixes a regression introduced in
1c872e2ddc.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20171126102333.3736-1-duarte@scylladb.com>
(cherry picked from commit 15fbb8e1ca)
(cherry picked from commit b0b7c73acd)
2017-11-30 16:22:53 +02:00
Tomasz Grabiec
9d923a61e1 Merge "Fixes to sstable files for non-compound schemas" from Duarte
This series mainly fixes issues with the serialization of promoted
index entries for non-compound schemas and with the serialization of
range tombstones, also for non-compound schemas.

We lift the correct cell name writing code into its own function,
and direct all users to it. We also ensure backward compatibility with
incorrectly generated promoted indexes and range tombstones.

Fixes #2995
Fixes #2986
Fixes #2979
Fixes #2992
Fixes #2993

* git@github.com:duarten/scylla.git  promoted-index-serialization/v3:
  sstables/sstables: Unify column name writers
  sstables/sstables: Don't write index entry for a missing row maker
  sstables/sstables: Reuse write_range_tombstone() for row tombstones
  sstables/sstables: Lift index writing for row tombstones
  sstables/sstables: Leverage index code upon range tombstone consume
  sstables/sstables: Move out tombstone check in write_range_tombstone()
  sstables/sstables: A schema with static columns is always compound
  sstables/sstables: Lift column name writing logic
  sstables/sstables: Use schema-aware write_column_name() for
    collections
  sstables/sstables: Use schema-aware write_column_name() for row marker
  sstables/sstables: Use schema-aware write_column_name() for static row
  sstables/sstables: Writing promoted index entry leverages
    column_name_writer
  sstables/sstables: Add supported feature list to sstables
  sstables/sstables: Don't use incorrectly serialized promoted index
  cql3/single_column_primary_key_restrictions: Implement is_inclusive()
  cql3/delete_statement: Constrain range deletions for non-compound
    schemas
  tests/cql_query_test: Verify range deletion constraints
  sstables/sstables: Correctly deserialize range tombstones
  service/storage_service: Add feature for correct non-compound RTs
  tests/sstable_*: Start the storage service for some cases
  sstables/sstable_writer: Prepare to control range tombstone
    serialization
  sstables/sstables: Correctly serialize range tombstones
  tests/sstable_assertions: Fix monotonicity check for promoted indexes
  tests/sstable_assertions: Assert a promoted index is empty
  tests/sstable_mutation_test: Verify promoted index serializes
    correctly
  tests/sstable_mutation_test: Verify promoted index repeats tombstones
  tests/sstable_mutation_test: Ensure range tombstone serializes
    correctly
  tests/sstable_datafile_test: Add test for incorrect promoted index
  tests/sstable_datafile_test: Verify reading of incorrect range
    tombstones
  sstables/sstable: Rename schema-oblivious write_column_name() function
  sstables/sstables: No promoted index without clustering keys
  tests/sstable_mutation_test: Verify promoted index is not generated
  sstables/sstables: Optimize column name writing and indexing
  compound_compat: Don't assume compoundness

(cherry picked from commit bd1efbc25c)

Also added sstables::make_sstable() to preserve source compatibility in tests.
2017-11-30 16:21:13 +02:00
Tomasz Grabiec
0b23bcbe29 sstables: index_reader: Reset lower bound for promoted index lookups from advance_to_next_partition()
_current_pi_idx was not reset from advance_to_next_partition(), which
is used when we skip to the next partition before fully consuming
it. As a result, if we try to skip to a clustering position which is
before the index block used by the last skip in the previous
partition, we would not skip, assuming that the new position is in the
current block. This may result in more data being read from the
sstable than necessary.

Fixes #2984
Message-Id: <1510915793-20159-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 2113299b61)
2017-11-21 14:25:49 +01:00
Duarte Nunes
b1899f000a db/view: Use view schema for view pk operations
Instead of base schema.

Fixes #2504

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170718190703.12972-1-duarte@scylladb.com>
(cherry picked from commit 115ff1095e)
2017-11-15 20:41:24 +00:00
Tomasz Grabiec
7b19167cbd gossiper: Replicate endpoint_state::is_alive()
Broken in f570e41d18.

Not replicating this may cause the coordinator to treat a node which is
down as alive, or vice versa.

Fixes regression in dtest:

  consistency_test.py:TestAvailability.test_simple_strategy

which was expected to get "unavailable" exception but it was getting a
timeout.

Message-Id: <1510666967-1288-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 7323fe76db)
2017-11-14 16:21:33 +02:00
Shlomi Livne
164f97fd88 release: prepare for 2.0.1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-11-10 16:37:29 +02:00
Paweł Dziepak
b3bc82bc77 Merge "Fix exception safety related to range tombstones in cache" from Tomasz
Fixes #2938.

* 'tgrabiec/fix-range-tombstone-list-exception-safety-v1' of github.com:scylladb/seastar-dev:
  tests: range_tombstone_list: Add test for exception safety of apply()
  tests: Introduce range_tombstone_list assertions
  cache: Make range tombstone merging exception-safe
  range_tombstone_list: Introduce apply_monotonically()
  range_tombstone_list: Make reverter::erase() exception-safe
  range_tombstone_list: Fix memory leaks in case of bad_alloc
  mutation_partition: Fix abort in case range tombstone copying fails
  managed_bytes: Declare copy constructor as allocation point
  Integrate with allocation failure injection framework

(cherry picked from commit 5a4b46f555)
2017-11-10 13:10:26 +01:00
Tomasz Grabiec
8f6ffb0487 Update seastar submodule
* seastar 124467d...0489655 (9):
  > alloc_failure_injector: Fix compilation error with gcc 7.1
  > alloc_failure_injector: Replace set_alloc_failure_callback() with run_with_callback()
  > alloc_failure_injector: Log backtrace of failures
  > alloc_failure_injector: Extract fail()
  > util: Introduce support for allocation failure injection
  > noncopyable_function: improve support for capturing mutable lambdas
  > noncopyable_function add bool operator
  > test.py: fix typo in noncopyable_function_test
  > utils: introduce noncopyable_function
2017-11-10 13:10:26 +01:00
Raphael S. Carvalho
4bba0c403e compaction: Make resharding go through compaction manager
Two reasons for this change:
1) every compaction should be multiplexed to manager which in turn
will make decision when to schedule. improvements on it will
immediately benefit every existing compaction type.
2) active tasks metric will now track ongoing reshard jobs.

Fixes #2671.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170817224334.6402-1-raphaelsc@scylladb.com>
(cherry picked from commit 10eaa2339e)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20171103012758.19428-1-raphaelsc@scylladb.com>
2017-11-07 15:23:45 +02:00
Calle Wilund
59aae504ae storage_service: Only replicate token metadata iff modified in on_change
Fixes #2869

Message-Id: <20171101105629.22104-1-calle@scylladb.com>
(cherry picked from commit 8c257c40b4)
2017-11-07 14:46:36 +02:00
Avi Kivity
941e5eef4f Merge "gossip backport for 2.0" from Asias
"This series backports both the cleanup series from Duarte and large cluster
fixes series from Tomek."

* tag 'gossip-2.0-backport-v2' of github.com:scylladb/seastar-dev:
  Merge 'Solves problems related to gossip which can be observed in a large cluster' from Tomasz
  utils: introduce loading_shared_values
  tests/serialized_action: add missing forced defers
  utils: Introduce serialized_action
  Merge "gms/gossiper: Multiple cleanups" from Duarte
  storage_service: Do not use c_str() in the logger
  gms/gossiper: Introduce copy-less endpoint_state::get_application_state_ptr()
  Merge "gossiper: Optimize endpoint_state lookup" from Duarte
  gms: Add is_shutdown helper for endpoint_state class
  gossip: Better check for gossip stabilization on startup
  Merge "Fix miss opportunity to update gossiper features" from Asias
  gossip: Fix a log message typo in compare_endpoint_startup
  gossip: Do not use c_str() in the logger
  Revert "gossip: Make bootstrap more robust"
  gossip: Switch to seastar::lowres_system_clock
  gossip: Use unordered_map for _unreachable_endpoints and _shadow_unreachable_endpoints
  gossip: Introduce the shadow_round_ms option
2017-11-07 13:44:20 +02:00
Duarte Nunes
b12a2e6b08 Merge 'Solves problems related to gossip which can be observed in a large cluster' from Tomasz
"The main problem fixed is slow processing of application state changes.
This may lead to a bootstrapping node not having an up-to-date view of the
ring, and serving incorrect data.

Fixes #2855."

* tag 'tgrabiec/gossip-performance-v3' of github.com:scylladb/seastar-dev:
  gms/gossiper: Remove periodic replication of endpoint state map
  gossiper: Check for features in the change listener
  gms/gossiper: Replicate changes incrementally to other shards
  gms/gossiper: Document validity of endpoint_state properties
  storage_service: Update token_metadata after changing endpoint_state
  gms/gossiper: Process endpoints in parallel
  gms/gossiper: Serialize state changes and notifications for given node
  utils/loading_shared_values: Allow Loader to return non-future result
  gms/gossiper: Encapsulate lookup of endpoint_state
  storage_service: Batch token metadata and endpoint state replication
  utils/serialized_action: Introduce trigger_later()
  gossiper: Add and improve logging
  gms/gossiper: Don't fire change listeners when there is no change
  gms/gossiper: Allow parallel apply_state_locally()
  gms/gossiper: Avoid copies in endpoint_state::add_application_state()
  gms/failure_detector: Ignore short update intervals

(cherry picked from commit 044b8deae4)
2017-11-07 19:21:30 +08:00
Vlad Zolotarov
eb34937ff6 utils: introduce loading_shared_values
This class implements a key-value container that is populated
using the provided asynchronous callback.

The value is loaded when there are active references to the value for the given key.

The container ensures that only one entry is loaded per key at any given time.

The returned value is a lw_shared_ptr to the actual value.

The value for a specific key is immediately evicted when there are no
more references to it.

The container is based on the boost::intrusive::unordered_set and is rehashed (grown) if needed
every time a new value is added (asynchronously loaded).

The container has a rehash() method that would grow or shrink the container as needed
in order to get the load factor into the [0.25, 0.75] range.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
(cherry picked from commit ec3fed5c4d)
2017-11-07 19:21:29 +08:00
Paweł Dziepak
127ffff9e5 tests/serialized_action: add missing forced defers
serialized_action_tests depends on the fact that first part of the
serialized_action is executed at certain points (in which it reads a
global variable that is later updated by the main thread).
This worked well in release mode, before ready continuations were
inlined and run immediately, but not in debug mode, since inlining
was not happening and the main seastar::thread was missing some yield
points.
Message-Id: <20170731103013.26542-1-pdziepak@scylladb.com>

(cherry picked from commit e970630272)
2017-11-07 19:21:29 +08:00
Tomasz Grabiec
f06bb656a4 utils: Introduce serialized_action
(cherry picked from commit 6a3703944b)

Conflicts:
	configure.py
2017-11-07 19:21:29 +08:00
Pekka Enberg
cc84cd60c5 Merge "gms/gossiper: Multiple cleanups" from Duarte
"Based on the functions get_endpoint_state_for_endpoint_ptr(),
get_application_state_ptr() and
endpoint_state::get_application_state_ptr(), this series
cleans up miscellaneous functions related to the gossiper.

It not only removes duplicated code, but also avoids many copies.

All pointer usages have been audited for safety."

Acked-by: Asias He <asias@scylladb.com>
Acked-by: Tomasz Grabiec <tgrabiec@scylladb.com>

* 'gossiper-cleanup/v2' of github.com:duarten/scylla: (27 commits)
  gms/endpoint_state: Remove get_application_state()
  service/storage_service: Avoid copies in prepare_replacement_info()
  service/storage_service: Cleanup get_application_state_value()
  service/storage_service: Cleanup handle_state_removing()
  service/storage_service: Cleanup get_rpc_address()
  locator/reconnectable_snitch_helper: Avoid versioned_value copies
  locator/production_snitch_base: Cleanup get_endpoint_info()
  service/migration_manager: Avoid copies in is_ready_for_bootstrap()
  service/migration_manager: Cleanup has_compatible_schema_tables_version()
  service/migration_manager: Fix usages of get_application_state()
  cache_hit_rate: Avoid copies in get_hit_rate()
  gms/endpoint_state: Avoid copies in is_shutdown()
  service/load_broadcaster: Avoid copy in on_join()
  gms/gossiper: Cleanup get_supported_features()
  gms/gossiper: Cleanup get_gossip_status()
  gms/gossiper: Cleanup seen_any_seed()
  gms/gossiper: Cleanup get_host_id()
  gms/gossiper: Removed dead uses_vnodes() function
  gms/gossiper: Cleanup uses_host_id()
  gms/gossiper: Add get_application_state_ptr()
  ...

(cherry picked from commit 1701fc2e50)
2017-11-07 19:21:29 +08:00
Asias He
bb6ee1e4b1 storage_service: Do not use c_str() in the logger
Use logger.info("{}", msg) instead.

Message-Id: <d2f15007a54554b58e29fd05331c06ae030d582f.1504832296.git.asias@scylladb.com>
(cherry picked from commit bb9dbc5ade)
2017-11-07 19:21:28 +08:00
Tomasz Grabiec
ce72299a38 gms/gossiper: Introduce copy-less endpoint_state::get_application_state_ptr()
Message-Id: <1507642411-28680-3-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 66a15ccd18)
2017-11-07 19:21:28 +08:00
Avi Kivity
ef39bc4216 Merge "gossiper: Optimize endpoint_state lookup" from Duarte
"gossiper::get_endpoint_state_for_endpoint() returns a copy of
endpoint_state, which we've seen can be very expensive. This
series introduces a function that returns a pointer and avoids
the copy.

Fixes #764"

* 'endpoint-state/v2' of https://github.com/duarten/scylla:
  gossiper: Avoid endpoint_state copies
  endpoint_state: const-qualify functions
  storage_service: Remove duplicate endpoint state check

(cherry picked from commit 4ad3900d8d)
2017-11-07 19:21:28 +08:00
Asias He
fb03f9a73c gms: Add is_shutdown helper for endpoint_state class
It will be used by streaming manager to check if a node is in shutdown
status.

(cherry picked from commit ed7e6974d5)
2017-11-07 19:21:27 +08:00
Asias He
025af9d297 gossip: Better check for gossip stabilization on startup
This is a backport of Apache CASSANDRA-9401
(2b1e6aba405002ce86d5badf4223de9751bf867d)

It is better to check that the number of nodes in the endpoint_state_map
is not changing when waiting for gossip stabilization.

Fixes #2853
Message-Id: <e9f901ac9cadf5935c9c473433dd93e9d02cb748.1506666004.git.asias@scylladb.com>

(cherry picked from commit c0b965ee56)
2017-11-07 19:21:27 +08:00
Tomasz Grabiec
09da4f3d08 Merge "Fix miss opportunity to update gossiper features" from Asias
The gossiper checks if features should be enabled from its timer
callback when it detects that endpoint_state_map changed, that is
different than shadow_endpoint_state_map.

shadow_endpoint_state_map is also assigned from endpoint_state_map in
storage_service::replicate_tm_and_ep_map(), called from
storage_service::on_change()

Call gossiper:maybe_enable_features() in replicate_tm_and_ep_map so
that we won't miss gossip feature update.

Fixes #2824

* git@github.com:scylladb/seastar-dev asias/gossip_miss_feature_update_v1:
  gossip: Move the _features_condvar signal code to
    maybe_enable_features
  gossip: Make maybe_enable_features public
  storage_service: Check gossip feature update in
    replicate_tm_and_ep_map

(cherry picked from commit 02d41864af)
2017-11-07 19:21:27 +08:00
Asias He
0d22b0c949 gossip: Fix a log message typo in compare_endpoint_startup
Message-Id: <c4958950e1108082b63e08ab81ee2177edc9b232.1505286843.git.asias@scylladb.com>
(cherry picked from commit fa9d47c7f3)
2017-11-07 19:21:27 +08:00
Asias He
5c5296a683 gossip: Do not use c_str() in the logger
Use logger.info("{}", msg) instead.

Message-Id: <52c24d7dfe082ee926f065a6268d83fcb31ddc28.1504832289.git.asias@scylladb.com>
(cherry picked from commit 57dd3cb2c5)
2017-11-07 19:21:27 +08:00
Asias He
5ecb07bbc4 Revert "gossip: Make bootstrap more robust"
This reverts commit b56ba02335.

After commit 8fa35d6ddf (messaging_service: Get rid of timeout and retry
logic for streaming verb), streaming verb in rpc does not check if a
node is in gossip membership since all the retry logic is removed.

Remove the extra wait before removing the joining node from gossip
membership.

Message-Id: <a416a735bb8aad533bbee190e3324e6b16799415.1504063598.git.asias@scylladb.com>
(cherry picked from commit cc18da5640)
2017-11-07 19:21:26 +08:00
Asias He
a90023a119 gossip: Switch to seastar::lowres_system_clock
The newly added lowres_system_clock is good enough for gossip
resolution. Switch to use it.

Message-Id: <fe0e7a9ef1ea0caffaa8364afe5c78b6988613bf.1503971833.git.asias@scylladb.com>
(cherry picked from commit a36141843a)
2017-11-07 19:21:26 +08:00
Asias He
258f0a383b gossip: Use unordered_map for _unreachable_endpoints and _shadow_unreachable_endpoints
The _unreachable_endpoints map will soon be accessed in the fast path by the
hinted handoff code.

Message-Id: <500d9cbb2117ab7b070fd1bd111c5590f46c3c3a.1503971826.git.asias@scylladb.com>
(cherry picked from commit 2701bfd1f8)
2017-11-07 19:21:26 +08:00
Asias He
3455bfaa44 gossip: Introduce the shadow_round_ms option
It specifies the maximum gossip shadow round time. It can be used to
reduce the gossip feature check time during node boot up.
For instance, when the first node in the cluster, which lists both
itself and other nodes as seeds in the yaml config, boots up, it will try
to talk to other seed nodes which have not started yet. The gossip shadow
round is used to fetch the feature info of the cluster. Since no other
seed node is up yet, the shadow round will fail. Users can lower the
default shadow_round_ms option to reduce the boot time.

Fixes #2615
Message-Id: <10916ce9059f3c7f1a1fb465919ae57de3b67d59.1500540297.git.asias@scylladb.com>

(cherry picked from commit cf6f4a5185)
2017-11-07 19:21:26 +08:00
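As a hedged illustration of the option introduced above, a reduced shadow round timeout would be set in scylla.yaml along these lines (the value shown is an arbitrary example, not the shipped default):

```yaml
# scylla.yaml -- illustrative example only; check your release for the
# actual default. Maximum time, in milliseconds, that the gossip shadow
# round may take during node boot before giving up.
shadow_round_ms: 5000
```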
Avi Kivity
cd0b4903e9 Update seastar submodule
* seastar 7ebbb26...124467d (4):
  > peering_sharded_service: prevent over-run the container
  > sharded: fix move constructor for peering_sharded_service services
  > sharded: improve support for cooperating sharded<> services
  > sharded: support for peer services

Includes change to batchlog_manager constructor to adapt it to
seastar::sharded::start() change.

Needed for gossip backport.
2017-11-07 10:49:01 +02:00
Avi Kivity
4d76f564f8 Update seastar submodule
* seastar e4fcb6c...7ebbb26:
  Warn: seastar doesn't contain commit e4fcb6c27cc5dce70d44472522166abe1af29af6

The submodule link pointed nowhere; point it at the tip of scylla-seastar/branch-2.0.
2017-11-06 13:15:16 +02:00
Tomasz Grabiec
a05d3280c4 Update seastar submodule
* seastar b85b0fa...e4fcb6c (1):
  > log: Print nested exceptions

Refs #1011
2017-11-03 10:22:46 +01:00
Glauber Costa
68c41b2346 dist/redhat: do not use s3 addresses in mock profile
They are uglier and less future-proof.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20171019142034.30139-1-glauber@scylladb.com>
2017-10-19 18:11:59 +03:00
Glauber Costa
9c907222c5 dist: point mock profile to the right stream
2.0 builds are currently failing because of that.

Message-Id: <20171018182457.5180-1-glauber@scylladb.com>
2017-10-19 11:37:31 +03:00
Tomasz Grabiec
ba02e2688a cache_streamed_mutation: Read static row with cache region locked
_snp->static_row() allocates and needs reference stability.
Message-Id: <1507555031-11567-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 44faaafc29)

Fixes #2880.
2017-10-10 15:55:08 +02:00
Avi Kivity
fa540581e8 Update ami submodule
* dist/ami/files/scylla-ami 5ffa449...be90a3f (1):
  > amazon kernel: enable updates

Still tracking master branch.
2017-10-02 17:11:30 +03:00
Shlomi Livne
e265c91616 release: prepare for 2.0.0
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-29 21:12:18 +03:00
Tomasz Grabiec
8a9f8970e4 Update seastar submodule
Refs #2770.

* seastar d763623...b85b0fa (1):
  > scollectd: increment the metadata iterator with the values
2017-09-28 15:32:26 +02:00
Tomasz Grabiec
1e3c777a10 Update seastar submodule
* seastar c853473...d763623 (1):
  > rpc: make sure that _write_buf stream is always properly closed
2017-09-28 15:03:12 +02:00
Tomasz Grabiec
43d785a177 migration_manager: Make sure schema pulls eventually happen when schema_tables_v3 is enabled
We don't pull schema during a rolling upgrade, that is, until the
schema_tables_v3 feature is enabled on all nodes.

Because features are enabled from the gossiper timer, there is a race
between feature enablement and the processing of endpoint states which
may trigger a schema pull. It can happen that we first try to pull, but
only later enable the feature. In that case the schema pull will not
happen until the next schema change.

The fix is to ensure that pulls abandoned because the feature was not yet
enabled are retried once it is enabled.

Fixes sporadic failure in dtest:

  repair_additional_test.py:RepairAdditionalTest.repair_schema_test
Message-Id: <1506428715-8182-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit b704710954)
2017-09-27 12:06:45 +01:00
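The fix described above — remembering an abandoned pull and retrying it once the feature turns on — can be sketched in Python (a toy model; the `FeatureGate` name and API are illustrative, not Scylla's actual code):

```python
class FeatureGate:
    """Toy model of a gossip feature flag that re-runs work which was
    abandoned because the feature was not yet enabled cluster-wide."""

    def __init__(self):
        self.enabled = False
        self._pending = []            # actions abandoned while disabled

    def run_or_defer(self, action):
        if self.enabled:
            action()
        else:
            self._pending.append(action)   # remember it; don't drop it

    def enable(self):
        self.enabled = True
        pending, self._pending = self._pending, []
        for action in pending:             # retry every abandoned pull
            action()

pulls = []
gate = FeatureGate()
gate.run_or_defer(lambda: pulls.append("schema pull"))  # deferred for now
assert pulls == []
gate.enable()                                           # retried here
assert pulls == ["schema pull"]
```

Without the pending list, a pull attempted before `enable()` would simply be lost until the next schema change — the race the commit describes.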
Tomasz Grabiec
b53d3d225d gossiper: Allow waiting for feature to be enabled
Message-Id: <1506428715-8182-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 7a58fb5767)
2017-09-27 12:06:37 +01:00
Paweł Dziepak
f9864686d2 Merge "Fix cache reader skipping rows in some cases" from Tomasz
"Fixes the problem of concurrent populations of clustering row ranges
leading to some readers skipping over some of the rows.
Spotted during code review.

Fixes #2834."

* tag 'tgrabiec/fix-cache-reader-skipping-rows-v2' of github.com:scylladb/seastar-dev:
  tests: mvcc: Add test for partition_snapshot_row_cursor
  tests: row_cache: Add test for concurrent population
  tests: row_cache: Make populate_range() accept partition_range
  tests: Add simple_schema::make_ckey_range()
  cache_streamed_mutation: Add missing _next_row.maybe_refresh() call
  mvcc: partition_snapshot_row_cursor: Fix cursor skipping over rows added after its position
  mvcc: partition_snapshot_row_cursor: Rename up_to_date() to iterators_valid()
  mvcc: Keep track of all iterators in partition_snapshot_row_cursor
  mvcc: Make partition_snapshot_row_cursor printable

(cherry picked from commit af1976bc30)

[tgrabiec: resolved conflicts]
2017-09-26 19:18:29 +02:00
Tomasz Grabiec
454b90980a streamed_mutation: Allow setting buffer capacity
Needed in tests to limit amount of prefetching done by readers, so
that it's easier to test interleaving of various events.

(cherry picked from commit cb16b038ef)
2017-09-26 19:18:29 +02:00
Tomasz Grabiec
13c66b7145 Update seastar submodule
Fixes #2738.

* seastar e380a07...c853473 (1):
  > httpd: handle exception when shutting down
2017-09-26 18:36:50 +02:00
Asias He
df04418fa4 gossip: Print SCHEMA_TABLES_VERSION correctly
Found this when debugging gossip with debug print. The application state
SCHEMA_TABLES_VERSION was printed as UNKNOWN.
Message-Id: <d7616920d2e6516b5470a758bcf9c88f3d857381.1506391495.git.asias@scylladb.com>

(cherry picked from commit 98e9049820)
2017-09-26 08:39:30 +02:00
Tomasz Grabiec
6e2858a47d storage_service: Register features before joining
Since commit 8378fe190, we disable schema sync in a mixed cluster.
The detection is done using gossiper features. We need to make sure
the features are registered, and thus can be enabled, before the
bootstrapping of a non-seed node happens. Otherwise the bootstrap will
hang waiting on a schema sync which will never happen.
Message-Id: <1505893837-27876-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 8e46d15f91)
2017-09-25 09:40:22 +01:00
Tomasz Grabiec
0f79503cf1 storage_service: Extract register_features()
Message-Id: <1505893837-27876-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit b92dcb0284)
2017-09-25 09:40:22 +01:00
Tomasz Grabiec
056f6df859 Update seastar submodule
* seastar 06790c0...e380a07 (1):
  > configure: disable exception scalability hack on debug build
2017-09-25 10:13:01 +02:00
Avi Kivity
7833129ab4 Merge "row_cache: Call fast_forward_to() outside allocating section" from Tomasz
"On bad_alloc the section is retried. If the exception happened inside
fast_forward_to() on the underlying reader, that call will be
retried. However, the reader should not be used after exception is
thrown, since it is in unspecified state. Also, calling
fast_forward_to() with cache region locked increases the chances of it
failing to allocate.

We shouldn't call fast_forward_to() with the cache region locked.

Fixes #2791."

* 'tgrabiec/dont-ffwd-in-alloc-section' of github.com:scylladb/seastar-dev:
  cache_streamed_mutation: De-futurize cursor movement
  cache_streamed_mutation: Call fast_forward_to() outside allocating section
  cache_streamed_mutation: Switch from flags to explicit state machine

(cherry picked from commit 5b0cb28af9)

[tgrabiec: resolved minor conflicts]
2017-09-20 10:23:08 +02:00
Asias He
c3c5ec1d4a gossip: Fix indentation in apply_state_locally
Message-Id: <2bdefa8d982ad8da7452b41e894f41d865b83b0b.1505356245.git.asias@scylladb.com>
(cherry picked from commit 5ff0b113c9)
2017-09-19 22:59:46 +08:00
Asias He
7839cebc6c gossip: Use boost::copy_range in apply_state_locally
boost::copy_range is better because the vector is allocated with the
correct size instead of growing when the inserter is called.

[avi: also crashes less]

Message-Id: <b19ca92d56ad070fca1e848daa67c00c024e3a4d.1505291199.git.asias@scylladb.com>
(cherry picked from commit c84dcabb8f)
2017-09-19 22:59:46 +08:00
Pekka Enberg
e428d06f40 Merge "gossip: optimize apply_state_locally for large cluster" from Asias
"This series tries to improve the bootstrap of a node in a large cluster by
improving how gossip applies node state. In #2404, the joining node
failed to bootstrap because it did not see the seed node when
storage_service::bootstrap ran. After this series, we apply the whole gossip
state contained in one gossip ack/ack2 message before applying the next one,
and we apply the state of seed nodes earlier than that of non-seed nodes so
we have the seed nodes' state faster. We also add some randomness to the
order of applying gossip node state to prevent some nodes' state from always
being applied earlier than the others.

This series improves apply_state_locally for large cluster:

 - Tune the order of applying endpoint_state
 - Serialize apply_state_locally
 - Avoid copying of the gossip state map

Fixes #2404"

* tag 'asias/gossip_issue_2404_v2' of github.com:scylladb/seastar-dev:
  gossip: Avoid copying with apply_state_locally
  gossip: Serialize apply_state_locally
  gossip: Tune the order of applying endpoint_state in apply_state_locally
  gossip: Introduce is_seed helper
  gossip: Pass const endpoint_state& in notify_failure_detector
  gossip: Pass reference in notify_failure_detector

(cherry picked from commit d2632ddf1d)
2017-09-19 22:59:45 +08:00
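The ordering tweak described in the merge above can be sketched in Python (illustrative only; the real code is C++ inside the gossiper): seed nodes' states are applied first, with randomness inside each group so no node's state is consistently applied last.

```python
import random

def apply_order(endpoints, seeds):
    """Return endpoints with seeds first; shuffle within each group so
    no particular node's state is always applied before the others."""
    seed_part = [ep for ep in endpoints if ep in seeds]
    other_part = [ep for ep in endpoints if ep not in seeds]
    random.shuffle(seed_part)
    random.shuffle(other_part)
    return seed_part + other_part

order = apply_order(["a", "b", "c", "d"], seeds={"c"})
assert order[0] == "c"                       # the only seed comes first
assert sorted(order) == ["a", "b", "c", "d"] # nothing lost or duplicated
```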
Asias He
6834ba16a3 gossip: Do not wait for echo message in mark_alive
gossiper::apply_state_locally() calls handle_major_state_change() for
each endpoint, in a seastar thread, which calls mark_alive() for new
nodes, which calls ms().send_gossip_echo(id).get(). So it synchronously
waits for each node to respond before it moves on to the next entry. As
a result it may take a while before the whole state is processed.

Apache (tm) Cassandra (tm) sends echoes in the background.

In a large cluster, we see that by the time the joining node starts
streaming, it hasn't managed to apply all the endpoint_state for peer
nodes, so the joining node does not know some of the nodes yet, which
results in the joining node skipping streaming from some of the existing
nodes.

Fixes #2787
Fixes #2797

Message-Id: <3760da2bef1a83f1b6a27702a67ca4170e74b92c.1505719669.git.asias@scylladb.com>
(cherry picked from commit 8f8273969d)
2017-09-19 17:12:26 +03:00
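The change above — not blocking the state-application loop on each echo round-trip — can be sketched with plain Python futures (a toy model, not seastar code; `send_echo` is a stand-in for the real RPC):

```python
from concurrent.futures import ThreadPoolExecutor
import time

def send_echo(node):
    time.sleep(0.05)          # stand-in for a network round trip
    return node

def mark_alive_all(nodes, pool):
    """Dispatch all echoes in the background instead of .get()-ing
    each one before moving on to the next endpoint."""
    return [pool.submit(send_echo, n) for n in nodes]

with ThreadPoolExecutor() as pool:
    start = time.monotonic()
    futures = mark_alive_all(["n1", "n2", "n3", "n4"], pool)
    dispatch_time = time.monotonic() - start   # returns almost immediately
    results = [f.result() for f in futures]

assert dispatch_time < 0.05                    # loop did not wait per node
assert results == ["n1", "n2", "n3", "n4"]
```

The synchronous version would take at least `0.05 * len(nodes)` seconds before the loop finished; dispatching in the background lets state application proceed while the echoes are in flight.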
Shlomi Livne
0b49cfcf12 release: prepare for 2.0.rc5
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-17 17:05:53 +03:00
Duarte Nunes
5e8c9a369e Merge 'Fix schema version mismatch during rolling upgrade from 1.7' from Tomasz
"When there are at least 2 nodes upgraded to 2.0, and the two exchanged schema
for some reason, reads or writes which involve both 1.7 and 2.0 nodes may
start to fail with the following error logged:

    storage_proxy - Exception when communicating with 127.0.0.3: Failed to load schema version 58fc9b89-74ab-37ca-8640-8b38a1204f8d

The situation should heal after whole cluster is upgraded.

Table schema versions are calculated by 2.0 nodes differently than 1.7 nodes
due to change in the schema tables format. Mismatch is meant to be avoided by
having 2.0 nodes calculate the old digest on schema migration during upgrade,
and use that version until next time the table is altered. It is thus not
allowed to alter tables during the rolling upgrade.

Two 2.0 nodes may exchange schema, if they detect through gossip that their
schema versions don't match. They may not match temporarily during boot, until
the upgraded node completes the bootstrap and propagates its new schema
through gossip. One source of such temporary mismatch is construction of new
tracing tables, which didn't exist on 1.7. Such schema pull will result in a
schema merge, which causes all tables to be altered and their schema version to
be recalculated. The new schema will not match the one used by 1.7 nodes,
causing reads and writes to fail, because schema requesting won't work during
rolling upgrade from 1.7 to 2.0.

The main fix employed here is to hold schema pulls, even among 2.0 nodes,
until rolling upgrade is complete."

Fixes #2802.

* 'tgrabiec/fix-schema-mismatch' of github.com:scylladb/seastar-dev:
  tests: schema_change_test: Add test_merging_does_not_alter_tables_which_didnt_change test case
  tests: cql_test_env: Enable all features in tests
  schema_tables: Make make_scylla_tables_mutation() visible
  migration_manager: Disable pulls during rolling upgrade from 1.7
  storage_service: Introduce SCHEMA_TABLES_V3 feature
  schema_tables: Don't alter tables which differ only in version
  schema_mutations: Use mutation_opt instead of stdx::optional<mutation>

(cherry picked from commit 8378fe190a)
2017-09-15 12:07:56 +02:00
Avi Kivity
8567762339 Merge "Refuse to load non-Scylla counter sstables" from Paweł
"These patches make Scylla refuse to load counter sstables that may
contain unsupported counter shards. They are recognised by the lack of
the Scylla component.

Fixes #2766."

* tag 'reject-non-scylla-counter-sstables/v1' of https://github.com/pdziepak/scylla:
  db: reject non-Scylla counter sstables in flush_upload_dir
  db: disallow loading non-Scylla counter sstables
  sstable: add has_scylla_component()

(cherry picked from commit fe019ad84d)
2017-09-11 13:29:32 +03:00
Avi Kivity
f698496ab2 Merge "Fix Scylla upgrades when counters are used" from Paweł
"Scylla 1.7.4 and older use incorrect ordering of counter shards, this
was fixed in 0d87f3dd7d ("utils::UUID:
operator< should behave as comparison of hex strings/bytes"). However,
that patch was not backported to 1.7 branch until very recently. This
means that versions 1.7.4 and older emit counter shards in an incorrect
order and expect them to be so. This is particularly bad when dealing
with imported correct sstables in which case some shards may become
duplicated.

The solution implemented in this patch is to allow any order of counter
shards and automaticly merge all duplicates. The code is written in a
way so that the correct ordering is expected in the fast path in order
not to excessively punish unaffected deployments.

A new feature flag CORRECT_COUNTER_ORDER is introduced to allow seamless
upgrade from 1.7.4 to later Scylla versions. If that feature is not
available Scylla still writes sstables and sends on-wire counters using
the old ordering so that it can be correctly understood by 1.7.4, once
the flag becomes available Scylla switches to the correct order.

Fixes #2752."

* tag 'fix-upgrade-with-counters/v2' of https://github.com/pdziepak/scylla:
  tests/counter: verify counter_id ordering
  counter: check that utils::UUID uses int64_t
  mutation_partition_serializer: use old counter ordering if necessary
  mutation_partition_view: do not expect counter shards to be sorted
  sstables: write counter shards in the order expected by the cluster
  tests/sstables: add storage_service_for_tests to counter write test
  tests/sstables: add test for reading wrong-order counter cells
  sstables: do not expect counter shards to be sorted
  storage_service: introduce CORRECT_COUNTER_ORDER feature
  tests/counter: test 1.7.4 compatible shard ordering
  counters: add helper for retrieving shards in 1.7.4 order
  tests/counter: add tests for 1.7.4 counter shard order
  counters: add counter id comparator compatible with Scylla 1.7.4
  tests/counter: verify order of counter shards
  tests/counter: add test for sorting and deduplicating shards
  counters: add function for sorting and deduplicating counter cells
  counters: add counter_id::operator>

(cherry picked from commit 31706ba989)
2017-09-05 14:25:36 +03:00
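The "counters: add function for sorting and deduplicating counter cells" patch listed above can be sketched in Python (a toy model; real counter shards are C++ structs, and the merge rule shown — highest logical clock wins — is an illustrative simplification):

```python
def sort_and_dedupe_shards(shards):
    """Sort counter shards by id and merge duplicates, keeping the
    copy with the highest logical clock (illustrative semantics)."""
    best = {}
    for shard_id, clock, value in shards:
        if shard_id not in best or clock > best[shard_id][0]:
            best[shard_id] = (clock, value)
    return [(sid, c, v) for sid, (c, v) in sorted(best.items())]

# A duplicated shard id 'a', as produced by the 1.7.4 ordering bug:
# the higher-clock copy wins and the shards come out sorted by id.
shards = [("b", 1, 10), ("a", 2, 7), ("a", 1, 5)]
assert sort_and_dedupe_shards(shards) == [("a", 2, 7), ("b", 1, 10)]
```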
Shlomi Livne
6e6de348ea release: prepare for 2.0.rc4
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-09-03 14:58:01 +03:00
Avi Kivity
117db58531 Update AMI submodule
* dist/ami/files/scylla-ami b41e5eb...5ffa449 (3):
  > amzn-main.repo: stick to Amazon Linux 2017.03 kernel (4.9.x)
  > Prevent dependency error on 'yum update'
  > scylla_create_devices: don't raise error when no disks found

Fixes #2751.

Still tracking master branch.
2017-08-31 15:15:42 +03:00
Vlad Zolotarov
086f8b7af2 service::storage_service: initialize auth and tracing after we joined the ring
Initialize the system_auth and system_traces keyspaces and their tables after
the node joins the token ring, because as part of system_auth initialization
SELECT and possibly INSERT CQL statements are going to be issued.

This patch effectively reverts the d3b8b67 patch and brings the initialization order
to how it was before that patch.

Fixes #2273

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1500417217-16677-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit e98adb13d5)
2017-08-30 09:33:05 +02:00
Avi Kivity
bee9fbe3fc Merge "Fix sstable reader not working for empty set of clustering ranges" from Tomasz
"Fixes #2734."

* 'tgrabiec/make-sstable-reader-work-with-empty-range-set' of github.com:scylladb/seastar-dev:
  tests: Introduce clustering_ranges_walker_test
  tests: simple_schema: Add missing include
  sstables: reader: Make clustering_ranges_walker work with empty range set
  clustering_ranges_walker: Make adjacency more accurate

(cherry picked from commit 5224ab9c92)
2017-08-29 15:54:58 +02:00
Tomer Sandler
b307a36f1e node_health_check: Various updates
- Removed text from Report's "PURPOSE" section, which was referring to the "MANUAL CHECK LIST" (not needed anymore).
- Removed curl command (no longer using the api_address), instead using scylla --version
- Added -v flag in iptables command, for more verbosity
- Added support for OEL (Oracle Enterprise Linux) - minor fix
- Some text changes - minor
- OEL support indentation fix + collecting all files under /etc/scylla
- Added line separation under cp output message

Signed-off-by: Tomer Sandler <tomer@scylladb.com>
Message-Id: <20170828131429.4212-1-tomer@scylladb.com>
(cherry picked from commit f1eb6a8de3)
2017-08-29 15:17:16 +03:00
Tomer Sandler
527e12c432 node_health_check: added line separation under cp output message
Signed-off-by: Tomer Sandler <tomer@scylladb.com>
Message-Id: <20170828124307.2564-1-tomer@scylladb.com>
(cherry picked from commit 83f249c15d)
2017-08-29 15:17:07 +03:00
Shlomi Livne
c57cc55aa6 release: prepare for 2.0.rc3
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-08-28 14:57:54 +03:00
Avi Kivity
5d3c015d27 Merge "Fixes for skipping in sstable reader" from Tomasz
Ref #2733.

* 'tgrabiec/fix-fast-forwarding' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Add more tests for fast forwarding across partitions
  sstables: Fix abort in mutation reader for certain skip pattern
  sstables: Fix reader returning partition past the query range in some cases
  sstables: Introduce data_consume_context::eof()

(cherry picked from commit 4e67bc9573)
2017-08-28 12:48:50 +03:00
Avi Kivity
428831b16a Merge "consider the pre-existing cpuset.conf when configuring networking mode" from Vlad
"Preserve the networking configuration mode during the upgrade by generating the /etc/scylla.d/perftune.yaml
file and using it."

Fixes #2725.

* 'dist_respect_cpuset_conf-v3' of https://github.com/vladzcloudius/scylla:
  scylla_prepare: respect the cpuset.conf when configuring the networking
  scylla_cpuset_setup: rm perftune.yaml
  scylla_cpuset_setup: add a missing "include" of scylla_lib.sh

(cherry picked from commit 40aeb00151)
2017-08-24 18:59:37 +03:00
Paweł Dziepak
918339cf2e mvcc: allow invoking maybe_merge_versions() inside allocating section
Message-Id: <20170823083544.4225-1-pdziepak@scylladb.com>
(cherry picked from commit 1006a946e8)
2017-08-24 14:31:00 +02:00
Paweł Dziepak
6c846632e4 abstract_read_executor: make make_requests() exception safe
Message-Id: <20170821162934.25386-5-pdziepak@scylladb.com>
(cherry picked from commit 9d82a1ebfd)
2017-08-24 14:29:32 +02:00
Paweł Dziepak
af7b7f1eff shared_index_lists: restore indentation
Message-Id: <20170821162934.25386-4-pdziepak@scylladb.com>
(cherry picked from commit 31afc2f242)
2017-08-24 14:28:49 +02:00
Paweł Dziepak
701128f8a1 sstables: make shared_index_lists::get_or_load exception safe
Message-Id: <20170821162934.25386-3-pdziepak@scylladb.com>
(cherry picked from commit 93eaa95378)
2017-08-24 14:28:49 +02:00
Avi Kivity
c03118fbe9 Update seastar submodule
* seastar 2993cae...06790c0 (3):
  > scripts: posix_net_conf.sh: allow passing a perftune.py configuration file as a parameter
  > scripts: perftune.py: add the possibility to pass the parameters in a configuration file and print the YAML file with the current configuration
  > scripts: perftune.py: actually use the number of Rx queues when comparing to the number of CPU threads
2017-08-24 11:37:38 +03:00
Piotr Jastrzebski
a98e3aec45 Make streamed_mutation more exception safe
Make sure that push_mutation_fragment leaves
_buffer_size with a correct value if an exception
is thrown from emplace_back.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <83398412aa78332d88d91336b79140aecc988602.1503474403.git.piotr@scylladb.com>
(cherry picked from commit 477068d2c3)
2017-08-23 19:10:49 +03:00
Avi Kivity
29baf7966c Merge "repair: Do not allow repair until node is in NORMAL status" from Asias
Fixes #2723.

* tag 'asias/repair_issue_2723_v1' of github.com:cloudius-systems/seastar-dev:
  repair: Do not allow repair until node is in NORMAL status
  gossip: Add is_normal helper

(cherry picked from commit 2f41ed8493)
2017-08-23 09:45:34 +03:00
Amnon Heiman
ba63f74d7e Add configuration to disable per keyspace and column family metrics
The number of keyspace and column family metrics reported is
proportional to the number of shards times the number of keyspace/column
families.

This can cause a performance issue both on the reporting system and on
the collecting system.

This patch adds a configuration flag (set to false by default) to enable
or disable those metrics.

Fixes #2701

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170821113843.1036-1-amnon@scylladb.com>
(cherry picked from commit abbd78367c)
2017-08-22 19:20:50 +03:00
Alexys Jacob
1733f092ef dist: Fix Gentoo Linux scylla-jmx and scylla-tools packages detection
These two admin related packages will be packaged under the "app-admin"
category and not the "dev-db" one.

This fixes the detection path of the packages for scylla_setup.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170817094756.21550-1-ultrabug@gentoo.org>
(cherry picked from commit e5ff8efea3)
2017-08-17 15:44:02 +03:00
Paweł Dziepak
179ff956ee sstables: initialise index metrics on all shards
Fixes #2702.

Message-Id: <20170816085454.21554-1-pdziepak@scylladb.com>
(cherry picked from commit 784dcbf1ca)
2017-08-16 15:44:58 +03:00
Avi Kivity
61c2e8c7e2 Update seastar submodule
* seastar d67c344...2993cae (1):
  > fstream: do not ignore unresolved future

Fixes #2697.
2017-08-16 15:11:10 +03:00
Shlomi Livne
8370e1bc2c release: prepare for 2.0.rc2
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-08-15 15:52:35 +03:00
Pekka Enberg
3a244a4734 docker: Switch to Scylla 2.0 RPM repository 2017-08-15 13:27:41 +03:00
Avi Kivity
3261c927d2 Update seastar submodule
* seastar 2383d60...d67c344 (1):
  > Merge "Fix crash in rpc due to access to already destroyed server socket" from Gleb

Fixes #2690
2017-08-14 16:24:05 +03:00
Avi Kivity
4577a89982 Update seastar submodule
* seastar cfe280c...2383d60 (1):
  > tls: Only recurse once in shutdown code

Fixes #2691
2017-08-14 15:10:27 +03:00
Avi Kivity
6ea306f898 Update seastar submodule
* seastar b9f4568...cfe280c (1):
  > scripts: perftune.py: change the network module mode auto selection heuristic
2017-08-14 10:30:42 +03:00
Avi Kivity
2afcc684b4 Update seastar submodule
* seastar 867b7c7...b9f4568 (4):
  > http: removed unneeded lamda captures
  > Merge "Prometheus to use output stream" from Amnon
  > http_test: Fix an http output stream test
  > Merge "Add output stream to http message reply" from Amnon

Fixes #2475
2017-08-10 12:05:14 +03:00
Avi Kivity
bdb8c861c7 Fork seastar submodule for 2.0 2017-08-10 12:00:31 +03:00
Takuya ASADA
ea933b4306 dist/debian: append postfix '~DISTRIBUTION' to scylla package version
We are moving to aptly to release .deb packages, that requires debian repository
structure changes.
After the change, we will share 'pool' directory between distributions.
However, our .deb package name on specific release is exactly same between
distributions, so we have file name confliction.
To avoid the problem, we need to append distribution name on package version.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1502312935-22348-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 8e115d69a9)
2017-08-10 10:54:17 +03:00
Raphael S. Carvalho
19391cff14 sstables: close index file when sstable writer fails
The index's file output stream uses write-behind, but it's not closed
when an sstable write fails, and that may lead to a crash.
This happened before for the data file (for which it is obviously easier
to reproduce) and was fixed by 0977f4fdf8.

Fixes #2673.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170807171146.10243-1-raphaelsc@scylladb.com>
(cherry picked from commit dddbd34b52)
2017-08-08 09:53:36 +03:00
Glauber Costa
87d9a4f1f1 add active streaming reads metric
In commit f38e4ff3f, we have separated streaming reads from normal reads
for the purpose of determining the maximum number of reads going on.
However, we'll now be totally unaware of how many reads will be
happening on behalf of streaming and that can be important information
when debugging issues.

This patch adds this metric so we don't fly blind.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1501909973-32519-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 4a911879a3)
2017-08-05 11:07:18 +03:00
Pekka Enberg
c0f894ccef docker: Disable stall detector
Fixes #2162

Message-Id: <1501759957-4380-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 90872ffa1f)
2017-08-03 14:53:03 +03:00
Takuya ASADA
2c892488ef dist/debian: check scylla user/group existance before adding them
To prevent the install failing in an environment which already has a scylla
user/group, an existence check is needed.

Fixes #2389

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1495023805-14905-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 91ade1a660)
2017-08-03 13:01:30 +03:00
Avi Kivity
911608e9c4 database: prevent streaming reads from blocking normal reads
Streaming reads and normal reads share a semaphore, so if a bunch of
streaming reads use all available slots, no normal reads can proceed.

Fix by assigning streaming reads their own semaphore; they will compete
with normal reads once issued, and the I/O scheduler will determine the
winner.

Fixes #2663.
Message-Id: <20170802153107.939-1-avi@scylladb.com>

(cherry picked from commit f38e4ff3f9)
2017-08-03 12:27:48 +03:00
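The fix above can be sketched in Python (illustrative only; the real code uses seastar semaphores): giving streaming its own semaphore means exhausting the streaming slots leaves the normal-read slots untouched.

```python
import threading

# Separate slot pools, as in the fix: streaming can no longer starve
# normal reads by consuming a shared semaphore.
normal_reads = threading.Semaphore(2)     # slots for user-facing reads
streaming_reads = threading.Semaphore(2)  # separate slots for streaming

# Streaming grabs every one of its own slots...
assert streaming_reads.acquire(blocking=False)
assert streaming_reads.acquire(blocking=False)
assert not streaming_reads.acquire(blocking=False)  # streaming pool is full

# ...but a normal read still gets a slot, which would have been denied
# under the old single shared semaphore.
assert normal_reads.acquire(blocking=False)
```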
Avi Kivity
d8ab07de37 database: remove streaming read queue length limit
If we fail a streaming read due to queue overload, we will fail the entire repair.
Remove the limit for streaming, and trust the caller (repair) to have bounded
concurrency.

Fixes #2659.
Message-Id: <20170802143448.28311-1-avi@scylladb.com>

(cherry picked from commit 911536960a)
2017-08-03 12:27:46 +03:00
Duarte Nunes
15eefbc434 tests/sstable_mutation_test: Don't use moved-from object
Fix a bug introduced in dbbb9e93d and exposed by gcc6 by not using a
moved-from object. Twice.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170802161033.4213-1-duarte@scylladb.com>
(cherry picked from commit 4c9206ba2f)
2017-08-03 09:46:18 +03:00
Vlad Zolotarov
4bb6ba6d58 utils::loading_cache: cancel the timer after closing the gate
The timer is armed inside the section guarded by the _timer_reads_gate;
therefore it has to be canceled after the gate is closed.

Otherwise we may end up with the armed timer after stop() method has
returned a ready future.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501603059-32515-1-git-send-email-vladz@scylladb.com>
(cherry picked from commit 4b28ea216d)
2017-08-01 17:23:53 +01:00
Avi Kivity
6dd4a9a5a2 Merge "Ensure correct EOC for PI block cell names" from Duarte
"This series ensures we always write correct cell names to promoted
index cell blocks, taking into account the eoc of range tombstones.

Fixes #2333"

* 'pi-cell-name/v1' of github.com:duarten/scylla:
  tests/sstable_mutation_test: Test promoted index blocks are monotonic
  sstables: Consider eoc when flushing pi block
  sstables: Extract out converting bound_kind to eoc

(cherry picked from commit db7329b1cb)
2017-08-01 18:09:54 +03:00
Gleb Natapov
222e85d502 cql transport: run accept loop in the foreground
It was meant to be run in the foreground since it is waited upon during
stop(), but as it is now, from the stop() perspective it is completed
after the first connection is accepted.

Fixes #2652

Message-Id: <20170801125558.GS20001@scylladb.com>
(cherry picked from commit 1da4d5c5ee)
2017-08-01 17:06:08 +03:00
Takuya ASADA
8d4a30e852 dist/ami: follow scylla-tools package name change on RedHat variants
Since scylla-tools generates two .rpm packages, we need to copy them to our AMI.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170722090002.9850-1-syuu@scylladb.com>
(cherry picked from commit a998b7b3eb)
2017-07-31 18:57:29 +03:00
Avi Kivity
4710ee229d Merge "Reduce the effect of the latency metrics" from Amnon
"This series reduces that effect in two ways:
1. Remove the latency counters from the system keyspaces
2. Reduce the histogram size by limiting the maximum number of buckets and
   stop the last bucket."

Fixes #2650.

* 'amnon/remove_cf_latency_v2' of github.com:cloudius-systems/seastar-dev:
  database: remove latency from the system table
  estimated histogram: return a smaller histogram

(cherry picked from commit 3fe6731436)
2017-07-31 16:01:05 +03:00
Vlad Zolotarov
93cb78f21d utils::loading_cache: add stop() method
loading_cache invokes a timer that may issue asynchronous operations
(queries) that would end with writing into the internal fields.

We have to ensure that these operations are over before we can destroy
the loading_cache object.

Fixes #2624

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1501208345-3687-1-git-send-email-vladz@scylladb.com>
2017-07-31 15:55:46 +03:00
Avi Kivity
af8151c4b7 Update scylla-ami submodule
* dist/ami/files/scylla-ami 2bd1481...b41e5eb (1):
  > Fix incorrect scylla-server sysconfig file edit for i3 memflush controller
2017-07-31 09:41:56 +03:00
Takuya ASADA
846d9da9c2 dist/debian: refuse upgrade if current scylla < 1.7.3 && commitlog remains
Commitlog replay fails when upgrading from <1.7.3 to 2.0, so we need to
refuse to upgrade the package if the current scylla < 1.7.3 && a commitlog
remains.

Note: We have the problem on the scylla-server package, but to prevent the
scylla-conf package upgrade, %pretrans should be defined on scylla-conf.

Fixes #2551

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1501187555-4629-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 714540cd4c)
2017-07-31 09:18:58 +03:00
Paweł Dziepak
aaa59d3437 streamed_mutation: do not call fill_buffer() ahead of time
consume_mutation_fragments_until() allows consuming mutation fragments
until a specified condition happens. This patch reorganises its
implementation so that we avoid situations when fill_buffer() is called
with stop condition being true.
Message-Id: <20170727122218.7703-1-pdziepak@scylladb.com>

(cherry picked from commit f02bef7917)
2017-07-27 17:48:26 +02:00
Tomasz Grabiec
66dd817582 mutation_partition: Always mark static row as continuous when no static columns
To avoid unnecessary cache misses after static columns are added.

Message-Id: <1500650057-26036-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 136d205855)
2017-07-27 14:59:33 +02:00
Tomasz Grabiec
5857a4756d Merge "Some fixes for performance regressions in perf_fast_forward" from Paweł
These patches contain some minor fixes for performance regressions reported
by perf_fast_forward after the partial cache was merged. The solution is still
far from perfect, there is one case that still has 30% degradation, but
there is some improvement so there is no reason to hold these changes back.

Refs #2582.

Some numbers:
before - before cache changes were merged
(555621b537)

cache - at the commit that introduced the partial cache
(9b21a9bfb6)

after - recent master + this series
(based on e988121dbb)

Differences are shown relative to "before".

Testing effectiveness of caching of large partition, single-key slicing reads:
Large partitions, range [0, 500000], populating cache
  before      cache      after
 1636840    1013688    1234606
              -38%        -25%

Large partitions, range [0, 500000], reading from cache
  before      cache      after
 2012615    3076812    3035423
               +53%       +51%

Testing scanning small partitions with skips.
reading small partitions (skip 0)
 before      cache      after
 227060     165261     200639
              -27%       -11%

skipping small partitions (skip 1)
 before      cache      after
  29813      27312      38210
               -8%       +28%

Testing slicing small partitions:
slicing small partitions (offset 0, read 4096)
 before      cache      after
 195282     149695     180497
              -23%        -8%

* https://github.com/pdziepak/scylla.git perf_fast_forward-regression/v3:
  sstables: make sure that fill_buffer() actually fills buffer
  mutation_merger: improve handling of non-deferring fill_buffer()s
  partition_snapshot_row_cursor: avoid apply() in single-version cases
  sstables: introduce decorated_key_view
  ring_position_comparator: accept sstables::decorated_key_view
  sstable: keep a pre-computed token in summary_entry
  sstables: cache token in index entries
  index_reader: advance_and_check_if_present() use index_comparator
  ring_position_comparator: drop unused overloads
  cache_streamed_mutation: avoid moving clustering_row
  streamed_mutation: introduce consume_mutation_fragments_until()
  cache_streamed_mutation: use consumer based read_context reader
  rows_entry: make position() inlineable
  mutation_fragment: make destructor always_inline
  keys: introduce compound_wrapper::from_exploded_view()
  sstables: avoid copying key components
  compound_compat: explode: reserve some elements in a vector
  cache: short-circut static row logic if there are no static columns
  cache: use equality comparators instead of tri_compare
  sstables: avoid indirect calls to abstract_type::is_multi_cell()

(cherry picked from commit e9fc0b0491)
2017-07-27 13:58:23 +02:00
Takuya ASADA
f199047601 dist/redhat: limit metapackage dependencies to specific version of scylla packages
When we install the scylla metapackage with a version (ex: scylla-1.7.1),
it always installs the newest scylla-server/-jmx/-tools from the repo
instead of the specified version of the packages.

To install packages of the same version as the metapackage, limit the
dependencies to the current package version.

Fixes #2642

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170726193321.7399-1-syuu@scylladb.com>
(cherry picked from commit 91a75f141b)
2017-07-27 14:21:55 +03:00
Tomasz Grabiec
a8dcbb6bd0 row_cache: Fix potential timeout or deadlock due to sstable read concurrency limit
database::make_sstable_reader() creates a reader which will need to
obtain a semaphore permit when invoked. Therefore, each read may
create at most one such reader in order to be guaranteed to make
progress. If the reader tries to create another reader, that may
deadlock (or for non-system tables, timeout), if enough number of such
readers tries to do the same thing at the same time.

Avoid the problem by dropping previous reader before creating a new
one.

Refs #2644.

Message-Id: <1501152454-4866-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 22948238b6)
2017-07-27 13:58:40 +03:00
Duarte Nunes
bfd99d4e74 db/schema_tables: Drop dropped columns when dropping tables
Fixes #2633

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-2-duarte@scylladb.com>
(cherry picked from commit 50ad0003c6)
2017-07-26 18:48:59 +02:00
Duarte Nunes
d40df89271 db/schema_tables: Store column_name in text form
As does Cassandra.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170726150228.2593-1-duarte@scylladb.com>
(cherry picked from commit 3425403126)
2017-07-26 18:48:58 +02:00
Duarte Nunes
3da54ffff0 schema_builder: Replace type when re-dropping column
Fixes #2634

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725183933.5311-1-duarte@scylladb.com>
(cherry picked from commit e988121dbb)
2017-07-26 16:26:59 +02:00
Duarte Nunes
804793e291 tests/schema_change_test: Add test case for add+drop notification
Reproduces #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-2-duarte@scylladb.com>
(cherry picked from commit 472f32fb06)
2017-07-26 16:26:59 +02:00
Duarte Nunes
83ea9b6fc0 db/schema_tables: Consider differing dropped columns
If a node is notified of a schema change where the schema's dropped
columns have changed, that node will miss the changes to the dropped
columns. A scenario where this can happen is where a column c is
dropped, then added with a different type, and then dropped again, with
a node n having seen the first drop and being notified of the
subsequent add and drop.

Fixes #2616

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725170622.4380-1-duarte@scylladb.com>
(cherry picked from commit 33e18a1779)
2017-07-26 16:26:59 +02:00
Asias He
b45855fc1c gossip: Fix nr_live_nodes calculation
We need to consider the _live_endpoints size. The nr_live_nodes should
not be larger than the _live_endpoints size, otherwise the loop to
collect live nodes can run forever.

It is a regression introduced in commit 437899909d
(gossip: Talk to more live nodes in each gossip round).

Fixes #2637

Message-Id: <863ec3890647038ae1dfcffc73dde0163e29db20.1501026478.git.asias@scylladb.com>
(cherry picked from commit 515a744303)
2017-07-26 16:48:51 +03:00
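The invariant this fix restores — never try to collect more live nodes than _live_endpoints actually holds — can be sketched in a few lines (illustrative Python, not the Scylla code; the function and its shape are invented):

```python
import random

def pick_live_nodes(requested, live_endpoints):
    """Pick up to `requested` distinct live nodes to gossip with.

    The min() clamp is the point of the fix: without it, asking for
    more nodes than live_endpoints contains makes the collection loop
    below spin forever, since it can never gather enough distinct nodes.
    """
    target = min(requested, len(live_endpoints))
    chosen = set()
    while len(chosen) < target:
        chosen.add(random.choice(live_endpoints))
    return chosen
```

With the clamp removed, `pick_live_nodes(5, ["a", "b"])` would never terminate; with it, the call simply returns both live endpoints.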
Duarte Nunes
3900babff2 schema: Remove unnecessary print
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170725174000.71061-1-duarte@scylladb.com>
(cherry picked from commit 9c831b4e97)
2017-07-26 16:07:41 +03:00
Tomasz Grabiec
7c805187a9 Merge fixes related to row cache from Raphael
* git@github.com:raphaelsc/scylla.git row_cache_fixes:
  db: atomically synchronize cache with changes to the snapshot
  db: refresh row cache's underlying data source after compaction

(cherry picked from commit 18be42f71a)
2017-07-25 15:37:40 +02:00
Paweł Dziepak
345a91d55d tests/row_cache: test queries with no clustering ranges
Reproducer for #2604.
Message-Id: <20170725131220.17467-3-pdziepak@scylladb.com>

(cherry picked from commit 79a1ad7a37)
2017-07-25 15:37:32 +02:00
Paweł Dziepak
fda8b35cda tests: do not overload the meaning of empty clustering range
Empty clustering key range is perfectly valid and signifies that the
reader is not interested in anything but the static row. Let's not
make it mean anything else.
Message-Id: <20170725131220.17467-2-pdziepak@scylladb.com>

(cherry picked from commit 1ea507d6ae)
2017-07-25 15:37:29 +02:00
Paweł Dziepak
08ac0f1100 cache: fix aborts if no clustering range is specified
cache_streamed_mutation assumed that at least one clustering range was
specified. That was wrong since the readers are allowed to query just
for a static row (e.g. counter update that modifies only static
columns).

Fixes #2604.
Message-Id: <20170725131220.17467-1-pdziepak@scylladb.com>

(cherry picked from commit 6572f38450)
2017-07-25 15:37:28 +02:00
Calle Wilund
db455305a2 system_keyspace: Make sure "system" is written to keyspaces (visible)
Fixes #2514

Bug in schema version 3 update: We failed to write "system" to the
schema tables. Only visible on an empty instance of course.

Message-Id: <1500469809-23546-2-git-send-email-calle@scylladb.com>
(cherry picked from commit 7a583585a2)
2017-07-24 11:33:25 +02:00
Avi Kivity
e1a3052e76 tests: fix sstable_datafile_test build with boost 1.55
Boost 1.55 accidentally removed support for "range for" on
recursive_directory_iterator (previous and latter versions do
support it). Use old-style iteration instead.

Message-Id: <20170724080128.8824-1-avi@scylladb.com>
(cherry picked from commit c21bb5ae05)
2017-07-24 11:20:53 +03:00
Tomasz Grabiec
50fa3f3b89 schema_registry: Keep unused entries around for 1 second
This is in order to avoid frequent misses which have a relatively high
cost. A miss means we need to fetch the schema definition from another
node and, in the case of writes, do a schema merge.

If the schema is kept alive only by the incoming request, then it
will be forgotten immediately when the request is done, and the next
request using the same schema version will miss again.

Refs #2608.
Message-Id: <1500632447-10104-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 29a82f5554)
2017-07-24 10:12:09 +02:00
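The behaviour described — an entry stays alive for a short grace period after its last use, so bursts of requests for the same schema version pay the miss cost only once — can be sketched like this (illustrative Python, not Scylla's loading_cache; all names are invented):

```python
import time

class GraceCache:
    """Cache that keeps entries for `grace` seconds after their last use,
    so back-to-back requests for the same key do not each pay the miss cost."""

    def __init__(self, load, grace=1.0, clock=time.monotonic):
        self._load = load       # expensive miss handler (e.g. remote fetch)
        self._grace = grace
        self._clock = clock
        self._entries = {}      # key -> (value, last_used)

    def get(self, key):
        self._evict_expired()
        if key in self._entries:
            value, _ = self._entries[key]       # hit within the grace period
        else:
            value = self._load(key)             # the costly path to avoid
        self._entries[key] = (value, self._clock())
        return value

    def _evict_expired(self):
        now = self._clock()
        for k in [k for k, (_, t) in self._entries.items()
                  if now - t > self._grace]:
            del self._entries[k]
```

An entry kept alive only by an in-flight request would normally vanish as soon as the request completes; the grace period lets the next request for the same version hit instead of missing again.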
Tomasz Grabiec
8474b7a725 legacy_schema_migrator: Don't snapshot empty legacy tables
Otherwise we will create a new (empty) snapshot each time we boot.
Message-Id: <1500573920-31478-2-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit ecc85988dd)
2017-07-24 09:56:22 +02:00
Tomasz Grabiec
0fc874e129 database: Allow disabling auto snapshots during drop/truncate
Message-Id: <1500573920-31478-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 408cea66cd)
2017-07-24 09:56:19 +02:00
Duarte Nunes
5cf1a19f3f Merge 'Fix possible inconsistency of table schema version' from Tomasz
"Fixes issues uncovered in longevity test (#2608).

Main problem is that, due to time drift, the scylla_tables.version column
may not get deleted on all nodes doing the schema merge, which will
make some nodes come up with a different table schema version than others.

The inconsistency will not heal because scylla_tables doesn't
take part in the schema sync. This is fixed by the last patch.

This will cause nodes to constantly try to sync the schema, which under
some conditions triggers #2617."

* tag 'tgrabiec/fix-table-schema-version-inconsistency-v1' of github.com:scylladb/seastar-dev:
  schema_tables: Add scylla_tables to ALL
  schema: Make schema_mutations equality consistent with digest
  schema_tables: Extract compact_for_schema_digest()
  schema_tables: Always drop scylla_tables::version

(cherry picked from commit 937fe80a1a)
2017-07-24 09:54:45 +02:00
Tomasz Grabiec
f48466824f schema_registry: Ensure schema_ptr is always synced on the other core
global_schema_ptr ensures that schema object is replicated to other
cores on access. It was replicating the "synced" state as well, but
only when the shard didn't know about the schema. It could happen that
the other shard has the entry, but it's not yet synced, in which case
we would fail to replicate the "synced" state. This will result in
exception from mutate(), which rejects attempts to mutate using an
unsynced schema.

The fix is to always replicate the "synced" state. If the entry is
syncing, we will preemptively mark it as synced earlier. The syncing
code is already prepared for this.

Refs #2617.
Message-Id: <1500555224-15825-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 65c64614aa)
2017-07-24 09:52:31 +02:00
Avi Kivity
914f6f019f Update ami submodule
* dist/ami/files/scylla-ami 5dfe42f...2bd1481 (1):
  > Enable support for experimental CPU controller in i3 instances
2017-07-24 10:27:35 +03:00
Shlomi Livne
f5bb363f96 release: prepare for 2.0.rc1
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
2017-07-23 09:47:11 +03:00
Duarte Nunes
61ba56f628 schema: Support compaction enabled attribute
Fixes #2547

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170721132206.3037-1-duarte@scylladb.com>
(cherry picked from commit 7eecda3a61)
2017-07-21 15:39:48 +02:00
Tomasz Grabiec
f4d3e5cdcf Merge "Drop mutations that raced with truncate" from Duarte
Instead of retrying, just drop mutations that raced with a truncate.

* git@github.com:duarten/scylla.git truncate-reorder/v1:
  database: Rename replay_position_reordered_exception
  database: Drop mutations that raced with truncate

(cherry picked from commit 63caa58b70)
2017-07-21 15:39:20 +02:00
Avi Kivity
0291a4491e Merge "restrict background writers with scheduling groups" from Glauber
"This patchset restricts background writers - such as compactions,
streaming flushes and memtable flushes to a maximum amount of CPU usage
through a seastar::thread_scheduling_group.

The said maximum is recommended to be set  50 % - it is default
disabled, but can be adjusted through a configuration option until we
are able to auto-tune this.

The second patch in this series provides a preview on how such auto-tune
would look like. By implementing a simple controller we automatically
adjust the quota for the memtable writer processes, so that the rate at
which bytes come in is equal to the rates at which bytes are flushed.

Tail latencies are greatly reduced by this series, and heavy spikes that
previously appeared on CPU-bound workloads are no more."

* 'memtable-controller-v5' of https://github.com/glommer/scylla:
  simple controller for memtable/streaming writer shares.
  restrict background writers to 50 % of CPU.

(cherry picked from commit c5ee62a6a4)
2017-07-20 15:13:39 +03:00
Duarte Nunes
83cc640c6a Merge 'Revert back to 1.7 schema layout in memory' from Tomasz
"Fixes schema layout incompatibility in a mixed 1.7 and 2.0 cluster (#2555)
by reverting back to using the old layout in memory and thus also
in across-node requests. We still use the new v3 layout in schema
tables (needed by drivers and external tools). Translations happen
when converting to/from schema mutations."

* tag 'tgrabiec/use-v2-schema-layout-in-memory-v2' of github.com:scylladb/seastar-dev:
  schema: Revert back to the 1.7 layout of static compact tables in memory
  schema: Use v3 column layout when converting to/from schema mutations
  schema: Encapsulate column layout translations in the v3_columns class

(cherry picked from commit 1daf1bc4bb)
2017-07-19 19:49:43 +03:00
Duarte Nunes
2f06c54033 thrift/handler: Remove leftover debug artifacts
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705161156.2307-1-duarte@scylladb.com>
(cherry picked from commit d583ef6860)
2017-07-19 19:49:35 +03:00
Calle Wilund
9abe7651f7 system_schema: Fix remaining places not handling two system keyspaces
Some places remained where code looked directly at
system_keyspace::NAME to determine if a ks is
considered special/system/protected, including
schema digest calculation.

Export "is_system_keyspace" and use accordingly.

Message-Id: <1500469809-23546-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 247c36e048)
2017-07-19 19:48:30 +03:00
Amos Kong
784aea12e7 scylla_raid_setup: fix syntax error
/usr/lib/scylla/scylla_raid_setup: line 132: syntax error
near unexpected token `fi'

Fixes #2610

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <af3a5bc77c5ba2b49a8f48a5aaa19afffb787886.1500430021.git.amos@scylladb.com>
(cherry picked from commit 2bdcad5bc3)
2017-07-19 11:10:43 +03:00
Avi Kivity
3a98959eba dist: tolerate sysctl failures
sysctl may fail in a container environment if /proc is not virtualized
properly.

Fixes #1990
Message-Id: <20170625145930.31619-1-avi@scylladb.com>

(cherry picked from commit 08488a75e0)
2017-07-18 15:45:41 +03:00
Duarte Nunes
2c7d597307 wrapping_range: Fix lvalue transform()
Instead of copying and moving the bound, pass it by reference so the
transformer can decide whether it wants to copy or not. The only
caller so far doesn't want a copy and takes the value by reference,
which would be capturing a temporary value. Caught by the
view_schema_test with gcc7.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170705210255.29669-1-duarte@scylladb.com>
(cherry picked from commit 3dd0397700)
2017-07-18 14:35:58 +03:00
Duarte Nunes
8d46c4e049 thrift: Fail when mixed CFs are detected
Fixes #2588

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170717222612.7429-1-duarte@scylladb.com>
(cherry picked from commit d9fa3bf322)
2017-07-18 10:21:45 +03:00
Asias He
b1c080984f gossip: Implement the missing fd_max_interval_ms and fd_initial_value_ms option
It is useful for larger clusters with larger gossip message latency. By
default the fd_max_interval_ms is 2 seconds, which means the
failure_detector will ignore any gossip message update interval larger
than 2 seconds. However, in larger clusters, the gossip message update
interval can be larger than 2 seconds.

Fixes #2603.

Message-Id: <49b387955fbf439e49f22e109723d3a19d11a1b9.1500278434.git.asias@scylladb.com>
(cherry picked from commit adc5f0bd21)
2017-07-17 13:29:30 +03:00
Duarte Nunes
e1706c36b7 Merge 'Fixes around migration to v3 schema tables' from Tomasz
branch 'tgrabiec/schema-migration-fixes' of github.com:scylladb/seastar-dev:
  schema: Use proper name comparator
  legacy_schema_migrator: Properly migrate non-UTF8 named columns
  schema_tables: Store column_name in text form
  legacy_schema_migrator: Migrate columns like Cassandra
  schema_builder: Add factory method for default_names
  legacy_schema_migrator: Simplify logic
  thrift: Don't set regular_column_name_type
  schema: Use proper column name type for static columns
  schema: Fix column_name_type() for static compact tables
  schema: Introduce clustering_column_at()
  thrift: Reuse cell_comparator::to_sstring() for obtaining comparator type
  partition_slice_builder: Use proper column's type instead of regular_column_name_type()

(cherry picked from commit 13caccf1cf)
2017-07-17 12:42:19 +03:00
Avi Kivity
63c8306733 Update seastar submodule
* seastar b812cee...867b7c7 (1):
  > rpc: start server's send loop only after protocol negotiation

Fixes #2600.

Still tracking upstream.
2017-07-17 10:41:59 +03:00
Avi Kivity
a7dfdc0155 tests: move tmpdir to /tmp
Reduces view_schema_test runtime to 5 seconds, from 53 seconds on an NVMe disk
with write-back cache, and forever on a spinning disk.
Message-Id: <20170716081653.10018-1-avi@scylladb.com>

(cherry picked from commit d9c64ef737)
2017-07-17 08:47:17 +03:00
Avi Kivity
70be29173a tests: copy the sstable with an unknown component to the data directory
We will be creating links to those sstable's files, and those don't work
if the data directory and the test sstable are on different devices.

Copying the files to the same directory fixes the problem.
Message-Id: <20170716090405.14307-1-avi@scylladb.com>

(cherry picked from commit 9116dd91cb)
2017-07-17 08:47:08 +03:00
Avi Kivity
e09d4a9b75 Update seastar submodule
* seastar 844bcfb...b812cee (1):
  > Update dpdk submodule

Fixes #2595 (again).

Still tracking master.
2017-07-16 17:01:48 +03:00
Avi Kivity
67f25e56a6 Update seastar submodule
* seastar ff34c42...844bcfb (1):
  > Update dpdk submodule

Still tracking master.

Fixes #2595.
2017-07-15 19:18:10 +03:00
Tomasz Grabiec
74c4651b95 Merge "Fixes for memtable flushing and replay positions" from Duarte
We don't ensure mutations are applied in memory following the order of their
replay positions. A memtable can thus be flushed with replay position rp,
with the new one being at replay position rp', where rp' < rp. This breaks
an intrinsic assumption in the code, which this series addresses.

Fixes #2074

branch memtable-flush/v3 of git@github.com:duarten/scylla.git:
  commitlog: Always flush latest memtable
  column_family: More precise count of switched memtables
  column_family: Fix typo in pending_tasks metric name
  column_family: More precise count of pending flushes
  dirty_memory_manager: Remove unnecessary check from flush_one()
  column_family: Don't rely on flush_queue to guarantee flushes finished
  column_family: Don't bother closing the flush_queue on stop()
  column_family: Stop using flush_queue
  column_family: Remove outdated comment about the flush_queue
  memtable: Stop tracking the highest flushed rp

(cherry picked from commit caa62f7f05)
2017-07-14 19:07:33 +02:00
Duarte Nunes
58bfb86d73 storage_proxy: Preserve replica order across mutations
In storage_proxy we arrange the mutations sent by the replicas in a
vector of vectors, such that each row corresponds to a partition key
and each column contains the mutation, possibly empty, as sent by a
particular replica.

There is reconciliation-related code that assumes that all the
mutations sent by a particular replica can be found in a single
column, but that isn't guaranteed by the way we initially arrange the
mutations.

This patch fixes this and enforces the expected order.

Fixes #2531
Fixes #2593

Signed-off-by: Gleb Natapov <gleb@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170713162014.15343-1-duarte@scylladb.com>
(cherry picked from commit b8235f2e88)
2017-07-14 12:11:50 +03:00
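The arrangement this patch enforces — one row per partition key, one column per replica, so that all of a replica's mutations stay in a single column — can be sketched as follows (illustrative Python; the data shapes and names are invented, not storage_proxy's types):

```python
def arrange_by_replica(replica_ids, responses):
    """responses: {replica_id: {partition_key: mutation}}.

    Returns one row per partition key; column j always holds the mutation
    (or None) from replica_ids[j], so reconciliation code may assume each
    replica's mutations live in exactly one column.
    """
    keys = sorted({k for r in responses.values() for k in r})
    return [[responses.get(rid, {}).get(k) for rid in replica_ids]
            for k in keys]
```

Fixing the column order per replica is what lets the reconciliation code scan a single column and see everything one replica sent.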
Tomasz Grabiec
cb94c66823 legacy_schema_migrator: Fix calculation of is_dense
The current algorithm marked tables with regular columns not named
"value" as not dense, which doesn't have to be the case. It can be
either way.

It should be enough to look at clustering components. If there is a
clustering key, then table is dense if and only if all comparator
components belong to the clustering key.

If there is no clustering key, then if there are any regular columns
we're sure it's not dense.

Fixes #2587.

Message-Id: <1499877777-7083-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 30ec4af949)
2017-07-13 17:28:25 +03:00
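The rule stated above can be written down directly (an illustrative Python sketch of the decision, not the migrator code; a None result stands for "not decided by this rule alone"):

```python
def is_dense(comparator_components, clustering_key, regular_columns):
    """Decide the legacy 'dense' property per the rule in the message.

    All arguments are collections of column names.
    """
    if clustering_key:
        # Dense if and only if every comparator component belongs
        # to the clustering key.
        return set(comparator_components) <= set(clustering_key)
    if regular_columns:
        # No clustering key, but regular columns exist: surely not dense.
        return False
    return None  # this rule alone does not decide
```

Note that the broken heuristic — inspecting whether regular columns are named "value" — appears nowhere in the decision.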
Tomasz Grabiec
5aa3e23fcd gdb: Fix "scylla columnfamilies" command
Broken in 0e4d5bc2f3.

Message-Id: <1499951956-26206-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 54953c8d27)
2017-07-13 16:33:50 +03:00
Takuya ASADA
aac1d5d54d dist/common/systemd: move scylla-server.service to be after network-online.target instead of network.target
To make sure start Scylla after network is up, we need to move from
network.target to network-online.target.

Fixes #2337

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493661832-9545-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 0c81974bc4)
2017-07-12 13:36:52 +03:00
Glauber Costa
a371b8a5bf change task quota's default
The default of 2ms is somewhat arbitrary. Now that we have a lot more
mileage deploying Scylla applications in production, it sounds not
only arbitrary but also high.

In particular, it is really hard to achieve 1ms latencies in the face of
CPU-heavy workloads with it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1499354495-27173-1-git-send-email-glauber@scylladb.com>
(cherry picked from commit 780a6e4d2e)
2017-07-12 10:21:35 +03:00
Avi Kivity
a69fb8a8ed Update seastar submodule
* seastar 89cc97c...ff34c42 (6):
  > tls: Wrap all IO in semaphore (Fixes #2575)
  > tests/lowres_clock_test.cc: Declare helper static
  > tests/lowres_clock_test.cc: fix compilation error for older GCC
  > configure.py: verifies boost version
  > pkg-config: Eliminate spaces in include path arguments
  > allow applications to override task-quota-ms

Still tracking seastar master.
2017-07-12 10:20:49 +03:00
Avi Kivity
00b9640b2c Merge "Preserve table schema digest on schema tables migration" from Tomasz
"Currently new nodes calculate digests based on v3 schema mutations,
which are very different from v2 mutations. As a result they will
use schemas with a different table_schema_version than the old nodes.
The old nodes will not recognize the version and will try to request
its definition. That will fail, because old nodes don't understand
v3 schema mutations.

To fix this problem, let's preserve the digests during migration,
so that they're the same on new and old nodes. This will allow
requests to proceed as usual.

This does not solve the problem of schema being changed during
the rolling upgrade. This is not allowed, as it would bring the
same problem back.

Fixes #2549."

* tag 'tgrabiec/use-consistent-schema-table-digests-v2' of github.com:cloudius-systems/seastar-dev:
  tests: Add test for concurrent column addition
  legacy_schema_migrator: Set digest to one compatible with the old nodes
  schema_tables: Persist table_schema_version
  schema_tables: Introduce system_schema.scylla_tables
  schema_tables: Simplify read_table_mutations()
  schema_tables: Resurrect v2 read_table_mutations()
  system_keyspace: Forward-declare legacy schemas
  legacy_schema_migrator: Take storage_proxy as dependency

(cherry picked from commit a397889c81)
2017-07-11 17:23:21 +03:00
Gleb Natapov
59d608f77f consistency_level: report less live endpoints in Unavailable exception if there are pending nodes
DowngradingConsistencyRetryPolicy uses the live replica count from the
Unavailable exception to adjust CL for a retry, but when there are pending
nodes CL is increased internally by the coordinator, and that may prevent
the retried query from succeeding. Adjust the live replica count when
pending nodes are present so that the retried query will be able to proceed.

Fixes #2535

Message-Id: <20170710085238.GY2324@scylladb.com>
(cherry picked from commit 739dd878e3)
2017-07-11 17:16:46 +03:00
Botond Dénes
1717922219 Fix crash in the out-of order restrictions error msg composition
Use the name of the existing preceding column with a restriction
(last_column) instead of assuming that the column right after the
current column already has restrictions.
This will yield an error message that is different from that of
Cassandra, albeit still a correct one.

Fixes #2421

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <40335768a2c8bd6c911b881c27e9ea55745c442e.1499781685.git.bdenes@scylladb.com>
(cherry picked from commit 33bc62a9cf)
2017-07-11 17:15:45 +03:00
Paweł Dziepak
7cd4bb0c4a transport: send correct type id for counter columns
CQL reply may contain metadata that describes columns present in the
response including the information about their type.

However, Scylla incorrectly reports counter types as bigint. The
serialised format of counters and bigint is exactly the same, which
could explain why the problem hasn't been noticed earlier but it is a
bug nevertheless.

Fixes #2569.
Message-Id: <20170711130520.27603-1-pdziepak@scylladb.com>

(cherry picked from commit 5aa523aaf9)
2017-07-11 16:37:24 +03:00
Tomasz Grabiec
588ae935e7 legacy_schema_migrator: Use separate joinpoint instance for each table
Otherwise we may deadlock, as explained in commit 5e8f0efc8:

Table drop starts with creating a snapshot on all shards. All shards
must use the same snapshot timestamp which, among other things, is
part of the snapshot name. The timestamp is generated using supplied
timestamp generating function (joinpoint object). The joinpoint object
will wait for all shards to arrive and then generate and return the
timestamp.

However, we drop tables in parallel, using the same joinpoint
instance. So joinpoint may be contacted by snapshotting shards of
tables A and B concurrently, generating timestamp t1 for some shards
of table A and some shards of table B. Later the remaining shards of
table A will get a different timestamp. As a result, different shards
may use different snapshot names for the same table. The snapshot
creation will never complete because the sealing fiber waits for all
shards to signal it, on the same name.
Message-Id: <1499762663-21967-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 310d2a54d2)
2017-07-11 12:31:21 +03:00
Avi Kivity
c292e86b3c sstables: fix use-after-free in read_simple()
`r` is moved-from, and later captured in a different lambda. The compiler may
choose to perform the move before the other capture, resulting in a use-after-free.

Fix by copying `r` instead of moving it.

Discovered by sstable_test in debug mode.
Message-Id: <20170702082546.20570-1-avi@scylladb.com>

(cherry picked from commit 07b8adce0e)
2017-07-10 15:32:57 +03:00
Asias He
3dc0d734b0 repair: Do not store the failed ranges
The number of failed ranges can be large, so they can consume a lot of
memory. We already log the failed ranges, so there is no need to store
them in memory.

Message-Id: <7a70c4732667c5c3a69211785e8efff0c222fc28.1498809367.git.asias@scylladb.com>
(cherry picked from commit b2a2fbcf73)
2017-07-10 14:37:47 +03:00
Takuya ASADA
2d612022ba dist/common/scripts/scylla_cpuscaling_setup: skip configuration when cpufreq driver isn't loaded
Configuring the cpufreq service on VMs/IaaS causes an error because they
don't support cpufreq. To prevent the error, skip the whole configuration
when the driver is not loaded.

Fixes #2051

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1498809504-27029-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 1c35549932)
2017-07-10 14:08:54 +03:00
Nadav Har'El
5f6100c0aa repair: further limit parallelism of checksum calculation
Repair today has a semaphore limiting the number of ongoing checksum
comparisons running in parallel (on one shard) to 100. We needed this
number to be fairly high, because a "checksum comparison" can involve
high latency operations - namely, sending an RPC request to another node
in a remote DC and waiting for it to calculate a checksum there, and while
waiting for a response we need to proceed calculating checksums in parallel.

But as a consequence, in the current code, we can end up with as many as
100 fibers all at the same stage of reading partitions to checksum from
sstables. This requires tons of memory, to hold at least 128K of buffer
(even more with read-ahead) for each of these fibers, plus partition data
for each. But doing 100 reads in parallel is pointless - one (or very few)
should be enough.

So this patch adds another semaphore to limit the number of checksum
*calculations* (including the read and checksum calculation) on each shard
to just 2. There may still be 100 ongoing checksum *comparisons*, in
other stages of the comparison (sending the checksum requests to other
nodes and waiting for them to return), but only 2 will ever be in the stage of
reading from disk and checksumming them.

The limit of 2 checksum calculations (per shard) applies on the repair
slave, not just to the master: The slave may receive many checksum
requests in parallel, but will only actually work on 2 at a time.

Because the parallelism=100 now rate-limits operations which use very little
memory, in the future we can safely increase it even more, to support
situations where the disk is very fast but the link between nodes has
very high latency.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170703151329.25716-1-nyh@scylladb.com>
(cherry picked from commit d177ec05cb)
2017-07-10 14:08:28 +03:00
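The two-level limiting described above — up to 100 comparisons in flight, but only 2 of them doing the expensive read-and-checksum step at a time — can be sketched with two counting semaphores (illustrative Python with threads; only the limits 100 and 2 come from the message, everything else is invented):

```python
import threading

COMPARISONS = threading.Semaphore(100)   # in-flight checksum comparisons
CALCULATIONS = threading.Semaphore(2)    # concurrent read + checksum work

def compare_range(rng, local_checksum, remote_checksum):
    """Compare a range's local checksum against a remote one."""
    with COMPARISONS:
        # Holding CALCULATIONS is expensive (disk read + hashing),
        # so it is released before the long remote wait below.
        with CALCULATIONS:
            local = local_checksum(rng)
        remote = remote_checksum(rng)    # high-latency RPC, outside the limit
        return local == remote
```

The outer semaphore keeps overall pipelining high so RPC latency is hidden, while the inner one caps the memory- and disk-hungry read phase per shard.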
Avi Kivity
d475a44b01 Merge "Silence schema pull errors during upgrade from 1.7 to 2.0" from Tomasz
"Old and new nodes will advertise different schema version because
of different format of schema tables. This will result in attempts
to sync the schema by each of the node.

Currently this will result in scary error messages in logs about
sync failing due to not being able to find schema of given version.
It's benign, but may scare users. In the future, incompatibilities
could result in more subtle errors. Better to inhibit it completely."

* 'tgrabiec/fix-schema-pull-errors-during-upgrade' of github.com:cloudius-systems/seastar-dev:
  migration_manager: Give empty response to schema pulls from incompatible nodes
  migration_manager: Don't pull schema from incompatible nodes
  service: Advertise schema tables format version through gossip

(cherry picked from commit 91221e020b)
2017-07-10 14:04:41 +03:00
Pekka Enberg
e02d4935ee idl: Fix frozen_schema version numbers
The IDL changes will appear in 2.0 so fix up the version numbers.

Message-Id: <1499680669-6757-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 8112d7c5c0)
2017-07-10 14:02:37 +03:00
Botond Dénes
25f8d365b5 Add text(sstring) version of count, max and min functions
Fixes #2459

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <b6abb97f21c0caea8e36c7590b92a12d148195db.1499666251.git.bdenes@scylladb.com>
(cherry picked from commit 66cbc45321)
2017-07-10 12:48:29 +03:00
Tomasz Grabiec
de7cb7bfa4 tests: commitlog: Check there are no segments left on disk after clean shutdown
Reproduces #2550.

Message-Id: <1499358825-17855-2-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 72e01b7fe8)
2017-07-09 19:25:44 +03:00
Tomasz Grabiec
b8eb4ed9cd commitlog: Discard active but unused segments on shutdown
So that they are not left on disk even though we did a clean shutdown.

The first part of the fix is to ensure that closed segments are
recognized as not allocating (the _closed flag). Not doing this
prevents them from being collected by discard_unused_segments(). The
second part is to actually call discard_unused_segments() on shutdown,
after all segments were shut down, so that those whose positions are
cleared can be removed.

Fixes #2550.

Message-Id: <1499358825-17855-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 6555a2f50b)
2017-07-09 19:25:42 +03:00
Tomasz Grabiec
fcc05e8ae9 legacy_schema_migrator: Drop tables instead of truncate()+remove()
It achieves a similar effect, but is safer than the non-standard
remove() path. The latter was missing unregistration from the
compaction manager.

Fixes #2554.

Message-Id: <1499447165-30253-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d33d29ad95)
2017-07-09 18:36:56 +03:00
Botond Dénes
05e2ac80af cql3: Add K_FROZEN and K_TUPLE to basic_unreserved_keyword
To allow the non-reserved keywords "frozen" and "tuple" to be used as
column names without double-quotes.

Fixes #2507

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <9ae17390662aca90c14ae695c9b4a39531c6cde6.1499329781.git.bdenes@scylladb.com>
(cherry picked from commit c4277d6774)
2017-07-06 18:20:22 +03:00
Avi Kivity
8fa1add26d Update seastar submodule
* seastar 0ab7ae5...89cc97c (4):
  > future-utils: fix do_for_each exception reporting
  > core/thread: Fix unwind information for seastar threads
  > build: export full cflags in pkgconfig file
  > configure: Avoid putting tmp file on /tmp

Still tracking seastar master.
2017-07-06 17:31:06 +03:00
Takuya ASADA
c0a2ca96dd dist/common/scripts/scylla_raid_setup: prevent renaming MDRAID device after reboot
On Debian variants, mdadm.conf should be placed in /etc/mdadm instead of /etc.
Also, it seems we need update-initramfs to fix the renaming issue.

Fixes #2502

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1499179912-14125-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 71624d7919)
2017-07-04 18:07:33 +03:00
Avi Kivity
c9ed522fa8 Merge "Adjust row cache metrics for row granularity" from Tomasz
* tag 'tgrabiec/row-cache-metrics-v2' of github.com:cloudius-systems/seastar-dev:
  row_cache: Switch _stats.hits/misses to row granularity
  row_cache: Rename num_entries() to partitions() for clarity
  row_cache: Track mispopulations also at row level
  row_cache: Track row insertions
  row_cache: Track row hits and misses
  row_cache: Make mispopulation counter also apply for continuity information
  row_cache: Add partition_ prefix to current counters
  misc_services: Switch to using reads_with[_no]_misses counters
  row_cache: Add metrics for operations on underlying reader
  row_cache: Add reader-related metrics
  row_cache: Remove dead code

(cherry picked from commit b1a0e37fcb)
2017-07-04 15:21:00 +03:00
Tomasz Grabiec
9078433a7f row_cache: Restore update of concurrent_misses_same_key
It was lost in action in 6f6575f456.

Message-Id: <1499168837-5072-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e720b317c9)
2017-07-04 14:51:19 +03:00
Avi Kivity
7893a3aad2 Merge "Use selective_token_range_sharder in repair" from Asias
"This series introduces selective_token_range_sharder and uses it in repair to
generate dht::token_range objects that belong to a specific shard."

* tag 'asias/repair-selective_token_range_sharder-v3' of github.com:cloudius-systems/seastar-dev:
  repair: Use selective_token_range_sharder
  tests: Add test_selective_token_range_sharder
  dht: Add selective_token_range_sharder

(cherry picked from commit 66e56511d6)
2017-07-04 14:18:08 +03:00
Nadav Har'El
e467eef58d Fix test to use non-wrapping range
The test put a wrapping range into a non-wrapping range variable.
This was harmless at the time this test was written, but newer code
may not be as forgiving, so it's better to use a non-wrapping range as
intended.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170704103128.29689-1-nyh@scylladb.com>
(cherry picked from commit d95f908586)
2017-07-04 14:18:01 +03:00
Tomasz Grabiec
19a07143eb row_cache: Drop not very useful prefixes from metric names
This drops the "total_operations_" and "objects_" prefixes. There is no
convention of adding them in other parts of the system, and they don't
add much value.

Fixes scylladb/scylla-grafana-monitoring#169.

Message-Id: <1499160342-25865-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 1d6fec0755)
2017-07-04 13:37:24 +03:00
Raphael S. Carvalho
a619b978c4 database: fix potential use-after-free in sstable cleanup
When do_for_each is in its last iteration and with_semaphore defers
because there's an ongoing cleanup, the sstable object will be used
after it is freed, because it was captured by reference and the
container it lives in was destroyed prematurely.

Let's fix it with a do_with, also making code nicer.

Fixes #2537.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170630035324.19881-1-raphaelsc@scylladb.com>
(cherry picked from commit b9d0645199)
2017-07-03 12:49:13 +03:00
Gleb Natapov
2c66b40a69 main: wait for wait_for_gossip_to_settle() to complete during boot
Boot should not continue until a future returned by
wait_for_gossip_to_settle() is resolved. Commit 991ec4a16 mistakenly
broke that, so restore it. Also fix the supervisor::notify() calls to
be in the right places.

Message-Id: <20170702082355.GQ14563@scylladb.com>
(cherry picked from commit d23111312f)
2017-07-02 11:33:04 +03:00
Tomasz Grabiec
079844a51d row_cache: Fix compilation errors with gcc 5
Message-Id: <1498741526-27055-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 97005825bf)
2017-06-29 16:35:02 +03:00
Avi Kivity
ea59e1fbd6 Update ami submodule
* dist/ami/files/scylla-ami f10db69...5dfe42f (1):
  > don't fetch perf from amazon repo

(cherry picked from commit 1317c4a03e)
2017-06-29 09:39:29 +03:00
Tomasz Grabiec
089b58ddfe row_cache: Use continuity information to decide whether to populate
If the cache is missing a given key but the range is marked as
continuous, it means the sstables don't have that entry and we can
insert it without asking the (bloom-filter based) presence checker. The
latter is more expensive and gives false positives. So this improves
update performance and hit ratio.

Another positive effect is that we don't have to clear continuity now.

Fixes #1999.

Message-Id: <1498643043-21117-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 786e75dbf7)
2017-06-28 13:33:34 +03:00
Tomasz Grabiec
d76e9e4026 lsa: Fix performance regression in eviction and compact_on_idle
Region comparator, used by the two, calls region_impl::min_occupancy(),
which calls log_histogram::largest(). The latter is O(N) in terms of
the number of segments, and is supposed to be used only in tests.
We should call one_of_largest() instead, which is O(1).

This caused compact_on_idle() to take more CPU as the number of
segments grew (even when there was nothing to compact). Eviction
would see the same kind of slow down as well.

Introduced in 11b5076b3c.

Message-Id: <1498641973-20054-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3489c68a68)
2017-06-28 12:33:11 +03:00
Glauber Costa
7709b885c4 disable defragment-memory-on-idle-by-default
It's been linked with various performance issues, either by causing
them or making them worse. One example is #1634; also, I recently
investigated continuous performance degradation that was likewise
linked to defrag-on-idle activity.

Until we can figure out how to reduce its impact, we should disable it.

Signed-off-by: Glauber Costa <glauber@glauber.scylladb>
Message-Id: <20170627201109.10775-1-glauber@scylladb.com>
(cherry picked from commit f3742d1e38)
2017-06-28 00:21:35 +03:00
Avi Kivity
3de701dbe1 Merge "Fix compilation issues in older environments" from Tomasz
* 'tgrabiec/fix-compilation-issues' of github.com:cloudius-systems/seastar-dev:
  tests: streamed_mutation_test: Avoid using boost::size() on row ranges
  tests: row_cache: Remove unused method

(cherry picked from commit ff7be8241f)
2017-06-27 16:31:42 +03:00
Shlomi Livne
9912b7d1eb release: prepare for 2.0-rc0 2017-06-27 12:37:59 +03:00
Takuya ASADA
1e86196ed5 dist/debian: unofficial support of Ubuntu non-LTS versions / Debian non-stable versions
Currently our build script only supports Ubuntu 14.04/16.04 and Debian 8;
this change extends support to Ubuntu non-LTS versions and Debian
non-stable versions. Note that this is unofficial support; users should
build the package for these distributions themselves.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1498491473-28691-1-git-send-email-syuu@scylladb.com>
2017-06-26 18:55:55 +03:00
Asias He
cc02a62756 repair: Prefer nodes in local dc when streaming
When peer nodes have the same partition data, i.e., with the same
checksum, we currently choose to stream from any of them randomly.
To improve streaming performance, prefer a peer within the same DC.
This patch is supposed to improve repair performance with multiple DCs.

Message-Id: <c6a345b6e8ed2b59f485e53c865241e463b44507.1498490831.git.asias@scylladb.com>
2017-06-26 18:34:21 +03:00
Avi Kivity
1170f56447 Merge "Speed up gossip dissemination in large cluster" from Asias
Fixes #2528.

* tag 'asias/gossip_talk_to_more_nodes/v3' of github.com:cloudius-systems/seastar-dev:
  gossip: Use vector for _live_endpoints
  gossip: Talk to more live nodes in each gossip round
2017-06-26 17:59:43 +03:00
Asias He
e31d4a3940 gossip: Use vector for _live_endpoints
To speed up the random access in get_random_node. Switch to use vector
instead of set.
2017-06-26 22:49:59 +08:00
Asias He
437899909d gossip: Talk to more live nodes in each gossip round
In large clusters with a multi-DC deployment, it is observed that
gossip updates take a long time to disseminate in the cluster.

To speed up, talk to more live nodes in each gossip round.

Fixes #2528
2017-06-26 22:49:59 +08:00
Nadav Har'El
6cf44f6817 Optimize column_family::make_sstable_reader() for one partition
This patch does the same thing to column_family::make_sstable_reader() as
commit 186f031 did to sstable::as_mutation_source().

Although usually one can fast_forward_to() on the result of a
column_family::make_sstable_reader(), earlier we had an optimization
where if a single partition was specified, it was read exactly,
and fast_forward_to() was *NOT* allowed.

With the mutation_reader::forwarding flag patch, when this flag
was on - requesting fast_forward_to() - we disabled this optimization.
This makes sense, but is not backward compatible with the code which
previously assumes this optimization exists. In particular,
column_family::data_query() does a single partition read but does not
specify forwarding::no explicitly.
So this patch restores this optimization, despite this meaning that we
blatantly ignore the fwd_mr flag in that case.

Fixes #2524.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170626141121.30322-1-nyh@scylladb.com>
2017-06-26 17:13:03 +03:00
Avi Kivity
9b21a9bfb6 Merge "Implement partial cache" from Tomasz and Piotr
"This series enables cache to keep partial partitions.
Reads no longer have to read whole partition from sstables
in order to cache the result.

The 10MB threshold for partition size in cache is lifted.

Known issues:

 - There is no partial eviction yet, whole partitions are still evicted,
   and partition snapshots held by active reads are not evictable at all
 - Information about range continuity is not recorded if that
   would require inserting a dummy entry, or if previous entry
   doesn't belong to the latest snapshot
 - Cache update after memtable flush happening concurrently with reads
   may inhibit that reads' ability to populate cache (new issue)
 - Cache update from flushed memtables has partition granularity,
   so may cause latency problems with large partition
 - Schema is still tracked per-partition, so after schema changes
   reads may induce high latency due to whole partition needing
   to be converted atomically
 - Range tombstones are repeated in the stream for every range between
   cache entries they cover (new issue)
 - Populating scans for both small and large partitions (perf_fast_forward)
   experienced a 40% reduction of throughput, CPU bound

How was this tested:

 - test.py --mode release
 - row_cache_stress_test -c1 -m1G
 - perf_fast_forward, passes except for the test case checking range continuity population
   which would require inserting a dummy entry (mentioned above)
 - perf_simple_query (-c1 -m1G --duration 32):
     before: 90k [ops/s] stdev: 4k [ops/s]
     after:  94k [ops/s] stdev: 2k [ops/s]"

* tag 'tgrabiec/introduce-partial-cache-v8' of github.com:cloudius-systems/seastar-dev: (130 commits)
  tests: row_cache: Add test_tombstone_merging_in_partial_partition test case
  tests: Introduce row_cache_stress_test
  utils: Add helpers for dealing with nonwrapping_range<int>
  tests: simple_schema: Allow passing the tombstone to make_range_tombstone()
  tests: simple_schema: Accept value by reference
  tests: simple_schema: Make add_row() accept optional timestamp
  tests: simple_schema: Make new_timestamp() public
  tests: simple_schema: Introduce make_ckeys()
  tests: simple_schema: Introduce get_value(const clustered_row&) helper
  tests: simple_schema: Fix comment
  tests: simple_schema: Add missing include
  row_cache: Introduce evict()
  tests: Add cache_streamed_mutation_test
  tests: mutation_assertions: Allow expecting fragments
  mutation_fragment: Implement equality check
  tests: row_cache: Add test for population of random partitions
  tests: row_cache: Add test for partition tombstone population
  tests: row_cache: Test reading randomly populated partition
  tests: row_cache: Add test_single_partition_update()
  tests: row_cache: Add test_scan_with_partial_partitions
  ...
2017-06-26 14:54:37 +03:00
Avi Kivity
555621b537 Disentangle memtables from sstables
Remove sstable::write_components(memtable), replacing it with a helper.

Fixes #2354
Message-Id: <20170624142639.16662-1-avi@scylladb.com>
2017-06-26 09:37:11 +02:00
Avi Kivity
236a8370e4 Remove use of std::random_shuffle()
It was removed in C++17. Replace with std::shuffle().
Message-Id: <20170626063809.7563-1-avi@scylladb.com>
2017-06-26 09:36:38 +02:00
Avi Kivity
c4ae2206c7 messaging: respect inter_dc_tcp_nodelay configuration parameter
We respect it partially (client side only) for now.

Fixes #6.
Message-Id: <20170623172048.23103-1-avi@scylladb.com>
2017-06-24 21:49:27 +02:00
Duarte Nunes
2dfd7040eb CMakeLists.txt: Add boost support
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170623172236.15507-1-duarte@scylladb.com>
2017-06-24 21:49:27 +02:00
Avi Kivity
801b5220d6 Merge seastar upstream
* seastar 9e2b7ec...0ab7ae5 (4):
  > Update fmt submodule
  > rpc: add options to control tcp_nodelay
  > core: Fix compilation for older versions of Boost
  > tests/lowres_clock_test: Fix compilation issues
2017-06-24 20:47:52 +03:00
Tomasz Grabiec
b0bcf2be53 tests: row_cache: Add test_tombstone_merging_in_partial_partition test case 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
23c6f517cb tests: Introduce row_cache_stress_test
Runs readers, updates and eviction concurrently and verifies the
following property of reads:

  - reads see all past writes

  - reads see no partial writes within a single partition
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
4b4aef789e utils: Add helpers for dealing with nonwrapping_range<int> 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
5c9f87fb27 tests: simple_schema: Allow passing the tombstone to make_range_tombstone() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
edf4a3494c tests: simple_schema: Accept value by reference 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
5f70df472f tests: simple_schema: Make add_row() accept optional timestamp 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
53867c4328 tests: simple_schema: Make new_timestamp() public 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
51b5814ec2 tests: simple_schema: Introduce make_ckeys() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
074c67fe4d tests: simple_schema: Introduce get_value(const clustered_row&) helper 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
8ffc776e06 tests: simple_schema: Fix comment 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
ecacd2e84a tests: simple_schema: Add missing include 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
b56232b216 row_cache: Introduce evict() 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
c4e8effffa tests: Add cache_streamed_mutation_test
[tgrabiec:
  - extracted from a larger commit
  - removed coupling with how cache_streamed_mutation is created (the
    code went out of sync), used more stable make_reader(). it's simpler too.
  - replaced false/true literals with is_continuous/is_dummy where appropriate
  - dropped tests for cache::underlying (class is gone)
  - reused streamed_mutation_assertions, it has better error messages
  - fixed the tests to not create tombstones with missing timestamps
  - relaxed range tombstone assertions to only check information relevant for the query range
  - print cache on failure for improved debuggability
]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
44fdee3f2e tests: mutation_assertions: Allow expecting fragments 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1f23130b07 mutation_fragment: Implement equality check 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
116bcb8b30 tests: row_cache: Add test for population of random partitions 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
930a1415fe tests: row_cache: Add test for partition tombstone population 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
9bfece6f82 tests: row_cache: Test reading randomly populated partition 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
0358334579 tests: row_cache: Add test_single_partition_update()
[tgrabiec: Extracted from "row_cache: Introduce cache_streamed_mutation"]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
8bb76e2f12 tests: row_cache: Add test_scan_with_partial_partitions 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
896bf2e5de Remove unused methods from MVCC
Some apply methods were replaced by apply_to_incomplete().

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
6f6575f456 row_cache: Enable partial partition population 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
5a0ae55f6d Introduce schema_upgrader 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1828e28bbb database: Invalidate cache atomically with attaching streaming sstables
Not doing so may cause reads to see partial writes, if another
update+read happens in between.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
896196b841 database: Invalidate cache from seal_active_streaming_memtable_immediate()
Cache must be synchronized atomically with changing the underlying
mutation source, otherwise write atomicity may not hold.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
7ae40d7045 tests: Add test for update_invalidating() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
e792220c3a row_cache: Introduce update_invalidating() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
c29878f49f row_cache: Extract memtable walking logic from update() into do_update()
So that it can be reused in update_invalidating().
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
6ebfb730ee partition_entry: Introduce partition_tombstone() getter 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
fb62dfab02 tests: mvcc: Introduce test_schema_upgrade_preserves_continuity 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
164989a574 tests: mvcc: Add test for partition_entry::apply_to_incomplete() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
e433e68610 partition_entry: Make squashed() and upgrade() work with not fully continuous versions
Those methods first create a neutral mutation_partition, and left-fold
it with the versions. The problem is that there is no neutral element
for static row continuity, the flag from the first addend always
wins. We have to copy the flag from the first version to preserve
the logical value.
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
b680de930c partition_entry: Introduce apply_to_incomplete()
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>

[tgrabiec:
  - extracted from a larger commit
  - fix heap comparator in apply_incomplete_target to order versions properly
  - extracted partition_version detaching into
    partition_entry::with_detached_versions()
  - dropped unnecessary rows_iterator::_version field
  - dropped unnecessary allocation of rows_entry and key copies
    in rows_iterator
  - dropped row_pointer
  - replaced apply_reversibly() with weaker and faster apply()
  - added handling of dummy entries at any position
  - fixed exception safety issue in apply_to_incomplete() which may
    result in data loss. We cannot move data out of applied versions
    into a new synthetic row and then apply it, because if exception
    happens in the middle, the data which was moved from the source
    will be lost. To fix that, row_iterator::consume_row() is
    introduced which allows in-place consumption of data without
    construction of temporary deletable_row.
  ]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
b6ce963200 partition_version: Introduce partition_entry::with_detached_versions() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
2d8f024e4d partition_version: Document version merging rules on partition_entry 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
0770845a23 mutation_partition: Introduce r-value accepting deletable_row::apply() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
48a5b1d3ab converting_mutation_partition_applier: Expose cell upgrade logic 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
04aebaa2cb streamed_mutation: Introduce transform() 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
3755641c4b row_cache: Introduce cache_streamed_mutation
This streamed mutation populates the cache with the rows requested by
the read. It takes whatever it can find in the cache and fetches the
remainder from the underlying source.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>

[tgrabiec:

  - fixed maybe_add_to_cache_and_update_continuity() leaking entries if
    the key already exists in the snapshot

  - fixed a problem where population race could result in a read
    missing some rows, because cache_streamed_mutation was advancing
    the cursor, then deferring, and then checking continuity. We
    should check continuity atomically with advancing.

  - fixed rows_handle.maybe_refresh() being accessed outside of update
    section in read_from_underlying() (undefined behavior)

  - fixed a problem in start_reading_from_underlying() where we would
    use incorrect start if lower_bound ended with a range tombstone
    starting before a key.

  - range tombstone trimming in add_to_buffer() could create a
    tombstone which has too low start bound if last_rt.end was a
    prefix and had inclusive end. invert_kind(end_kind) should be used
    instead of unconditional inc_start.

  - range tombstone trimming incorrectly assumed it is fine to trim
    the tombstone from underlying to the previous fragment's end and
    emit such tombstone. That would mean the stream can't emit any
    fragments which start before previous tombstone's end. Solve with
    range_tombstone_stream.

  - split add_to_buffer() into overloads for clustering_row, and
    range_tombstone. Better than wrapping into mutation_fragment
    before the call and having add_to_buffer() rediscover the
    information.

  - changed maybe_add_to_cache_and_update_continuity() to not set
    continuity to false for existing entries, it's not necessary

  - moved range tombstone trimming to range_tombstone class
  - moved range tombstone slicing code to range_tombstone_list and partition_snapshot
  - can_populate::can_use_cache was unused, dropped
  - dropped assumption that dummy entries are only at the end
  - renamed maybe_add_to_cache_and_update_continuity() to maybe_add_to_cache()
  - dropped no longer needed lower_bound class
  - extracted row_handle to a separate patch
  - made the copy-from-cache loop preemptable
  - split maybe_add_next_to_buffer_and_update_continuity(bool)
  - dropped cache_populator
  - replaced "underlying" class with use of read_context
  - replaced can_populate class with a function
  - simplified lsa_manager methods to avoid moves
]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
58a8022462 intrusive_set_external_comparator: Introduce insert_check() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
509a0d8a83 row_cache: Allow reading from underlying through read_context
The interaction will be as follows:

  - Before creating cache_streamed_mutation for given partition, cache
    mutation reader sets up read_context for current partition (in one
    of two ways) so that the matching underlying streamed_mutation can
    be accessed at any time by cache_streamed_mutation.

  - cache_streamed_mutation assumes that read_context is set up for
    current partition and invokes fast_forward_to() and
    get_next_fragment() to access the underlying
    streamed_mutation.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
69ea29131f row_cache: Allow specifying desired snapshot in autoupdating_underlying_reader
When reading from incomplete partition entry, we may discover we need
to read something from the underlying mutation source. In such case we
will fast forward this reader to that partition. But we must do it
using a specific snapshot, the one we obtained when entering the
partition, not the latest one.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
a1d3e0318c row_cache: Store autoupdating_underlying_reader in read_context
Will be reused for reading of incomplete partition entries.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
3f2320c377 row_cache: Store information whether query is a range query in read_context
We will need to use this information later in yet another place, when
creating a reader for incomplete cache entry. This refactors the code
so that there is a single place which determines this fact.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
a2207ee9a6 row_cache: Move autoupdating_underlying_reader to read_context.hh 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
ca920bd0ef row_cache: Keep only one streamed_mutation in scanning_and_populating_reader
Currently scanning_and_populating_reader asks
just_cache_scanning_reader for the next partition from cache, together
with information about whether the range is continuous. If it's not, it saves the
partition it got from it and moves on to reading from the underlying
reader up to that partition. When that's done, it emits the stored
partition.

This approach won't work well with upcoming changes for storing
partial partitions. We won't have whole partitions any more, so
streamed_mutation returned for the entry needs to be prepared for
reading from the underlying mutation source. We want to reuse the same
underlying reader as much as possible, so all streamed_mutations for
given read (read_context) will share the state of the underlying
reader. Construction of a streamed_mutation will depend on the fact
that the shared state is set up for it, so we cannot have two
streamed_mutations prepared at the same time (one for entry from
primary, and one for the earlier entry being populated). This change
defers the creation of a streamed_mutation for the entry present in
cache until the whole reader reaches it to avoid this problem.

This will also have another potentially beneficial effect. Since we
defer the decision about which snapshot to use until we reach the
entry, there is a higher chance that the current snapshot of the entry
will match the one used last by the populating read, and that we will
be able to reuse the reader.

It's implemented by utilizing a stable partition cursor which tracks
its current position so that it's possible to revisit the cache entry
(if it's still there) after population ends. The functionality of
just_cache_scanning_reader was inlined into
scanning_and_populating_reader.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1b041298fe range: Introduce trim_front() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
045888d5f3 row_cache: Introduce partition_range_cursor 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
e989d65539 dht: Make ring_position_view copyable
dht::token needs to be stored as a pointer now and not a reference so
that validity of old pointers is not impacted by in-place object
construction which would occur in the copy-assignment operator.

[1] says that old pointers can be used to access the new object only
if the type "does not contain any non-static data member whose type is
const-qualified or a reference type".

[1] http://en.cppreference.com/w/cpp/language/lifetime#Storage_reuse
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
c3905bf235 row_cache: Print position instead of key of cache_entry 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
f0cc86e5db row_cache: Introduce cache_entry::position() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
e3272526a1 row_cache: Allow comparing with ring_position views in row_cache::compare 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
5bfecaad99 row_cache: Switch invalidate_unwrapped() to use ring_position_view ranges
It's needed before switching cache_entry ordering to rely solely on
cache_entry::position() so that invalidate_unwrapped() never removes
the dummy entry at the end. Currently if the range has upper bound
like this:

  { ring_position::max(), inclusive=true }

The code which selects entries for removal would include the dummy row
at the end. It uses upper_bound() to get the end iterator, and the
dummy entry has a position which is equal to the position in the
bound.

ring_position_view ranges are end-exclusive, so it's impossible to
create a partition range which would include a dummy entry.

The code is also simpler.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
64626b32b0 row_cache: Make printable 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
639af55a78 partition_version: Add versions() getter
[tgrabiec: Use explicit return type]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1d3fec43eb partition_version: Make return type of versions() explicit 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
22a0e301f1 partition_version: Make is_referenced() const-qualified 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
a17fa5726f Introduce streamed_mutation_from_forwarding_streamed_mutation
This will allow conversion from streamed_mutation that
supports fast forwarding to streamed_mutation that does not.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
11191b7aef streamed_mutation: Introduce make_empty_streamed_mutation() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
7c9569ec95 Introduce partition_snapshot_row_cursor 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
54b3da1910 row_cache: Introduce find_or_create() helper 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
f2d2c221d4 row_cache: Return cache_entry reference from do_find_or_create_entry
Will be useful when additional action needs to be done on the entry
after it was created or constructed.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
0db6bdc916 row_cache: Introduce cache_entry constructor which constructs incomplete entry 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1c642219c9 row_cache: Ensure there is always a dummy entry after all clustered rows
Algorithms will assume that.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
bbfa52822e row_cache: Switch readers to use per-entry snapshots
Currently readers are always using the latest snapshot. This is fine
for respecting write atomicity if partitions are fully continuous in
cache (now), but will break write atomicity once partial population is
allowed.

Consider the following case:

  flush write(ck=1), write(ck=2) -> snapshot_1
  cache reader 1 reads and inserts ck=1 @snapshot_1
  flush write(ck=1), write(ck=2) -> snapshot_2
  cache reader 2 reads and inserts ck=2 @snapshot_2

Because cache update is not atomic, it can happen that reader 2 will
complete while the partition hasn't been updated yet for snapshot_2.
In such case, after read 2 the partition would contain ck=1 from
snapshot_1 and ck=2 from snapshot_2. It will match neither of the
snapshots, and this could violate write atomicity.

To solve this problem we conceptually assign each partition key in the
ring to its current snapshot which it reflects. The update process
gradually converts entries in ring order to the new snapshot. Reads
will not be using the latest snapshot, but rather the current snapshot
for the position in the ring they are at.

There is a race between the update process and populating reads. Since
after the update all entries must reflect the new snapshot, reads
using the old snapshot cannot be allowed to insert data which can no
longer be reached by the update process. Before this patch this race
was prevented by the use of a phased_barrier, where readers would keep
phased_barrier::operation alive between starting a read of a partition
and inserting it into cache. Cache update was waiting for all prior
operations before starting the update. Any later read which was not
waited for would use the latest snapshot for reads, so the update
process didn't have to fix anything up for such reads.

After this change, later reads cannot always use the latest snapshot,
they have to use the snapshot corresponding to given entry. So it's
not enough for update() to wait for prior reads in order to prevent
stale populations. The (simple) solution implemented in this patch is
to detect the conflict and abandon population of given sub-range. In
general, reads are allowed to populate given range only if it belongs
to a single snapshot.

Note that the range here is not the whole query range. For population
of continuity, it is the range starting after the previous key and
ending after the key being inserted. When populating a partition
entry, the range is a singular range containing only the partition
key. Readers switch to new snapshots automatically as they move across
the ring. It's possible that the insertion of the partition doesn't
conflict, but continuity does. In such case the entry will be inserted
but continuity will not be set.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
81e7b561da dht: Add ring_position min()/max() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
8ba6366610 row_cache: Switch to using snapshot_source
Currently every time cache needs to create reader for missing data it
obtains a reader which is most up to date. That reader includes writes
from later populate phases, for which update() was not yet
called. This will be problematic once we allow partitions to be
partially populated, because different parts of the partition could be
partially populated using readers using different sets of writes, and break
write atomicity.

The solution will be to always populate given partition using the same
set of writes, using reader created from the current snapshot. The
snapshot changes only on update(), with update() gradually converting
each partition to the new snapshot.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
446bcdb00d database: Add missing cache invalidation after attaching sstables
This violation of the contract is currently benign, because there are
no reads from those tables before they are populated. If there were,
the cache would mark the whole (empty) range as continuous and the
table would appear empty.

It will cause similar problem once cache starts using snapshots of the
underlying mutation source. Then this lack of invalidate() will also
result in cache thinking that the table is still empty.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
e23c7e2f34 row_cache: Rework invalidate() implementation
1) Reduce duplication by delegating to more general overloads

 2) Improve documentation to not mention effects in terms of
    population (a detail) but rather write visibility

 3) Rename clear() to invalidate() and merge with the range variant,
    it has the same semantics
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
c82c6ec6ed database: Allow obtaining snapshot_source for sstables 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
bd023b6161 tests: Introduce memtable_snapshot_source
Snapshottable in-memory mutation source for use in row_cache tests.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
ddfcf64966 mutation_source: Make copying cheaper
Cache readers will need to take snapshots by copying the
mutation_source. That's going to happen quite often, so make copying
cheaper.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
58d5e1393b mutation_reader: Introduce make_combined_mutation_source() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1e2463a382 mutation_reader: Introduce make_empty_*_source() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
289d01c2cc mutation_reader: Introduce concept of snapshot_source 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
2d73c193e7 row_cache: Introduce read_context
This object stores all the read-relevant context that is required all
over the place. This leads to cleaner code.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>

[tgrabiec:
  - made read_context shareable to allow storing shared
    mutable state later
  - added range and cache getters
]
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
a3ff8db323 row_cache: Introduce autoupdating_underlying_reader
This is an abstraction that represents a reader
to the underlying source and auto updates itself
to make sure the reader reflects the latest state
of the underlying source.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>

[tgrabiec: Add range getter to avoid friendships]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
b6d349728f range_tombstone_list: Introduce slice() working with position range 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
6ce08f2f9a range_tombstone: Introduce trim_front() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
271dfc2eac position_in_partition: Introduce for_range_start()/for_range_end() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
3b52afa4a3 position_in_partition: Introduce no_clustering_row_between() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
9c2b3e1167 position_in_partition: Introduce as_start_bound_view() 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
b47c8f1df7 partition_snapshot: Add const-qualified overload of version()
[tgrabiec: Extracted from a different patch]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
dd9d35c166 partition_snapshot: Add getter for range tombstones 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
60c3c0a471 partition_entry: Add squashed() overload with a single schema 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
98f7671553 partition_snapshot: Introduce squashed() 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
87b0f11be3 partition_snapshot: Add getters for static row and partition tombstone
[tgrabiec:
  - Extracted from a different patch
  - Renamed concept names to more familiar Map and Reduce
  - Renamed aggregate() to squashed() to match the existing nomenclature
  - Uncommented the concepts
  ]
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
ea59b9475e partition_version: Add const-qualified variant of operator->
[tgrabiec: Extracted from a different patch]
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
f6fe0acea4 partition_version: Make operator bool() const-qualified
[tgrabiec: Extracted from a different patch]
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
efc75b0bc3 mutation_partition: Add rows_entry constructor which accepts full contents
[tgrabiec: Extracted from different patch]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
7f8620d4a7 tests: mutation_source: Relax expectations about range tombstones
In preparation for having a partial cache which trims range tombstones
to the lower bound of the query.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
3a9212e0f2 tests: mutation_assertions: Add ability to limit verification to given clustering_row_ranges
Currently mutation sources are free to return range tombstones
covering a range which is larger than the query range. The cache
mutation source will soon become more eager about trimming such
tombstones. To account for such differences, allow telling the
assertions to only care about differences relevant for the given
clustering ranges.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
f925b26241 tests: mutation_reader_assertions: Simplify 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1d5d5e26a2 mutation: Introduce sliced() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
92d6456070 range_tombstone_list: Introduce equal() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
1594ace4d3 range_tombstone_stream: Make printable 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
19edb0b535 range_tombstone_list: Make printable 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
2e75595ecf range_tombstone_list: Introduce trim() 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
5a29c70f3e mutation_fragment: make mutation_fragment copyable
This will be needed by the implementation of cache_streamed_mutation.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
2fdabcaa9b Track population phase in partition_snapshot
This will be used by partial cache in later patches.

[tgrabiec:
  - changed title,
  - documented meaning of the variable,
  - renamed the variable,
  - introduced open_version(),
  - fixed continuity of the static row not being preserved in case
    a new version is created]

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
a841c77c54 Introduce maybe_merge_versions
This will be used in the following patches by partial cache.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
9642f806ab partition_version: Introduce version() getter 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
9380dd1ee3 mutation_source: make sure we never ignore fast forwarding
Mutation sources sometimes ignore the fast forwarding parameter, so
this change adds an assertion to check that the parameter
can be safely ignored.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
ab72241e22 mutation_reader: Accept forwarding flag in make_reader_returning()
By default make_reader_returning creates a reader that does not
support fast forwarding, but the second parameter can be used to
make it support fast forwarding.

[tgrabiec: Improve title]

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
ac03331490 row_cache_test: improve test_sliced_read_row_presence
Remove unused parameter and add checks to make sure
all expected rows have been received.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
db053ef902 tests: Add test for continuity merging rules 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
2edf08d36a tests: random_mutation_generator: Generate random continuity 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
8873a443db tests: mutation: Generate mutations with continuity 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
dce293e11c tests: row_cache: Apply only fully continuous mutations to underlying mutation source
Cache currently assumes that mutations coming from outside are fully
continuous.
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
e86f74edd8 tests: row_cache: Add missing apply() to test_mvcc test case
[tgrabiec: Extracted from "row_cache: Introduce cache_streamed_mutation"]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
95dcfa859b tests: row_cache: Improve test_mvcc()
assert_that().is_equal_to() gives better error message.

Also, there is code which can be replaced with
assert_that_stream().has_monotonic_positions()
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
05b56fcfb0 mutation_partition: Add support for specifying continuity
This will allow expressing lack of information about certain ranges of
rows (including the static row), which will be used in cache to
determine if information in cache is complete or not.

Continuity is represented internally using flags on row entries. The
key range between two consecutive entries is continuous iff
rows_entry::continuous() is true for the later entry. The range
starting after the last entry is assumed to be continuous. The range
corresponding to the key of the entry is continuous iff
rows_entry::dummy() is false.

[tgrabiec:
  - based on the following commits:
     4a5bf75 - Piotr Jastrzebski : mutation_partition: introduce dummy rows_entry
     773070e - Piotr Jastrzebski : mutation_partition: add continuity flag to rows_entry
  - documented that partition tombstone is always complete
  - require specifying the partition tombstone when creating an incomplete entry
  - replaced rows_entry(dummy_tag, ...) constructor with more general
    rows_entry(position_in_partition, ...)
  - documented continuity semantics on mutation_partition
  - fixed _static_row_cached being lost by mutation_partition copy constructors
  - fixed conversion to streamed_mutation to ignore dummy entries
  - fixed mutation_partition serializer to drop dummy entries
  - documented semantics of continuity on mutation_partition level
  - dropped assumptions that dummy entries can be only at the last position
  - changed equality to ignore continuity completely, rather than
    partially (it was not ignoring dummy entries, but ignoring
    continuity flag)
  - added printout of continuity information in mutation_partition
  - fixed handling of empty entries in apply_reversibly() with regards
    to continuity; we no longer can remove empty entries before
    merging, since that may affect continuity of the right-hand
    mutation. Added _erased flag.
  - fixed mutation_partition::clustered_row() with dummy==true to not ignore the key
  - fixed partition_builder to not ignore continuity
  - renamed dummy_tag_t to dummy_tag. _t suffix is reserved.
  - standardized all APIs on is_dummy and is_continuous bool_class:es
  - replaced add_dummy_entry() with ensure_last_dummy() with safer semantics
  - dropped unused remove_dummy_entry()
  - simplified and inlined cache_entry::add_dummy_entry()
  - fixed mutation_partition(incomplete_tag) constructor to mark all row ranges as discontinuous
  ]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
063b37f352 partition_snapshot_reader: Be prepared for skipping some row entries
If some row entries may have to be skipped by the reader then it could
be that _clustering_rows is not empty, but read_next() will return a
disengaged optional because there are no more rows in the current
range. The code assumed that this is never the case, and that if read_next()
returns a disengaged optional then we have exhausted all ranges. Before
introducing dummy entries this needs to be refactored.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
2cfe23a35e partition_snapshot_reader: Use rows_entry::position() for comparing rows
key() will not be valid for dummy entries.
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
50641b2849 partition_snapshot_reader: Reuse rows_entry comparator 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
d1a1fdfd57 partition_snapshot_reader: Encapsulate row walking to simplify read_next() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
a77734952d mutation_partition: Make rows_entry comparable with position_in_partition 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
65b3123516 mutation_partition: Use rows_entry::position() in comparators
key() will not be valid for dummy entries, but position() is always
valid.

[tgrabiec: Extracted from other commits]
[tgrabiec: Added missing change to range_tombstone_stream::get_next]
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
660f3127a6 mutation_partition: Introduce rows_entry::position()
In preparation for enabling dummy entries with position past all
clustering rows.
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
9d705bc1c6 position_in_partition_view: Add key component getter 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
206a5f2bf5 position_in_partition_view: Add is_clustering_row() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
874b34ac09 position_in_partition_view: Add converting constructor from a key reference 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
794f9b21ef position_in_partition: Add is_after_all_clustered_rows() 2017-06-24 18:06:11 +02:00
Tomasz Grabiec
14fbf4409c position_in_partition: Introduce after_key() in the view 2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
1457be4de8 position_in_partition: Introduce for_key()
[tgrabiec: Take the key by reference]
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
94c957c2ff Extract position_in_partition to separate header
This will allow its use in mutation_partition.hh

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
0fd4dedc6a position_in_partition: Add after_all_clustered_rows() to view
This is a position that's always in the end after any
other position. It will be used for dummy rows_entry.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
60346a2819 row_cache: remove unused read overload
This will simplify the following patches and unused
code should be removed anyway.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Piotr Jastrzebski
77f944880c cache: Remove support for wide partitions
This will be handled by row cache now.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2017-06-24 18:06:11 +02:00
Tomasz Grabiec
fbe8c24ebe tests: row_cache_alloc_stress: Make eviction detection more reliable
It can happen that touch() will trigger eviction on entry to
allocating section, and drop in occupancy around insertion
will not happen. As a result, we may evict a lot without detecting that.

Extend the check to include touch() and use more reliable eviction counters.
2017-06-24 18:06:11 +02:00
Jesse Haber-Kucharsky
0791e90424 Further improve CMakeLists.txt for CLion
We support situations where `seastar.pc` is available, and when it isn't.

When `seastar.pc` is available, we grab the compilation flags from it in
addition to the defaults.

Some DPDK files are statically available in the source repository. We prefer
those to files placed during compilation in case modifications are made during
development (that would be lost during a build).

We always disable GCC6 concepts for the IDE, even if they are enabled while
configuring the project for compilation. CLion's parser doesn't understand them.

One final benefit to this revision is that now only target-specific flags are
modified rather than global flags.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <75457652201c5ed89d05081ec4b56b4340721cf5.1498237756.git.jhaberku@scylladb.com>
2017-06-23 19:21:28 +02:00
Avi Kivity
aebc77507c Merge "Clock efficiency and organization" from Jesse
"This patch series makes two noteworthy changes:

- `gc_clock` now uses `seastar::lowres_system_clock` for better performance.

- Code common to all clocks is properly refactored and shared.

Otherwise, there are some small improvements that should have no functional impact."

* 'jhk/lowres_gc_clock/v1' of https://github.com/hakuch/scylla:
  Seal clock definitions
  `timestamp_clock::now()` is not `noexcept`
  Make `db_clock` `time_t` conversions `constexpr`
  Move common clock implementation helpers
  Simplify clock implementations
  db_clock.hh: Clean preprocessor directives
  Make `gc_clock` a model of `Clock`
  Use `lowres_system_clock` to back `gc_clock`
  Add `time_t` conversions for `gc_clock`
2017-06-23 18:47:56 +03:00
Jesse Haber-Kucharsky
b0ad1ff447 Seal clock definitions 2017-06-23 11:35:35 -04:00
Jesse Haber-Kucharsky
09954c45f1 timestamp_clock::now() is not noexcept
The problem is that `std::chrono::duration_cast` is not `noexcept`. As a result,
`timestamp_clock` is actually a model of `Clock` and not `TrivialClock`.
2017-06-23 11:35:35 -04:00
Jesse Haber-Kucharsky
28169fabca Make db_clock time_t conversions constexpr 2017-06-23 11:35:35 -04:00
Jesse Haber-Kucharsky
e045dddae8 Move common clock implementation helpers
This change fixes the dependencies between the clock implementation headers. All
the clocks share the common clock offset, but are otherwise independent (though
the `db_clock` does depend on `gc_clock` for time point conversions).
2017-06-23 11:35:35 -04:00
Jesse Haber-Kucharsky
2d184f27af Simplify clock implementations 2017-06-23 11:35:34 -04:00
Jesse Haber-Kucharsky
51c767c1c7 db_clock.hh: Clean preprocessor directives 2017-06-23 11:35:34 -04:00
Jesse Haber-Kucharsky
050ece6f74 Make gc_clock a model of Clock
It was missing `is_steady`.
2017-06-23 11:35:34 -04:00
Jesse Haber-Kucharsky
00bcd568a6 Use lowres_system_clock to back gc_clock
`seastar::lowres_system_clock` is more efficient than
`std::chrono::system_clock` and `gc_clock` has very coarse granularity
requirements.

Fixes #1957.
2017-06-23 11:35:34 -04:00
Jesse Haber-Kucharsky
73020685ee Add time_t conversions for gc_clock
`gc_clock` reports system time, and these conversion functions allow for
manipulating time points produced by the clock without making assumptions about
its epoch.
2017-06-23 11:35:34 -04:00
Duarte Nunes
4ef25e8e38 db/schema_tables: Add note to make_update_view_mutations
Document that a new view schema passed to make_update_view_mutations()
might be based on base schema that hasn't yet been loaded.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170618200558.96036-1-duarte@scylladb.com>
2017-06-23 15:24:35 +02:00
Duarte Nunes
bc1f1fa88a Merge branch "Some fixes for clang++ trunk" from Avi
"Fix minor issues found while building with clang trunk."

* 'clang' of https://github.com/avikivity/scylla:
  seastarx: don't make seastar namespace inline
  seastarx: add missing make_shared forward declaration
  tests: fix call to seastar::sleep()
  dht: fix bad to_sstring() call
2017-06-22 17:31:27 +02:00
Avi Kivity
f3366d8ae6 seastarx: don't make seastar namespace inline
It's apparently not legal to re-declare an existing namespace
inline. Use "using" instead.
2017-06-22 18:16:13 +03:00
Avi Kivity
c423330917 seastarx: add missing make_shared forward declaration
Without this, clang (correctly) complains that it can't deduce the type when
it is not explicitly mentioned.
2017-06-22 18:16:13 +03:00
Avi Kivity
672de608bf tests: fix call to seastar::sleep()
It's not in the global namespace.
2017-06-22 18:16:13 +03:00
Avi Kivity
f9f2f18145 dht: fix bad to_sstring() call
to_sstring() is part of seastar, not the global namespace.
2017-06-22 17:51:27 +03:00
Avi Kivity
6f8dba3fa9 Merge "small fixes and cleanup for leveled strategy" from Raphael
* 'lcs_improvements_v1' of github.com:raphaelsc/scylla:
  lcs: remove useless code for choosing L0 candidates
  lcs: remove some dead code
  lcs: make logger static
  lcs: actually prefer oldest sstables of L0 when it falls behind
  lcs: remove useless expensive check for overlapping L1 sstables
2017-06-22 15:45:34 +03:00
Raphael S. Carvalho
4351e0a996 compaction: introduce new compaction type for reshard
so now user can look at nodetool compactionstats and determine
whether or not resharding is running, for example:
$ ./bin/nodetool compactionstats
pending tasks: 3
       id   compaction type   keyspace                table   completed   total   unit   progress
   <none>           RESHARD     system   compaction_history          11     256   keys      4.30%
   <none>           RESHARD     system   compaction_history           2     256   keys      0.78%
   <none>           RESHARD     system   compaction_history          10     256   keys      3.91%
   <none>           RESHARD     system   compaction_history           8     256   keys      3.12%
   <none>           RESHARD     system   compaction_history           7     256   keys      2.73%

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170620175733.25882-1-raphaelsc@scylladb.com>
2017-06-22 14:48:38 +03:00
Gleb Natapov
9b8499df0e cache_hitrate_calculator: filter cfs based on replication strategy instead of a name
The code filters CFs by name to not include system keyspace, but v3
schema added yet another system namespace. Better filter according to
replication strategy to accommodate for schema v4 adding even more
system keyspaces.

Fixes: #2516

Message-Id: <20170621073816.GB3944@scylladb.com>
2017-06-22 11:26:34 +03:00
Tzach Livyatan
9e6337f330 Add a commented-out experimental line to scylla.yaml
Making it easier for users to enable experimental features

Signed-off-by: Tzach Livyatan <tzach@scylladb.com>
Message-Id: <20170621191720.13575-1-tzach@scylladb.com>
2017-06-22 09:06:19 +03:00
Jesse Haber-Kucharsky
8174a40098 Improve CMakeLists.txt for CLion
This version optionally reads the include paths for Seastar from pkg-config and
uses file globbing to register all source and header files.

In comparison to the previous version, I see all files in the project explorer
view are "active" (rather than just .cc files). I believe there are also fewer
errors reported by the editor.
2017-06-21 16:34:47 -04:00
Avi Kivity
f0b20be14d Revert "system_keyspace: Make sure "system" is written to keyspaces (visible)"
This reverts commit 89ef69c4b3. Prevents nodes
from joining the cluster.
2017-06-21 16:58:04 +03:00
Avi Kivity
8585a356eb Revert "Revert "db: prevent latency spikes during streaming/repair""
This reverts commit 399d219cab. Turns out it
was not the culprit.
2017-06-21 16:58:04 +03:00
Takuya ASADA
aa77ac1138 dist/debian: Debian 9(stretch) support
Add support for Debian's new stable release.
Also includes the following changes:
 - update libthrift, since it fails to compile on Debian 9
 - drop dist/debian/supported_release, since the distribution check code moved to pbuilderrc
 - add libssl-dev to build-depends
 - add sudo to the pbuilder extra packages (Debian doesn't include it in the default install)

Signed-off-by: syuu <syuu@dokukino.com>
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1498047515-19972-1-git-send-email-syuu@scylladb.com>
2017-06-21 15:30:22 +03:00
Avi Kivity
399d219cab Revert "db: prevent latency spikes during streaming/repair"
This reverts commit bdfa2ed923245e236837f58925c797e26df32361; prevents nodes
from joining.
2017-06-21 11:28:29 +03:00
Calle Wilund
89ef69c4b3 system_keyspace: Make sure "system" is written to keyspaces (visible)
Fixes #2514
Bug in schema version 3 update: We failed to write "system" to the
schema tables. Only visible on an empty instance of course.
Message-Id: <1497966982-10044-1-git-send-email-calle@scylladb.com>
2017-06-20 20:59:47 +02:00
Avi Kivity
bdfa2ed923 db: prevent latency spikes during streaming/repair
The memtable destructor can take a long time if the memtable is full; use
clear_gently() to clear it without impacting latency.

Fixes #2477.
Message-Id: <20170620093550.16121-1-avi@scylladb.com>
2017-06-20 13:03:43 +02:00
Nadav Har'El
186f031187 Optimize sstable::as_mutation_source() for one partition
Although usually one can fast_forward_to() on the result of a
sstable::as_mutation_source(), earlier we had an optimization
where if a single partition was specified, it was read exactly,
and fast_forward_to() was *NOT* allowed.

With the mutation_reader::forwarding flag patch, when this flag
was on - requesting fast_forward_to() - we disabled this optimization.
This makes sense, but is not backward compatible with the code which
previously assumed this optimization exists... So this patch restores
this optimization, despite this meaning that we blatantly ignore
the fwd_mr flag in that case.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170620081107.14335-1-nyh@scylladb.com>
2017-06-20 10:32:10 +02:00
Avi Kivity
ba0ba87bf9 Merge seastar upstream
* seastar 621b7ed...9e2b7ec (8):
  > Merge "Low-resolution clocks" from Jesse
  > build: disable -Wattributes when gcc -fvisibility=hidden bug strikes
  > build: work around ragel 7 generated code bug
  > rpc: make unmarshall_exception() inline
  > future-utils: make functions global
  > fix reactor stall detector rate limiting on an mostly idle system
  > prometheus: Add ability to add /metrics to any http_server
  > prometheus: fix memory leak in http_server_control
2017-06-20 11:01:40 +03:00
Tomasz Grabiec
358bf88cf8 mutation_reader: Fix abort when streaming more than one range
multi_range_mutation_reader uses fast_forward_to() to skip between
ranges, so we always need to create the underlying reader with
mutation_reader::forwarding::yes if there is more than one range,
irrespective of whether multi_range_mutation_reader itself will be
forwarded or not.

Fixes #2510.

Introduced in commit 3018df1.

Message-Id: <1497943032-18696-1-git-send-email-tgrabiec@scylladb.com>
2017-06-20 10:29:45 +03:00
Amos Kong
92731eff4f common/scripts: fix node_exporter url
Commit ff3d83bc2f updated node_exporter
from 0.12.0 to 0.14.0, and introduced a bug in the install-file download URL.

node_exporter started adding a 'v' prefix to release tags[1] from 0.13.0,
so we need to fix the URL.

[1] https://github.com/prometheus/node_exporter/tags

Fixes #2509

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <42b0a7612539a34034896d404d63a0a31ce79e10.1497919368.git.amos@scylladb.com>
2017-06-20 09:25:39 +03:00
Raphael S. Carvalho
82048e6f77 lcs: remove useless code for choosing L0 candidates
The code being removed could be used if parallel compaction were
allowed for LCS, but the current code doesn't even allow that.
At the moment, it's only wasting cycles.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-06-19 21:19:29 -03:00
Raphael S. Carvalho
81f20068d6 lcs: remove some dead code
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-06-19 21:02:46 -03:00
Raphael S. Carvalho
b26dc6db1a lcs: make logger static
otherwise, there will be one instance per shard.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-06-19 21:01:21 -03:00
Raphael S. Carvalho
4bb27cbd6f lcs: actually prefer oldest sstables of L0 when it falls behind
The strategy prefers promoting the oldest sstables in L0. Because the sort
procedure incorrectly sorts elements in descending order, the
newest sstables will be promoted first *if and only if* L0 falls
behind (more than 32 sstables). If L0 doesn't fall behind, all
L0 sstables will be compacted with the overlapping ones in L1.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-06-19 20:45:39 -03:00
Raphael S. Carvalho
90db2d7eba lcs: remove useless expensive check for overlapping L1 sstables
There's no way an L1 sstable will be in the candidate set, which was
previously built from the list of L0 sstables.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-06-19 19:33:52 -03:00
Takuya ASADA
71600eb298 dist/debian: Use pbuilder for Ubuntu/Debian debs
Enable pbuilder for Ubuntu/Debian to prevent build-environment-dependent issues.
Also support cross building with pbuilder.
(cross-building from Fedora 25 and Ubuntu 16.04 has been tested)

closes #629

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1497895661-26376-1-git-send-email-syuu@scylladb.com>
2017-06-19 21:15:06 +03:00
Nadav Har'El
984da1d8d7 Make forwarding_tag local to streamed_mutation
As Avi noticed, the "forwarding_tag", which was meant to be local to
streamed_mutation, became global. If another class copied the same trick,
it would share the same type instead of the two being distinct types as intended.

The problem is that in:

	using forwarding = bool_class<class forwarding_tag>;

Apparently, the "class forwarding_tag" forward-declares a global type - it
does not create a local-scope type as intended, which the following apparently
does (even though no actual definition is given for that class):

	class forwarding_tag;
	using forwarding = bool_class<forwarding_tag>;
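The scoping difference can be demonstrated standalone (using a minimal stand-in for Seastar's bool_class; illustrative only, not the actual commit code):

```cpp
#include <type_traits>

// Minimal stand-in for seastar::bool_class (the real definition differs).
template <typename Tag>
struct bool_class {
    bool value = false;
};

struct reader_a {
    // The elaborated-type-specifier "class forwarding_tag" inside a template
    // argument declares the tag at the enclosing *namespace* scope...
    using forwarding = bool_class<class forwarding_tag>;
};

struct reader_b {
    // ...so repeating the trick here names the very same global tag.
    using forwarding = bool_class<class forwarding_tag>;
};

struct reader_c {
    // The fix: declare the tag as a member first, making it a distinct
    // nested type.
    class forwarding_tag;
    using forwarding = bool_class<forwarding_tag>;
};

// The bug: reader_a and reader_b accidentally share one flag type.
constexpr bool tags_shared =
    std::is_same_v<reader_a::forwarding, reader_b::forwarding>;
// The fix gives reader_c a flag type of its own.
constexpr bool tag_distinct =
    !std::is_same_v<reader_a::forwarding, reader_c::forwarding>;

static_assert(tags_shared);
static_assert(tag_distinct);
```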

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170619153933.13116-1-nyh@scylladb.com>
2017-06-19 20:04:47 +03:00
Nadav Har'El
3018df11b5 Allow reading exactly desired byte ranges and fast_forward_to
In commit c63e88d556, support was added for
fast_forward_to() in data_consume_rows(). Because an input stream's end
cannot be changed after creation, that patch ignores the specified end
byte, and uses the end of file as the end position of the stream.

As a result of this, even when we want to read a specific byte range (e.g.,
in the repair code to checksum the partitions in a given range), the code
reads an entire 128K buffer around the end byte, or significantly more, with
read-ahead enabled. This causes repair to do more than 10 times the amount
of I/O it really has to do in the checksumming phase (which in the current
implementation, reads small ranges of partitions at a time).

This patch has two levels:

1. In the lower level, sstable::data_consume_rows(), which reads all
   partitions in a given disk byte range, now gets another byte position,
   "last_end". That can be the range's end, the end of the file, or anything
   in between the two. It opens the disk stream until last_end, which means
   1. we will never read-ahead beyond last_end, and 2. fast_forward_to() is
   not allowed beyond last_end.

2. In the upper level, we add to the various layers of sstable readers,
   mutation readers, etc., a boolean flag mutation_reader::forwarding, which
   says whether fast_forward_to() is allowed on the stream of mutations to
   move the stream to a different partition range.

   Note that this flag is separate from the existing boolean flag
   streamed_mutation::forwarding - that one talks about skipping inside a
   single partition, while the flag we are adding is about switching the
   partition range being read. Most of the functions that previously
   accepted streamed_mutation::forwarding now accept *also* the option
   mutation_reader::forwarding. The exception are functions which are known
   to read only a single partition, and not support fast_forward_to() a
   different partition range.

   We note that if mutation_reader::forwarding::no is requested, and
   fast_forward_to() is forbidden, there is no point in reading anything
   beyond the range's end, so data_consume_rows() is called with last_end as
   the range's end. But if forwarding::yes is requested, we use the end of the
   file as last_end, exactly like the code before this patch did.

Importantly, we note that the repair's partition reading code,
column_family::make_streaming_reader, uses mutation_reader::forwarding::no,
while the other existing reading code will use the default forwarding::yes.

In the future, we can further optimize the amount of bytes read from disk
by replacing forwarding::yes by an actual last partition that may ever be
read, and use its byte position as the last_end passed to data_consume_rows.
But we don't do this yet, and it's not a regression from the existing code,
which also opened the file input stream until the end of the file, and not
until the end of the range query. Moreover, such an improvement will not
improve anything if the overall range is always very large, in which
case not over-reading at its end will not improve performance.
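The last_end choice described above boils down to the following (a hedged sketch with illustrative names, not the actual data_consume_rows() signature):

```cpp
#include <cstdint>

// If fast_forward_to() past the current range is allowed, the stream must
// stay open to the end of the file; otherwise there is no point in ever
// reading past the range's end.
uint64_t choose_last_end(uint64_t range_end, uint64_t file_end,
                         bool forwarding_allowed) {
    return forwarding_allowed ? file_end : range_end;
}
```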

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170619152629.11703-1-nyh@scylladb.com>
2017-06-19 18:31:32 +03:00
Pekka Enberg
98fb2c0b56 docs: Fix Docker Hub documentation logo
The URL got broken when www.scylladb.com changed. Fix it up.

Message-Id: <1497360648-19210-1-git-send-email-penberg@scylladb.com>
2017-06-19 13:11:59 +03:00
Takuya ASADA
e1459dc9ef dist/debian: provides 3rdparty packages for Debian jessie
We now provide prebuilt 3rdparty packages for jessie, allowing builds to run without --rebuild-dep.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1497863340-19726-1-git-send-email-syuu@scylladb.com>
2017-06-19 13:11:20 +03:00
Takuya ASADA
3d671baf3b dist/debian: define dh_auto_configure task correctly
We mistakenly placed ./configure.py in dh_auto_build, but it should be placed in
dh_auto_configure.
This bug causes issue #2505, since we hadn't defined a dh_auto_configure task yet (running cmake on top of the directory seems to be one of the default behaviors of dh_auto_configure).

Fixes #2505

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1497866068-32097-1-git-send-email-syuu@scylladb.com>
2017-06-19 12:58:59 +03:00
Duarte Nunes
7c17eba8e8 cql3/cql3_type: Don't quote tuple types
A regression introduced in 08b2ceb28e
quoted tuple type names, which, being of the form tuple<t1, ..., tn>,
would always be quoted. The quoted name would then not be found in any
internal data structures.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1497813816-39956-1-git-send-email-duarte@scylladb.com>
2017-06-18 22:03:57 +02:00
Duarte Nunes
ffcd4c76c2 ide: Add CMakeLists.txt for cmake-based IDEs
This patch adds a CMakeLists.txt file for cmake-based IDEs, like
CLion.

This file assumes the existence of a build/release/gen directory,
containing generated files.

Refs #867

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170618151333.94714-1-duarte@scylladb.com>
2017-06-18 18:47:55 +03:00
Avi Kivity
58fd3dd006 Merge "cql3: Quote type name when needed" from Duarte
"This patch set ensures we quote the name of a UDT when it
contains characters that may cause parsing by the CQL parser
to fail.

Fixes #2491"

* 'cql3-quote-type/v1' of https://github.com/duarten/scylla:
  cql3/util: Make maybe_quote() take argument by const reference
  cql3/cql3_type: Quote UDT name if needed
  schema: Lift maybe_quote() into cql3/util
2017-06-18 17:59:47 +03:00
Gleb Natapov
72a4554dd9 storage_proxy: Fix compilation on older (1.55) boost
Boost 1.55 (Ubuntu 14) fails to compile because the iterator produced by
boost::adaptors::transformed(), when a std::ref to a lambda is passed to
it, does not satisfy the iterator concept: it cannot be default-constructed
because std::reference_wrapper is not default-constructible.
boost::range::min_element() never actually default-constructs it, but the
concept is checked anyway. The patch fixes this by providing an explicit
functor that is default-constructible.

Message-Id: <20170618131836.GD3944@scylladb.com>
2017-06-18 16:54:41 +03:00
Duarte Nunes
b2c5aca4cf db/schema_tables: View mutations shouldn't always include base ones
When making the schema mutations for a view update, we should only
include the base table schema mutations (in case the target node
doesn't contain them) when the view is being directly updated. When it
is being updated as a side effect of updating the base table, then
including the base schema mutations will hide the actual changes being
performed on the base.

Fixes #2500

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1497782822-2711-1-git-send-email-duarte@scylladb.com>
2017-06-18 16:29:59 +03:00
Avi Kivity
6e2c9ef9fb Revert "Allow reading exactly desired byte ranges and fast_forward_to"
This reverts commit 317d7fc253 (and also the
related 2c57ab84b2).  It causes crashes
during range scans, reported by Gleb:

"To reproduce I run SELECT * FROM keyspace1.standard1; on typical c-s
dataset and 3 node cluster.

Backtrace:
    at /home/gleb/work/seastar/seastar/core/apply.hh:36
    rvalue=<unknown type in /home/gleb/work/seastar/build/release/scylla, CU 0x54cf307, DIE 0x55ebf2a>) at /home/gleb/work/seastar/seastar/core/do_with.hh:57
    range=std::vector of length 6, capacity 8 = {...}) at /home/gleb/work/seastar/seastar/core/future-util.hh:142
    at ./seastar/core/future.hh:890
    at /home/gleb/work/seastar/seastar/core/future-util.hh:119
    at /home/gleb/work/seastar/seastar/core/future-util.hh:142
2017-06-18 16:10:21 +03:00
Amnon Heiman
ff3d83bc2f node_exporter_install script update version to 0.14
Fixes #2097

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170612125724.7287-1-amnon@scylladb.com>
2017-06-18 12:25:58 +03:00
Calle Wilund
3464422051 commitlog_test: Fix reader test dropping rp handles
The test wants data in live segments to read from, so it should
not just drop the handles returned from allocate.
Message-Id: <1497344532-2616-1-git-send-email-calle@scylladb.com>
2017-06-16 22:45:46 +01:00
Etienne Kruger
be0a947596 tests: perf_simple_query: Add delete perf test
Add a performance test for deletion in addition to the existing update
and query tests. The deletion performance test is executed using the
'--delete' argument to perf_simple_query.

Fixes #2417.

Signed-off-by: Etienne Kruger <el@loadavg.io>
Message-Id: <20170615232500.26987-1-el@loadavg.io>
2017-06-16 14:51:00 +01:00
Duarte Nunes
b993124d94 cql3/util: Make maybe_quote() take argument by const reference
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-15 19:55:52 +00:00
Duarte Nunes
08b2ceb28e cql3/cql3_type: Quote UDT name if needed
This patch ensures we properly quote a UDT name, which may contain
characters like ".", which can lead the name to be interpreted as a
keyspace qualified name when parsed by the CQL parser.
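The quoting rule can be sketched roughly as follows (a hedged approximation, not the actual cql3::util::maybe_quote() implementation):

```cpp
#include <cctype>
#include <string>

// Keep plain lowercase identifiers as-is; otherwise wrap the name in double
// quotes, doubling any embedded quote characters (the assumed CQL rule).
std::string maybe_quote(const std::string& name) {
    bool plain = !name.empty() &&
                 !std::isdigit(static_cast<unsigned char>(name[0]));
    for (char c : name) {
        unsigned char u = static_cast<unsigned char>(c);
        if (!(std::islower(u) || std::isdigit(u) || c == '_')) {
            plain = false;  // e.g. a "." would parse as a keyspace qualifier
        }
    }
    if (plain) {
        return name;
    }
    std::string quoted = "\"";
    for (char c : name) {
        quoted += c;
        if (c == '"') {
            quoted += '"';  // escape embedded quotes by doubling them
        }
    }
    quoted += '"';
    return quoted;
}
```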

Fixes #2491

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-15 19:55:52 +00:00
Duarte Nunes
4886b7ed5e schema: Lift maybe_quote() into cql3/util
It's a more natural place given its current and future usages.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-15 19:55:52 +00:00
Avi Kivity
2c57ab84b2 mutation_reader: fix typo in forwarding_tag
The typo went unnoticed since the compiler picked up the global scope's
forwarding_tag.  The bug made streamed_mutation::forwarding and
mutation_reader::forwarding the same type, but fortunately there were
no type mixups due to this.
2017-06-15 20:13:01 +03:00
Avi Kivity
9cf6db3de5 Merge 2017-06-15 19:11:07 +03:00
Nadav Har'El
317d7fc253 Allow reading exactly desired byte ranges and fast_forward_to
In commit c63e88d556, support was added for
fast_forward_to() in data_consume_rows(). Because an input stream's end
cannot be changed after creation, that patch ignores the specified end
byte, and uses the end of file as the end position of the stream.

As a result of this, even when we want to read a specific byte range (e.g.,
in the repair code to checksum the partitions in a given range), the code
reads an entire 128K buffer around the end byte, or significantly more, with
read-ahead enabled. This causes repair to do more than 10 times the amount
of I/O it really has to do in the checksumming phase (which in the current
implementation, reads small ranges of partitions at a time).

This patch has two levels:

1. In the lower level, sstable::data_consume_rows(), which reads all
   partitions in a given disk byte range, now gets another byte position,
   "last_end". That can be the range's end, the end of the file, or anything
   in between the two. It opens the disk stream until last_end, which means
   1. we will never read-ahead beyond last_end, and 2. fast_forward_to() is
   not allowed beyond last_end.

2. In the upper level, we add to the various layers of sstable readers,
   mutation readers, etc., a boolean flag mutation_reader::forwarding, which
   says whether fast_forward_to() is allowed on the stream of mutations to
   move the stream to a different partition range.

   Note that this flag is separate from the existing boolean flag
   streamed_mutation::forwarding - that one talks about skipping inside a
   single partition, while the flag we are adding is about switching the
   partition range being read. Most of the functions that previously
   accepted streamed_mutation::forwarding now accept *also* the option
   mutation_reader::forwarding. The exception are functions which are known
   to read only a single partition, and not support fast_forward_to() a
   different partition range.

   We note that if mutation_reader::forwarding::no is requested, and
   fast_forward_to() is forbidden, there is no point in reading anything
   beyond the range's end, so data_consume_rows() is called with last_end as
   the range's end. But if forwarding::yes is requested, we use the end of the
   file as last_end, exactly like the code before this patch did.

Importantly, we note that the repair's partition reading code,
column_family::make_streaming_reader, uses mutation_reader::forwarding::no,
while the other existing reading code will use the default forwarding::yes.

In the future, we can further optimize the amount of bytes read from disk
by replacing forwarding::yes by an actual last partition that may ever be
read, and use its byte position as the last_end passed to data_consume_rows.
But we don't do this yet, and it's not a regression from the existing code,
which also opened the file input stream until the end of the file, and not
until the end of the range query. Moreover, such an improvement will not
improve anything if the overall range is always very large, in which
case not over-reading at its end will not improve performance.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170614072122.13473-1-nyh@scylladb.com>
2017-06-15 13:22:46 +01:00
Avi Kivity
da24bd7c34 Merge "Balance read requests according to CF's cache hit ratio" from Gleb
"During read query with CL<ALL not all replicas are contacted. It is
possible for some replicas to cache less data for some CF's (for instance
because of node restart), so the replica choice may have a big impact
on request's completion latency and on amount of work it generates in
a cluster.

This patch series keep track of per CF cached hit ratio and uses this
information to choose best replicas for a request. Nodes with lower
hit ratios are still contacted in order to populate their cache, but
less frequently."

* 'gleb/cache-hitrate' of github.com:cloudius-systems/seastar-dev:
  storage_proxy: load balance read requests according to cache hit rates
  choose extra replica for speculation in filter_for_query()
  consistency_level: drop filter_for_query_dc_local function
  database: reset node's hit rate information on connection drop
  messaging_service: connection drop notifier
  Store cluster wide cache hit statistics in CF
  messaging_service: return cache hit ratio as part of data read
  Distribute cache temperature over gossiper.
  periodically calculate avg cache hit rate between all shards
  database: introduce cache_temperature class
  Rename load_broadcaster.cc to misc_services.cc
  storage_proxy: use db::count_local_endpoints function instead open code it
2017-06-15 14:33:08 +03:00
Avi Kivity
7dffe7f933 Merge "parallel repair and more memory usage fix" from Asias
"This series reduces repair memory usage and improves repair speed."

* tag 'asias/fix-repair-2430-branch-master-v4.1' of github.com:cloudius-systems/seastar-dev:
  repair: Repair on all shards
  repair: Allow one stream plan in flight
2017-06-15 14:00:19 +03:00
Duarte Nunes
5736468a71 mutation_partition_serializer: Assume range tombstone support
Range tombstones were introduced in version 1.3 and there exists
no direct upgrade from 1.2 to vnext, so we can retire the code
enforcing backwards compatibility.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170614211654.82501-1-duarte@scylladb.com>
2017-06-15 09:54:05 +03:00
Avi Kivity
e11f1c9cc3 tests: fix partitioner_test build on gcc 5 2017-06-14 17:22:01 +03:00
Gleb Natapov
4fdfa2dbb7 gdb: Fix "scylla heapprof" command
Message-Id: <20170612084241.GF21915@scylladb.com>
2017-06-14 15:41:39 +02:00
Gleb Natapov
c7a59ab7ff do not calculate serialized size of commitlog_entry_writer before final format is known
Currently the commitlog_entry_writer constructor calculates the serialized size
before it is known whether a schema should be included in the entry. The
result is never used, since it is recalculated when the schema information is
supplied. The patch removes the needless calculation.

Message-Id: <20170614114607.GA21915@scylladb.com>
2017-06-14 14:53:07 +03:00
Gleb Natapov
a032078410 intern also tuple and user defined types
Currently, each time a UDT or tuple is parsed a new object is created. If
those objects are used to repeatedly create a container type, it will cause
a memory leak, since container types are interned but the cache lookup is
done using a pointer to the contained type (which will always be different
for UDTs and tuples). This patch also interns UDTs and tuples, so each time
the same type is parsed the same pointer is returned.
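The interning fix can be sketched as a cache keyed by a canonical textual form of the type instead of by contained-type pointers (illustrative only, not Scylla's actual code):

```cpp
#include <map>
#include <memory>
#include <string>

struct type_info {
    std::string name;
};

// Re-parsing the same canonical name returns the same shared instance, so
// pointer-keyed caches built on top of it see a stable key.
std::shared_ptr<const type_info> intern_type(const std::string& canonical_name) {
    static std::map<std::string, std::shared_ptr<const type_info>> cache;
    auto it = cache.find(canonical_name);
    if (it != cache.end()) {
        return it->second;  // cache hit: same pointer as the first parse
    }
    auto t = std::make_shared<const type_info>(type_info{canonical_name});
    cache.emplace(canonical_name, t);
    return t;
}
```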

Refs #2469
Fixes #2487

Message-Id: <20170612142942.GO21915@scylladb.com>
2017-06-14 14:41:17 +03:00
Asias He
47345078ec repair: Repair on all shards
Currently, shard zero is the coordinator of the repair. All the work of
checksumming the local node and sending the repair checksum RPC
verb is done on shard zero only. This leaves the other shards
underutilized.

With this patch, we split the ranges that need to be repaired into at least
smp::count ranges, so that ranges.size() / smp::count ranges are assigned to
each shard. For example, with 8 shards and 256 ranges, each shard
will repair 32 ranges. Each shard repairs its 32 ranges
sequentially. There will be at most 8 (smp::count) range repairs in
parallel.
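The per-shard split described above can be sketched as follows (illustrative only, not the actual repair code):

```cpp
#include <vector>

// Distribute n_ranges repair ranges across smp_count shards round-robin;
// each shard then repairs its share sequentially, giving at most smp_count
// range repairs in parallel.
std::vector<std::vector<int>> assign_ranges(int n_ranges, int smp_count) {
    std::vector<std::vector<int>> per_shard(smp_count);
    for (int i = 0; i < n_ranges; ++i) {
        per_shard[i % smp_count].push_back(i);
    }
    return per_shard;
}
```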
2017-06-14 17:52:49 +08:00
Asias He
54831a344c repair: Allow one stream plan in flight
In "repair: Use more stream_plan" (commit 2043ffc064), we
switched to streaming while doing the checksum, instead of streaming only
after the checksum phase is completed. We take a parallelism_semaphore
before we do the checksum; if there are more than sub_ranges_to_stream
(1024) ranges, we start a stream_plan and wait for the streaming to
complete (still under the parallelism_semaphore). So at most
parallelism_semaphore (100) stream_plans can run in parallel.

The parallelism_semaphore limits the parallelism of both the checksum and the
stream plan. However, it is not necessary to have the same
parallelism for both checksum and streaming, because 1) a streaming
operation itself runs in parallel (handling ranges on all shards in
parallel, sending mutations in parallel), and 2) more stream plans
(in the worst case 100) means we can write to 100 memtables at the same time
and flush 100 memtables to disk at the same time, which can take a lot of
memory.

With this patch, we only allow one stream plan in flight.
2017-06-14 17:52:36 +08:00
Calle Wilund
525730e135 database: Fix assert in truncate to handle empty memtables+sstables
If we do two truncates in a row, the second will have neither memtable
nor sstable data. Thus we will not write/remove sstables, and thus
get no resulting truncation replay position.
Message-Id: <1497378469-6063-1-git-send-email-calle@scylladb.com>
2017-06-14 11:21:21 +02:00
Gleb Natapov
87094849fa storage_proxy: load balance read requests according to cache hit rates
This patch makes the storage proxy choose replicas to read from based on
their cache hit rates. Replicas with higher cache hit rates will see
more requests while replicas with lower hit rates will see fewer. The local
node has a special bonus and will get more requests even if another node
has a slightly higher cache hit rate (the same goes for the local vs. the
remote DC), but after the patch it is no longer guaranteed that the
coordinator node will be chosen as a replica for the read (if the feature is enabled).
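A hedged sketch of such hit-rate-weighted replica selection (the constants and structure here are invented for illustration, not Scylla's actual algorithm):

```cpp
#include <cstddef>
#include <random>
#include <vector>

struct replica {
    double hit_rate;  // 0.0 (cold cache) .. 1.0 (hot cache)
    bool local;       // local node / local DC gets a bonus
};

std::size_t pick_replica(const std::vector<replica>& candidates,
                         std::mt19937& rng) {
    std::vector<double> weights;
    weights.reserve(candidates.size());
    for (const auto& r : candidates) {
        // Cold replicas still see some reads, so their caches get populated.
        double w = 0.01 + r.hit_rate;
        if (r.local) {
            w *= 1.5;  // illustrative local-node bonus
        }
        weights.push_back(w);
    }
    std::discrete_distribution<std::size_t> dist(weights.begin(), weights.end());
    return dist(rng);
}
```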
2017-06-13 09:57:14 +03:00
Gleb Natapov
bc8aa1b4ee choose extra replica for speculation in filter_for_query()
Currently the storage proxy has to loop over the remaining replicas to search
for a suitable extra replica, but doing it in filter_for_query() is
extremely easy, so do it there instead.
2017-06-13 09:57:14 +03:00
Gleb Natapov
8437ea3b99 consistency_level: drop filter_for_query_dc_local function
Merge filter_for_query_dc_local() functionality into filter_for_query().
This is more efficient since filter_for_query_dc_local() partitions
endpoints into 'local' and 'remote' set but filter_for_query() already
does it for CL=LOCAL so for such queries we needlessly do it twice.
2017-06-13 09:57:14 +03:00
Gleb Natapov
ca812a8ea0 database: reset node's hit rate information on connection drop
A node may go down, so after it restarts its cache hit rate info will be
incorrect and it can be overwhelmed with traffic until a new,
up-to-date cache hit rate arrives. Solve this by dropping the node's
information on connection reset; this is more accurate than relying on
gossip, which may be slow and miss a node's reboot.
2017-06-13 09:57:14 +03:00
Gleb Natapov
23c51b3e57 messaging_service: connection drop notifier
Allow registering callbacks that will be called when a connection is going
down.
2017-06-13 09:57:14 +03:00
Gleb Natapov
0e4d5bc2f3 Store cluster wide cache hit statistics in CF 2017-06-13 09:57:14 +03:00
Gleb Natapov
69c5526301 messaging_service: return cache hit ratio as part of data read 2017-06-13 09:57:14 +03:00
Gleb Natapov
8ca1432b04 Distribute cache temperature over gossiper.
When a node starts it does not have any information about the cache temperature
of other nodes in the cluster, and it is hard (if not impossible) to make
the right guess. During cluster startup all nodes have cold caches, so there
is no point in redirecting reads to other nodes even though the local cache is
cold; but if only that one node restarted, then the other nodes have populated
caches and reads should be redirected.

The node will get up-to-date information about other nodes' caches,
but only after receiving the first reply; until then it does not have the
information to make the right decisions, which may cause unwanted spikes
immediately after restart. Having the cache temperature in the gossiper helps
solve the problem.
2017-06-13 09:57:14 +03:00
Gleb Natapov
991ec4a16c periodically calculate avg cache hit rate between all shards
This patch adds a new class, cache_hitrate_calculator, whose responsibility
is to periodically calculate the average cache hit rate across all shards
for each CF.
2017-06-13 09:57:14 +03:00
Gleb Natapov
fab18c0c5a database: introduce cache_temperature class
The class will represent cache hit rate for a column family and is
serializable for use with RPC.
2017-06-13 09:57:14 +03:00
Gleb Natapov
f59ecc2687 Rename load_broadcaster.cc to misc_services.cc
load_broadcaster is a very small class; move it into a generic file so that
we can put other small services there to save on compilation time.
2017-06-13 09:57:14 +03:00
Gleb Natapov
7bcf4c690f storage_proxy: use db::count_local_endpoints function instead of open-coding it 2017-06-13 09:57:14 +03:00
Gleb Natapov
21197981a5 Fix use after free in nonwrapping_range::intersection
end_bound() returns a temporary object (end_bound_ref), so it cannot be
taken by reference here and used later. Copy it instead.

Message-Id: <20170612132328.GJ21915@scylladb.com>
2017-06-12 15:34:36 +01:00
Tomasz Grabiec
20095d7ed6 gdb: Fix "scylla column_families" command
Apparently some GDB versions (7.11.1-86.fc24) don't parse double '>'
in a type name, so this:

 std::pair<utils::UUID const, seastar::lw_shared_ptr<column_family>>

should be this:

 std::pair<utils::UUID const, seastar::lw_shared_ptr<column_family> >

Message-Id: <1497256644-4335-1-git-send-email-tgrabiec@scylladb.com>
2017-06-12 11:39:50 +03:00
Tomasz Grabiec
9e7a040f0c gdb: Fix "scylla keyspaces" command
The problem is that 'key' is a 'bytes' object now, which doesn't have __format__.

Fixes the following error:

  Traceback (most recent call last):
    File "~/src/scylla/scylla-gdb.py", line 184, in invoke
  TypeError: non-empty format string passed to object.__format__
  Error occurred in Python command: non-empty format string passed to object.__format__

Message-Id: <1497253433-374-2-git-send-email-tgrabiec@scylladb.com>
2017-06-12 11:22:59 +03:00
Tomasz Grabiec
230683bdfa gdb: Add missing seastar namespace qualifier
Message-Id: <1497253433-374-1-git-send-email-tgrabiec@scylladb.com>
2017-06-12 11:22:53 +03:00
Asias He
2bcb368a13 repair: Fix range use after free
Capture it by value.

scylla:  [shard 0] repair - repair's stream failed: streaming::stream_exception (Stream failed)
scylla:  [shard 0] repair - Failed sync of range ==<runtime_exception
(runtime error: Invalid token. Should have size 8, has size 0#012)>: streaming::stream_exception (Stream failed)

Message-Id: <7fda4432e54365f64b556e7e4c26e36d3a9bb1b7.1497238229.git.asias@scylladb.com>
2017-06-12 11:00:57 +03:00
Avi Kivity
419ad9d6cb Merge "repair memory usage fix" from Asias
"This series switches repair to use more stream plans to stream the mismatched
sub ranges and use a range generator to produce sub ranges.

Tests show that no huge amount of memory is used for repair with a large data set.

In addition, we now log a progress report of how many ranges have been processed.

   Jun 06 14:18:22  [shard 0] repair - Repair 512 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]
   Jun 06 14:19:55  [shard 0] repair - Repair 513 out of 529 ranges, id=1, keyspace=myks, cf=mytable, range=(8526136029525195375, 8549482295083869942]

Fixes #2430."

* tag 'asias/fix-repair-2430-branch-master-v1' of github.com:cloudius-systems/seastar-dev:
  repair: Remove unused sub_ranges_max
  repair: Reduce parallelism in repair_ranges
  repair: Tweak the log a bit
  repair: Use more stream_plan
  repair: iterator over subranges instead of list
2017-06-08 14:19:08 +03:00
Tomasz Grabiec
9b7f170121 gdb: Improve error message
Message-Id: <1496849069-21750-1-git-send-email-tgrabiec@scylladb.com>
2017-06-07 18:26:31 +03:00
Tomasz Grabiec
0dfe1ad431 Merge "Relax replay position ordering requirement" from Calle
From seastar-dev.git calle/concorde

Normally, we require that all mutations applied to a column family
have replay positions higher than all previously flushed.
The main reason for this is to be able to determine when to drop a
commit log segment, i.e. determine that all replay positions less
than X are now in sstables.

This patch series, small as it is, relaxes this by instead of just
keeping track of high rp applied, keep a reference count to each
segment per CF in memtables, and on flush, release this very count.

The only case where we need to keep a water mark for RP is then
for table truncation, for which we simply say that the highest RP
applied to the column family is the lowest allowed henceforth,
and use the old reordering logic for this instead. I.e. very rare.

There is of course one (big?) downside to all this, and this is
"normal" commit log replay on startup after crash/shutdown.
Since we relax RP ordering, we cannot use RP:s in sstables as
low marks for replay start, since it is now allowed to exist
non-persisted mutations in commitlog with lower RP:s than
previously flushed. I.e. we more or less always have to replay
the full commit log.
It is worth noting though that due to compaction and the non-
propagation of RP marks to new sstables, we end up often
doing this anyway, so it is hard to say how much of a regression
this is.
2017-06-07 14:51:28 +02:00
Calle Wilund
18806989b6 database: remove hard rp ordering requirement, set low rp mark on
truncate

With commitlog keeping use-count per CF id, we can ease the ordering
restriction on replay positiontion. Previously we required that all
added mutations have a position > previously flushed. However, if
we accept that replay must now be all data, by keeping track instead
per CF of highest RP ever entered, we can instead just set a
low mark on truncation, since this is the only remaining hard
RP divider.
2017-06-07 12:07:01 +00:00
Calle Wilund
d9b8c79eb9 commitlog_replayer: Ignore sstable replay positions
With relaxed position ordering, we cannot use existing sstables as a
watermark for replay. We must replay everything above the truncation
marks.
2017-06-07 12:07:01 +00:00
Calle Wilund
2913241df1 memtable/commitlog: Change bookkeeping to track individual segments
Use a per-CF-id reference count instead, and use handles as the result of
add operations. These must either be explicitly released or stored
(rp_set), or they will release the corresponding replay_position
upon destruction.

Note: this does _not_ remove the replay position ordering requirement
for mutations. It just removes it as a means to track segment liveness.
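The handle scheme can be sketched as an RAII type (illustrative only, not the actual commitlog API):

```cpp
// Each handle holds one reference to its segment; releasing or destroying
// the handle drops the count, and a segment whose count reaches zero is
// free to be recycled.
struct segment {
    int refs = 0;
};

class rp_handle {
    segment* _s;
public:
    explicit rp_handle(segment& s) : _s(&s) { ++_s->refs; }
    rp_handle(rp_handle&& other) noexcept : _s(other._s) { other._s = nullptr; }
    rp_handle(const rp_handle&) = delete;
    rp_handle& operator=(const rp_handle&) = delete;
    void release() {
        if (_s) {
            --_s->refs;
            _s = nullptr;
        }
    }
    ~rp_handle() { release(); }  // dropping the handle releases the segment
};
```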
2017-06-07 12:07:01 +00:00
Calle Wilund
0c598e5645 commitlog_test: Fix test_commitlog_delete_when_over_disk_limit
The test should:
a.) wait for the flush semaphore
b.) only compare segment sets between start and end, not start,
    end and in between; i.e., the test sort of assumed we started
    with < 2 (or so) segments, which is not always the case (timing).
Message-Id: <1496828317-14375-1-git-send-email-calle@scylladb.com>
2017-06-07 12:44:02 +03:00
Avi Kivity
07ff3f68e0 Merge seastar upstream
* seastar b1f69cc...621b7ed (8):
  > net/api: Remove outdated comments
  > Merge "Fixes for Clang 5" from Paweł
  > Merge "Metrics: Safely transfer metadata between shared" from Amnon
  > posix: add missing #include
  > build: add cmake dependency
  > build: add -Wno-maybe-uninitialized
  > rpc: handle messages larger than memory limit
       (Fixes #2453)
  > doxygen: enable macro expansion
2017-06-07 11:04:56 +03:00
Takuya ASADA
7fe63c539a dist/debian: install gdebi when it does not exist
Since we started to use gdebi to install the build-dep metapackage generated by
mk-build-deps, we need to install gdebi in build_deb.sh too.

Fixes #2451

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496819209-30318-1-git-send-email-syuu@scylladb.com>
2017-06-07 10:24:22 +03:00
Asias He
3fdb8a3d3f repair: Remove unused sub_ranges_max
With the sub range iterator, it is not used anymore. Drop it.
2017-06-07 08:52:45 +08:00
Asias He
ca00c10b35 repair: Reduce parallelism in repair_ranges
We currently repair all the ranges in parallel.

1) All the ranges contend for parallelism_semaphore. Instead of
processing multiple ranges in parallel and calculating the sub-ranges
(which take memory) for each range in parallel, we can handle the ranges
one by one.

We still have enough parallelism because the checksums are calculated on
all the shards.

2) If we handle the ranges one by one and the repair fails for some reason,
we can log which range repairs were successful and ignore them next time.
If we start the ranges in parallel, there is a high chance that no single
range completes, because all the ranges are ongoing.

Refs #1912
2017-06-07 08:50:57 +08:00
Asias He
3852665156 repair: Tweak the log a bit
- Count n out of m ranges the repair is running on (a kind of progress report)
- Make the 'Found differing range' log debug-level, because there can be
  millions of such entries
- Print the failed ranges
2017-06-07 08:50:57 +08:00
Asias He
2043ffc064 repair: Use more stream_plan
In the very beginning, we used a stream_plan for each checksum range.
Later, we changed to a single stream_plan for all the checksum
ranges. That pushes memory pressure onto streaming, e.g., millions of ranges
in a vector to send over RPC.

To fix this, we do checksum and streaming in parallel, limiting the number of
checksum ranges stored in memory.

Fixes #2430
2017-06-07 08:50:56 +08:00
Nadav Har'El
b3ff37e67f repair: iterator over subranges instead of list
When starting a repair, we divided the large token ranges (vnodes) into small
subranges of a desired length (around 100 partitions), and built a huge list
of those subranges - to iterate over them later and compare checksums of
those chunks.

However, building this list up front is completely unnecessary and wastes
a lot of memory: in a test with 1 TB of data, as much as 3 gigabytes was
spent on this list. Instead, what we do in this patch is find the next
chunk with a DFS-like splitting algorithm, using only the token range
midpoint() function (as before). The amount of memory needed for this is
O(log N), instead of O(N) in the previous implementation.

Refs #2430.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2017-06-07 08:50:56 +08:00
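The O(log N) splitting idea above can be sketched outside Scylla with plain integer tokens. This is an illustrative sketch only: `range`, `subrange_iterator`, and the size threshold are hypothetical names, and the real code splits token ranges via the midpoint() function rather than integer arithmetic.

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// Emit sub-ranges lazily, DFS-style: keep only the stack of ranges still
// to be split (O(log N) memory) instead of materializing every sub-range
// up front (O(N) memory).
struct range { int64_t start, end; };  // half-open [start, end)

class subrange_iterator {
    std::vector<range> _stack;  // pending ranges; depth is O(log N)
    int64_t _max_size;          // stop splitting below this size
public:
    subrange_iterator(range whole, int64_t max_size) : _max_size(max_size) {
        _stack.push_back(whole);
    }
    std::optional<range> next() {
        while (!_stack.empty()) {
            range r = _stack.back();
            _stack.pop_back();
            if (r.end - r.start <= _max_size) {
                return r;  // small enough: emit as a leaf
            }
            int64_t mid = r.start + (r.end - r.start) / 2;  // midpoint()
            _stack.push_back({mid, r.end});    // right half, visited later
            _stack.push_back({r.start, mid});  // left half, visited first
        }
        return std::nullopt;
    }
};
```

Calling next() repeatedly yields contiguous, ascending sub-ranges that cover the whole range, while never holding more than a logarithmic number of pending ranges in memory.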
Raphael S. Carvalho
0ca1e5cca3 sstables: fix report of disk space used by bloom filter
After a change in boot, read_filter is called by the distributed
loader, so its update to _filter_file_size is lost; the load variant
which receives foreign components must now do it. We were also
not updating it for newly created sstables.

Fixes #2449.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170606151129.5477-1-raphaelsc@scylladb.com>
2017-06-06 18:20:28 +03:00
Takuya ASADA
a4c392c113 dist/debian: use gdebi instead of mk-build-deps -i
At least on Debian 8, mk-build-deps -i silently finishes with return code 0
even when it fails to install dependencies.
To prevent this, we should manually install the metapackage generated by
mk-build-deps using gdebi.

Fixes #2445

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-2-git-send-email-syuu@scylladb.com>
2017-06-06 11:37:34 +03:00
Takuya ASADA
5608842e96 dist/debian/dep: install texlive from jessie-backports to prevent gdb build fail on jessie
Installing openjdk-8-jre-headless from jessie-backports breaks texlive from
the jessie main repo.
It causes an 'Unmet build dependencies' error when building the gdb package.
To prevent this, force installing texlive from jessie-backports before
starting to build gdb.

Fixes #2444

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496737502-10737-1-git-send-email-syuu@scylladb.com>
2017-06-06 11:37:33 +03:00
Paweł Dziepak
b2b78158f6 mutation_partition: restore formatting
No functional change.

Message-Id: <20170526104119.22075-2-pdziepak@scylladb.com>
2017-06-06 11:20:57 +03:00
Gleb Natapov
f5679e0416 database: remove remnants of no longer existing db::serializer.
Message-Id: <20170604100552.GD8248@scylladb.com>
2017-06-04 13:07:17 +03:00
Raphael S. Carvalho
dcbeb42f67 sstables: explicitly close file in fsync_directory
Otherwise, close is called in the reactor thread when destroying the
file object.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170602024346.7803-1-raphaelsc@scylladb.com>
2017-06-02 21:09:58 +03:00
Pekka Enberg
a6dc21615b Merge "Fixes to thrift/server" from Duarte
"This series fixes some issues with the thrift_server, namely
ensuring that streams and sockets are properly closed.

Fixes #499
Fixes #2437"

* 'thrift-server-fixes/v1' of github.com:duarten/scylla:
  thrift/server: Close connections when stopping server
  thrift/server: Move connection class to header
  thrift/server: Shutdown connection
  thrift/server: Close output_stream when connection is done
2017-06-02 08:15:22 +03:00
Duarte Nunes
c525331e60 thrift/server: Close connections when stopping server
Fixes #499

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-02 00:15:20 +02:00
Duarte Nunes
315c69b830 thrift/server: Move connection class to header
No changes in functionality. Required for an upcoming patch.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-02 00:15:20 +02:00
Duarte Nunes
22fafd5034 thrift/server: Shutdown connection
This patch adds the shutdown() function to thrift_server::connection,
and calls it after a connection is done.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-02 00:15:20 +02:00
Duarte Nunes
0a5ec97b7f thrift/server: Close output_stream when connection is done
Fixes #2437

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-06-02 00:15:20 +02:00
Jesse Haber-Kucharsky
376c661823 Eliminate duplicate definition of sstable column mask values
The column mask identifies the kind of atom in a row in an sstable. Two
definitions of these values were present: one as a C-style enumeration and one
as a C++11-style enumeration.

The C++11-style definition is used elsewhere in `sstables.cc`. It also offers
additional type-safety.

Therefore, this commit removes the inlined C-style enumeration.

Fixes #2214.

Signed-off-by: Jesse Haber-Kucharsky <jhaberku@scylladb.com>
Message-Id: <c525b4ae7fad3b54480e133921aa4ffe0dd5d9ce.1496352711.git.jhaberku@scylladb.com>
2017-06-02 00:06:31 +02:00
Michał Matczuk
04da4dbf83 docker support for api-address
Message-Id: <1b5fb2bbba1b879aae825094a0f1b77c865be139.1496318996.git.michal@scylladb.com>
2017-06-01 15:31:45 +03:00
Takuya ASADA
22339bba44 dist/debian: depends to collectd-core instead of collectd, to reduce dependencies
To reduce unwanted dependencies, we need to replace the dependency on
collectd with collectd-core.
However, collectd provides /etc/collectd/collectd.conf, so without that
package we need to install the configuration file ourselves.
So install the file in the .postinst script.

Fixes #2426

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496231743-7828-1-git-send-email-syuu@scylladb.com>
2017-06-01 13:20:37 +03:00
Takuya ASADA
909a9ebf97 dist/debian: provide prebuilt 3rdparty packages for Ubuntu 16.04
Currently we only offer a 14.04 prebuilt, but we have a 16.04 one on S3, so use it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1496301544-15251-1-git-send-email-syuu@scylladb.com>
2017-06-01 10:37:52 +03:00
Duarte Nunes
15a62701f2 test.py: Ensure view_schema_test runs with only one cpu
In the write path we don't wait for view updates, as they happen in
the background.

The view schema tests can fail when running with more than one cpu due
to this inherent race condition: the write to the base table returns
while the view updates are still being processed, after which we issue
a query to the view table. The shard handling the view data is not
guaranteed to finish processing the mutation before handling the query.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170531165726.9212-1-duarte@scylladb.com>
2017-05-31 19:17:51 +01:00
Raphael S. Carvalho
b8091799ca lcs: fix off-by-one comparison
The invariant is broken if the size of the L0 candidates is equal to
the max sstable size, because the overlapping L1 sstables will not be
added to the compacting set, and they will be promoted.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170530143708.3775-1-raphaelsc@scylladb.com>
2017-05-30 17:39:51 +03:00
Avi Kivity
15af6acc8b dist: redirect stdout/stderr to the journal on systemd systems
Fixes #2408.

Message-Id: <20170524080729.10085-1-avi@scylladb.com>
2017-05-30 08:47:17 +03:00
Avi Kivity
1c84aae0c1 Merge seastar upstream
* seastar 68dbf60...b1f69cc (10):
  > metrics: fix namespace in documentation
  > add special logger for memory allocation failures
  > xen: remove
  > Merge "sanity checks, fixes and extensions in the perftune.py" from Vlad
  > tutorial: more "seastar" namespace
  > execution_stages: fix build errors in comments
  > tutorial: more "seastar" namespace additions
  > tutorial: more minor changes
  > tutorial: minor changes to the introduction
  > tutorial: start overhauling the examples to use "seastar" namespace
2017-05-29 19:02:02 +03:00
Calle Wilund
3512ed4596 storage_service/config: Add "native_transport_port_ssl" option
Mimic Origin behaviour: iff TLS encryption is enabled, and
native_transport_port_ssl is set and different from
native_transport_port, start both TLS and non-TLS
listeners.

Message-Id: <1496061600-24454-2-git-send-email-calle@scylladb.com>
2017-05-29 15:53:56 +03:00
Calle Wilund
1b387a1f56 cql server: Allow multiple listeners on different ports
Need to separate the "notifiers" per port/address and manage
their life spans accordingly.

Message-Id: <1496061600-24454-1-git-send-email-calle@scylladb.com>
2017-05-29 15:53:50 +03:00
Avi Kivity
ef98afa748 build: make swagger generated code depend on the code generator
Fixes failures when moving between branches due to the seastar namespace
change.
Message-Id: <20170528100052.29131-1-avi@scylladb.com>
2017-05-29 13:17:42 +02:00
Avi Kivity
8979d7abf0 Deprecate non-murmur3 partitioners
Removing non-murmur3 partitioners will allow us to reduce memory footprint
and speed up some code by utilizing the properties of the murmur3 partitioner
token.
Message-Id: <20170528172536.16079-1-avi@scylladb.com>
2017-05-28 19:35:56 +02:00
Takuya ASADA
36ccbc1539 dist/ami: follow rpm output dir path change
CentOS mock support in build_rpm.sh changed the rpm output directory, so follow it.

Fixes #2406

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1495573343-13912-1-git-send-email-syuu@scylladb.com>
2017-05-28 13:02:36 +03:00
Amos Kong
f655639e5a scylla_setup: fix deadloop in inputting invalid option
example: # scylla_setup --invalid-opt

Fixes #2305

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <9a4f631b126d8eaaae479fa99137db7a61a7c869.1493135357.git.amos@scylladb.com>
2017-05-28 13:02:10 +03:00
Takuya ASADA
bdec38d23c dist/common/scripts/scylla_setup: skip SELinux setup when it's already disabled
It doesn't make sense to ask "Do you want to disable SELinux?" when SELinux is
already disabled, so skip the whole question.

Fixes #2411

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1495652423-20806-1-git-send-email-syuu@scylladb.com>
2017-05-28 13:00:10 +03:00
Avi Kivity
c4faa1e202 Merge "tracing: tracing spans and time series helper table" from Vlad
"
 - Introduce a parent span IP and span ID paradigm.
 - Introduce time series tables to simplify traces processing.
 - Add the "How to get traces?" chapter to the tracing.md.
"

* 'tracing-span-ids-and-time-series-helpers-v4' of github.com:cloudius-systems/seastar-dev:
  docs: tracing.md: add a "how to get traces" chapter
  tracing::trace_keyspace_helper: introduce a time series helper tables
  tracing: cleanup: use nullptr instead of trace_state_ptr()
  tracing: introduce a span ID and parent span ID
2017-05-28 12:01:35 +03:00
Paweł Dziepak
d9dd798c4f counter_write_query: avoid use-after-free on partition range
Message-Id: <20170526104119.22075-1-pdziepak@scylladb.com>
2017-05-28 11:41:30 +03:00
Raphael S. Carvalho
41137c7fb6 compaction: use sstable::bytes_on_disk for calculating start and end size
Currently, start and end size of compaction are calculated using the
uncompressed size of data component. bytes_on_disk() returns size
used by all components.

NOTE: start and end size are written to compaction history, so users
who monitor it should be aware of this change.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525212129.6758-1-raphaelsc@scylladb.com>
2017-05-28 11:33:24 +03:00
Raphael S. Carvalho
3b5ad23532 db: fix computation of live disk usage stat after compaction
sstable::data_size() is used by rebuild_statistics(), but it only
returns the uncompressed data size, while the function called by it
expects the actual disk space used by all components.
Boot uses add_sstable(), which correctly updates the stat with
sstable::bytes_on_disk(). That's what needs to be used by
rebuild_statistics() too.

Fixes #1592

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525210055.6391-1-raphaelsc@scylladb.com>
2017-05-28 10:38:32 +03:00
Vlad Zolotarov
1ae40ee91a utils::timestamped_val: fix the touch() comment
The current comment was written when the function was not yet a
timestamped_val member.

Let's adjust it to the current code.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1495555659-10881-1-git-send-email-vladz@scylladb.com>
2017-05-26 19:26:56 +03:00
Vlad Zolotarov
0619c2cb71 utils::serialization: remove not used deserialization_xxx() functions
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1495556124-16672-1-git-send-email-vladz@scylladb.com>
2017-05-26 19:26:20 +03:00
Tomasz Grabiec
de70d942a9 memtable: Decouple from sstable
We can make the dependency more abstract by using mutation_source
instead of an sstable.

Will be useful in some stress tests which want to avoid the disk, but
is also good for the sake of decoupling.
Message-Id: <1495729508-30081-2-git-send-email-tgrabiec@scylladb.com>
2017-05-25 19:30:21 +03:00
Tomasz Grabiec
f3a6d94398 sstables: Introduce sstable::as_mutation_source()
Adaptors extracted from existing testing code.
Message-Id: <1495729508-30081-1-git-send-email-tgrabiec@scylladb.com>
2017-05-25 19:30:20 +03:00
Glauber Costa
3d3afd8f11 node_exporter: add interrupt information
Information about interrupts is invaluable when debugging performance
problems with Scylla in the field. node_exporter doesn't include that in
the list of collectors enabled by default, so we suggest we do it here.

The list that goes into this file is the default list as shown by
node_exporter, with "interrupts" added to it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170525153008.26720-1-glauber@scylladb.com>
2017-05-25 19:11:30 +03:00
Avi Kivity
a8a82433e5 Merge "improve lcs promotion decision with compression enabled" from Raphael
"lcs' current behavior will make it hard to reduce number of
levels by increasing sstable size because it uses uncompressed
length when deciding whether or not a level needs promotion.
Demotion process is slower because of that."

* 'lcs_promotion_improvement_2' of github.com:raphaelsc/scylla:
  lcs: use sstable compressed length when computing level size
  sstables: introduce sstable::ondisk_data_size
  sstables: unconditionally set sstable data file size
2017-05-25 12:37:24 +03:00
Tomasz Grabiec
848ca035a2 gdb: Adjust scylla-gdb.py for the namespace change in seastar
Message-Id: <1495700444-29269-1-git-send-email-tgrabiec@scylladb.com>
2017-05-25 11:52:42 +03:00
Raphael S. Carvalho
b7e1575ad4 db: remove partial sstable created by memtable flush which failed
Partial sstable files aren't being removed after each failed attempt
to flush a memtable, which happens periodically. If the cause of the
failure is ENOSPC, memtable flush will be attempted forever, and
as a result, the column family may be left with a huge number of partial
files, which will overwhelm a subsequent boot when removing temporary
TOCs. In the past, this led to OOM because removal of temporary TOCs
took place in parallel.

Fixes #2407.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170525015455.23776-1-raphaelsc@scylladb.com>
2017-05-25 11:50:02 +03:00
Raphael S. Carvalho
0a105473df lcs: use sstable compressed length when computing level size
lcs uses uncompressed length of sstables when computing size of a
level, and that may result in unnecessary promotion when the goal
is to reduce the number of levels after an increase in sstable
size, for example.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-24 20:10:02 -03:00
Raphael S. Carvalho
b2dc0b2db5 sstables: introduce sstable::ondisk_data_size
This new function is an alternative to data_size() and returns the
on-disk size of the data component; data_size() returns the uncompressed
size of the data component if compression is enabled.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-24 20:10:00 -03:00
Raphael S. Carvalho
e02bf6da58 sstables: unconditionally set sstable data file size
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-24 20:09:58 -03:00
Piotr Jastrzebski
6528f3a963 Make sure mutation_reader for sstables can be fast-forwarded
Fixes #2145.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
[tgrabiec: Extracted from a series, fixed title]
Message-Id: <1495639745-19387-1-git-send-email-tgrabiec@scylladb.com>
2017-05-24 16:36:24 +01:00
Tomasz Grabiec
6cf2841654 mvcc: Extract partition_snapshot_reader to separate header
Right now the whole world includes it transitively, which results in
painful recompiles when the code changes.

Relax dependencies.
Message-Id: <1495620201-8046-1-git-send-email-tgrabiec@scylladb.com>
2017-05-24 12:13:15 +01:00
Asias He
f792c78c96 streaming: Do not abort session too early in idle detection
Streaming usually takes a long time to complete. Aborting it on a
false-positive idle detection can be very wasteful.

Increase the abort timeout from 10 minutes to a very large timeout, 300
minutes. A genuinely idle session will still be aborted eventually if
other mechanisms (e.g., the streaming manager's gossip callbacks for the
on_remove and on_restart events) do not abort it first.

Fixes #2197

Message-Id: <57f81bfebfdc6f42164de5a84733097c001b394e.1494552921.git.asias@scylladb.com>
2017-05-24 12:29:50 +03:00
Paweł Dziepak
3b9c0a6ae2 Merge "loading_cache: fix the known complexity issue in the shrink() method" from Vlad
Use the boost::intrusive containers in order to achieve a O(1) complexity
for both "LRU list" update and to minimize the memory overhead in the hash
table item to "LRU list" item connection:
   - Make the timestamped_val be both a bi::list and a bi::unordered_set
     item.
   - Make a bi::unordered_set be a cache backend instead of the
     std::unordered_map.

As a result, dropping k LRU items becomes an O(k) operation instead of
O(N log N), where N is the total number of cached items:
   - Every time a value is read - move it to the front of the "LRU list"
     (O(1)).
   - When we need to remove k LRU items:
      - Repeat k times:
         - Take an element from the back of the "LRU list". (O(1)).
         - Remove it from the bi::unordered_set and dispose. (O(1)).
           We use an auto-unlink configuration for bi::list, therefore
           disposing an item is going to auto unlink it from the list.

* 'permissions_cache_move_to_intrusive-v1' of github.com:scylladb/seastar-dev:
  utils::loading_cache: cleanup
  utils/loading_cache.hh: use intrusive list to store the lru entry
  utils::loading_cache: implement automatic rehashing
  utils::loading_cache: make the underlying map to be an intrusive unordered_set
2017-05-23 16:18:16 +01:00
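The complexity claim above can be illustrated with a simplified, non-intrusive analogue. The actual patch uses boost::intrusive containers so a single object links into both the hash table and the LRU list; this `lru_cache` is a hypothetical stand-in built from std::list plus a map of list iterators, which gives the same asymptotics: O(1) per read, O(k) to drop the k least-recently-read entries.

```cpp
#include <cstddef>
#include <list>
#include <unordered_map>
#include <utility>

// Simplified LRU cache: the list front holds the most recently read
// entry, and the map gives O(1) access to each entry's list position.
template <typename K, typename V>
class lru_cache {
    std::list<std::pair<K, V>> _lru;  // front = most recently read
    std::unordered_map<K, typename std::list<std::pair<K, V>>::iterator> _map;
public:
    void put(const K& k, V v) {       // assumes k is not already present
        _lru.emplace_front(k, std::move(v));
        _map[k] = _lru.begin();
    }
    V* get(const K& k) {              // O(1): splice the entry to the front
        auto it = _map.find(k);
        if (it == _map.end()) {
            return nullptr;
        }
        _lru.splice(_lru.begin(), _lru, it->second);
        return &it->second->second;
    }
    void shrink(std::size_t n) {      // O(k): pop LRU entries from the back
        while (_lru.size() > n) {
            _map.erase(_lru.back().first);
            _lru.pop_back();
        }
    }
    std::size_t size() const { return _lru.size(); }
};
```

Every read splices the entry to the head of the list in constant time, so shrink() can simply evict from the tail, mirroring the O(1)/O(k) behaviour the merge describes.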
Tomasz Grabiec
cec8d7f38c gdb: Fix error about gdb.Value not being convertible to int by %x format
Message-Id: <1495538843-27777-1-git-send-email-tgrabiec@scylladb.com>
2017-05-23 15:38:58 +03:00
Avi Kivity
fd0e1eb1e2 Merge "Fixes for mutation algebra" from Tomasz
"Enforces commutativity of addition:

 m1 + m2 == m2 + m1

and consistency of difference and addition with equality:

 m1 + (m2 - m1) == m1 + m2"

* tag 'tgrabiec/fix-range-tombstone-commutativity-v2' of github.com:cloudius-systems/seastar-dev:
  mutation: Make compare_*_for_merge() consistent with equals()
  tests: mutation: Improve assertion failure message
  tests: Use default equality in test_mutation_diff_with_random_generator
  mutation: Make counter cell difference consistent with apply
  tests: range_tombstone_list_test: Improve error message
  tests: range_tombstone_list: Check adjacent range merging
  range_tombstone_list: Merge adjacent range tombstones in apply()
  tests: mutation: Check commutativity of mutation addition
  range_tombstone_list: Avoid violating set invariant
  range_tombstone_list: Make tombstone merging commutative
  range_tombstone_list: Add erase() operation to the reverter
  range_tombstone_list: Make all undo operations ordered relative to each other
  utils: Extract to_boost_visitor() to a separate header
  allocating_strategy: Introduce alloc_strategy_unique_ptr<>
2017-05-23 15:20:38 +03:00
Tomasz Grabiec
804f46f684 mutation: Make compare_*_for_merge() consistent with equals()
equals() considers expiring cells to be different from non-expiring cells,
but compare_row_marker_for_merge() considers them equal. Fix the latter to
pick expiring cells. The choice was arbitrary.
2017-05-23 13:35:03 +02:00
Tomasz Grabiec
c1475a8eb2 tests: mutation: Improve assertion failure message 2017-05-23 13:16:03 +02:00
Tomasz Grabiec
d15880b3b7 tests: Use default equality in test_mutation_diff_with_random_generator 2017-05-23 13:16:03 +02:00
Tomasz Grabiec
9dbae279ad mutation: Make counter cell difference consistent with apply
The case when both cells are dead was not handled properly, the diff
was always empty, whereas the cell with higher timestamp should win.

Caused test_mutation_diff_with_random_generator to fail.
2017-05-23 13:16:03 +02:00
Tomasz Grabiec
951da421db tests: range_tombstone_list_test: Improve error message 2017-05-23 13:16:03 +02:00
Tomasz Grabiec
bee40b4628 tests: range_tombstone_list: Check adjacent range merging 2017-05-23 13:16:03 +02:00
Tomasz Grabiec
3c509308ab range_tombstone_list: Merge adjacent range tombstones in apply()
Needed for equivalence to work correctly with difference and addition:

  m1 + (m2 - m1) = m1 + m2

Fixes #2158.
2017-05-23 13:16:03 +02:00
Tomasz Grabiec
ef4c7c458c tests: mutation: Check commutativity of mutation addition 2017-05-23 12:11:12 +02:00
Tomasz Grabiec
1dea251ca2 range_tombstone_list: Avoid violating set invariant
The code was inserting an entry with the same key as its successor,
and only later adjusting the key of the old entry. This is violating
set's invariant of unique keys, and insertion may cause rebalancing. I
don't know if this violation actually causes problems currently, but
it's safer not to.

Fix by first updating the existing entry and then inserting the new
one.
2017-05-23 12:11:12 +02:00
Tomasz Grabiec
a2a22e5f00 range_tombstone_list: Make tombstone merging commutative
Example of non-commutative case:

 a = [1, 5]@t2
 b = {[2, 3]@t1, [4, 5]@t1}

 a + b = [1, 5]@t2
 b + a = [1, 4)@t2, [4, 5]@t2

After this patch, both will yield [1, 5]@t2.

The patch also changes the logic so that overlaps of tombstones with
equal timestamps are handled symmetrically. They are now merged
instead of being split on either boundary.

Refs #2158.
2017-05-23 12:11:12 +02:00
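The representational side of this can be sketched with a toy normalization pass. This is illustrative only, not the actual range_tombstone_list code; half-open integer ranges stand in for clustering ranges, and `rt`/`normalize` are hypothetical names. In the example above, a + b = [1, 5]@t2 while b + a = {[1, 4)@t2, [4, 5]@t2}: both cover every point identically, but the lists only compare equal once adjacent equal-timestamp ranges are merged.

```cpp
#include <cstdint>
#include <vector>

// A tombstone covering the half-open range [start, end) at timestamp ts.
struct rt { int64_t start, end, ts; };

// Merge adjacent ranges that share a timestamp, producing a canonical
// representation so that equivalent lists compare equal.
static std::vector<rt> normalize(std::vector<rt> v) {
    std::vector<rt> out;
    for (auto& t : v) {
        if (!out.empty() && out.back().end == t.start && out.back().ts == t.ts) {
            out.back().end = t.end;  // merge adjacent, same-timestamp ranges
        } else {
            out.push_back(t);
        }
    }
    return out;
}
```

After normalization, the split shape produced by b + a collapses back into the single range produced by a + b, which is the invariant the apply() change restores.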
Tomasz Grabiec
c4dac7c80f range_tombstone_list: Add erase() operation to the reverter 2017-05-23 12:11:12 +02:00
Tomasz Grabiec
935709cddc range_tombstone_list: Make all undo operations ordered relative to each other
Later operation may depend on the result of previous operation. Same dependency
is present when reverting the operations.

Fixes assertion failure in update reverter.
2017-05-23 12:11:12 +02:00
Vlad Zolotarov
2d4d198fb9 utils::loading_cache: cleanup
- Remove "_" at the beginning of the type names.
   - s/Pred/EqualPred/

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-22 23:02:18 -04:00
Vlad Zolotarov
fd59a548c0 utils/loading_cache.hh: use intrusive list to store the lru entry
Fix the shrink() O(n log n) complexity issue by constantly pushing the corresponding intrusive
list entry to the head of the list every time the values are read.

This will keep the list ordered by the last read time from the most recently read
to the least recently read entry.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-22 23:00:18 -04:00
Vlad Zolotarov
0c4e9efce7 utils::loading_cache: implement automatic rehashing
- Start the cache with 256 buckets - the minimum number of buckets.
   - Limit the maximal number of buckets by 1M buckets.
   - Keep the load factor between 0.25 and 1.0 as long as the number of buckets is
     between the minimum and the maximum values mentioned above.
   - Grow and shrink the hash every "refresh" period if needed.
   - Enable bi::power_2_buckets and bi::compare_hash bi::unordered_set options.
   - Enable bi::unordered_set_base_hook's bi::store_hash option.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-22 22:57:44 -04:00
Vlad Zolotarov
2be3596a4f utils::loading_cache: make the underlying map to be an intrusive unordered_set
Make the underlying map to be a boost::intrusive::unordered_set<timestamped_val>
instead of std::unordered_set<Key, timestamped_val>.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-22 18:45:13 -04:00
Tomasz Grabiec
5aeb9eb70c utils: Extract to_boost_visitor() to a separate header 2017-05-22 19:30:02 +02:00
Tomasz Grabiec
69e2eccf68 allocating_strategy: Introduce alloc_strategy_unique_ptr<> 2017-05-22 19:30:02 +02:00
Raphael S. Carvalho
4b4a1883aa refresh: do not use default priority for loading new sstables
Metadata is read using default priority class, which can significantly
slow down the process under high load. Compaction class can be used,
and if it turns out to be a problem, we can switch to a special class
for it.

Fixes #1859.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170517184546.17497-1-raphaelsc@scylladb.com>
2017-05-22 19:03:17 +03:00
Avi Kivity
ef428d008c Merge "reduce memory requirement for loading sstables" from Rapahel
"fixes a problem in which memory requirement for loading in-memory
components of sstables is very high due to unlimited parallelism."

* 'mem_requirement_sstable_load_v2_2' of github.com:raphaelsc/scylla:
  database: fix indentation of distributed_loader::open_sstable
  database: reduce memory requirement to load sstables
  sstables: loads components for a sstable in parallel
  sstables: enable read ahead for read of in-memory components
  sstables: make random_access_reader work with read ahead
2017-05-22 18:23:03 +03:00
Raphael S. Carvalho
28206993a4 database: fix indentation of distributed_loader::open_sstable
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-22 11:52:52 -03:00
Raphael S. Carvalho
a4e414cb3b database: reduce memory requirement to load sstables
SSTable load temporarily uses more space than needed to store metadata,
due to:
1) All components are read using read_simple(), which uses a 128k buffer.
file::dma_read_bulk() will allocate 128k, and may potentially allocate
another big buffer (128k - read) for file::read_maybe_eof().
2) read_filter() may use double the space it needs to.

Because sstable loading parallelism is unlimited, Scylla may require
much more memory to load all sstables, and that may lead to OOM.
The higher the number of sstables, the higher the memory overhead.

To confirm this problem, I wrote a test[1] which loads 30k sstables in
parallel and reports the memory usage peak in the end.
When loading 30k sstables, each of which metadata is ~300kb, memory
usage peak was ~18G. When loading completed, only ~9GB were needed to
store all the metadata.
[1]: https://gist.github.com/raphaelsc/2db37b4fb34301833ab9eeed3b1a524d

To fix this problem, we need to set a limit on load parallelism (let's
start with a small number like 3 and adjust later if needed) and rely
on readahead so that the requirement drops considerably without
increasing boot time. Actually, boot time is improved by it.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
2017-05-22 11:52:51 -03:00
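The fix amounts to bounding concurrency with a semaphore. In Seastar this is done with futures rather than threads, but the same shape can be sketched with a hand-rolled counting semaphore; `semaphore`, `load_sstables`, and the limit of 3 are illustrative and mirror the small starting number the commit suggests.

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Minimal counting semaphore used to cap how many loads run at once.
class semaphore {
    std::mutex _m;
    std::condition_variable _cv;
    int _units;
public:
    explicit semaphore(int units) : _units(units) {}
    void wait() {
        std::unique_lock<std::mutex> lk(_m);
        _cv.wait(lk, [this] { return _units > 0; });
        --_units;
    }
    void signal() {
        { std::lock_guard<std::mutex> lk(_m); ++_units; }
        _cv.notify_one();
    }
};

// Simulate loading n sstables with at most `limit` loads in flight;
// returns the highest concurrency actually observed (always <= limit).
inline int load_sstables(int n, int limit) {
    semaphore sem(limit);
    std::atomic<int> in_flight{0}, peak{0};
    std::vector<std::thread> loaders;
    for (int i = 0; i < n; ++i) {
        loaders.emplace_back([&] {
            sem.wait();                // bound the parallelism
            int cur = ++in_flight;
            int prev = peak.load();
            while (cur > prev && !peak.compare_exchange_weak(prev, cur)) {}
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
            --in_flight;
            sem.signal();
        });
    }
    for (auto& t : loaders) {
        t.join();
    }
    return peak.load();
}
```

The semaphore guarantees the in-flight count never exceeds the limit, so the peak memory needed for component buffers is bounded regardless of how many sstables exist.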
Raphael S. Carvalho
043fae2ef5 sstables: loads components for a sstable in parallel
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
2017-05-22 11:52:49 -03:00
Raphael S. Carvalho
0ac729fd57 sstables: enable read ahead for read of in-memory components
Read ahead 4 is used. Let's adjust it later if needed. File size is
used to prevent file_input_stream from issuing useless reads beyond
file size with read ahead enabled. We can switch to variant without
length once file_input_stream handles it properly.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-22 11:52:37 -03:00
Raphael S. Carvalho
77b8870cf3 sstables: make random_access_reader work with read ahead
Scylla crashes if read ahead is enabled by file_random_access_reader
because a call to seek() destroys the existing input stream without
closing it first.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-05-22 11:52:33 -03:00
Duarte Nunes
6ac73b57fb cql3/statements/select_statement: Remove dead code
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170522100230.17393-1-duarte@scylladb.com>
2017-05-22 14:32:12 +03:00
Avi Kivity
5828ddcca4 Merge seastar upstream
* seastar 4af898c...68dbf60 (4):
  > dpdk: follow namespace changes to fix compile error
  > perftune.py: fix regression introduced in df5f74ac
  > doc: typo in README.md
  > posix_net: load-balance connections
2017-05-22 12:39:48 +03:00
Asias He
b56ba02335 gossip: Make bootstrap more robust
The bootstrapping node will be a gossip-only member until the streaming
finishes and the node reaches NORMAL state. If during this time the
bootstrapping node is overwhelmed with streaming, it is possible the node
will delay updating the gossip heartbeat. Be forgiving to the bootstrapping
node and do not remove it from gossip too fast. Otherwise, streaming rpc
verbs will not be resent because the node is not in gossip membership
anymore.

Fixes #2150

Message-Id: <286d7035d854f2a48abf4e1e2e3bfcb8b22b9ca2.1494553580.git.asias@scylladb.com>
2017-05-21 19:25:40 +03:00
Takuya ASADA
7777b558c4 dist/redhat: Use mock for CentOS/RHEL rpms
Enable mock for CentOS/RHEL, also support cross building by mock.

Fixes #630

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20170513171200.14926-1-syuu@scylladb.com>
2017-05-21 19:22:54 +03:00
Avi Kivity
2f23648b9e Revert "dist: add conflict with Cassandra"
This reverts commit da55aecca3.  Instead of an
install-time conflict, we'll add a run-time conflict.
2017-05-21 18:37:59 +03:00
Alexys Jacob
c8116b4252 scylla_raid_setup: fix typo on print_usage
Simple typo fix on the usage message output, the script name was not correct.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170519145851.6205-1-ultrabug@gentoo.org>
2017-05-21 18:01:28 +03:00
Avi Kivity
5b182537db Merge seastar upstream
* seastar 8aef5f5...4af898c (4):
  > memory: fix debug build
  > tests: fix slab_test build
  > xen: fix fallouts from seastar namespace change
  > build: make swagger generated files depend on the code generator
2017-05-21 13:48:24 +03:00
Alexys Jacob
8dbad4f34a scylla_sysconfig_setup: fix typo on print_usage
Simple typo fix on the usage message output, the script name was not correct.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170519143227.2741-1-ultrabug@gentoo.org>
2017-05-21 13:41:43 +03:00
Alexys Jacob
c0756d97b8 scylla_setup: fix typos on cpu scaling messages
This fixes typos on CPU scaling related messages.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170519143703.3574-1-ultrabug@gentoo.org>
2017-05-21 13:41:42 +03:00
Glauber Costa
5f99158889 api: return correct values for bloom filter statistics
We are currently suspecting that the bloom filter false positive ratio
is not being respected. While trying to debug that, I found out that we
have a more basic problem:

The numbers are all meaningless, because the stats are wrong.  We are
accumulating by summing the ratios together. It's easy to see how this
doesn't work, if we look at an example where the ratio for some CFs is
zero:

SST1: false = 1, total = 2. ratio = 0.5
SST2: false = 0, total = 98 . ratio = 0.

The real ratio in this example is 1 / (98 + 2) = 1 %, but the displayed
ratio will be 0.5 + 0 = 0.5.

This patch will map reduce all the sstables together keeping both
numerator and denominator, yielding the right value at the end. To do
that, we'll reuse the existing ratio_holder class, which already does
exactly what we want.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170518222333.16307-1-glauber@scylladb.com>
2017-05-21 13:11:22 +03:00
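The arithmetic in the example can be checked directly. This `ratio_holder` is a hypothetical analogue of the class the commit says it reuses: accumulate numerator and denominator separately and divide only when reporting, instead of summing per-sstable ratios.

```cpp
// Hypothetical analogue of the ratio_holder the commit reuses: keep the
// false-positive count and the total separately, divide only at the end.
struct ratio_holder {
    double numerator;
    double denominator;
    ratio_holder(double n = 0, double d = 0) : numerator(n), denominator(d) {}
    ratio_holder& operator+=(const ratio_holder& o) {
        numerator += o.numerator;
        denominator += o.denominator;
        return *this;
    }
    double ratio() const {
        return denominator == 0 ? 0 : numerator / denominator;
    }
};
```

With the example's numbers, summing the two per-sstable ratios yields 0.5, while aggregating numerators and denominators first yields the real 1% ratio.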
Avi Kivity
ebaeefa02b Merge seastar upstream (seastar namespace)
 - introduced the "seastarx.hh" header, which does a "using namespace seastar";
 - 'net' namespace conflicts with seastar::net, renamed to 'netw'.
 - 'transport' namespace conflicts with seastar::transport, renamed to
   cql_transport.
 - "logger" global variables now conflict with the logger global type, renamed
   to xlogger.
 - other minor changes
2017-05-21 12:26:15 +03:00
Avi Kivity
dab2783b58 Merge seastar upstream
* seastar 45b718b...f726938 (2):
  > memory: add --mbind option to supress warning message when running Seastar apps on container
  > Add support for Gentoo Linux irqbalance configuration detection.
2017-05-20 21:15:46 +03:00
Avi Kivity
c8cb3d6ff5 Merge "Materialized views: bug fixes and unit tests" from Duarte
"This series fixes bugs related to materialized views, most pertaining
to column filtering in the where clause."

* 'materialized-views/bug-fixes/v1' of https://github.com/duarten/scylla:
  tests/view_schema_test: Add more test cases
  tests/cql_assertions: Add assertion for row set equality
  single_column_relation: Correctly print IN relation
  statement_restrictions: Allow filtering regular columns for views
  statement_restrictions: Relax clustering restrictions for views
  statement_restrictions: Relax partition restrictions for views
  cql3/statements: Prevent setting default ttl on view
  cql3/restrictions: Complete implementation of is_satisfied_by()
  db/view: Re-implement clustering_prefix_matches()
  db/view: Re-implement partition_key_matches()
  db/view: Generate regular tombstone for base deletions
  db/view: Consider cell liveness when generating updates
  db/view: Don't generate view updates for static rows
2017-05-20 13:52:56 +03:00
Tomasz Grabiec
cd4d15672b utils: estimated_histogram: Fix clear()
It was a no-op. It doesn't seem currently used, but I will have a use
for it soon.
Message-Id: <1495198172-1969-1-git-send-email-tgrabiec@scylladb.com>
2017-05-19 14:34:34 +01:00
Paweł Dziepak
c560cf9d9d Merge "fixes and improvements in the permissions cache implementation" from Vlad
"There are numerous issues in the current implementation of permissions
cache starting from the logical errors and bugs and ending with the
suboptimal implementation described in the issue #2262."

* 'permissions_cache_fixes-v4' of github.com:scylladb/seastar-dev:
  utils::loading_cache: avoid the reads storm when the key is not in the cache
  utils::loading_cache: cleanup
  utils::loading_cache: align the constraints in the constructor with the parameters description
  utils::loading_cache: refresh in the background
  auth::auth: add operator<<() for a permission_cache key
  auth::auth::permissions_cache: use the values from the configuration - don't try to be smart
  db::config: define a saner default value for permissions_validity_in_ms
2017-05-18 13:33:05 +01:00
Vlad Zolotarov
6a63c87a9f utils::loading_cache: avoid the reads storm when the key is not in the cache
Use a mutex to serialize producers when the key is not present in the cache.

Fixes #2262

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-18 07:55:48 -04:00
Tomasz Grabiec
3fc1703ccf range: Fix SFINAE rule for picking the best do_lower_bound()/do_upper_bound() overload
mutation_partition has a slicing constructor which is supposed to copy
only the rows from the query range. The rows are located using
nonwrapping_range::lower_bound() and
nonwrapping_range::upper_bound(). Those two have two different
implementations chosen with SFINAE. One is using std::lower_bound(),
and one is using the container's built-in lower_bound(), should it
exist. We're using an intrusive tree in mutation_partition, so
container's lower_bound() is preferred. It's O(log N) whereas
std::lower_bound() is O(N), because tree's iterator is not random
access.

However, the current rule for picking container's lower_bound() never
triggers, because lower_bound() has two overloads in the container:

  ./range.hh:618:14: error: decltype cannot resolve address of overloaded function
              typename = decltype(&std::remove_reference<Range>::type::upper_bound)>
              ^~~~~~~~

As a result, the overload which uses std::lower_bound() is used.

Spotted when running perf_fast_forward with wide partition limit in
cache lifted off. It's so slow that I timed out waiting for the result
(> 16 min).

Fixes #2395.

Message-Id: <1495048614-9913-1-git-send-email-tgrabiec@scylladb.com>
2017-05-18 13:28:10 +03:00
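The kind of overload selection described above can be sketched like this (hypothetical `my_lower_bound`/`do_lower_bound` names, not the actual range.hh code). The key point matches the quoted error: detect the member by *invoking* it in the SFINAE context, rather than taking its address with `decltype(&T::lower_bound)`, which breaks as soon as the member is overloaded.

```cpp
#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

// Preferred overload: viable only if c.lower_bound(k) compiles,
// i.e. the container has a usable member lower_bound (O(log N)).
template <typename Container, typename Key>
auto do_lower_bound(Container& c, const Key& k, int)
    -> decltype(c.lower_bound(k)) {
    return c.lower_bound(k);
}

// Fallback: generic std::lower_bound over iterators (O(N) for
// non-random-access iterators such as an intrusive tree's).
template <typename Container, typename Key>
auto do_lower_bound(Container& c, const Key& k, long)
    -> decltype(c.begin()) {
    return std::lower_bound(c.begin(), c.end(), k);
}

template <typename Container, typename Key>
auto my_lower_bound(Container& c, const Key& k) {
    return do_lower_bound(c, k, 0);  // int overload wins when viable
}
```

A std::set takes the member path; a std::vector falls back to std::lower_bound.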
Avi Kivity
ba31619594 tests: fix partitioner_test for g++ 5
It can't make the leap from dht::ring_position to
stdx::optional<range_bound<dht::ring_position>> for some reason.
2017-05-18 13:09:41 +03:00
Pekka Enberg
30b5933db2 Merge "Add Gentoo Linux support to utility and setup scripts" from Alexys
"These patches add support for setting up and operating ScyllaDB on Gentoo Linux.

 * scylla_setup and related scripts
 * node_health_check

 I have kept them as simple as possible and tested them to successfully
 set up and operate a three-node cluster running on Gentoo Linux."

* 'gentoo_linux_support' of github.com:ultrabug/scylla:
  scylla_setup: add gentoo linux installation detection
  prometheus node_exporter install: add support for gentoo linux
  raid setup: add support for gentoo linux
  ntp setup: add support for gentoo linux
  kernel check: add support for gentoo linux
  cpuscaling setup: add support for gentoo linux
  coredump setup: add support for gentoo linux
  detect gentoo linux on selinux setup
  add gentoo_variant detection and SYSCONFIG setup
2017-05-18 09:41:13 +03:00
Vlad Zolotarov
1ef22f84c1 utils::loading_cache: cleanup
- Fix a callback signature: receive a const ref.
- White spaces.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 15:03:14 -04:00
Vlad Zolotarov
87ce0b2d47 utils::loading_cache: align the constraints in the constructor with the parameters description
According to the description of permissions_validity_in_ms, the permissions_cache is enabled if this
value is set to a non-zero value. Otherwise the permissions_cache is disabled.

According to the permissions_update_interval_in_ms description it must have a non-zero value if permissions_cache
is enabled.

permissions_cache_max_entries description doesn't explicitly state it but it makes no sense to allow it to be zero
if permissions_cache is enabled.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 15:03:14 -04:00
Vlad Zolotarov
e286828472 utils::loading_cache: refresh in the background
This patch changes the way a loading_cache works.

Before this patch:
   1) If a permissions key is not in the cache it's loaded in the foreground and the original
      query is blocked till the permissions are loaded.
   2) Every _period the timer does the following:
      1) If a value was loaded more than _expiry time ago it is removed from the cache.
      2) If the cache is too big - the less recently loaded values are removed till the cache
         fits the requested size.

After this patch:
   1) If a permissions key is not in the cache it's loaded in the foreground and the original
      query is blocked till the permissions are loaded.
   2) Every _period the timer does the following:
      1) If a value in the cache was loaded or read for the last time more than _expiry time ago - it's removed from the cache.
      2) If the cache is too big - the less recently read values are removed till the cache fits the requested size.
      3) The values that were loaded more than _refresh time ago are re-read in the background.

The new implementation reduces the foreground reads for a frequently used value to a single
event (when the value is loaded for the first time).

It also ensures we do not reload values we no longer need.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 15:03:06 -04:00
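The per-tick behavior described above (leaving out the size trimming) can be modeled as a pure decision function. This is a hypothetical sketch with invented names, not the actual loading_cache code: drop entries neither loaded nor read within `_expiry`, and background-refresh entries loaded more than `_refresh` ago.

```cpp
#include <cassert>
#include <chrono>

enum class timer_action { keep, remove, refresh };

// Decision made for each cache entry on every _period tick:
// - removed if it was neither loaded nor read within the expiry window;
// - refreshed in the background if its load is older than the refresh
//   window;
// - kept untouched otherwise.
timer_action on_timer(std::chrono::seconds since_load,
                      std::chrono::seconds since_read,
                      std::chrono::seconds expiry,
                      std::chrono::seconds refresh) {
    if (since_load > expiry && since_read > expiry) {
        return timer_action::remove;   // last touch is too old
    }
    if (since_load > refresh) {
        return timer_action::refresh;  // reload in the background
    }
    return timer_action::keep;
}
```

A frequently read entry never expires, and as long as it keeps being refreshed in the background it never triggers a foreground load again.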
Alexys Jacob
fa0944ac19 scylla_setup: add gentoo linux installation detection 2017-05-17 18:06:54 +02:00
Alexys Jacob
9bb1bda466 prometheus node_exporter install: add support for gentoo linux 2017-05-17 18:06:34 +02:00
Alexys Jacob
1d235e5012 raid setup: add support for gentoo linux 2017-05-17 18:06:14 +02:00
Alexys Jacob
fdd5944ab2 ntp setup: add support for gentoo linux 2017-05-17 18:05:59 +02:00
Alexys Jacob
412f96a1bf kernel check: add support for gentoo linux 2017-05-17 18:05:45 +02:00
Alexys Jacob
a198f2b1af cpuscaling setup: add support for gentoo linux 2017-05-17 18:05:24 +02:00
Alexys Jacob
6a1807a7d8 coredump setup: add support for gentoo linux 2017-05-17 18:05:08 +02:00
Alexys Jacob
bc63e501db detect gentoo linux on selinux setup 2017-05-17 18:04:20 +02:00
Vlad Zolotarov
4edb336ac5 auth::auth: add operator<<() for a permission_cache key
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 12:03:56 -04:00
Vlad Zolotarov
d780818cac auth::auth::permissions_cache: use the values from the configuration - don't try to be smart
Our configuration already has the default values for the permission cache parameters.
Therefore, if the user provides bad parameters, we'd rather fail the load and inform them
about the bad parameters instead of trying to silently "fix" them.

In addition, the original code wasn't passing the parameters correctly: it switched the "expiry" and "refresh" parameters in
the utils::loading_cache constructor.

Add to this that the original code was doing really strange things in the permission_cache::expiry(cfg) method.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 12:03:56 -04:00
Vlad Zolotarov
ea1cfabe28 db::config: define a saner default value for permissions_validity_in_ms
It makes little sense to have the same value for permissions_update_interval_in_ms and permissions_validity_in_ms.
This may cause the values to be invalidated only because of some minor delays in the timer scheduling.

It makes a lot more sense to make the permissions_update_interval_in_ms value smaller than permissions_validity_in_ms.
This way we would minimize the chances of "false invalidation" due to some small delays in the timer scheduling.

In addition, 2s seems too small a value for permissions_validity_in_ms since our default read_request_timeout_in_ms is 5s.
This means that a single system_auth read failure would guarantee that the following queries are going to read system_auth data
in the foreground.

Setting it to 10s would allow a second read attempt before we enforce the foreground read.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-17 12:03:56 -04:00
Alexys Jacob
2ca0380d06 add gentoo_variant detection and SYSCONFIG setup 2017-05-17 18:03:53 +02:00
Avi Kivity
2aa5b3e20c Merge "Improve perf_fast_forward test" from Tomasz
"Notably:
  - add validation of the results (e.g. fragment count, expectations about disk activity)
  - add cache-specific tests"

* 'tgrabiec/add-cache-tests-to-perf-fast-forward' of github.com:cloudius-systems/seastar-dev:
  tests: perf_fast_forward: Report cache stats
  row_cache: Keep counters in a struct
  tests: perf_fast_forward: Add cache-specific tests
  tests: perf_fast_forward: Extract test_reading_all()
  tests: perf_fast_forward: Add validation of the results
  tests: perf_fast_forward: Fix partition scans to read the expected amount of fragments
  tests: perf_fast_forward: Allow the test to be interrupted
  tests: perf_fast_forward: Allow testing with cache enabled
  row_cache: Implement mutation_reader::fast_forward_to() for cache scanner
2017-05-17 18:06:02 +03:00
Calle Wilund
29b20d410a schema_tables: Remove "class" attribute from strategy options
Not 100% proper, but in line with how we still store the info.
Ensures (or at least helps) that schema loaded from tables
and schema from the builder stay comparable.

Fixes schema_changes_test error.

Message-Id: <1495030581-2138-2-git-send-email-calle@scylladb.com>
2017-05-17 17:56:11 +03:00
Calle Wilund
6ca07f16c1 scylla: fix compilation errors on gcc 5
Message-Id: <1495030581-2138-1-git-send-email-calle@scylladb.com>
2017-05-17 17:56:06 +03:00
Paweł Dziepak
3ecceaee48 Merge "Fix fast_forward_to() on sstable reader being ignored in some cases" from Tomasz
"When mutation reader enters the partition using index,
streamed_mutation object is returned to the user before the row start
fragment is processed. In that case, when we process the row start, we
should ignore it and not call setup_for_partition() again. That may
override user's fast_forward_to() request."

* 'tgrabiec/fix-initial-fast-forward-to-for-single-key-sstable-readers' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Test forwarding in single-key readers
  sstables: Remove unused code
  sstables: mutation_reader: Fix setup_for_partition() being called twice in some cases
  sstables: Fix verify_end_state() to tolerate ATOM_START_2 state
2017-05-17 15:35:30 +01:00
Avi Kivity
eb69fe78a4 Merge "Adding private repository to housekeeping" from Amnon
"This series adds private repository support to scylla-housekeeping"

* 'amnon/housekeeping_private_repo_v3' of github.com:cloudius-systems/seastar-dev:
  scylla-housekeeping service: Support private repositories
  scylla-housekeeping-upstart: Use repository id, when checking for version
  scylla-housekeeping: support private repositories
2017-05-17 15:56:46 +03:00
Tomasz Grabiec
777ffa3a27 tests: perf_fast_forward: Report cache stats 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
d1bde3036e row_cache: Keep counters in a struct
So that taking a snapshot of all stats is easy.
2017-05-17 14:15:14 +02:00
Tomasz Grabiec
7a81f5e980 tests: perf_fast_forward: Add cache-specific tests 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
1a7b03004a tests: perf_fast_forward: Extract test_reading_all() 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
a38fd16f89 tests: perf_fast_forward: Add validation of the results 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
3c3ea51657 tests: perf_fast_forward: Fix partition scans to read the expected amount of fragments
make_pkeys() needs to be invoked with n equal to the number of keys
which the table was populated with. Otherwise the extra keys, which
are missing in the table, may be placed anywhere in the vector due to
ring order sorting, and break the assumption that the table contains
all keys from the array up to index n. This resulted in the test
reading slightly fewer fragments than would follow from the desired
count.

Another problem is that we should not skip the fast_forward_to() call
for the initial range (workaround for a bug in sstable mutation
reader), otherwise we will read slightly less than expected as well.
2017-05-17 14:15:14 +02:00
Tomasz Grabiec
49a0bc3847 tests: perf_fast_forward: Allow the test to be interrupted 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
5c7f5643a6 tests: perf_fast_forward: Allow testing with cache enabled 2017-05-17 14:15:14 +02:00
Tomasz Grabiec
35c9dfecc2 row_cache: Implement mutation_reader::fast_forward_to() for cache scanner
Needed to make perf_fast_forward work with cache enabled.
2017-05-17 14:15:14 +02:00
Tomasz Grabiec
84648f73ef Merge "Fix performance problems with high shard counts tag" from Avi
From http://github.com/avikivity/scylla exponential-sharder/v3.

The sharder, which takes a range of tokens and splits it among shards, is
slow with large shard count and the default
murmur3_partitioner_ignore_msb_bits.

This patchset fixes excessive iteration in sstable sharding metadata writer and
nonsingular range scans.

Without this patchset, sealing a memtable takes > 60 ms on a 48-shard
system.  With the patchset, it drops below the latency tracker threshold I
used (5 ms).
2017-05-17 14:03:33 +02:00
Avi Kivity
68034604e1 dht: murmur3_partitioner: simplify moving to and from the zero-based token range 2017-05-17 13:50:30 +03:00
Avi Kivity
1a99ebaa65 storage_proxy: switch to the exponential sharder for nonsingular queries
Nonsingular queries used exponential expansion of the token space to
avoid spending too much cpu time on near-empty tables, but the generation
of the search space was itself exponential.  Switch to the exponential sharder
which has linear cost.
2017-05-17 13:50:30 +03:00
Avi Kivity
00f48f96cb sstables: select just the shard we want when writing sharding metadata
On a system with many shards, this saves many useless iterations where
we just skip the unwanted shard.
2017-05-17 13:50:30 +03:00
Avi Kivity
44a1a51987 tests: add tests for dht::split_range_to_single_shard() 2017-05-17 13:50:30 +03:00
Avi Kivity
76f12a8842 dht: add split_range_to_single_shard()
Intersects a shard's owning range with a ring position range, and return
the sorted result.
2017-05-17 13:50:27 +03:00
Tomasz Grabiec
1da3daa4f4 range: Use more standard notation for singular range
Reuse notation for a single-element set.

Message-Id: <1494923827-10097-1-git-send-email-tgrabiec@scylladb.com>
2017-05-17 13:28:42 +03:00
Avi Kivity
a65e8bd215 dht: add a ring-position-range-vector variant of the exponential sharder
The "exponentiality" is not carried over from one range to another, because
we expect one or two ranges (two ranges result from a wrapped around thrift
token range).
2017-05-17 13:18:52 +03:00
Avi Kivity
6eb6f12909 tests: add test for ring_position_exponential_sharder 2017-05-17 13:18:52 +03:00
Avi Kivity
f671ac13b4 dht: add an exponential ring_position range sharder
Like the regular sharder, the exponential sharder divides a range into
subranges owned by individual ranges.  Unlike the regular sharder, it
generates ever-increasing subranges, spanning more and more shards, and
eventually returns several subranges per shard.  To avoid using
exponential cpu and memory, subranges belonging to a single shard are merged,
and a flag is set to indicate the subranges are not ordered wrt. each other.
2017-05-17 13:18:49 +03:00
Avi Kivity
025c6b45b2 dht: extend i_partitioner::next_token_for_shard()
Right now, next_token_for_shard() only allows iterating linearly in shard
order.  Add the ability to select a specific shard to skip to (in case we're
only interested in a single shard), and to select larger ranges (so that
exponential increases are not implemented by iteration).
2017-05-17 12:30:03 +03:00
Avi Kivity
7156ea8804 dht: make ring_position_range_sharder more independent of global_partitioner
Useful for testing.
2017-05-17 12:30:03 +03:00
Avi Kivity
302fec8293 dht: make i_partitioner::name() const 2017-05-17 12:30:03 +03:00
Avi Kivity
f462c4327e dht: make i_partitioner keep track of the number of shards it was configured with
Useful for testing classes layered on top of the partitioner (the sharders).
2017-05-17 12:30:03 +03:00
Avi Kivity
04b16ae8ec dht: fix partitioner initialization for tests
The partitioners now depend on smp::count to be initialized correctly,
but smp::count isn't available at static initialization time.

The scylla executable isn't affected because it calls set_global_partitioner()
after smp::count has been initialized.

Fix by deferring initialization to the first global_partitioner() call.
2017-05-17 12:30:03 +03:00
Avi Kivity
1c6cecd9d0 utils: introduce div_ceil()
Divides integrals but rounds up rather than down.
2017-05-17 12:30:03 +03:00
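A helper like the one introduced above can be sketched in a couple of lines. This is a minimal version assuming non-negative operands and a positive divisor (the actual utils implementation may differ in constraints and naming):

```cpp
#include <cassert>

// Integer division rounding up instead of down.
// Assumes a >= 0 and b > 0; a + b - 1 must not overflow T.
template <typename T>
constexpr T div_ceil(T a, T b) {
    return (a + b - 1) / b;
}
```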
Avi Kivity
f1dbb951da Merge "Materialized views: implement read before write" from Duarte
"This patch ensures we read the base table rows that an update
is modifying, in order to correctly calculate the set of
materialized view updates.

The read-before-write is performed on the shard applying the
update and attempts to do a precise read of the rows being modified,
which can be more than one in case of ranged deletions or a
batch update."

* 'materialized-views/read-existing/v2' of https://github.com/duarten/scylla:
  database: Read existing base mutations
  db/view: Calculate clustering ranges for MV read-before-write query
  db/view: Replace entry if cells don't match
  view_info: Store base regular col in the view's PK as column_id
  compound_view_wrapper: Add tri_compare
  bound_view: Build range bound from bound_view
  clustering_bounds_comparator: Enable Range concept
  range: Add lvalue version of transform()
  tests: Add test case for nonwrapping_range::intersection()
  nonwrapping_range: Add intersection() function
2017-05-17 12:26:26 +03:00
Duarte Nunes
ef252036ba tests/view_schema_test: Add more test cases
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 11:21:58 +02:00
Duarte Nunes
983af595e9 database: Read existing base mutations
When generating updates for a materialized view we need to read the
existing base row, to be able to determine the primary key of the view
row the new base update will supplant, in case the view includes a
base non-primary key column in its own primary key. That old view row
will be tombstoned or updated, if it exists, depending on the difference
between the new base row and the existing one, if any.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
0861a66853 tests/cql_assertions: Add assertion for row set equality
For row set equality, the order of the actual rows doesn't need to
match the order of the expected rows.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
8a77bfe35b db/view: Calculate clustering ranges for MV read-before-write query
Introduce the calculate_affected_clustering_ranges() function to
calculate the smallest subset of affected clustering ranges that we
need to query for.

The update_requires_read_before_write() function checks whether
a view is potentially affected by the base update.

The patch also cleans up the may_be_affected_by() function.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
9115862419 single_column_relation: Correctly print IN relation
So that the output of a set of relations can be fed back into the CQL
parser; useful for materialized views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
ec681060a8 db/view: Replace entry if cells don't match
If a base table regular column is part of the view's pk, and if that
column changes, we should replace the entry, by deleting the row(s)
with the old value and inserting a new one.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
3ef1a825c9 statement_restrictions: Allow filtering regular columns for views
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
0170c743d3 statement_restrictions: Relax clustering restrictions for views
In process_clustering_columns_restrictions(), don't require all
clustering columns to be restricted if we're dealing with a
materialized view's where clause.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
4f90b19cc2 statement_restrictions: Relax partition restrictions for views
In process_partition_key_restrictions(), don't require all partition
key columns to be restricted if we're dealing with a materialized
view's where clause.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
99b234d717 cql3/statements: Prevent setting default ttl on view
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
d480daffca cql3/restrictions: Complete implementation of is_satisfied_by()
This patch implements the is_satisfied_by() function for the remaining
types of restrictions, lifting the function declaration to
abstract_restrictions.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
bad0edb23b db/view: Re-implement clustering_prefix_matches()
This patch implements clustering_prefix_matches() in terms of
abstract_restriction::is_satisfied_by() instead of ranges, which
supports filtering just a subset of the clustering columns.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
b0d1ea76a2 db/view: Re-implement partition_key_matches()
This patch implements partition_key_matches() in terms of
abstract_restriction::is_satisfied_by() instead of ranges, which
supports filtering just a component of a compound partition key.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
38be85a21d db/view: Generate regular tombstone for base deletions
Instead of shadowable tombstones, which only apply to updates.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
1fd8b8e723 db/view: Consider cell liveness when generating updates
This patch ensures we take into account the liveness of the base's
regular column in the view's pk when generating view updates.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
c421da6825 db/view: Don't generate view updates for static rows
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:19 +02:00
Duarte Nunes
f41a5e554d view_info: Store base regular col in the view's PK as column_id
This patch stores the base_non_pk_column_in_view column as column_id,
which is more convenient, and it also stores a two-level optional to
encode both lazy initialization and the absence of such a column.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
257eaa0d05 compound_view_wrapper: Add tri_compare
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
06a6679826 bound_view: Build range bound from bound_view
We introduce the bound_view::to_range_bound() function, which builds a
wrapping_range or nonwrapping_range bound from a bound_view.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
8288e504fb clustering_bounds_comparator: Enable Range concept
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
fb1e966137 range: Add lvalue version of transform()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
f365b7f1f7 tests: Add test case for nonwrapping_range::intersection()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
Duarte Nunes
1f9359efba nonwrapping_range: Add intersection() function
intersection() returns an optional range with the intersection of
this range and the other, specified range.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-17 10:33:18 +02:00
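The semantics described above can be modeled over closed integer intervals. This is a hedged sketch only: the real nonwrapping_range also handles open, mixed, and infinite bounds, which this toy `interval` omits.

```cpp
#include <algorithm>
#include <cassert>
#include <optional>

// Closed integer interval [lo, hi], standing in for a nonwrapping_range.
struct interval {
    int lo, hi;
};

// Intersection of two non-wraparound intervals: the overlap if it is
// non-empty, std::nullopt otherwise.
std::optional<interval> intersection(const interval& a, const interval& b) {
    int lo = std::max(a.lo, b.lo);
    int hi = std::min(a.hi, b.hi);
    if (lo > hi) {
        return std::nullopt;  // disjoint ranges
    }
    return interval{lo, hi};
}
```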
Avi Kivity
f5dae826ce Merge "Migrate schema tables to v3 format" from Calle
"Defines origin v3-format for system/schema tables, and uses them for
schema storage/retrieval.

Includes a legacy_schema_migrator implementation/port from origin. Note
that since we don't support features like triggers, functions and
aggregates, it will bail if encountering such a feature used.

Note also that this patch set does not convert the "hints" and
"backlog" tables, even though these have changed in v3 as well.
That will be a separate patch set.

Tested against dtests. Note that patches for dtest + ccm
will follow."

* 'calle/systemtables' of github.com:cloudius-systems/seastar-dev: (36 commits)
  legacy_schema_migrator: Actually truncate legacy schema tables on finish
  database: Extract "remove" from "drop_columnfamily"
  v3 schema test fixes
  thrift: Update CQL mapping of static CFs
  schema_tables: Use v3 schema tables and formats
  type_parser: Origin expects empty string -> bytes_type
  cf_prop_defs: Add crc_check_chance as recognized (even if we don't use)
  types_test: v3 style schemas enforce explicit "frozen" in tuples/UDTs
  cql3_type: v3 to_string
  cql_types: Introduce cql3_type::empty and associate with empty data_type
  schema: rename column accessors to be in line with origin
  schema: Add "is_static_compact_table"
  schema_builder: Add helper to generate unique column names akin origin
  schema: Add utility functions for static columns
  schema: Use heterogeneous comparator for columns bounds
  cql3_type_parser: Resolve from cql3 names/expressions
  cql3_type: Add "prepare_internal" and "references_user_type"
  cql3::cql3_type: Add prepare_internal path using only "local" holders
  cql3_type: Add virtual destructor.
  database/main: encapsulate system CF dir touching
  ...
2017-05-17 11:25:52 +03:00
Asias He
0abfe39d8f database: Log compaction strategy setting on shard 0 only
The compaction strategy is per node not per shard. Do not duplicate the
same log on all shards.

Message-Id: <1494835519.git.asias@scylladb.com>
2017-05-17 11:17:41 +03:00
Avi Kivity
f09f056515 Merge seastar upstream
* seastar 4a3118c...45b718b (7):
  > tests: make connect_test use a random port
  > log: Introduce log.info0
  > configure.py: link to DPDK PMD drivers which are already built on build/dpdk and enabled by default on DPDK config
  > Update fmt submodule
  > perftune: fix perftune.py IndexError when NIC uses less IRQs than requested.
  > build: Add more required build dependencies to the Dockerfile
  > Prometheus: Reserve in protobuf object before iterating
2017-05-17 11:16:58 +03:00
Raphael S. Carvalho
a58699cc92 sstables: kill sstable::mark_for_deletion_on_disk
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170515233021.21223-1-raphaelsc@scylladb.com>
2017-05-17 11:15:59 +03:00
Raphael S. Carvalho
deabf06d49 lcs: log invariant restoration
It will be useful for understanding the strategy behavior after
invariant is possibly broken by resharding.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170515234925.22793-1-raphaelsc@scylladb.com>
2017-05-17 11:15:41 +03:00
Avi Kivity
2eef7cd395 Merge "compress the tracing session ID when compression is requested" from Vlad
"Tested with:
   - test.py --mode release
   - debug/test-serialization
   - c-s with both debug and release compiled scylla with authentication enabled:
     cassandra-stress write  n=10000 no-warmup -rate threads=10 -mode native unprepared cql3 user='cassandra' password='cassandra'"

* 'compress_tracing_session_id-v6' of github.com:cloudius-systems/seastar-dev:
  cql_server::response: rework the tracing session ID insertion
  utils::UUID: align the UUID serialization API with the similar API of other classes in the project
  utils: serialization: unify the variety of serialize_XXX(...)
  cql_server::response: rework the compress(...) method
  cql_server::response: store the frame flags inside the class
2017-05-17 09:48:49 +03:00
Pekka Enberg
374c3d66ab Merge "Fixes for CQL regressions" from Duarte
"This series fixes a set of regressions introduced by
 f7bc88734a, resulting in two failed
 tests:

   testDenseNonCompositeTable(org.apache.cassandra.cql3.validation.operations.CreateTest)

 and

   testStaticColumnsWithDistinct(org.apache.cassandra.cql3.validation.entities.StaticColumnsTest)"

* 'cql-fixes/v1' of github.com:duarten/scylla:
  update_statement: Reject empty values for dense clustering key
  modification_statement: Fix detection of clustering keys
  cql3/restrictions/statement_restrictions: Consider statement type
  cql3/statements/modification_statement: Extract statement_type
2017-05-17 09:29:24 +03:00
Vlad Zolotarov
a0737abdc5 cql_server::response: rework the tracing session ID insertion
Insert the tracing session ID into the response body in the cql_server::response constructor.

Fixes #2356

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-16 15:57:28 -04:00
Vlad Zolotarov
494ea82a88 utils::UUID: align the UUID serialization API with the similar API of other classes in the project
The standard serialization API (e.g. in data_value) includes the following methods:

size_t serialized_size() const;
void serialize(bytes::iterator& it) const;
bytes serialize() const;

Align the utils::UUID API with the pattern above.

The only addition is that we are going to make the output iterator parameter of the second method above
a template so that we may serialize into different output sinks.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-16 15:56:03 -04:00
Vlad Zolotarov
7706775a63 utils: serialization: unify the variety of serialize_XXX(...)
Use the same templated implementation for all different serialize_XXX(...).

The chosen implementation is based on the std::copy_n(char*, size, OutputIterator),
which is heavily optimized and will be using memcpy/memmove where possible.

This patch also removes the not needed specializations that accept signed integer
values since we were casting them to unsigned value anyway.

The std::ostream based specializations are also removed since they are not used
anywhere except test-serialization.cc, and adapting the ostream to the iterator
is a one-liner.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-16 15:56:03 -04:00
Vlad Zolotarov
a33fe5b775 cql_server::response: rework the compress(...) method
Cleanup the compress(...) method interface:
   - Encapsulate the technical details inside the method:
      - Re-write the _body inside the method instead of returning it.
      - Set the response::_flags inside the method.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-16 15:53:35 -04:00
Vlad Zolotarov
c00814383d cql_server::response: store the frame flags inside the class
It makes a lot more sense to keep the flags mask inside the response and update it each time
the corresponding feature is set, instead of holding separate components such as the tracing
state pointer.

This patch adds this ability to set the flags.
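The approach described above — a single flags mask that gets a bit OR-ed in whenever a feature is enabled — can be sketched roughly like this (the struct and bit values are illustrative, not Scylla's actual cql_server::response or the real CQL frame flag values):

```cpp
#include <cstdint>

// Illustrative flag bits for the sketch only.
constexpr std::uint8_t FLAG_COMPRESSED = 0x01;
constexpr std::uint8_t FLAG_TRACING    = 0x02;

struct response {
    std::uint8_t flags = 0;

    // OR the bit in when the corresponding feature is enabled, instead
    // of keeping separate per-feature state (e.g. a tracing pointer).
    void set_flag(std::uint8_t bit) { flags |= bit; }
    bool has_flag(std::uint8_t bit) const { return (flags & bit) != 0; }
};
```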

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-05-16 14:31:54 -04:00
Takuya ASADA
da55aecca3 dist: add conflict with Cassandra
Cassandra and Scylla cannot be installed on a single instance, so add
cassandra to 'Conflicts'.

Fixes #2157

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1494856314-9322-1-git-send-email-syuu@scylladb.com>
2017-05-16 19:18:27 +03:00
Gleb Natapov
c7ad3b9959 database: remove temporary sstables sequentially
The code that removes each sstable runs in a thread. Removing many
sstables in parallel may start many threads, each of which takes 128k
for its stack. There is not much benefit in running deletion in
parallel anyway, so fix it by deleting sstables sequentially.

Fixes #2384

Message-Id: <20170516103018.GQ3874@scylladb.com>
2017-05-16 15:06:10 +03:00
Tomasz Grabiec
bdf3c536aa tests: mutation_source_test: Test forwarding in single-key readers 2017-05-16 13:36:10 +02:00
Tomasz Grabiec
e07cc44af2 sstables: Remove unused code 2017-05-16 13:31:01 +02:00
Tomasz Grabiec
0e23f8aa9b sstables: mutation_reader: Fix setup_for_partition() being called twice in some cases
When a mutation read enters the partition using the index, the
streamed_mutation object is returned to the user before the row start
fragment is processed. In that case, when we process the row start, we
should ignore it and not call setup_for_partition() again, as that may
override the user's fast_forward_to() request.
2017-05-16 13:31:01 +02:00
Tomasz Grabiec
a1dea3c4fc sstables: Fix verify_end_state() to tolerate ATOM_START_2 state
We would be in that state if consume_row_start() returns proceed::yes
and the stream ends after that. This can happen if slicing using the
promoted index determined that there are no cells in the partition in
the range.
2017-05-16 13:31:01 +02:00
Raphael S. Carvalho
706ce5a27b sstables: do not swallow system error exception in read_simple
If the error code is different from ENOENT, the exception is swallowed.
That can lead to a variety of problems down the road.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170515225309.19185-1-raphaelsc@scylladb.com>
2017-05-16 08:47:34 +02:00
Alexys Jacob
9ddc05899d Fix scylla-housekeeping version detection to work with newer setuptools
Newer setuptools parse_version() doesn't like dashed version strings,
so we should trim them to avoid false-negative version_compare() checks.

Signed-off-by: Alexys Jacob <ultrabug@gentoo.org>
Message-Id: <20170511162646.22129-1-ultrabug@gentoo.org>
2017-05-15 12:41:49 +03:00
Gleb Natapov
385645e8df storage_proxy: Fix mutation logging
Log mutation type only if mutation set is not empty.

Message-Id: <20170510142406.GA30426@scylladb.com>
2017-05-11 15:49:52 +01:00
Tomasz Grabiec
7b6be7e188 row_cache: Add missing propagation of the forwarding flag in handle_large_partition()
Message-Id: <1494503145-25622-1-git-send-email-tgrabiec@scylladb.com>
2017-05-11 15:47:19 +01:00
Vlad Zolotarov
a855e82eff service::client_state: don't allow dropping the system_auth and system_traces objects
Prevent the accidental dropping of system_auth and system_traces objects (keyspaces and tables)
but allow their modification (including tables).

We need to be able to modify keyspaces in order to set/modify the replication strategy and its parameters.
We need to be able to ALTER the tables in order to allow rolling upgrades when some of the tables have changed.

Fixes #2346
Fixes #2338

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1494363335-20424-1-git-send-email-vladz@scylladb.com>
2017-05-11 13:03:30 +01:00
Tomasz Grabiec
0351ab8bc6 row_cache: Fix undefined behavior in read_wide()
_underlying is created with _range, which is captured by
reference. But range_and_underlying_reader is moved after being
constructed by do_with(), so the _range reference is invalidated.

Fixes #2377.
Message-Id: <1494492025-18091-1-git-send-email-tgrabiec@scylladb.com>
2017-05-11 09:43:43 +01:00
Duarte Nunes
a69039df03 tests/batchlog_manager_test: Fix failure
Since a9f6e5f8da, metrics can't be
duplicated. This patch works around that by not creating a new
batchlog_manager (one is already created by the cql_test_env).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170510191047.6154-1-duarte@scylladb.com>
2017-05-11 08:28:08 +02:00
Duarte Nunes
ec35cc33f1 update_statement: Reject empty values for dense clustering key
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 19:54:42 +02:00
Duarte Nunes
03f765c468 modification_statement: Fix detection of clustering keys
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 19:54:42 +02:00
Duarte Nunes
d7701087af cql3/restrictions/statement_restrictions: Consider statement type
Now that update_statement uses statement_restrictions, we need our
validation logic to take the statement type into account, in
particular to deal with insertion statements which only set static
columns but specify clustering values.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 19:54:42 +02:00
Duarte Nunes
c2041753c9 cql3/statements/modification_statement: Extract statement_type
This patch extracts the statement_type into its own file. The type
will be later passed to statement_restrictions for validation
purposes.

Further along, we could add methods to it that currently live in other
statements so we can move more validation into statement_restrictions.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 19:54:42 +02:00
Calle Wilund
c8f92536c1 legacy_schema_migrator: Actually truncate legacy schema tables on finish 2017-05-10 16:44:48 +00:00
Calle Wilund
3514123677 database: Extract "remove" from "drop_columnfamily" 2017-05-10 16:44:48 +00:00
Calle Wilund
66991a7ccb v3 schema test fixes 2017-05-10 16:44:48 +00:00
Duarte Nunes
6260f31e08 thrift: Update CQL mapping of static CFs
This patch updates the mapping of static CFs so that their CQL
representation is a non-compound, non-dense schema with static
columns, instead of regular ones. This matches the representation of
static CFs in Cassandra 3.x.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 16:44:48 +00:00
Calle Wilund
6c8b5fc09d schema_tables: Use v3 schema tables and formats
Switches system/schema_* to system_schema/*, and updates schema/schema
builder and their users to hold/expect v3-style info (i.e. types & dropped).
2017-05-10 16:44:48 +00:00
Calle Wilund
f9b83e299e type_parser: Origin expects empty string -> bytes_type 2017-05-10 16:44:48 +00:00
Calle Wilund
97c54d254b cf_prop_defs: Add crc_check_chance as recognized (even if we don't use) 2017-05-10 16:44:48 +00:00
Calle Wilund
3d90152dc5 types_test: v3 style schemas enforce explicit "frozen" in tuples/user types 2017-05-10 16:44:48 +00:00
Calle Wilund
7969a156d5 cql3_type: v3 to_string 2017-05-10 16:44:48 +00:00
Calle Wilund
c572a8c83c cql_types: Introduce cql3_type::empty and associate with empty data_type 2017-05-10 16:44:48 +00:00
Calle Wilund
0e6ae8dec2 schema: rename column accessors to be in line with origin
More pointedly: expose columns as-is (currently
all_columns_in_select_order), and expose the name->column mapping
under a more appropriate name.

Renaming like this is not strictly necessary, but there is a point to
trying to keep nomenclature similar-ish with origin, esp. when the
select-order columns need to become filtered (spoiler alert).
2017-05-10 16:44:48 +00:00
Calle Wilund
d2dc7898aa schema: Add "is_static_compact_table" 2017-05-10 16:44:48 +00:00
Calle Wilund
1c328a4166 schema_builder: Add helper to generate unique column names akin origin 2017-05-10 16:44:48 +00:00
Duarte Nunes
5387ac98f8 schema: Add utility functions for static columns
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 16:44:48 +00:00
Duarte Nunes
0439d83d1e schema: Use heterogeneous comparator for columns bounds
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-10 16:44:47 +00:00
Calle Wilund
b1c5447ab5 cql3_type_parser: Resolve from cql3 names/expressions
Cassandra 3 uses cql names for column/field types, thus
we need to parse these out-of-line and resolve them in a
manner more akin to the cql parser.

Also wrap building user types similarly to origin, using
a "builder" wrapper, and usage graph resolving.
2017-05-10 16:44:47 +00:00
Calle Wilund
fcfea4c121 cql3_type: Add "prepare_internal" and "references_user_type"
Allows localized use of cql type + parsing + resolving
2017-05-10 16:44:47 +00:00
Calle Wilund
8b3e7bbe05 cql3::cql3_type: Add prepare_internal path using only "local" holders 2017-05-10 16:44:47 +00:00
Calle Wilund
2f791f5c3d cql3_type: Add virtual destructor.
It should be there.
2017-05-10 16:44:47 +00:00
Calle Wilund
48ddcbb77b database/main: encapsulate system CF dir touching 2017-05-10 16:44:47 +00:00
Calle Wilund
9eb91bc30b main: Add legacy schema migration to startup 2017-05-10 16:44:47 +00:00
Calle Wilund
3964055d98 legacy_schema_migrator: Add schema table converter
Initial. Does not actually write anything.
2017-05-10 16:44:47 +00:00
Paweł Dziepak
ba6b74e305 storage_service: counters are no longer experimental
Message-Id: <20170510124552.23558-1-pdziepak@scylladb.com>
2017-05-10 17:18:32 +03:00
Gleb Natapov
ab92406585 storage_proxy: optimize reconcile logic for CL=ONE
A regular single-key query will never reconcile with CL=ONE since there
will be no digest mismatch, but range queries do not have a digest stage,
so they always go through the reconcile code. For CL=ONE there will be only
one result though, so there is no need to run the complicated reconciliation
logic, and the only result can be returned directly.
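The shape of that optimization is just an early return when there is a single replica result. A simplified stand-in for the storage_proxy logic (hypothetical code, with ints standing in for partitions):

```cpp
#include <algorithm>
#include <vector>

// Simplified stand-in for reconciliation: with CL=ONE there is exactly
// one replica result, so return it directly; otherwise merge and dedup.
std::vector<int> reconcile(std::vector<std::vector<int>> results) {
    if (results.size() == 1) {
        return std::move(results.front());   // CL=ONE fast path
    }
    std::vector<int> merged;
    for (auto& r : results) {
        merged.insert(merged.end(), r.begin(), r.end());
    }
    std::sort(merged.begin(), merged.end());
    merged.erase(std::unique(merged.begin(), merged.end()), merged.end());
    return merged;
}
```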

Message-Id: <20170509100334.GQ28272@scylladb.com>
2017-05-10 17:09:34 +03:00
Pekka Enberg
b63d33526d cql3: Fix variable_specifications class get_partition_key_bind_indexes()
The "_specs" array contains column specifications that have the bind
marker name if there is one. That results in
get_partition_key_bind_indices() not being able to look up a column
definition for such columns. Fix the issue by keeping track of the
actual column specifications passed to add() like Cassandra does.

Fixes #2369
Message-Id: <1494397358-24795-1-git-send-email-penberg@scylladb.com>
2017-05-10 12:38:18 +03:00
Calle Wilund
8066efb710 system_keyspace: Add getter/setter for built index status
Even though we have none.
2017-05-09 13:48:55 +00:00
Calle Wilund
061ef16562 system_tables/schema_tables: Remove special format case of "execute_cql"
Having a variadic parameter used in an implicit sprint is not
very readable, and it becomes less intuitive when the system keyspace
suddenly becomes more than one -> multiple sprints in the chain -> more
confusion or more execution paths.
It's not that horrible with some spread-out sprints.
2017-05-09 13:48:55 +00:00
Calle Wilund
f5fcadf0b1 schema: Add "as_cql_string" for column_def + quote-wrapper 2017-05-09 13:48:55 +00:00
Calle Wilund
cb7ee98217 json: Add convenience ability to generate unordered_maps 2017-05-09 13:48:55 +00:00
Calle Wilund
e960724724 caching_options: Add from/to map methods 2017-05-09 13:48:55 +00:00
Calle Wilund
2e1c23f2f2 database: Relax rp ordering check to allow non-commitlog mutations
Allow replay to come after certain operations, such as schema migration.
2017-05-09 13:48:55 +00:00
Calle Wilund
27fdc5cfef schema_tables/system_tables: Add v3 tables to "ALL" and handle in init
I.e. deal with more than one keyspace in system_keyspace::make
2017-05-09 13:48:55 +00:00
Calle Wilund
afcf0372df cql3::untyped_result_set: Add more getter methods 2017-05-09 13:48:55 +00:00
Calle Wilund
539b65fc90 client_state: Make "has_access" auth check schema ks name independent 2017-05-09 13:48:55 +00:00
Calle Wilund
815aa8ba9f schema_tables: Add schema definitions for v3 tables 2017-05-09 13:48:55 +00:00
Calle Wilund
4378dca6e1 schema_tables: Hide/abstract schema keyspace name 2017-05-09 13:48:55 +00:00
Calle Wilund
2fb36e3bf8 system_keyspace: Add query overloads with named keyspace 2017-05-09 13:48:55 +00:00
Calle Wilund
32909d4c84 system_keyspace: Add v3+legacy schema definitions 2017-05-09 13:48:55 +00:00
Calle Wilund
b522b2bf22 Merge branch 'master' of https://github.com/scylladb/scylla 2017-05-09 13:48:47 +00:00
Avi Kivity
8af2b7c418 transport: honor the skip_metadata flag
Reduces processing overhead and network traffic.

We can't use the NO_METADATA flag in the metadata object, because this
is a request attribute; different executions of the same prepared statement
can have different settings for skip_metadata.
Message-Id: <20170419175145.19766-1-avi@scylladb.com>
2017-05-09 14:52:03 +03:00
Tomasz Grabiec
e56711a54d sstables: mutation_reader: Avoid reading index when restrictions cover whole partition
The check for is_static_row() used to be enough, but it no longer is
after optimization made in commit 3e06065, which avoids reading the
static row.
Message-Id: <1494241164-25810-1-git-send-email-tgrabiec@scylladb.com>
2017-05-09 11:03:18 +01:00
Pekka Enberg
5b931268d4 cql3: Move variable_specifications implementation to source file
Move the class implementation to source file to reduce the need to
recompile everything when the implementation changes...

Message-Id: <1494312003-8428-1-git-send-email-penberg@scylladb.com>
2017-05-09 12:44:18 +03:00
Calle Wilund
c30e515c70 Merge branch 'master' of https://github.com/scylladb/scylla 2017-05-09 09:29:27 +00:00
Duarte Nunes
65d96421da tests/sstable_datafile_test: Fix regression
This patch fixes a regression introduced in 9e88b60, where the wrong
clustering key was being specified.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170509091621.2682-1-duarte@scylladb.com>
2017-05-09 12:18:47 +03:00
Gleb Natapov
2d5a7c8058 storage_proxy: make read repair stats accessible through Prometheus
Currently they can be read only through JMX.

Message-Id: <20170509075546.GN28272@scylladb.com>
2017-05-09 11:23:38 +03:00
Calle Wilund
780a7c8641 Merge branch 'master' of https://github.com/scylladb/scylla 2017-05-08 15:39:46 +00:00
Avi Kivity
8c5c5d3004 Merge "CQL front-end for secondary indices" from Pekka
"This patch series adds CQL front-end support for secondary indices. You
can now execute CREATE INDEX and DROP INDEX statements, which will
update the newly added "Indexes" system table. However, the indexes are
not actually backed up by anything nor are they available for CQL
queries. The feature is hidden behind a new cluster feature flag and
enabled only with the "--experimental" flag."

* 'penberg/cql-2i/v2' of github.com:cloudius-systems/seastar-dev: (34 commits)
  schema: Kill index_type enum
  schema: Kill index_info class
  cql3/statements/create_index_statement: Use database::existing_index_names() in validation
  cql3/statements: Use secondary index manager in alter_table_statement class
  index: Add secondary_index_manager
  thrift/handler: Use index_metadata
  db/schema_tables: Index persistence
  schema: Add all_indices() to schema class
  schema: Remove add_default_index_names() from schema_builder class
  db/schema_tables: Add system table for indices
  cql3/Cql.g: DROP INDEX
  cql3/statements: Add drop_index_statement class
  database: Add find_indexed_table() to database class
  cql3: Return change event from announce_migration()
  cql3/statements: Multiple index targets for CREATE INDEX
  cql3/statements: Use index_metadata in create_index_statement class
  cql3/statements: Use feature flag in create_index_statement class
  service/storage_service: Add feature flag for secondary indices
  database: Add get_available_index_name() to database class
  schema: Add get_default_index_name() to index_metadata class
  ...
2017-05-08 17:04:40 +03:00
Calle Wilund
2049303399 query_pagers: bugfix: must count pk only/pk + static rows as 1
Previously, only clustered/regular rows were counted.
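The counting rule the fix enforces — a partition with only a key or static row still contributes one row to the page — can be sketched as (a hypothetical helper, not the actual query_pagers code):

```cpp
// A partition that has no clustering rows but does have a static row
// (or just a partition key) must still count as one row for paging.
int rows_for_paging(int clustering_rows, bool has_static_or_key_only_row) {
    if (clustering_rows == 0 && has_static_or_key_only_row) {
        return 1;
    }
    return clustering_rows;
}
```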

Message-Id: <1494249013-4069-1-git-send-email-calle@scylladb.com>
2017-05-08 16:35:27 +03:00
Pekka Enberg
dfee4d2bb0 cql3: Fix partition key bind indices for prepared statements
Fix the CQL front-end to populate the partition key bind index array in
result message prepared metadata, which is needed for CQL binary
protocol v4 to function correctly.

Fixes #2355.

Message-Id: <1494247871-3148-1-git-send-email-penberg@scylladb.com>
2017-05-08 16:33:17 +03:00
Calle Wilund
a03d54d9f8 Merge branch 'master' of https://github.com/scylladb/scylla 2017-05-08 11:28:26 +00:00
Pekka Enberg
35bb6dedd8 schema: Kill index_type enum 2017-05-08 10:19:34 +03:00
Pekka Enberg
06564afedb schema: Kill index_info class
It's no longer used. Indices are managed by the index_metadata class.
2017-05-08 10:19:34 +03:00
Pekka Enberg
3f27d12e99 cql3/statements/create_index_statement: Use database::existing_index_names() in validation 2017-05-08 10:19:34 +03:00
Pekka Enberg
b87679821c cql3/statements: Use secondary index manager in alter_table_statement class 2017-05-08 10:03:28 +03:00
Pekka Enberg
4b4e4e6878 index: Add secondary_index_manager 2017-05-08 10:03:28 +03:00
Pekka Enberg
94bc031ca7 thrift/handler: Use index_metadata 2017-05-08 10:03:28 +03:00
Pekka Enberg
11474ed4c6 db/schema_tables: Index persistence 2017-05-08 10:03:28 +03:00
Avi Kivity
9e67bd5aac Merge " Add partial range deletion support" from Duarte
"This series introduces partial support for range deletions. This allows
deletion operations such as

delete from cf where p=1 and c > 0 and c <= 3.

This series only adds support for single-column range restrictions.

We enforce that both range bounds be specified, because we can't represent
infinite bounds in the current sstable format. Such bounds are represented
as a prefix with no components, with the bound_kind informing whether they
are a bottom or top bound.

We're currently unable to serialize an infinite bound in such a way that it
would be correctly interpreted by Cassandra 2.2.x. A serialized bound is a
composite with a (<length><value><EOC>)+ format. While we could technically
represent the bottom bound, the top bound, if written as a single component
with 0 bytes in size and some EOC, would always sort before other values.
The same would happen if represented as an empty (no components) composite,
because in Cassandra 2.2.x those always have EOC = NONE.

This limitation should stay in place until we can properly represent range
tombstones in the storage format."

* 'range-deletions/v2' of https://github.com/duarten/scylla:
  mutation: Set cell using clustering_key_prefix
  mutation_partition: Harmonize apply_delete overloads
  prefix_compound_view_wrapper: Add is_full and is_empty functions
  tests/cql_query_test: Add range deletion tests
  cql3: Partially support ranged deletions
  single_column_primary_key_restrictions: Implement has_bound()
  modification_statement: Use statement_restrictions for where clause
  statement_restrictions: Expose primary key restrictions
  to_string: Add missing include
2017-05-07 19:27:09 +03:00
Takuya ASADA
b574100075 dist/common/scripts/scylla_selinux_setup: keep symlink on /etc/sysconfig/selinux
The current script has a bug that overwrites the symlink at
/etc/sysconfig/selinux with a real file; because of this, the script
is not able to disable SELinux.

So keep symlink after modifying the file.

Fixes #2279

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493663263-10573-1-git-send-email-syuu@scylladb.com>
2017-05-07 17:30:05 +03:00
Tomer Sandler
9a1aa6c1d3 node_health_check: Major rework
This is a folded version of the following rework on the node health
check script:

  - Added support for non-default cql + nodetool ports

  - Script will not exit if either Scylla-server / Scylla-jmx / Both
    services are not up and running. It will alert the user about it and
    which output cannot be collected, but continue collecting everything
    else.

  - Removed lshw installation and non-needed use in commands

  - Script supports RHEL/CentOS/Ubuntu14/Ubuntu16/Debian (tested on all
    beside Debian, should behave the same as Ubuntu14/16)

  - All Indentation issues fixed -> using only tab (no spaces)
    consistently.

  - >> vs. >  was fixed as well in the needed places.

  - Changes the ${VAR_NAME} instances to $VAR_NAME, and kept the {} only
    where needed.

  - Check Scylla service as Vlad recommended using 'ps -C'

  - Fixed the CQL not listening error message.

  - Added Sanity check if script is attempted to run on non-Fedora and
    non-Debian OS -> alert the user and exit.

  - Removed the MANUAL CHECK LIST section (moved to Google Forms)

  - Added date in head of the report.

  - Removed text from Report's "PURPOSE" section, which was referring to
    the "MANUAL CHECK LIST" (not needed anymore).

[ penberg: Fold into a single commit and add proper license. ]
Acked-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1493900076-29170-1-git-send-email-penberg@scylladb.com>
2017-05-06 08:38:12 +03:00
Paweł Dziepak
798cfbc68f Merge "Fixes for gcc 7" from Avi
"gcc 7 doesn't like some of our code, so adjust to make it happy."

* 'gcc7' of http://github.com/avikivity/scylla:
  Remove exception specifications
  commitlog: handle noexcept conflict between unlink and function object
  thrift: change generated code namespace
2017-05-05 15:42:56 +01:00
Avi Kivity
a592573491 Remove exception specifications
C++17 removed exception specifications from the language, and gcc 7 warns
about them even in C++14 mode.  Remove them from the code base.
2017-05-05 17:02:31 +03:00
Avi Kivity
5278e1a14d commitlog: handle noexcept conflict between unlink and function object
::unlink is declared as noexcept, but the function object it is passed into
is not.  gcc 7 warns, so wrap ::unlink in a lambda to make it happy.
2017-05-05 17:02:30 +03:00
Avi Kivity
d542cdddf6 thrift: change generated code namespace
org::apache::cassandra (the generated namespace name) gets confused with
apache::cassandra (the thrift runtime library namespace), either due to
changes in gcc 7 or in thrift 0.10.  Either way, the problem is fixed
by changing the generated namespace to plain cassandra.
2017-05-05 05:26:20 +03:00
Paweł Dziepak
c9470b5c94 Merge "Fix abort in advance_and_check_if_present()" from Tomasz
"Fixes abort which happens when making a single-key query for a key which
is after all keys present in the sstable."

* 'tgrabiec/fix-abort-in-index-reader' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Add test cases for single-key out of range reads
  sstables: index_reader: Remove redundant function
  sstables: index_reader: Fix abort in advance_and_check_if_present()
2017-05-04 17:19:47 +01:00
Duarte Nunes
9e88b60ef5 mutation: Set cell using clustering_key_prefix
Change the clustering key argument in mutation::set_cell from
exploded_clustering_prefix to clustering_key_prefix, which allows for
some overall code simplification and fewer copies. This mostly affects
the cql3 layer.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:50 +02:00
Duarte Nunes
db63ffdbb4 mutation_partition: Harmonize apply_delete overloads
This patch ensures the different mutation_partition::apply_delete()
overloads behave similarly, so that, for example, an empty clustering
key is treated the same way as an empty
exploded_clustering_key_prefix.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:50 +02:00
Duarte Nunes
07e648251b prefix_compound_view_wrapper: Add is_full and is_empty functions
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:50 +02:00
Duarte Nunes
ef138bdd2c tests/cql_query_test: Add range deletion tests
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:50 +02:00
Duarte Nunes
42873189d4 cql3: Partially support ranged deletions
This patch introduces partial support for range deletions. This allows
deletion operations such as

delete from cf where p=1 and c > 0 and c <= 3.

We enforce that both range bounds be specified, because we can't represent
infinite bounds in the current sstable format. Such bounds are represented
as a prefix with no components, with the bound_kind informing whether they
are a bottom or top bound.

We're currently unable to serialize an infinite bound in such a way that it
would be correctly interpreted by Cassandra 2.2.x. A serialized bound is a
composite with a (<length><value><EOC>)+ format. While we could technically
represent the bottom bound, the top bound, if written as a single component
with 0 bytes in size and some EOC, would always sort before other values.
The same would happen if represented as an empty (no components) composite,
because in Cassandra 2.2.x those always have EOC = NONE.

This limitation should stay in place until we can properly represent range
tombstones in the storage format.
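The restriction that both range bounds be specified amounts to a check like the following (a hypothetical sketch, not the actual cql3 validation code; ints stand in for clustering values):

```cpp
#include <optional>

// A range restriction on a clustering column; nullopt models an
// infinite (unspecified) bound.
struct clustering_range {
    std::optional<int> start;
    std::optional<int> end;
};

// Reject open-ended range deletions, since the current sstable format
// cannot represent an infinite range-tombstone bound.
bool deletable(const clustering_range& r) {
    return r.start.has_value() && r.end.has_value();
}
```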

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:50 +02:00
Duarte Nunes
169cc41251 single_column_primary_key_restrictions: Implement has_bound()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:49 +02:00
Duarte Nunes
f7bc88734a modification_statement: Use statement_restrictions for where clause
This patch replaces the custom where clause processing by adding and
using a statement_restrictions field to modification_statement.

This improves code reuse and also moves some checks to prepare-time.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:49 +02:00
Duarte Nunes
aff23f93b4 statement_restrictions: Expose primary key restrictions
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:49 +02:00
Duarte Nunes
8b7d7c4e6d to_string: Add missing include
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-05-04 15:59:49 +02:00
Tomasz Grabiec
e71771d019 tests: mutation_source_test: Add test cases for single-key out of range reads 2017-05-04 14:59:08 +02:00
Tomasz Grabiec
297b4b0cf5 sstables: index_reader: Remove redundant function 2017-05-04 14:59:08 +02:00
Tomasz Grabiec
ec45f1e51d sstables: index_reader: Fix abort in advance_and_check_if_present()
Happens when the key is missing and is after all keys in the sstable.

Fixes #2345.
2017-05-04 14:59:08 +02:00
Pekka Enberg
25e2777344 schema: Add all_indices() to schema class 2017-05-04 14:59:12 +03:00
Pekka Enberg
830591b092 schema: Remove add_default_index_names() from schema_builder class
The add_default_index_names() is part of the old and incomplete
secondary index implementation in Scylla. Drop it as it's no longer
used.
2017-05-04 14:59:12 +03:00
Pekka Enberg
8b943c0ceb db/schema_tables: Add system table for indices 2017-05-04 14:59:12 +03:00
Pekka Enberg
af9015d0d0 cql3/Cql.g: DROP INDEX 2017-05-04 14:59:12 +03:00
Pekka Enberg
a4ee9f9fa1 cql3/statements: Add drop_index_statement class 2017-05-04 14:59:12 +03:00
Pekka Enberg
f26b8d7afb database: Add find_indexed_table() to database class 2017-05-04 14:59:12 +03:00
Pekka Enberg
14391a8ec8 cql3: Return change event from announce_migration()
This changes announce_migration() to return a change event directly in
the schema_altering_statement base class. It's needed for the drop index
statement, which does not know the keyspace or column family until it
looks them up based on the index. The two-stage approach of announcing a
migration and then creating the change event won't work because, in the
latter stage, the lookup will fail. The same change to
announce_migration() has been applied in Apache Cassandra.
2017-05-04 14:59:12 +03:00
Pekka Enberg
82394debe6 cql3/statements: Multiple index targets for CREATE INDEX 2017-05-04 14:59:12 +03:00
Pekka Enberg
fe315bd31a cql3/statements: Use index_metadata in create_index_statement class 2017-05-04 14:59:12 +03:00
Pekka Enberg
651af0f45a cql3/statements: Use feature flag in create_index_statement class 2017-05-04 14:59:12 +03:00
Pekka Enberg
815c91a1b8 service/storage_service: Add feature flag for secondary indices 2017-05-04 14:59:11 +03:00
Pekka Enberg
930fa79aff database: Add get_available_index_name() to database class 2017-05-04 14:59:11 +03:00
Pekka Enberg
ef29520c8e schema: Add get_default_index_name() to index_metadata class 2017-05-04 14:59:11 +03:00
Pekka Enberg
c6e7d4484a database: Make existing_index_names() per-keyspace operation 2017-05-04 14:59:11 +03:00
Pekka Enberg
8c729f0f5f database: Rewrite existing_index_names() to use new index metadata 2017-05-04 14:59:11 +03:00
Pekka Enberg
4391faaf45 cql3/statements: Add constants to index_target 2017-05-04 14:59:11 +03:00
Pekka Enberg
546d1e47dd cql3/statements: Add as_cql_string() to index_target class 2017-05-04 14:59:11 +03:00
Pekka Enberg
56cca3b0d6 cql3: Add to_cql_string() to column_identifier class 2017-05-04 14:59:11 +03:00
Pekka Enberg
58b90655d2 cql3/statements: Add to_string(target_type) 2017-05-04 14:59:11 +03:00
Pekka Enberg
1f5a52d03f cql3/statements: Use namespaces in index_target.cc file 2017-05-04 14:59:11 +03:00
Pekka Enberg
5e8f2f49c3 schema: Add indices() to schema class 2017-05-04 14:59:11 +03:00
Pekka Enberg
5abd4b8041 schema: Add has_index() to schema class 2017-05-04 14:59:11 +03:00
Pekka Enberg
1fb1828aa2 schema: Add index_names() to schema class 2017-05-04 14:59:11 +03:00
Pekka Enberg
1c1a767408 schema: Add find_index_noname() to schema class
This adds a find_index_noname() helper to the schema class, which
searches for an index that is otherwise equal but ignores the index
name in the comparison. This is needed for CREATE INDEX to reject
duplicate index creation.
2017-05-04 14:59:11 +03:00
Pekka Enberg
62fba73a05 schema_builder: Add index_metadata support 2017-05-04 14:59:11 +03:00
Pekka Enberg
05e12a1d2b schema: Add index_metadata maps to raw_schema class 2017-05-04 13:22:12 +03:00
Raphael S. Carvalho
ddc1d80c28 compaction: remove dead function declaration
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170504013046.23522-2-raphaelsc@scylladb.com>
2017-05-04 11:48:51 +03:00
Raphael S. Carvalho
61229ab88c compaction: fix type for cleanup
After the compaction revamp, the compaction type set by cleanup in its
ctor is overwritten in compaction::setup(). Consequently, cleanup
would not be stopped by 'nodetool stop cleanup' and would be listed as
a regular compaction in 'nodetool compactionstats'.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170504013046.23522-1-raphaelsc@scylladb.com>
2017-05-04 11:48:50 +03:00
Avi Kivity
211a337883 Merge seastar upstream
* seastar 194d80f...4a3118c (4):
  > execution_stage: fix wrong exception thrown for non-unique stages
  > metrics: add missing move assignment operators for metric_group, metric_groups
  > Remove unused lambda captures
  > core: lw_shared_ptr::get() should return nullptr for null pointer
2017-05-04 11:47:05 +03:00
Asias He
66e3b73b9c repair: Fix partition estimation
We estimate the number of partitions for a given range of a column family
and split the range into sub-ranges containing fewer partitions, each
used as a checksum unit.

The estimation is wrong, because we need to count the partitions on all
the shards, instead of only counting the local shard.
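Conceptually, the fix replaces a local-shard read of the estimate with a sum across all shards. A minimal sketch (hypothetical, not the actual repair code):

```cpp
#include <numeric>
#include <vector>

// The range's partition estimate must aggregate every shard's estimate,
// not just the local shard's.
long estimated_partitions(const std::vector<long>& per_shard) {
    return std::accumulate(per_shard.begin(), per_shard.end(), 0L);
}
```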

Fixes #2299

Message-Id: <7876285bd26cfaf65563d6e03ec541626814118a.1493817339.git.asias@scylladb.com>
2017-05-03 16:25:45 +03:00
Pekka Enberg
1e04731fa0 Merge "gossip mark alive fixes" from Asias
"This series fixes the use-after-free issue in gossip and eliminates the
duplicated / unnecessary mark-alive operations.

Fixes #2341"

* tag 'asias/gossip_fix_mark_alive/v1' of github.com:cloudius-systems/seastar-dev:
  gossip: Ignore callbacks and mark alive operation in shadow round
  gossip: Ignore the duplicated mark alive operation
  gossip: Fix use after free in mark_alive
2017-05-03 12:19:16 +03:00
Jacob Johansen
9616956c16 dist/docker: Add support for experimental flag
Fixes #2188

Message-Id: <20170502180047.24071-1-jacob.johansen@virginpulse.com>
2017-05-03 10:29:55 +03:00
Asias He
3bd9840c01 gossip: Ignore callbacks and mark alive operation in shadow round
In shadow round, we are only interested in the peer's endpoint_state, e.g., gossip
features, host_id, tokens. There is no need to call the on_restart or on_join
callbacks or to go through the mark alive procedure with the EchoMessage gossip
message. We will do that during normal gossip runs anyway.
2017-05-03 07:24:21 +08:00
Asias He
1441ae5cac gossip: Ignore the duplicated mark alive operation
If a node is already being marked as alive with EchoMessage, ignore further
duplicated mark alive operations.
2017-05-03 07:24:21 +08:00
Asias He
d682fbfa28 gossip: Fix use after free in mark_alive
After sending the echo message, the node might no longer be in the
endpoint_state_map, so using the reference to local_state
might cause a use after free.

Fixes #2341
2017-05-03 07:24:20 +08:00
Raphael S. Carvalho
8b0e358d73 tests/sstable_test: fix release-mode compaction_manager_test
In release mode, the compaction task is active right after the request is
submitted, because a ready future may be scheduled immediately.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170502171925.9893-1-raphaelsc@scylladb.com>
2017-05-02 20:48:30 +03:00
Calle Wilund
a37d03cd1d transport::server: ignore socket shutdown future results
These will, as of late, always be ready. Remove the future usage
in preparation for changing the API signature to void(*)() (i.e. to prevent
breakage on a seastar update).
2017-05-02 15:08:47 +00:00
Avi Kivity
7e29dd7066 managed_bytes: improve alignment hygiene
While blob_storage is marked as an unaligned type, the back references also
point to an unaligned type (a pointer to blob_storage), since a back
reference can live in a blob_storage.  This triggers errors from zapcc/clang 4.

Fix by creating a type for the reference, which is marked as unaligned.
Message-Id: <20170502071404.507-1-avi@scylladb.com>
2017-05-02 10:04:13 +01:00
Pekka Enberg
2f83232a02 schema: Add index_metadata class 2017-05-02 10:29:18 +03:00
Avi Kivity
b46f6a4124 build: ignore unused lambda capture warnings from clang
Worthwhile to revisit later.
2017-05-02 10:09:58 +03:00
Raphael S. Carvalho
8dfb5f9c33 tests/sstable_test: fix compaction_manager_test
After 'compaction: make major compaction go through compaction manager',
the test fails because the task is preempted in debug mode before it reaches
the instruction that increments the stat.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170501183255.6191-1-raphaelsc@scylladb.com>
2017-05-02 09:06:41 +03:00
Avi Kivity
1d12d69881 logalloc: define segment_zone::maximum_size
Yields build errors with some compilers if missing.
2017-05-01 16:31:29 +03:00
Amnon Heiman
b59c95359d scylla_setup: Fix conditional when checking for newer version
While changing the way housekeeping checks for a newer version
and warns about it during installation, the UUID part was removed but kept
in the surrounding if.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170426075724.7132-1-amnon@scylladb.com>
2017-05-01 12:13:35 +03:00
Raphael S. Carvalho
3071b9052a compaction: make cleanup_compaction inherit from regular_compaction
Some fields that belong to regular and cleanup compaction aren't needed for
resharding_compaction, such as the incremental selector (which is used
for determining the max purgeable timestamp for a given decorated key).
Better to move those fields to regular and make cleanup inherit from
regular compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170428195611.9196-1-raphaelsc@scylladb.com>
2017-04-30 19:37:09 +03:00
Raphael S. Carvalho
687a4bb0c2 dtcs: do not compact fully expired sstable whose ancestor is not deleted yet
Currently, a fully expired sstable[1] is unconditionally chosen for compaction
by DTCS, but that may lead to a compaction loop under certain conditions.

Let's consider that an almost expired sstable is compacted, and it's not
deleted yet, and that the new sstable becomes expired before its ancestor is
deleted.
Because this new sstable is expired, it will be chosen by DTCS, but it will
not be purged because 'compacted undeleted' sstables are taken into account
by the calculation of the max purgeable timestamp, which prevents expired data
from being purged. The problem is that this sequence of events can keep happening
forever, as reported by issue #2260.
NOTE: This problem was easier to reproduce before the improvement to compaction
of expired cells, because a fully expired sstable was being converted into an
sstable full of tombstones, which is also considered fully expired.

Fixes #2260.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170428233554.13744-1-raphaelsc@scylladb.com>
2017-04-30 19:35:46 +03:00
Paweł Dziepak
24f4dcf9e4 db: make virtual dirty soft limit configurable
Message-Id: <20170428150005.28454-1-pdziepak@scylladb.com>
2017-04-30 19:17:22 +03:00
Avi Kivity
248aa4fc23 Merge "Fix update of counter in static rows" from Paweł
"The logic responsible for converting counter updates to counter shards was
not covered by unit tests and didn't transform counter cells inside static
rows.

This series fixes the problem and makes sure that the tests cover both
static rows and transformation logic."

* tag 'pdziepak/static-counter-updates/v1' of github.com:cloudius-systems/seastar-dev:
  tests/counter: test transform_counter_updates_to_shards
  tests/counter: test static columns
  counters: transform static rows from updates to shards
2017-04-30 19:13:44 +03:00
Avi Kivity
339322517e Merge "sstables: index_reader: Fix advance_to() to include relevant range tombstones" from Tomasz
"Fixes #2326."

* 'tgrabiec/fix-range-tombstones-missing-when-slicing' of github.com:cloudius-systems/seastar-dev:
  tests: mutation_source_test: Cover single-ranged queries in test_streamed_mutation_slicing_returns_only_relevant_tombstones()
  tests: mutation_source_test: Add test for slicing of clustered rows
  tests: mutation_reader_assertions: Log expectations
  tests: mutation_reader_assertions: Add produces_eos_or_empty_mutation()
  tests: sstables: Use read_row() for single-key reads
  tests: sstables: Test more configurations of sstable writer in test_sstable_conforms_to_mutation_source()
  sstables: Improve logging
  sstables: index_reader: Fix advance_to() to include relevant range tombstones
2017-04-30 14:40:41 +03:00
Avi Kivity
831ee80c3c tests: workaround older boost::apply_visitor requiring a result_type member
Older versions of boost::apply_visitor require a result_type member for the
visitor; supply it to make them happy.

Fixes #2312.
2017-04-30 13:56:44 +03:00
Takuya ASADA
a19c1b7f86 dist/redhat: add missing dependencies for Fedora
We only have "%{?rhel:Requires}" for scylla-server; we need a Fedora one as well.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493314367-419-1-git-send-email-syuu@scylladb.com>
2017-04-30 11:06:27 +03:00
Takuya ASADA
fe9f72d2c0 dist/debian: add python3-pyudev to dependencies
pyudev is required for seastar/scripts/perftune.py.

Fixes #2315

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493309116-18074-1-git-send-email-syuu@scylladb.com>
2017-04-30 11:05:15 +03:00
Paweł Dziepak
f5cf86484e lsa: introduce upper bound on zone size
Attempting to create huge zones may introduce significant latency. This
patch introduces the maximum allowed zone size so that the time spent
trying to allocate and initialise a zone is bounded.

Fixes #2335.

Message-Id: <20170428145916.28093-1-pdziepak@scylladb.com>
2017-04-30 10:58:11 +03:00
Paweł Dziepak
5c302cf67b tests/counter: test transform_counter_updates_to_shards 2017-04-28 16:29:34 +01:00
Paweł Dziepak
0473750056 tests/counter: test static columns 2017-04-28 16:29:34 +01:00
Paweł Dziepak
0ffdd8d3d0 counters: transform static rows from updates to shards 2017-04-28 16:29:34 +01:00
Tomasz Grabiec
d4df6e214e tests: mutation_source_test: Cover single-ranged queries in test_streamed_mutation_slicing_returns_only_relevant_tombstones() 2017-04-27 18:43:49 +02:00
Tomasz Grabiec
22cce52dff tests: mutation_source_test: Add test for slicing of clustered rows 2017-04-27 18:43:49 +02:00
Tomasz Grabiec
86b693f562 tests: mutation_reader_assertions: Log expectations 2017-04-27 18:43:49 +02:00
Tomasz Grabiec
ece6e107cc tests: mutation_reader_assertions: Add produces_eos_or_empty_mutation() 2017-04-27 18:43:49 +02:00
Tomasz Grabiec
6354acc1a2 tests: sstables: Use read_row() for single-key reads
So that as_mutation_reader() will create the same kind of reader which
database::make_sstable_reader() does.

Before this change, all readers were range readers.
2017-04-27 18:43:49 +02:00
Tomasz Grabiec
fd5dbe04b5 tests: sstables: Test more configurations of sstable writer in test_sstable_conforms_to_mutation_source()
Test different versions of the format, and different promoted index
block sizes.  The size of 1 is especially important, it will put each
fragment in a separate block, exposing various issues with promoted
index handling.
2017-04-27 18:43:49 +02:00
Tomasz Grabiec
c5baeed6d2 sstables: Improve logging 2017-04-27 18:43:49 +02:00
Tomasz Grabiec
b523815ac1 sstables: index_reader: Fix advance_to() to include relevant range tombstones
Fixes #2326.
2017-04-27 18:43:49 +02:00
Glauber Costa
14b9aa2285 reduce kernel scheduler wakeup granularity
We set the scheduler wakeup granularity to 500usec, because that is the
difference in runtime we want to see from a waking task before it
preempts the running task (which will usually be Scylla). Scheduling
other processes less often is usually good for Scylla, but in this case,
one of the "other processes" is also a Scylla thread, the one we have
been using for marking ticks after we have abandoned signals.

However, there is an artifact of the Linux scheduler that causes those
preemptions to be missed if the wakeup granularity is exactly half of
the sched_latency. Our sched_latency is set to 1ms, which
represents the maximum time period in which we will run all runnable
tasks.

We want to keep the sched_latency at 1ms, so we will reduce the wakeup
granularity to something slightly lower than 500usec, to make sure
that this artifact won't affect the scheduler calculations. 499.99usec
would do according to my tests, but we will reduce it further to a round
number.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170427135039.8350-1-glauber@scylladb.com>
2017-04-27 18:11:35 +03:00
Pekka Enberg
9cfb94510f Merge "Fix issues found by PVS-Studio static analyzer" from Vlad
Fix issues found by PVS-Studio as reported by Phillip Khandeliants.

Merge branch 'pvs_analyzer_errors-v1' of github.com:cloudius-systems/seastar-dev

* 'pvs_analyzer_errors-v1' of github.com:cloudius-systems/seastar-dev:
  type_parser: catch exceptions by reference and not by value
  token_metadata::get_host_id(ep): add a missing 'throw'
2017-04-27 11:39:49 +03:00
Vlad Zolotarov
d5b76d5198 type_parser: catch exceptions by reference and not by value
Found by PVS-Studio static analyzer:

Type slicing. An exception should be caught by reference rather than by value.

Fixes #2288

Reported-by: Phillip Khandeliants
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-26 15:12:15 -04:00
Vlad Zolotarov
181c68e97d token_metadata::get_host_id(ep): add a missing 'throw'
Caught by PVS-Studio static analyzer:

The object was created but it is not being used. The 'throw' keyword could be missing: throw runtime_error(FOO);

Reported-by: Phillip Khandeliants
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-26 14:54:34 -04:00
Takuya ASADA
7a59336b8a main.cc: drop FS type check
Since we added support for ext4, we no longer need to limit the filesystem to XFS.

Fixes #1933

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493212525-26264-1-git-send-email-syuu@scylladb.com>
2017-04-26 17:35:55 +03:00
Raphael S. Carvalho
8bae413bcf database: fix format msg for sprint
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170425224920.16607-1-raphaelsc@scylladb.com>
2017-04-26 17:18:58 +03:00
Raphael S. Carvalho
f49bdb6839 compaction_manager: dont go on with major compaction if task was stopped
A column family which was truncated will remove itself from compaction
manager. Any task running a compaction should be interrupted and a
task waiting to run should bail out when it wakes up.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170425224350.15965-3-raphaelsc@scylladb.com>
2017-04-26 17:18:37 +03:00
Takuya ASADA
abf65cb485 dist/debian: skip tunables when kernel = 3.13.0-*-generic, to prevent kernel panic bug
There is kernel panic bug on kernel = 3.13.0-*-generic(Ubuntu 14.04), we have to skip tunables.

Fixes #1724

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1493196636-25645-1-git-send-email-syuu@scylladb.com>
2017-04-26 11:54:11 +03:00
Vlad Zolotarov
a9ad762f47 docs: tracing.md: add a "how to get traces" chapter
This chapter describes how to get tracing information.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-25 21:52:29 -04:00
Vlad Zolotarov
f993e85b5f tracing::trace_keyspace_helper: introduce a time series helper tables
Introduce two time series helper tables that will simplify the querying
of traces.

One for querying regular traces:

CREATE TABLE system_traces.sessions_time_idx (
minute timestamp,
started_at timestamp,
session_id uuid,
PRIMARY KEY (minute, started_at, session_id))

and one for querying slow query records:

CREATE TABLE system_traces.node_slow_log_time_idx (
minute timestamp,
started_at timestamp,
session_id uuid,
start_time timeuuid,
node_ip inet,
shard int,
PRIMARY KEY (minute, started_at, session_id))

With these tables one may get the relevant traces like in an example below:

SELECT * from system_traces.sessions_time_idx where minute in ('2016-09-07 16:56:00-0700') and started_at > '2016-09-07 16:56:30-0700'

Fixes #2243

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-25 21:52:28 -04:00
Vlad Zolotarov
81bcc36b16 tracing: cleanup: use nullptr instead of trace_state_ptr()
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-25 21:52:28 -04:00
Vlad Zolotarov
b0f660331a tracing: introduce a span ID and parent span ID
This patch makes the tracing framework follow the general idea of Google's
Dapper paper: traces generated in a context of the same query are forming
a single-rooted acyclic tree where in a ScyllaDB case vertexes are spans running
on each involved replica Node and edges are RPCs sent from one Node to another.

   - Each vertex in the tree above has an ID - "span ID".
   - In order to be able to build the tree from the sessions traces we need
     to know the parent "span ID" - the ID of a span that sent an RPC that created
     the current span.
   - Each span of a tracing session is given a 64-bit random span ID.
   - The root span has a span_id::illegal_id value.

This patch adds:
   - The described above parent span ID and a span ID to the one_session_records
     object.
   - The current span ID is passed in the trace_info struct to the remote replica.
   - Add parent_id and span_id columns to system_traces.events table for the parent
     ID and span ID.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-25 21:52:23 -04:00
Tomasz Grabiec
92dba05f0d sstables: Fix malformed_sstable_exception from single-key reads
After 4742008b70, _read_partial_row is
never set, and we will fail here if the consumer exhausts the
range. That would be the case if the end bound of the slice aligns
with the end of the index page.

Fix by assuming that if we're out of range in the middle of partition,
we sliced.

Message-Id: <1493121249-18847-1-git-send-email-tgrabiec@scylladb.com>
2017-04-25 14:59:08 +03:00
Avi Kivity
628b3092e4 Merge "Reify shadowable tombstones" from Duarte
"This series introduces the row_tombstone class, which represents a
tombstone applied to a clustering row. It distinguishes itself from a
normal tombstone by the fact that it contains a regular tombstone and
a shadowable one, which can be erased by a row marker.

The intent of the series is thus to reify the idea of shadowable
tombstones, that up until now we considered all materialized view row
tombstones to be, leading to incorrect results."

* 'materialized-views/shadowable/v5' of https://github.com/duarten/scylla:
  sstables: Read and write shadowable tombstones
  mutation_partition: Use row_tombstone
  mutation_partition: Introduce row_tombstone
  mutation_partition: Introduce shadowable tombstones
  idl-compiler: Support optional fields in views
  tombstone: Extract out relational operators
  row_marker: Mark constructors explicit
2017-04-25 13:05:27 +03:00
Duarte Nunes
d45596ae8e sstables: Read and write shadowable tombstones
This patch serializes shadowable tombstones to sstables by adding a
new, incompatible atom's mask.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:46:33 +02:00
Duarte Nunes
4e693383f7 mutation_partition: Use row_tombstone
This patch replaces the current row tombstone representation by a
row_tombstone.

The intent of the patch is thus to reify the idea of shadowable
tombstones, that up until now we considered all materialized view row
tombstones to be.

We need to distinguish shadowable from non-shadowable row tombstones
to support scenarios such as, when inserting to a table with a
materialized view:

1. insert into base (p, v1, v2) values (3, 1, 3) using timestamp 1
2. delete from base using timestamp 2 where p = 3
3. insert into base (p, v1) values (3, 1) using timestamp 3

These should yield a view row where v2 is definitely null, but with
the current implementation, v2 will pop back with its value v2=3@TS=1,
even though it's dead in the base row. This is because the row
tombstone inserted at 2) is a shadowable one.

This patch only addresses the memory representation of such
row_tombstones.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:46:33 +02:00
Duarte Nunes
6a2bccd4ae mutation_partition: Introduce row_tombstone
This patch introduces the row_tombstone class, which represents a
tombstone made up of a regular tombstone and a shadowable one.

The rules for row_tombstones are as follows:

 - The shadowable tombstone is always >= than the regular one;
 - The regular tombstone works as expected;
 - The shadowable tombstone doesn't erase or compact away the regular
   row tombstone, nor dead cells;
 - The shadowable tombstone can erase live cells, but only provided
   they can be recovered (e.g., by including all cells in a MV update,
   both updated cells and pre-existing ones);
 - The shadowable tombstone can be erased or compacted away by a newer
   row marker.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:46:28 +02:00
Duarte Nunes
3d49c1da01 mutation_partition: Introduce shadowable tombstones
A shadowable tombstone is a tombstone that can be replaced by a
smaller one if provided a row_marker with a bigger timestamp than the
shadowable tombstone.

In the context of a row, it is only valid as long as no newer insert
is done (thus setting a live row marker; note that if the row
timestamp set is lower than the tombstone's, then the tombstone
remains in effect as usual). If a row has a shadowable tombstone with
timestamp Ti and that row is updated with a timestamp Tj, such that
Tj > Ti (and that update sets the row marker), then the shadowable
tombstone is shadowed by that update. A concrete consequence is that
if the update has cells with timestamp lower than Ti, then those cells
are preserved (since the deletion is removed), and this is contrary to
a regular, non-shadowable row tombstone where the tombstone is
preserved and such cells are removed.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:46:22 +02:00
Duarte Nunes
8cc29f84fb idl-compiler: Support optional fields in views
When generating view code, the compiler was ignoring optional fields.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:43:04 +02:00
Duarte Nunes
d216c3dbd2 tombstone: Extract out relational operators
This patch extracts out the relational operators in struct tombstone
to a class capable of generating them from a tri-compare function.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:43:04 +02:00
Duarte Nunes
392403b5b3 row_marker: Mark constructors explicit
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-04-25 11:43:04 +02:00
Tomasz Grabiec
f3609fc813 tests: log_histogram_test: Fix compilation on Ubuntu
Some gcc versions incorrectly complain:

  tests/log_histogram_test.cc:87:22: error: ‘opts1’ is not a valid template argument for type ‘const log_histogram_options&’ because object ‘opts1’ has not external linkage
 size_t hist_key<node<opts1>>(const node<opts1>& n) { return n.v; }

Apparently this is a bug in gcc:

   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52036

Fixes #2307.

Message-Id: <1493108791-11247-1-git-send-email-tgrabiec@scylladb.com>
2017-04-25 12:15:28 +03:00
Pekka Enberg
940c3f4330 Merge "Clang fixes (part 2)" from Avi
"This series fixes some more errors found by clang, with the aim of enabling
clang/zapcc as a supported compiler.  A single issue remains, but it's
probably in std::experimental::optional::swap(), not in our code."

* tag 'clang/2/v1' of https://github.com/avikivity/scylla:
  sstable_test: avoid passing negative non-type template arguments to unsigned parameters
  UUID: add more comparison operators
  sstable_datafile_test: avoid string_view user-defined literal conversion operator
  mutation_source_test: avoid template function without template keyword
  cql_query_test: define static variable
  cql_query_test: add braces for single-item collection initializers
  storage_service: don't use typeid(temporary)
  logalloc: remove unused max_occupancy_for_compaction
  storage_proxy: drop overzealous use of __int128_t in recently-modified-no-read-repair logic
  storage_proxy: drop unused member access from return value
  storage_proxy: fix reference bound to temporary in data_read_resolver::less_compare
  read_repair_decision: fix operator<<(std::ostream&, ...)
2017-04-24 20:32:16 +03:00
Tomasz Grabiec
dfbb9fd8f1 gdb: Workaround for gdb.Value being not accepted by %x
Fixes the following error in "scylla segment-descs" and a similar one in "scylla lsa-segment":

Traceback (most recent call last):
  File "scylla-gdb.py", line 530, in invoke
    gdb.write('0x%x: lsa free=%d region=0x%x zone=0x%x\n' % (addr, desc['_free_space'], desc['_region'], desc['_zone']))
TypeError: %x format: an integer is required, not gdb.Value

Message-Id: <1493029465-6482-1-git-send-email-tgrabiec@scylladb.com>
2017-04-24 13:27:25 +03:00
Avi Kivity
6d9e18fd61 logalloc: reduce descriptor overhead
Every lsa-allocated object is prefixed by a header that contains information
needed to free or migrate it.  This includes its size (for freeing) and
an 8-byte migrator (for migrating).  Together with some flags, the overhead
is 14 bytes (16 bytes if the default alignment is used).

This patch reduces the header size to 1 byte (8 bytes if the default alignment
is used).  It uses the following techniques:

 - ULEB128-like encoding (actually more like ULEB64) so a live object's header
   can typically be stored using 1 byte
 - indirection, so that migrators can be encoded in a small index pointing
   to a migrator table, rather than using an 8-byte pointer; this exploits
   the fact that only a small number of types are stored in LSA
 - moving the responsibility for determining an object's size to its
   migrator, rather than storing it in the header; this exploits the fact
   that the migrator stores type information, and object size is in fact
   information about the type

The patch improves the results of memory_footprint_test as following:

Before:

 - in cache:     976
 - in memtable:  947

After:

mutation footprint:
 - in cache:     880
 - in memtable:  858

A reduction of about 10%.  Further reductions are possible by reducing the
alignment of lsa objects.

logalloc_test was adjusted to free more objects, since with the lower
footprint, rounding errors (to full segments) are different and caused
false errors to be detected.

Missing: adjustments to scylla-gdb.py; will be done after we agree on the
new descriptor's format.
2017-04-24 12:23:12 +02:00
Avi Kivity
b4e897a66d cql3::metadata: fix undefined evaluation order in constructor
We both move names_ to its destination, and call names_.size() in the same
expression; this has undefined evaluation order, and fails with clang.

With this patch as well as the clang build fixes, Scylla starts and is
able to serve requests (light cassandra-stress load).

Message-Id: <20170423121727.1948-1-avi@scylladb.com>
2017-04-24 10:40:12 +03:00
Duarte Nunes
cddf2f4d74 tests: Fix failure virtual_reader_test
This patch fixes a failure of virtual_reader_test, where both the test
itself and the cql_test_env initialize the messaging_service to listen
on the same address and port, triggering an assert in
posix_ap_server_socket_impl::accept().

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170423104240.21275-1-duarte@scylladb.com>
2017-04-23 14:06:35 +03:00
Avi Kivity
566c094764 sstable_test: avoid passing negative non-type template arguments to unsigned parameters
Clang complains.  The test looks somewhat bogus, but that's for another patch.
2017-04-22 22:13:55 +03:00
Avi Kivity
dc6ea51ffa UUID: add more comparison operators
Clang wanted them for some unit test; not sure how gcc was able to
synthesize them, but they're clearly needed.
2017-04-22 22:12:33 +03:00
Avi Kivity
5424aca745 sstable_datafile_test: avoid string_view user-defined literal conversion operator
Clang doesn't like it, perhaps because it isn't in the std namespace (it's
still in std::experimental).
2017-04-22 22:11:30 +03:00
Avi Kivity
705ac957a2 mutation_source_test: avoid template function without template keyword
This isn't (yet?) standard C++, and clang rejects it.
2017-04-22 22:10:21 +03:00
Avi Kivity
551fb03476 cql_query_test: define static variable
single_node_cql_env is declared but not defined; define it to make clang
happy.
2017-04-22 22:01:44 +03:00
Avi Kivity
eb700752d8 cql_query_test: add braces for single-item collection initializers
Clang complains that braces are missing; I didn't verify it but I'm sure
it's right.  Add braces to make it happy.
2017-04-22 22:00:49 +03:00
Avi Kivity
6bb8ae7788 storage_service: don't use typeid(temporary)
Clang warns that the expression will be evaluated (doh).  While the warning
seems dubious, keep it and change the code to call the function outside
typeid(), in case it does help someone one day.
2017-04-22 21:09:41 +03:00
Avi Kivity
9303b09a64 logalloc: remove unused max_occupancy_for_compaction
Noticed by clang.
2017-04-22 21:09:41 +03:00
Avi Kivity
6d0811711f storage_proxy: drop overzealous use of __int128_t in recently-modified-no-read-repair logic
Clang's std::abs() doesn't support __int128_t, so use __int64_t instead.  With
this change, it's possible that a read repair 252,700 years after a write
will be interpreted as a recent write and the read repair will incorrectly
be skipped; hopefully by that time __int128_t will be standardized.
2017-04-22 21:09:41 +03:00
Avi Kivity
5ec1742b9a storage_proxy: drop unused member access from return value
Noticed by clang.
2017-04-22 21:09:41 +03:00
Avi Kivity
e4bae0df51 storage_proxy: fix reference bound to temporary in data_read_resolver::less_compare
Noticed by clang.
2017-04-22 21:09:41 +03:00
Avi Kivity
944047f039 read_repair_decision: fix operator<<(std::ostream&, ...)
Argument-dependent lookup requires that the operator be declared in the
same namespace as the class; move it there.

While at it, de-static it, it only causes bloat.
2017-04-22 21:09:41 +03:00
Raphael S. Carvalho
4a86dd473d tests: add tests/sstable_resharding_test.cc
Forgot to add file after resolving conflict.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170422172053.3734-1-raphaelsc@scylladb.com>
2017-04-22 21:09:29 +03:00
Benoît Canet
f68049ef5d tests: Fix clang auto universal reference type deduction
Replace it by regular template type deduction.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <20170421204150.4626-2-benoit@scylladb.com>
2017-04-22 20:04:00 +03:00
Benoit Canet
b902f3b81b tests: Remove parenthesis in variable declaration
The parenthesis prevented clang from compiling these tests.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <20170421204150.4626-1-benoit@scylladb.com>
2017-04-22 20:04:00 +03:00
Avi Kivity
54ab13eb8e Merge "sstable resharding revamp" from Raphael
"Currently, a shared sstable is rewritten at all shards it belongs to, and only
after that, it's deleted.

This new algorithm adds the ability to reshard a set of sstables together at a
single shard and produce unshared sstable for all shards involved.

That's important for the leveled compaction strategy issue, in which the number
of sstables grew considerably after resharding. What happened is that every
sstable was being split into N ones, so we could end up with tons of small
sstables. Now, we will reshard a set of adjacent sstables together."

* 'sstable_resharding_revamp_v9' of github.com:raphaelsc/scylla:
  tests: add test for new sstable resharding
  database: kill column_family::start_rewrite
  database: wire up new resharding algorithm
  database: implement new sstable resharding algorithm
  database: introduce function to replace new sstables by their ancestors
  prevent regular compaction from choosing shared sstables
  compaction_strategy: implement resharding strategy for compaction strategies
  sstables: store more info in foreign_sstable_open_info
  sstables: make it possible to get open info from loaded sstable
  database: export column family dir
  database: inform if column family has shared tables
  sstables: add method to export ancestors
  lcs: implement get_level_count
  compaction_manager: introduce method to check if manager stopped
  lcs: restore invariant instead of sending overlapping sst to L0
  sstables: extend compaction for new resharding
  sstables: allow shard A to correctly create sstable for shard B
  compaction: rework compacting_sstable_writer to work with multiple writers
  compaction: prepare compacting_sstable_writer to work with writers
  sstables: rework compaction to make it easy to extend
2017-04-22 13:31:54 +03:00
Raphael S. Carvalho
8a37b279ed tests: add test for new sstable resharding
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:34 -03:00
Raphael S. Carvalho
662fe77c11 database: kill column_family::start_rewrite
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:33 -03:00
Raphael S. Carvalho
43ac19eb52 database: wire up new resharding algorithm
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:31 -03:00
Raphael S. Carvalho
cf45333588 database: implement new sstable resharding algorithm
NOTE: it's not wired yet.

Currently, a shared sstable is rewritten at all shards it belongs
to and only after that, it's deleted. With this new algorithm, a
shared sstable will be read only once and N unshared sstables
will be created, each of them with 1/N of the data. After it's
done, each owner shard will receive its new unshared sstable
replacing its ancestors.

Another benefit is that resharding will no longer result in the number
of sstables growing considerably. A full-sized leveled sstable is usually
160MB, so after resharding, we could have N files of 160MB/N. Now, the
leveled strategy will help resharding: N adjacent sstables of the same
level will be resharded together, so we'll end up with N files of
N*160MB/N.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:30 -03:00
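The single-pass split described above can be sketched as follows. This is an illustration only: `owner_shard`, the flat token vectors, and the modulo-based ownership are stand-ins, not Scylla's actual types or token-range logic.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-in for token ownership; Scylla derives this from
// token ranges, here we just use a modulo.
inline unsigned owner_shard(uint64_t token, unsigned smp_count) {
    return token % smp_count;
}

// Sketch of the new algorithm's shape: read the shared sstable once,
// routing each row to the output for its owner shard, producing N
// unshared outputs with roughly 1/N of the data each.
inline std::vector<std::vector<uint64_t>>
reshard_single_pass(const std::vector<uint64_t>& shared_rows, unsigned smp_count) {
    std::vector<std::vector<uint64_t>> per_shard(smp_count);
    for (uint64_t token : shared_rows) {
        per_shard[owner_shard(token, smp_count)].push_back(token);
    }
    return per_shard;
}
```

Each owner shard then loads only its own output and drops the shared ancestor, so the shared data is read once instead of once per owning shard.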
Raphael S. Carvalho
6513252e91 database: introduce function to replace ancestor sstables with new ones
When resharding, we're working with sstables from all shards. So let's say
we're done with resharding of sstable A, which belongs to shards 0 and 1,
and sstable B, which belongs to shards 1 and 2. SSTables were generated for
shards 0, 1, and 2, so shards 0, 1, and 2 need to load the new sstables
and remove the ancestors. Shard 1, for example, will remove sstables A and
B (its ancestors) and add the new one. That's where this new function comes
in. We'll forward new sstables to their target shards using foreign sstable
open info.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:27 -03:00
Raphael S. Carvalho
c44a2319e6 prevent regular compaction from choosing shared sstables
For new resharding, it's important to exclude resharding sstables
from the list of candidates for regular compaction. That doesn't
affect current resharding, because it marks the sstables as
compacting, but that marking won't work with new resharding, which
operates on sstables from multiple shards.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:26 -03:00
Raphael S. Carvalho
13477075e2 compaction_strategy: implement resharding strategy for compaction strategies
Strategies other than leveled will reshard one shared sstable at
a time, and the target shard (the shard at which the job will run)
for each job will be chosen in a round-robin fashion.

For leveled strategy, we will reshard together smp::count adjacent
sstables that belong to the same level.
The reason is that resharding one sstable at a time may create a
file for each shard, meaning after resharding we could end up with
NO_SSTABLES*NO_SHARDS files.

These resharding strategies will be used for our new resharding
algorithm.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:24 -03:00
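A rough sketch of the two job-building policies described above. The names (`sstable_desc`, `next_target_shard`, `leveled_jobs`) are illustrative, not the actual strategy API:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

struct sstable_desc { int level; };  // illustrative placeholder

// Non-leveled strategies: one shared sstable per job, with the target
// shard for each job chosen in round-robin fashion.
inline unsigned next_target_shard(unsigned& rr, unsigned smp_count) {
    return rr++ % smp_count;
}

// Leveled strategy: group smp_count adjacent same-level sstables into a
// single job, so one job yields ~smp_count full-sized outputs instead
// of every sstable producing one small file per shard.
inline std::vector<std::vector<sstable_desc>>
leveled_jobs(const std::vector<sstable_desc>& same_level, unsigned smp_count) {
    std::vector<std::vector<sstable_desc>> jobs;
    for (std::size_t i = 0; i < same_level.size(); i += smp_count) {
        std::size_t end = std::min(same_level.size(), i + smp_count);
        jobs.emplace_back(same_level.begin() + i, same_level.begin() + end);
    }
    return jobs;
}
```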
Raphael S. Carvalho
bf930476b3 sstables: store more info in foreign_sstable_open_info
We need that info for opening an sstable at a different shard, unlike
the sstable loader, which has everything in entry_descriptor, obtained
from the components in the sstable filename.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:22 -03:00
Raphael S. Carvalho
e5e7037aa4 sstables: make it possible to get open info from loaded sstable
It will be useful for resharding, which needs to move an sstable
across shards. To do that without reloading the sstable at the
target shard, we need to be able to get the open info and move it
to the target shard instead.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:21 -03:00
Raphael S. Carvalho
405e41e9a8 database: export column family dir
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:19 -03:00
Raphael S. Carvalho
2b774c5bc3 database: inform if column family has shared tables
That will be useful to quickly determine whether it's worth resharding
a column family.

Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:17 -03:00
Raphael S. Carvalho
2d119287b7 sstables: add method to export ancestors
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:16 -03:00
Raphael S. Carvalho
f2f8a2f5c7 lcs: implement get_level_count
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:14 -03:00
Raphael S. Carvalho
585596cede compaction_manager: introduce method to check if manager stopped
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:12 -03:00
Raphael S. Carvalho
d82a8dfae0 lcs: restore invariant instead of sending overlapping sst to L0
An sstable with a large token span may find its way into a high level due
to resharding, which means the strategy invariant is broken. The invariant
is restored by compacting the first set of overlapping sstables, meaning
that the restoration is done incrementally across multiple overlapping sets.

Invariant is restored by regular compaction after resharding puts new unshared
sstables into their original level, where level > 0.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:09 -03:00
Raphael S. Carvalho
0127309820 sstables: extend compaction for new resharding
Extends compaction for new resharding algorithm. Not wired yet.
New resharding will compact shared sstable(s) and create one
sstable for each owner. It's up to the caller to open these
new unshared sstables at their respective column families.

This new approach will save a lot of bandwidth because we'll
no longer read the entire shared sstable #smp::count times.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:08 -03:00
Raphael S. Carvalho
758bc38e7a sstables: allow shard A to correctly create sstable for shard B
That's possible by shard A explicitly saying that the sstable is created
for shard B. If we don't do that, the sharding metadata isn't correct,
and consequently the sstable will report the wrong owners.

We'll need this for resharding which will create sstables for all
shards that own the shared sstable.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:06 -03:00
Raphael S. Carvalho
2a437ab427 compaction: rework compacting_sstable_writer to work with multiple writers
compacting_sstable_writer only allowed one writer so far, but we will need
multiple ones for resharding.
It's done by moving writer management to compaction.
finish_sstable_writer() is added for the compaction impl to stop all writers,
whereas stop_sstable_writer() will only stop the current writer (needed when
the current sstable reaches its maximum size limit, for example).

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:05 -03:00
Raphael S. Carvalho
a35a3a9647 compaction: prepare compacting_sstable_writer to work with writers
No need for compacting_sstable_writer to store items that are available
in compaction class. Also, that's a step towards supporting multiple
writers for compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:03 -03:00
Raphael S. Carvalho
38ed83e2f7 sstables: rework compaction to make it easy to extend
compact_sstables() supported both regular and cleanup compaction,
but with lots of conditions that made it ugly and hard to extend.

In the future, we want to introduce a new type of compaction for
resharding that will create one sstable for every shard owning
the sstable(s) given as input. That will be easier now.

Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2017-04-21 17:11:02 -03:00
Avi Kivity
fdcf64520d Merge seastar upstream
* seastar 2eec212...194d80f (4):
  > removing the collectd tests
  > fix fstream metrics reporting.
  > do_for_each: Make it check for need preempt
  > core/sharded: introduce copy method to foreign_ptr
2017-04-21 22:14:01 +03:00
Avi Kivity
fccbf2c51f Merge "Reduce memory reclamation latency" from Tomasz
"Currently eviction is performed until occupancy of the whole region
drops below the 85% threshold. This may take a while if the region had
high occupancy and is large. We could improve the situation by only
evicting until occupancy of the sparsest segment drops below the
threshold, as is done by this change.

I tested this using a c-s read workload in which the condition
triggers in the cache region, with 1G per shard:

 lsa-timing - Reclamation cycle took 12.934 us.
 lsa-timing - Reclamation cycle took 47.771 us.
 lsa-timing - Reclamation cycle took 125.946 us.
 lsa-timing - Reclamation cycle took 144356 us.
 lsa-timing - Reclamation cycle took 655.765 us.
 lsa-timing - Reclamation cycle took 693.418 us.
 lsa-timing - Reclamation cycle took 509.869 us.
 lsa-timing - Reclamation cycle took 1139.15 us.

The 144ms pause is when large eviction is necessary.

Statistics for reclamation pauses for a read workload over
larger-than-memory data set:

Before:

 avg = 865.796362
 stdev = 10253.498038
 min = 93.891000
 max = 264078.000000
 sum = 574022.988000
 samples = 663

After:

 avg = 513.685650
 stdev = 275.270157
 min = 212.286000
 max = 1089.670000
 sum = 340573.586000
 samples = 663

Refs #1634."

* tag 'tgrabiec/lsa-reduce-reclaim-latency-v3' of github.com:cloudius-systems/seastar-dev:
  lsa: Reduce reclamation latency
  tests: Add test for log_histogram
  log_histogram: Allow non-power-of-two minimum values
  lsa: Use regular compaction threshold in on-idle compaction
  tests: row_cache_test: Induce update failure more reliably
  lsa: Add getter for region's eviction function
2017-04-21 17:47:06 +03:00
Tomasz Grabiec
20f4c9bf23 lsa: Reduce reclamation latency
Currently eviction is performed until occupancy of the whole region
drops below the 85% threshold. This may take a while if the region had
high occupancy and is large. We could improve the situation by only
evicting until occupancy of the sparsest segment drops below the
threshold, as is done by this change.

I tested this using a c-s read workload in which the condition
triggers in the cache region, with 1G per shard:

 lsa-timing - Reclamation cycle took 12.934 us.
 lsa-timing - Reclamation cycle took 47.771 us.
 lsa-timing - Reclamation cycle took 125.946 us.
 lsa-timing - Reclamation cycle took 144356 us.
 lsa-timing - Reclamation cycle took 655.765 us.
 lsa-timing - Reclamation cycle took 693.418 us.
 lsa-timing - Reclamation cycle took 509.869 us.
 lsa-timing - Reclamation cycle took 1139.15 us.

The 144ms pause is when large eviction is necessary.

Statistics for reclamation pauses for a read workload over
larger-than-memory data set:

Before:

 avg = 865.796362
 stdev = 10253.498038
 min = 93.891000
 max = 264078.000000
 sum = 574022.988000
 samples = 663

After:

 avg = 513.685650
 stdev = 275.270157
 min = 212.286000
 max = 1089.670000
 sum = 340573.586000
 samples = 663

Refs #1634.

Message-Id: <1484730859-11969-1-git-send-email-tgrabiec@scylladb.com>
2017-04-21 12:52:31 +02:00
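The change in stop condition can be sketched like this. Occupancy fractions and the flat segment layout are simplified stand-ins for LSA's real bookkeeping, not the actual implementation:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

struct segment { double used; double capacity; };

// Old condition: evict until the occupancy of the *whole region* drops
// below the threshold.
inline double occupancy(const std::vector<segment>& segs) {
    double used = 0, cap = 0;
    for (const auto& s : segs) { used += s.used; cap += s.capacity; }
    return used / cap;
}

// New condition: stop evicting once the *sparsest* segment is below the
// threshold, i.e. once compaction can make progress, rather than once
// the whole region is below it.
inline double sparsest_occupancy(const std::vector<segment>& segs) {
    double best = 1.0;
    for (const auto& s : segs) { best = std::min(best, s.used / s.capacity); }
    return best;
}
```

For a large, dense region the sparsest-segment condition is reached after far less eviction, which is where the reduction in reclamation pauses comes from.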
Tomasz Grabiec
4313641c03 tests: Add test for log_histogram 2017-04-21 12:52:31 +02:00
Tomasz Grabiec
c83768d6bb log_histogram: Allow non-power-of-two minimum values
We will want to reuse the min_size mechanism for the whole compaction
threshold, including the occupancy threshold. That threshold is close
to the segment size and we cannot pick a power of two which would be
close enough to what we need.

Therefore, change log_histogram to support arbitrary minimum base.

bucket_of() was moved into log_histogram_options so that it can be used
in number_of_buckets(), which makes for a simple and much less
error-prone implementation.
2017-04-21 10:54:50 +02:00
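A minimal sketch of log-style bucketing with an arbitrary (non-power-of-two) minimum. This illustrates the idea only; it is not the actual log_histogram code, and `bucket_of` here is a hypothetical simplification:

```cpp
#include <cassert>
#include <cstddef>

// Bucket i covers [min * 2^i, min * 2^(i+1)); values below min land in
// bucket 0. min need not be a power of two, which is the point of the
// change above.
inline unsigned bucket_of(std::size_t value, std::size_t min) {
    unsigned i = 0;
    while (value >= 2 * min) {
        value /= 2;
        ++i;
    }
    return i;
}
```

With min = 1000 (not a power of two), values in [1000, 2000) map to bucket 0, [2000, 4000) to bucket 1, and so on.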
Tomasz Grabiec
7a800c54bf lsa: Use regular compaction threshold in on-idle compaction
Idle-time compaction should not produce non-compactible segments,
because that means we would have to evict a lot when we finally need
to reclaim some memory so that occupancy falls below the regular
compaction threshold. This may cause latency spikes.

Refs #1634.
2017-04-20 15:00:15 +02:00
Tomasz Grabiec
e054ccc037 tests: row_cache_test: Induce update failure more reliably
After changing the region evictability condition to be less strict, the
cache update stopped failing because reclamation was able to compact the
dense region. Induce the failure by installing an evictor which refuses
to evict more than a few elements from the cache.
2017-04-20 14:51:47 +02:00
Tomasz Grabiec
7aa286439f lsa: Add getter for region's eviction function 2017-04-20 14:51:42 +02:00
Vlad Zolotarov
9c1d803157 fix_system_distributed_tables.py: add --node and --port parameters
Allow giving a non-default IP address and a port to connect to the cluster.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1491316458-18420-1-git-send-email-vladz@scylladb.com>
2017-04-20 14:49:26 +03:00
Avi Kivity
68f0df12ee Merge "Optimize reads with clustering restrictions" from Tomasz
"This series makes several optimizations to sstable mutation reader relevant
for large partitions.

Some highlights:

One optimization is to use the index for skipping across clustering restrictions.
Currently we read the whole partition in such cases. That includes the case when
we need to read a static row and then jump to some clustering row in the
middle of the partition. Another case is having more than one clustering
restriction, e.g. selecting multiple single rows from the same partition.

Another optimization is using information from the index for creation of
streamed_mutation. That can save us the cost of reading the partition header
from the data file in case we would not continue reading, but skip to the
middle of that partition. Or we may not even attempt to read anything from
that partition, if after we determine the key that reader will be put behind
other readers, which will exhaust the query limit first.

Another optimization is switching single-partition queries to use the
index_reader infrastructure. Index lookups via index_reader are faster than
find_disk_ranges(). This is also a cleanup, a step towards converting all code
to use the index_reader."

* tag 'tgrabiec/optimize-sstable-reads-with-restrictions-v2' of github.com:cloudius-systems/seastar-dev: (44 commits)
  sstables: Remove unused code
  sstables: mutation_reader: Use index_reader::advance_to_next_partition() to skip to next partition
  sstables: mutation_reader: Use index_reader for single-partition reads
  sstables: mutation_reader: Add trace-level logging
  sstables: mutation_reader: Move partition reading code to sstable_data_source
  sstables: mutation_reader: Move definitions out of the class body
  sstables: Move binary_search() to a header
  database: Pass partition_range to single_key_sstable_reader to avoid copies and decorating
  sstables: index_reader: Introduce advance_to_next_partition()
  sstables: index_reader: Introduce advance_and_check_if_present()
  sstables: index_reader: Introduce advance_past()
  sstables: index_reader: Make copyable
  sstables: index_reader: Optimize advancing to extreme positions
  sstables: index_reader: Keep two last pages alive
  dht: ring_position_view: Add key getter
  dht: ring_position_view: Add constructor and factory from ring_position_view
  sstables: mutation_reader: Advance to next partition using index in some cases
  sstables: index_reader: Expose access to partition key and tombstone
  sstables: index_reader: Introduce promoted_index_view
  sstables: mutation_reader: Move _index_in_current to sstable_data_source
  ...
2017-04-20 13:58:37 +03:00
Tomasz Grabiec
3472a74de4 sstables: Remove unused code 2017-04-20 11:23:05 +02:00
Tomasz Grabiec
c1059ca8e4 sstables: mutation_reader: Use index_reader::advance_to_next_partition() to skip to next partition
It's cheaper than a key-based lookup, so use it when we can.
2017-04-20 11:23:05 +02:00
Tomasz Grabiec
4742008b70 sstables: mutation_reader: Use index_reader for single-partition reads
This switches single-partition query to use the index_reader
infrastructure. Index lookups via index_reader are faster than
find_disk_ranges().

perf_fast_forward, rows: 1000000, value size: 100

Before:

  Testing forwarding with clustering restriction in a large partition:
  pk-scan   time [s]     frags     frag/s    aio      [KiB] blocked dropped  idx hit idx miss  idx blk    cpu
  no        0.002182         2        916      3        152       2       0        0        1        1  88.1%

After:

  Testing forwarding with clustering restriction in a large partition:
  pk-scan   time [s]     frags     frag/s    aio      [KiB] blocked dropped  idx hit idx miss  idx blk    cpu
  no        0.000758         2       2639      3        152       2       0        0        1        1  48.6%

This is also a cleanup, a step towards converting all code to use the
index_reader.
2017-04-20 11:23:05 +02:00
Tomasz Grabiec
9d8795089d sstables: mutation_reader: Add trace-level logging 2017-04-20 11:18:55 +02:00
Tomasz Grabiec
b198c31c46 sstables: mutation_reader: Move partition reading code to sstable_data_source
It will be reused for read_row(), which doesn't create a mutation_reader
instance, only an sstable_data_source.
2017-04-20 11:18:26 +02:00
Tomasz Grabiec
6e4bca0be6 sstables: mutation_reader: Move definitions out of the class body
To make further refactoring easier to review. No functional changes here.
2017-04-20 10:54:38 +02:00
Tomasz Grabiec
4ed7e529db sstables: Move binary_search() to a header
There are instantiations of binary_search() used in sstables.cc, but
defined in partition.cc. The instantiations are explicitly declared in
partition.cc, but the types changed and they became obsolete. The
thing worked because partition.cc also instantiated it with the right
type. But after that code will be removed, it no longer would, and we
would get a linker error. To avoid such problems, define
binary_search() in a header.
2017-04-20 10:54:38 +02:00
Tomasz Grabiec
bedd0ab6f9 database: Pass partition_range to single_key_sstable_reader to avoid copies and decorating 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
0b5ba13230 sstables: index_reader: Introduce advance_to_next_partition() 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
4b81844d2e sstables: index_reader: Introduce advance_and_check_if_present() 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
b92f095bf0 sstables: index_reader: Introduce advance_past() 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
6780756258 sstables: index_reader: Make copyable 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
7db83fa3fe sstables: index_reader: Optimize advancing to extreme positions 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
f66443c01c sstables: index_reader: Keep two last pages alive
The idea behind caching is that when we have two index readers where
one is catching up with the other, each page will be read only
once. Currently that's not always the case. There is a case when
advance_to() may need to read two pages. That's when the target
position is not found in the first page as determined by the summary
index. The second reader which catches up would have to read the first
page as well, but it would not be in cache any more. To avoid this
extra I/O let's keep a reference to the two last pages touched by the
index.
2017-04-20 10:54:38 +02:00
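The idea can be sketched as a tiny two-slot cache. The types (`index_page`, `last_two_pages`) are illustrative, not the actual index reader's cache:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

struct index_page { uint64_t id; /* parsed entries elided */ };

// Keeps the two most recently touched pages alive, so a second reader
// catching up behind the first re-reads neither of them, even when
// advance_to() had to touch two pages.
class last_two_pages {
    std::optional<index_page> _slots[2];
public:
    std::optional<index_page> find(uint64_t id) const {
        for (const auto& s : _slots) {
            if (s && s->id == id) return s;
        }
        return std::nullopt;
    }
    void touch(index_page p) {
        _slots[1] = std::move(_slots[0]);  // drop the older of the two
        _slots[0] = std::move(p);
    }
};
```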
Tomasz Grabiec
c7b9c5dfd3 dht: ring_position_view: Add key getter 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
5b71e0b9ab dht: ring_position_view: Add constructor and factory from ring_position_view 2017-04-20 10:54:38 +02:00
Tomasz Grabiec
3e8795494e sstables: mutation_reader: Advance to next partition using index in some cases
To produce a streamed_mutation for the next partition, we need to read
its key and the tombstone. Currently we always do that by consuming
the partition header from the data file. In some cases that may cause
unnecessary IO.

It's better to obtain partition information from the index if we
already have it. We can save on IO if the user skips past the
front of the partition immediately after.

It is also better to pay the cost of reading the index if we know that
we will need to use the index anyway soon. This patch predicts that by
checking if there are any clustering restrictions. If there are any,
we will almost surely need_skip() and use the index anyway.

This change also lays the ground for unification of multi and single
partition queries without loss of performance.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
e35fe7492c sstables: index_reader: Expose access to partition key and tombstone 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
ae72c159b1 sstables: index_reader: Introduce promoted_index_view
So that we have a nice way of extracting the tombstone out of it. We
don't always need a fully parsed index.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
0ef33b7f29 sstables: mutation_reader: Move _index_in_current to sstable_data_source
sstable_data_source holds a shared state between mutation_reader and
streamed_mutation for sstables. The information whether index is in
current partition will have to be accessed by both in the following
patches.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
885f53d905 sstables: mutation_reader: Avoid resetting the walker
Before the change, the following scenario was happening:

  1) we try to skip based on clustering restrictions
  2) we find the page and fast forward to it, recording walker's
     lower bound counter
  3) we read the first fragment, it's not a tombstone, so we reset the walker,
     and its lower bound counter too
  4) the fragment is not in range (the range starts in the middle of the page)
  5) needs_skip() is true, we redo the index lookup, which wastes some CPU

This change fixes the problem by avoiding resetting the walker. We can
do that because leading tombstones are checked with a non-mutable
contains_tombstone().
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
bf21aa3a1f clustering_ranges_walker: Introduce contains_tombstone() 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
b030ce693d sstables: mutation_reader: Don't try to read index to skip to static row
The static row is always at the beginning, so there's no point in doing
index lookups.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
3e060659f1 sstables: mutation_reader: Don't try to read static row if table doesn't have any 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
b1860a8a24 clustering_ranges_walker: Allow excluding the static row 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
77d3e30239 sstables: mutation_reader: Use index to skip across clustering restrictions
Improves scans with clustering restrictions. Before the change such
scans would scan the whole partition.

Below are results of a test case from perf_fast_forward which selects
few rows from a large partition using query restrictions (not fast forwarding).

Before:

  stride  rows      time [s]     frags     frag/s    aio      [KiB] blocked dropped  idx hit idx miss  idx blk    cpu
  1000000 1         0.000609         1       1642      3        152       2       1        0        1        1  38.0%
  500000  2         0.242255         2          8    511      64152     398       4        0        1        1  98.6%
  250000  4         0.281592         4         14    749      95832     564       4        0        1        1  98.4%
  125000  8         0.328056         8         24    873     111704     657       4        0        1        1  98.4%
  62500   16        0.306700        16         52    935     119640     751       4        0        1        1  99.4%

After:

  stride  rows      time [s]     frags     frag/s    aio      [KiB] blocked dropped  idx hit idx miss  idx blk    cpu
  1000000 1         0.000711         1       1406      3        152       2       1        0        1        1  42.1%
  500000  2         0.000910         2       2197      5        216       3       2        0        1        1  39.2%
  250000  4         0.001384         4       2891      9        344       5       4        0        1        1  35.3%
  125000  8         0.003197         8       2502     21        728      13       8        0        1        1  53.1%
  62500   16        0.006664        16       2401     41       1368      25      16        0        1        1  58.2%
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
05a1f92cbc clustering_ranges_walker: Introduce lower_bound_change_counter()
Allows detecting changes of lower_bound().

The result of advance_to() is not enough: when we get false from
advance_to() twice in a row, the lower bound may or may not have changed.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
461f2af0a1 sstables: mutation_reader: Avoid index lookups when out of range 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
10c92d37d1 sstables: mutation_reader: Simplify fast_forward_to() 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
bfb6858e55 sstables: mutation_reader: Let clustering_ranges_walker handle the _fwd_range start
Simplifies the code a bit, but also will make it easier to calculate
the next position we should skip to after forwarding, taking into
consideration both the position forwarded to as well as clustering
ranges of the query. That will be just calling
_ck_ranges_walker->lower_bound() after it is trimmed.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
d056d9c31b sstables: mutation_reader: Let mp_row_consumer decide about position passed to the index
In general mp_row_consumer has better information about the next
position to read. It could be after the position we forward to if
there are clustering restrictions. This will be exploited in the
following patches.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
a37712e9ae sstables: mutation_reader: Move mp_row_consumer::fast_forward_to() out of line 2017-04-20 10:54:37 +02:00
Tomasz Grabiec
bb3e683783 clustering_ranges_walker: Support trimming
Makes implementing fast_forward_to() easier. mp_row_consumer emulates
this currently; this change will allow simplifying that.
2017-04-20 10:54:37 +02:00
Tomasz Grabiec
652d04e78a clustering_ranges_walker: Generalize to work on position ranges
It will include the static row by default. This will allow simplifying
users, which work with position ranges already.
2017-04-20 10:54:36 +02:00
Tomasz Grabiec
c85fe3183c position_range: Allow stealing of bounds 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
503c68de44 position_in_partition: Add more factory methods 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
6c1dc642ee sstables: mutation_reader: Create index on-demand 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
434fda3577 sstables: mutation_reader: Keep priority_class by reference
To indicate that it is not optional.
2017-04-20 10:54:36 +02:00
Tomasz Grabiec
a8c126c82a sstables: Expose get_index_reader() 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
e1af5a406d sstables: Make sstable::get_index_reader() return unique_ptr<>
Makes callers a bit simpler
2017-04-20 10:54:36 +02:00
Tomasz Grabiec
7dc3fe7d3f tests: perf_fast_forward: Add test case for forwarding with clustering restrictions in a large partition 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
eed864690b tests: perf_fast_forward: Add test case for slicing of large partition using a single-partition reader 2017-04-20 10:54:36 +02:00
Tomasz Grabiec
81fc7977a4 tests: perf_fast_forward: Add test for selecting few rows from large partition 2017-04-20 10:54:36 +02:00
Raphael S. Carvalho
3286f7aaa6 compaction: make major compaction go through compaction manager
From now on, major compaction will go through compaction manager.
Major compaction is serialized to reduce disk space requirement.
Each column family will be running either minor and major compaction
at a given time. The only issue is number of small sstables growing
while major compaction is running, but major compaction itself will
reduce the number of tables considerably. If this turns out to be
an issue, we can allow minor to start in parallel to major, but not
the other way around.

Fixes #1156.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170417233125.14092-1-raphaelsc@scylladb.com>
2017-04-19 15:44:21 +03:00
Duarte Nunes
e06bafdc6c alter_type_statement: Fix signed to unsigned conversion
This could allow us to alter a non-existing field of an UDT.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170419114254.5582-1-duarte@scylladb.com>
2017-04-19 14:48:12 +03:00
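The general shape of this bug class can be illustrated as follows; the code is hypothetical, not the actual alter_type_statement logic. A negative "not found" result stored in an unsigned variable makes the guard dead code, so a non-existing field passes the check:

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Returns the field's index, or -1 if absent.
inline int find_field(const std::vector<std::string>& fields, const std::string& name) {
    for (std::size_t i = 0; i < fields.size(); ++i) {
        if (fields[i] == name) return static_cast<int>(i);
    }
    return -1;
}

// Buggy: -1 silently converts to SIZE_MAX, and `idx < 0` can never be
// true for an unsigned type, so every field appears to exist.
inline bool field_exists_buggy(const std::vector<std::string>& fs, const std::string& n) {
    std::size_t idx = find_field(fs, n);   // signed-to-unsigned conversion
    if (idx < 0) {                         // dead code for unsigned idx
        return false;
    }
    return true;
}

// Fixed: keep the signed type until after the "not found" check.
inline bool field_exists_fixed(const std::vector<std::string>& fs, const std::string& n) {
    int idx = find_field(fs, n);
    return idx >= 0;
}
```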
Tomasz Grabiec
02da3ba316 tests: perf_fast_forward: Fix use-after-free in scan_with_stride_partitions()
partition_range must live as long as the reader is used.
2017-04-19 08:37:56 +02:00
Raphael S. Carvalho
e78db43b79 compaction_manager: fix crash when dropping a resharding column family
The problem is that the column family field of the task wasn't being set
for resharding, so the column family wasn't being properly removed from
the compaction manager. In addition to fixing this issue, we'll also
interrupt ongoing compactions when dropping a column family, exactly
like we do on shutdown.

Fixes #2291.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170418125807.7712-1-raphaelsc@scylladb.com>
2017-04-18 17:39:27 +03:00
Duarte Nunes
af37a3fdbf logalloc: Fix compilation error
This patch moves a function using the region_impl type after the type
has been defined.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170418124551.25369-1-duarte@scylladb.com>
2017-04-18 15:56:26 +03:00
Raphael S. Carvalho
11b74050a1 partitioned_sstable_set: fix quadratic space complexity
Streaming generates lots of small sstables with large token ranges,
which triggers O(N^2) space usage in the interval map. Level 0
sstables will now be stored in a structure that has O(N) space
complexity and which will be included in every read.

Fixes #2287.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170417185509.6633-1-raphaelsc@scylladb.com>
2017-04-18 13:04:38 +03:00
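The shape of the fix can be sketched like this, with a flat list standing in for the interval map and illustrative types throughout:

```cpp
#include <cassert>
#include <vector>

struct sst { int level; int first_token; int last_token; };

// Level-0 sstables (many small ones with huge token spans after
// streaming) go into a flat O(N) vector that every read includes;
// higher levels stay range-indexed (an interval map in the real code,
// a linear scan here).
class sstable_set {
    std::vector<sst> _level0;
    std::vector<sst> _leveled;
public:
    void insert(const sst& s) {
        (s.level == 0 ? _level0 : _leveled).push_back(s);
    }
    std::vector<sst> select(int token) const {
        std::vector<sst> out = _level0;            // always included
        for (const auto& s : _leveled) {
            if (s.first_token <= token && token <= s.last_token) {
                out.push_back(s);
            }
        }
        return out;
    }
};
```

Keeping wide level-0 tables out of the interval map trades a per-read scan of level 0 for O(N) space, instead of the O(N^2) blow-up from overlapping intervals.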
Takuya ASADA
86e464ab26 dist/offline_installer: support Ubuntu/Debian
Moved the existing script to dist/offline_installer/redhat, and added a
.deb version under dist/offline_installer/debian.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1492474821-9907-1-git-send-email-syuu@scylladb.com>
2017-04-18 10:56:50 +03:00
Pekka Enberg
b31c45d8af Merge "clang fixes (part 1)" from Avi
"This series fixes some errors found by clang, with the aim of enabling
clang/zapcc as a supported compiler.  A few more fixes are needed to produce
a binary."

* tag 'clang/1/v1' of https://github.com/avikivity/scylla:
  logalloc: avoid auto in function argument declaration
  thrift: avoid auto in function argument declaration
  streamed_mutation: fix non-POD argument to C-style variadic function
  mutation_partition_serializer: avoid auto in function argument declaration
  date: use correct casts for years
  streaming: avoid auto in function argument declaration
  repair: avoid auto in function argument declaration
  gms: expose gms::inet_address streaming operator
  murmur3_partitioner: fix build on clang
  i_partitioner: remove unused function
  byte_ordered_partitioner: fix bad operator precedence
  result_set: pass comparator by reference to std::sort()
  to_string: move standard container overloads of to_string to std:: namespace
  cql_type: fix bad enum syntax on clang
  build: disable more warnings for clang
  build: fix detection of unsupported warnings on clang
2017-04-18 08:49:25 +03:00
Avi Kivity
844529fe33 logalloc: avoid auto in function argument declaration
'auto' in a non-lambda function argument is not legal C++, and is hard
to read besides.  Replace with the right type.

Since the right type is private, add some friendship.
2017-04-17 23:18:44 +03:00
Avi Kivity
54add19ca2 thrift: avoid auto in function argument declaration
'auto' in a non-lambda function argument is not legal C++, and is hard
to read besides.  Replace with the right type.
2017-04-17 23:18:44 +03:00
Avi Kivity
f0c25fc20f streamed_mutation: fix non-POD argument to C-style variadic function
Clang warns that passing a non-POD to a C-style variadic function will
result in an abort().  That happens to be exactly what we want, but to
silence the warning, use a template instead.  Since templates aren't
allowed in local classes, move the containing class to namespace scope.
2017-04-17 23:18:44 +03:00
Avi Kivity
635c32eb32 mutation_partition_serializer: avoid auto in function argument declaration
'auto' in a non-lambda function argument is not legal C++, and is hard
to read besides.  Replace with the right type.
2017-04-17 23:18:44 +03:00
Avi Kivity
a0858dda3e date: use correct casts for years
Our date implementation uses int64_t for years, but some of the code was
not changed; clang complains, so use the correct casts to make it happy.
2017-04-17 23:03:15 +03:00
Avi Kivity
ca69a04969 streaming: avoid auto in function argument declaration
'auto' in a non-lambda function argument is not legal C++, and is hard
to read besides.  Replace with the right type.
2017-04-17 23:03:15 +03:00
Avi Kivity
ae7d7ae20f repair: avoid auto in function argument declaration
'auto' in a non-lambda function argument is not legal C++, and is hard
to read besides.  Replace with the right type.
2017-04-17 23:03:15 +03:00
Avi Kivity
c885c468a9 gms: expose gms::inet_address streaming operator
The standard says, and clang enforces, that declaring a function via
a friend declaration is not sufficient for ADL to kick in.  Add a namespace
level declaration so ADL works.
2017-04-17 23:03:15 +03:00
Avi Kivity
af118ab52b murmur3_partitioner: fix build on clang
Don't know what the root cause is, but the fix is harmless.
2017-04-17 23:03:15 +03:00
Avi Kivity
c05f60387b i_partitioner: remove unused function
Found by clang.
2017-04-17 23:03:15 +03:00
Avi Kivity
a496ec7f5b byte_ordered_partitioner: fix bad operator precedence
Found by clang.
2017-04-17 23:03:15 +03:00
Avi Kivity
d9aaa95b29 result_set: pass comparator by reference to std::sort()
Clang complains about some error without it; I could not understand the
error, but I'm not going to argue with it.

Since std::sort() will copy the comparator, it's better to pass using an
std::ref(), and everyone is happy.
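The pattern described above can be sketched like this (the comparator type is illustrative):

```cpp
#include <algorithm>
#include <functional>
#include <vector>

struct heavy_comparator {
    // imagine expensive-to-copy state here
    bool operator()(int a, int b) const { return a < b; }
};

void sort_with_ref(std::vector<int>& v, heavy_comparator& cmp) {
    // std::sort takes the comparator by value; std::ref wraps it in a
    // cheap std::reference_wrapper, so only the wrapper gets copied.
    std::sort(v.begin(), v.end(), std::ref(cmp));
}
```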
2017-04-17 23:03:15 +03:00
Avi Kivity
a83a24268d to_string: move standard container overloads of to_string to std:: namespace
Argument-dependent lookup will not find to_string() overloads in the global
namespace if the argument and the caller are in other namespaces.

Move these to_string() overloads to std:: so ADL will find them.

Found by clang.
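The ADL mechanism at work here can be illustrated with a self-contained sketch (the `db::token` type and `stringify` helper are hypothetical, not Scylla code):

```cpp
#include <string>

namespace db {
struct token { long value; };
// to_string lives in the same namespace as its argument type, so an
// unqualified call can find it through argument-dependent lookup (ADL).
std::string to_string(const token& t) {
    return "token:" + std::to_string(t.value);
}
}

// Generic code uses the "using std::to_string" idiom: the using-declaration
// covers built-in types, while ADL finds overloads that live next to user
// types. Overloads sitting in the global namespace would be found by neither.
template <typename T>
std::string stringify(const T& v) {
    using std::to_string;
    return to_string(v);
}
```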
2017-04-17 23:03:15 +03:00
Avi Kivity
a7fe7aedbf cql_type: fix bad enum syntax on clang
cql3::type used some gcc extension that is not recognized on clang; use
the standard syntax instead.
2017-04-17 22:35:41 +03:00
Avi Kivity
1faef017e3 build: disable more warnings for clang
We should fix the source and re-enable the warnings, but this will do for
now.
2017-04-17 22:34:59 +03:00
Avi Kivity
78e9b0265b build: fix detection of unsupported warnings on clang
The diagnostic that clang spits out when it sees an unrecognized warning
is itself a warning, so the test compilation succeeds and we don't notice
the warning is not supported.

Adding -Werror turns the warning about the unrecognized warning into an
error, allowing the detection machinery to work.
2017-04-17 22:33:01 +03:00
Takuya ASADA
b8f40a2dff dist/ami/files/.bash_profile: warn user when enhanced networking is not enabled
Show warnings on the following conditions:
 - VPC is not used
 - Driver is not an enhanced networking one

Fixes #1984

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488844756-14935-1-git-send-email-syuu@scylladb.com>
2017-04-15 15:16:55 +03:00
Benoît Canet
8f793905a3 perf_sstable: Change busy loop to futurized loop
The blocked task detector introduced in
113ed9e963 was seeing
the initialization phase of perf_sstable as a blocked
task.

Transform this part of the code into a futurized loop
to make the blocked task detector happy.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <20170413132506.17806-1-benoit@scylladb.com>
2017-04-13 18:17:28 +03:00
Amnon Heiman
1dfd32f070 scylla-housekeeping service: Support private repositories
This patch adds support for private repositories to scylla-housekeeping.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-04-13 18:13:57 +03:00
Amnon Heiman
5839dc1f20 scylla-housekeeping-upstart: Use repository id when checking for version

This patch allows the version check to use a private repository when
checking for a version.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-04-13 18:12:52 +03:00
Amnon Heiman
622502de7a scylla-housekeeping: support private repositories
This patch allows the version check to support private repositories.

If a repository file is passed as a parameter, the repository id will be
passed as a parameter when checking the version.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-04-13 18:11:15 +03:00
Avi Kivity
7d16cfa5f0 Merge branch 'penberg/create-index-stmt-cleanup/v1' of github.com:cloudius-systems/seastar-dev
"The version of create_index_statement class that was translated to C++
is pretty old by now. This series of cleanups brings it closer to Apache
Cassandra trunk to make it easier to bring over more secondary index
code to Scylla."

* 'penberg/create-index-stmt-cleanup/v1' of github.com:cloudius-systems/seastar-dev:
  cql3/statements/create_index_statement: Move target validation
  cql3/statements/create_index_statement: Remove static column validation
  cql3/statements/create_index_statement: Extract validations
  cql3/statements/create_index_statement: Kill bogus custom validation
  cql3/statements/create_index_statement: Add materialized view to validate()
  cql3/statements/create_index_statement: Remove validation
2017-04-13 13:27:53 +03:00
Asias He
d27b47595b gossip: Fix possible use-after-free of entry in endpoint_state_map
We take a reference to an endpoint_state entry in endpoint_state_map and
access it again after code which defers; the reference can be invalid
after the defer if someone deletes the entry in the meantime.

Fix this by taking the reference again after the deferring code.

I also audited the code to remove unsafe references to endpoint_state_map
entries as much as possible.
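A minimal sketch of the safe pattern (std::map stands in for the real endpoint_state_map, and the defer point is only marked by a comment):

```cpp
#include <map>
#include <optional>
#include <string>

std::map<std::string, int> endpoint_state_map;

// Instead of holding a reference across code that may defer (letting
// another task erase the entry), re-look-up the entry by key afterwards.
std::optional<int> read_after_defer(const std::string& ep) {
    // ... code that defers / yields to other tasks would run here ...
    auto it = endpoint_state_map.find(ep);   // take the reference again
    if (it == endpoint_state_map.end()) {
        return std::nullopt;                 // entry was deleted meanwhile
    }
    return it->second;
}
```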

Fixes the following SIGSEGV:

Core was generated by `/usr/bin/scylla --log-to-syslog 1 --log-to-stdout
0 --default-log-level info --'.
Program terminated with signal SIGSEGV, Segmentation fault.
(this=<optimized out>) at /usr/include/c++/5/bits/stl_pair.h:127
127     in /usr/include/c++/5/bits/stl_pair.h
[Current thread is 1 (Thread 0x7f1448f39bc0 (LWP 107308))]

Fixes #2271

Message-Id: <529ec8ede6da884e844bc81d408b93044610afd2.1491960061.git.asias@scylladb.com>
2017-04-13 13:18:17 +03:00
Takuya ASADA
81c1b07bac dist: add offline installer
This introduces an offline installer generator.
It will generate a self-extracting archive which contains Scylla packages
and dependency packages.
Package installation starts automatically when the archive is executed.

Limitation: only CentOS is supported at this point.

Fixes #2268

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491997091-15323-1-git-send-email-syuu@scylladb.com>
2017-04-13 13:16:09 +03:00
Avi Kivity
ac48767146 Merge "tracing and cql3 patches" from Vlad
"This series was initially meant to only transition the keyspace based backend to work
on top of prepared statements but there were a few potential issues found on the way.

In addition, the original Tracing series has been expanded with a few patches in the cql3 layer
that improve the generic cql3 layer but are not obvious without the context of the
following Tracing patches.

The "main" patch contains a heavy rework of trace_keyspace_helper:
   - Use prepared statements for updating tables instead of manually constructing mutations:
      - We intentionally decrease the level of code robustness from "paranoid" to "normal".
      - The code gets a lot more simple, e.g. we don't need to cache columns definitions any more.
      - We are losing some performance here, but:
          - Tracing write is not in the fast path.
          - Tracing write events should be rare.
          - Currently the performance loss (for the actual write time of all trace records) for a "SELECT" query with a specific key is
            about 45%: 144us vs 99us."

* 'tracing_rework_using_prepared-v6' of github.com:cloudius-systems/seastar-dev:
  tracing: use prepared statement for updating tables
  tracing::trace_keyspace_helper: add a bad_column_family constructor that accepts an std::exception parameter
  tracing::trace_keyspace_helper: introduce a table_helper class
  tracing::trace_keyspace_helper: add static qualifier to  make_monotonic_UUID_tp() and elapsed_to_micros() methods
  tracing::tracing: allow slow query TTL only in the signed 32-bit integer range
  cql3::query_processor::prepare(): futurize the error case
  cql3::query_options: add a factory method for creation of options for a BATCH statement
  cql3::statements::batch_statement: add a constructor that doesn't receive the "bound_terms" value
  cql3::query_processor: use weak_ptr for passing the prepared statements around
2017-04-13 11:07:49 +03:00
Raphael S. Carvalho
a6f8f4fe24 compaction: do not write expired cell as dead cell if it can be purged right away
When compacting a fully expired sstable, we're not allowing that sstable
to be purged because an expired cell is *unconditionally* converted into a
dead cell. Why not check instead whether the expired cell can be purged
right away, using gc_before and the max purgeable timestamp?

Currently, we need two compactions to get rid of a fully expired sstable
whose cells could have always been purged.

look at this sstable with expired cell:
  {
    "partition" : {
      "key" : [ "2" ],
      "position" : 0
    },
    "rows" : [
      {
        "type" : "row",
        "position" : 120,
        "liveness_info" : { "tstamp" : "2017-04-09T17:07:12.702597Z",
"ttl" : 20, "expires_at" : "2017-04-09T17:07:32Z", "expired" : true },
        "cells" : [
          { "name" : "country", "value" : "1" },
        ]

now this sstable data after first compaction:
[shard 0] compaction - Compacted 1 sstables to [...]. 120 bytes to 79
(~65% of original) in 229ms = 0.000328997MB/s.

  {
    ...
    "rows" : [
      {
        "type" : "row",
        "position" : 79,
        "cells" : [
          { "name" : "country", "deletion_info" :
{ "local_delete_time" : "2017-04-09T17:07:12Z" },
            "tstamp" : "2017-04-09T17:07:12.702597Z"
          },
        ]

now another compaction will actually get rid of data:
compaction - Compacted 1 sstables to []. 79 bytes to 0 (~0% of original)
in 1ms = 0MB/s. ~2 total partitions merged to 0

NOTE:
It's a waste of time to wait for the second compaction: the expired
cell could have been purged at the first one, since it satisfied
gc_before and the max purgeable timestamp.
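The purge decision described above can be sketched as follows (the struct and field names are illustrative, not Scylla's actual API):

```cpp
#include <cstdint>

// An expired cell may be dropped entirely, instead of being rewritten as
// a dead cell, when its deletion time is old enough to be GC'ed and no
// sstable outside the compaction holds older data it could shadow.
struct cell {
    int64_t timestamp;            // write timestamp
    int32_t local_deletion_time;  // seconds since epoch
};

bool can_purge(const cell& c, int32_t gc_before,
               int64_t max_purgeable_timestamp) {
    return c.local_deletion_time < gc_before
        && c.timestamp < max_purgeable_timestamp;
}
```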

Fixes #2249, #2253

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170413001049.9663-1-raphaelsc@scylladb.com>
2017-04-13 10:59:19 +03:00
Vlad Zolotarov
f8956ba01a tracing: use prepared statement for updating tables
In addition to actually moving to using the prepared statements the changes also include:
   - Kill the cache_xxx() methods - the schema is going to be checked during the
     prepared statement creation and during its execution.
   - Move the caching of table ID and the prepared statement to the get_schema_ptr_or_create().
   - Rename: get_schema_ptr_or_create() -> cache_table_info().

After these changes we are less strict in our demands on system_traces table schemas, e.g.
if some column's type is not exactly as we expect but rather only "compatible" in the CQL sense,
we will tolerate this and will continue to write into that table.
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 17:12:42 -04:00
Vlad Zolotarov
baf0289951 tracing::trace_keyspace_helper: add a bad_column_family constructor that accepts an std::exception parameter
An object built with this constructor will use the what() message from the
given exception in the final error message.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 17:04:54 -04:00
Vlad Zolotarov
98864c6c30 tracing::trace_keyspace_helper: introduce a table_helper class
This class contains a general table info and implements standard operations on this table:
   - Creation.
   - Info caching.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 17:04:48 -04:00
Pekka Enberg
8c45038729 cql3/statements/create_index_statement: Move target validation
Move index target validation to preserve the same code structure as
Apache Cassandra and simplify support for multiple index targets.
2017-04-12 20:50:09 +03:00
Pekka Enberg
528c33a05b cql3/statements/create_index_statement: Remove static column validation
Apache Cassandra supports secondary indices on static columns since
commit 9e74891 ("Add support for secondary indexes on static columns").
2017-04-12 20:46:29 +03:00
Pekka Enberg
975e1c8fc6 cql3/statements/create_index_statement: Extract validations
Extract specific validations to separate functions to preserve the same
structure as Apache Cassandra code and make it easier to add support for
multiple index targets.
2017-04-12 20:44:15 +03:00
Pekka Enberg
940d6de1b8 cql3/statements/create_index_statement: Kill bogus custom validation
Rejecting custom indices is bogus because it's just a configuration
mechanism like replication strategy, for example. Furthermore, it's
needed for SASI indices, which we likely need to be compatible with.
2017-04-12 20:17:09 +03:00
Pekka Enberg
cfadb70565 cql3/statements/create_index_statement: Add materialized view to validate()
Apache Cassandra does not support secondary indices on materialized
views so neither should we.
2017-04-12 20:15:20 +03:00
Pekka Enberg
1706f346f2 cql3/statements/create_index_statement: Remove validation
The validation was removed in Apache Cassandra commit 0626be8 ("New 2i
API and implementations for built in indexes"). Let's also remove it
from our code so that we remove one dependency to the obsolete db/index/
code.
2017-04-12 20:14:52 +03:00
Vlad Zolotarov
b4bf0735b8 tracing::trace_keyspace_helper: add static qualifier to make_monotonic_UUID_tp() and elapsed_to_micros() methods
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:08 -04:00
Vlad Zolotarov
aa1f8ccea4 tracing::tracing: allow slow query TTL only in the signed 32-bit integer range
Any TTL is eventually converted into the gc_clock::duration value, which
is based on the int32_t type. Limit the user-configurable node_slow_log TTL
to the same value range for consistency.
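A sketch of the validation described above (the function name is illustrative):

```cpp
#include <cstdint>
#include <limits>
#include <stdexcept>

// A user-supplied TTL in seconds must fit the signed 32-bit range that
// gc_clock::duration is based on; reject anything outside it.
int32_t validate_slow_query_ttl(int64_t ttl_seconds) {
    if (ttl_seconds < 0 ||
        ttl_seconds > std::numeric_limits<int32_t>::max()) {
        throw std::invalid_argument("slow query TTL out of int32 range");
    }
    return static_cast<int32_t>(ttl_seconds);
}
```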

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:08 -04:00
Vlad Zolotarov
1685cefa57 cql3::query_processor::prepare(): futurize the error case
Make sure that errors are reported in a form of an exceptional future and
not by a direct exception throwing.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:08 -04:00
Vlad Zolotarov
fcef9d3b05 cql3::query_options: add a factory method for creation of options for a BATCH statement
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:08 -04:00
Vlad Zolotarov
75fbc7c558 cql3::statements::batch_statement: add a constructor that doesn't receive the "bound_terms" value
This constructor should be used when we know that there are no bound terms in the current
batch statement.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:08 -04:00
Vlad Zolotarov
ff55b76562 cql3::query_processor: use weak_ptr for passing the prepared statements around
Use seastar::checked_ptr<weak_ptr<prepared_statement>> instead of shared_ptr for passing prepared statements around.
This allows an easy tracking and handling of statements invalidation.

This implementation will throw an exception every time an invalidated
statement reference is dereferenced.
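A minimal sketch of the idea, using std::weak_ptr and std::shared_ptr rather than seastar's own weak_ptr and checked_ptr:

```cpp
#include <memory>
#include <stdexcept>
#include <utility>

// Accessing an invalidated prepared statement throws instead of touching
// freed memory.
template <typename T>
class checked_ptr {
    std::weak_ptr<T> _ptr;
public:
    explicit checked_ptr(std::weak_ptr<T> p) : _ptr(std::move(p)) {}

    // Throws if the pointed-to object has been invalidated.
    std::shared_ptr<T> get() const {
        auto sp = _ptr.lock();
        if (!sp) {
            throw std::runtime_error("prepared statement was invalidated");
        }
        return sp;
    }
};
```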

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-04-12 12:24:03 -04:00
Pekka Enberg
ecb8ee4efd cql3/statements: Cleanup create_index_statement.cc
Use namespaces and fix formatting issues to make the source file easier
on the eyes.

Message-Id: <1491977964-26629-1-git-send-email-penberg@scylladb.com>
2017-04-12 16:48:45 +03:00
Avi Kivity
db73cc045f Merge seastar upstream
* seastar e899c0b...2eec212 (4):
  > Merge "add seastar::checked_ptr class" from Vlad
  > resource: reduce default_reserve_memory size to fit low memory environment
  > scripts: posix_net_conf.sh: add --tune net parameter to perftune.py invocation
  > core: weak_ptr: add a weak_ptr(std::nullptr_t) constructor
2017-04-12 13:49:21 +03:00
Paweł Dziepak
0318dccafd lsa: avoid unnecessary segment migrations during reclaim
segment_zone::migrate_all_segments() was trying to migrate all segments
inside a zone to the other one hoping that the original one could be
completely freed. This was an attempt to optimise for throughput.

However, this may unnecessarily hurt latency if the zone is large but
only a few segments are required to satisfy the reclaimer's demands.
Message-Id: <20170410171912.26821-1-pdziepak@scylladb.com>
2017-04-11 08:55:29 +02:00
Gleb Natapov
b4c368a6bc storage_proxy: update correct statistics on range reads
Fixes #2167

Message-Id: <20170405094119.GM8197@scylladb.com>
2017-04-09 18:16:06 +03:00
Glauber Costa
a808c32676 scylla_util: fix issues with cpuset handling
While cpuset.conf is supposed to be set before this is used, not having
a cpuset.conf at all is a valid configuration. The current code will
raise an exception in this case, but it shouldn't.

Also, as noted by Amos, atoi() is not available as a global symbol. Most
invocations were safe, calling string.atoi(), but one of them wasn't.

This patch replaces all usages of atoi() with int(), which is more
portable anyway.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170407172937.17562-1-glauber@scylladb.com>
2017-04-09 16:04:02 +03:00
Glauber Costa
f842aeb07a mark i3 as a supported instance during login
We have recently fixed the ami init scripts to mark i3 as a supported
instance. However, the code to detect whether or not the instance is
supported is duplicated, and called from multiple locations. That means
that when the user logs in, they will see the instance as not supported,
as the test is coming from a different source.

This patch moves it to the scylla_lib.sh utilities script, so we can
share it, and make sure it is right for all locations.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170406201137.8921-1-glauber@scylladb.com>
2017-04-09 12:35:35 +03:00
Glauber Costa
7ef8c6aaec centos: add runtime dependency for perftune script
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170406172949.13264-1-glauber@scylladb.com>
2017-04-09 11:36:26 +03:00
Glauber Costa
ca8ca3b823 scylla_io_setup: change permissions
The script ended up with the default permissions, which lack the
executable bit. Change it, so it can be executed directly.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170407173109.17870-1-glauber@scylladb.com>
2017-04-09 11:36:13 +03:00
Pekka Enberg
a9ad5cc560 dist/redhat: Add node_health_check to package 2017-04-08 08:12:57 +03:00
Tomer Sandler
c49f944483 dist: Add node_health_check script
This patch adds a script for running an automated node health check.

[ penberg: Fold into single commit ]
2017-04-08 08:06:30 +03:00
Avi Kivity
3b19ea8796 Update scylla-ami submodule
* dist/ami/files/scylla-ami 5d73a71...f10db69 (2):
  > use latest repo for amzn kernel
  > do not use duplicated code to test for instance support status
2017-04-07 12:28:45 +03:00
Avi Kivity
f579e4cc46 Merge seastar upstream
* seastar c5dd395...e899c0b (5):
  > perftune: remove psutil dependency
  > metrics: change push_back({...}) to emplace_back(...)
  > prometheus: return the metric prefix
  > Merge "scripts/perftune.py: add disks tuning" from Vlad
  > Use safe name for metric family
2017-04-06 17:10:19 +03:00
Glauber Costa
f4502bfb79 ami: update packer to 1.0.0 so ENA gets enabled
Older versions of packer do not support ENA, and when faced
with the option

      "enhanced_networking": true,

will only actually enable it for the older 82599 VF instances.
Fortunately, packer 1.0 already supports it, and all we have to do is
update it.

While we are at it, let's check if the file is legit before using a
random file we have downloaded from the internet, to avoid breaching our
build process.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170406123952.14708-1-glauber@scylladb.com>
2017-04-06 16:39:19 +03:00
Glauber Costa
039f8d5994 dist/redhat: install the new scylla_util.py file
For the debian package, the files don't have to be listed individually,
but for RPMs they do.

The rpm builds are currently failing with:

error: Installed (but unpackaged) file(s) found:
  /usr/lib/scylla/scylla_util.py

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20170406012336.21081-1-glauber@scylladb.com>
2017-04-06 10:20:32 +03:00
Avi Kivity
ecf5781597 Update scylla-ami submodule
* dist/ami/files/scylla-ami 9e8c36d...5d73a71 (1):
  > support i3 instances
2017-04-05 18:52:46 +03:00
Amnon Heiman
6c1858b275 API:storage_service should support metrics load
Following C* API there are two APIs for getting the load from
storage_service:
/storage_service/metrics/load
/storage_service/load

This patch adds the implementation for
/storage_service/metrics/load

The alternative is to drop one of the APIs and modify the JMX
implementation to use the same API.

Fixes #2245

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170401181520.19506-1-amnon@scylladb.com>
2017-04-05 18:14:19 +03:00
Tomasz Grabiec
d523c60629 sstables: Push fragments from mp_row_consumer so that parser is interrupted less
Currently we return proceed::no after every mutation_fragment which is
to be consumed. This forces the parser to save and reload its state
often. This can be avoided if we pushed the fragments directly from
mp_row_consumer; then we would return proceed::no only when the buffer
fills up.
tests/perf/perf_fast_forward shows 15% increase in throughput of a large partition scan,
from 1.34M frag/s to 1.55M frag/s.
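The pattern described above can be sketched like this (the types are illustrative stand-ins for the sstable reader's):

```cpp
#include <cstddef>
#include <deque>
#include <string>
#include <utility>

enum class proceed { yes, no };

// Instead of stopping the parser after every fragment, push fragments
// into the reader's buffer and stop only once the buffer is full.
struct fragment_pusher {
    std::deque<std::string> buffer;
    std::size_t max_buffered = 8;

    proceed on_fragment(std::string frag) {
        buffer.push_back(std::move(frag));
        // proceed::no forces the parser to save and reload its state,
        // so return it only when we really have to stop.
        return buffer.size() < max_buffered ? proceed::yes : proceed::no;
    }
};
```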

Message-Id: <1490882700-22684-1-git-send-email-tgrabiec@scylladb.com>
2017-04-05 18:10:54 +03:00
Takuya ASADA
da75ce694a dist: migrate python2 scripts to python3
Now we can package python3 .py script on CentOS, no need to keep using
python2 so migrate them to python3.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491402479-7703-1-git-send-email-syuu@scylladb.com>
2017-04-05 17:35:35 +03:00
Takuya ASADA
610bc31b04 dist/redhat/scylla.spec.in: stop compiling .py on rpm
rpmbuild tries to compile any *.py by default, but it causes compilation
errors on python3 code when python2 is the system default (CentOS, RHEL).

So skip compiling it, drop .pyc / .pyo from the package.

Fixes #2235

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1490859768-18900-1-git-send-email-syuu@scylladb.com>
2017-04-05 12:15:22 +03:00
Avi Kivity
58318db4bb Merge "Rewrite scylla_io_setup in python" from Glauber
"Also, this new version supports i3.
Number of requests for i3 is obtained similarly to i2: I have run tests
for a single disk, and then we'll take the amount of disks into account. Other
possible limits are also taken into account, like the max per-shard seastar limit
of 128 in-flight request, and the per-disk limit obtained by sysfs."

* 'python-io-setup' of https://github.com/glommer/scylla:
  rewrite scylla_io_setup in python
  scripts: add python module with common utilities
2017-04-05 10:39:54 +03:00
Avi Kivity
88637c86c2 Update ami submodule
* dist/ami/files/scylla-ami 407e8f3...9e8c36d (1):
  > Switch to Amazon Linux's kernel
2017-04-04 21:55:36 +03:00
Glauber Costa
2fa698ee95 rewrite scylla_io_setup in python
We do it using the new scylla_util.py library. As we do it, we also
enable i3 support.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2017-04-04 14:44:52 -04:00
Glauber Costa
ba7010b7a5 scripts: add python module with common utilities
As we convert more stuff to python, we'll have more opportunities for
sharing code between them. We already do that for the bash scripts with
a file "scylla_lib.sh". We'll do the same for python.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2017-04-04 14:42:19 -04:00
Vlad Zolotarov
c26799c9b0 config: enforce the 'stop' value for commit_failure_policy/disk_failure_policy
Fixes #2246

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1491246164-26612-1-git-send-email-vladz@scylladb.com>
2017-04-04 16:46:36 +03:00
Takuya ASADA
4262edd843 dist: use distribution standard fstrim script instead of our custom one
We recently introduced an fstrim cronjob / systemd timer unit, but some
distributions already have their own.
So let's use them when possible.

Fixes #2233

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491226727-10507-1-git-send-email-syuu@scylladb.com>
2017-04-04 16:44:45 +03:00
Takuya ASADA
72696eff22 dist/common/scripts/scylla_setup: add 'cancel' feature on RAID disk selector prompt
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491217320-10211-2-git-send-email-syuu@scylladb.com>
2017-04-04 15:35:23 +03:00
Takuya ASADA
4eee3dd778 dist/common/scripts/scylla_setup: skip listing DVD drive on block device list for RAID
Fixes #2230

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491217320-10211-1-git-send-email-syuu@scylladb.com>
2017-04-04 15:35:22 +03:00
Takuya ASADA
b087616a6c dist/debian/debian/scylla-server.upstart: export SCYLLA_CONF, SCYLLA_HOME
We are sourcing the sysconfig file on upstart, but forgot to load its
variables into the environment.
So export them.

Fixes #2236

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491209505-32293-1-git-send-email-syuu@scylladb.com>
2017-04-03 16:34:20 +03:00
Pekka Enberg
57c4bed420 Revert "dist: add --options-file /etc/scylla/scylla.yaml on sysconfig"
This reverts commit 58e628eb3d. Takuya says it
does not fix issue #2236.
2017-04-03 16:33:58 +03:00
Takuya ASADA
58e628eb3d dist: add --options-file /etc/scylla/scylla.yaml on sysconfig
After RAID devices are mounted to /var/lib/scylla, scylla-server isn't able to
find the ./conf/ directory since we haven't created symlinks on the volume.

Instead of creating a symlink in scylla_raid_setup, let's specify the scylla.yaml
path as a program argument.

Fixes #2236

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1491056110-1078-1-git-send-email-syuu@scylladb.com>
2017-04-02 11:28:27 +03:00
Vlad Zolotarov
2d8fcde695 init: add a proper message when there is a bad 'seeds' configuration
Fixes #2193

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490912678-32004-1-git-send-email-vladz@scylladb.com>
2017-04-02 10:41:52 +03:00
Avi Kivity
d03207e939 Merge seastar upstream
* seastar 2ebe842...c5dd395 (3):
  > configure.py: Fix unrecognized option error
  > Merge fixes for fstream slow start from Paweł
  > Merge "A cleaner safer metrics layer" from Amnon
2017-04-02 10:17:40 +03:00
Tzach Livyatan
4efee3432b dist/ami: run nodetool status on each login
Running nodetool status on each login to Scylla AMI helps in three ways:
- give the user a quick view of the node and cluster status beyond the current "scylla is active"
- hint to the user about the nodetool and how to use it
- move the first, slow, run of nodetool to the login phase, making the second interactive run much faster

On the down side, it does slow the login by a few seconds.

Signed-off-by: Tzach Livyatan <tzach@scylladb.com>
[ penberg: fix formatting ]
Message-Id: <20170330081711.22038-1-tzach@scylladb.com>
2017-03-30 13:45:48 +03:00
Vlad Zolotarov
761a5eb72f scripts: add fix_system_distributed_tables.py
This script validates schemas of distributed system keyspaces:
system_traces and system_auth.

It tries to add the missing columns and checks that the existing columns
have expected types.

In case of any problem the corresponding info message is printed and non-zero exit
status is returned.

Extra columns are ignored.

The validation function may also be called from another Python program.
It returns True in case of success and False otherwise.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490195645-29905-1-git-send-email-vladz@scylladb.com>
2017-03-29 19:15:05 +03:00
Avi Kivity
5b530aa464 Merge "Use promoted index for skipping in sstable mutation readers" from Tomasz
"sstable_streamed_mutation::fast_forward_to() is changed to use promoted index
(via index_reader) to optimize skipping in large partitions.

In addition to that, sstable mutation_reader is changed to use the index
to skip to the next partition.

Performance impact was evaluated using newly added tests/perf/perf_fast_forward

What's beyond this series:

  - Using index_reader for single-partition reads as well

  - Using index_reader for skipping across ranges in clustering restrictions"

* tag 'tgrabiec/skip-within-partition-using-index-v2' of github.com:cloudius-systems/seastar-dev: (47 commits)
  tests: Add performance test for fast forwarding of sstable readers
  tests: Allow starting cql_test_env on pre-existing data
  config: Allow specifying source when setting value
  tests: sstable: Add test for fast forwarding within partition using index
  sstables: sstable_streamed_mutation: use index in fast_forward_to()
  sstables: Store parsed promoted index in index_entry
  sstables: Add trace-level logging for sstable consumption
  sstables: Define deletion_time earlier
  sstables: Make parsing throw exception on malformed promoted index
  tests: Add tests for ordering of position_in_partition relative to composites
  position_range: Introduce all_clustered_rows() factory method
  position_in_partition: Introduce for_key()/after_key() factory methods
  position_in_partition: Add factory methods for positions around all rows
  position_in_partition: Introduce for_range_start()/for_range_end()
  position_in_partition: Fix friendship declaration
  keys: Introduce is_empty() for prefixes
  position_in_partition: Make comparable with composites
  types: Enhance lexicographical comparators
  compound_compat: Accept marker value in serialize_value()
  compound_compat: Add trichotomic comparator
  ...
2017-03-29 19:01:12 +03:00
Raphael S. Carvalho
023031b0c8 compaction: lcs: fix functionality to feed starved levels
quick introduction to level starvation:
high levels may be left uncompacted (thus starved) for a long time if user
makes something that make they contain little data, such as cleanup or change
of max sstable size (default 160M). Leveled strategy handles this problem as
follow: consider we're compacting L1 to L2. If L3 is starved, we look for one
of its sstable that is fully contained in token range of candidates L1->L2,
so that we won't end up with an overlapping in L2.

now the problem:
the functionality isn't working properly now because range of candidates is
being incorrectly calculated due to an accident when converting the code to
C++. It won't cause an overlap because it's actually being more restrictive
about which sstable from starved level can be used.
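The containment rule described above reduces to a simple interval check; a hedged sketch (token ranges simplified to closed integer intervals, which is not Scylla's real representation):

```cpp
#include <cstdint>
#include <utility>

using token_range = std::pair<int64_t, int64_t>;  // [first, last]

// An sstable from a starved level may join the compaction only if its
// token range is fully contained in the range covered by the current
// candidates, so the output level stays non-overlapping.
bool fully_contained(const token_range& candidates,
                     const token_range& starved_sst) {
    return candidates.first <= starved_sst.first
        && starved_sst.second <= candidates.second;
}
```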

A test case was added to confirm the problem.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170328223753.15398-1-raphaelsc@scylladb.com>
2017-03-29 18:59:46 +03:00
Takuya ASADA
5e0cb39db6 dist: follow DPDK tools directory name change
Fixes #2234

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1490776630-6626-1-git-send-email-syuu@scylladb.com>
2017-03-29 11:38:39 +03:00
Tomasz Grabiec
97742fd4c2 test.py: Enable stack trace on UBSAN errors
Message-Id: <1490769716-10217-1-git-send-email-tgrabiec@scylladb.com>
2017-03-29 11:08:05 +03:00
Tomasz Grabiec
7fd724821b tests: Add performance test for fast forwarding of sstable readers 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
543a484d78 tests: Allow starting cql_test_env on pre-existing data 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
2c775bbb6e config: Allow specifying source when setting value
So that is_set() will be true for that option. Needed in tests which
set some config options in a higher layer, where lower layers then detect
whether the option was set before applying its default.
2017-03-28 18:34:55 +02:00
Tomasz Grabiec
f1aca6d116 tests: sstable: Add test for fast forwarding within partition using index 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
3fbc0bed6e sstables: sstable_streamed_mutation: use index in fast_forward_to() 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
5b36976bf0 sstables: Store parsed promoted index in index_entry 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
a2a8312c78 sstables: Add trace-level logging for sstable consumption 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
5af815bf20 sstables: Define deletion_time earlier 2017-03-28 18:34:55 +02:00
Tomasz Grabiec
5e34743882 sstables: Make parsing throw exception on malformed promoted index
Will be easier to propagate failure to upper layers once parsing is
reused in the index_reader.

The old behavior of ignoring parsing failures is preserved, but the
error is logged now.
2017-03-28 18:34:55 +02:00
Tomasz Grabiec
b40b20387a tests: Add tests for ordering of position_in_partition relative to composites 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
5b813898bc position_range: Introduce all_clustered_rows() factory method 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
622713be60 position_in_partition: Introduce for_key()/after_key() factory methods 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
e27fa712f5 position_in_partition: Add factory methods for positions around all rows 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
b90275f8e3 position_in_partition: Introduce for_range_start()/for_range_end() 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
5c29f4dd04 position_in_partition: Fix friendship declaration 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
212a021fc6 keys: Introduce is_empty() for prefixes 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
2ff6c1705b position_in_partition: Make comparable with composites 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
4e7fe40a70 types: Enhance lexicographical comparators
They now accept optional lexicographical_relation which can be used
to alter position of the element relative to elements prefixed by it.

Example. Let's consider lexicographical ordering on strings. The
position of "bc" in a sample sequence is affected by
lexicographical_relation as follows:

   aa
   aaa
   b
   ba
 --> before_all_prefixed
   bc
 --> before_all_strictly_prefixed
   bca
   bcd
 --> after_all_prefixed
   bd
   bda
   c
   ca
2017-03-28 18:10:39 +02:00
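A minimal sketch of such a comparator (in Python, with hypothetical names; the real code operates on byte sequences, not strings): the relation decides where a key sorts relative to elements it is a prefix of.

```python
from enum import Enum


class Relation(Enum):
    BEFORE_ALL_PREFIXED = 0
    BEFORE_ALL_STRICTLY_PREFIXED = 1
    AFTER_ALL_PREFIXED = 2


def tri_compare(key, relation, element):
    """Return -1 if (key, relation) sorts before element, 1 if after.

    When element is prefixed by key, the relation decides the order;
    otherwise plain lexicographical comparison applies.
    """
    if element.startswith(key):
        if relation is Relation.BEFORE_ALL_PREFIXED:
            return -1
        if relation is Relation.BEFORE_ALL_STRICTLY_PREFIXED:
            # after the key itself, before its strict extensions
            return 1 if element == key else -1
        return 1  # AFTER_ALL_PREFIXED: after every element prefixed by key
    return -1 if key < element else 1
```

With the sample sequence above, ("bc", BEFORE_ALL_PREFIXED) sorts before "bc" itself, ("bc", BEFORE_ALL_STRICTLY_PREFIXED) after "bc" but before "bca", and ("bc", AFTER_ALL_PREFIXED) after "bcd" but before "bd".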
Tomasz Grabiec
6c6be0f7e4 compound_compat: Accept marker value in serialize_value() 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
1331d10811 compound_compat: Add trichotomic comparator 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
2121c2ee8a compound_compat: Make composites printable 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
38e14ab3c8 compound_compat: Introduce composite::serialize_static()
Generalized from static_prefix().
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
bef677b57d compound_compat: Introduce composite_view::last_eoc() 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
18a057aa81 compound_compat: Return composite from serialize_value()
To make the code more type-safe. Also, mark constructor from bytes
explicit.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
680ffd20f5 compound: Use const bytes_view as iterator's value type
The iterator doesn't really allow modifying the underlying component.

This change enables using the iterator with boost::make_iterator_range() and
boost::range::join(), which get utterly confused otherwise.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
123b102dd6 sstables: Skip to next partition using index
Slicing front of a very large partition:

Before:

 offset  read      time [s]     frags     frag/s    aio      [KiB] blocked dropped    cpu
 0       1         0.110960         1          9    992     126956     924       0  92.4%

After:

 offset  read      time [s]     frags     frag/s    aio      [KiB] blocked dropped    cpu
 0       1         0.000784         1       1276      3        344       2       1  37.3%
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
a9252dfc58 sstables: Use separate index readers for lower and upper bounds
So that lower bound can be advanced within the range.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
27d86dfe18 sstables: Enable skipping to cells at data_consume_context level 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
aad943523a sstables: index_reader: Add trace-level logging 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
388315c1ff sstables: Expose index metrics 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
1dbd2e239e sstables: index_reader: Share index lists among other index readers
Direct motivation for this is to be able to use two index readers from
a single mutation reader, one for lower bound of the range and one for
the upper bound of the range, without sacrificing optimization of
avoiding index reads when forwarding to partition ranges which are
close by. After the change, all index readers of a given sstable will
share index buffers, so lower bound reader can reuse the page read by
the upper bound reader.

The reason for using two readers will be so that we are able to skip
inside the partition range, not only outside of it. This is not
possible if we use the same index reader to locate the upper bound of
the range, because we may only advance the cursor.
2017-03-28 18:10:39 +02:00
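The sharing can be sketched like this (a simplified illustration with hypothetical names; the real implementation shares index buffers between readers of one sstable): two cursors consult one shared page cache, so the lower-bound cursor can reuse a page the upper-bound cursor already read.

```python
class shared_index_pages:
    """A page cache shared by all index readers of one sstable."""

    def __init__(self, read_page):
        self._read_page = read_page  # performs the actual (expensive) I/O
        self._pages = {}
        self.reads = 0               # counts real reads, for illustration

    def get(self, page_no):
        # Only the first reader to touch a page pays for the I/O;
        # subsequent readers reuse the cached copy.
        if page_no not in self._pages:
            self.reads += 1
            self._pages[page_no] = self._read_page(page_no)
        return self._pages[page_no]


class index_cursor:
    """One of possibly several cursors over the same shared pages."""

    def __init__(self, pages):
        self._pages = pages

    def entries_in_page(self, page_no):
        return self._pages.get(page_no)
```

Two cursors (lower and upper bound) built over the same shared_index_pages trigger only one read for a page they both visit.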
Tomasz Grabiec
0635d74e17 sstables: Make index_entry copyable
Needed to make the index_list copyable, which is going to be needed to
implement legacy get_index_entries() which returns by value, after
index sharing is implemented.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
e36979da47 sstables: index_reader: Use sstable's schema
Makes for a simpler interface.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
e3e2f037bb sstables: index_reader: Refactor around the concept of a cursor
Index reader already can be queried only with monotonic positions, so
the concept of a cursor is ingrained. Making it explicit will make it easier
to define behavior for forwarding within the partition.

After the change:

 - lower_bound() is renamed to advance_to() and doesn't return
   the position, only advances the cursor

 - data file position for partition under cursor can be obtained
   at any time with data_file_position()
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
27862fa8f6 sstables: index_reader: Narrow down summary range during lookup
Positions passed to lower_bound() must be non-decreasing, so summary
indexes as well.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
02ace99798 sstables: index_reader: Change lookup to work on ring_position_view
In preparation for changing the interface to work not only with ranges.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
cd295e9926 sstables: Avoid moving an sstable
In preparation for adding non-movable members.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
5edb427873 sstables: Remove private constructor
To reduce duplication.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
705bd6da1a sstables: Remove unused method 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
a7301a702f tests: Add missing blocking on fast_forward_to() 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
5fe14735e8 tests: dht: Test ring_position_comparator 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
ff6cca6e9e tests: Add utility for checking total orders 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
d4b6e430ed dht: Introduce ring_position_view 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
55a7cceef5 dht: Move comparison logic from ring_position::tri_compare() to ring_position_comparator
It will soon define common ordering for many objects, not just
ring_position.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
d5e704ca1e sstables: Make key_view constructor from bytes_view explicit 2017-03-28 18:10:39 +02:00
Tomasz Grabiec
65a8920b25 dht: Make min/max tokens capturable by reference
So that they can be later used in views.
2017-03-28 18:10:39 +02:00
Tomasz Grabiec
f6d2a07422 config: Warn on use of [[deprecated]] instead of failing 2017-03-28 18:10:39 +02:00
Calle Wilund
b12b65db92 commitlog/replayer: Bugfix: minimum rp broken, and cl reader offset too
The previous fix removed the additional insertion of "min rp" per source
shard based on whether we had processed existing CF:s or not (i.e. if
a CF does not exist as an sstable at all, we must tag it as zero-rp, and
make the whole shard for it start at the same zero).

This is bad in itself, because it can cause data loss. It does not cause
crashing, however. But it did uncover another old, lingering bug,
namely the commitlog reader initiating its stream wrongly when reading
from an actual offset (i.e. not processing the whole file).
We opened the file stream at the file offset, then tried
to read the file header and magic number from there -> boom, error.

Also, rp-to-file mapping was potentially suboptimal due to using
bucket iterator instead of actual range.

I.e. three fixes:
* Reinstate min position guarding for unencountered CF:s
* Fix stream creation in CL reader
* Fix segment map iterator use.

v2:
* Fix typo
Message-Id: <1490611637-12220-1-git-send-email-calle@scylladb.com>
2017-03-28 10:32:28 +02:00
Duarte Nunes
53014bd762 mutation_source_test: Ensure unique collection elements
Duplicate elements are illegal in collections, so we ensure they only
contain unique ones.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170327161149.8938-4-duarte@scylladb.com>
2017-03-27 18:44:11 +02:00
Duarte Nunes
94d568924d mutation_source_test: Sort collection elements
Ref #1607

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170327161149.8938-3-duarte@scylladb.com>
2017-03-27 18:43:58 +02:00
Duarte Nunes
4963902922 mutation_source_test: Remove extra randomness source
This patch ensures we generate UUIDs using the same randomness source
as all the other values we randomly generate, so that we can get a
deterministic run from the seeds we print.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170327161149.8938-2-duarte@scylladb.com>
2017-03-27 18:43:44 +02:00
Takuya ASADA
b84828b487 dist/common/scripts/scylla_fstrim: don't abort the program when a disk doesn't support TRIM
Do not abort the program until fstrim has been run on all directories.

Fixes #2220

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1490595397-19130-1-git-send-email-syuu@scylladb.com>
2017-03-27 11:43:19 +03:00
Avi Kivity
27c42359bc Merge seastar upstream
* seastar 6b21197...2ebe842 (6):
  > Merge "Various improvements to execution stages" from Paweł
  > app-template: allow apps to specify a name for help message
  > bool_class: avoid initializing object of incomplete type
  > app-template: make sure we can still get help with required options
  > prometheus: Http handler that returns prometheus 0.4 protobuf or text format
  > Update DPDK to 17.02

Includes patch from Pawel to adjust to updated execution_stage interface.
2017-03-26 10:50:21 +03:00
Takuya ASADA
e7697c37b2 dist/common/scripts/scylla_setup: warn twice before constructing RAID volume
Since RAID construction has the potential to destroy user data, warn twice
before executing scylla_raid_setup.

Fixes #1346

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487891797-13109-1-git-send-email-syuu@scylladb.com>
2017-03-26 10:36:23 +03:00
Glauber Costa
f7f187a7f3 docker: do not touch yaml during startup
Users sometimes need to run their own yaml configuration files, and it
is currently annoying to deploy modified files on docker.

One possible solution is to bind mount the file into the docker
container using the -v switch, just like we already do for the data
volume.

The problem with the aforementioned approach is that we have to change
the yaml file to insert the addresses, and that will change the file in
the host (or fail to happen, if we bind mount it read-only).

The solution I am proposing is to avoid touching the yaml file inside
the container altogether. Instead, we can deploy the address-related
arguments that we currently write to the yaml file as Scylla options.

Fixes #2113

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1490195141-19940-1-git-send-email-glauber@scylladb.com>
2017-03-23 16:52:40 +02:00
Pekka Enberg
c5b1908e03 dist/docker: Use stdout as logging output
If early startup fails in docker-entrypoint.py, the container does not
start. It's therefore not very helpful to log to a file _within_ the
container...

Message-Id: <1490275943-23590-1-git-send-email-penberg@scylladb.com>
2017-03-23 16:48:34 +02:00
Calle Wilund
c3a510a08d commitlog_replayer: Do proper const-lookup of min positions for shards
Fixes #2173

Per-shard min positions can be unset if we never collected any
sstable/truncation info for it, yet replay segments of that id.

Wrap the lookups to handle "missing data -> default", which should have been
there in the first place.

Message-Id: <1490185101-12482-1-git-send-email-calle@scylladb.com>
2017-03-22 17:57:09 +02:00
Amnon Heiman
064f5e1b63 row_cache: switch to the metrics layer registration
This patch moves the row_cache metrics registration from collectd to the
metric layer.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170321143812.785-3-amnon@scylladb.com>
2017-03-21 16:42:58 +02:00
Amnon Heiman
a6a13865bf API: remove unneeded refrences to collectd
This patch removes left over references to the collectd from the API.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170321143812.785-2-amnon@scylladb.com>
2017-03-21 16:42:57 +02:00
Vlad Zolotarov
79978c156e transport::server: don't report a Tracing session ID unless requested
Don't report a Tracing session ID unless the current query had a Tracing bit in its
flags.

Although the current master's behaviour is legal, it's suboptimal, and some clients are sensitive to it.
Let's fix that.

Fixes #2179

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490063752-8915-1-git-send-email-vladz@scylladb.com>
2017-03-21 13:57:08 +00:00
Vlad Zolotarov
9dd5b5762d dist: install seastar/scripts/perftune.py together with posix_net_conf.sh
posix_net_conf.sh is currently a wrapper for the perftune.py script, and
perftune.py has to be in the same directory as posix_net_conf.sh.


Fixes #2176.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1490020882-32361-1-git-send-email-vladz@scylladb.com>
2017-03-21 12:31:50 +02:00
Amnon Heiman
7572addfbf column_family: metrics should be registered once
The column_family constructor uses delegation; as such, only the actual
constructor implementation should contain a call to register the
metrics.
The current implementation ends up re-registering the metrics.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170320140817.22214-1-amnon@scylladb.com>
2017-03-21 11:20:29 +02:00
Pekka Enberg
85a127bc78 dist/docker: Expose Prometheus port by default
This patch exposes Scylla's Prometheus port by default. You can now use
the Scylla Monitoring project with the Docker image:

  https://github.com/scylladb/scylla-grafana-monitoring

To configure the IP addresses, use the 'docker inspect' command to
determine Scylla's IP address (assuming your running container is called
'some-scylla'):

  docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla

and then use that IP address in the prometheus/scylla_servers.yml
configuration file.

Fixes #1827

Message-Id: <1490008357-19627-1-git-send-email-penberg@scylladb.com>
2017-03-20 15:29:52 +02:00
Amos Kong
468df7dd5f scylla_setup: match '-p' option of lsblk with strict pattern
On Ubuntu 14.04, lsblk doesn't have the '-p' option, but
`scylla_setup` tries to get the block list with `lsblk -pnr` and
triggers an error.

The current simple pattern matches anywhere in the help content, so it
might match the wrong options.
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e -p
   -m, --perms          output info about permissions
   -P, --pairs          use key="value" output format

Let's use strict pattern to only match option at the head. Example:
  scylla-test@amos-ubuntu-1404:~$ lsblk --help | grep -e '^\s*-D'
   -D, --discard        print discard capabilities

Signed-off-by: Amos Kong <amos@scylladb.com>
Message-Id: <4f0f318353a43664e27da8a66855f5831457f061.1489712867.git.amos@scylladb.com>
2017-03-20 08:10:35 +02:00
Raphael S. Carvalho
7deeffc953 database: serialize sstable cleanup
We're cleaning up sstables in parallel. That means cleanup may need
almost twice the disk space used by all sstables being cleaned up,
if almost all sstables need cleanup and every one will discard an
insignificant portion of its whole data.
Given that cleanup is frequently issued when node is running out of
disk space, we should serialize cleanups in every shard to decrease
the disk space requirement.

Fixes #192.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170317022911.10306-1-raphaelsc@scylladb.com>
2017-03-19 12:33:03 +02:00
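The change can be illustrated with a small sketch (hypothetical helper names; Scylla itself uses seastar primitives, not Python threads): a width-1 gate turns the previously parallel per-shard cleanups into a serial queue, so at most one sstable is being rewritten, and therefore temporarily duplicated on disk, at a time.

```python
import threading


def cleanup_all(sstables, cleanup_one):
    """Run cleanup_one over each sstable, serialized by a width-1 gate.

    The semaphore bounds the extra disk space needed to roughly one
    sstable rewrite at a time, instead of all rewrites at once.
    """
    gate = threading.Semaphore(1)
    results = []

    def run(sst):
        with gate:
            results.append(cleanup_one(sst))

    threads = [threading.Thread(target=run, args=(s,)) for s in sstables]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results
```

Even if many cleanups are submitted concurrently, only one runs inside the gate at any moment.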
Tomasz Grabiec
bb0ce5d8fe Merge "Ensure base and view schema versions match" from Duarte
The mapping between a base table update and a view update is schema
dependent, so we need to ensure the view schema versions match the
base schema version. For example, we match base columns to view
columns by name, so we need to ensure the base and view schemas we're
using for writting are isolated with respect to a previous alter
table statement.

We thus need to match base schema versions with view schema versions,
and we need to do so atomically to ensure that when one fiber sees a
schema, it also sees the complete set of corresponding view schemas.
This series ensures the schemas modified as a result of an alter
table statement are published atomically, under the schema lock. This
way, all the schemas referenced by the database are consistent with
each other when they are observed by other fibers.

Finally, we upgrade the mutation schema before generating the view
updates, to ensure it matches the most recent view schemas the base
replica knows about, registered in the database.

The db::view::view class was replaced by a set of non-member
functions, with its state, which used to reflect only the most recent
schema version, being moved to a new view_info class.
2017-03-17 12:40:00 +01:00
Duarte Nunes
b27da688f9 mutation: Remove dead get_cell() function
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170316234843.23130-1-duarte@scylladb.com>
2017-03-17 11:18:23 +02:00
Pekka Enberg
91b9e0d914 Update scylla-ami submodule
* dist/ami/files/scylla-ami eedd12f...407e8f3 (1):
  > scylla_create_devices: check block device is exists

Fixes #2171
2017-03-17 11:13:07 +02:00
Tomasz Grabiec
3609665b19 lsa: Fix debug-mode compilation error
By moving definitions of setters out of #ifdef
2017-03-16 18:23:05 +01:00
Tomasz Grabiec
88e7b3ff79 lsa: Ensure can_allocate_more_memory() always leaves a gap above seastar's min_free_memory()
One of the goals of can_allocate_more_memory() is to prevent depleting
seastar's free memory close to its minimum, leaving a head room above
that minimum so that standard allocations will not cause reclamation
immediately. Currently the function doesn't take into accoutn actual
threshold used by the seastar allocator, so there could be no gap or
even could go below the minimum.

Fix that by ensuring there's always a gap above min_free_memory().

min_gap was reduced to 1 MiB so that low memory setups are not
impacted significantly by the change.
Message-Id: <1489667863-15099-1-git-send-email-tgrabiec@scylladb.com>
2017-03-16 12:42:50 +00:00
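The check described above can be sketched like this (a simplified illustration with hypothetical names; the real function lives in Scylla's LSA code): a segment allocation is allowed only if it leaves the stated gap above the allocator's low water mark.

```python
MIN_GAP = 1 << 20  # 1 MiB of head room above the allocator's minimum


def can_allocate_more_memory(free_memory, min_free_memory, segment_size):
    # Only allow an LSA segment allocation if, afterwards, free memory
    # still sits at least MIN_GAP above the seastar allocator's low water
    # mark, so standard allocations don't immediately trigger reclamation.
    return free_memory - segment_size >= min_free_memory + MIN_GAP
```

With a 20 MiB low water mark and 1 MiB segments, an allocation is refused once it would leave less than 21 MiB free.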
Tomasz Grabiec
17ede24a77 Update seastar submodule
* seastar 4d25b85...6b21197 (3):
  > core: memory: Expose control of the free memory low water mark
  > scripts: add perftune.py
  > tutorial: make network examples work on multi-core
2017-03-16 13:32:45 +01:00
Pekka Enberg
3afd7f39b5 cql3: Wire up functions for floating-point types
Fixes #2168
Message-Id: <1489661748-13924-1-git-send-email-penberg@scylladb.com>
2017-03-16 11:04:59 +00:00
Avi Kivity
434a4fee28 Merge "tests: Use allocating_section in lsa_async_eviction_test" from Tomasz
"The test allocates objects in batches (allocation is always under a reclaim
lock) of ~3MiB and assumes that it will always succeed because if we cross the
low water mark for free memory (20MiB) in seastar, reclamation will be
performed between the batches, asynchronously.

Unfortunately that's prevented by can_allocate_more_memory(), which fails
segment allocation when we're below the low water mark. LSA currently doesn't
allow allocating below the low water mark.

The solution which is employed across the code base is to use allocating_section,
so use it here as well.

Exposed by recent consistent failures on branch-1.7."

* 'tgrabiec/fix-lsa-async-eviction-test' of github.com:cloudius-systems/seastar-dev:
  tests: lsa_async_eviction_test: Allocate objects under allocating section
  lsa: Allow adjusting reserves in allocating_section
2017-03-16 12:44:14 +02:00
Tomasz Grabiec
cefb6b604a tests: lsa_async_eviction_test: Allocate objects under allocating section 2017-03-16 10:21:10 +01:00
Tomasz Grabiec
4ab8b255da lsa: Allow adjusting reserves in allocating_section 2017-03-16 10:21:10 +01:00
Raphael S. Carvalho
6b6bb38f38 compaction_manager: stop manager after storage io error
Manager will stop itself if a compaction fails due to storage io
error, which unconditionally results in stop of transportation
services.

Fixes #2147.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20170316054538.23423-1-raphaelsc@scylladb.com>
2017-03-16 10:37:47 +02:00
Duarte Nunes
876a514743 database: Upgrade mutation to current schema to push view updates
This patch ensures we upgrade the mutation to the current schema when
generating and pushing view updates, so that it matches the most
up-to-date views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 18:15:27 +01:00
Duarte Nunes
be12a2bf0a db/schema_tables: Atomically publish base and view changes
This patch ensures that the schema merging atomically publishes
schema changes. In particular, it ensures that when a base schema
and a subset of its views are modified together (i.e., upon an alter
table or alter type statement), then they are published together as
well, without any deferring in-between.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 16:35:07 +01:00
Duarte Nunes
e215f25b11 migration_manager: Atomically migrate table and views
This patch changes the migration path for table updates such that the
base table mutations are sent and applied atomically with the view
schema mutations.

This ensures that after schema merging, we have a consistent mapping
of base table versions to view table versions, which will be used in
later patches.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 16:03:56 +01:00
Duarte Nunes
bfb8a3c172 materialized views: Replace db::view::view class
The write path uses a base schema at a particular version, and we
want it to use the materialized views at the corresponding version.

To achieve this, we need to map the state currently in db::view::view
to a particular schema version, which this patch does by introducing
the view_info class to hold the state previously in db::view::view,
and by having a view schema directly point to it.

The changes in the patch are thus:

1) Introduce view_info to hold the extra view state;
2) Point to the view_info from the schema;
3) Make the functions in the now stateless db::view::view non-member;
4) Remove the db::view::view class.

All changes are structural and don't affect current behavior.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 15:50:05 +01:00
Duarte Nunes
a64c47f315 schema: Move raw_view_info outside of raw_schema
In preparation of an upcoming patch, where the schema
won't directly store the raw_view_info.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 15:38:31 +01:00
Duarte Nunes
4b209be8b8 view_info: Rename to raw_view_info
In preparation for upcoming patches, which will deal with
moving the state in db::view::view to view_info.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 15:38:31 +01:00
Paweł Dziepak
dc99197318 Merge "Correctly handle tombstoned collections" from Duarte
"The current implementations of collection_type_impl::is_empty() and
collection_type_impl::difference() don't handle tombstoned collection
mutations correctly. In particular:

- is_empty() considers a collection mutation with a tombstone (and no
  entries) as empty;
- difference() doesn't do set difference between the cells' tombstones,
  and always returns the highest.

Fixes #2152"

* 'collection-diff/v4' of github.com:duarten/scylla:
  mutation_test: Add more test cases for difference()
  mutation_source_test: Randomly generate collection cells
  collection_type_impl: Use set difference for tombstones
  collection_type_impl: A mutation with a tombstone is not empty
2017-03-15 13:39:55 +00:00
Duarte Nunes
143136647a mutation_test: Add more test cases for difference()
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 14:34:01 +01:00
Duarte Nunes
005e4741e3 mutation_source_test: Randomly generate collection cells
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 14:34:01 +01:00
Duarte Nunes
61741a69b6 collection_type_impl: Use set difference for tombstones
This patch fixes collection_type_impl::difference() so it does set
difference for tombstones instead of just returning the larger
one, as difference() is supposed to return only the information in
mutation A that supersedes that in B, given difference(A, B).

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 14:34:01 +01:00
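The intended semantics of the two fixes can be sketched together (a simplified model where a tombstone is just its deletion timestamp, and a higher timestamp supersedes; the real code carries more state): difference(A, B) keeps A's tombstone only if it supersedes B's, and a mutation carrying a tombstone is not empty even with no cells.

```python
def tombstone_difference(a, b):
    """Return a's tombstone if it supersedes b's, else None.

    a and b are deletion timestamps, or None for no tombstone.
    """
    if a is None:
        return None
    if b is None or a > b:
        return a
    return None  # b already covers a; nothing in a supersedes b


def is_empty(cells, tombstone):
    # A collection mutation with a tombstone is not empty,
    # even if it has no cell entries.
    return not cells and tombstone is None
```

So difference no longer just returns the larger tombstone unconditionally, and a cell-less tombstoned mutation is correctly treated as carrying information.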
Duarte Nunes
19fcd2d140 collection_type_impl: A mutation with a tombstone is not empty
This patch changes the collection_type_impl::is_empty() function so
that it doesn't consider empty a collection_mutation which has a
tombstone.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-15 14:34:01 +01:00
Takuya ASADA
b65d58e90e dist/common/scripts/scylla_raid_setup: don't discard blocks at mkfs time
Discarding blocks on a large RAID volume takes too much time, and the user
may suspect the script isn't working correctly, so it's better to skip it
and discard directly on each volume instead.

Fixes #1896

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1489533460-30127-1-git-send-email-syuu@scylladb.com>
2017-03-15 13:13:57 +02:00
Calle Wilund
078589c508 commitlog_replayer: Make replay parallel per shard
Fixes #2098

Replay previously did all segments in parallel on shard 0, which
caused heavy memory load. To reduce this and spread footprint
across shards, instead do X segments per shard, sequentially within each shard.

v2:
* Fixed whitespace errors

Message-Id: <1489503382-830-1-git-send-email-calle@scylladb.com>
2017-03-15 13:07:17 +02:00
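The distribution described above can be sketched as follows (hypothetical names; the real replayer dispatches seastar tasks): segments are partitioned round-robin across shards, and each shard then replays its own share sequentially.

```python
def distribute_segments(segments, n_shards):
    """Round-robin partition of commitlog segments across shards.

    Each shard replays its own list sequentially, instead of shard 0
    replaying every segment in parallel, which spreads the memory
    footprint across shards.
    """
    shards = [[] for _ in range(n_shards)]
    for i, seg in enumerate(segments):
        shards[i % n_shards].append(seg)
    return shards
```

For example, five segments over two shards yield one shard with three segments and one with two, each replayed in order.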
Amnon Heiman
0a2eba1b94 database: requests_blocked_memory metric should be unique
Metrics name should be unique per type.

requests_blocked_memory was registered twice, one as a gauge and one as
derived.

This is not allowed.

Fixes #2165

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314162826.25521-1-amnon@scylladb.com>
2017-03-14 19:36:45 +02:00
Avi Kivity
ed4b5f5a18 Merge seastar upstream
* seastar fd29fd0...4d25b85 (2):
  > core/file: fix EOF detection for file with custom impl
  > tutorial: fix echo server example

Includes patch from Raphael updating checked_file_impl:

"Now file_impl requires dma_read_bulk to be implemented, and for
checked_file_impl it's only a matter of calling dma_read_bulk on
the posix file it wraps."
2017-03-14 13:38:38 +02:00
Takuya ASADA
d016dd4b74 dist: schedule daily fstrim for data directory and commitlog directory
Schedule daily fstrim for the data directory and commitlog directory, which is
recommended by the Scylla doc:
http://www.scylladb.com/doc/admin/#schedule-fstrim

Fixes #1347

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1489447472-2981-1-git-send-email-syuu@scylladb.com>
2017-03-14 11:51:53 +02:00
Amnon Heiman
295a981c61 storage_proxy: metrics should have unique name
Metrics should have unique names. This patch renames the queue-length
metric throttled_writes to current_throttled_writes.

Without it, metrics will be reported twice under the same name, which
may cause errors in the prometheus server.

This could be related to scylladb/seastar#250

Fixes #2163.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <20170314081456.6392-1-amnon@scylladb.com>
2017-03-14 11:19:39 +02:00
Tomasz Grabiec
ed530dfb3a tests: sstables: Add test for skipping within a compressed stream
Refs #2143.
2017-03-13 13:08:24 +01:00
Tomasz Grabiec
1e0af2efc3 Update seastar submodule
* seastar 84a0b70...fd29fd0 (4):
  > Fix smp::submit_to() with function reference
  > execution_stage: add concept restraint for operator()
  > core/temporary_buffer: Add operator==()
  > map_reduce: allow reducer to take accumulated value by rref
2017-03-13 10:13:03 +01:00
Paweł Dziepak
60c6b9a240 Merge "Implement sstable_streamed_mutation::fast_forward_to()" from Tomasz
"This replaces use of a generic forwarding wrapper in sstable reader with
specialized implementation. Forwarding doesn't yet utilize indexes in this
series, only integrates it with mp_row_consumer, which is a prerequisite.

It's still an optimization, since mp_row_consumer will not try to consume
past the range as it used to.

Sending early for easier consumption."

* tag 'tgrabiec/forwarding-of-mp-row-consumer-v2' of github.com:scylladb/seastar-dev:
  sstables: Remove use of forwarding wrapper
  sstables: Implement sstable_streamed_mutation::fast_forward_to()
  sstables: Extract and use clustering_ranges_walker
  tests: sstables: Add test for handling of repeated tombstones
  sstables: Extract writer parameters into config objects
  tests: Move as_mutation_source() helper to header
  tests: Extract ensure_monotonic_positions() to streamed_mutation_assertions
  streamed_mutation: Add streamed_mutation_returning() helper
  tests: mutation_source_test: Add test case for forwarding to a full range
  tests: simple_schema: Add fragment factories
  tests: Extract simple_schema
  sstables: Move workaround for out-of-order range tombstones to mp_row_consumer
  sstables: Drop default mp_row_consumer constructor
  sstables: Swap order of values in "proceed" so that "no" is assigned 0
  util/optimized_optional: Make printable
  position_in_partition: Add is_static_row() in the view
  range_tombstone_stream: Add reset()
  range_tombstone_stream: Add get_next(position_in_partition_view)
  sstables: streamed_mutation: Stop reading when end of slice reached
  sstables: Switch is_in_range() to position_in_partition
2017-03-10 13:55:46 +00:00
Tomasz Grabiec
1f1b516b31 sstables: Remove use of forwarding wrapper 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
d7afab21e7 sstables: Implement sstable_streamed_mutation::fast_forward_to()
Handling of forwarding is done inside mp_row_consumer, because it
allows us to filter out irrelevant data sooner and thus more
efficiently.

Because the static row can now be skipped as well, _skip_clustering_row
was renamed to the more generic _skip_in_progress.
2017-03-10 14:42:22 +01:00
Tomasz Grabiec
4750216387 sstables: Extract and use clustering_ranges_walker
Extracted from mp_row_consumer.
2017-03-10 14:42:22 +01:00
Tomasz Grabiec
88ccc99017 tests: sstables: Add test for handling of repeated tombstones 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
124dde30db sstables: Extract writer parameters into config objects
Also enables users to change the default promoted index block size.
2017-03-10 14:42:22 +01:00
Tomasz Grabiec
ad1e69c4c5 tests: Move as_mutation_source() helper to header 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
6f409d367b tests: Extract ensure_monotonic_positions() to streamed_mutation_assertions 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
dc7b93a326 streamed_mutation: Add streamed_mutation_returning() helper 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
06a964b3a0 tests: mutation_source_test: Add test case for forwarding to a full range 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
929842ad3f tests: simple_schema: Add fragment factories 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
d98f013b07 tests: Extract simple_schema 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
01374c41f2 sstables: Move workaround for out-of-order range tombstones to mp_row_consumer
This is a preliminary step before adding support for fast-forwarding
to mp_row_consumer, so that range handling can be solely in
mp_row_consumer rather than split between it and
sstable_streamed_mutation.

This also alleviates #2080 by reading all tombstones only up to the
first row, after that range tombstones are treated like other
fragments.
2017-03-10 14:42:22 +01:00
Tomasz Grabiec
d41a7c5eb4 sstables: Drop default mp_row_consumer constructor 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
56f1ad7841 sstables: Swap order of values in "proceed" so that "no" is assigned 0 2017-03-10 14:42:22 +01:00
Tomasz Grabiec
58c29be45c util/optimized_optional: Make printable 2017-03-10 14:42:21 +01:00
Tomasz Grabiec
a32cf6c4cc position_in_partition: Add is_static_row() in the view 2017-03-10 14:42:21 +01:00
Tomasz Grabiec
e4db643730 range_tombstone_stream: Add reset() 2017-03-10 14:42:21 +01:00
Tomasz Grabiec
48ad2e2d64 range_tombstone_stream: Add get_next(position_in_partition_view) 2017-03-10 14:42:21 +01:00
Tomasz Grabiec
084747b1ee sstables: streamed_mutation: Stop reading when end of slice reached
As part of this change, skip detection is refactored. This
simplifies reasoning about mp_row_consumer's state a bit, because now
is_mutation() is not reset externally and only depends on the current
position of the reader.

It will prove useful when we extend the mutation reader to decide up
front whether it should skip to the next partition before calling
_context.read(), so that we can, for instance, skip using the index instead.

Fixes #2088.
2017-03-10 14:42:19 +01:00
Duarte Nunes
16bcf8d085 db/schema_tables: Avoid copying keyspace name
This patch changes a lambda argument type so the keyspace name is
passed by reference instead of copying it, in
read_schema_for_keyspaces().

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170309213134.10331-1-duarte@scylladb.com>
2017-03-10 11:03:56 +02:00
Duarte Nunes
d32c848d73 utils/logalloc: Change linkage of hist_options to external
Change linkage of segment_descriptor_hist_options to external to keep
good old GCC5 happy, despite C++11 allowing static linkage of non-type
template arguments.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170309213206.10383-1-duarte@scylladb.com>
2017-03-10 11:02:51 +02:00
Tomasz Grabiec
55358cacc5 sstables: Switch is_in_range() to position_in_partition
Makes it immune to #1446 and is a prerequisite for implementing
forwarding in mp_row_consumer.
2017-03-09 21:15:11 +01:00
Paweł Dziepak
aaae8db033 loggers should not have external linkage
Message-Id: <20170309111034.20929-1-pdziepak@scylladb.com>
2017-03-09 12:27:20 +01:00
Gleb Natapov
d34f3a0440 batchlog: introduce batch_size_fail_threshold_in_kb option
Add batch_size_fail_threshold_in_kb to prevent huge batches from being
applied and causing trouble. Also do not warn or fail if only one
partition is affected.

Fixes: #2128

Message-Id: <20170309111247.GE8197@scylladb.com>
2017-03-09 12:20:17 +01:00
Amnon Heiman
7b04841dda main: Name the http servers
In main there are two http servers that start: the API server and prometheus.
This patch names them accordingly, so that their metrics are more
meaningful.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1489055282-10887-1-git-send-email-amnon@scylladb.com>
2017-03-09 12:30:49 +02:00
Glauber Costa
a7b0a899a3 dist: don't execute dpdk scripts if not in dpdk mode
The scripts do not cope well with being executed inside docker.
Since we don't really need those variables set outside DPDK scenarios,
just don't set them.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488823691-9014-1-git-send-email-glauber@scylladb.com>
2017-03-09 11:40:08 +02:00
Avi Kivity
efd96a448c Merge "Add execution stages" from Paweł
"These patches introduce execution stages to Scylla in order to improve
icache friendliness. The places where stages are added were not chosen
very carefully; rather, they are introduced at the boundaries between
different subsystems: cql, storage proxy and database. This already results in a rather
significant improvement and can be tuned later if necessary.

Performance results:
perf_simple_query -c4 --duration 60
(medians)
          before       after      diff
write   83017.75   242876.04   +192.6%
read    61709.16   168258.26   +172.7%

The real-life improvements aren't as good, because it is much harder
to collect a sufficiently high number of operations in a batch."

Additional benchmarking from Paweł:

"I did some tests on my local setup.

* Latency at light loads

Scylla running on 16 logical CPUs (8 cores) with 64 GB of RAM.
cassandra-stress -rate threads=32

write latency
        master      seda
median   1.2         0.6
95th     1.6         0.8
99th     1.7         0.9
99.9th   2.5         1.3
max     26.4        24.2

Flags '--poll-mode' and '--defragment-memory-on-idle false' didn't improve the situation for master.

See also attached graph write_99.svg and write_999.svg.

read latency
        master      seda
median   0.8         0.6
95th     1.0         0.9
99th     1.1         1.0
99.9th   1.4         1.2
max     18.5        18.0

See also attached graph read_99.svg and read_999.svg.

* Server 100% loaded, dataset fitting in memory (throughput)

Scylla running on 2 cores with 64 GB of RAM.
4x scylla-bench with the uniform workload
(concurrency of each s-b: 512  for writes, 256 for reads).

There were no cache misses during reads.

          master        seda      diff
writes  107722.4   168482.26    +56.4%
reads   51049.48    76158.19    +49.2%

* Server 100% loaded, writes being flushed and compacted (throughput)

Scylla running on 2 cores with 4 GB of RAM.
4x scylla-bench with the uniform workload, concurrency 256 each.

          master        seda      diff
writes  79575.77   114206.11    +43.5%

See attached graph: writes_with_flushes_and_compaction.png (first run: master, second: seda)."

* tag 'pdziepak/scylla-execution-stages/v1-rebased' of github.com:cloudius-systems/seastar-dev:
  transport: make process_request_one() an execution stage
  mutation_query: add an execution stage
  db: make database::query() an execution stage
  db: make apply an execution stage
  storage_proxy: make mutate() an execution stage
  cql3: make batch statement an execution stage
  cql3: make modification statement an execution stage
  cql3: make select statement an execution stage
  mutation_reader: make mutation_source nothrow movable
2017-03-09 11:29:43 +02:00
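The batching idea behind the execution stages merged above can be sketched as follows. This is a rough, synchronous illustration of the concept only; seastar's actual execution_stage is asynchronous, returns futures, and flushes automatically, and all names here are hypothetical:

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Rough, synchronous sketch of the execution-stage idea: instead of
// calling a handler immediately, enqueue its argument and later run the
// whole batch back to back, so the same code stays hot in the icache.
template <typename Arg>
class execution_stage {
    std::function<void(Arg&)> _fn;
    std::vector<Arg> _queue;
public:
    explicit execution_stage(std::function<void(Arg&)> fn) : _fn(std::move(fn)) {}

    // Defer the call: just remember the argument.
    void enqueue(Arg a) { _queue.push_back(std::move(a)); }

    // Run the handler over every queued item; returns the batch size.
    std::size_t flush() {
        std::size_t n = _queue.size();
        for (auto& a : _queue) {
            _fn(a);
        }
        _queue.clear();
        return n;
    }
};
```

The win comes from amortizing instruction-cache misses: the same handler code runs repeatedly over a batch instead of being evicted between calls, which matches the perf numbers quoted in the merge message.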
Paweł Dziepak
74f35864ef transport: make process_request_one() an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
a78501c206 mutation_query: add an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
b5f0e590be db: make database::query() an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
38c1501f4d db: make apply an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
cfde2ad5b4 storage_proxy: make mutate() an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
827357cb08 cql3: make batch statement an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
dce785089a cql3: make modification statement an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
d005b20071 cql3: make select statement an execution stage 2017-03-09 09:27:43 +00:00
Paweł Dziepak
12135dbe21 mutation_reader: make mutation_source nothrow movable 2017-03-09 09:27:43 +00:00
Amnon Heiman
4e8d73098f main: Prometheus should start as early as possible
There is no need to wait when starting the prometheus server, as it is
up to each of the modules to register its metrics when it is ready.

This is especially important when debugging boot issues.

This patch moves the prometheus initialization to an early
stage of the boot sequence.

Fixes #2144

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1489041986-28974-1-git-send-email-amnon@scylladb.com>
2017-03-09 11:26:51 +02:00
Asias He
39d2e59e7e repair: Fix midpoint is not contained in the split range assertion in split_and_add
We have:

  auto halves = range.split(midpoint, dht::token_comparator());

We saw a case where midpoint == range.start. As a result, range.split
will assert, because range.start is marked non-inclusive, so the
midpoint doesn't appear to be contain()ed in the range - hence the
assertion failure.

Fixes #2148

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Asias He <asias@scylladb.com>
Message-Id: <93af2697637c28fbca261ddfb8375a790824df65.1489023933.git.asias@scylladb.com>
2017-03-09 09:09:17 +01:00
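The contain() semantics behind the assertion above can be illustrated with a simplified integer range; plain ints stand in for dht::token, and all names below are hypothetical:

```cpp
#include <cassert>

// With an exclusive start bound, a midpoint equal to range.start is not
// contain()ed, so splitting there must be avoided.
struct token_range {
    int start;
    int end;
    bool start_inclusive;

    bool contains(int v) const {
        bool after_start = start_inclusive ? v >= start : v > start;
        return after_start && v <= end;
    }
};

// The guard the fix needs: only split when the midpoint is truly inside.
bool can_split_at(const token_range& r, int midpoint) {
    return r.contains(midpoint);
}
```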
Avi Kivity
b8e4113dba Merge seastar upstream
* seastar 5861f99...84a0b70 (13):
  > build: don't error out on [[deprecated]] APIs
  > Merge "Introduce execution stages" from Paweł
  > Remove unused include statement
  > http: catch and count errors in read and respond
  > Merge "Adding metrics configuration" from Amnon
  > future: add concepts for map_reduce(), when_all_succeed()
  > doxygen: exclude c-ares directory
  > scripts/posix_net_conf.sh: add --use-cpu-mask option
  > file: take flush into account when calculating size for truncate in optimize_queue()
  > Fixing the prometheus cleanup patch
  > Merge "posix_net_conf.sh: better distribute ingress processing" from Vlad
  > prometheus: code clean up
  > future: relax finally() constraints even more
2017-03-08 20:02:05 +02:00
Tomasz Grabiec
abf8e83c8d gdb: Cast gdb.Values to int
Fails with newer GDB with:

  TypeError: %x format: an integer is required, not gdb.Value

Message-Id: <1488981412-22279-1-git-send-email-tgrabiec@scylladb.com>
2017-03-08 19:43:48 +02:00
Paweł Dziepak
6db6d25f66 Merge "Avoid losing changes to keyspace parameters of system_auth and tracing keyspaces" from Tomek
"If a node is bootstrapped with auto_bootstrap disabled, it will not
wait for schema sync before creating global keyspaces for auth and
tracing. When such schema changes are then reconciled with the schema on
other nodes, they may overwrite changes made by the user before the
node was started, because they will have a higher timestamp.

To prevent that, let's use the minimum timestamp so that the default schema
always loses to manual modifications. This is what Cassandra does.

Fixes #2129."

* tag 'tgrabiec/prevent-keyspace-metadata-loss-v1' of github.com:scylladb/seastar-dev:
  db: Create default auth and tracing keyspaces using lowest timestamp
  migration_manager: Append actual keyspace mutations with schema notifications
2017-03-08 10:59:47 +00:00
Nadav Har'El
506e074ba4 sstable decompression: fix skip() to end of file
The skip() implementation for the compressed file input stream incorrectly
handled the case of skipping to the end of file: In that case we just need
to update the file pointer, but not skip anywhere in the compressed disk
file; In particular, we must NOT call locate() to find the relevant on-disk
compressed chunk, because there is none - locate() can only be called on
actual positions of bytes, not on the one-past-end-of-file position.

Fixes #2143

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20170308100057.23316-1-nyh@scylladb.com>
2017-03-08 12:35:05 +02:00
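The special case described in the commit above can be sketched with a simplified model; locate(), the chunk size, and the field names are illustrative, not the actual sstable code:

```cpp
#include <cassert>

// Skipping to one-past-end must only advance the logical position and
// never look up a compressed chunk, since locate() is only defined for
// positions of actual bytes.
struct compressed_stream {
    long pos = 0;       // logical (uncompressed) position
    long file_len = 0;  // uncompressed length

    // Chunk index for a byte position; undefined (here: -1) for the
    // one-past-end position, because no chunk holds that byte.
    long locate(long p) const {
        if (p >= file_len) return -1;
        return p / 4096;
    }

    bool skip(long n) {
        long target = pos + n;
        pos = target;
        if (target == file_len) {
            return true;            // end of file: just move the pointer
        }
        return locate(target) >= 0; // otherwise we must find the chunk
    }
};
```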
Tomasz Grabiec
d6425e7646 db: Create default auth and tracing keyspaces using lowest timestamp
If the node is bootstrapped with auto_bootstrap disabled, it will not
wait for schema sync before creating global keyspaces for auth and
tracing. When such schema changes are then reconciled with the schema on
other nodes, they may overwrite changes made by the user before the
node was started, because they will have a higher timestamp.

To prevent that, let's use the minimum timestamp so that the default schema
always loses to manual modifications. This is what Cassandra does.

Fixes #2129.
2017-03-07 19:19:15 +01:00
Tomasz Grabiec
06d4ad1bdd migration_manager: Append actual keyspace mutations with schema notifications
There is a workaround for a notification race, which attaches keyspace
mutations to other schema changes in case the target node missed the
keyspace creation. Currently, that generates keyspace mutations on the
spot instead of using the ones stored in the schema tables. Those
mutations would have the current timestamp, as if the keyspace had
just been modified. This is problematic because it may generate an
overwrite of keyspace parameters with a newer timestamp but with stale
values, if the node is not up to date with the keyspace metadata.

That's especially the case when booting up a node without enabling
auto_bootstrap. In such a case the node will not wait for schema sync
before creating auth tables. Such table creation will attach
potentially out-of-date mutations for keyspace metadata, which may
overwrite changes made to keyspace parameters earlier in the
cluster.

Refs #2129.
2017-03-07 19:19:15 +01:00
Avi Kivity
1b5ba63676 sstable: fix unhandled exception in atomic_deletion_manager::delete_atomically()
The current code is asymmetric: the first N-1 shards to delete a set receive
a synthetic future to wait on, while the last deletion receives the result
of the delete operation (which also broadcasts completion to the first N-1
operations).  This results, in case of an error, in the Nth future being
reported as an unhandled error.
Fix by making everything symmetric: all N callers receive a synthetic
future.  Nobody waits for the deletion operation (which still broadcasts its
completion to all waiters, so errors are not lost).

Message-Id: <20170305151607.14264-1-avi@scylladb.com>
2017-03-07 12:41:12 +02:00
Avi Kivity
439b38f5ab Merge "Improvements to counter implementation" from Paweł
"This series adds various optimisations to the counter implementation
(nothing extreme, mostly just avoiding unnecessary operations) as well
as some missing features, such as tracing and dropping timed-out queries.

Performance was tested using:
perf-simple-query -c4 --counters --duration 60

The following results are medians.
          before       after      diff
write   18640.41    33156.81    +77.9%
read    58002.32    62733.93     +8.2%"

* tag 'pdziepak/optimise-counters/v3' of github.com:cloudius-systems/seastar-dev: (30 commits)
  cell_locker: add metrics for lock acquisition
  storage_proxy: count counter updates for which the node was a leader
  storage_proxy: use counter-specific timeout for writes
  storage_proxy: transform counter timeouts to mutation_write_timeout_exception
  db: avoid allocations in do_apply_counter_update()
  tests/counters: add test for apply reversibility
  counters: attempt to apply in place
  atomic_cell: add COUNTER_IN_PLACE_REVERT flag
  counters: add equality operators
  counters: implement decrement operators for shard_iterator
  counters: allow using both views and mutable_views
  atomic_cell: introduce atomic_cell_mutable_view
  managed_bytes: add cast to mutable_view
  bytes: add bytes_mutable_view
  utils: introduce mutable_view
  db: add more tracing events for counter writes
  db: propagate tracing state for counter writes
  tests/cell_locker: add test for timing out lock acquisition
  counter_cell_locker: allow setting timeouts
  db: propagate timeout for counter writes
  ...
2017-03-07 11:48:13 +02:00
Tomasz Grabiec
ecfa9e40de Merge 'duarte/lsa/hist-cleanup/v2' from github.com:duarten/scylla
histogram cleanups from Duarte.
2017-03-07 10:33:50 +01:00
Gleb Natapov
5c4158daac memtable: do not yield while holding reclaim_lock
Holding reclaim_lock while yielding may cause memory allocations to
fail.

Fixes #2139

Message-Id: <20170306153151.GA5902@scylladb.com>
2017-03-06 17:24:22 +01:00
Gleb Natapov
d7bdf16a16 memtable: do not open code logalloc::reclaim_lock use
logalloc::reclaim_lock prevents reclaim from running, which may cause
a regular allocation to fail although there is enough free memory.
To solve that there is allocation_section, which acquires reclaim_lock
and, if an allocation fails, runs the reclaimer outside of the lock and retries
the allocation. This patch uses allocation_section instead of
using reclaim_lock directly in the memtable code.

Fixes #2138.

Message-Id: <20170306160050.GC5902@scylladb.com>
2017-03-06 17:24:22 +01:00
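The allocation_section pattern described in the commit above can be sketched roughly as follows; the toy allocator and all names are hypothetical, not Scylla's actual logalloc API:

```cpp
#include <cassert>

// Try the allocation while reclaim is locked out; on failure, run the
// reclaimer outside the lock and retry instead of failing outright.
struct allocator_state {
    int free_blocks = 0;   // blocks immediately available
    int reclaimable = 0;   // blocks that reclaim could recover
};

bool try_alloc(allocator_state& st) {
    if (st.free_blocks > 0) {
        --st.free_blocks;
        return true;
    }
    return false;
}

void run_reclaimer(allocator_state& st) {
    // Must run with the reclaim lock released; here it just frees blocks.
    st.free_blocks += st.reclaimable;
    st.reclaimable = 0;
}

// Mimics allocation_section: first attempt under the lock, then a
// reclaim-and-retry cycle with the lock released in between.
bool alloc_with_retry(allocator_state& st) {
    if (try_alloc(st)) {       // first attempt, reclaim locked out
        return true;
    }
    run_reclaimer(st);         // lock released: reclaim may run
    return try_alloc(st);      // retry under the lock
}
```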
Avi Kivity
1af9e3a5cb Merge "database: fix the 'nodetool clearsnapshot'" from Vlad
"Work on this series started with fixing the  'nodetool clearsnapshot'.
The current master code  ignores the snapshots in deleted keyspaces (issue #2045).

I noticed that in many places where our code has to build the path to some directory/file,
it simply used sstring(<path1>) + "/" + sstring(<path2>) constructs, which may cause us issues
if somebody decides to compile/run scylla on a non-Unix-based OS, like Microsoft Windows.

I understand that this is a long shot, but if we can make it right now - why not.
The answer is the boost::filesystem::path class - its synchronous parts, of course.

I decided to take an initiative and fix the issues above and then use the fixed code for
fixing the issue #2045:
   - Fix some minor issues in the existing code.
   - Extend the lister class and move it into the separate files outside database.cc.

On the way I've found an issue in the existing code (issue #2071).
This series fixes this one too (PATCH2)."
2017-03-06 16:45:31 +02:00
Glauber Costa
2d620a25fb raid script: improve test for mounted filesystem
The current test for whether or not the filesystem is mounted is weak
and will fail if multiple pieces of the hierarchy are mounted.

util-linux ships with a mountpoint command that does exactly that,
so we'll use that instead.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488742801-4907-1-git-send-email-glauber@scylladb.com>
2017-03-06 15:59:29 +02:00
Gleb Natapov
7f5923f510 storage_service: handle empty token list correctly
boost::split() returns one empty string if called on an empty input.
Trying to cast an empty string to a token value results in a bad_lexical_cast
exception. Fix it by handling the empty token list explicitly.

Message-Id: <20170302125405.GU11471@scylladb.com>
2017-03-06 15:31:33 +02:00
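The pitfall and the fix above can be illustrated with a simple comma splitter that mirrors boost::split()'s behavior on empty input; the helper names are hypothetical:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Simple comma splitter mirroring boost::split's behavior on empty
// input: it yields a single empty string rather than an empty vector.
std::vector<std::string> split_tokens(const std::string& s) {
    std::vector<std::string> out;
    std::string cur;
    for (char c : s) {
        if (c == ',') {
            out.push_back(cur);
            cur.clear();
        } else {
            cur += c;
        }
    }
    out.push_back(cur);   // always pushes, even when s is empty
    return out;
}

// The fix described above: handle the empty token list explicitly
// instead of trying to parse "" as a token.
std::vector<std::string> parse_token_list(const std::string& s) {
    if (s.empty()) {
        return {};        // explicit empty-list handling
    }
    return split_tokens(s);
}
```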
Takuya ASADA
6602221442 dist/redhat: enables discard on CentOS/RHEL RAID0
Since the CentOS/RHEL raid module disables discard by default, we need to
enable it again in order to use it.

Fixes #2033

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488407037-4795-1-git-send-email-syuu@scylladb.com>
2017-03-06 12:21:42 +02:00
Duarte Nunes
ca4f5cabd4 lsa: Extract log_histogram class
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-04 14:47:19 +01:00
Avi Kivity
24d6560fbc Update scylla-ami submodule
* dist/ami/files/scylla-ami d5a4397...eedd12f (3):
  > Rewrite disk discovery to handle EBS and NVMEs.
  > add --developer-mode option
  > trivial cleanup: replace tab in indent
2017-03-04 13:29:32 +02:00
Duarte Nunes
5c73978b68 thrift/handler: Enable Aggregator concept with GCC6_CONCEPT
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170303172509.16844-1-duarte@scylladb.com>
2017-03-04 13:27:16 +02:00
Duarte Nunes
2b6abd5a91 lsa: Make log_histogram more generic
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-03 17:59:17 +01:00
Duarte Nunes
3819e6d55f lsa: log_histogram cleanups
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-03-03 17:09:07 +01:00
Tomasz Grabiec
22199abf50 gc_clock: Remove orphaned comment
Message-Id: <1488381379-8618-1-git-send-email-tgrabiec@scylladb.com>
2017-03-02 12:56:09 +02:00
Tomasz Grabiec
6a83fe5534 Merge 'pdziepak/optimise-commitlog-entry-writer/v1' from seastar-dev.git
From Paweł:

These patches optimise commitlog_entry_writer so that it avoids copying
column mapping, which is a particularly expensive operation.

perf_simple_query -c4 --write --duration 60
(medians)
          before       after      diff
write   79434.35    89247.54    +12.3%

Tested with:

  commitlog_test.py:TestCommitLog.test_commitlog_replay_on_startup
  commitlog_test.py:TestCommitLog.test_commitlog_replay_with_alter_table
  commitlog_test.py:TestCommitLog.test_commitlog_replay_with_counters
2017-03-02 11:37:42 +01:00
Paweł Dziepak
04b80272f2 cell_locker: add metrics for lock acquisition 2017-03-02 09:05:12 +00:00
Paweł Dziepak
00b42c477f storage_proxy: count counter updates for which the node was a leader 2017-03-02 09:05:12 +00:00
Paweł Dziepak
cf193f4b41 storage_proxy: use counter-specific timeout for writes 2017-03-02 09:05:12 +00:00
Paweł Dziepak
d177160f90 storage_proxy: transform counter timeouts to mutation_write_timeout_exception 2017-03-02 09:05:12 +00:00
Paweł Dziepak
f93a766db4 db: avoid allocations in do_apply_counter_update() 2017-03-02 09:05:12 +00:00
Paweł Dziepak
8457f407ef tests/counters: add test for apply reversibility 2017-03-02 09:05:11 +00:00
Paweł Dziepak
3bccca67b9 counters: attempt to apply in place
It is expected that most counter updates just modify the values of
existing shards and can be done in place.
2017-03-02 09:05:11 +00:00
Paweł Dziepak
7604b926e1 atomic_cell: add COUNTER_IN_PLACE_REVERT flag
The general algorithm for merging counter cells involves allocating a
new buffer for the shards. However, it is expected that most of the
applies are just updating the values of existing shards and not adding
new ones, and can therefore be done in place.
Reverting the general and in-place applies, however, requires different
logic, hence the need for an additional flag to differentiate between
them.
2017-03-02 09:05:11 +00:00
Paweł Dziepak
1e3fbddb3a counters: add equality operators 2017-03-02 09:05:11 +00:00
Paweł Dziepak
772c9078d0 counters: implement decrement operators for shard_iterator 2017-03-02 09:05:11 +00:00
Paweł Dziepak
edad5202f3 counters: allow using both views and mutable_views 2017-03-02 09:05:11 +00:00
Paweł Dziepak
2db92e92b2 atomic_cell: introduce atomic_cell_mutable_view 2017-03-02 09:05:11 +00:00
Paweł Dziepak
1293073019 managed_bytes: add cast to mutable_view 2017-03-02 09:05:11 +00:00
Paweł Dziepak
29430ba970 bytes: add bytes_mutable_view 2017-03-02 09:05:11 +00:00
Paweł Dziepak
0ed2352ade utils: introduce mutable_view
std::basic_string_view does not allow modifying the underlying buffer.
This patch introduces a mutable_view which permits that.
2017-03-02 09:05:10 +00:00
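A minimal sketch of the mutable_view idea from the commit above, assuming a plain pointer/length pair; this illustrates the concept only, not Scylla's actual utils class:

```cpp
#include <cassert>
#include <cstddef>

// Like a string view, but over non-const bytes, so the underlying
// buffer can be modified through it.
struct mutable_view {
    char* _data;
    std::size_t _size;

    mutable_view(char* data, std::size_t size) : _data(data), _size(size) {}

    char& operator[](std::size_t i) { return _data[i]; }  // writable access
    std::size_t size() const { return _size; }

    // Drop n bytes from the front, as view types commonly allow.
    void remove_prefix(std::size_t n) { _data += n; _size -= n; }
};
```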
Paweł Dziepak
774241648d db: add more tracing events for counter writes 2017-03-02 09:05:10 +00:00
Paweł Dziepak
277501f42f db: propagate tracing state for counter writes 2017-03-02 09:05:10 +00:00
Paweł Dziepak
2b5c4386b5 tests/cell_locker: add test for timing out lock acquisition 2017-03-02 09:05:10 +00:00
Paweł Dziepak
5af780360f counter_cell_locker: allow setting timeouts 2017-03-02 09:05:10 +00:00
Paweł Dziepak
25173f8095 db: propagate timeout for counter writes 2017-03-02 09:05:10 +00:00
Paweł Dziepak
c122f3b2f8 cell_locker: use internal storage for hashtable 2017-03-02 09:05:10 +00:00
Paweł Dziepak
4702ebf80f counters: use c_c_builder::from_single_shard() when possible 2017-03-02 09:05:10 +00:00
Paweł Dziepak
13ec22ad9a counters: drop tombstone handling in transform update to shards
Encountering tombstones while transforming a counter update from deltas to
shards is expected to be rare, because counter cells cannot
be recreated once removed.
This assumption makes it unnecessary to care much about removed cells
during the delta->shard transformation, as doing so adds complexity to the
code and is not required to produce correct results.
2017-03-02 09:05:10 +00:00
Paweł Dziepak
37485e5b29 counters: optimise counter_cell_builder
This patch attempts to avoid excessive allocations and copies when
constructing counter cells using counter_cell_builder. That involves
adding serializer interface to atomic_cell so that the counter cell can
be directly serialized to the buffer allocated for atomic cell.

counter_cell_builder::from_single_shard() is added as well to avoid
std::vector<> overhead when creating a counter cell from a single shard.
2017-03-02 09:05:10 +00:00
Glauber Costa
9e61a73654 setup: support mount points in raid script
By default, behavior is kept the same. There are deployments in which we
would like to mount data and commitlog in different places - as much as
we have avoided this up until now.

One example is EC2, where users may want to have the commitlog mounted
in the SSD drives for faster writes but keep the data in larger, less
expensive and durable EBS volumes.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <1488258215-2592-1-git-send-email-glauber@scylladb.com>
2017-03-01 19:23:59 +02:00
Tomasz Grabiec
4b6e77e97e db: Fix overflow of gc_clock time point
If query_time is time_point::min(), which is used by
to_data_query_result(), the result of subtracting
gc_grace_seconds() from query_time will overflow.

I don't think this bug would currently have user-perceivable
effects. This affects which tombstones are dropped, but in the uses of
to_data_query_result(), tombstones are not present in the final
data query result, and mutation_partition::do_compact() takes
tombstones into consideration while compacting before expiring them.

Fixes the following UBSAN report:

  /usr/include/c++/5.3.1/chrono:399:55: runtime error: signed integer overflow: -2147483648 - 604800 cannot be represented in type 'int'

Message-Id: <1488385429-14276-1-git-send-email-tgrabiec@scylladb.com>
2017-03-01 18:49:56 +02:00
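The overflow in the UBSAN report above can be sketched with a bare 32-bit seconds counter; the clamping helper below is one possible guard, offered as an illustration rather than the actual fix applied in Scylla:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// INT32_MIN - 604800 wraps around; a guarded subtraction clamps at the
// minimum instead. Assumes grace >= 0.
int32_t saturating_sub_seconds(int32_t t, int32_t grace) {
    // Would t - grace underflow the representable range?
    if (t < std::numeric_limits<int32_t>::min() + grace) {
        return std::numeric_limits<int32_t>::min();   // clamp at the minimum
    }
    return t - grace;
}
```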
Paweł Dziepak
bdac487b5a do not use long_type for counter update 2017-03-01 16:33:37 +00:00
Paweł Dziepak
f25fa6566f db: avoid deserialization when applying counter mutation
In the later stages of counter write path a mutation is produced that
already has all cells transformed to counter shards and can be applied
to the memtable and written to the commitlog.
The current interface expects a frozen mutation, which is suboptimal
for counters. The freeze itself is unavoidable -- it is required by the
commitlog -- but we can avoid the later deserialization of the frozen_mutation
when it is applied to the memtable if we pass the unfrozen mutation
along.
2017-03-01 16:33:37 +00:00
Paweł Dziepak
582d397c41 introduce counter_write_query()
Counter write path involves read-modify-write. That read is guaranteed
to query only a single partition, does not care about dead cells and
expects to receive an unserialized mutation as a result.

Standard mutation queries are able to produce results fit for
counter updates, but the logic involved is much more general (i.e.
slower), hence the addition of a new, counter-specific kind of query.
2017-03-01 16:33:36 +00:00
Paweł Dziepak
426345e1d4 storage_proxy: avoid excessive mutation freezes 2017-03-01 16:33:36 +00:00
Paweł Dziepak
f10eb952d0 coordinator: do not apply counter write twice on leader 2017-03-01 16:33:36 +00:00
Paweł Dziepak
910bff297a to_string: add operator<< overload for std::array<> 2017-03-01 16:33:36 +00:00
Takuya ASADA
ba323e2074 dist/debian/dep: fix broken link of gcc-5, update it to 5.4.1-5
Since gcc-5/stretch=5.4.1-2 was removed from the apt repository, we are no
longer able to build gcc-5.

To avoid the dead link, use the launchpad.net archives instead of apt-get source.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1488189378-5607-1-git-send-email-syuu@scylladb.com>
2017-03-01 17:13:14 +02:00
Tomasz Grabiec
0c84f00b16 query: Fix invalid initialization of _memory_tracker by moving-from-self
Fixes the following UBSAN warning:

  core/semaphore.hh:293:74: runtime error: reference binding to misaligned address 0x0000006c55d7 for type 'struct basic_semaphore', which requires 8 byte alignment

Since the field was not initialized properly, this probably also fixes some
user-visible bug.
Message-Id: <1488368222-32009-1-git-send-email-tgrabiec@scylladb.com>
2017-03-01 11:38:28 +00:00
Duarte Nunes
c0e5964462 database: Explicitly use discard_result()
Values returned from the lambda passed to finally() are immediately
destroyed, so make that explicit by using discard_result().

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170227235541.28330-1-duarte@scylladb.com>
2017-02-28 18:41:19 +02:00
Duarte Nunes
11b5076b3c lsa: Use log histogram for closed segments
This patch replaces the current heap with a logarithmic histogram
to hold the closed segment descriptors.

This histogram stores elements in different buckets according to
their size. Values are mapped to a sequence of power-of-two ranges
that are split into N sub-buckets. Values less than a minimum value
are placed in bucket 0, whereas values bigger than a maximum value
are not admitted.

There is some loss of precision, as segments are now not totally
ordered, and precision decreases the sparser a segment is. This
reduces the cost of the computations needed when freeing
from a closed segment.

Performance results for perf_simple_query -c4 --duration 60
           before       after       diff
read     43954.27    45246.10      +2.9%
write    48911.54    52807.76      +7.9%

Fixes #1442

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170227235328.27937-1-duarte@scylladb.com>
2017-02-28 18:40:38 +02:00
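The bucket mapping described in the commit above can be sketched as follows; the constants and the exact indexing are illustrative, not logalloc's actual log_histogram layout:

```cpp
#include <cassert>
#include <cstdint>

// Each power-of-two range [2^k, 2^(k+1)) is split into kSubBuckets
// sub-buckets, and values below kMinValue all collapse into bucket 0.
constexpr uint64_t kMinValue = 16;
constexpr unsigned kSubBuckets = 4;

unsigned floor_log2(uint64_t v) {
    unsigned r = 0;
    while (v >>= 1) {
        ++r;
    }
    return r;
}

unsigned bucket_of(uint64_t value) {
    if (value < kMinValue) {
        return 0;                        // small values share bucket 0
    }
    unsigned exp = floor_log2(value);    // which power-of-two range
    unsigned base = floor_log2(kMinValue);
    // Position within [2^exp, 2^(exp+1)), scaled to kSubBuckets sub-buckets.
    unsigned sub = static_cast<unsigned>(
        ((value - (uint64_t(1) << exp)) * kSubBuckets) >> exp);
    return 1 + (exp - base) * kSubBuckets + sub;
}
```

Note how precision degrades gracefully: within one power-of-two range, values that land in the same sub-bucket are indistinguishable, which is the "segments are not totally ordered" trade-off the commit mentions.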
Avi Kivity
359fc68283 Merge seastar upstream
* seastar 4d4a58d...5861f99 (9):
  > future: adjust finally constraint to allow any future to be returned from the continuation
  > build: allow specifying the C compiler
  > socket: Change signature (and impls) of socket shutdown to void
  > reactor: give names to OS threads
  > Concepts support
  > core/file: Fix short-read in read_maybe_eof()
  > core/fstream: Avoid issuing read requests beyond _remain
  > tests: Improve assertion failure message
  > reactor: Expose IO stats in a public API
2017-02-28 13:13:35 +02:00
Avi Kivity
c1aac6fa87 build: accept and pass seastar's --c-compiler option 2017-02-28 13:13:02 +02:00
Duarte Nunes
a3873423d6 configure.py: Enable concepts support
This patch enables conditional concept support by propagating
seastar's --enable-gcc6-concepts flag.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170227235028.27490-1-duarte@scylladb.com>
2017-02-28 11:56:22 +02:00
Paweł Dziepak
5d66031b7a sstable: make input_stream_history initializers in-class
sstable has two constructors but only one of them was creating input
stream history objects.
Message-Id: <20170227151734.16928-1-pdziepak@scylladb.com>
2017-02-28 09:22:11 +01:00
Paweł Dziepak
374c8a56ac commitlog: avoid copying column_mapping
It is safe to copy column_mapping across shards. Such a guarantee comes at
the cost of performance.

This patch makes commitlog_entry_writer use the IDL-generated writer to
serialise commitlog_entry so that column_mapping is not copied. This
also simplifies commitlog_entry itself.

Performance difference tested with:
perf_simple_query -c4 --write --duration 60
(medians)
          before       after      diff
write   79434.35    89247.54    +12.3%
2017-02-27 17:05:58 +00:00
Paweł Dziepak
4df4994b71 idl: fix generated writers when member functions are used
When using a member name in an identifier of a generated class or method,
the idl compiler should strip the trailing '()'.
2017-02-27 17:05:58 +00:00
Paweł Dziepak
018d16d315 idl: add start_frame() overload for seastar::simple_output_stream 2017-02-27 17:05:58 +00:00
Paweł Dziepak
0198d8e470 Merge "Introduce streamed_mutation::fast_forward_to()" from Tomasz
"This introduces an API which allows forward navigation in a stream of mutation
fragments. It allows one to consume only a subset of the stream by iteratively
specifying sub-ranges from which fragments should be returned.

API outline:

  When in forwarding mode, the stream does not return all fragments right away,
  but only those belonging to the current range. Initially the current range only
  covers the static row. The stream can be forwarded, even before reaching
  end-of-stream for the current range, to a later range with fast_forward_to().
  Forwarding doesn't change the initial restrictions of the stream; it can only be
  used to skip over data.

  Monotonicity of positions is preserved by forwarding. That is fragments
  emitted after forwarding will have greater positions than any fragments
  emitted before forwarding.

  For any range, all range tombstones relevant for that range which are present
  in the original stream will be emitted. Range tombstones emitted before
  forwarding which overlap with the new range are not necessarily re-emitted.

  When not in forwarding mode, the stream acts as if the current range was equal
  to the full range. This implies that fast_forward_to() cannot be
  used.

  Whether stream is in forwarding mode or not is specified when the stream
  is created, typically via mutation_source interface.

What's left for later series:

  Optimization by providing specialized implementations. This series implements
  forwarding support in all mutation sources via a generic wrapper which simply
  drops fragments."

* tag 'tgrabiec/clustering-fast-forward-to-v2' of github.com:scylladb/seastar-dev:
  tests: mutation_source_tests: Verify monotonicity of positions
  tests: random_mutation_generator: Spread the keys more
  tests: mutation_source_test: Make blobs more easily distinguishable
  tests: streamed_mutation: Test that merged stream passes mutation source tests
  tests: mutation_source_test: Add tests for forwarding of streamed_mutation
  tests: streamed_mutation_assertions: Add methods for navigating the stream
  tests: Add range generators to random_mutation_generator
  partition_slice_builder: Add with_ranges()
  query: Introduce full_clustering_range
  streamed_mutation: Add non-owning variant of mutation_from_streamed_mutation()
  db: Enable creating forwardable readers via mutation_source
  mutation_source: Document liveness requirements
  mutation_source: Cleanup
  db: Replace virtual_reader_type with mutation_source_opt
  partition_version: Refactor make_partition_snapshot_reader() overloads
  database: Fix mutation_source created by as_mutation_source() to not ignore trace_state_ptr
  memtable: Accept all mutation_source parameters
  streamed_mutation: Implement fast_forward_to() in stream merger
  streamed_mutation: Add generic implementation of forwardable streamed_mutation
  streamed_mutation: Add fast_forward_to() API
  position_in_partition: Introduce position_range
  position_in_partition: Introduce position constructor for right after the static row
  streamed_mutation: Make cast to view non-explicit
  streamed_mutation: Make schema() getter non-copying
2017-02-24 10:37:51 +00:00
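The forwarding contract described in the merge message above can be sketched in miniature. This is a hypothetical, simplified model (plain integers as positions, not the real streamed_mutation API) showing how fast_forward_to() skips fragments before the new range while keeping positions monotonic:

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Simplified sketch (not the real Scylla API): fragments are just integer
// positions, and a forwardable stream yields only fragments inside the
// current range. fast_forward_to() moves to a later range; positions stay
// monotonic because the cursor never moves backwards.
class forwardable_stream {
    std::vector<int> _fragments;   // underlying stream, sorted by position
    std::size_t _idx = 0;          // cursor into _fragments
    int _range_start = 0;
    int _range_end = 0;            // current range is [_range_start, _range_end)
public:
    explicit forwardable_stream(std::vector<int> fragments)
        : _fragments(std::move(fragments)) {}

    // Forward to a later range; earlier fragments are skipped, not emitted.
    void fast_forward_to(int start, int end) {
        _range_start = start;
        _range_end = end;
        while (_idx < _fragments.size() && _fragments[_idx] < _range_start) {
            ++_idx;   // drop fragments that fall before the new range
        }
    }

    // Next fragment in the current range, or nullopt at end-of-range.
    std::optional<int> next() {
        if (_idx < _fragments.size() && _fragments[_idx] < _range_end) {
            return _fragments[_idx++];
        }
        return std::nullopt;
    }
};
```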
Tomasz Grabiec
0798ea22c8 tests: mutation_source_tests: Verify monotonicity of positions 2017-02-23 18:50:54 +01:00
Tomasz Grabiec
d0421ba545 tests: random_mutation_generator: Spread the keys more
The deviation was very low so most ranges were very close. Spread them
to test more cases.
2017-02-23 18:50:54 +01:00
Tomasz Grabiec
27ff169b6b tests: mutation_source_test: Make blobs more easily distinguishable
It's easier to compare them if they differ only by a few most
significant bits, than by all bits.
2017-02-23 18:50:53 +01:00
Tomasz Grabiec
182e3f981b tests: streamed_mutation: Test that merged stream passes mutation source tests 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
122562c1cc tests: mutation_source_test: Add tests for forwarding of streamed_mutation 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
1d7e84f770 tests: streamed_mutation_assertions: Add methods for navigating the stream 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
f2feb54fb0 tests: Add range generators to random_mutation_generator 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
f56308597c partition_slice_builder: Add with_ranges() 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
0073df30aa query: Introduce full_clustering_range 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
cbf4601e31 streamed_mutation: Add non-owning variant of mutation_from_streamed_mutation() 2017-02-23 18:50:53 +01:00
Tomasz Grabiec
892d4a2165 db: Enable creating forwardable readers via mutation_source
Right now all mutation source implementations will use
make_forwardable() wrapper.
2017-02-23 18:50:44 +01:00
Tomasz Grabiec
b1d1091906 mutation_source: Document liveness requirements 2017-02-23 18:23:52 +01:00
Tomasz Grabiec
15db80188b mutation_source: Cleanup
- Combine telescopic overloads into one method with default parameters.
- Introduce func_type for a full handler to avoid some duplication.
2017-02-23 18:23:52 +01:00
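The "telescopic overloads" cleanup pattern mentioned above, in a hypothetical miniature (names are illustrative, not the real mutation_source interface): a chain of overloads, each forwarding to the next with one more argument filled in, collapses into a single method with default parameters.

```cpp
#include <cassert>
#include <string>

// Hypothetical illustration, not the real mutation_source code.
// Before: three forwarding overloads. After: one method with defaults.
struct reader_opts {
    static std::string make(const std::string& range = "full",
                            bool forwarding = false) {
        // Default arguments replace the forwarding overload chain.
        return range + (forwarding ? "+fwd" : "");
    }
};
```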
Tomasz Grabiec
586dbaa8d3 db: Replace virtual_reader_type with mutation_source_opt
Virtual reader is a mutation_source.
2017-02-23 18:23:52 +01:00
Tomasz Grabiec
acfad565f0 partition_version: Refactor make_partition_snapshot_reader() overloads
So that streamed_mutation is created in only one of the overloads and
others delegate to that one. Later, common logic will be added to the
construction, and doing this will help avoid duplication.
2017-02-23 18:23:52 +01:00
Tomasz Grabiec
f46ae8128d database: Fix mutation_source created by as_mutation_source() to not ignore trace_state_ptr
It was using the state passed via as_mutation_source(). Let's respect
the mutation_source contract instead, and use the state passed via the
mutation_source invocation.

Technically just a cleanup. Also a prerequisite for more cleanup.
2017-02-23 18:23:52 +01:00
Tomasz Grabiec
2cc27f72ca memtable: Accept all mutation_source parameters 2017-02-23 18:23:52 +01:00
Tomasz Grabiec
53b1a257cc streamed_mutation: Implement fast_forward_to() in stream merger 2017-02-23 18:23:52 +01:00
Tomasz Grabiec
e0a7ed48b0 streamed_mutation: Add generic implementation of forwardable streamed_mutation
A generic but not very efficient wrapper which simply drops
fragments from the original stream.
2017-02-23 18:23:51 +01:00
Tomasz Grabiec
301cd4912b streamed_mutation: Add fast_forward_to() API 2017-02-23 18:23:28 +01:00
Gleb Natapov
2dc56013f8 commitlog: handle cycle() error
Do not ignore a future<> retuned by cycle() since it will produce a
warning in case of an error. Log it instead.

Message-Id: <20170219151811.GN11471@scylladb.com>
2017-02-22 19:15:14 +01:00
Calle Wilund
d5f57bd047 messaging_service: Move log printout to actual listen start
Fixes #1845
The log printout happened before we had actually evaluated the endpoint
to create, and thus never included SSL info.
Message-Id: <1487766738-27797-1-git-send-email-calle@scylladb.com>
2017-02-22 17:08:21 +01:00
Avi Kivity
9b113ffd3e config: enable new sharding algorithm for new deployments
Set murmur3_partitioner_ignore_msb_bits to 12 (enabling the new sharding
algorithm), but do this in scylla.yaml rather than the built-in defaults.
This avoids changing the configuration for existing clusters, as their
scylla.yaml file will not be updated during the upgrade.
Message-Id: <20170214123253.3933-1-avi@scylladb.com>
2017-02-22 11:23:12 +01:00
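For reference, the shipped scylla.yaml change amounts to a single line (the value is taken from the commit message above):

```yaml
# Shipped in scylla.yaml for new deployments; existing clusters keep their
# old file on upgrade, so their sharding behavior is unchanged.
murmur3_partitioner_ignore_msb_bits: 12
```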
Calle Wilund
0a4edca756 counters/cql: allow wormholing actual counter values (with shards) via cql
Adds yet another magic function, "SCYLLA_COUNTER_SHARD_LIST", indicating that
the argument value, which must be a list of tuples <int, UUID, long, long>,
should be inserted as an actual counter value, not as an update.

This of course to allow counters to be read from sstable loader.

Note that we also need to allow timestamps for counter mutations,
as well as convince the counter code itself to treat the data as
already baked. So ugly wormhole galore.

v2:
* Changed flag names
* More explicit wormholing, bypassing normal counter path, to
  avoid read-before-write etc
* throw exceptions on unhandled shard types in marshalling
v3:
* Added counter id ordering check
* Added batch statement check for mixing normal and raw counter updates
Message-Id: <1487683665-23426-2-git-send-email-calle@scylladb.com>
2017-02-22 09:19:46 +00:00
Calle Wilund
0d87f3dd7d utils::UUID: operator< should behave as comparison of hex strings/bytes
I.e., it needs to be an unsigned comparison.
Message-Id: <1487683665-23426-1-git-send-email-calle@scylladb.com>
2017-02-22 09:19:22 +00:00
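The signedness pitfall behind this fix, in a self-contained sketch (hypothetical helpers, not the real utils::UUID code): bytes 0x80..0xff compare as negative when treated as signed, which disagrees with the ordering of the hex-string representation.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Compare n bytes as signed values: 0x80..0xff sort *before* 0x00..0x7f,
// which is the buggy behavior (disagrees with hex-string order).
inline int compare_bytes_signed(const std::int8_t* a, const std::int8_t* b,
                                std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        if (a[i] != b[i]) {
            return a[i] < b[i] ? -1 : 1;
        }
    }
    return 0;
}

// Compare n bytes as unsigned values: matches the hex-string order,
// which is what operator< should do (memcmp behaves the same way).
inline int compare_bytes_unsigned(const std::uint8_t* a, const std::uint8_t* b,
                                  std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        if (a[i] != b[i]) {
            return a[i] < b[i] ? -1 : 1;
        }
    }
    return 0;
}
```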
Tomasz Grabiec
2b2d5c4c7a Update seastar submodule
* seastar 5088065...4d4a58d (3):
  > reactor utilization should return the utilization in 0-1 range
  > collectd should ignore type label in name creation
  > fix append_challenged_posix_file_impl::process_queue() to handle recursion
2017-02-22 09:40:25 +01:00
Calle Wilund
e20b804a65 commitlog/database: Add "release" method to ensure we free segments
On database stop, we do flush memtables and clean up commit log segment usage.
However, since we never actually destroy the distributed<database>, we
don't actually free the commitlog either, and thus never clear out
the remaining (clean) segments. Thus we leave perfectly clean segments
on disk.

This just adds a "release" method to commitlog, and calls it from
database::stop, after flushing CF:s.
Message-Id: <1485784950-17387-1-git-send-email-calle@scylladb.com>
2017-02-21 18:17:47 +01:00
Gleb Natapov
0977f4fdf8 sstable: close sstable_writer's file if writing of sstable fails.
Failing to close a file properly before destroying the file object causes
crashes.

[tgrabiec: fixed typo]

Message-Id: <20170221144858.GG11471@scylladb.com>
2017-02-21 18:17:47 +01:00
Tomasz Grabiec
8fd19a71ff position_in_partition: Introduce position_range 2017-02-21 16:49:36 +01:00
Tomasz Grabiec
78c563ea6a position_in_partition: Introduce position constructor for right after the static row 2017-02-21 16:43:09 +01:00
Tomasz Grabiec
ce58706b50 streamed_mutation: Make cast to view non-explicit 2017-02-21 16:43:09 +01:00
Paweł Dziepak
274bcd415a tests/cql_test_env: wait for storage service initialization
Message-Id: <20170221121130.14064-1-pdziepak@scylladb.com>
2017-02-21 17:05:45 +02:00
Paweł Dziepak
359c617821 db: restore call to check_valid_rp()
5a0955e89d "db: add operations for
applying counter updates" merged two column_family::apply() overloads
into do_apply() in order to reduce code duplication. Unfortunately,
a call to check_valid_rp() didn't survive that change.
Message-Id: <20170221133800.30411-1-pdziepak@scylladb.com>
2017-02-21 15:26:04 +01:00
Tomasz Grabiec
b4fd3a08e6 streamed_mutation: Make schema() getter non-copying 2017-02-21 14:18:57 +01:00
Duarte Nunes
65b21e3a99 schema_registry: Don't leak schemas
When loading a schema asynchronously, we're leaving a strong
reference to the loaded schema in the entry's shared future. This
patch fixes this by storing a shared_promise, which is reset when the
schema is loaded.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <20170220193654.17439-1-duarte@scylladb.com>
2017-02-21 09:56:21 +01:00
Tomasz Grabiec
33457cc9a9 sstables: Fix detection of repeated tombstones
The check was not catching range tombstone repeated immediately after
itself.
Message-Id: <1487596098-17409-1-git-send-email-tgrabiec@scylladb.com>
2017-02-20 15:35:15 +00:00
Tomasz Grabiec
cc439df542 Revert "sstables: Simplify sstable_streamed_mutation::read_next()"
This reverts commit 1e2c01ff49.

We do not detect repeated tombstone if it follows an in-range
tombstone following a skipped clustering row, because _in_progress
will be disengaged after such tombstone is emitted.
Message-Id: <1487596080-21480-1-git-send-email-tgrabiec@scylladb.com>
2017-02-20 15:34:58 +00:00
Vlad Zolotarov
978241d473 database: move lister class into separate files
Move lister class away from database.cc.
This is a preparation for moving it to the seastar library.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:50:40 -05:00
Vlad Zolotarov
34cafa71c3 database: make 'clearsnapshot' delete the snapshots of deleted keyspaces if requested
The current implementation of 'nodetool clearsnapshot' command only deletes the snapshots
of the keyspaces that are alive at the time the command is issued (issue #2045).

This, besides not implementing the spec, prevents users from being able to clear
the disk space occupied by snapshots of deleted keyspaces that are no longer needed
(e.g. snapshots created when KS is deleted).

This patch fixes this issue by making the database::clear_snapshot() scan the data directories
looking for the snapshots to be deleted instead of relying on in-memory data structures.

This patch makes column_family::clear_snapshot() method not needed any more.

Fixes #2045

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:50:40 -05:00
Vlad Zolotarov
e1ee669aff database: lister: add the rmdir() static method
Removes the directory with all its contents (like 'rm -rf <dir name>' shell command).

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:50:40 -05:00
Vlad Zolotarov
53532ba5ff database: lister: pass the parent path object to callbacks
Pass a parent directory boost::filesystem::path object
to the walker and filter callbacks.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:50:37 -05:00
Vlad Zolotarov
b4c970dfc6 database: lister: make the "filter" callback receive directory_entry instead of sstring
Filter should get all information that the caller has in hand that may be used for filtering.

directory_entry has the following information:
   - Type of the entry
   - Its name

For the code that has used lister filters so far this would be enough; however,
it's not hard to imagine a filter that may need the parent directory as well.

We will add the parent directory path in the follow up patches to make the interface
complete.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:46:59 -05:00
Vlad Zolotarov
6f9f0e1b3f database: lister: add "show_hidden" parameter
If show_hidden parameter is set to show_hidden::yes - list hidden entries, otherwise skip them.
By default set to show_hidden::no.

This patch also completely removes default parameters in lister::scan_dir() and replaces them
with a few lister::scan_dir() overloads that ensure that lambdas are always going to
be the last parameter in the parameter list.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:46:58 -05:00
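A minimal sketch of the show_hidden strong-bool idea (hypothetical names; the real code uses a seastar bool_class-style type): a strong enum instead of a bare bool makes call sites self-documenting and defaults to skipping hidden entries.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Strong enum in place of a bare bool parameter.
enum class show_hidden { no, yes };

// Hypothetical stand-in for lister-style filtering: return only the
// entries allowed by the show_hidden policy (dot-files count as hidden).
inline std::vector<std::string>
list_entries(const std::vector<std::string>& names,
             show_hidden hidden = show_hidden::no) {
    std::vector<std::string> out;
    for (const auto& n : names) {
        if (hidden == show_hidden::no && !n.empty() && n.front() == '.') {
            continue;   // skip hidden entries unless explicitly requested
        }
        out.push_back(n);
    }
    return out;
}
```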
Vlad Zolotarov
9aedb191f6 database: lister: if entries' types set is empty - list everything
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:46:58 -05:00
Vlad Zolotarov
cb614f9be4 database: lister::guarantee_type: handle the case when entry type may not be read
There is a possibility that the type of the given entry is not available,
which manifests as an ENOENT or ENOTDIR value set in errno
by the fstat() call for this entry. In this case engine().file_type() will return
a disengaged optional<directory_entry_type> value.

Return the future with the std::runtime_error exception in this case.
This prevents any further use of the disengaged optional value by the code
in the normal flow.

The exception is going to be propagated to the caller and it's the caller's responsibility
to handle it.

Fixes #2071

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-17 17:46:55 -05:00
Tomasz Grabiec
9f63e172fb tests: compaction_manager_test: Fix abort on exception
Message-Id: <1487343901-12745-1-git-send-email-tgrabiec@scylladb.com>
2017-02-17 15:53:55 +00:00
Avi Kivity
113ed9e963 Merge seastar upstream
* seastar 28a143a...5088065 (8):
  > configure.py: switch cmake to build c-ares to do out-of-source-tree build
  > iotune: make sure help is working
  > collectd: send double correctly for gauge
  > tls: make shutdown/close do "clean" handshake shutdown in background
  > tls: Make sink/source (i.e. streams) first class channel owners
  > native-stack: Make sink/source (i.e. streams) first class channel owners
  > posix-stack: Make sink/source (i.e. streams) first class channel owners
  > Merge "Detector for tasks blocked" from Glauber

Fixes #2085.

Packaging updated to require cmake, drop libtool and automake.
2017-02-16 19:34:28 +02:00
Vlad Zolotarov
25502149cf database: lister::scan_dir(): std::move() all that needs to be moved
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
2017-02-16 11:56:44 -05:00
Raphael S. Carvalho
53d9008052 sstables/deletion_manager: kill dead code
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <2b24d9e622238030a737fbbe12b8439853d5d075.1487095059.git.raphaelsc@scylladb.com>
2017-02-16 18:38:54 +02:00
Vlad Zolotarov
f2e4629254 main.cc: expose scylla version as a gauge metrics
Add a new metric that exposes the current ScyllaDB version as a
gauge metrics.

The version is exposed as a label with the "version" key.

Fixes #1979

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Message-Id: <1487083703-27929-1-git-send-email-vladz@scylladb.com>
2017-02-16 16:57:55 +02:00
Piotr Jastrzebski
2b8e340761 Replace deprecated BOOST_MESSAGE with BOOST_TEST_MESSAGE
Boost.Test deprecated BOOST_MESSAGE as early as 1.34, and it has since
been permanently removed. This patch replaces all uses of BOOST_MESSAGE
with BOOST_TEST_MESSAGE.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <f1732018912a864cea229b0f7cd48170fd927dc2.1487238426.git.piotr@scylladb.com>
2017-02-16 10:49:03 +01:00
Avi Kivity
b8c4b35b57 Merge "Fixes for counter cell locking" from Paweł
"This series contains some fixes and a unit test for the logic responsible
for locking counter cells."

* 'pdziepak/cell-locking-fixes/v1' of github.com:cloudius-systems/seastar-dev:
  tests: add test for counter cell locker
  cell_locking: fix schema upgrades
  cell_locker: make locker non-movable
  cell_locking: allow to be included by anyone
2017-02-15 17:36:48 +02:00
Paweł Dziepak
f7f89df782 tests: add test for counter cell locker 2017-02-15 15:09:40 +00:00
Paweł Dziepak
2eb3e35815 cell_locking: fix schema upgrades
* cell_entry destructor may be called when the former is unlinked
 * update pointer to schema in partition_entry on schema upgrade
 * use correct bucket count when creating a new hash table
2017-02-15 15:09:40 +00:00
Paweł Dziepak
fa9c712263 cell_locker: make locker non-movable
The locker keeps iterators to data that is potentially stored internally,
and moving the object would invalidate them.
2017-02-15 13:48:47 +00:00
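Why a self-referential type must be non-movable, in a hypothetical miniature (not the real cell_locker): a type holding a pointer or iterator into its own storage would, after a move, be left referring into the moved-from object.

```cpp
#include <cassert>
#include <type_traits>

// Hypothetical sketch: _current points into this object's own storage,
// so moving the object would leave _current dangling. Deleting the move
// operations makes that impossible at compile time.
class locker {
    int _state = 0;
    int* _current = &_state;           // self-reference into own storage
public:
    locker() = default;
    locker(locker&&) = delete;         // moving would invalidate _current
    locker& operator=(locker&&) = delete;
    int& current() { return *_current; }
};
```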
Paweł Dziepak
fc9145671d cell_locking: allow to be included by anyone 2017-02-15 13:48:47 +00:00
Tomasz Grabiec
9da078a18a tests: logalloc_test: Print debugging info and abort on failure
The test started to fail sporadically on jenkins after
7a00dd6985 due to quiesce() timing out. It's not
clear, though, whether this is a regression: before that series such timeouts
did not cause a test failure if the future eventually resolved; the timeout
was only logged.

I was not able to reproduce it on my setup nor on jenkins, so let's add more
debugging output and trigger a coredump next time the test fails.

Message-Id: <1487089576-27147-1-git-send-email-tgrabiec@scylladb.com>
2017-02-15 14:41:49 +02:00
Takuya ASADA
9c8515eeed dist/redhat: stop backporting ninja-build from Fedora, install it from EPEL instead
ninja-build-1.6.0-2.fc23.src.rpm was deleted from the Fedora web site for some
reason, but ninja-build-1.7.2-2 is available on EPEL, so we no longer need to
backport it from Fedora.

Fixes #2087

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1487155729-13257-1-git-send-email-syuu@scylladb.com>
2017-02-15 12:58:00 +02:00
Paweł Dziepak
a5476b4e7d Merge "Emit only range tombstones relevant for query restrictions" from Tomasz
"Immediate reason to do this is to ensure that forwarding of streamed_mutation
will give the same mutations as slicing would, and have unit tests which
verify that those two access methods are consistent with each other.

Secondary reason is performance, to avoid processing unnecessary data.

Note that this should not cause digest mismatch of data queries during rolling
upgrade, because data queries are checksumming only tombstones affecting rows
in the results, so only relevant tombstones.

Fixes #1254."

* tag 'tgrabiec/only-relevant-range-tombstones-v2' of github.com:scylladb/seastar-dev:
  tests: mutation_source_test: Test that slicing returns only relevant range tombstones
  tests: Pass all mutation source parameters
  tests: mutation_source_tests: Ensure timestamps are strictly monotonic
  tests: streamed_mutation_assertions: Add more expectation methods
  tests: streamed_mutation_assertions: Make produces_end_of_stream() give better error messages
  sstables: Simplify sstable_streamed_mutation::read_next()
  sstables: Emit only relevant range tombstones
  range_tombstone: Introduce end_position()
  position_in_partition: Print position when printing fragment
  position_in_partition: Make printable
  position_in_partition: Add cast to view
  position_in_partition: Generalize from-bound_view constructor
  bound_view: Extract converters for range start and end bounds
  mutation_partition: Drop unneeded range tombstones
  mutation_partition: Simplify row removal
  range_tombstone_list: Introduce erase()
  partition_snapshot_reader: Emit only relevant tombstones
  range_tombstone_stream: Add slicing apply() overload
  range_tombstone_list: Introduce slice()
2017-02-14 11:18:51 +00:00
Tomasz Grabiec
7ec8c4cf54 tests: mutation_source_test: Test that slicing returns only relevant range tombstones 2017-02-13 20:52:50 +01:00
Tomasz Grabiec
2b8bd10dca tests: Pass all mutation source parameters 2017-02-13 20:52:49 +01:00
Tomasz Grabiec
25dffef6ae tests: mutation_source_tests: Ensure timestamps are strictly monotonic 2017-02-13 16:19:32 +01:00
Tomasz Grabiec
e6a95fd8cc tests: streamed_mutation_assertions: Add more expectation methods 2017-02-13 16:19:32 +01:00
Tomasz Grabiec
62843175ea tests: streamed_mutation_assertions: Make produces_end_of_stream() give better error messages 2017-02-13 16:19:32 +01:00
Tomasz Grabiec
1e2c01ff49 sstables: Simplify sstable_streamed_mutation::read_next()
mp_row_consumer doesn't split row fragments on repeated range
tombstones any more.
2017-02-13 16:12:16 +01:00
Tomasz Grabiec
6324876f24 sstables: Emit only relevant range tombstones 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
72d74b7b40 range_tombstone: Introduce end_position() 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
79d982cd86 position_in_partition: Print position when printing fragment 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
f2e1f2938b position_in_partition: Make printable 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
fb67dab548 position_in_partition: Add cast to view 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
9ddb5a5173 position_in_partition: Generalize from-bound_view constructor
We will need to create positions corresponding to general range
bounds, not only corresponding to range tombstones.
2017-02-13 16:12:16 +01:00
Tomasz Grabiec
69911a87d3 bound_view: Extract converters for range start and end bounds 2017-02-13 16:12:16 +01:00
Tomasz Grabiec
2489a0f82e mutation_partition: Drop unneeded range tombstones
Fixes #1254.
2017-02-13 16:12:16 +01:00
Tomasz Grabiec
884858078a mutation_partition: Simplify row removal 2017-02-13 16:12:15 +01:00
Tomasz Grabiec
fb42366552 range_tombstone_list: Introduce erase() 2017-02-13 16:12:15 +01:00
Tomasz Grabiec
fcf3391785 partition_snapshot_reader: Emit only relevant tombstones
Refs #1254.
2017-02-13 16:12:15 +01:00
Tomasz Grabiec
440e50b76a range_tombstone_stream: Add slicing apply() overload 2017-02-13 16:12:15 +01:00
Tomasz Grabiec
8b7f93175c range_tombstone_list: Introduce slice() 2017-02-13 16:12:15 +01:00
Paweł Dziepak
8b1d34f39d mutation_partition_serializer: avoid creating atomic_cell object
write_{live, counter, expiring, dead}_cell() take a const reference to
an atomic cell as argument. However, their caller (which is
write_row_cells) passes them an atomic_cell_view.
There is an appropriate implicit constructor, so instead of a compiler
complaint we get atomic_cell objects being constructed from views, which
involves an allocation and a copy.
Message-Id: <20170213100106.9071-1-pdziepak@scylladb.com>
2017-02-13 11:23:23 +01:00
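The bug pattern described above, reduced to a generic sketch (hypothetical names, using std::string/std::string_view in place of atomic_cell/atomic_cell_view): the implicit converting constructor silently materializes a copy at every call, while taking the view type directly avoids it.

```cpp
#include <cassert>
#include <string>
#include <string_view>

static int g_copies = 0;   // counts how many owning cells get constructed

// Heavy owning type with an *implicit* constructor from the view type,
// mirroring atomic_cell's implicit construction from atomic_cell_view.
struct cell {
    std::string data;
    cell(std::string_view v) : data(v) { ++g_copies; }  // allocates + copies
};

// Taking const cell& triggers the implicit conversion when called
// with a view: a hidden allocation and copy per call.
inline std::size_t write_cell_by_value(const cell& c) { return c.data.size(); }

// Taking the view type directly avoids any copy.
inline std::size_t write_cell_by_view(std::string_view v) { return v.size(); }
```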
Avi Kivity
bcf34b9a58 Merge seastar upstream
* seastar 83a41c8...28a143a (5):
  > prometheus: send one MetricFamily per unique metric name
  > tests: Add test for circular_buffer::erase()
  > circular_buffer: Introduce erase()
  > protect against infinite do_until loop
  > metrics: alternative metrics creation with labels
2017-02-12 21:56:11 +02:00
Gleb Natapov
bb72425b61 storage_proxy: fix send_to_endpoint() to use correct create_write_response_handler() overload
There are several problems with storage_proxy::send_to_endpoint() right
now: it uses the create_write_response_handler() overload that is specific
to read repair, which is suboptimal and creates incorrect logs; it does not
process errors; and it does not hold the storage_proxy object until the write
is complete. This patch fixes all of these problems.

Message-Id: <20170208101949.GA19474@scylladb.com>
Reviewed-by: Nadav Har'El <nyh@scylladb.com>
2017-02-12 10:46:13 +02:00
Tomasz Grabiec
c70ebc7ca5 lsa: Make reclaim_timer enclose segment_pool::reclaim_segments()
LSA timing did not include segment migration. It does after this
change.
Message-Id: <1486657046-9378-1-git-send-email-tgrabiec@scylladb.com>
2017-02-09 17:07:59 +00:00
Avi Kivity
5f15388e7a Merge "Size-based buffering of mutation_fragments" from Paweł
"This series changes buffering of mutation fragments in streamed mutations
so that the size of the fragments is taken into account.
The original implementation buffered up to 16 fragments, which was pretty
much meaningless, since it could be far too much if the fragments were
large or nowhere near enough if they were small.

Fixes #2036."

* 'pdziepak/buffer-mfs-by-size/v1' of github.com:cloudius-systems/seastar-dev:
  streamed_mutation: size-based mutation_fragment buffer limit
  mutation_fragment: cache size in memory
  mutation_fragment: make write access more explicit
2017-02-09 16:42:42 +02:00
Paweł Dziepak
3079b1661e streamed_mutation: size-based mutation_fragment buffer limit
Currently, streamed mutations buffer up to 16 mutation fragments. This
may be too much, not enough or a perfect choice depending on the
mutation fragment size.

This patch makes the streamed mutation choose how many mutation fragments to
keep in the buffer depending on their size, so that we avoid using too
much memory in case of large mutation fragments and are able to buffer a
lot of fragments if they are small.
2017-02-09 10:51:11 +00:00
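The size-based policy, in a hypothetical miniature (not the real streamed_mutation buffer): instead of capping the buffer at a fixed fragment count, cap the total buffered bytes, so many small fragments fit while a few large ones already fill the buffer.

```cpp
#include <cassert>
#include <cstddef>
#include <deque>
#include <string>

// Simplified sketch: fragments are strings, fullness is decided by total
// buffered bytes rather than by a fixed fragment count.
class fragment_buffer {
    std::deque<std::string> _buf;
    std::size_t _buffered_bytes = 0;
    std::size_t _max_bytes;
public:
    explicit fragment_buffer(std::size_t max_bytes) : _max_bytes(max_bytes) {}

    bool is_full() const { return _buffered_bytes >= _max_bytes; }

    void push(std::string fragment) {
        _buffered_bytes += fragment.size();
        _buf.push_back(std::move(fragment));
    }

    std::string pop() {
        std::string f = std::move(_buf.front());
        _buf.pop_front();
        _buffered_bytes -= f.size();
        return f;
    }

    std::size_t size() const { return _buf.size(); }
};
```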
Paweł Dziepak
cd0dc7734a mutation_fragment: cache size in memory 2017-02-09 10:50:51 +00:00
Paweł Dziepak
354ce0b2c7 mutation_fragment: make write access more explicit
mutation_fragments are going to be caching their size in memory. In
order to be able to invalidate that correctly, they need to know when
that size may change (but avoid invalidation when it is not necessary).
2017-02-09 10:49:46 +00:00
Avi Kivity
9530bac2d6 Merge "Adding metrics using histogram and labels" from Amnon
"This series uses the newly added histogram and label support to add metrics to
the storage_proxy and to the column_family.

This would add latency and histogram and the missing metrics from column family."

* 'amnon/histogram_metrics' of github.com:cloudius-systems/seastar-dev:
  database: add metrics registration for the column family
  storage_proxy: add read and write latency histogram
  estimated_histogram: returns a metrics histogram
2017-02-09 11:40:57 +02:00
Avi Kivity
9e4ae0763d Merge "Disallow mixed schemas" from Paweł
"This series makes sure that schemas containing both counter and non-counter
regular or static columns are not allowed."

* 'pdziepak/disallow-mixed-schemas/v1' of github.com:cloudius-systems/seastar-dev:
  schema: verify that there are not both counter and non-counter columns
  test/mutation_source: specify whether to generate counter mutations
  tests/canonical_mutation: don't try to upgrade incompatible schemas
2017-02-07 18:03:28 +02:00
Paweł Dziepak
4cbbbc67f0 schema: verify that there are not both counter and non-counter columns 2017-02-07 15:17:14 +00:00
Paweł Dziepak
4ffe0401ee test/mutation_source: specify whether to generate counter mutations
Tests using the random mutation generator should be provided with both
counter and non-counter mutations to ensure that both cases are
sufficiently covered. However, mixed schemas (with both counter and
non-counter columns) are not allowed so the RMG has to be explicitly
told whether to use counter or non-counter schema.
2017-02-07 15:17:14 +00:00
Paweł Dziepak
294bf0bb7a tests/canonical_mutation: don't try to upgrade incompatible schemas
Test case test_reading_with_different_schemas uses randomly generated
pairs of mutations and tries to upgrade one to the schema of the other.

However, there are cases when one schema cannot be upgraded to another,
for example, counter and non-counter schemas.
2017-02-07 15:17:14 +00:00
Amnon Heiman
292c08f598 database: add metrics registration for the column family
This patch adds a metrics registration to the column_family.

Using labels, each column family metric is labeled with its keyspace and
column family name.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-02-06 18:27:01 +02:00
Amnon Heiman
2cf13c26e2 storage_proxy: add read and write latency histogram
Register the read and write latency histogram on the metrics layer.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-02-06 17:54:47 +02:00
Amnon Heiman
1e3cfe7396 estimated_histogram: returns a metrics histogram
The metrics histogram is a struct that describes a histogram.
This patch adds a getter method that lets estimated_histogram return
a metrics::histogram; this allows registering it as a histogram
metric.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2017-02-06 17:34:43 +02:00
Avi Kivity
afe98f8572 Merge "Materialized views: support new entries" from Duarte
"This patchset adds bits of the MV write-path, enough to support
new entries to be added. Note that this is still limited, as only
adding new rows to a base table will work correctly."

* 'materialized-views/insert-path/v4' of https://github.com/duarten/scylla: (30 commits)
  database: Apply mutation to views
  column_family: Push view replica update
  materialized views: partial mutate_MV
  materialized views: function to send a mutation to endpoint
  materialized views: add VIEW write type
  database: Ensure new write_type is correctly printed
  materialized views: match base and view replicas
  column_family: Generate view updates
  column_family: Adds affected_views() function
  view: Add view_update_builder class
  range_tombstone_accumulator: Expose current tombstone
  range_tombstone_accumulator: apply() takes value
  view_updates: Generate updates
  view_updates: Adds function to replace row
  view_updates: Update view entry
  view_updates: Delete old view entry
  mutation_partition: Introduce shadowable tombstone
  view_updates: Create view entry
  view_updates: Compute row marker
  view: Introduce view_updates class
  ...
2017-02-06 15:10:38 +02:00
Duarte Nunes
0eca6301d3 database: Apply mutation to views
This patch changes the database apply path so that it also
generates the mutations for the column family's views and
sends them to the paired view replicas.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:37:33 +01:00
Duarte Nunes
4777172348 column_family: Push view replica update
This patch adds a function to push updates to the view replicas of a
particular base table.
2017-02-06 13:36:45 +01:00
Nadav Har'El
3ae73164a4 materialized views: partial mutate_MV
This adds a function mutate_MV() which takes view mutations and sends
them to the appropriate nodes (this may be the current node, or a
remote node).

This is only a partial implementation - we still don't do the local
batch log (to survive reboots and failures) and some other stuff which
is left commented out.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Nadav Har'El
f2fd81ece0 materialized views: function to send a mutation to endpoint
Add a function for sending one mutation to one remote replica owning
this mutation. This is needed for materialized views, where each
base replica sends each view mutation to one particular view replica.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2017-02-06 13:36:45 +01:00
Nadav Har'El
92fc7386f6 materialized views: add VIEW write type
This adds to the "write_type" enum also the "VIEW" write type.
To be honest, I don't understand why the "write_type" distinction
is important.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
11bd3bd29f database: Ensure new write_type is correctly printed
By removing the default case in the switch statement over a write_type
variable, we ensure the compiler warns us about lack of exhaustiveness
in case we add a value to the enum but forget to change the
corresponding operator<<().

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
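The exhaustiveness technique described above, as a small self-contained sketch (hypothetical enumerators): with no `default:` label, compilers warn (e.g. GCC/Clang `-Wswitch`) whenever a new enumerator like VIEW is added but not handled in the switch.

```cpp
#include <cassert>
#include <string>

// Hypothetical write_type enum; the real one lives in the database code.
enum class write_type { simple, batch, counter, view };

inline std::string to_string(write_type t) {
    // No default: case — adding an enumerator without a case here
    // triggers a -Wswitch warning instead of silently falling through.
    switch (t) {
    case write_type::simple:  return "SIMPLE";
    case write_type::batch:   return "BATCH";
    case write_type::counter: return "COUNTER";
    case write_type::view:    return "VIEW";
    }
    return "?";   // unreachable; silences missing-return warnings
}
```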
Nadav Har'El
365df8f900 materialized views: match base and view replicas
A function to find the appropriate replica to send a view update to.

This patch creates a new source file db/view/view.cc. We should
eventually move a lot more of the materialized views code there.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2017-02-06 13:36:45 +01:00
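The pairing described above (each base replica sends each view mutation to exactly one view replica) can be sketched as follows. This is an illustrative Python model, not the actual code in db/view/view.cc; the index-based pairing scheme and the function name are assumptions made for clarity:

```python
def paired_view_replica(base_replicas, view_replicas, this_node):
    """Pair each base replica with the view replica at the same position
    in the ordered replica lists, so that every view update is sent by
    exactly one base replica to exactly one view replica."""
    return view_replicas[base_replicas.index(this_node)]

base = ["node1", "node2", "node3"]
view = ["node4", "node5", "node6"]
# node2 (a base replica) sends its view updates only to its pair, node5.
target = paired_view_replica(base, view, "node2")
```

Note that with a positional pairing every view replica receives updates from exactly one base replica, which keeps the update fan-out constant.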
Duarte Nunes
16206e9f15 column_family: Generate view updates
This patch adds the generate_view_updates() function to the
column_family class, which will use the view_update_builder to
generate updates to the column_family's materialized views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
90cb35db04 column_family: Adds affected_views() function
This patch adds the affected_views() function to determine which of the
column family's views a given update affects.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
d5a61a8c48 view: Add view_update_builder class
This patch adds the view_update_builder class, which is responsible
for calculating the mutations to apply to a column family's
materialized views, given a streamed_mutation representing an update
to the base table and a streamed_mutation representing the
pre-existing rows which the update covers.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
2ab9ba995a range_tombstone_accumulator: Expose current tombstone
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
f3c5ea392a range_tombstone_accumulator: apply() takes value
range_tombstone_accumulator::apply() now takes a value so the caller
can decide whether to move or copy the argument.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
3991a58f08 view_updates: Generate updates
This patch adds the view_updates::generate_update() function to
generate view updates given a base row update and the corresponding,
pre-existing row. This function will decide which of the previously
introduced functions to call based on whether there is a pre-existing
row and whether there exists a regular base column that's part of the
view's PK.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
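The dispatch this message describes can be modeled roughly as below. This is an illustrative Python sketch based only on the commit message; the function name, argument names, and return labels are assumptions, not the real C++ interface:

```python
def choose_view_operation(existing_row, has_regular_base_column_in_view_pk):
    """Pick which view-update operation applies to a base row update:
    no pre-existing row -> create a new view entry; a pre-existing row
    whose view key cannot change -> update the entry in place; a view
    key derived from a regular base column -> the key may have moved,
    so delete the old entry and create a new one."""
    if existing_row is None:
        return "create_entry"
    if not has_regular_base_column_in_view_pk:
        return "update_entry"
    return "delete_old_entry + create_entry"
```

The third branch corresponds to the "replace row" patch above, which is itself a deletion of the old entry followed by the creation of a new one.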
Duarte Nunes
861d2dfb61 view_updates: Adds function to replace row
This patch adds a function to replace a view row given a base
table update and the pre-existing row, which simply deletes the old
view entry and adds a new one.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
7901ce7de4 view_updates: Update view entry
This patch introduces the view_updates::update_entry function,
which creates the updates to apply to the existing view entry given
the base table row before and after the update.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
b34ae6d6da view_updates: Delete old view entry
This patch introduces the view_updates::delete_old_entry function,
which creates a view row mutation to delete an entry given an updated
base table row.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
Duarte Nunes
7e150a18eb mutation_partition: Introduce shadowable tombstone
This patch introduces shadowable row tombstones. A shadowable row
tombstone is valid only if the row has no live marker. In other words,
the row tombstone is only valid as long as no newer insert is done
(thus setting a live row marker; note that if the row timestamp set
is lower than the tombstone's, then the tombstone remains in effect
as usual).

If a row has a shadowable tombstone with timestamp Ti and that row
is updated with a timestamp Tj, such that Tj > Ti (and that update
sets the row marker), then the shadowable tombstone is shadowed by
that update. A concrete consequence is that if the update has cells
with timestamp lower than Ti, then those cells are preserved (since
the deletion is removed), and this is contrary to a regular,
non-shadowable row tombstone where the tombstone is preserved and
such cells are removed.

Currently, only Materialized Views require shadowable row tombstones,
which solve a problem with view row deletions. Consider a base row with
columns p, v1, v2, PRIMARY KEY (p) denormalized into a view row consisting
of columns p, v1, v2 PRIMARY KEY (p, v1), and the following operations:

1) INSERT INTO base (p, v1, v2) VALUES (0, 0, 1) USING TIMESTAMP 0;
2) UPDATE base SET v1 = 1 USING TIMESTAMP 1 WHERE p = 0;
3) UPDATE base SET v1 = 0 USING TIMESTAMP 2 WHERE p = 0;

Without shadowable tombstones, the view contains:

At 1), pk = (0, 0), row_marker@T0, v2=1@T0
At 2), pk = (0, 0), row_marker@T0, row_tombstone@T1, v2=1@T0
       pk = (0, 1), row_marker@T1, v2=1@T0
At 3), pk = (0, 0), row_marker@T2, row_tombstone@T1, v2=1@T0
       pk = (0, 1), row_marker@T1, row_tombstone@T2, v2=1@T0

Notice how, if we read row (0, 0), the value of v2 will be shadowed by
the row tombstone we previously inserted.

With a view's row tombstone becoming shadowable, at 3) the row (0, 0)
will look like pk = (0, 0), row_marker@T2, shadowable_tombstone@T1, v2=1@T0,
which is equivalent to pk = (0, 0), row_marker@T2, v2=1@T0.

Since the shadowable tombstone is shadowed by the new row marker (T0 <
T2), now v2 would be taken into account.

Finally, note that this patch doesn't generalize the idea of
shadowable tombstone, instead taking advantage of the fact that they
are only needed by Materialized Views. This saves changing the
tombstone representation to account for an extra flag, the bits such
representation would require, and also avoids changes to the storage
format.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:36:45 +01:00
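The rule in the example above can be modeled in a few lines. This is an illustrative Python sketch of the semantics only, not Scylla's tombstone representation: a regular tombstone always drops cells with older timestamps, while a shadowable one stops applying once a newer row marker exists:

```python
def visible_cells(marker_ts, tomb_ts, shadowable, cells):
    """cells maps column name -> write timestamp. A regular row
    tombstone drops every cell with timestamp <= tomb_ts; a shadowable
    tombstone is itself dropped once the row marker is newer than it."""
    if shadowable and marker_ts > tomb_ts:
        return dict(cells)  # tombstone shadowed by the newer insert
    return {name: ts for name, ts in cells.items() if ts > tomb_ts}

# Step 3) of the example: row (0, 0) with row_marker@T2, a tombstone@T1
# and v2=1@T0.
regular = visible_cells(marker_ts=2, tomb_ts=1, shadowable=False, cells={"v2": 0})
shadow = visible_cells(marker_ts=2, tomb_ts=1, shadowable=True, cells={"v2": 0})
```

With a regular tombstone the v2 cell is lost; with a shadowable one the newer row marker cancels the tombstone and v2 survives, matching the example.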
Duarte Nunes
e0f642180f view_updates: Create view entry
This patch introduces the view_updates::create_entry function, which
creates a view row mutation given a new base table row.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:31 +01:00
Duarte Nunes
b8b8a8099c view_updates: Compute row marker
This patch adds a function to compute the row marker of a view row
given the base row. There are two cases to consider when building the
row marker: 1) there is a column C that is a regular base column but
is in the view PK; and 2) the columns for the base and the view PKs
are the same.

For 1), the view row marker timestamp will be the biggest between the
base's row marker and C. The TTL will be that of C. This means that if
C expires, the view row marker will expire as well (and the row, if no
other column is keeping it alive). Note that if the base row marker
expires but not C, then the base row will still be live due to C and
we shouldn't expire the view row.

For 2), the view row timestamp will be the same as the base row
timestamp. The TTL should be set in such a way that both base and view
rows live for the same time. We thus set the view row TTL to be the
max of any other TTL in the base row. This is particularly important
in the case where the base row marker has a TTL, but a column *absent*
from the view holds a greater one.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:31 +01:00
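The two cases can be summarized in a short sketch. This is illustrative Python, not the real implementation; `math.inf` stands in for "no TTL" and all names are assumptions:

```python
import math

NO_TTL = math.inf  # stands in for a cell that never expires

def view_row_marker(base_marker, cells, view_pk_base_column=None):
    """base_marker and each cells[name] are (timestamp, ttl) pairs.
    Case 1: a regular base column C is in the view PK -> the marker
    timestamp is max(base marker, C) and the TTL is C's, so the view
    row expires together with C.
    Case 2: base and view PK columns are the same -> the base marker
    timestamp, with the max TTL of the base row, so base and view rows
    live equally long."""
    ts, ttl = base_marker
    if view_pk_base_column is not None:
        c_ts, c_ttl = cells[view_pk_base_column]
        return max(ts, c_ts), c_ttl
    return ts, max([ttl] + [c_ttl for _, c_ttl in cells.values()])

# Case 1: v1 (in the view PK) was written later, with a 100s TTL.
case1 = view_row_marker((5, NO_TTL), {"v1": (7, 100)}, "v1")
# Case 2: a column absent from the view holds a greater TTL (600s).
case2 = view_row_marker((5, 60), {"v2": (5, 600)})
```

In the second call the view row TTL is taken from v2 even though v2 is not in the view, which is exactly the situation the last paragraph highlights.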
Duarte Nunes
7321938bcf view: Introduce view_updates class
This patch introduces the view_updates class, which is responsible
for generating and storing updates to a particular materialized view.

The updates will be generated from an updated base row and the
pre-existing one (if any), in later patches.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:31 +01:00
Duarte Nunes
0f8dbc9243 collection_type_impl: Iterate over collection cells
This patch introduces the collection_type_impl::for_each_cell()
function, which allows the caller to iterate over the cells of a
particular collection_mutation_view.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:31 +01:00
Duarte Nunes
082ef56df1 view: Store pk view column that's non-pk in the base
To help calculate the view mutations from a base update, we store in
the view class the column that's part of the view's primary key but
not part of the base's, if such a column exists.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
734ad80390 view: Add matches_view_filter() function
This patch adds the matches_view_filter() function which specifies
whether a given base row matches the view filter. Unlike
may_be_affected_by(), this function has no false positives.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
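The relationship between the two checks can be illustrated like this. A hedged Python sketch, not the real API; representing restrictions as predicate functions is an assumption for illustration:

```python
def may_be_affected_by(filter_columns, updated_columns):
    """Conservative pre-check (false positives allowed): the update
    touches at least one column the view depends on."""
    return bool(filter_columns & updated_columns)

def matches_view_filter(view_filter, row):
    """Exact check (no false positives): every restriction in the
    view filter holds for this base row."""
    return all(pred(row.get(col)) for col, pred in view_filter.items())

# A view filtering on v1 = 0.
vf = {"v1": lambda x: x == 0}
```

The cheap conservative check prunes irrelevant updates early; the exact check then decides precisely which rows to denormalize.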
Duarte Nunes
7be0f319d4 single_column_restriction: Filter clustering rows
This patch adds the is_satisfied_by() function to
single_column_restriction, which given a clustering row returns
whether the restriction applies or not.

This is useful for secondary indexing such as materialized views,
where filters on regular columns precisely select which base table
rows to denormalize.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
3b52440ff3 statement_restrictions: Expose non-pk restrictions
This patch exposes the non-primary key column restrictions in a given
select statement, exposing them as single_column_restrictions.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
e987d87ab1 collection_type_impl: Identify concrete types
This patch adds the is_set() and is_list() functions to
collection_type_impl, which identify the concrete collection
type.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
71faa4a4eb abstract_restriction: Rename uses_function()
This patch renames abstract_restriction::uses_function() to
term_uses_function(), as it was previously hiding a function with the
same name in the restriction base class.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
21d1bbb527 view: Add may_be_affected_by function
This patch adds the may_be_affected_by() function to the view class,
which is responsible for determining whether an update to a base table
affects one of its views.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
c35d14e285 column_family: Store a pointer to view
Instead of storing the view in the column_family's map of materialized
views, store a lw_shared_ptr so that the view can be removed while it
is being updated.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Duarte Nunes
69171c28f0 cql3/util: Fix use-after-free
This patch fixes a use-after-free error in
rename_column_in_where_clause(), where we were creating a boost
adaptor on an rvalue.

Signed-off-by: Duarte Nunes <duarte@scylladb.com>
2017-02-06 13:35:30 +01:00
Avi Kivity
3896c27e5f Merge "DNS use in scylla" from Calle
"Fixes #1531

Adds lookup to gms::inet_address and uses it in (hopefully all) the
salient places where configured symbolic names are interpreted.

Removes the dummy dns modula in scylla in favour of the seastar one."

* 'calle/use-dns' of github.com:cloudius-systems/seastar-dev:
  remove scylla dns code
  service::storage_service: Remove dependency on scylla dns
  main.cc: remove scylla dns dependency
  main/init: Lookup inet addresses from config by dns lookup
  db::system_keyspace: Find rpc_address by lookup
  gms::inet_address: Add lookup functionality.
  scylla tls: Add option support for client auth and tls opts
2017-02-06 13:50:42 +02:00
Avi Kivity
da8d00199e Merge 2017-02-06 13:43:07 +02:00
Avi Kivity
fdfabbf8bb Merge seastar upstream
* seastar f07f8ed...83a41c8 (8):
  > Cleaning the metrics API
  > tutorial: pick the name "asynchronous function".
  > tutorial: explain the difference between exception and exception future
  > tutorial: abstract
  > ninja: don't bother building c-ares shared libraries
  > ninja: unbreak build ordering
  > ninja: unbreak "ninja -t clean"
  > Add libtool to dependencies
2017-02-06 13:42:38 +02:00
Calle Wilund
44503f8253 remove scylla dns code
Use seastar facilities instead.
2017-02-06 11:36:57 +00:00
Calle Wilund
ab800c225a service::storage_service: Remove dependency on scylla dns
Use seastar facilities instead
2017-02-06 11:36:57 +00:00
Calle Wilund
c4c4eb06c4 main.cc: remove scylla dns dependency
Use seastar facilities instead.
2017-02-06 11:36:57 +00:00
Avi Kivity
b18e54307f tests: add --operations-per-shard option to perf_simple_query
This helps achieve more repeatable runs that can then be compared via the
Linux perf tool.  The option overrides duration-based testing and runs the
test for a specific number of iterations.
Message-Id: <20170204172937.8462-1-avi@scylladb.com>
2017-02-06 12:08:04 +01:00
Gleb Natapov
3c372525ed storage_proxy: use storage_proxy clock instead of explicit lowres_clock
Merge commit 45b6070832 used a butchered version of the storage_proxy
patch to adjust to the rpc timer change, instead of the one I sent. This
patch fixes the differences.

Message-Id: <20170206095237.GA7691@scylladb.com>
2017-02-06 12:51:36 +02:00
Calle Wilund
feffc2bbe1 main/init: Lookup inet addresses from config by dns lookup
That is, allow symbolic names in addition to IP addresses.
2017-02-06 09:45:37 +00:00
Calle Wilund
ef26ab0e1b db::system_keyspace: Find rpc_address by lookup 2017-02-06 09:45:37 +00:00
Calle Wilund
0a740b5ccf gms::inet_address: Add lookup functionality.
To find addresses by name.
2017-02-06 09:45:37 +00:00
Calle Wilund
ff8f82f21c scylla tls: Add option support for client auth and tls opts
Refs #1813 (fixes scylla part)

Added require_client_auth and priority_string options to
server_encryption_options/client_encryption_options and process them.

Allows TLS method/algorithm specification. Also enables enforcing
known-certificate authentication for both node-to-node and client
communication.
2017-02-06 09:45:09 +00:00
Avi Kivity
6e9e28d5a3 cell_locking: work around for missing boost::container::small_vector
small_vector doesn't exist on Ubuntu 14.04's boost, use std::vector
instead.
2017-02-05 20:48:36 +02:00
Avi Kivity
2510b756fc dist: add build dependency on automake
Needed by seastar's c-ares.
2017-02-05 20:16:27 +02:00
Takuya ASADA
e82932b774 dist/common/systemd: introduce scylla-housekeeping restart mode
scylla-housekeeping needs to run in 'restart mode' to check the version during
a scylla-server restart, which wasn't triggered by the systemd timer, so add it.

The existing scylla-housekeeping.timer is renamed to scylla-housekeeping-daily.timer,
since it runs 'daily mode'.

Fixes #1953

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1486180031-18093-1-git-send-email-syuu@scylladb.com>
2017-02-05 10:46:04 +02:00
Avi Kivity
4175f40da1 dist: add libtool build dependency for seastar/c-ares 2017-02-05 10:42:53 +02:00
Takuya ASADA
12b5e7288d dist/common/scripts/scylla_setup: show restart message when SELinux was disabled on the script
Disabling SELinux requires a server restart, so warn the user to restart
before running Scylla.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1485817393-25919-2-git-send-email-syuu@scylladb.com>
2017-02-05 10:10:18 +02:00
Takuya ASADA
c28a574b9e dist/common/scripts: stop setting hugepages boot parameter
Stop setting hugepages boot parameter since we don't use it on default
configuration (posix mode), but keep scylla_bootparam_setup to setup clocksource
on AMI.

Fixes #1758

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1485817393-25919-1-git-send-email-syuu@scylladb.com>
2017-02-05 10:10:18 +02:00
Paweł Dziepak
37b0c71f1d cell_locking: fix partition_entry::equal_compare
The comparator constructor took the schema by value instead of by const
l-value reference and, consequently, later tried to access an object
that had been destroyed long ago.
Message-Id: <20170202135853.8190-1-pdziepak@scylladb.com>
2017-02-03 19:49:18 +01:00
Avi Kivity
7a00dd6985 Merge "Avoid avalanche of tasks after memtable flush" from Tomasz
"Before, the logic for releasing writes blocked on dirty worked like this:

  1) When region group size changes and it is not under pressure and there
     are some requests blocked, then schedule request releasing task

  2) request releasing task, if no pressure, runs one request and if there are
     still blocked requests, schedules next request releasing task

If requests don't change the size of the region group, then either some request
executes or there is a request releasing task scheduled. The number of scheduled
tasks is at most one; there is a single releasing thread.

However, if requests themselves would change the size of the group, then each
such change would schedule yet another request releasing thread, growing the task
queue size by one.

The group size can also change when memory is reclaimed from the groups (e.g.
when they contain sparse segments). Compaction may start many request releasing
threads due to group size updates.

Such behavior is detrimental for performance and stability if there are a lot
of blocked requests. This can happen on 1.5 even with modest concurrency
because timed out requests stay in the queue. This is less likely on 1.6 where
they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in the
system. When the amount of scheduled tasks reaches 1000, polling stops and
server becomes unresponsive until all of the released requests are done, which
is either when they start to block on dirty memory again or run out of blocked
requests. It may take a while to reach pressure condition after memtable flush
if it brings virtual dirty much below the threshold, which is currently the
case for workloads with overwrites producing sparse regions.

I saw this happening in a write workload from issue #2021 where the number of
request releasing threads grew into thousands.

Fix by ensuring there is at most one request releasing thread at a time. There
will be one releasing fiber per region group which is woken up when pressure is
lifted. It executes blocked requests until pressure occurs."

* tag 'tgrabiec/lsa-single-threaded-releasing-v2' of github.com:cloudius-systems/seastar-dev:
  tests: lsa: Add test for reclaimer starting and stopping
  tests: lsa: Add request releasing stress test
  lsa: Avoid avalanche releasing of requests
  lsa: Move definitions to .cc
  lsa: Simplify hard pressure notification management
  lsa: Do not start or stop reclaiming on hard pressure
  tests: lsa: Adjust to take into account that reclaimers are run synchronously
  lsa: Document and annotate reclaimer notification callbacks
  tests: lsa: Use with_timeout() in quiesce()
2017-02-02 17:49:31 +02:00
Tomasz Grabiec
2fd339787b tests: lsa: Add test for reclaimer starting and stopping 2017-02-01 17:41:56 +01:00
Tomasz Grabiec
f943296da0 tests: lsa: Add request releasing stress test 2017-02-01 17:41:55 +01:00
Tomasz Grabiec
e40fb438f5 lsa: Avoid avalanche releasing of requests
Before, the logic for releasing writes blocked on dirty worked like
this:

  1) When region group size changes and it is not under pressure and
     there are some requests blocked, then schedule request releasing
     task

  2) request releasing task, if no pressure, runs one request and if
     there are still blocked requests, schedules next request
     releasing task

If requests don't change the size of the region group, then either
some request executes or there is a request releasing task
scheduled. The number of scheduled tasks is at most one; there is a
single thread of execution.

However, if requests themselves would change the size of the group,
then each such change would schedule yet another request releasing
thread, growing the task queue size by one.

The group size can also change when memory is reclaimed from the
groups (e.g. when they contain sparse segments). Compaction may start
many request releasing threads due to group size updates.

Such behavior is detrimental for performance and stability if there
are a lot of blocked requests. This can happen on 1.5 even with modest
concurrency because timed-out requests stay in the queue. This is less
likely on 1.6 where they are dropped from the queue.

The releasing of tasks may start to dominate over other processes in
the system. When the amount of scheduled tasks reaches 1000, polling
stops and server becomes unresponsive until all of the released
requests are done, which is either when they start to block on dirty
memory again or run out of blocked requests. It may take a while to
reach pressure condition after memtable flush if it brings virtual
dirty much below the threshold, which is currently the case for
workloads with overwrites producing sparse regions.

Refs #2021.

Fix by ensuring there is at most one request releasing thread at a
time. There will be one releasing fiber per region group which is
woken up when pressure is lifted. It executes blocked requests until
pressure occurs.

The logic for notification across the hierarchy was replaced by calling
region_group::notify_relief() from region_group::update() on the
broadest relieved group.
2017-02-01 17:41:55 +01:00
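The invariant this fix establishes (at most one releasing task per region group, no matter how many size changes occur) can be modeled with a small synchronous sketch. The class and member names are illustrative assumptions, not the real LSA code:

```python
from collections import deque

class RegionGroup:
    """Toy model: blocked requests are drained by a single release
    loop, and re-entrant size updates never schedule a second one."""

    def __init__(self, limit):
        self.limit = limit
        self.size = 0
        self.blocked = deque()
        self.releasing = 0          # releasing tasks currently active
        self.max_releasing = 0      # high-water mark, for checking

    def under_pressure(self):
        return self.size >= self.limit

    def update(self, delta):
        self.size += delta
        # Key point of the fix: wake the releasing "fiber" only if one
        # is not already running, instead of scheduling a new task on
        # every relieving size change.
        if not self.under_pressure() and self.blocked and self.releasing == 0:
            self.releasing += 1
            self.max_releasing = max(self.max_releasing, self.releasing)
            self._release_loop()

    def _release_loop(self):
        # Execute blocked requests until pressure returns or none remain.
        while self.blocked and not self.under_pressure():
            request = self.blocked.popleft()
            request()               # may call update() re-entrantly
        self.releasing -= 1

g = RegionGroup(limit=5)
g.size = 5                          # start under pressure
for _ in range(3):
    g.blocked.append(lambda: g.update(-1))  # each request frees memory
g.update(-1)                        # relief: wakes the single releaser
```

Even though every executed request changes the group size re-entrantly, only one release loop ever runs, which is exactly the avalanche the old per-update task scheduling produced.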
Tomasz Grabiec
d55baa0cd1 lsa: Move definitions to .cc 2017-02-01 17:41:55 +01:00
Tomasz Grabiec
8f8b111b33 lsa: Simplify hard pressure notification management
The hard pressure was only signalled on region group when
run_when_memory_available() was called after the pressure condition
was met.

So the following loop is always an infinite loop rather than stopping
when enough is allocated to cause pressure:

   while (!gr.under_pressure()) {
       region.allocate(...);
   }

It's cleaner if pressure notification works not only if
run_when_memory_available() is used but whenever the condition changes,
like we do for the soft pressure.

There is comment in run_when_memory_available() which gives reasons
why notifications are called from there, but I think those reasons no
longer hold:

 - we already notify on soft pressure conditions from update(), and if
   that is safe, notifying about hard pressure should also be safe. I
   checked and it looks safe to me.

 - avoiding notification in the rare case when we stopped writing
   right after crossing the threshold doesn't seem beneficial. It's
   unlikely in the first place, and one could argue it's better to
   actually flush now so that when writes resume they will not block.
2017-02-01 17:41:55 +01:00
Tomasz Grabiec
9aa1be5d08 lsa: Do not start or stop reclaiming on hard pressure
We already call these when crossing the soft threshold. We shouldn't
stop reclaiming when hard pressure is gone because soft pressure may
still be present. Calling start_reclaiming() on hard pressure is
unnecessary because soft pressure also starts it, and when there is
hard pressure there is also soft pressure.
2017-02-01 17:40:15 +01:00
Tomasz Grabiec
f053b48f7c tests: lsa: Adjust to take into account that reclaimers are run synchronously 2017-01-30 19:18:07 +01:00
Tomasz Grabiec
ed9ff19467 lsa: Document and annotate reclaimer notification callbacks
They are called from region_group::update(), so must be alloc-free and
noexcept.
2017-01-30 19:18:07 +01:00
Tomasz Grabiec
2ec6fe415e tests: lsa: Use with_timeout() in quiesce()
The current construct doesn't interrupt the test; the timeout failure
will only be logged.
2017-01-30 19:18:07 +01:00
563 changed files with 42635 additions and 12115 deletions

.gitmodules
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui

CMakeLists.txt (new file)
View File

@@ -0,0 +1,140 @@
##
## For best results, first compile the project using the Ninja build-system.
##
cmake_minimum_required(VERSION 3.7)
project(scylla)
if (NOT DEFINED ENV{CLION_IDE})
message(FATAL_ERROR "This CMakeLists.txt file is only valid for use in CLion")
endif()
# Default value. A more accurate list is populated through `pkg-config` below if `seastar.pc` is available.
set(SEASTAR_INCLUDE_DIRS "seastar")
# These paths are always available, since they're included in the repository. Additional DPDK headers are placed while
# Seastar is built, and are captured in `SEASTAR_INCLUDE_DIRS` through parsing the Seastar pkg-config file (below).
set(SEASTAR_DPDK_INCLUDE_DIRS
seastar/dpdk/lib/librte_eal/common/include
seastar/dpdk/lib/librte_eal/common/include/generic
seastar/dpdk/lib/librte_eal/common/include/x86
seastar/dpdk/lib/librte_ether)
find_package(PkgConfig REQUIRED)
set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/seastar/build/release:$ENV{PKG_CONFIG_PATH}")
pkg_check_modules(SEASTAR seastar)
find_package(Boost COMPONENTS filesystem program_options system thread)
##
## Populate the names of all source and header files in the indicated paths in a designated variable.
##
## When RECURSIVE is specified, directories are traversed recursively.
##
## Use: scan_scylla_source_directories(VAR my_result_var [RECURSIVE] PATHS [path1 path2 ...])
##
function (scan_scylla_source_directories)
set(options RECURSIVE)
set(oneValueArgs VAR)
set(multiValueArgs PATHS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" "${ARGN}")
set(globs "")
foreach (dir ${args_PATHS})
list(APPEND globs "${dir}/*.cc" "${dir}/*.hh")
endforeach()
if (args_RECURSIVE)
set(glob_kind GLOB_RECURSE)
else()
set(glob_kind GLOB)
endif()
file(${glob_kind} var
${globs})
set(${args_VAR} ${var} PARENT_SCOPE)
endfunction()
## Although Seastar is an external project, it is common enough to explore the sources while doing
## Scylla development that we'll treat the Seastar sources as part of this project for easier navigation.
scan_scylla_source_directories(
VAR SEASTAR_SOURCE_FILES
RECURSIVE
PATHS
seastar/core
seastar/http
seastar/json
seastar/net
seastar/rpc
seastar/tests
seastar/util)
scan_scylla_source_directories(
VAR SCYLLA_ROOT_SOURCE_FILES
PATHS .)
scan_scylla_source_directories(
VAR SCYLLA_SUB_SOURCE_FILES
RECURSIVE
PATHS
api
auth
cql3
db
dht
exceptions
gms
index
io
locator
message
repair
service
sstables
streaming
tests
thrift
tracing
transport
utils)
scan_scylla_source_directories(
VAR SCYLLA_GEN_SOURCE_FILES
RECURSIVE
PATHS build/release/gen)
set(SCYLLA_SOURCE_FILES
${SCYLLA_ROOT_SOURCE_FILES}
${SCYLLA_GEN_SOURCE_FILES}
${SCYLLA_SUB_SOURCE_FILES})
add_executable(scylla
${SEASTAR_SOURCE_FILES}
${SCYLLA_SOURCE_FILES})
# Note that since CLion does not understand GCC6 concepts, we always disable them (even if users configure otherwise).
# CLion seems to have trouble with `-U` (macro undefinition), so we do it this way instead.
list(REMOVE_ITEM SEASTAR_CFLAGS "-DHAVE_GCC6_CONCEPTS")
# If the Seastar pkg-config information is available, append to the default flags.
#
# For ease of browsing the source code, we always pretend that DPDK is enabled.
target_compile_options(scylla PUBLIC
-std=gnu++14
-DHAVE_DPDK
-DHAVE_HWLOC
"${SEASTAR_CFLAGS}")
# The order matters here: prefer the "static" DPDK directories to any dynamic paths from pkg-config. Some files are only
# available dynamically, though.
target_include_directories(scylla PUBLIC
.
${SEASTAR_DPDK_INCLUDE_DIRS}
${SEASTAR_INCLUDE_DIRS}
${Boost_INCLUDE_DIRS}
build/release/gen)

View File

@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=2.0.4
if test -f version
then

View File

@@ -29,6 +29,7 @@
#include "utils/histogram.hh"
#include "http/exception.hh"
#include "api_init.hh"
#include "seastarx.hh"
namespace api {

View File

@@ -252,13 +252,13 @@ void set_cache_service(http_context& ctx, routes& r) {
// In origin row size is the weighted size.
// We currently do not support weights, so we use num entries instead
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});
cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().num_entries();
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
});

View File

@@ -182,15 +182,6 @@ static int64_t max_row_size(column_family& cf) {
return res;
}
static double update_ratio(double acc, double f, double total) {
if (f && !total) {
throw bad_param_exception("total should include all elements");
} else if (total) {
acc += f / total;
}
return acc;
}
static integral_ratio_holder mean_row_size(column_family& cf) {
integral_ratio_holder res;
for (auto i: *cf.get_sstables() ) {
@@ -283,6 +274,16 @@ static std::vector<uint64_t> concat_sstable_count_per_level(std::vector<uint64_t
return a;
}
ratio_holder filter_false_positive_as_ratio_holder(const sstables::shared_sstable& sst) {
double f = sst->filter_get_false_positive();
return ratio_holder(f + sst->filter_get_true_positive(), f);
}
ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared_sstable& sst) {
double f = sst->filter_get_recent_false_positive();
return ratio_holder(f + sst->filter_get_recent_true_positive(), f);
}
void set_column_family(http_context& ctx, routes& r) {
cf::get_column_family_name.set(r, [&ctx] (const_req req){
vector<sstring> res;
@@ -604,39 +605,27 @@ void set_column_family(http_context& ctx, routes& r) {
});
cf::get_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
double f = sst->filter_get_false_positive();
return update_ratio(s, f, f + sst->filter_get_true_positive());
});
}, std::plus<double>());
return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
cf::get_all_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
double f = sst->filter_get_false_positive();
return update_ratio(s, f, f + sst->filter_get_true_positive());
});
}, std::plus<double>());
return map_reduce_cf(ctx, ratio_holder(), [] (column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
cf::get_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], double(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
double f = sst->filter_get_recent_false_positive();
return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
});
}, std::plus<double>());
return map_reduce_cf(ctx, req->param["name"], ratio_holder(), [] (column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
cf::get_all_recent_bloom_filter_false_ratio.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, double(0), [] (column_family& cf) {
return std::accumulate(cf.get_sstables()->begin(), cf.get_sstables()->end(), double(0), [](double s, auto& sst) {
double f = sst->filter_get_recent_false_positive();
return update_ratio(s, f, f + sst->filter_get_recent_true_positive());
});
}, std::plus<double>());
return map_reduce_cf(ctx, ratio_holder(), [] (column_family& cf) {
return boost::accumulate(*cf.get_sstables() | boost::adaptors::transformed(filter_recent_false_positive_as_ratio_holder), ratio_holder());
}, std::plus<>());
});
cf::get_bloom_filter_disk_space_used.set(r, [&ctx] (std::unique_ptr<request> req) {

View File

@@ -26,7 +26,6 @@
namespace api {
using namespace scollectd;
namespace cm = httpd::compaction_manager_json;
using namespace json;

View File

@@ -24,7 +24,6 @@
namespace api {
using namespace scollectd;
using namespace json;
namespace hh = httpd::hinted_handoff_json;

View File

@@ -29,11 +29,11 @@
namespace api {
static logging::logger logger("lsa-api");
static logging::logger alogger("lsa-api");
void set_lsa(http_context& ctx, routes& r) {
httpd::lsa_json::lsa_compact.set(r, [&ctx](std::unique_ptr<request> req) {
logger.info("Triggering compaction");
alogger.info("Triggering compaction");
return ctx.db.invoke_on_all([] (database&) {
logalloc::shard_tracker().reclaim(std::numeric_limits<size_t>::max());
}).then([] {

View File

@@ -27,7 +27,7 @@
#include <sstream>
using namespace httpd::messaging_service_json;
using namespace net;
using namespace netw;
namespace api {
@@ -120,13 +120,13 @@ void set_messaging_service(http_context& ctx, routes& r) {
}));
get_version.set(r, [](const_req req) {
return net::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
return netw::get_local_messaging_service().get_raw_version(req.get_query_param("addr"));
});
get_dropped_messages_by_ver.set(r, [](std::unique_ptr<request> req) {
shared_ptr<std::vector<uint64_t>> map = make_shared<std::vector<uint64_t>>(num_verb);
return net::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
return netw::get_messaging_service().map_reduce([map](const uint64_t* local_map) mutable {
for (auto i = 0; i < num_verb; i++) {
(*map)[i]+= local_map[i];
}

View File

@@ -802,10 +802,8 @@ void set_storage_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(json_void());
});
ss::get_metrics_load.set(r, [](std::unique_ptr<request> req) {
//TBD
unimplemented();
return make_ready_future<json::json_return_type>(0);
ss::get_metrics_load.set(r, [&ctx](std::unique_ptr<request> req) {
return get_cf_stats(ctx, &column_family::stats::live_disk_space_used);
});
ss::get_exceptions.set(r, [](const_req req) {

View File

@@ -29,10 +29,11 @@
#include "net/byteorder.hh"
#include <cstdint>
#include <iosfwd>
#include <seastar/util/gcc6-concepts.hh>
template<typename T>
template<typename T, typename Input>
static inline
void set_field(managed_bytes& v, unsigned offset, T val) {
void set_field(Input& v, unsigned offset, T val) {
reinterpret_cast<net::packed<T>*>(v.begin() + offset)->raw = net::hton(val);
}
@@ -58,6 +59,7 @@ private:
static constexpr int8_t EXPIRY_FLAG = 0x02; // When present, expiry field is present. Set only for live cells
static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
static constexpr int8_t COUNTER_UPDATE_FLAG = 0x08; // Cell is a counter update.
static constexpr int8_t COUNTER_IN_PLACE_REVERT = 0x10;
static constexpr unsigned flags_size = 1;
static constexpr unsigned timestamp_offset = flags_size;
static constexpr unsigned timestamp_size = 8;
@@ -67,6 +69,7 @@ private:
static constexpr unsigned deletion_time_size = 4;
static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
static constexpr unsigned ttl_size = 4;
friend class counter_cell_builder;
private:
static bool is_counter_update(bytes_view cell) {
return cell[0] & COUNTER_UPDATE_FLAG;
@@ -74,10 +77,17 @@ private:
static bool is_revert_set(bytes_view cell) {
return cell[0] & REVERT_FLAG;
}
static bool is_counter_in_place_revert_set(bytes_view cell) {
return cell[0] & COUNTER_IN_PLACE_REVERT;
}
template<typename BytesContainer>
static void set_revert(BytesContainer& cell, bool revert) {
cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
}
template<typename BytesContainer>
static void set_counter_in_place_revert(BytesContainer& cell, bool flag) {
cell[0] = (cell[0] & ~COUNTER_IN_PLACE_REVERT) | (flag * COUNTER_IN_PLACE_REVERT);
}
static bool is_live(const bytes_view& cell) {
return cell[0] & LIVE_FLAG;
}
@@ -91,13 +101,30 @@ private:
static api::timestamp_type timestamp(const bytes_view& cell) {
return get_field<api::timestamp_type>(cell, timestamp_offset);
}
template<typename BytesContainer>
static void set_timestamp(BytesContainer& cell, api::timestamp_type ts) {
set_field(cell, timestamp_offset, ts);
}
// Can be called on live cells only
static bytes_view value(bytes_view cell) {
private:
template<typename BytesView>
static BytesView do_get_value(BytesView cell) {
auto expiry_field_size = bool(cell[0] & EXPIRY_FLAG) * (expiry_size + ttl_size);
auto value_offset = flags_size + timestamp_size + expiry_field_size;
cell.remove_prefix(value_offset);
return cell;
}
public:
static bytes_view value(bytes_view cell) {
return do_get_value(cell);
}
static bytes_mutable_view value(bytes_mutable_view cell) {
return do_get_value(cell);
}
// Can be called on live counter update cells only
static int64_t counter_update_value(bytes_view cell) {
return get_field<int64_t>(cell, flags_size + timestamp_size);
}
// Can be called only when is_dead() is true.
static gc_clock::time_point deletion_time(const bytes_view& cell) {
assert(is_dead(cell));
@@ -130,12 +157,12 @@ private:
std::copy_n(value.begin(), value.size(), b.begin() + value_offset);
return b;
}
static managed_bytes make_live_counter_update(api::timestamp_type timestamp, bytes_view value) {
static managed_bytes make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
auto value_offset = flags_size + timestamp_size;
managed_bytes b(managed_bytes::initialized_later(), value_offset + value.size());
managed_bytes b(managed_bytes::initialized_later(), value_offset + sizeof(value));
b[0] = LIVE_FLAG | COUNTER_UPDATE_FLAG;
set_field(b, timestamp_offset, timestamp);
std::copy_n(value.begin(), value.size(), b.begin() + value_offset);
set_field(b, value_offset, value);
return b;
}
static managed_bytes make_live(api::timestamp_type timestamp, bytes_view value, gc_clock::time_point expiry, gc_clock::duration ttl) {
@@ -148,6 +175,31 @@ private:
std::copy_n(value.begin(), value.size(), b.begin() + value_offset);
return b;
}
// make_live_from_serializer() is intended for users that need to serialise
// some object or objects to the format used in atomic_cell::value().
// With just make_live() the pattern would look as follows:
// 1. allocate a buffer and write the serialised objects to it
// 2. pass that buffer to make_live()
// 3. make_live() needs to prepend some metadata to the cell value so it
// allocates a new buffer and copies the content of the original one
//
// The allocation and copy of a buffer can be avoided.
// make_live_from_serializer() allows the user code to specify the timestamp
// and size of the cell value as well as provide the serialiser function
// object, which would write the serialised value of the cell to the buffer
// given to it by make_live_from_serializer().
template<typename Serializer>
GCC6_CONCEPT(requires requires(Serializer serializer, bytes::iterator it) {
serializer(it);
})
static managed_bytes make_live_from_serializer(api::timestamp_type timestamp, size_t size, Serializer&& serializer) {
auto value_offset = flags_size + timestamp_size;
managed_bytes b(managed_bytes::initialized_later(), value_offset + size);
b[0] = LIVE_FLAG;
set_field(b, timestamp_offset, timestamp);
serializer(b.begin() + value_offset);
return b;
}
template<typename ByteContainer>
friend class atomic_cell_base;
friend class atomic_cell;
@@ -167,6 +219,9 @@ public:
bool is_revert_set() const {
return atomic_cell_type::is_revert_set(_data);
}
bool is_counter_in_place_revert_set() const {
return atomic_cell_type::is_counter_in_place_revert_set(_data);
}
bool is_live() const {
return atomic_cell_type::is_live(_data);
}
@@ -189,10 +244,17 @@ public:
api::timestamp_type timestamp() const {
return atomic_cell_type::timestamp(_data);
}
void set_timestamp(api::timestamp_type ts) {
atomic_cell_type::set_timestamp(_data, ts);
}
// Can be called on live cells only
bytes_view value() const {
auto value() const {
return atomic_cell_type::value(_data);
}
// Can be called on live counter update cells only
int64_t counter_update_value() const {
return atomic_cell_type::counter_update_value(_data);
}
// Can be called only when is_dead(gc_clock::time_point)
gc_clock::time_point deletion_time() const {
return !is_live() ? atomic_cell_type::deletion_time(_data) : expiry() - ttl();
@@ -215,6 +277,9 @@ public:
void set_revert(bool revert) {
atomic_cell_type::set_revert(_data, revert);
}
void set_counter_in_place_revert(bool flag) {
atomic_cell_type::set_counter_in_place_revert(_data, flag);
}
};
class atomic_cell_view final : public atomic_cell_base<bytes_view> {
@@ -226,6 +291,14 @@ public:
friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
};
class atomic_cell_mutable_view final : public atomic_cell_base<bytes_mutable_view> {
atomic_cell_mutable_view(bytes_mutable_view data) : atomic_cell_base(std::move(data)) {}
public:
static atomic_cell_mutable_view from_bytes(bytes_mutable_view data) { return atomic_cell_mutable_view(data); }
friend class atomic_cell;
};
class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
public:
atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
@@ -254,12 +327,9 @@ public:
static atomic_cell make_live(api::timestamp_type timestamp, const bytes& value) {
return make_live(timestamp, bytes_view(value));
}
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, bytes_view value) {
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
return atomic_cell_type::make_live_counter_update(timestamp, value);
}
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, const bytes& value) {
return atomic_cell_type::make_live_counter_update(timestamp, bytes_view(value));
}
static atomic_cell make_live(api::timestamp_type timestamp, bytes_view value,
gc_clock::time_point expiry, gc_clock::duration ttl)
{
@@ -277,6 +347,10 @@ public:
return atomic_cell_type::make_live(timestamp, value, gc_clock::now() + *ttl, *ttl);
}
}
template<typename Serializer>
static atomic_cell make_live_from_serializer(api::timestamp_type timestamp, size_t size, Serializer&& serializer) {
return atomic_cell_type::make_live_from_serializer(timestamp, size, std::forward<Serializer>(serializer));
}
friend class atomic_cell_or_collection;
friend std::ostream& operator<<(std::ostream& os, const atomic_cell& ac);
};
@@ -314,11 +388,6 @@ collection_mutation::operator collection_mutation_view() const {
return { data };
}
namespace db {
template<typename T>
class serializer;
}
class column_definition;
int compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right);

View File

@@ -39,10 +39,14 @@ public:
static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
atomic_cell_mutable_view as_mutable_atomic_cell() { return atomic_cell_mutable_view::from_bytes(_data); }
atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
explicit operator bool() const {
return !_data.empty();
}
bool can_use_mutable_view() const {
return !_data.is_fragmented();
}
static atomic_cell_or_collection from_collection_mutation(collection_mutation data) {
return std::move(data.data);
}

View File

@@ -61,7 +61,7 @@ const sstring auth::auth::USERS_CF("users");
static const sstring USER_NAME("name");
static const sstring SUPER("super");
static logging::logger logger("auth");
static logging::logger alogger("auth");
// TODO: configurable
using namespace std::chrono_literals;
@@ -114,7 +114,7 @@ struct hash<auth::authenticated_user> {
class auth::auth::permissions_cache {
public:
typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::tuple_hash> cache_type;
typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::loading_cache_reload_enabled::yes, utils::simple_entry_size<permission_set>, utils::tuple_hash> cache_type;
typedef typename cache_type::key_type key_type;
permissions_cache()
@@ -123,25 +123,14 @@ public:
}
permissions_cache(const db::config& cfg)
: _cache(cfg.permissions_cache_max_entries(), expiry(cfg),
std::chrono::milliseconds(
cfg.permissions_validity_in_ms()),
[](const key_type& k) {
logger.debug("Refreshing permissions for {}", k.first.name());
return authorizer::get().authorize(::make_shared<authenticated_user>(k.first), k.second);
}) {
}
static std::chrono::milliseconds expiry(const db::config& cfg) {
auto exp = cfg.permissions_update_interval_in_ms();
if (exp == 0 || exp == std::numeric_limits<uint32_t>::max()) {
exp = cfg.permissions_validity_in_ms();
}
return std::chrono::milliseconds(exp);
}
: _cache(cfg.permissions_cache_max_entries(), std::chrono::milliseconds(cfg.permissions_validity_in_ms()), std::chrono::milliseconds(cfg.permissions_update_interval_in_ms()), alogger,
[] (const key_type& k) {
alogger.debug("Refreshing permissions for {}", k.first.name());
return authorizer::get().authorize(::make_shared<authenticated_user>(k.first), k.second);
}) {}
future<> stop() {
return make_ready_future<>();
return _cache.stop();
}
future<permission_set> get(::shared_ptr<authenticated_user> user, data_resource resource) {
@@ -152,6 +141,15 @@ private:
cache_type _cache;
};
namespace std { // for ADL, yuch
std::ostream& operator<<(std::ostream& os, const std::pair<auth::authenticated_user, auth::data_resource>& p) {
os << "{user: " << p.first.name() << ", data_resource: " << p.second << "}";
return os;
}
}
static distributed<auth::auth::permissions_cache> perm_cache;
/**
@@ -178,7 +176,7 @@ struct waiter {
tmr.cancel();
done.set_exception(std::runtime_error("shutting down"));
}
logger.trace("Deleting scheduled task");
alogger.trace("Deleting scheduled task");
}
void kill() {
}
@@ -192,7 +190,7 @@ static std::vector<waiter_ptr> & thread_waiters() {
}
void auth::auth::schedule_when_up(scheduled_func f) {
logger.trace("Adding scheduled task");
alogger.trace("Adding scheduled task");
auto & waiters = thread_waiters();
@@ -208,7 +206,7 @@ void auth::auth::schedule_when_up(scheduled_func f) {
waiters.erase(i);
}
}).then([f = std::move(f)] {
logger.trace("Running scheduled task");
alogger.trace("Running scheduled task");
return f();
}).handle_exception([](auto ep) {
return make_ready_future();
@@ -246,7 +244,8 @@ future<> auth::auth::setup() {
std::map<sstring, sstring> opts;
opts["replication_factor"] = "1";
auto ksm = keyspace_metadata::new_keyspace(AUTH_KS, "org.apache.cassandra.locator.SimpleStrategy", opts, true);
f = service::get_local_migration_manager().announce_new_keyspace(ksm, false);
// We use min_timestamp so that default keyspace metadata will lose to any manual adjustments. See issue #2129.
f = service::get_local_migration_manager().announce_new_keyspace(ksm, api::min_timestamp, false);
}
return f.then([] {
@@ -267,12 +266,12 @@ future<> auth::auth::setup() {
auto query = sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?) USING TIMESTAMP 0",
AUTH_KS, USERS_CF, USER_NAME, SUPER);
cql3::get_local_query_processor().process(query, db::consistency_level::ONE, {DEFAULT_SUPERUSER_NAME, true}).then([](auto) {
logger.info("Created default superuser '{}'", DEFAULT_SUPERUSER_NAME);
alogger.info("Created default superuser '{}'", DEFAULT_SUPERUSER_NAME);
}).handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);
} catch (exceptions::request_execution_exception&) {
logger.warn("Skipped default superuser setup: some nodes were not ready");
alogger.warn("Skipped default superuser setup: some nodes were not ready");
}
});
}
@@ -330,14 +329,13 @@ future<bool> auth::auth::is_super_user(const sstring& username) {
});
}
future<> auth::auth::insert_user(const sstring& username, bool is_super)
throw (exceptions::request_execution_exception) {
future<> auth::auth::insert_user(const sstring& username, bool is_super) {
return cql3::get_local_query_processor().process(sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?)",
AUTH_KS, USERS_CF, USER_NAME, SUPER),
consistency_for_user(username), { username, is_super }).discard_result();
}
future<> auth::auth::delete_user(const sstring& username) throw(exceptions::request_execution_exception) {
future<> auth::auth::delete_user(const sstring& username) {
return cql3::get_local_query_processor().process(sprint("DELETE FROM %s.%s WHERE %s = ?",
AUTH_KS, USERS_CF, USER_NAME),
consistency_for_user(username), { username }).discard_result();

View File

@@ -50,11 +50,10 @@
#include "exceptions/exceptions.hh"
#include "permission.hh"
#include "data_resource.hh"
#include "authenticated_user.hh"
namespace auth {
class authenticated_user;
class auth {
public:
class permissions_cache;
@@ -91,7 +90,7 @@ public:
* @param isSuper User's new status.
* @throws RequestExecutionException
*/
static future<> insert_user(const sstring& username, bool is_super) throw(exceptions::request_execution_exception);
static future<> insert_user(const sstring& username, bool is_super);
/**
* Deletes the user from AUTH_KS.USERS_CF.
@@ -99,7 +98,7 @@ public:
* @param username Username to delete.
* @throws RequestExecutionException
*/
static future<> delete_user(const sstring& username) throw(exceptions::request_execution_exception);
static future<> delete_user(const sstring& username);
/**
* Sets up Authenticator and Authorizer.
@@ -122,3 +121,5 @@ public:
static void schedule_when_up(scheduled_func);
};
}
std::ostream& operator<<(std::ostream& os, const std::pair<auth::authenticated_user, auth::data_resource>& p);

View File

@@ -43,6 +43,7 @@
#include <seastar/core/sstring.hh>
#include <seastar/core/future.hh>
#include "seastarx.hh"
namespace auth {

View File

@@ -72,7 +72,7 @@ sstring auth::authenticator::option_to_string(option opt) {
static std::unique_ptr<auth::authenticator> global_authenticator;
future<>
auth::authenticator::setup(const sstring& type) throw (exceptions::configuration_exception) {
auth::authenticator::setup(const sstring& type) {
if (auth::auth::is_class_type(type, ALLOW_ALL_AUTHENTICATOR_NAME)) {
class allow_all_authenticator : public authenticator {
public:
@@ -88,16 +88,16 @@ auth::authenticator::setup(const sstring& type) throw (exceptions::configuration
option_set alterable_options() const override {
return option_set();
}
future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) override {
future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const override {
return make_ready_future<::shared_ptr<authenticated_user>>(::make_shared<authenticated_user>());
}
future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
future<> create(sstring username, const option_map& options) override {
return make_ready_future();
}
future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
future<> alter(sstring username, const option_map& options) override {
return make_ready_future();
}
future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override {
future<> drop(sstring username) override {
return make_ready_future();
}
const resource_ids& protected_resources() const override {

View File

@@ -92,7 +92,7 @@ public:
* For example, use this method to create any required keyspaces/column families.
* Note: Only call from main thread.
*/
static future<> setup(const sstring& type) throw(exceptions::configuration_exception);
static future<> setup(const sstring& type);
/**
* Returns the system authenticator. Must have called setup before calling this.
@@ -129,7 +129,7 @@ public:
*
* @throws authentication_exception if credentials don't match any known user.
*/
virtual future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) = 0;
virtual future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const = 0;
/**
* Called during execution of CREATE USER query (also may be called on startup, see seedSuperuserOptions method).
@@ -141,7 +141,7 @@ public:
* @throws exceptions::request_validation_exception
* @throws exceptions::request_execution_exception
*/
virtual future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
virtual future<> create(sstring username, const option_map& options) = 0;
/**
* Called during execution of ALTER USER query.
@@ -154,7 +154,7 @@ public:
* @throws exceptions::request_validation_exception
* @throws exceptions::request_execution_exception
*/
virtual future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
virtual future<> alter(sstring username, const option_map& options) = 0;
/**
@@ -164,7 +164,7 @@ public:
* @throws exceptions::request_validation_exception
* @throws exceptions::request_execution_exception
*/
virtual future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) = 0;
virtual future<> drop(sstring username) = 0;
/**
* Set of resources that should be made inaccessible to users and only accessible internally.
@@ -177,9 +177,9 @@ public:
class sasl_challenge {
public:
virtual ~sasl_challenge() {}
virtual bytes evaluate_response(bytes_view client_response) throw(exceptions::authentication_exception) = 0;
virtual bytes evaluate_response(bytes_view client_response) = 0;
virtual bool is_complete() const = 0;
virtual future<::shared_ptr<authenticated_user>> get_authenticated_user() const throw(exceptions::authentication_exception) = 0;
virtual future<::shared_ptr<authenticated_user>> get_authenticated_user() const = 0;
};
/**

View File

@@ -51,6 +51,8 @@
#include "permission.hh"
#include "data_resource.hh"
#include "seastarx.hh"
namespace auth {
class authenticated_user;

View File

@@ -115,16 +115,14 @@ auth::data_resource auth::data_resource::get_parent() const {
}
}
const sstring& auth::data_resource::keyspace() const
throw (std::invalid_argument) {
const sstring& auth::data_resource::keyspace() const {
if (is_root_level()) {
throw std::invalid_argument("ROOT data resource has no keyspace");
}
return _ks;
}
const sstring& auth::data_resource::column_family() const
throw (std::invalid_argument) {
const sstring& auth::data_resource::column_family() const {
if (!is_column_family_level()) {
throw std::invalid_argument(sprint("%s data resource has no column family", name()));
}

View File

@@ -45,6 +45,7 @@
#include <iosfwd>
#include <set>
#include <seastar/core/sstring.hh>
#include "seastarx.hh"
namespace auth {
@@ -117,13 +118,13 @@ public:
* @return keyspace of the resource.
* @throws std::invalid_argument if it's the root-level resource.
*/
const sstring& keyspace() const throw(std::invalid_argument);
const sstring& keyspace() const;
/**
* @return column family of the resource.
* @throws std::invalid_argument if it's not a cf-level resource.
*/
const sstring& column_family() const throw(std::invalid_argument);
const sstring& column_family() const;
/**
* @return Whether or not the resource has a parent in the hierarchy.

View File

@@ -62,7 +62,7 @@ static const sstring RESOURCE_NAME = "resource";
static const sstring PERMISSIONS_NAME = "permissions";
static const sstring PERMISSIONS_CF = "permissions";
static logging::logger logger("default_authorizer");
static logging::logger alogger("default_authorizer");
auth::default_authorizer::default_authorizer() {
}
@@ -107,7 +107,7 @@ future<auth::permission_set> auth::default_authorizer::authorize(
}
return make_ready_future<permission_set>(permissions::from_strings(res->one().get_set<sstring>(PERMISSIONS_NAME)));
} catch (exceptions::request_execution_exception& e) {
logger.warn("CassandraAuthorizer failed to authorize {} for {}", user->name(), resource);
alogger.warn("CassandraAuthorizer failed to authorize {} for {}", user->name(), resource);
return make_ready_future<permission_set>(permissions::NONE);
}
});
@@ -196,7 +196,7 @@ future<> auth::default_authorizer::revoke_all(sstring dropped_user) {
try {
std::rethrow_exception(ep);
} catch (exceptions::request_execution_exception& e) {
logger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", dropped_user, e);
alogger.warn("CassandraAuthorizer failed to revoke all permissions of {}: {}", dropped_user, e);
}
});
}
@@ -217,13 +217,13 @@ future<> auth::default_authorizer::revoke_all(data_resource resource) {
try {
std::rethrow_exception(ep);
} catch (exceptions::request_execution_exception& e) {
logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
}
});
});
} catch (exceptions::request_execution_exception& e) {
logger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
alogger.warn("CassandraAuthorizer failed to revoke all permissions on {}: {}", resource, e);
return make_ready_future();
}
});

View File

@@ -61,7 +61,7 @@ static const sstring DEFAULT_USER_NAME = auth::auth::DEFAULT_SUPERUSER_NAME;
static const sstring DEFAULT_USER_PASSWORD = auth::auth::DEFAULT_SUPERUSER_NAME;
static const sstring CREDENTIALS_CF = "credentials";
static logging::logger logger("password_authenticator");
static logging::logger plogger("password_authenticator");
auth::password_authenticator::~password_authenticator()
{}
@@ -169,7 +169,7 @@ future<> auth::password_authenticator::init() {
USER_NAME, SALTED_HASH
),
db::consistency_level::ONE, {DEFAULT_USER_NAME, hashpw(DEFAULT_USER_PASSWORD)}).then([](auto) {
logger.info("Created default user '{}'", DEFAULT_USER_NAME);
plogger.info("Created default user '{}'", DEFAULT_USER_NAME);
});
}
});
@@ -201,8 +201,7 @@ auth::authenticator::option_set auth::password_authenticator::alterable_options(
}
future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::authenticate(
const credentials_map& credentials) const
throw (exceptions::authentication_exception) {
const credentials_map& credentials) const {
if (!credentials.count(USERNAME_KEY)) {
throw exceptions::authentication_exception(sprint("Required key '%s' is missing", USERNAME_KEY));
}
@@ -241,9 +240,7 @@ future<::shared_ptr<auth::authenticated_user> > auth::password_authenticator::au
}
future<> auth::password_authenticator::create(sstring username,
const option_map& options)
throw (exceptions::request_validation_exception,
exceptions::request_execution_exception) {
const option_map& options) {
try {
auto password = boost::any_cast<sstring>(options.at(option::PASSWORD));
auto query = sprint("INSERT INTO %s.%s (%s, %s) VALUES (?, ?)",
@@ -256,9 +253,7 @@ future<> auth::password_authenticator::create(sstring username,
}
future<> auth::password_authenticator::alter(sstring username,
const option_map& options)
throw (exceptions::request_validation_exception,
exceptions::request_execution_exception) {
const option_map& options) {
try {
auto password = boost::any_cast<sstring>(options.at(option::PASSWORD));
auto query = sprint("UPDATE %s.%s SET %s = ? WHERE %s = ?",
@@ -270,9 +265,7 @@ future<> auth::password_authenticator::alter(sstring username,
}
}
future<> auth::password_authenticator::drop(sstring username)
throw (exceptions::request_validation_exception,
exceptions::request_execution_exception) {
future<> auth::password_authenticator::drop(sstring username) {
try {
auto query = sprint("DELETE FROM %s.%s WHERE %s = ?",
auth::AUTH_KS, CREDENTIALS_CF, USER_NAME);
@@ -308,9 +301,8 @@ const auth::resource_ids& auth::password_authenticator::protected_resources() co
* would expect
* @throws javax.security.sasl.SaslException
*/
bytes evaluate_response(bytes_view client_response)
throw (exceptions::authentication_exception) override {
logger.debug("Decoding credentials from client token");
bytes evaluate_response(bytes_view client_response) override {
plogger.debug("Decoding credentials from client token");
sstring username, password;
@@ -347,8 +339,7 @@ const auth::resource_ids& auth::password_authenticator::protected_resources() co
bool is_complete() const override {
return _complete;
}
future<::shared_ptr<authenticated_user>> get_authenticated_user() const
throw (exceptions::authentication_exception) override {
future<::shared_ptr<authenticated_user>> get_authenticated_user() const override {
return _authenticator.authenticate(_credentials);
}
private:

View File

@@ -58,10 +58,10 @@ public:
bool require_authentication() const override;
option_set supported_options() const override;
option_set alterable_options() const override;
future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const throw(exceptions::authentication_exception) override;
future<> create(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
future<> alter(sstring username, const option_map& options) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
future<> drop(sstring username) throw(exceptions::request_validation_exception, exceptions::request_execution_exception) override;
future<::shared_ptr<authenticated_user>> authenticate(const credentials_map& credentials) const override;
future<> create(sstring username, const option_map& options) override;
future<> alter(sstring username, const option_map& options) override;
future<> drop(sstring username) override;
const resource_ids& protected_resources() const override;
::shared_ptr<sasl_challenge> new_sasl_challenge() const override;

View File

@@ -44,6 +44,7 @@
#include <unordered_set>
#include <seastar/core/sstring.hh>
#include "seastarx.hh"
#include "enum_set.hh"
namespace auth {

View File

@@ -21,14 +21,17 @@
#pragma once
#include "seastarx.hh"
#include "core/sstring.hh"
#include "hashing.hh"
#include <experimental/optional>
#include <iosfwd>
#include <functional>
#include "utils/mutable_view.hh"
using bytes = basic_sstring<int8_t, uint32_t, 31>;
using bytes_view = std::experimental::basic_string_view<int8_t>;
using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
using bytes_opt = std::experimental::optional<bytes>;
using sstring_view = std::experimental::string_view;

538
cache_streamed_mutation.hh Normal file
View File

@@ -0,0 +1,538 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vector>
#include "row_cache.hh"
#include "mutation_reader.hh"
#include "streamed_mutation.hh"
#include "partition_version.hh"
#include "utils/logalloc.hh"
#include "query-request.hh"
#include "partition_snapshot_reader.hh"
#include "partition_snapshot_row_cursor.hh"
#include "read_context.hh"
namespace cache {
class lsa_manager {
row_cache& _cache;
public:
lsa_manager(row_cache& cache) : _cache(cache) { }
template<typename Func>
decltype(auto) run_in_read_section(const Func& func) {
return _cache._read_section(_cache._tracker.region(), [&func] () {
return with_linearized_managed_bytes([&func] () {
return func();
});
});
}
template<typename Func>
decltype(auto) run_in_update_section(const Func& func) {
return _cache._update_section(_cache._tracker.region(), [&func] () {
return with_linearized_managed_bytes([&func] () {
return func();
});
});
}
template<typename Func>
void run_in_update_section_with_allocator(Func&& func) {
return _cache._update_section(_cache._tracker.region(), [this, &func] () {
return with_linearized_managed_bytes([this, &func] () {
return with_allocator(_cache._tracker.region().allocator(), [this, &func] () mutable {
return func();
});
});
});
}
logalloc::region& region() { return _cache._tracker.region(); }
logalloc::allocating_section& read_section() { return _cache._read_section; }
};
class cache_streamed_mutation final : public streamed_mutation::impl {
enum class state {
before_static_row,
// Invariants:
// - position_range(_lower_bound, _upper_bound) covers all not yet emitted positions from current range
// - _next_row points to the nearest row in cache >= _lower_bound
// - _next_row_in_range = _next.position() < _upper_bound
reading_from_cache,
// Starts reading from underlying reader.
// The range to read is position_range(_lower_bound, min(_next_row.position(), _upper_bound)).
// Invariants:
// - _next_row_in_range = _next.position() < _upper_bound
move_to_underlying,
// Invariants:
// - Upper bound of the read is min(_next_row.position(), _upper_bound)
// - _next_row_in_range = _next.position() < _upper_bound
// - _last_row_key contains the key of last emitted clustering_row
reading_from_underlying,
end_of_stream
};
lw_shared_ptr<partition_snapshot> _snp;
position_in_partition::tri_compare _position_cmp;
query::clustering_key_filter_ranges _ck_ranges;
query::clustering_row_ranges::const_iterator _ck_ranges_curr;
query::clustering_row_ranges::const_iterator _ck_ranges_end;
lsa_manager _lsa_manager;
stdx::optional<clustering_key> _last_row_key;
// We must be prepared to receive overlapping and out-of-order
// range tombstones. Since fragments have to be emitted with strictly monotonic
// positions, we can't just trim such tombstones to the position of the last fragment.
// To solve that, range tombstones are first accumulated in a range_tombstone_stream
// and emitted once we have a fragment with a larger position.
range_tombstone_stream _tombstones;
// Holds the lower bound of a position range which hasn't been processed yet.
// Only fragments with positions < _lower_bound have been emitted.
position_in_partition _lower_bound;
position_in_partition_view _upper_bound;
state _state = state::before_static_row;
lw_shared_ptr<read_context> _read_context;
partition_snapshot_row_cursor _next_row;
bool _next_row_in_range = false;
future<> do_fill_buffer();
void copy_from_cache_to_buffer();
future<> process_static_row();
void move_to_end();
void move_to_next_range();
void move_to_current_range();
void move_to_next_entry();
// Emits all delayed range tombstones with positions smaller than upper_bound.
void drain_tombstones(position_in_partition_view upper_bound);
// Emits all delayed range tombstones.
void drain_tombstones();
void add_to_buffer(const partition_snapshot_row_cursor&);
void add_clustering_row_to_buffer(mutation_fragment&&);
void add_to_buffer(range_tombstone&&);
void add_to_buffer(mutation_fragment&&);
future<> read_from_underlying();
future<> start_reading_from_underlying();
bool after_current_range(position_in_partition_view position);
bool can_populate() const;
void maybe_update_continuity();
void maybe_add_to_cache(const mutation_fragment& mf);
void maybe_add_to_cache(const clustering_row& cr);
void maybe_add_to_cache(const range_tombstone& rt);
void maybe_add_to_cache(const static_row& sr);
void maybe_set_static_row_continuous();
public:
cache_streamed_mutation(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges&& crr,
lw_shared_ptr<read_context> ctx,
lw_shared_ptr<partition_snapshot> snp,
row_cache& cache)
: streamed_mutation::impl(std::move(s), dk, snp->partition_tombstone())
, _snp(std::move(snp))
, _position_cmp(*_schema)
, _ck_ranges(std::move(crr))
, _ck_ranges_curr(_ck_ranges.begin())
, _ck_ranges_end(_ck_ranges.end())
, _lsa_manager(cache)
, _tombstones(*_schema)
, _lower_bound(position_in_partition::before_all_clustered_rows())
, _upper_bound(position_in_partition_view::before_all_clustered_rows())
, _read_context(std::move(ctx))
, _next_row(*_schema, cache._tracker.region(), *_snp)
{ }
cache_streamed_mutation(const cache_streamed_mutation&) = delete;
cache_streamed_mutation(cache_streamed_mutation&&) = delete;
virtual future<> fill_buffer() override;
virtual ~cache_streamed_mutation() {
maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
}
};
inline
future<> cache_streamed_mutation::process_static_row() {
if (_snp->version()->partition().static_row_continuous()) {
_read_context->cache().on_row_hit();
row sr = _lsa_manager.run_in_read_section([this] {
return _snp->static_row();
});
if (!sr.empty()) {
push_mutation_fragment(mutation_fragment(static_row(std::move(sr))));
}
return make_ready_future<>();
} else {
_read_context->cache().on_row_miss();
return _read_context->get_next_fragment().then([this] (mutation_fragment_opt&& sr) {
if (sr) {
assert(sr->is_static_row());
maybe_add_to_cache(sr->as_static_row());
push_mutation_fragment(std::move(*sr));
}
maybe_set_static_row_continuous();
});
}
}
inline
future<> cache_streamed_mutation::fill_buffer() {
if (_state == state::before_static_row) {
auto after_static_row = [this] {
if (_ck_ranges_curr == _ck_ranges_end) {
_end_of_stream = true;
_state = state::end_of_stream;
return make_ready_future<>();
}
_state = state::reading_from_cache;
_lsa_manager.run_in_read_section([this] {
move_to_current_range();
});
return fill_buffer();
};
if (_schema->has_static_columns()) {
return process_static_row().then(std::move(after_static_row));
} else {
return after_static_row();
}
}
return do_until([this] { return _end_of_stream || is_buffer_full(); }, [this] {
return do_fill_buffer();
});
}
inline
future<> cache_streamed_mutation::do_fill_buffer() {
if (_state == state::move_to_underlying) {
_state = state::reading_from_underlying;
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
: position_in_partition(_upper_bound);
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}).then([this] {
return read_from_underlying();
});
}
if (_state == state::reading_from_underlying) {
return read_from_underlying();
}
// assert(_state == state::reading_from_cache)
return _lsa_manager.run_in_read_section([this] {
auto same_pos = _next_row.maybe_refresh();
// FIXME: If continuity changed anywhere between _lower_bound and _next_row.position()
// we need to redo the lookup with _lower_bound. There is no eviction yet, so not yet a problem.
assert(same_pos);
while (!is_buffer_full() && _state == state::reading_from_cache) {
copy_from_cache_to_buffer();
if (need_preempt()) {
break;
}
}
return make_ready_future<>();
});
}
inline
future<> cache_streamed_mutation::read_from_underlying() {
return consume_mutation_fragments_until(_read_context->get_streamed_mutation(),
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
[this] (mutation_fragment mf) {
_read_context->cache().on_row_miss();
maybe_add_to_cache(mf);
add_to_buffer(std::move(mf));
},
[this] {
_state = state::reading_from_cache;
_lsa_manager.run_in_update_section([this] {
auto same_pos = _next_row.maybe_refresh();
assert(same_pos); // FIXME: handle eviction
if (_next_row_in_range) {
maybe_update_continuity();
add_to_buffer(_next_row);
move_to_next_entry();
} else {
if (no_clustering_row_between(*_schema, _upper_bound, _next_row.position())) {
this->maybe_update_continuity();
} else {
// FIXME: Insert dummy entry at _upper_bound.
_read_context->cache().on_mispopulate();
}
move_to_next_range();
}
});
return make_ready_future<>();
});
}
inline
void cache_streamed_mutation::maybe_update_continuity() {
if (can_populate() && _next_row.is_in_latest_version()) {
if (_last_row_key) {
if (_next_row.previous_row_in_latest_version_has_key(*_last_row_key)) {
_next_row.set_continuous(true);
}
} else if (!_ck_ranges_curr->start()) {
_next_row.set_continuous(true);
}
} else {
_read_context->cache().on_mispopulate();
}
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const mutation_fragment& mf) {
if (mf.is_range_tombstone()) {
maybe_add_to_cache(mf.as_range_tombstone());
} else {
assert(mf.is_clustering_row());
const clustering_row& cr = mf.as_clustering_row();
maybe_add_to_cache(cr);
}
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const clustering_row& cr) {
if (!can_populate()) {
_read_context->cache().on_mispopulate();
return;
}
_lsa_manager.run_in_update_section_with_allocator([this, &cr] {
mutation_partition& mp = _snp->version()->partition();
rows_entry::compare less(*_schema);
// FIXME: If _next_row is up to date, but latest version doesn't have iterator in
// current row (could be far away, so we'd do this often), then this will do
// the lookup in mp. This is not necessary, because _next_row has iterators for
// next rows in each version, even if they're not part of the current row.
// They're currently buried in the heap, but you could keep a vector of
// iterators per each version in addition to the heap.
auto new_entry = alloc_strategy_unique_ptr<rows_entry>(
current_allocator().construct<rows_entry>(cr.key(), cr.tomb(), cr.marker(), cr.cells()));
new_entry->set_continuous(false);
auto it = _next_row.has_valid_row_from_latest_version()
? _next_row.get_iterator_in_latest_version() : mp.clustered_rows().lower_bound(cr.key(), less);
auto insert_result = mp.clustered_rows().insert_check(it, *new_entry, less);
if (insert_result.second) {
_read_context->cache().on_row_insert();
new_entry.release();
}
it = insert_result.first;
rows_entry& e = *it;
if (_last_row_key) {
if (it == mp.clustered_rows().begin()) {
// FIXME: check whether entry for _last_row_key is in older versions and if so set
// continuity to true.
_read_context->cache().on_mispopulate();
} else {
auto prev_it = it;
--prev_it;
clustering_key_prefix::equality eq(*_schema);
if (eq(*_last_row_key, prev_it->key())) {
e.set_continuous(true);
}
}
} else if (!_ck_ranges_curr->start()) {
e.set_continuous(true);
} else {
// FIXME: Insert dummy entry at _ck_ranges_curr->start()
_read_context->cache().on_mispopulate();
}
});
}
inline
bool cache_streamed_mutation::after_current_range(position_in_partition_view p) {
return _position_cmp(p, _upper_bound) >= 0;
}
inline
future<> cache_streamed_mutation::start_reading_from_underlying() {
_state = state::move_to_underlying;
return make_ready_future<>();
}
inline
void cache_streamed_mutation::copy_from_cache_to_buffer() {
position_in_partition_view next_lower_bound = _next_row.dummy() ? _next_row.position() : position_in_partition_view::after_key(_next_row.key());
for (auto&& rts : _snp->range_tombstones(*_schema, _lower_bound, _next_row_in_range ? next_lower_bound : _upper_bound)) {
add_to_buffer(std::move(rts));
if (is_buffer_full()) {
return;
}
}
if (_next_row_in_range) {
add_to_buffer(_next_row);
move_to_next_entry();
} else {
move_to_next_range();
}
}
inline
void cache_streamed_mutation::move_to_end() {
drain_tombstones();
_end_of_stream = true;
_state = state::end_of_stream;
}
inline
void cache_streamed_mutation::move_to_next_range() {
++_ck_ranges_curr;
if (_ck_ranges_curr == _ck_ranges_end) {
move_to_end();
} else {
move_to_current_range();
}
}
inline
void cache_streamed_mutation::move_to_current_range() {
_last_row_key = std::experimental::nullopt;
_lower_bound = position_in_partition::for_range_start(*_ck_ranges_curr);
_upper_bound = position_in_partition_view::for_range_end(*_ck_ranges_curr);
auto complete_until_next = _next_row.advance_to(_lower_bound) || _next_row.continuous();
_next_row_in_range = !after_current_range(_next_row.position());
if (!complete_until_next) {
start_reading_from_underlying();
}
}
// _next_row must be inside the range.
inline
void cache_streamed_mutation::move_to_next_entry() {
if (no_clustering_row_between(*_schema, _next_row.position(), _upper_bound)) {
move_to_next_range();
} else {
if (!_next_row.next()) {
move_to_end();
return;
}
_next_row_in_range = !after_current_range(_next_row.position());
if (!_next_row.continuous()) {
start_reading_from_underlying();
}
}
}
inline
void cache_streamed_mutation::drain_tombstones(position_in_partition_view pos) {
while (auto mfo = _tombstones.get_next(pos)) {
push_mutation_fragment(std::move(*mfo));
}
}
inline
void cache_streamed_mutation::drain_tombstones() {
while (auto mfo = _tombstones.get_next()) {
push_mutation_fragment(std::move(*mfo));
}
}
inline
void cache_streamed_mutation::add_to_buffer(mutation_fragment&& mf) {
if (mf.is_clustering_row()) {
add_clustering_row_to_buffer(std::move(mf));
} else {
assert(mf.is_range_tombstone());
add_to_buffer(std::move(mf).as_range_tombstone());
}
}
inline
void cache_streamed_mutation::add_to_buffer(const partition_snapshot_row_cursor& row) {
if (!row.dummy()) {
_read_context->cache().on_row_hit();
add_clustering_row_to_buffer(row.row());
}
}
inline
void cache_streamed_mutation::add_clustering_row_to_buffer(mutation_fragment&& mf) {
auto& row = mf.as_clustering_row();
drain_tombstones(row.position());
_last_row_key = row.key();
_lower_bound = position_in_partition::after_key(row.key());
push_mutation_fragment(std::move(mf));
}
inline
void cache_streamed_mutation::add_to_buffer(range_tombstone&& rt) {
// This guarantees that rt starts after any emitted clustering_row
if (!rt.trim_front(*_schema, _lower_bound)) {
return;
}
_lower_bound = position_in_partition(rt.position());
_tombstones.apply(std::move(rt));
drain_tombstones(_lower_bound);
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const range_tombstone& rt) {
if (can_populate()) {
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().row_tombstones().apply_monotonically(*_schema, rt);
});
} else {
_read_context->cache().on_mispopulate();
}
}
inline
void cache_streamed_mutation::maybe_add_to_cache(const static_row& sr) {
if (can_populate()) {
_read_context->cache().on_row_insert();
_lsa_manager.run_in_update_section_with_allocator([&] {
_snp->version()->partition().static_row().apply(*_schema, column_kind::static_column, sr.cells());
});
} else {
_read_context->cache().on_mispopulate();
}
}
inline
void cache_streamed_mutation::maybe_set_static_row_continuous() {
if (can_populate()) {
_snp->version()->partition().set_static_row_continuous(true);
} else {
_read_context->cache().on_mispopulate();
}
}
inline
bool cache_streamed_mutation::can_populate() const {
return _snp->at_latest_version() && _read_context->cache().phase_of(_read_context->key()) == _read_context->phase();
}
} // namespace cache
inline streamed_mutation make_cache_streamed_mutation(schema_ptr s,
dht::decorated_key dk,
query::clustering_key_filter_ranges crr,
row_cache& cache,
lw_shared_ptr<cache::read_context> ctx,
lw_shared_ptr<partition_snapshot> snp)
{
return make_streamed_mutation<cache::cache_streamed_mutation>(
std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);
}
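The `_tombstones` buffering described in the class comments above — overlapping, out-of-order range tombstones held back and only released once a fragment with a larger position arrives — can be sketched in miniature. Everything here is an illustrative stand-in: `tombstone_stream`, integer positions, and the `[start, end)` model are hypothetical simplifications of `range_tombstone_stream` and `position_in_partition`, not the real types:

```cpp
#include <queue>
#include <vector>

// Simplified model of out-of-order tombstone buffering: tombstones are
// accumulated in a min-heap keyed by start position, and drained only up to
// a given upper bound, so emitted start positions are always monotonic.
struct tombstone {
    int start;
    int end; // half-open [start, end)
};

class tombstone_stream {
    struct by_start {
        bool operator()(const tombstone& a, const tombstone& b) const {
            return a.start > b.start; // min-heap on start
        }
    };
    std::priority_queue<tombstone, std::vector<tombstone>, by_start> _q;
public:
    // Analogue of range_tombstone_stream::apply(): accept in any order.
    void apply(tombstone t) { _q.push(t); }

    // Analogue of drain_tombstones(pos): emit everything starting before
    // upper_bound, in position order.
    std::vector<tombstone> get_next(int upper_bound) {
        std::vector<tombstone> out;
        while (!_q.empty() && _q.top().start < upper_bound) {
            out.push_back(_q.top());
            _q.pop();
        }
        return out;
    }
};
```

Draining before pushing each clustering row (as `add_clustering_row_to_buffer()` does via `drain_tombstones(row.position())`) is what keeps the merged output stream monotonic.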


@@ -24,6 +24,7 @@
#include <boost/lexical_cast.hpp>
#include "exceptions/exceptions.hh"
#include "json.hh"
#include "seastarx.hh"
class schema;
@@ -58,30 +59,34 @@ class caching_options {
caching_options() : _key_cache(default_key), _row_cache(default_row) {}
public:
sstring to_sstring() const {
return json::to_json(std::map<sstring, sstring>({{ "keys", _key_cache }, { "rows_per_partition", _row_cache }}));
std::map<sstring, sstring> to_map() const {
return {{ "keys", _key_cache }, { "rows_per_partition", _row_cache }};
}
static caching_options from_sstring(const sstring& str) {
auto map = json::to_map(str);
if (map.size() > 2) {
throw exceptions::configuration_exception("Invalid map: " + str);
}
sstring k;
sstring r;
if (map.count("keys")) {
k = map.at("keys");
} else {
k = default_key;
}
sstring to_sstring() const {
return json::to_json(to_map());
}
if (map.count("rows_per_partition")) {
r = map.at("rows_per_partition");
} else {
r = default_row;
template<typename Map>
static caching_options from_map(const Map & map) {
sstring k = default_key;
sstring r = default_row;
for (auto& p : map) {
if (p.first == "keys") {
k = p.second;
} else if (p.first == "rows_per_partition") {
r = p.second;
} else {
throw exceptions::configuration_exception("Invalid caching option: " + p.first);
}
}
return caching_options(k, r);
}
static caching_options from_sstring(const sstring& str) {
return from_map(json::to_map(str));
}
bool operator==(const caching_options& other) const {
return _key_cache == other._key_cache && _row_cache == other._row_cache;
}
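The refactored `from_map()` above changes the parsing contract: instead of only checking the map's size, it now rejects any key other than `keys` and `rows_per_partition`, while missing keys fall back to defaults. A minimal standalone sketch of those rules — `caching_opts`, `parse_caching`, and the placeholder default strings are all hypothetical names, not the real `caching_options` API or its actual default values:

```cpp
#include <map>
#include <stdexcept>
#include <string>

// Illustrative restatement of the from_map() rules: defaults for absent keys,
// configuration error for unknown keys.
struct caching_opts {
    std::string keys = "ALL";               // placeholder default
    std::string rows_per_partition = "NONE"; // placeholder default
};

inline caching_opts parse_caching(const std::map<std::string, std::string>& m) {
    caching_opts o;
    for (auto& p : m) {
        if (p.first == "keys") {
            o.keys = p.second;
        } else if (p.first == "rows_per_partition") {
            o.rows_per_partition = p.second;
        } else {
            // Mirrors the configuration_exception thrown in the diff above.
            throw std::invalid_argument("Invalid caching option: " + p.first);
        }
    }
    return o;
}
```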


@@ -22,13 +22,28 @@
#pragma once
#include <boost/intrusive/unordered_set.hpp>
#if __has_include(<boost/container/small_vector.hpp>)
#include <boost/container/small_vector.hpp>
template <typename T, size_t N>
using small_vector = boost::container::small_vector<T, N>;
#else
#include <vector>
template <typename T, size_t N>
using small_vector = std::vector<T>;
#endif
#include "fnv1a_hasher.hh"
#include "streamed_mutation.hh"
#include "mutation_partition.hh"
class cells_range {
using ids_vector_type = boost::container::small_vector<column_id, 5>;
using ids_vector_type = small_vector<column_id, 5>;
position_in_partition_view _position;
ids_vector_type _ids;
@@ -121,7 +136,17 @@ public:
class locked_cell;
struct cell_locker_stats {
uint64_t lock_acquisitions = 0;
uint64_t operations_waiting_for_lock = 0;
};
class cell_locker {
public:
using timeout_clock = lowres_clock;
private:
using semaphore_type = basic_semaphore<default_timeout_exception_factory, timeout_clock>;
class partition_entry;
struct cell_address {
@@ -133,7 +158,7 @@ class cell_locker {
public enable_lw_shared_from_this<cell_entry> {
partition_entry& _parent;
cell_address _address;
semaphore _semaphore { 0 };
semaphore_type _semaphore { 0 };
friend class cell_locker;
public:
@@ -147,7 +172,7 @@ class cell_locker {
// temporarily removed from its parent partition_entry.
// Returns true if the cell_entry still exist in the new schema and
// should be reinserted.
bool upgrade(const schema& from, const schema& to, column_kind kind) {
bool upgrade(const schema& from, const schema& to, column_kind kind) noexcept {
auto& old_column_mapping = from.get_column_mapping();
auto& column = old_column_mapping.column_at(kind, _address.id);
auto cdef = to.get_column_definition(column.name());
@@ -162,15 +187,17 @@ class cell_locker {
return _address.position;
}
future<> lock() {
return _semaphore.wait();
future<> lock(timeout_clock::time_point _timeout) {
return _semaphore.wait(_timeout);
}
void unlock() {
_semaphore.signal();
}
~cell_entry() {
assert(is_linked());
if (!is_linked()) {
return;
}
unlink();
if (!--_parent._cell_count) {
delete &_parent;
@@ -219,14 +246,14 @@ class cell_locker {
bi::hash<cell_entry::hasher>,
bi::constant_time_size<false>>;
static constexpr size_t initial_bucket_count = 64;
static constexpr size_t initial_bucket_count = 16;
using max_load_factor = std::ratio<3, 4>;
dht::decorated_key _key;
cell_locker& _parent;
size_t _rehash_at_size = compute_rehash_at_size(initial_bucket_count);
std::unique_ptr<cells_type::bucket_type[]> _buckets; // TODO: start with internal storage?
size_t _cell_count = 0; // cells_type::empty() is not O(1) if the hook is auto-unlink
cells_type::bucket_type _internal_buckets[initial_bucket_count];
cells_type _cells;
schema_ptr _schema;
@@ -250,8 +277,7 @@ class cell_locker {
partition_entry(schema_ptr s, cell_locker& parent, const dht::decorated_key& dk)
: _key(dk)
, _parent(parent)
, _buckets(std::make_unique<cells_type::bucket_type[]>(initial_bucket_count))
, _cells(cells_type::bucket_traits(_buckets.get(), initial_bucket_count),
, _cells(cells_type::bucket_traits(_internal_buckets, initial_bucket_count),
cell_entry::hasher(*s), cell_entry::equal_compare(*s))
, _schema(s)
{ }
@@ -286,10 +312,9 @@ class cell_locker {
};
class equal_compare {
schema_ptr _schema;
dht::decorated_key_equals_comparator _cmp;
public:
explicit equal_compare(const schema s) : _cmp(s) { }
explicit equal_compare(const schema& s) : _cmp(s) { }
bool operator()(const dht::decorated_key& dk, const partition_entry& pe) {
return _cmp(dk, pe._key);
}
@@ -319,6 +344,7 @@ class cell_locker {
// partitions_type uses equality comparator which keeps a reference to the
// original schema, we must ensure that it doesn't die.
schema_ptr _original_schema;
cell_locker_stats& _stats;
friend class locked_cell;
private:
@@ -339,12 +365,13 @@ private:
}
}
public:
explicit cell_locker(schema_ptr s)
explicit cell_locker(schema_ptr s, cell_locker_stats& stats)
: _buckets(std::make_unique<partitions_type::bucket_type[]>(initial_bucket_count))
, _partitions(partitions_type::bucket_traits(_buckets.get(), initial_bucket_count),
partition_entry::hasher(), partition_entry::equal_compare(*s))
, _schema(s)
, _original_schema(std::move(s))
, _stats(stats)
{ }
~cell_locker() {
@@ -359,7 +386,8 @@ public:
}
// partition_cells_range is required to be in cell_locker::schema()
future<std::vector<locked_cell>> lock_cells(const dht::decorated_key& dk, partition_cells_range&& range);
future<std::vector<locked_cell>> lock_cells(const dht::decorated_key& dk, partition_cells_range&& range,
timeout_clock::time_point timeout);
};
@@ -386,46 +414,51 @@ struct cell_locker::locker {
partition_cells_range _range;
partition_cells_range::iterator _current_ck;
cells_range _cells_range;
cells_range::const_iterator _current_cell;
timeout_clock::time_point _timeout;
std::vector<locked_cell> _locks;
cell_locker_stats& _stats;
private:
void update_ck() {
if (!is_done()) {
_cells_range = *_current_ck;
_current_cell = _cells_range.begin();
_current_cell = _current_ck->begin();
}
}
future<> lock_next();
bool is_done() const { return _current_ck == _range.end(); }
std::vector<locked_cell> get() && { return std::move(_locks); }
public:
explicit locker(const ::schema& s, partition_entry& pe, partition_cells_range&& range)
explicit locker(const ::schema& s, cell_locker_stats& st, partition_entry& pe, partition_cells_range&& range, timeout_clock::time_point timeout)
: _hasher(s)
, _eq_cmp(s)
, _partition_entry(pe)
, _range(std::move(range))
, _current_ck(_range.begin())
, _timeout(timeout)
, _stats(st)
{
update_ck();
}
future<std::vector<locked_cell>> lock_all() && {
locker(const locker&) = delete;
locker(locker&&) = delete;
future<> lock_all() {
// Cannot defer before first call to lock_next().
return lock_next().then([this] {
return do_until([this] { return is_done(); }, [this] {
return lock_next();
}).then([&] {
return std::move(*this).get();
});
});
}
std::vector<locked_cell> get() && { return std::move(_locks); }
};
future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range) {
inline
future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_key& dk, partition_cells_range&& range, timeout_clock::time_point timeout) {
partition_entry::hasher pe_hash;
partition_entry::equal_compare pe_eq(*_schema);
@@ -447,6 +480,7 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
}
for (auto&& c : r) {
auto cell = make_lw_shared<cell_entry>(*partition, position_in_partition(r.position()), c);
_stats.lock_acquisitions++;
partition->insert(cell);
locks.emplace_back(std::move(cell));
}
@@ -460,14 +494,17 @@ future<std::vector<locked_cell>> cell_locker::lock_cells(const dht::decorated_ke
return make_ready_future<std::vector<locked_cell>>(std::move(locks));
}
return do_with(locker(*_schema, *it, std::move(range)), [] (auto& locker) mutable {
return std::move(locker).lock_all();
auto l = std::make_unique<locker>(*_schema, _stats, *it, std::move(range), timeout);
auto f = l->lock_all();
return f.then([l = std::move(l)] {
return std::move(*l).get();
});
}
inline
future<> cell_locker::locker::lock_next() {
while (!is_done()) {
if (_current_cell == _cells_range.end() || _cells_range.empty()) {
if (_current_cell == _current_ck->end()) {
++_current_ck;
update_ck();
continue;
@@ -475,35 +512,37 @@ future<> cell_locker::locker::lock_next() {
auto cid = *_current_cell++;
cell_address ca { position_in_partition(_cells_range.position()), cid };
cell_address ca { position_in_partition(_current_ck->position()), cid };
auto it = _partition_entry.cells().find(ca, _hasher, _eq_cmp);
if (it != _partition_entry.cells().end()) {
return it->lock().then([this, ce = it->shared_from_this()] () mutable {
_stats.operations_waiting_for_lock++;
return it->lock(_timeout).then([this, ce = it->shared_from_this()] () mutable {
_stats.operations_waiting_for_lock--;
_stats.lock_acquisitions++;
_locks.emplace_back(std::move(ce));
});
}
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_cells_range.position()), cid);
auto cell = make_lw_shared<cell_entry>(_partition_entry, position_in_partition(_current_ck->position()), cid);
_stats.lock_acquisitions++;
_partition_entry.insert(cell);
_locks.emplace_back(std::move(cell));
}
return make_ready_future<>();
}
inline
bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
if (_schema == new_schema) {
return true;
}
auto buckets = std::make_unique<cells_type::bucket_type[]>(initial_bucket_count);
auto buckets = std::make_unique<cells_type::bucket_type[]>(_cells.bucket_count());
auto cells = cells_type(cells_type::bucket_traits(buckets.get(), _cells.bucket_count()),
cell_entry::hasher(*new_schema), cell_entry::equal_compare(*new_schema));
while (!_cells.empty()) {
auto it = _cells.begin();
auto& cell = *it;
_cells.erase(it);
_cells.clear_and_dispose([&] (cell_entry* cell_ptr) noexcept {
auto& cell = *cell_ptr;
auto kind = cell.position().is_static_row() ? column_kind::static_column
: column_kind::regular_column;
auto reinsert = cell.upgrade(*_schema, *new_schema, kind);
@@ -512,9 +551,16 @@ bool cell_locker::partition_entry::upgrade(schema_ptr new_schema) {
} else {
_cell_count--;
}
}
});
// bi::unordered_set move assignment is actually a swap.
// Original _buckets cannot be destroyed before the container using them is
// so we need to explicitly make sure that the original _cells is no more.
_cells = std::move(cells);
auto destroy = [] (auto) { };
destroy(std::move(cells));
_buckets = std::move(buckets);
_schema = new_schema;
return _cell_count;
}
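The hunks above thread a `timeout_clock::time_point` through `lock_cells()` down to `cell_entry::lock(timeout)`, which waits on a semaphore with a deadline. A rough synchronous analogue of that per-cell timed lock — `cell_lock` is a hypothetical stand-in; seastar's `basic_semaphore::wait(timeout)` resolves a future and fails it with a timeout exception rather than returning `false` as sketched here:

```cpp
#include <chrono>
#include <condition_variable>
#include <mutex>

// Minimal timed lock: acquisition gives up at the deadline instead of
// blocking forever, mirroring the semaphore_type::wait(_timeout) call above.
class cell_lock {
    std::mutex _m;
    std::condition_variable _cv;
    bool _held = false;
public:
    bool lock(std::chrono::milliseconds timeout) {
        std::unique_lock<std::mutex> lk(_m);
        if (!_cv.wait_for(lk, timeout, [this] { return !_held; })) {
            return false; // deadline passed while the cell stayed locked
        }
        _held = true;
        return true;
    }
    void unlock() {
        std::lock_guard<std::mutex> lk(_m);
        _held = false;
        _cv.notify_one();
    }
};
```

Bounding the wait is what lets a write stuck behind a long-held cell lock fail cleanly instead of piling up indefinitely (the new `operations_waiting_for_lock` counter tracks exactly those waiters).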


@@ -112,6 +112,11 @@ public:
});
}
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
return do_io_check(_error_handler, [&] {
return get_file_impl(_file)->dma_read_bulk(offset, range_size, pc);
});
}
private:
const io_error_handler& _error_handler;
file _file;


@@ -19,6 +19,6 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "gc_clock.hh"
#include "clocks-impl.hh"
std::atomic<int64_t> clocks_offset;

clocks-impl.hh (new file, 49 lines)

@@ -0,0 +1,49 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>
extern std::atomic<int64_t> clocks_offset;
template<typename Duration>
static inline void forward_jump_clocks(Duration delta)
{
auto d = std::chrono::duration_cast<std::chrono::seconds>(delta).count();
clocks_offset.fetch_add(d, std::memory_order_relaxed);
}
static inline std::chrono::seconds get_clocks_offset()
{
auto off = clocks_offset.load(std::memory_order_relaxed);
return std::chrono::seconds(off);
}
// Returns a time point which is earlier from t by d, or minimum time point if it cannot be represented.
template<typename Clock, typename Duration, typename Rep, typename Period>
inline
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t, std::chrono::duration<Rep, Period> d) -> decltype(t) {
return std::max(t, decltype(t)::min() + d) - d;
}
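`saturating_subtract` clamps instead of overflowing: subtracting a duration from a time point near the clock's minimum yields `min()` rather than wrapping around. The template below is the one from the hunk above (minus `inline`), shown standalone to make the clamping behavior concrete:

```cpp
#include <algorithm>
#include <chrono>

// Returns a time point which is earlier than t by d, or the minimum time
// point if that cannot be represented (no signed underflow).
template<typename Clock, typename Duration, typename Rep, typename Period>
auto saturating_subtract(std::chrono::time_point<Clock, Duration> t,
                         std::chrono::duration<Rep, Period> d) -> decltype(t) {
    // If t < min() + d, plain subtraction would underflow; clamp t up first.
    return std::max(t, decltype(t)::min() + d) - d;
}
```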


@@ -54,8 +54,8 @@ static inline bound_kind flip_bound_kind(bound_kind bk)
}
class bound_view {
const static thread_local clustering_key empty_prefix;
public:
const static thread_local clustering_key empty_prefix;
const clustering_key_prefix& prefix;
bound_kind kind;
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
@@ -133,20 +133,33 @@ public:
static bound_view top() {
return {empty_prefix, bound_kind::incl_end};
}
/*
template<template<typename> typename T, typename U>
concept bool Range() {
return requires (T<U> range) {
{ range.start() } -> stdx::optional<U>;
{ range.end() } -> stdx::optional<U>;
};
};*/
template<template<typename> typename Range>
static std::pair<bound_view, bound_view> from_range(const Range<clustering_key_prefix>& range) {
return {
range.start() ? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start) : bottom(),
range.end() ? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end) : top(),
};
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
static bound_view from_range_start(const R<clustering_key_prefix>& range) {
return range.start()
? bound_view(range.start()->value(), range.start()->is_inclusive() ? bound_kind::incl_start : bound_kind::excl_start)
: bottom();
}
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
static bound_view from_range_end(const R<clustering_key_prefix>& range) {
return range.end()
? bound_view(range.end()->value(), range.end()->is_inclusive() ? bound_kind::incl_end : bound_kind::excl_end)
: top();
}
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix> )
static std::pair<bound_view, bound_view> from_range(const R<clustering_key_prefix>& range) {
return {from_range_start(range), from_range_end(range)};
}
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
if (&bv.prefix == &empty_prefix) {
return {};
}
bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
}
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";


@@ -54,6 +54,7 @@ public:
auto end() const { return _ref.end(); }
bool empty() const { return _ref.empty(); }
size_t size() const { return _ref.size(); }
const clustering_row_ranges& ranges() const { return _ref; }
static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
const query::clustering_row_ranges& ranges = slice.row_ranges(schema, key);

clustering_ranges_walker.hh (new file, 219 lines)

@@ -0,0 +1,219 @@
/*
* Copyright (C) 2017 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "schema.hh"
#include "query-request.hh"
#include "streamed_mutation.hh"
// Utility for in-order checking of overlap with position ranges.
class clustering_ranges_walker {
const schema& _schema;
const query::clustering_row_ranges& _ranges;
query::clustering_row_ranges::const_iterator _current;
query::clustering_row_ranges::const_iterator _end;
bool _in_current; // next position is known to be >= _current_start
bool _with_static_row;
position_in_partition_view _current_start;
position_in_partition_view _current_end;
stdx::optional<position_in_partition> _trim;
size_t _change_counter = 1;
private:
bool advance_to_next_range() {
_in_current = false;
if (!_current_start.is_static_row()) {
if (_current == _end) {
return false;
}
++_current;
}
++_change_counter;
if (_current == _end) {
_current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
return false;
}
_current_start = position_in_partition_view::for_range_start(*_current);
_current_end = position_in_partition_view::for_range_end(*_current);
return true;
}
public:
clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
: _schema(s)
, _ranges(ranges)
, _current(ranges.begin())
, _end(ranges.end())
, _in_current(with_static_row)
, _with_static_row(with_static_row)
, _current_start(position_in_partition_view::for_static_row())
, _current_end(position_in_partition_view::before_all_clustered_rows())
{
if (!with_static_row) {
if (_current == _end) {
_current_start = position_in_partition_view::before_all_clustered_rows();
} else {
_current_start = position_in_partition_view::for_range_start(*_current);
_current_end = position_in_partition_view::for_range_end(*_current);
}
}
}
clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
: _schema(o._schema)
, _ranges(o._ranges)
, _current(o._current)
, _end(o._end)
, _in_current(o._in_current)
, _with_static_row(o._with_static_row)
, _current_start(o._current_start)
, _current_end(o._current_end)
, _trim(std::move(o._trim))
, _change_counter(o._change_counter)
{ }
clustering_ranges_walker& operator=(clustering_ranges_walker&& o) {
if (this != &o) {
this->~clustering_ranges_walker();
new (this) clustering_ranges_walker(std::move(o));
}
return *this;
}
// Excludes positions smaller than pos from the ranges.
// pos should be monotonic.
// No constraints between pos and positions passed to advance_to().
//
// After the invocation, when !out_of_range(), lower_bound() returns the smallest position still contained.
void trim_front(position_in_partition pos) {
position_in_partition::less_compare less(_schema);
do {
if (!less(_current_start, pos)) {
break;
}
if (less(pos, _current_end)) {
_trim = std::move(pos);
_current_start = *_trim;
_in_current = false;
++_change_counter;
break;
}
} while (advance_to_next_range());
}
// Returns true if given position is contained.
// Must be called with monotonic positions.
// Idempotent.
bool advance_to(position_in_partition_view pos) {
position_in_partition::less_compare less(_schema);
do {
if (!_in_current && less(pos, _current_start)) {
break;
}
// All subsequent clustering keys are larger than the start of this
// range so there is no need to check that again.
_in_current = true;
if (less(pos, _current_end)) {
return true;
}
} while (advance_to_next_range());
return false;
}
// Returns true if the range expressed by start and end (as in position_range) overlaps
// with clustering ranges.
// Must be called with monotonic start position. That position must also be greater than
// the last position passed to the other advance_to() overload.
// Idempotent.
bool advance_to(position_in_partition_view start, position_in_partition_view end) {
position_in_partition::less_compare less(_schema);
do {
if (!less(_current_start, end)) {
break;
}
if (less(start, _current_end)) {
return true;
}
} while (advance_to_next_range());
return false;
}
// Returns true if the range tombstone expressed by start and end (as in position_range) overlaps
// with clustering ranges.
// No monotonicity restrictions on argument values across calls.
// Does not affect lower_bound().
// Idempotent.
bool contains_tombstone(position_in_partition_view start, position_in_partition_view end) const {
position_in_partition::less_compare less(_schema);
if (_trim && less(end, *_trim)) {
return false;
}
auto i = _current;
while (i != _end) {
auto range_start = position_in_partition_view::for_range_start(*i);
if (less(end, range_start)) {
return false;
}
auto range_end = position_in_partition_view::for_range_end(*i);
if (less(start, range_end)) {
return true;
}
++i;
}
return false;
}
// Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
bool out_of_range() const {
return !_in_current && _current == _end;
}
// Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
// Any range trimmings still hold after this.
void reset() {
auto trim = std::move(_trim);
auto ctr = _change_counter;
*this = clustering_ranges_walker(_schema, _ranges, _with_static_row);
_change_counter = ctr + 1;
if (trim) {
trim_front(std::move(*trim));
}
}
// Can be called only when !out_of_range()
position_in_partition_view lower_bound() const {
return _current_start;
}
// Changes whenever lower_bound() changes.
// Always > 0.
size_t lower_bound_change_counter() const {
return _change_counter;
}
};


@@ -39,6 +39,7 @@ class compaction_strategy_impl;
class sstable;
class sstable_set;
struct compaction_descriptor;
struct resharding_descriptor;
class compaction_strategy {
::shared_ptr<compaction_strategy_impl> _compaction_strategy_impl;
@@ -54,6 +55,8 @@ public:
// Return a list of sstables to be compacted after applying the strategy.
compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<lw_shared_ptr<sstable>> candidates);
std::vector<resharding_descriptor> get_resharding_jobs(column_family& cf, std::vector<lw_shared_ptr<sstable>> candidates);
// Some strategies may look at the compacted and resulting sstables to
// get some useful information for subsequent compactions.
void notify_completion(const std::vector<lw_shared_ptr<sstable>>& removed, const std::vector<lw_shared_ptr<sstable>>& added);


@@ -130,10 +130,10 @@ public:
bytes decompose_value(const value_type& values) {
return serialize_value(values);
}
class iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
class iterator : public std::iterator<std::input_iterator_tag, const bytes_view> {
private:
bytes_view _v;
value_type _current;
bytes_view _current;
private:
void read_current() {
size_type len;
@@ -220,6 +220,9 @@ public:
assert(AllowPrefixes == allow_prefixes::yes);
return std::distance(begin(v), end(v)) == (ssize_t)_types.size();
}
bool is_empty(bytes_view v) const {
return begin(v) == end(v);
}
void validate(bytes_view v) {
// FIXME: implement
warn(unimplemented::cause::VALIDATION);


@@ -184,6 +184,8 @@ bytes to_legacy(CompoundType& type, bytes_view packed) {
return legacy_form;
}
class composite_view;
// Represents a value serialized according to Origin's CompositeType.
// If is_compound is true, then the value is one or more components encoded as:
//
@@ -202,7 +204,7 @@ public:
, _is_compound(is_compound)
{ }
composite(bytes&& b)
explicit composite(bytes&& b)
: _bytes(std::move(b))
, _is_compound(true)
{ }
@@ -239,7 +241,7 @@ public:
using component_view = std::pair<bytes_view, eoc>;
private:
template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
static size_t size(Value& val) {
static size_t size(const Value& val) {
return val.size();
}
static size_t size(const data_value& val) {
@@ -304,23 +306,36 @@ public:
return f(const_cast<bytes&>(_bytes));
}
// marker is ignored if !is_compound
template<typename RangeOfSerializedComponents>
static bytes serialize_value(RangeOfSerializedComponents&& values, bool is_compound = true) {
static composite serialize_value(RangeOfSerializedComponents&& values, bool is_compound = true, eoc marker = eoc::none) {
auto size = serialized_size(values, is_compound);
bytes b(bytes::initialized_later(), size);
auto i = b.begin();
serialize_value(std::forward<decltype(values)>(values), i, is_compound);
return b;
if (is_compound && !b.empty()) {
b.back() = eoc_type(marker);
}
return composite(std::move(b), is_compound);
}
template<typename RangeOfSerializedComponents>
static composite serialize_static(const schema& s, RangeOfSerializedComponents&& values) {
// FIXME: Optimize
auto b = bytes(size_t(2), bytes::value_type(0xff));
std::vector<bytes_view> sv(s.clustering_key_size());
b += composite::serialize_value(boost::range::join(sv, std::forward<RangeOfSerializedComponents>(values)), true).release_bytes();
return composite(std::move(b));
}
static eoc to_eoc(int8_t eoc_byte) {
return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
}
class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
bytes_view _v;
component_view _current;
private:
eoc to_eoc(int8_t eoc_byte) {
return eoc_byte == 0 ? eoc::none : (eoc_byte < 0 ? eoc::start : eoc::end);
}
void read_current() {
size_type len;
{
@@ -406,6 +421,10 @@ public:
return _bytes;
}
bytes release_bytes() && {
return std::move(_bytes);
}
size_t size() const {
return _bytes.size();
}
@@ -426,26 +445,20 @@ public:
return _is_compound;
}
// The following factory functions assume this composite is a compound value.
template <typename ClusteringElement>
static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
return serialize_value(ce.components(s));
return serialize_value(ce.components(s), s.is_compound());
}
static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
if (v.size() == 0) {
return bytes(size_t(1), bytes::value_type(marker));
return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
}
auto b = serialize_value(v);
b.back() = eoc_type(marker);
return composite(std::move(b));
return serialize_value(v, is_compound, marker);
}
static composite static_prefix(const schema& s) {
static bytes static_marker(size_t(2), bytes::value_type(0xff));
std::vector<bytes_view> sv(s.clustering_key_size());
return static_marker + serialize_value(sv);
return serialize_static(s, std::vector<bytes_view>());
}
explicit operator bytes_view() const {
@@ -456,6 +469,15 @@ public:
friend inline std::ostream& operator<<(std::ostream& os, const std::pair<Component, eoc>& c) {
return os << "{value=" << c.first << "; eoc=" << sprint("0x%02x", eoc_type(c.second) & 0xff) << "}";
}
friend std::ostream& operator<<(std::ostream& os, const composite& v);
struct tri_compare {
const std::vector<data_type>& _types;
tri_compare(const std::vector<data_type>& types) : _types(types) {}
int operator()(const composite&, const composite&) const;
int operator()(composite_view, composite_view) const;
};
};
class composite_view final {
@@ -476,14 +498,15 @@ public:
, _is_compound(true)
{ }
std::vector<bytes> explode() const {
std::vector<bytes_view> explode() const {
if (!_is_compound) {
return { to_bytes(_bytes) };
return { _bytes };
}
std::vector<bytes> ret;
std::vector<bytes_view> ret;
ret.reserve(8);
for (auto it = begin(), e = end(); it != e; ) {
ret.push_back(to_bytes(it->first));
ret.push_back(it->first);
auto marker = it->second;
++it;
if (it != e && marker != composite::eoc::none) {
@@ -505,6 +528,15 @@ public:
return { begin(), end() };
}
composite::eoc last_eoc() const {
if (!_is_compound || _bytes.empty()) {
return composite::eoc::none;
}
bytes_view v(_bytes);
v.remove_prefix(v.size() - 1);
return composite::to_eoc(read_simple<composite::eoc_type>(v));
}
auto values() const {
return components() | boost::adaptors::transformed([](auto&& c) { return c.first; });
}
@@ -527,4 +559,46 @@ public:
bool operator==(const composite_view& k) const { return k._bytes == _bytes && k._is_compound == _is_compound; }
bool operator!=(const composite_view& k) const { return !(k == *this); }
friend inline std::ostream& operator<<(std::ostream& os, composite_view v) {
return os << "{" << ::join(", ", v.components()) << ", compound=" << v._is_compound << ", static=" << v.is_static() << "}";
}
};
inline
std::ostream& operator<<(std::ostream& os, const composite& v) {
return os << composite_view(v);
}
inline
int composite::tri_compare::operator()(const composite& v1, const composite& v2) const {
return (*this)(composite_view(v1), composite_view(v2));
}
inline
int composite::tri_compare::operator()(composite_view v1, composite_view v2) const {
// See org.apache.cassandra.db.composites.AbstractCType#compare
if (v1.empty()) {
return v2.empty() ? 0 : -1;
}
if (v2.empty()) {
return 1;
}
if (v1.is_static() != v2.is_static()) {
return v1.is_static() ? -1 : 1;
}
auto a_values = v1.components();
auto b_values = v2.components();
auto cmp = [&](const data_type& t, component_view c1, component_view c2) {
// First by value, then by EOC
auto r = t->compare(c1.first, c2.first);
if (r) {
return r;
}
return static_cast<int>(c1.second) - static_cast<int>(c2.second);
};
return lexicographical_tri_compare(_types.begin(), _types.end(),
a_values.begin(), a_values.end(),
b_values.begin(), b_values.end(),
cmp);
}


@@ -89,6 +89,15 @@ listen_address: localhost
# For security reasons, you should not expose this port to the internet. Firewall it if needed.
native_transport_port: 9042
# Enabling native transport encryption in client_encryption_options allows you to either use
# encryption for the standard port or to use a dedicated, additional port along with the unencrypted
# standard native_transport_port.
# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption
# for native_transport_port. Setting native_transport_port_ssl to a different value
# from native_transport_port will use encryption for native_transport_port_ssl while
# keeping native_transport_port unencrypted.
#native_transport_port_ssl: 9142
# Throttles all outbound streaming file transfers on this node to the
# given total throughput in Mbps. This is necessary because Scylla does
# mostly sequential IO when streaming data during bootstrap or repair, which
@@ -192,6 +201,9 @@ api_address: 127.0.0.1
# Caution should be taken on increasing the size of this threshold as it can lead to node instability.
batch_size_warn_threshold_in_kb: 5
# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default.
batch_size_fail_threshold_in_kb: 50
# Authentication backend, identifying users
# Out of the box, Scylla provides org.apache.cassandra.auth.{AllowAllAuthenticator,
# PasswordAuthenticator}.
@@ -223,6 +235,9 @@ batch_size_warn_threshold_in_kb: 5
# be set.
# broadcast_rpc_address: 1.2.3.4
# Uncomment to enable experimental features
# experimental: true
###################################################
## Not currently supported, reserved for future use
###################################################
@@ -279,28 +294,6 @@ batch_size_warn_threshold_in_kb: 5
#
partitioner: org.apache.cassandra.dht.Murmur3Partitioner
# policy for data disk failures:
# die: shut down gossip and Thrift and kill the JVM for any fs errors or
# single-sstable errors, so the node can be replaced.
# stop_paranoid: shut down gossip and Thrift even for single-sstable errors.
# stop: shut down gossip and Thrift, leaving the node effectively dead, but
# can still be inspected via JMX.
# best_effort: stop using the failed disk and respond to requests based on
# remaining available sstables. This means you WILL see obsolete
# data at CL.ONE!
# ignore: ignore fatal errors and let requests fail, as in pre-1.2 Scylla
# disk_failure_policy: stop
# policy for commit disk failures:
# die: shut down gossip and Thrift and kill the JVM, so the node can be replaced.
# stop: shut down gossip and Thrift, leaving the node effectively dead, but
# can still be inspected via JMX.
# stop_commit: shutdown the commit log, letting writes collect but
# continuing to service reads, as in pre-2.0.5 Scylla
# ignore: ignore fatal errors and let the batches fail
# commit_failure_policy: stop
# Maximum size of the key cache in memory.
#
# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the
@@ -727,22 +720,17 @@ commitlog_total_space_in_mb: -1
# certificate: conf/scylla.crt
# keyfile: conf/scylla.key
# truststore: <none, use system trust>
# require_client_auth: False
# priority_string: <none, use default>
# enable or disable client/server encryption.
# client_encryption_options:
# enabled: false
# certificate: conf/scylla.crt
# keyfile: conf/scylla.key
# require_client_auth: false
# Set trustore and truststore_password if require_client_auth is true
# truststore: conf/.truststore
# truststore_password: cassandra
# More advanced defaults below:
# protocol: TLS
# algorithm: SunX509
# store_type: JKS
# cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
# truststore: <none, use system trust>
# require_client_auth: False
# priority_string: <none, use default>
# internode_compression controls whether traffic between nodes is
# compressed.
@@ -788,3 +776,23 @@ commitlog_total_space_in_mb: -1
# By default, Scylla binds all interfaces to the prometheus API
# It is possible to restrict the listening address to a specific one
# prometheus_address: 0.0.0.0
# Distribution of data among cores (shards) within a node
#
# Scylla distributes data within a node among shards, using a round-robin
# strategy:
# [shard0] [shard1] ... [shardN-1] [shard0] [shard1] ... [shardN-1] ...
#
# Scylla versions 1.6 and below used just one repetition of the pattern;
# this interfered with data placement among nodes (vnodes).
#
# Scylla versions 1.7 and above use 4096 repetitions of the pattern; this
# provides for better data distribution.
#
# the value below is log (base 2) of the number of repetitions.
#
# Set to 0 to avoid rewriting all data when upgrading from Scylla 1.6 and
# below.
#
# Keep at 12 for new clusters.
murmur3_partitioner_ignore_msb_bits: 12


@@ -34,7 +34,7 @@ for line in open('/etc/os-release'):
os_ids += value.split(' ')
# distribution "internationalization", converting package names.
# Fedora name is key, values is distro -> package name dict.
i18n_xlat = {
'boost-devel': {
'debian': 'libboost-dev',
@@ -48,7 +48,7 @@ def pkgname(name):
for id in os_ids:
if id in dict:
return dict[id]
return name
def get_flags():
with open('/proc/cpuinfo') as f:
@@ -93,7 +93,7 @@ def try_compile(compiler, source = '', flags = []):
def warning_supported(warning, compiler):
# gcc ignores -Wno-x even if it is not supported
adjusted = re.sub('^-Wno-', '-W', warning)
return try_compile(flags = [adjusted], compiler = compiler)
return try_compile(flags = ['-Werror', adjusted], compiler = compiler)
def debug_flag(compiler):
src_with_auto = textwrap.dedent('''\
@@ -175,6 +175,8 @@ scylla_tests = [
'tests/keys_test',
'tests/partitioner_test',
'tests/frozen_mutation_test',
'tests/serialized_action_test',
'tests/clustering_ranges_walker_test',
'tests/perf/perf_mutation',
'tests/lsa_async_eviction_test',
'tests/lsa_sync_eviction_test',
@@ -183,6 +185,9 @@ scylla_tests = [
'tests/perf/perf_hash',
'tests/perf/perf_cql_parser',
'tests/perf/perf_simple_query',
'tests/perf/perf_fast_forward',
'tests/cache_streamed_mutation_test',
'tests/row_cache_stress_test',
'tests/memory_footprint',
'tests/perf/perf_sstable',
'tests/cql_query_test',
@@ -194,6 +199,7 @@ scylla_tests = [
'tests/test-serialization',
'tests/sstable_test',
'tests/sstable_mutation_test',
'tests/sstable_resharding_test',
'tests/memtable_test',
'tests/commitlog_test',
'tests/cartesian_product_test',
@@ -215,6 +221,7 @@ scylla_tests = [
'tests/murmur_hash_test',
'tests/allocation_strategy_test',
'tests/logalloc_test',
'tests/log_histogram_test',
'tests/managed_vector_test',
'tests/crc_test',
'tests/flush_queue_test',
@@ -230,6 +237,8 @@ scylla_tests = [
'tests/virtual_reader_test',
'tests/view_schema_test',
'tests/counter_test',
'tests/cell_locker_test',
'tests/loading_cache_test',
]
apps = [
@@ -260,6 +269,8 @@ arg_parser.add_argument('--ldflags', action = 'store', dest = 'user_ldflags', de
help = 'Extra flags for the linker')
arg_parser.add_argument('--compiler', action = 'store', dest = 'cxx', default = 'g++',
help = 'C++ compiler path')
arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='gcc',
help='C compiler path')
arg_parser.add_argument('--with-osv', action = 'store', dest = 'with_osv', default = '',
help = 'Shortcut for compile for OSv')
arg_parser.add_argument('--enable-dpdk', action = 'store_true', dest = 'dpdk', default = False,
@@ -280,6 +291,10 @@ arg_parser.add_argument('--python', action = 'store', dest = 'python', default =
help = 'Python3 path')
add_tristate(arg_parser, name = 'hwloc', dest = 'hwloc', help = 'hwloc support')
add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
arg_parser.add_argument('--enable-gcc6-concepts', dest='gcc6_concepts', action='store_true', default=False,
help='enable experimental support for C++ Concepts as implemented in GCC 6')
arg_parser.add_argument('--enable-alloc-failure-injector', dest='alloc_failure_injector', action='store_true', default=False,
help='enable allocation failure injection')
args = arg_parser.parse_args()
defines = []
@@ -343,6 +358,7 @@ scylla_core = (['database.cc',
'cql3/statements/create_view_statement.cc',
'cql3/statements/create_type_statement.cc',
'cql3/statements/create_user_statement.cc',
'cql3/statements/drop_index_statement.cc',
'cql3/statements/drop_keyspace_statement.cc',
'cql3/statements/drop_table_statement.cc',
'cql3/statements/drop_view_statement.cc',
@@ -408,16 +424,22 @@ scylla_core = (['database.cc',
'cql3/selection/selector.cc',
'cql3/restrictions/statement_restrictions.cc',
'cql3/result_set.cc',
'cql3/variable_specifications.cc',
'db/consistency_level.cc',
'db/system_keyspace.cc',
'db/schema_tables.cc',
'db/cql_type_parser.cc',
'db/legacy_schema_migrator.cc',
'db/commitlog/commitlog.cc',
'db/commitlog/commitlog_replayer.cc',
'db/commitlog/commitlog_entry.cc',
'db/config.cc',
'db/heat_load_balance.cc',
'db/index/secondary_index.cc',
'db/marshal/type_parser.cc',
'db/batchlog_manager.cc',
'db/view/view.cc',
'index/secondary_index_manager.cc',
'io/io.cc',
'utils/utils.cc',
'utils/UUID_gen.cc',
@@ -438,6 +460,7 @@ scylla_core = (['database.cc',
'gms/gossip_digest_ack2.cc',
'gms/endpoint_state.cc',
'gms/application_state.cc',
'gms/inet_address.cc',
'dht/i_partitioner.cc',
'dht/murmur3_partitioner.cc',
'dht/byte_ordered_partitioner.cc',
@@ -465,7 +488,7 @@ scylla_core = (['database.cc',
'service/client_state.cc',
'service/migration_task.cc',
'service/storage_service.cc',
'service/load_broadcaster.cc',
'service/misc_services.cc',
'service/pager/paging_state.cc',
'service/pager/query_pagers.cc',
'streaming/stream_task.cc',
@@ -481,12 +504,12 @@ scylla_core = (['database.cc',
'streaming/stream_manager.cc',
'streaming/stream_result_future.cc',
'streaming/stream_session_state.cc',
'gc_clock.cc',
'clocks-impl.cc',
'partition_slice_builder.cc',
'init.cc',
'lister.cc',
'repair/repair.cc',
'exceptions/exceptions.cc',
'dns.cc',
'auth/auth.cc',
'auth/authenticated_user.cc',
'auth/authenticator.cc',
@@ -562,6 +585,7 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/commitlog.idl.hh',
'idl/tracing.idl.hh',
'idl/consistency_level.idl.hh',
'idl/cache_temperature.idl.hh',
]
scylla_tests_dependencies = scylla_core + api + idls + [
@@ -607,6 +631,8 @@ tests_not_using_seastar_test_framework = set([
'tests/perf/perf_cql_parser',
'tests/message',
'tests/perf/perf_simple_query',
'tests/perf/perf_fast_forward',
'tests/row_cache_stress_test',
'tests/memory_footprint',
'tests/gossip',
'tests/perf/perf_sstable',
@@ -619,30 +645,39 @@ for t in tests_not_using_seastar_test_framework:
for t in scylla_tests:
deps[t] = [t + '.cc']
if t not in tests_not_using_seastar_test_framework:
deps[t] += scylla_tests_dependencies
deps[t] += scylla_tests_seastar_deps
else:
deps[t] += scylla_core + api + idls + ['tests/cql_test_env.cc']
deps['tests/sstable_test'] += ['tests/sstable_datafile_test.cc']
deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc']
deps['tests/bytes_ostream_test'] = ['tests/bytes_ostream_test.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/input_stream_test'] = ['tests/input_stream_test.cc']
deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc']
deps['tests/UUID_test'] = ['utils/UUID_gen.cc', 'tests/UUID_test.cc', 'utils/uuid.cc', 'utils/managed_bytes.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/murmur_hash_test'] = ['bytes.cc', 'utils/murmur_hash.cc', 'tests/murmur_hash_test.cc']
deps['tests/allocation_strategy_test'] = ['tests/allocation_strategy_test.cc', 'utils/logalloc.cc', 'utils/dynamic_bitset.cc']
deps['tests/log_histogram_test'] = ['tests/log_histogram_test.cc']
deps['tests/anchorless_list_test'] = ['tests/anchorless_list_test.cc']
warnings = [
'-Wno-mismatched-tags', # clang-only
'-Wno-maybe-uninitialized', # false positives on gcc 5
'-Wno-tautological-compare',
'-Wno-parentheses-equality',
'-Wno-c++11-narrowing',
'-Wno-c++1z-extensions',
'-Wno-sometimes-uninitialized',
'-Wno-return-stack-address',
'-Wno-missing-braces',
'-Wno-unused-lambda-capture',
]
warnings = [w
for w in warnings
if warning_supported(warning = w, compiler = args.cxx)]
warnings = ' '.join(warnings)
warnings = ' '.join(warnings + ['-Wno-error=deprecated-declarations'])
dbgflag = debug_flag(args.cxx) if args.debuginfo else ''
tests_link_rule = 'link' if args.tests_debuginfo else 'link_stripped'
@@ -696,6 +731,9 @@ if not try_compile(compiler=args.cxx, source='''\
print('Installed boost version too old. Please update {}.'.format(pkgname("boost-devel")))
sys.exit(1)
has_sanitize_address_use_after_scope = try_compile(compiler=args.cxx, flags=['-fsanitize-address-use-after-scope'], source='int f() {}')
defines = ' '.join(['-D' + d for d in defines])
globals().update(vars(args))
@@ -718,7 +756,7 @@ scylla_release = file.read().strip()
extra_cxxflags["release.cc"] = "-DSCYLLA_VERSION=\"\\\"" + scylla_version + "\\\"\" -DSCYLLA_RELEASE=\"\\\"" + scylla_release + "\\\"\""
seastar_flags = ['--disable-xen']
seastar_flags = []
if args.dpdk:
# fake dependencies on dpdk, so that it is built before anything else
seastar_flags += ['--enable-dpdk']
@@ -728,9 +766,13 @@ if args.staticcxx:
seastar_flags += ['--static-stdc++']
if args.staticboost:
seastar_flags += ['--static-boost']
if args.gcc6_concepts:
seastar_flags += ['--enable-gcc6-concepts']
if args.alloc_failure_injector:
seastar_flags += ['--enable-alloc-failure-injector']
seastar_cflags = args.user_cflags + " -march=nehalem"
seastar_flags += ['--compiler', args.cxx, '--cflags=%s' % (seastar_cflags)]
seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags)]
status = subprocess.call([python, './configure.py'] + seastar_flags, cwd = 'seastar')
@@ -825,7 +867,7 @@ with open(buildfile, 'w') as f:
f.write(textwrap.dedent('''\
cxxflags_{mode} = -I. -I $builddir/{mode}/gen -I seastar -I seastar/build/{mode}/gen
rule cxx.{mode}
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} -c -o $out $in
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
description = CXX $out
depfile = $out.d
rule link.{mode}
@@ -843,7 +885,16 @@ with open(buildfile, 'w') as f:
command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
description = THRIFT $in
rule antlr3.{mode}
command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in && antlr3 $builddir/{mode}/gen/$in && sed -i 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' build/{mode}/gen/${{stem}}Parser.cpp
# We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
# Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
# name, we also add a global typedef to avoid compilation errors.
command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
&& antlr3 $builddir/{mode}/gen/$in $
&& sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
-e '1i using ExceptionBaseType = int;' $
-e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
s/ExceptionBaseType\* ex = new/ex = new/' $
build/{mode}/gen/${{stem}}Parser.cpp
description = ANTLR3 $in
''').format(mode = mode, **modeval))
f.write('build {mode}: phony {artifacts}\n'.format(mode = mode,
@@ -886,7 +937,7 @@ with open(buildfile, 'w') as f:
if binary.startswith('tests/'):
local_libs = '$libs'
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
local_libs += ' ' + maybe_static(args.staticboost, '-lboost_unit_test_framework')
if has_thrift:
local_libs += ' ' + thrift_libs + ' ' + maybe_static(args.staticboost, '-lboost_system')
# Our code's debugging information is huge, and multiplied
@@ -943,7 +994,7 @@ with open(buildfile, 'w') as f:
f.write('build {}: ragel {}\n'.format(hh, src))
for hh in swaggers:
src = swaggers[hh]
f.write('build {}: swagger {}\n'.format(hh,src))
f.write('build {}: swagger {} | seastar/json/json2code.py\n'.format(hh,src))
for hh in serializers:
src = serializers[hh]
f.write('build {}: serializer {} | idl-compiler.py\n'.format(hh,src))
@@ -960,6 +1011,9 @@ with open(buildfile, 'w') as f:
for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
obj = cc.replace('.cpp', '.o')
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
# Parsers end up using huge amounts of stack space and overflowing their stack
f.write(' obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
.format(**locals()))
f.write(' pool = seastar_pool\n')


@@ -22,6 +22,7 @@
#pragma once
#include "mutation_partition_view.hh"
#include "mutation_partition.hh"
#include "schema.hh"
// Mutation partition visitor which applies visited data into
@@ -37,12 +38,12 @@ private:
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
}
void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {
dst.apply(new_def, atomic_cell_or_collection(cell));
}
}
void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
if (!is_compatible(new_def, old_type, kind)) {
return;
}
@@ -94,8 +95,8 @@ public:
_p.apply_row_tombstone(_p_schema, rt);
}
virtual void accept_row(clustering_key_view key, tombstone deleted_at, const row_marker& rm) override {
deletable_row& r = _p.clustered_row(_p_schema, key);
virtual void accept_row(position_in_partition_view key, const row_tombstone& deleted_at, const row_marker& rm, is_dummy dummy, is_continuous continuous) override {
deletable_row& r = _p.clustered_row(_p_schema, key, dummy, continuous);
r.apply(rm);
r.apply(deleted_at);
_current_row = &r;
@@ -116,4 +117,14 @@ public:
accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), collection);
}
}
// Appends the cell to dst upgrading it to the new schema.
// Cells must have monotonic names.
static void append_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, const atomic_cell_or_collection& cell) {
if (new_def.is_atomic()) {
accept_cell(dst, kind, new_def, old_type, cell.as_atomic_cell());
} else {
accept_cell(dst, kind, new_def, old_type, cell.as_collection_mutation());
}
}
};


@@ -29,6 +29,15 @@ counter_id counter_id::local()
return counter_id(service::get_local_storage_service().get_local_id());
}
bool counter_id::less_compare_1_7_4::operator()(const counter_id& a, const counter_id& b) const
{
if (a._most_significant != b._most_significant) {
return a._most_significant < b._most_significant;
} else {
return a._least_significant < b._least_significant;
}
}
std::ostream& operator<<(std::ostream& os, const counter_id& id) {
return os << id.to_uuid();
}
@@ -42,10 +51,106 @@ std::ostream& operator<<(std::ostream& os, counter_cell_view ccv) {
return os << "{counter_cell timestamp: " << ccv.timestamp() << " shards: {" << ::join(", ", ccv.shards()) << "}}";
}
void counter_cell_builder::do_sort_and_remove_duplicates()
{
boost::range::sort(_shards, [] (auto& a, auto& b) { return a.id() < b.id(); });
std::vector<counter_shard> new_shards;
new_shards.reserve(_shards.size());
for (auto& cs : _shards) {
if (new_shards.empty() || new_shards.back().id() != cs.id()) {
new_shards.emplace_back(cs);
} else {
new_shards.back().apply(cs);
}
}
_shards = std::move(new_shards);
_sorted = true;
}
std::vector<counter_shard> counter_cell_view::shards_compatible_with_1_7_4() const
{
auto sorted_shards = boost::copy_range<std::vector<counter_shard>>(shards());
counter_id::less_compare_1_7_4 cmp;
boost::range::sort(sorted_shards, [&] (auto& a, auto& b) {
return cmp(a.id(), b.id());
});
return sorted_shards;
}
static bool apply_in_place(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());
auto src_ccmv = counter_cell_mutable_view(src.as_mutable_atomic_cell());
auto dst_shards = dst_ccmv.shards();
auto src_shards = src_ccmv.shards();
auto dst_it = dst_shards.begin();
auto src_it = src_shards.begin();
while (src_it != src_shards.end()) {
while (dst_it != dst_shards.end() && dst_it->id() < src_it->id()) {
++dst_it;
}
if (dst_it == dst_shards.end() || dst_it->id() != src_it->id()) {
// Fast-path failed. Revert and fall back to the slow path.
if (dst_it == dst_shards.end()) {
--dst_it;
}
while (src_it != src_shards.begin()) {
--src_it;
while (dst_it->id() != src_it->id()) {
--dst_it;
}
src_it->swap_value_and_clock(*dst_it);
}
return false;
}
if (dst_it->logical_clock() < src_it->logical_clock()) {
dst_it->swap_value_and_clock(*src_it);
} else {
src_it->set_value_and_clock(*dst_it);
}
++src_it;
}
auto dst_ts = dst_ccmv.timestamp();
auto src_ts = src_ccmv.timestamp();
dst_ccmv.set_timestamp(std::max(dst_ts, src_ts));
src_ccmv.set_timestamp(dst_ts);
src.as_mutable_atomic_cell().set_counter_in_place_revert(true);
return true;
}
static void revert_in_place_apply(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
assert(dst.can_use_mutable_view() && src.can_use_mutable_view());
auto dst_ccmv = counter_cell_mutable_view(dst.as_mutable_atomic_cell());
auto src_ccmv = counter_cell_mutable_view(src.as_mutable_atomic_cell());
auto dst_shards = dst_ccmv.shards();
auto src_shards = src_ccmv.shards();
auto dst_it = dst_shards.begin();
auto src_it = src_shards.begin();
while (src_it != src_shards.end()) {
while (dst_it != dst_shards.end() && dst_it->id() < src_it->id()) {
++dst_it;
}
assert(dst_it != dst_shards.end() && dst_it->id() == src_it->id());
dst_it->swap_value_and_clock(*src_it);
++src_it;
}
auto dst_ts = dst_ccmv.timestamp();
auto src_ts = src_ccmv.timestamp();
dst_ccmv.set_timestamp(src_ts);
src_ccmv.set_timestamp(dst_ts);
src.as_mutable_atomic_cell().set_counter_in_place_revert(false);
}
bool counter_cell_view::apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
// TODO: optimise for single shard existing in the other
// TODO: optimise for no new shards?
auto dst_ac = dst.as_atomic_cell();
auto src_ac = src.as_atomic_cell();
@@ -58,23 +163,29 @@ bool counter_cell_view::apply_reversibly(atomic_cell_or_collection& dst, atomic_
}
if (dst_ac.is_counter_update() && src_ac.is_counter_update()) {
// FIXME: store deltas just as a normal int64_t and get rid of these calls
// to long_type
auto src_v = value_cast<int64_t>(long_type->deserialize_value(src_ac.value()));
auto dst_v = value_cast<int64_t>(long_type->deserialize_value(dst_ac.value()));
auto src_v = src_ac.counter_update_value();
auto dst_v = dst_ac.counter_update_value();
dst = atomic_cell::make_live_counter_update(std::max(dst_ac.timestamp(), src_ac.timestamp()),
long_type->decompose(src_v + dst_v));
src_v + dst_v);
return true;
}
assert(!dst_ac.is_counter_update());
assert(!src_ac.is_counter_update());
auto a_shards = counter_cell_view(dst_ac).shards();
auto b_shards = counter_cell_view(src_ac).shards();
if (counter_cell_view(dst_ac).shard_count() >= counter_cell_view(src_ac).shard_count()
&& dst.can_use_mutable_view() && src.can_use_mutable_view()) {
if (apply_in_place(dst, src)) {
return true;
}
}
src.as_mutable_atomic_cell().set_counter_in_place_revert(false);
auto dst_shards = counter_cell_view(dst_ac).shards();
auto src_shards = counter_cell_view(src_ac).shards();
counter_cell_builder result;
combine(a_shards.begin(), a_shards.end(), b_shards.begin(), b_shards.end(),
combine(dst_shards.begin(), dst_shards.end(), src_shards.begin(), src_shards.end(),
result.inserter(), counter_shard_view::less_compare_by_id(), [] (auto& x, auto& y) {
return x.logical_clock() < y.logical_clock() ? y : x;
});
@@ -87,10 +198,12 @@ bool counter_cell_view::apply_reversibly(atomic_cell_or_collection& dst, atomic_
void counter_cell_view::revert_apply(atomic_cell_or_collection& dst, atomic_cell_or_collection& src)
{
if (dst.as_atomic_cell().is_counter_update()) {
auto src_v = value_cast<int64_t>(long_type->deserialize_value(src.as_atomic_cell().value()));
auto dst_v = value_cast<int64_t>(long_type->deserialize_value(dst.as_atomic_cell().value()));
auto src_v = src.as_atomic_cell().counter_update_value();
auto dst_v = dst.as_atomic_cell().counter_update_value();
dst = atomic_cell::make_live(dst.as_atomic_cell().timestamp(),
long_type->decompose(dst_v - src_v));
} else if (src.as_atomic_cell().is_counter_in_place_revert_set()) {
revert_in_place_apply(dst, src);
} else {
std::swap(dst, src);
}
@@ -101,10 +214,11 @@ stdx::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, at
assert(!a.is_counter_update());
assert(!b.is_counter_update());
if (!b.is_live()) {
if (!b.is_live() || !a.is_live()) {
if (b.is_live() || (!a.is_live() && compare_atomic_cell_for_merge(b, a) < 0)) {
return atomic_cell(a);
}
return { };
} else if (!a.is_live()) {
return atomic_cell(a);
}
auto a_shards = counter_cell_view(a).shards();
@@ -139,28 +253,68 @@ stdx::optional<atomic_cell> counter_cell_view::difference(atomic_cell_view a, at
void transform_counter_updates_to_shards(mutation& m, const mutation* current_state, uint64_t clock_offset) {
// FIXME: allow current_state to be frozen_mutation
auto transform_new_row_to_shards = [clock_offset] (auto& cr) {
cr.row().cells().for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto transform_new_row_to_shards = [clock_offset] (auto& cells) {
cells.for_each_cell([clock_offset] (auto, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
}
auto delta = value_cast<int64_t>(long_type->deserialize_value(acv.value()));
counter_cell_builder ccb;
ccb.add_shard(counter_shard(counter_id::local(), delta, clock_offset + 1));
ac_o_c = ccb.build(acv.timestamp());
auto delta = acv.counter_update_value();
auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
});
};
if (!current_state) {
transform_new_row_to_shards(m.partition().static_row());
for (auto& cr : m.partition().clustered_rows()) {
transform_new_row_to_shards(cr);
transform_new_row_to_shards(cr.row().cells());
}
return;
}
clustering_key::less_compare cmp(*m.schema());
auto transform_row_to_shards = [clock_offset] (auto& transformee, auto& state) {
std::deque<std::pair<column_id, counter_shard>> shards;
state.for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
}
counter_cell_view ccv(acv);
auto cs = ccv.local_shard();
if (!cs) {
return; // continue
}
shards.emplace_back(std::make_pair(id, counter_shard(*cs)));
});
transformee.for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
}
while (!shards.empty() && shards.front().first < id) {
shards.pop_front();
}
auto delta = acv.counter_update_value();
if (shards.empty() || shards.front().first > id) {
auto cs = counter_shard(counter_id::local(), delta, clock_offset + 1);
ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
} else {
auto& cs = shards.front().second;
cs.update(delta, clock_offset + 1);
ac_o_c = counter_cell_builder::from_single_shard(acv.timestamp(), cs);
shards.pop_front();
}
});
};
transform_row_to_shards(m.partition().static_row(), current_state->partition().static_row());
auto& cstate = current_state->partition();
auto it = cstate.clustered_rows().begin();
auto end = cstate.clustered_rows().end();
@@ -169,60 +323,10 @@ void transform_counter_updates_to_shards(mutation& m, const mutation* current_st
++it;
}
if (it == end || cmp(cr.key(), it->key())) {
transform_new_row_to_shards(cr);
transform_new_row_to_shards(cr.row().cells());
continue;
}
struct counter_shard_or_tombstone {
stdx::optional<counter_shard> shard;
tombstone tomb;
};
std::deque<std::pair<column_id, counter_shard_or_tombstone>> shards;
it->row().cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
counter_shard_or_tombstone cs_o_t { { },
tombstone(acv.timestamp(), acv.deletion_time()) };
shards.emplace_back(std::make_pair(id, cs_o_t));
return; // continue -- we are in lambda
}
counter_cell_view ccv(acv);
auto cs = ccv.local_shard();
if (!cs) {
return; // continue
}
shards.emplace_back(std::make_pair(id, counter_shard_or_tombstone { counter_shard(*cs), tombstone() }));
});
cr.row().cells().for_each_cell([&] (column_id id, atomic_cell_or_collection& ac_o_c) {
auto acv = ac_o_c.as_atomic_cell();
if (!acv.is_live()) {
return; // continue -- we are in lambda
}
while (!shards.empty() && shards.front().first < id) {
shards.pop_front();
}
auto delta = value_cast<int64_t>(long_type->deserialize_value(acv.value()));
counter_cell_builder ccb;
if (shards.empty() || shards.front().first > id) {
ccb.add_shard(counter_shard(counter_id::local(), delta, clock_offset + 1));
} else if (shards.front().second.tomb.timestamp == api::missing_timestamp) {
auto& cs = *shards.front().second.shard;
cs.update(delta, clock_offset + 1);
ccb.add_shard(cs);
shards.pop_front();
} else {
// We are applying the tombstone that's already there a second time.
// It is not necessary, but there is no easy way to remove a cell
// from a mutation.
tombstone t = shards.front().second.tomb;
ac_o_c = atomic_cell::make_dead(t.timestamp, t.deletion_time);
shards.pop_front();
return; // continue -- we are in lambda
}
ac_o_c = ccb.build(acv.timestamp());
});
transform_row_to_shards(cr.row().cells(), it->row().cells());
}
}
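The per-shard merge rule used throughout the counter diffs above (in `combine()` and in `counter_shard::do_apply`) is last-writer-wins by logical clock: when two shards share a counter id, the one with the higher logical clock supplies the value. A minimal standalone sketch, with `shard` as a simplified stand-in for the real `counter_shard` class:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for counter_shard: merging keeps the value
// carried by the shard with the higher logical clock.
struct shard {
    int64_t value;
    int64_t logical_clock;
    shard& apply(const shard& other) noexcept {
        if (logical_clock < other.logical_clock) {
            logical_clock = other.logical_clock;
            value = other.value;
        }
        return *this;
    }
};
```

This is why the merge is idempotent and commutative per id, which the in-place fast path in `apply_in_place` relies on when it swaps values and clocks for later revert.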


@@ -36,6 +36,10 @@ class counter_id {
int64_t _least_significant;
int64_t _most_significant;
public:
static_assert(std::is_same<decltype(std::declval<utils::UUID>().get_least_significant_bits()), int64_t>::value
&& std::is_same<decltype(std::declval<utils::UUID>().get_most_significant_bits()), int64_t>::value,
"utils::UUID is expected to work with two signed 64-bit integers");
counter_id() = default;
explicit counter_id(utils::UUID uuid) noexcept
: _least_significant(uuid.get_least_significant_bits())
@@ -49,12 +53,20 @@ public:
bool operator<(const counter_id& other) const {
return to_uuid() < other.to_uuid();
}
bool operator>(const counter_id& other) const {
return other.to_uuid() < to_uuid();
}
bool operator==(const counter_id& other) const {
return to_uuid() == other.to_uuid();
}
bool operator!=(const counter_id& other) const {
return !(*this == other);
}
public:
// (Wrong) Counter ID ordering used by Scylla 1.7.4 and earlier.
struct less_compare_1_7_4 {
bool operator()(const counter_id& a, const counter_id& b) const;
};
public:
static counter_id local();
@@ -67,7 +79,8 @@ static_assert(std::is_pod<counter_id>::value, "counter_id should be a POD type")
std::ostream& operator<<(std::ostream& os, const counter_id& id);
class counter_shard_view {
template<typename View>
class basic_counter_shard_view {
enum class offset : unsigned {
id = 0u,
value = unsigned(id) + sizeof(counter_id),
@@ -75,32 +88,58 @@ class counter_shard_view {
total_size = unsigned(logical_clock) + sizeof(int64_t),
};
private:
bytes_view::const_pointer _base;
typename View::pointer _base;
private:
template<typename T>
T read(offset off) const {
T value;
std::copy_n(_base + static_cast<unsigned>(off), sizeof(T), reinterpret_cast<char*>(&value));
std::copy_n(_base + static_cast<unsigned>(off), sizeof(T), reinterpret_cast<signed char*>(&value));
return value;
}
public:
static constexpr auto size = size_t(offset::total_size);
public:
counter_shard_view() = default;
explicit counter_shard_view(bytes_view::const_pointer ptr) noexcept
basic_counter_shard_view() = default;
explicit basic_counter_shard_view(typename View::pointer ptr) noexcept
: _base(ptr) { }
counter_id id() const { return read<counter_id>(offset::id); }
int64_t value() const { return read<int64_t>(offset::value); }
int64_t logical_clock() const { return read<int64_t>(offset::logical_clock); }
void swap_value_and_clock(basic_counter_shard_view& other) noexcept {
static constexpr size_t off = size_t(offset::value);
static constexpr size_t size = size_t(offset::total_size) - off;
typename View::value_type tmp[size];
std::copy_n(_base + off, size, tmp);
std::copy_n(other._base + off, size, _base + off);
std::copy_n(tmp, size, other._base + off);
}
void set_value_and_clock(const basic_counter_shard_view& other) noexcept {
static constexpr size_t off = size_t(offset::value);
static constexpr size_t size = size_t(offset::total_size) - off;
std::copy_n(other._base + off, size, _base + off);
}
bool operator==(const basic_counter_shard_view& other) const {
return id() == other.id() && value() == other.value()
&& logical_clock() == other.logical_clock();
}
bool operator!=(const basic_counter_shard_view& other) const {
return !(*this == other);
}
struct less_compare_by_id {
bool operator()(const counter_shard_view& x, const counter_shard_view& y) const {
bool operator()(const basic_counter_shard_view& x, const basic_counter_shard_view& y) const {
return x.id() < y.id();
}
};
};
using counter_shard_view = basic_counter_shard_view<bytes_view>;
std::ostream& operator<<(std::ostream& os, counter_shard_view csv);
class counter_shard {
@@ -110,7 +149,23 @@ class counter_shard {
private:
template<typename T>
static void write(const T& value, bytes::iterator& out) {
out = std::copy_n(reinterpret_cast<const char*>(&value), sizeof(T), out);
out = std::copy_n(reinterpret_cast<const signed char*>(&value), sizeof(T), out);
}
private:
// Shared logic for applying counter_shards and counter_shard_views.
// T is either counter_shard or basic_counter_shard_view<U>.
template<typename T>
GCC6_CONCEPT(requires requires(T shard) {
{ shard.value() } -> int64_t;
{ shard.logical_clock() } -> int64_t;
})
counter_shard& do_apply(T&& other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
}
public:
counter_shard(counter_id id, int64_t value, int64_t logical_clock) noexcept
@@ -136,12 +191,11 @@ public:
}
counter_shard& apply(counter_shard_view other) noexcept {
auto other_clock = other.logical_clock();
if (_logical_clock < other_clock) {
_logical_clock = other_clock;
_value = other.value();
}
return *this;
return do_apply(other);
}
counter_shard& apply(const counter_shard& other) noexcept {
return do_apply(other);
}
static size_t serialized_size() {
@@ -156,6 +210,9 @@ public:
class counter_cell_builder {
std::vector<counter_shard> _shards;
bool _sorted = true;
private:
void do_sort_and_remove_duplicates();
public:
counter_cell_builder() = default;
counter_cell_builder(size_t shard_count) {
@@ -166,6 +223,21 @@ public:
_shards.emplace_back(cs);
}
void add_maybe_unsorted_shard(const counter_shard& cs) {
add_shard(cs);
if (_sorted && _shards.size() > 1) {
auto current = _shards.rbegin();
auto previous = std::next(current);
_sorted = current->id() > previous->id();
}
}
void sort_and_remove_duplicates() {
if (!_sorted) {
do_sort_and_remove_duplicates();
}
}
size_t serialized_size() const {
return _shards.size() * counter_shard::serialized_size();
}
@@ -180,10 +252,15 @@ public:
}
atomic_cell build(api::timestamp_type timestamp) const {
bytes b(bytes::initialized_later(), serialized_size());
auto out = b.begin();
serialize(out);
return atomic_cell::make_live(timestamp, b);
return atomic_cell::make_live_from_serializer(timestamp, serialized_size(), [this] (bytes::iterator out) {
serialize(out);
});
}
static atomic_cell from_single_shard(api::timestamp_type timestamp, const counter_shard& cs) {
return atomic_cell::make_live_from_serializer(timestamp, counter_shard::serialized_size(), [&cs] (bytes::iterator out) {
cs.serialize(out);
});
}
class inserter_iterator : public std::iterator<std::output_iterator_tag, counter_shard> {
@@ -210,26 +287,28 @@ public:
// <counter_id> := <int64_t><int64_t>
// <shard> := <counter_id><int64_t:value><int64_t:logical_clock>
// <counter_cell> := <shard>*
class counter_cell_view {
atomic_cell_view _cell;
template<typename View>
class basic_counter_cell_view {
protected:
atomic_cell_base<View> _cell;
private:
class shard_iterator : public std::iterator<std::input_iterator_tag, const counter_shard_view> {
bytes_view::const_pointer _current;
counter_shard_view _current_view;
class shard_iterator : public std::iterator<std::input_iterator_tag, basic_counter_shard_view<View>> {
typename View::pointer _current;
basic_counter_shard_view<View> _current_view;
public:
shard_iterator() = default;
shard_iterator(bytes_view::const_pointer ptr) noexcept
shard_iterator(typename View::pointer ptr) noexcept
: _current(ptr), _current_view(ptr) { }
const counter_shard_view& operator*() const noexcept {
basic_counter_shard_view<View>& operator*() noexcept {
return _current_view;
}
const counter_shard_view* operator->() const noexcept {
basic_counter_shard_view<View>* operator->() noexcept {
return &_current_view;
}
shard_iterator& operator++() noexcept {
_current += counter_shard_view::size;
_current_view = counter_shard_view(_current);
_current_view = basic_counter_shard_view<View>(_current);
return *this;
}
shard_iterator operator++(int) noexcept {
@@ -237,6 +316,16 @@ private:
operator++();
return it;
}
shard_iterator& operator--() noexcept {
_current -= counter_shard_view::size;
_current_view = basic_counter_shard_view<View>(_current);
return *this;
}
shard_iterator operator--(int) noexcept {
auto it = *this;
operator--();
return it;
}
bool operator==(const shard_iterator& other) const noexcept {
return _current == other._current;
}
@@ -257,7 +346,7 @@ public:
}
public:
// ac must be a live counter cell
explicit counter_cell_view(atomic_cell_view ac) noexcept : _cell(ac) {
explicit basic_counter_cell_view(atomic_cell_base<View> ac) noexcept : _cell(ac) {
assert(_cell.is_live());
assert(!_cell.is_counter_update());
}
@@ -287,6 +376,17 @@ public:
return get_shard(counter_id::local());
}
bool operator==(const basic_counter_cell_view& other) const {
return timestamp() == other.timestamp() && boost::equal(shards(), other.shards());
}
};
struct counter_cell_view : basic_counter_cell_view<bytes_view> {
using basic_counter_cell_view::basic_counter_cell_view;
// Returns counter shards in an order that is compatible with Scylla 1.7.4.
std::vector<counter_shard> shards_compatible_with_1_7_4() const;
// Reversibly applies two counter cells, at least one of them must be live.
// Returns true iff dst was modified.
static bool apply_reversibly(atomic_cell_or_collection& dst, atomic_cell_or_collection& src);
@@ -301,6 +401,12 @@ public:
friend std::ostream& operator<<(std::ostream& os, counter_cell_view ccv);
};
struct counter_cell_mutable_view : basic_counter_cell_view<bytes_mutable_view> {
using basic_counter_cell_view::basic_counter_cell_view;
void set_timestamp(api::timestamp_type ts) { _cell.set_timestamp(ts); }
};
// Transforms mutation dst from counter updates to counter shards using state
// stored in current_state.
// If current_state is present it has to be in the same schema as dst.

cpu_controller.hh (new file, 89 lines)

@@ -0,0 +1,89 @@
/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <seastar/core/thread.hh>
#include <seastar/core/timer.hh>
#include <chrono>
// Simple proportional controller to adjust shares of memtable/streaming flushes.
//
// Goal is to flush as fast as we can, but not so fast that we steal all the CPU from incoming
// requests, and at the same time minimize user-visible fluctuations in the flush quota.
//
// What that translates to is we'll try to keep virtual dirty's first derivative at 0 (IOW, we keep
// virtual dirty constant), which means that the rate of incoming writes is equal to the rate of
// flushed bytes.
//
// The exact point at which the controller stops determines the desired flush CPU usage. As we
// approach the hard dirty limit, we need to be more aggressive. We will therefore define two
// thresholds, and increase the constant as we cross them.
//
// 1) the soft limit line
// 2) halfway between soft limit and dirty limit
//
// The constants q1 and q2 are used to determine the proportional factor at each stage.
//
// Below the soft limit, we are in no particular hurry to flush, since it means we're set to
complete flushing before a new memtable is ready. The quota is dirty * q1, and q1 is set to a
// low number.
//
// The first half of the virtual dirty region is where we usually expect to be, so we have a low
// slope corresponding to a sluggish response between q1 * soft_limit and q2.
//
// In the second half, we're getting close to the hard dirty limit so we increase the slope and
// become more responsive, up to a maximum quota of qmax.
//
// For now we'll just set them in the structure not to complicate the constructor. But q1, q2 and
// qmax can easily become parameters if we find another user.
class flush_cpu_controller {
static constexpr float hard_dirty_limit = 0.50;
static constexpr float q1 = 0.01;
static constexpr float q2 = 0.2;
static constexpr float qmax = 1;
float _current_quota = 0.0f;
float _goal;
std::function<float()> _current_dirty;
std::chrono::milliseconds _interval;
timer<> _update_timer;
seastar::thread_scheduling_group _scheduling_group;
seastar::thread_scheduling_group *_current_scheduling_group = nullptr;
void adjust();
public:
seastar::thread_scheduling_group* scheduling_group() {
return _current_scheduling_group;
}
float current_quota() const {
return _current_quota;
}
struct disabled {
seastar::thread_scheduling_group *backup;
};
flush_cpu_controller(disabled d) : _scheduling_group(std::chrono::nanoseconds(0), 0), _current_scheduling_group(d.backup) {}
flush_cpu_controller(std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty);
flush_cpu_controller(flush_cpu_controller&&) = default;
};
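The two-threshold proportional curve described in the comment block can be sketched as a standalone function. This is an illustrative interpolation under the stated constants (q1, q2, qmax, the soft limit, and the hard dirty limit); the exact curve inside `flush_cpu_controller::adjust()` is not shown in this diff, so the piecewise-linear details here are an assumption:

```cpp
#include <algorithm>

// Hypothetical sketch of the flush-quota curve described above:
// below the soft limit the quota stays near q1, rises gently to q2
// halfway between the soft and hard limits, then climbs steeply
// toward qmax as dirty memory approaches the hard limit.
float flush_quota(float dirty, float soft_limit, float hard_limit = 0.50f) {
    constexpr float q1 = 0.01f, q2 = 0.2f, qmax = 1.0f;
    if (dirty < soft_limit) {
        // No particular hurry: low proportional factor.
        return q1 * dirty / soft_limit;
    }
    float mid = (soft_limit + hard_limit) / 2;
    if (dirty < mid) {
        // First half: sluggish slope from q1 up to q2.
        return q1 + (q2 - q1) * (dirty - soft_limit) / (mid - soft_limit);
    }
    // Second half: steeper slope, capped at qmax near the hard limit.
    float q = q2 + (qmax - q2) * (dirty - mid) / (hard_limit - mid);
    return std::min(q, qmax);
}
```

The controller then applies this quota by adjusting the shares of the flush `thread_scheduling_group` on each timer tick.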


@@ -46,6 +46,7 @@ options {
#include "cql3/statements/drop_type_statement.hh"
#include "cql3/statements/alter_type_statement.hh"
#include "cql3/statements/property_definitions.hh"
#include "cql3/statements/drop_index_statement.hh"
#include "cql3/statements/drop_table_statement.hh"
#include "cql3/statements/drop_view_statement.hh"
#include "cql3/statements/truncate_statement.hh"
@@ -318,9 +319,7 @@ cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
| st10=createIndexStatement { $stmt = st10; }
| st11=dropKeyspaceStatement { $stmt = st11; }
| st12=dropTableStatement { $stmt = st12; }
#if 0
| st13=dropIndexStatement { $stmt = st13; }
#endif
| st14=alterTableStatement { $stmt = st14; }
| st15=alterKeyspaceStatement { $stmt = st15; }
| st16=grantStatement { $stmt = st16; }
@@ -778,12 +777,13 @@ createIndexStatement returns [::shared_ptr<create_index_statement> expr]
auto props = make_shared<index_prop_defs>();
bool if_not_exists = false;
auto name = ::make_shared<cql3::index_name>();
std::vector<::shared_ptr<index_target::raw>> targets;
}
: K_CREATE (K_CUSTOM { props->is_custom = true; })? K_INDEX (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
(idxName[name])? K_ON cf=columnFamilyName '(' id=indexIdent ')'
(idxName[name])? K_ON cf=columnFamilyName '(' (target1=indexIdent { targets.emplace_back(target1); } (',' target2=indexIdent { targets.emplace_back(target2); } )*)? ')'
(K_USING cls=STRING_LITERAL { props->custom_class = sstring{$cls.text}; })?
(K_WITH properties[props])?
{ $expr = ::make_shared<create_index_statement>(cf, name, id, props, if_not_exists); }
{ $expr = ::make_shared<create_index_statement>(cf, name, targets, props, if_not_exists); }
;
indexIdent returns [::shared_ptr<index_target::raw> id]
@@ -957,16 +957,14 @@ dropViewStatement returns [::shared_ptr<drop_view_statement> stmt]
{ $stmt = ::make_shared<drop_view_statement>(cf, if_exists); }
;
#if 0
/**
* DROP INDEX [IF EXISTS] <INDEX_NAME>
*/
dropIndexStatement returns [DropIndexStatement expr]
@init { boolean ifExists = false; }
: K_DROP K_INDEX (K_IF K_EXISTS { ifExists = true; } )? index=indexName
{ $expr = new DropIndexStatement(index, ifExists); }
dropIndexStatement returns [::shared_ptr<drop_index_statement> expr]
@init { bool if_exists = false; }
: K_DROP K_INDEX (K_IF K_EXISTS { if_exists = true; } )? index=indexName
{ $expr = ::make_shared<drop_index_statement>(index, if_exists); }
;
#endif
/**
* TRUNCATE <CF>;
@@ -1303,6 +1301,10 @@ normalColumnOperation[operations_type& operations, ::shared_ptr<cql3::column_ide
}
add_raw_update(operations, key, make_shared<cql3::operation::addition>(cql3::constants::literal::integer($i.text)));
}
| K_SCYLLA_COUNTER_SHARD_LIST '(' t=term ')'
{
add_raw_update(operations, key, ::make_shared<cql3::operation::set_counter_value_from_tuple_list>(t));
}
;
specializedColumnOperation[std::vector<std::pair<shared_ptr<cql3::column_identifier::raw>,
@@ -1548,6 +1550,8 @@ basic_unreserved_keyword returns [sstring str]
| K_DISTINCT
| K_CONTAINS
| K_STATIC
| K_FROZEN
| K_TUPLE
| K_FUNCTION
| K_AGGREGATE
| K_SFUNC
@@ -1688,6 +1692,7 @@ K_REPLACE: R E P L A C E;
K_DETERMINISTIC: D E T E R M I N I S T I C;
K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X;
K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T;
// Case-insensitive alpha characters
fragment A: ('a'|'A');


@@ -23,6 +23,8 @@
#include "exceptions/exceptions.hh"
#include "cql3/selection/simple_selector.hh"
#include <regex>
namespace cql3 {
column_identifier::column_identifier(sstring raw_text, bool keep_case) {
@@ -59,6 +61,17 @@ sstring column_identifier::to_string() const {
return _text;
}
sstring column_identifier::to_cql_string() const {
static const std::regex unquoted_identifier_re("[a-z][a-z0-9_]*");
if (std::regex_match(_text.begin(), _text.end(), unquoted_identifier_re)) {
return _text;
}
static const std::regex double_quote_re("\"");
std::string result = _text;
result = std::regex_replace(result, double_quote_re, "\"\"");
return '"' + result + '"';
}
column_identifier::raw::raw(sstring raw_text, bool keep_case)
: _raw_text{raw_text}
, _text{raw_text}
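The quoting rule that `to_cql_string()` implements — identifiers matching `[a-z][a-z0-9_]*` stay bare, everything else is double-quoted with embedded quotes doubled — can be sketched as a free function (a self-contained restatement, not the class member itself):

```cpp
#include <regex>
#include <string>

// CQL identifier quoting: lowercase identifiers may appear unquoted;
// anything else is wrapped in double quotes, and any embedded double
// quote is escaped by doubling it.
std::string quote_cql_identifier(const std::string& text) {
    static const std::regex unquoted_identifier_re("[a-z][a-z0-9_]*");
    if (std::regex_match(text, unquoted_identifier_re)) {
        return text;
    }
    static const std::regex double_quote_re("\"");
    return '"' + std::regex_replace(text, double_quote_re, "\"\"") + '"';
}
```

Note that `std::regex_replace` returns the transformed string rather than modifying its argument in place, which is why the result must be used directly or assigned.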


@@ -80,6 +80,8 @@ public:
sstring to_string() const;
sstring to_cql_string() const;
friend std::ostream& operator<<(std::ostream& out, const column_identifier& i) {
return out << i._text;
}


@@ -159,7 +159,7 @@ constants::literal::prepare(database& db, const sstring& keyspace, ::shared_ptr<
return ::make_shared<value>(cql3::raw_value::make_value(parsed_value(receiver->type)));
}
void constants::deleter::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
void constants::deleter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
if (column.type->is_multi_cell()) {
collection_type_impl::mutation coll_m;
coll_m.tomb = params.make_tombstone();


@@ -197,7 +197,7 @@ public:
public:
using operation::operation;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override {
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = _t->bind_and_get(params._options);
if (value.is_null()) {
m.set_cell(prefix, column, std::move(make_dead_cell(params)));
@@ -210,7 +210,7 @@ public:
struct adder final : operation {
using operation::operation;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override {
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = _t->bind_and_get(params._options);
if (value.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment");
@@ -225,7 +225,7 @@ public:
struct subtracter final : operation {
using operation::operation;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override {
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
auto value = _t->bind_and_get(params._options);
if (value.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment");
@@ -246,7 +246,7 @@ public:
: operation(column, {})
{ }
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
};


@@ -19,11 +19,39 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include <iterator>
#include <regex>
#include "cql3_type.hh"
#include "cql3/util.hh"
#include "ut_name.hh"
namespace cql3 {
sstring cql3_type::to_string() const {
if (_type->is_user_type()) {
return "frozen<" + util::maybe_quote(_name) + ">";
}
if (_type->is_tuple()) {
return "frozen<" + _name + ">";
}
return _name;
}
shared_ptr<cql3_type> cql3_type::raw::prepare(database& db, const sstring& keyspace) {
try {
auto&& ks = db.find_keyspace(keyspace);
return prepare_internal(keyspace, ks.metadata()->user_types());
} catch (no_such_keyspace& nsk) {
throw exceptions::invalid_request_exception("Unknown keyspace " + keyspace);
}
}
bool cql3_type::raw::references_user_type(const sstring& name) const {
return false;
}
class cql3_type::raw_type : public raw {
private:
shared_ptr<cql3_type> _type;
@@ -35,6 +63,9 @@ public:
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) {
return _type;
}
shared_ptr<cql3_type> prepare_internal(const sstring&, lw_shared_ptr<user_types_metadata>) override {
return _type;
}
virtual bool supports_freezing() const {
return false;
@@ -76,7 +107,7 @@ public:
return true;
}
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
assert(_values); // "Got null values type for a collection";
if (!_frozen && _values->supports_freezing() && !_values->_frozen) {
@@ -93,16 +124,20 @@ public:
}
if (_kind == &collection_type_impl::kind::list) {
return make_shared(cql3_type(to_string(), list_type_impl::get_instance(_values->prepare(db, keyspace)->get_type(), !_frozen), false));
return make_shared(cql3_type(to_string(), list_type_impl::get_instance(_values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
} else if (_kind == &collection_type_impl::kind::set) {
return make_shared(cql3_type(to_string(), set_type_impl::get_instance(_values->prepare(db, keyspace)->get_type(), !_frozen), false));
return make_shared(cql3_type(to_string(), set_type_impl::get_instance(_values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
} else if (_kind == &collection_type_impl::kind::map) {
assert(_keys); // "Got null keys type for a collection";
return make_shared(cql3_type(to_string(), map_type_impl::get_instance(_keys->prepare(db, keyspace)->get_type(), _values->prepare(db, keyspace)->get_type(), !_frozen), false));
return make_shared(cql3_type(to_string(), map_type_impl::get_instance(_keys->prepare_internal(keyspace, user_types)->get_type(), _values->prepare_internal(keyspace, user_types)->get_type(), !_frozen), false));
}
abort();
}
bool references_user_type(const sstring& name) const override {
return (_keys && _keys->references_user_type(name)) || _values->references_user_type(name);
}
virtual sstring to_string() const override {
sstring start = _frozen ? "frozen<" : "";
sstring end = _frozen ? ">" : "";
@@ -132,7 +167,7 @@ public:
_frozen = true;
}
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
if (_name.has_keyspace()) {
// The provided keyspace is the one of the current statement this is part of. If it's different from the keyspace of
// the UTName, we reject since we want to limit user types to their own keyspace (see #6643)
@@ -144,23 +179,23 @@ public:
} else {
_name.set_keyspace(keyspace);
}
if (!user_types) {
// bootstrap mode.
throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
}
try {
auto&& ks = db.find_keyspace(_name.get_keyspace());
try {
auto&& type = ks.metadata()->user_types()->get_type(_name.get_user_type_name());
if (!_frozen) {
throw exceptions::invalid_request_exception("Non-frozen User-Defined types are not supported, please use frozen<>");
}
return make_shared<cql3_type>(_name.to_string(), std::move(type));
} catch (std::out_of_range& e) {
throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
auto&& type = user_types->get_type(_name.get_user_type_name());
if (!_frozen) {
throw exceptions::invalid_request_exception("Non-frozen User-Defined types are not supported, please use frozen<>");
}
} catch (no_such_keyspace& nsk) {
throw exceptions::invalid_request_exception("Unknown keyspace " + _name.get_keyspace());
return make_shared<cql3_type>(_name.to_string(), std::move(type));
} catch (std::out_of_range& e) {
throw exceptions::invalid_request_exception(sprint("Unknown type %s", _name));
}
}
bool references_user_type(const sstring& name) const override {
return _name.get_string_type_name() == name;
}
virtual bool supports_freezing() const override {
return true;
}
@@ -191,7 +226,7 @@ public:
}
_frozen = true;
}
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) override {
virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata> user_types) override {
if (!_frozen) {
freeze();
}
@@ -200,10 +235,17 @@ public:
if (t->is_counter()) {
throw exceptions::invalid_request_exception("Counters are not allowed inside tuples");
}
ts.push_back(t->prepare(db, keyspace)->get_type());
ts.push_back(t->prepare_internal(keyspace, user_types)->get_type());
}
return make_cql3_tuple_type(tuple_type_impl::get_instance(std::move(ts)));
}
bool references_user_type(const sstring& name) const override {
return std::any_of(_types.begin(), _types.end(), [&name](auto t) {
return t->references_user_type(name);
});
}
virtual sstring to_string() const override {
return sprint("tuple<%s>", join(", ", _types));
}
@@ -271,6 +313,7 @@ thread_local shared_ptr<cql3_type> cql3_type::bigint = make("bigint", long_type,
thread_local shared_ptr<cql3_type> cql3_type::blob = make("blob", bytes_type, cql3_type::kind::BLOB);
thread_local shared_ptr<cql3_type> cql3_type::boolean = make("boolean", boolean_type, cql3_type::kind::BOOLEAN);
thread_local shared_ptr<cql3_type> cql3_type::double_ = make("double", double_type, cql3_type::kind::DOUBLE);
thread_local shared_ptr<cql3_type> cql3_type::empty = make("empty", empty_type, cql3_type::kind::EMPTY);
thread_local shared_ptr<cql3_type> cql3_type::float_ = make("float", float_type, cql3_type::kind::FLOAT);
thread_local shared_ptr<cql3_type> cql3_type::int_ = make("int", int32_type, cql3_type::kind::INT);
thread_local shared_ptr<cql3_type> cql3_type::smallint = make("smallint", short_type, cql3_type::kind::SMALLINT);
@@ -297,8 +340,9 @@ cql3_type::values() {
cql3_type::counter,
cql3_type::decimal,
cql3_type::double_,
cql3_type::empty,
cql3_type::float_,
cql3_type:inet,
cql3_type::inet,
cql3_type::int_,
cql3_type::smallint,
cql3_type::text,
@@ -329,5 +373,23 @@ operator<<(std::ostream& os, const cql3_type::raw& r) {
return os << r.to_string();
}
namespace util {
sstring maybe_quote(const sstring& s) {
static const std::regex unquoted("\\w*");
static const std::regex double_quote("\"");
if (std::regex_match(s.begin(), s.end(), unquoted)) {
return s;
}
std::ostringstream ss;
ss << "\"";
std::regex_replace(std::ostreambuf_iterator<char>(ss), s.begin(), s.end(), double_quote, "\"\"");
ss << "\"";
return ss.str();
}
}
}


@@ -47,6 +47,7 @@
#include "enum_set.hh"
class database;
class user_types_metadata;
namespace cql3 {
@@ -63,19 +64,22 @@ public:
bool is_counter() const { return _type->is_counter(); }
bool is_native() const { return _native; }
data_type get_type() const { return _type; }
sstring to_string() const { return _name; }
sstring to_string() const;
// For UserTypes, we need to know the current keyspace to resolve the
// actual type used, so Raw is a "not yet prepared" CQL3Type.
class raw {
public:
virtual ~raw() {}
bool _frozen = false;
virtual bool supports_freezing() const = 0;
virtual bool is_collection() const;
virtual bool is_counter() const;
virtual bool references_user_type(const sstring&) const;
virtual std::experimental::optional<sstring> keyspace() const;
virtual void freeze();
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace) = 0;
virtual shared_ptr<cql3_type> prepare_internal(const sstring& keyspace, lw_shared_ptr<user_types_metadata>) = 0;
virtual shared_ptr<cql3_type> prepare(database& db, const sstring& keyspace);
static shared_ptr<raw> from(shared_ptr<cql3_type> type);
static shared_ptr<raw> user_type(ut_name name);
static shared_ptr<raw> map(shared_ptr<raw> t1, shared_ptr<raw> t2);
@@ -98,7 +102,7 @@ private:
public:
enum class kind : int8_t {
ASCII, BIGINT, BLOB, BOOLEAN, COUNTER, DECIMAL, DOUBLE, FLOAT, INT, SMALLINT, TINYINT, INET, TEXT, TIMESTAMP, UUID, VARCHAR, VARINT, TIMEUUID, DATE, TIME
ASCII, BIGINT, BLOB, BOOLEAN, COUNTER, DECIMAL, DOUBLE, EMPTY, FLOAT, INT, SMALLINT, TINYINT, INET, TEXT, TIMESTAMP, UUID, VARCHAR, VARINT, TIMEUUID, DATE, TIME
};
using kind_enum = super_enum<kind,
kind::ASCII,
@@ -108,6 +112,7 @@ public:
kind::COUNTER,
kind::DECIMAL,
kind::DOUBLE,
kind::EMPTY,
kind::FLOAT,
kind::INET,
kind::INT,
@@ -133,6 +138,7 @@ public:
static thread_local shared_ptr<cql3_type> blob;
static thread_local shared_ptr<cql3_type> boolean;
static thread_local shared_ptr<cql3_type> double_;
static thread_local shared_ptr<cql3_type> empty;
static thread_local shared_ptr<cql3_type> float_;
static thread_local shared_ptr<cql3_type> int_;
static thread_local shared_ptr<cql3_type> smallint;


@@ -46,7 +46,7 @@
#include "service/storage_proxy.hh"
#include "cql3/query_options.hh"
namespace transport {
namespace cql_transport {
namespace messages {
@@ -89,7 +89,7 @@ public:
* @param state the current query state
* @param options options for this query (consistency, variables, pageSize, ...)
*/
virtual future<::shared_ptr<transport::messages::result_message>>
virtual future<::shared_ptr<cql_transport::messages::result_message>>
execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) = 0;
/**
@@ -97,7 +97,7 @@ public:
*
* @param state the current query state
*/
virtual future<::shared_ptr<transport::messages::result_message>>
virtual future<::shared_ptr<cql_transport::messages::result_message>>
execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) = 0;
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const = 0;


@@ -41,6 +41,7 @@
#pragma once
#include "seastarx.hh"
#include <seastar/core/sstring.hh>
#include <antlr3.hpp>


@@ -67,6 +67,18 @@ functions::init() {
declare(aggregate_fcts::make_max_function<int64_t>());
declare(aggregate_fcts::make_min_function<int64_t>());
declare(aggregate_fcts::make_count_function<float>());
declare(aggregate_fcts::make_max_function<float>());
declare(aggregate_fcts::make_min_function<float>());
declare(aggregate_fcts::make_count_function<double>());
declare(aggregate_fcts::make_max_function<double>());
declare(aggregate_fcts::make_min_function<double>());
declare(aggregate_fcts::make_count_function<sstring>());
declare(aggregate_fcts::make_max_function<sstring>());
declare(aggregate_fcts::make_min_function<sstring>());
//FIXME:
//declare(aggregate_fcts::make_count_function<bytes>());
//declare(aggregate_fcts::make_max_function<bytes>());
@@ -78,15 +90,17 @@ functions::init() {
declare(make_blob_as_varchar_fct());
declare(aggregate_fcts::make_sum_function<int32_t>());
declare(aggregate_fcts::make_sum_function<int64_t>());
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_sum_function<float>());
declare(aggregate_fcts::make_sum_function<double>());
#if 0
declare(AggregateFcts.sumFunctionForFloat);
declare(AggregateFcts.sumFunctionForDouble);
declare(AggregateFcts.sumFunctionForDecimal);
declare(AggregateFcts.sumFunctionForVarint);
declare(AggregateFcts.avgFunctionForFloat);
declare(AggregateFcts.avgFunctionForDouble);
#endif
declare(aggregate_fcts::make_avg_function<int32_t>());
declare(aggregate_fcts::make_avg_function<int64_t>());
declare(aggregate_fcts::make_avg_function<float>());
declare(aggregate_fcts::make_avg_function<double>());
#if 0
declare(AggregateFcts.avgFunctionForVarint);
declare(AggregateFcts.avgFunctionForDecimal);
#endif


@@ -42,6 +42,7 @@
#pragma once
#include "core/sstring.hh"
#include "seastarx.hh"
#include <experimental/optional>


@@ -111,7 +111,7 @@ lists::literal::test_assignment(database& db, const sstring& keyspace, shared_pt
sstring
lists::literal::to_string() const {
return ::to_string(_elements);
return std::to_string(_elements);
}
lists::value
@@ -242,7 +242,7 @@ lists::precision_time::get_next(db_clock::time_point millis) {
}
void
lists::setter::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::setter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
const auto& value = _t->bind(params._options);
if (value == constants::UNSET_VALUE) {
return;
@@ -270,15 +270,10 @@ lists::setter_by_index::collect_marker_specification(shared_ptr<variable_specifi
}
void
lists::setter_by_index::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
// we should not get here for frozen lists
assert(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto index = _idx->bind_and_get(params._options);
if (index.is_null()) {
throw exceptions::invalid_request_exception("Invalid null value for list index");
@@ -292,7 +287,7 @@ lists::setter_by_index::execute(mutation& m, const exploded_clustering_prefix& p
}
auto idx = net::ntoh(int32_t(*unaligned_cast<int32_t>(index->begin())));
auto&& existing_list_opt = params.get_prefetched_list(m.key(), std::move(row_key), column);
auto&& existing_list_opt = params.get_prefetched_list(m.key().view(), prefix.view(), column);
if (!existing_list_opt) {
throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
}
@@ -327,15 +322,10 @@ lists::setter_by_uuid::requires_read() {
}
void
lists::setter_by_uuid::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
// we should not get here for frozen lists
assert(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto index = _idx->bind_and_get(params._options);
auto value = _t->bind_and_get(params._options);
@@ -355,7 +345,7 @@ lists::setter_by_uuid::execute(mutation& m, const exploded_clustering_prefix& pr
}
void
lists::appender::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::appender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
const auto& value = _t->bind(params._options);
if (value == constants::UNSET_VALUE) {
return;
@@ -367,7 +357,7 @@ lists::appender::execute(mutation& m, const exploded_clustering_prefix& prefix,
void
lists::do_append(shared_ptr<term> value,
mutation& m,
const exploded_clustering_prefix& prefix,
const clustering_key_prefix& prefix,
const column_definition& column,
const update_parameters& params) {
auto&& list_value = dynamic_pointer_cast<lists::value>(value);
@@ -401,7 +391,7 @@ lists::do_append(shared_ptr<term> value,
}
void
lists::prepender::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::prepender::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to prepend to a frozen list";
auto&& value = _t->bind(params._options);
if (!value || value == constants::UNSET_VALUE) {
@@ -433,15 +423,10 @@ lists::discarder::requires_read() {
}
void
lists::discarder::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::discarder::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to delete from a frozen list";
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto&& existing_list = params.get_prefetched_list(m.key(), std::move(row_key), column);
auto&& existing_list = params.get_prefetched_list(m.key().view(), prefix.view(), column);
// We want to call bind before possibly returning to reject queries where the value provided is not a list.
auto&& value = _t->bind(params._options);
@@ -490,7 +475,7 @@ lists::discarder_by_index::requires_read() {
}
void
lists::discarder_by_index::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
lists::discarder_by_index::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to delete an item by index from a frozen list";
auto&& index = _t->bind(params._options);
if (!index) {
@@ -504,11 +489,7 @@ lists::discarder_by_index::execute(mutation& m, const exploded_clustering_prefix
auto cvalue = dynamic_pointer_cast<constants::value>(index);
assert(cvalue);
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto&& existing_list_opt = params.get_prefetched_list(m.key(), std::move(row_key), column);
auto&& existing_list_opt = params.get_prefetched_list(m.key().view(), prefix.view(), column);
int32_t idx = read_simple_exactly<int32_t>(*cvalue->_bytes);
if (!existing_list_opt) {
throw exceptions::invalid_request_exception("Attempted to delete an element from a list which is null");


@@ -146,7 +146,7 @@ public:
setter(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class setter_by_index : public operation {
@@ -158,7 +158,7 @@ public:
}
virtual bool requires_read() override;
virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names);
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class setter_by_uuid : public setter_by_index {
@@ -167,25 +167,25 @@ public:
: setter_by_index(column, std::move(idx), std::move(t)) {
}
virtual bool requires_read() override;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class appender : public operation {
public:
using operation::operation;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
static void do_append(shared_ptr<term> value,
mutation& m,
const exploded_clustering_prefix& prefix,
const clustering_key_prefix& prefix,
const column_definition& column,
const update_parameters& params);
class prepender : public operation {
public:
using operation::operation;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class discarder : public operation {
@@ -194,7 +194,7 @@ public:
: operation(column, std::move(t)) {
}
virtual bool requires_read() override;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class discarder_by_index : public operation {
@@ -203,7 +203,7 @@ public:
: operation(column, std::move(idx)) {
}
virtual bool requires_read() override;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params);
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params);
};
};


@@ -269,7 +269,7 @@ maps::marker::bind(const query_options& options) {
}
void
maps::setter::execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) {
maps::setter::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
auto value = _t->bind(params._options);
if (value == constants::UNSET_VALUE) {
return;
@@ -292,7 +292,7 @@ maps::setter_by_key::collect_marker_specification(shared_ptr<variable_specificat
}
void
maps::setter_by_key::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
using exceptions::invalid_request_exception;
assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map";
auto key = _k->bind_and_get(params._options);
@@ -315,7 +315,7 @@ maps::setter_by_key::execute(mutation& m, const exploded_clustering_prefix& pref
}
void
maps::putter::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
maps::putter::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to add items to a frozen map";
auto value = _t->bind(params._options);
if (value != constants::UNSET_VALUE) {
@@ -324,7 +324,7 @@ maps::putter::execute(mutation& m, const exploded_clustering_prefix& prefix, con
}
void
maps::do_put(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params,
maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params,
shared_ptr<term> value, const column_definition& column) {
auto map_value = dynamic_pointer_cast<maps::value>(value);
if (column.type->is_multi_cell()) {
@@ -353,7 +353,7 @@ maps::do_put(mutation& m, const exploded_clustering_prefix& prefix, const update
}
void
maps::discarder_by_key::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
maps::discarder_by_key::execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to delete a single key in a frozen map";
auto&& key = _t->bind(params._options);
if (!key) {


@@ -116,7 +116,7 @@ public:
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
};
class setter_by_key : public operation {
@@ -126,7 +126,7 @@ public:
: operation(column, std::move(t)), _k(std::move(k)) {
}
virtual void collect_marker_specification(shared_ptr<variable_specifications> bound_names) override;
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
class putter : public operation {
@@ -134,10 +134,10 @@ public:
putter(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
static void do_put(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params,
static void do_put(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params,
shared_ptr<term> value, const column_definition& column);
class discarder_by_key : public operation {
@@ -145,7 +145,7 @@ public:
discarder_by_key(const column_definition& column, shared_ptr<term> k)
: operation(column, std::move(k)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override;
};
};


@@ -36,6 +36,7 @@
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <utility>
#include "operation.hh"
#include "operation_impl.hh"
@@ -192,6 +193,78 @@ operation::set_value::prepare(database& db, const sstring& keyspace, const colum
}
}
::shared_ptr <operation>
operation::set_counter_value_from_tuple_list::prepare(database& db, const sstring& keyspace, const column_definition& receiver) {
static thread_local const data_type counter_tuple_type = tuple_type_impl::get_instance({int32_type, uuid_type, long_type, long_type});
static thread_local const data_type counter_tuple_list_type = list_type_impl::get_instance(counter_tuple_type, true);
if (!receiver.type->is_counter()) {
throw exceptions::invalid_request_exception(sprint("Column %s is not a counter", receiver.name_as_text()));
}
// We need to fake a column of list<tuple<...>> to prepare the value term
auto & os = receiver.column_specification;
auto spec = make_shared<cql3::column_specification>(os->ks_name, os->cf_name, os->name, counter_tuple_list_type);
auto v = _value->prepare(db, keyspace, spec);
// Will not be used elsewhere, so make it local.
class counter_setter : public operation {
public:
using operation::operation;
bool is_raw_counter_shard_write() const override {
return true;
}
void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) override {
const auto& value = _t->bind(params._options);
auto&& list_value = dynamic_pointer_cast<lists::value>(value);
if (!list_value) {
throw std::invalid_argument("Invalid input data to counter set");
}
counter_id last(utils::UUID(0, 0));
counter_cell_builder ccb(list_value->_elements.size());
for (auto& bo : list_value->_elements) {
// Failures from the value casts below should be enough type checking here.
auto tuple = value_cast<tuple_type_impl::native_type>(counter_tuple_type->deserialize(*bo));
auto shard = value_cast<int>(tuple[0]);
auto id = counter_id(value_cast<utils::UUID>(tuple[1]));
auto clock = value_cast<int64_t>(tuple[2]);
auto value = value_cast<int64_t>(tuple[3]);
using namespace std::rel_ops;
if (id <= last) {
throw marshal_exception(
sprint("invalid counter id order, %s <= %s",
id.to_uuid().to_sstring(),
last.to_uuid().to_sstring()));
}
last = id;
// TODO: maybe allow more than global values to propagate,
// though we don't (yet at least) in sstable::partition so...
switch (shard) {
case 'g':
ccb.add_shard(counter_shard(id, value, clock));
break;
case 'l':
throw marshal_exception("encountered a local shard in a counter cell");
case 'r':
throw marshal_exception("encountered remote shards in a counter cell");
default:
throw marshal_exception(sprint("encountered unknown shard %d in a counter cell", shard));
}
}
// Note. this is a counter value cell, not an update.
// see counters.cc, we need to detect this.
m.set_cell(prefix, column, ccb.build(params.timestamp()));
}
};
return make_shared<counter_setter>(receiver, v);
};
bool
operation::set_value::is_compatible_with(::shared_ptr <raw_update> other) {
// We don't allow setting multiple time the same column, because 1)


@@ -103,6 +103,10 @@ public:
return _t && _t->uses_function(ks_name, function_name);
}
virtual bool is_raw_counter_shard_write() const {
return false;
}
/**
* @return whether the operation requires a read of the previous value to be executed
* (only lists setterByIdx, discard and discardByIdx requires that).
@@ -126,7 +130,7 @@ public:
/**
* Execute the operation.
*/
virtual void execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) = 0;
virtual void execute(mutation& m, const clustering_key_prefix& prefix, const update_parameters& params) = 0;
/**
* A parsed raw UPDATE operation.
@@ -193,6 +197,7 @@ public:
};
class set_value;
class set_counter_value_from_tuple_list;
class set_element : public raw_update {
const shared_ptr<term::raw> _selector;


@@ -50,7 +50,7 @@
namespace cql3 {
class operation::set_value : public raw_update {
private:
protected:
::shared_ptr<term::raw> _value;
public:
set_value(::shared_ptr<term::raw> value) : _value(std::move(value)) {}
@@ -67,6 +67,12 @@ public:
virtual bool is_compatible_with(::shared_ptr <raw_update> other) override;
};
class operation::set_counter_value_from_tuple_list : public set_value {
public:
using set_value::set_value;
::shared_ptr <operation> prepare(database& db, const sstring& keyspace, const column_definition& receiver) override;
};
class operation::column_deletion : public raw_deletion {
private:
::shared_ptr<column_identifier::raw> _id;


@@ -44,6 +44,7 @@
#include <cstddef>
#include <iosfwd>
#include "core/sstring.hh"
#include "seastarx.hh"
namespace cql3 {


@@ -0,0 +1,171 @@
/*
* Copyright (C) 2017 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "utils/loading_cache.hh"
#include "cql3/statements/prepared_statement.hh"
namespace cql3 {
using prepared_cache_entry = std::unique_ptr<statements::prepared_statement>;
struct prepared_cache_entry_size {
size_t operator()(const prepared_cache_entry& val) {
// TODO: improve the size approximation
return 10000;
}
};
typedef bytes cql_prepared_id_type;
typedef int32_t thrift_prepared_id_type;
/// \brief The key of the prepared statements cache
///
/// We are going to store CQL and Thrift prepared statements in the same cache, therefore we need to generate a key
/// that is unique in both cases. Thrift uses an int32_t as a prepared statement ID, while CQL uses an MD5 digest.
///
/// We are going to use an std::pair<CQL_PREP_ID_TYPE, int64_t> as a key. For CQL statements we will use {CQL_PREP_ID, std::numeric_limits<int64_t>::max()} as a key
/// and for Thrift - {CQL_PREP_ID_TYPE(0), THRIFT_PREP_ID}. This way CQL and Thrift key values will never collide.
class prepared_cache_key_type {
public:
using cache_key_type = std::pair<cql_prepared_id_type, int64_t>;
private:
cache_key_type _key;
public:
prepared_cache_key_type() = default;
explicit prepared_cache_key_type(cql_prepared_id_type cql_id) : _key(std::move(cql_id), std::numeric_limits<int64_t>::max()) {}
explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id) : _key(cql_prepared_id_type(), thrift_id) {}
cache_key_type& key() { return _key; }
const cache_key_type& key() const { return _key; }
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
return key.key().first;
}
static thrift_prepared_id_type thrift_id(const prepared_cache_key_type& key) {
return key.key().second;
}
};
class prepared_statements_cache {
public:
struct stats {
uint64_t prepared_cache_evictions = 0;
};
static stats& shard_stats() {
static thread_local stats _stats;
return _stats;
}
struct prepared_cache_stats_updater {
static void inc_hits() noexcept {}
static void inc_misses() noexcept {}
static void inc_blocks() noexcept {}
static void inc_evictions() noexcept {
++shard_stats().prepared_cache_evictions;
}
};
private:
using cache_key_type = typename prepared_cache_key_type::cache_key_type;
using cache_type = utils::loading_cache<cache_key_type, prepared_cache_entry, utils::loading_cache_reload_enabled::no, prepared_cache_entry_size, utils::tuple_hash, std::equal_to<cache_key_type>, prepared_cache_stats_updater>;
using cache_value_ptr = typename cache_type::value_ptr;
using cache_iterator = typename cache_type::iterator;
using checked_weak_ptr = typename statements::prepared_statement::checked_weak_ptr;
struct value_extractor_fn {
checked_weak_ptr operator()(prepared_cache_entry& e) const {
return e->checked_weak_from_this();
}
};
static const std::chrono::minutes entry_expiry;
public:
using key_type = prepared_cache_key_type;
using value_type = checked_weak_ptr;
using statement_is_too_big = typename cache_type::entry_is_too_big;
/// \note both iterator::reference and iterator::value_type are checked_weak_ptr
using iterator = boost::transform_iterator<value_extractor_fn, cache_iterator>;
private:
cache_type _cache;
value_extractor_fn _value_extractor_fn;
public:
prepared_statements_cache(logging::logger& logger)
: _cache(memory::stats().total_memory() / 256, entry_expiry, logger)
{}
template <typename LoadFunc>
future<value_type> get(const key_type& key, LoadFunc&& load) {
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
});
}
iterator find(const key_type& key) {
return boost::make_transform_iterator(_cache.find(key.key()), _value_extractor_fn);
}
iterator end() {
return boost::make_transform_iterator(_cache.end(), _value_extractor_fn);
}
iterator begin() {
return boost::make_transform_iterator(_cache.begin(), _value_extractor_fn);
}
template <typename Pred>
void remove_if(Pred&& pred) {
static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value, "Bad Pred signature");
_cache.remove_if([&pred] (const prepared_cache_entry& e) {
return pred(e->statement);
});
}
size_t size() const {
return _cache.size();
}
size_t memory_footprint() const {
return _cache.memory_footprint();
}
};
}
namespace std { // for prepared_statements_cache log printouts
inline std::ostream& operator<<(std::ostream& os, const typename cql3::prepared_cache_key_type::cache_key_type& p) {
os << "{cql_id: " << p.first << ", thrift_id: " << p.second << "}";
return os;
}
inline std::ostream& operator<<(std::ostream& os, const cql3::prepared_cache_key_type& p) {
os << p.key();
return os;
}
}


@@ -82,17 +82,6 @@ query_options::query_options(db::consistency_level consistency,
{
}
query_options::query_options(query_options&& o, std::vector<std::vector<cql3::raw_value_view>> value_views)
: query_options(std::move(o))
{
std::vector<query_options> tmp;
tmp.reserve(value_views.size());
std::transform(value_views.begin(), value_views.end(), std::back_inserter(tmp), [this](auto& vals) {
return query_options(_consistency, {}, vals, _skip_metadata, _options, _cql_serialization_format);
});
_batch_options = std::move(tmp);
}
query_options::query_options(db::consistency_level cl, std::vector<cql3::raw_value> values)
: query_options(
cl,


@@ -41,6 +41,7 @@
#pragma once
#include <seastar/util/gcc6-concepts.hh>
#include "timestamp.hh"
#include "bytes.hh"
#include "db/consistency_level.hh"
@@ -77,6 +78,26 @@ private:
const specific_options _options;
cql_serialization_format _cql_serialization_format;
std::experimental::optional<std::vector<query_options>> _batch_options;
private:
/**
* @brief Batch query_options constructor.
*
* Requirements:
* - @tparam OneMutationDataRange has a begin() and end() iterators.
* - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
*
* @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
* @param values_ranges a vector of values ranges for each statement in the batch.
*/
template<typename OneMutationDataRange>
GCC6_CONCEPT( requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> raw_value_view; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> raw_value; } ) )
explicit query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges);
public:
query_options(query_options&&) = default;
query_options(const query_options&) = delete;
@@ -94,8 +115,25 @@ public:
specific_options options,
cql_serialization_format sf);
// Batch query_options constructor
explicit query_options(query_options&&, std::vector<std::vector<cql3::raw_value_view>> value_views);
/**
* @brief Batch query_options factory.
*
* Requirements:
* - @tparam OneMutationDataRange has a begin() and end() iterators.
* - The values of @tparam OneMutationDataRange are of either raw_value_view or raw_value types.
*
* @param o Base query_options object. query_options objects for each statement in the batch will derive the values from it.
* @param values_ranges a vector of values ranges for each statement in the batch.
*/
template<typename OneMutationDataRange>
GCC6_CONCEPT( requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> raw_value_view; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> raw_value; } ) )
static query_options make_batch_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges) {
return query_options(std::move(o), std::move(values_ranges));
}
// It can't be const because of prepare()
static thread_local query_options DEFAULT;
@@ -130,4 +168,21 @@ private:
void fill_value_views();
};
template<typename OneMutationDataRange>
GCC6_CONCEPT( requires requires (OneMutationDataRange range) {
std::begin(range);
std::end(range);
} && ( requires (OneMutationDataRange range) { { *range.begin() } -> raw_value_view; } ||
requires (OneMutationDataRange range) { { *range.begin() } -> raw_value; } ) )
query_options::query_options(query_options&& o, std::vector<OneMutationDataRange> values_ranges)
: query_options(std::move(o))
{
std::vector<query_options> tmp;
tmp.reserve(values_ranges.size());
std::transform(values_ranges.begin(), values_ranges.end(), std::back_inserter(tmp), [this](auto& values_range) {
return query_options(_consistency, {}, std::move(values_range), _skip_metadata, _options, _cql_serialization_format);
});
_batch_options = std::move(tmp);
}
}


@@ -54,14 +54,17 @@
namespace cql3 {
using namespace statements;
using namespace transport::messages;
using namespace cql_transport::messages;
logging::logger log("query_processor");
logging::logger prep_cache_log("prepared_statements_cache");
distributed<query_processor> _the_query_processor;
const sstring query_processor::CQL_VERSION = "3.3.1";
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
class query_processor::internal_state {
service::query_state _qs;
public:
@@ -95,6 +98,7 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
, _proxy(proxy)
, _db(db)
, _internal_state(new internal_state())
, _prepared_cache(prep_cache_log)
{
namespace sm = seastar::metrics;
@@ -130,6 +134,15 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
sm::make_derive("batches_unlogged_from_logged", _cql_stats.batches_unlogged_from_logged,
sm::description("Counts a total number of LOGGED batches that were executed as UNLOGGED batches.")),
sm::make_derive("prepared_cache_evictions", [] { return prepared_statements_cache::shard_stats().prepared_cache_evictions; },
sm::description("Counts a number of prepared statements cache entries evictions.")),
sm::make_gauge("prepared_cache_size", [this] { return _prepared_cache.size(); },
sm::description("A number of entries in the prepared statements cache.")),
sm::make_gauge("prepared_cache_memory_footprint", [this] { return _prepared_cache.memory_footprint(); },
sm::description("Size (in bytes) of the prepared statements cache.")),
});
service::get_local_migration_manager().register_listener(_migration_subscriber.get());
@@ -179,7 +192,7 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement,
statement->validate(_proxy, client_state);
auto fut = make_ready_future<::shared_ptr<transport::messages::result_message>>();
auto fut = make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
if (client_state.is_internal()) {
fut = statement->execute_internal(_proxy, query_state, options);
} else {
@@ -196,80 +209,34 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement,
});
}
future<::shared_ptr<transport::messages::result_message::prepared>>
query_processor::prepare(const std::experimental::string_view& query_string, service::query_state& query_state)
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, service::query_state& query_state)
{
auto& client_state = query_state.get_client_state();
return prepare(query_string, client_state, client_state.is_thrift());
return prepare(std::move(query_string), client_state, client_state.is_thrift());
}
future<::shared_ptr<transport::messages::result_message::prepared>>
query_processor::prepare(const std::experimental::string_view& query_string,
const service::client_state& client_state,
bool for_thrift)
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift)
{
auto existing = get_stored_prepared_statement(query_string, client_state.get_raw_keyspace(), for_thrift);
if (existing) {
return make_ready_future<::shared_ptr<transport::messages::result_message::prepared>>(existing);
using namespace cql_transport::messages;
if (for_thrift) {
return prepare_one<result_message::prepared::thrift>(std::move(query_string), client_state, compute_thrift_id, prepared_cache_key_type::thrift_id);
} else {
return prepare_one<result_message::prepared::cql>(std::move(query_string), client_state, compute_id, prepared_cache_key_type::cql_id);
}
auto prepared = get_statement(query_string, client_state);
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
}
assert(bound_terms == prepared->bound_names.size());
return store_prepared_statement(query_string, client_state.get_raw_keyspace(), std::move(prepared), for_thrift);
}
::shared_ptr<transport::messages::result_message::prepared>
::shared_ptr<cql_transport::messages::result_message::prepared>
query_processor::get_stored_prepared_statement(const std::experimental::string_view& query_string,
const sstring& keyspace,
bool for_thrift)
{
using namespace cql_transport::messages;
if (for_thrift) {
auto statement_id = compute_thrift_id(query_string, keyspace);
auto it = _thrift_prepared_statements.find(statement_id);
if (it == _thrift_prepared_statements.end()) {
return ::shared_ptr<result_message::prepared>();
}
return ::make_shared<result_message::prepared::thrift>(statement_id, it->second);
return get_stored_prepared_statement_one<result_message::prepared::thrift>(query_string, keyspace, compute_thrift_id, prepared_cache_key_type::thrift_id);
} else {
auto statement_id = compute_id(query_string, keyspace);
auto it = _prepared_statements.find(statement_id);
if (it == _prepared_statements.end()) {
return ::shared_ptr<result_message::prepared>();
}
return ::make_shared<result_message::prepared::cql>(statement_id, it->second);
}
}
future<::shared_ptr<transport::messages::result_message::prepared>>
query_processor::store_prepared_statement(const std::experimental::string_view& query_string,
const sstring& keyspace,
::shared_ptr<statements::prepared_statement> prepared,
bool for_thrift)
{
#if 0
// Concatenate the current keyspace so we don't mix prepared statements between keyspace (#5352).
// (if the keyspace is null, queryString has to have a fully-qualified keyspace so it's fine.
long statementSize = measure(prepared.statement);
// don't execute the statement if it's bigger than the allowed threshold
if (statementSize > MAX_CACHE_PREPARED_MEMORY)
throw new InvalidRequestException(String.format("Prepared statement of size %d bytes is larger than allowed maximum of %d bytes.",
statementSize,
MAX_CACHE_PREPARED_MEMORY));
#endif
prepared->raw_cql_statement = query_string.data();
if (for_thrift) {
auto statement_id = compute_thrift_id(query_string, keyspace);
_thrift_prepared_statements.emplace(statement_id, prepared);
auto msg = ::make_shared<result_message::prepared::thrift>(statement_id, prepared);
return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
} else {
auto statement_id = compute_id(query_string, keyspace);
_prepared_statements.emplace(statement_id, prepared);
auto msg = ::make_shared<result_message::prepared::cql>(statement_id, prepared);
return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
return get_stored_prepared_statement_one<result_message::prepared::cql>(query_string, keyspace, compute_id, prepared_cache_key_type::cql_id);
}
}
@@ -286,22 +253,22 @@ static sstring hash_target(const std::experimental::string_view& query_string, c
return keyspace + query_string.to_string();
}
bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
prepared_cache_key_type query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
{
return md5_calculate(hash_target(query_string, keyspace));
return prepared_cache_key_type(md5_calculate(hash_target(query_string, keyspace)));
}
int32_t query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
prepared_cache_key_type query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
{
auto target = hash_target(query_string, keyspace);
uint32_t h = 0;
for (auto&& c : hash_target(query_string, keyspace)) {
h = 31*h + c;
}
return static_cast<int32_t>(h);
return prepared_cache_key_type(static_cast<int32_t>(h));
}
::shared_ptr<prepared_statement>
std::unique_ptr<prepared_statement>
query_processor::get_statement(const sstring_view& query, const service::client_state& client_state)
{
#if 0
@@ -340,7 +307,7 @@ query_processor::parse_statement(const sstring_view& query)
}
}
query_options query_processor::make_internal_options(::shared_ptr<statements::prepared_statement> p,
query_options query_processor::make_internal_options(const statements::prepared_statement::checked_weak_ptr& p,
const std::initializer_list<data_value>& values,
db::consistency_level cl)
{
@@ -362,7 +329,7 @@ query_options query_processor::make_internal_options(::shared_ptr<statements::pr
return query_options(cl, bound_values);
}
::shared_ptr<statements::prepared_statement> query_processor::prepare_internal(const sstring& query_string)
statements::prepared_statement::checked_weak_ptr query_processor::prepare_internal(const sstring& query_string)
{
auto& p = _internal_statements[query_string];
if (p == nullptr) {
@@ -370,7 +337,7 @@ query_options query_processor::make_internal_options(::shared_ptr<statements::pr
np->statement->validate(_proxy, *_internal_state);
p = std::move(np); // inserts it into map
}
return p;
return p->checked_weak_from_this();
}
future<::shared_ptr<untyped_result_set>>
@@ -380,17 +347,16 @@ query_processor::execute_internal(const sstring& query_string,
if (log.is_enabled(logging::log_level::trace)) {
log.trace("execute_internal: \"{}\" ({})", query_string, ::join(", ", values));
}
auto p = prepare_internal(query_string);
return execute_internal(p, values);
return execute_internal(prepare_internal(query_string), values);
}
future<::shared_ptr<untyped_result_set>>
query_processor::execute_internal(::shared_ptr<statements::prepared_statement> p,
query_processor::execute_internal(statements::prepared_statement::checked_weak_ptr p,
const std::initializer_list<data_value>& values)
{
auto opts = make_internal_options(p, values);
return do_with(std::move(opts), [this, p = std::move(p)](auto& opts) {
return p->statement->execute_internal(_proxy, *_internal_state, opts).then([p](auto msg) {
return p->statement->execute_internal(_proxy, *_internal_state, opts).then([stmt = p->statement](auto msg) {
return make_ready_future<::shared_ptr<untyped_result_set>>(::make_shared<untyped_result_set>(msg));
});
});
@@ -402,27 +368,30 @@ query_processor::process(const sstring& query_string,
const std::initializer_list<data_value>& values,
bool cache)
{
auto p = cache ? prepare_internal(query_string) : parse_statement(query_string)->prepare(_db.local(), _cql_stats);
if (!cache) {
if (cache) {
return process(prepare_internal(query_string), cl, values);
} else {
auto p = parse_statement(query_string)->prepare(_db.local(), _cql_stats);
p->statement->validate(_proxy, *_internal_state);
auto checked_weak_ptr = p->checked_weak_from_this();
return process(std::move(checked_weak_ptr), cl, values).finally([p = std::move(p)] {});
}
return process(p, cl, values);
}
future<::shared_ptr<untyped_result_set>>
query_processor::process(::shared_ptr<statements::prepared_statement> p,
query_processor::process(statements::prepared_statement::checked_weak_ptr p,
db::consistency_level cl,
const std::initializer_list<data_value>& values)
{
auto opts = make_internal_options(p, values, cl);
return do_with(std::move(opts), [this, p = std::move(p)](auto & opts) {
return p->statement->execute(_proxy, *_internal_state, opts).then([p](auto msg) {
return p->statement->execute(_proxy, *_internal_state, opts).then([](auto msg) {
return make_ready_future<::shared_ptr<untyped_result_set>>(::make_shared<untyped_result_set>(msg));
});
});
}
future<::shared_ptr<transport::messages::result_message>>
future<::shared_ptr<cql_transport::messages::result_message>>
query_processor::process_batch(::shared_ptr<statements::batch_statement> batch,
service::query_state& query_state,
query_options& options)
@@ -522,7 +491,7 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::experimental::optional<sstring> cf_name)
{
_qp->invalidate_prepared_statements([&] (::shared_ptr<cql_statement> stmt) {
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
return this->should_invalidate(ks_name, cf_name, stmt);
});
}


@@ -57,6 +57,7 @@
#include "statements/prepared_statement.hh"
#include "transport/messages/result_message.hh"
#include "untyped_result_set.hh"
#include "prepared_statements_cache.hh"
namespace cql3 {
@@ -64,9 +65,32 @@ namespace statements {
class batch_statement;
}
class prepared_statement_is_too_big : public std::exception {
public:
static constexpr int max_query_prefix = 100;
private:
sstring _msg;
public:
prepared_statement_is_too_big(const sstring& query_string)
: _msg(seastar::format("Prepared statement is too big: {}", query_string.substr(0, max_query_prefix)))
{
// mark that we clipped the query string
if (query_string.size() > max_query_prefix) {
_msg += "...";
}
}
virtual const char* what() const noexcept override {
return _msg.c_str();
}
};
class query_processor {
public:
class migration_subscriber;
private:
std::unique_ptr<migration_subscriber> _migration_subscriber;
distributed<service::storage_proxy>& _proxy;
@@ -127,10 +151,8 @@ private:
}
};
#endif
std::unordered_map<bytes, ::shared_ptr<statements::prepared_statement>> _prepared_statements;
std::unordered_map<int32_t, ::shared_ptr<statements::prepared_statement>> _thrift_prepared_statements;
std::unordered_map<sstring, ::shared_ptr<statements::prepared_statement>> _internal_statements;
prepared_statements_cache _prepared_cache;
std::unordered_map<sstring, std::unique_ptr<statements::prepared_statement>> _internal_statements;
#if 0
// A map for prepared statements used internally (which we don't want to mix with user statement, in particular we don't
@@ -221,21 +243,14 @@ private:
}
#endif
public:
::shared_ptr<statements::prepared_statement> get_prepared(const bytes& id) {
auto it = _prepared_statements.find(id);
if (it == _prepared_statements.end()) {
return ::shared_ptr<statements::prepared_statement>{};
statements::prepared_statement::checked_weak_ptr get_prepared(const prepared_cache_key_type& key) {
auto it = _prepared_cache.find(key);
if (it == _prepared_cache.end()) {
return statements::prepared_statement::checked_weak_ptr();
}
return it->second;
return *it;
}
::shared_ptr<statements::prepared_statement> get_prepared_for_thrift(int32_t id) {
auto it = _thrift_prepared_statements.find(id);
if (it == _thrift_prepared_statements.end()) {
return ::shared_ptr<statements::prepared_statement>{};
}
return it->second;
}
#if 0
public static void validateKey(ByteBuffer key) throws InvalidRequestException
{
@@ -275,7 +290,7 @@ public:
}
#endif
public:
future<::shared_ptr<transport::messages::result_message>> process_statement(::shared_ptr<cql_statement> statement,
future<::shared_ptr<cql_transport::messages::result_message>> process_statement(::shared_ptr<cql_statement> statement,
service::query_state& query_state, const query_options& options);
#if 0
@@ -286,7 +301,7 @@ public:
}
#endif
future<::shared_ptr<transport::messages::result_message>> process(const std::experimental::string_view& query_string,
future<::shared_ptr<cql_transport::messages::result_message>> process(const std::experimental::string_view& query_string,
service::query_state& query_state, query_options& options);
#if 0
@@ -340,23 +355,23 @@ public:
}
#endif
private:
query_options make_internal_options(::shared_ptr<statements::prepared_statement>, const std::initializer_list<data_value>&, db::consistency_level = db::consistency_level::ONE);
query_options make_internal_options(const statements::prepared_statement::checked_weak_ptr& p, const std::initializer_list<data_value>&, db::consistency_level = db::consistency_level::ONE);
public:
future<::shared_ptr<untyped_result_set>> execute_internal(
const sstring& query_string,
const std::initializer_list<data_value>& = { });
::shared_ptr<statements::prepared_statement> prepare_internal(const sstring& query);
statements::prepared_statement::checked_weak_ptr prepare_internal(const sstring& query);
future<::shared_ptr<untyped_result_set>> execute_internal(
::shared_ptr<statements::prepared_statement>,
statements::prepared_statement::checked_weak_ptr p,
const std::initializer_list<data_value>& = { });
future<::shared_ptr<untyped_result_set>> process(
const sstring& query_string,
db::consistency_level, const std::initializer_list<data_value>& = { }, bool cache = false);
future<::shared_ptr<untyped_result_set>> process(
::shared_ptr<statements::prepared_statement>,
statements::prepared_statement::checked_weak_ptr p,
db::consistency_level, const std::initializer_list<data_value>& = { });
/*
@@ -434,43 +449,62 @@ public:
}
#endif
future<::shared_ptr<transport::messages::result_message::prepared>>
prepare(const std::experimental::string_view& query_string, service::query_state& query_state);
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
prepare(sstring query_string, service::query_state& query_state);
future<::shared_ptr<transport::messages::result_message::prepared>>
prepare(const std::experimental::string_view& query_string, const service::client_state& client_state, bool for_thrift);
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
prepare(sstring query_string, const service::client_state& client_state, bool for_thrift);
static bytes compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
static int32_t compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
static prepared_cache_key_type compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
static prepared_cache_key_type compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
private:
::shared_ptr<transport::messages::result_message::prepared>
get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);
///
/// \tparam ResultMsgType type of the returned result message (CQL or Thrift)
/// \tparam PreparedKeyGenerator a function that generates the prepared statement cache key for given query and keyspace
/// \tparam IdGetter a function that returns the corresponding prepared statement ID (CQL or Thrift) for a given prepared statement cache key
/// \param query_string
/// \param client_state
/// \param id_gen prepared ID generator, called before the first deferring
/// \param id_getter prepared ID getter, passed to deferred context by reference. The caller must ensure its liveness.
/// \return a future that resolves to the prepared result message
template <typename ResultMsgType, typename PreparedKeyGenerator, typename IdGetter>
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
prepare_one(sstring query_string, const service::client_state& client_state, PreparedKeyGenerator&& id_gen, IdGetter&& id_getter) {
return do_with(id_gen(query_string, client_state.get_raw_keyspace()), std::move(query_string), [this, &client_state, &id_getter] (const prepared_cache_key_type& key, const sstring& query_string) {
return _prepared_cache.get(key, [this, &query_string, &client_state] {
auto prepared = get_statement(query_string, client_state);
auto bound_terms = prepared->statement->get_bound_terms();
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
}
assert(bound_terms == prepared->bound_names.size());
prepared->raw_cql_statement = query_string;
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
}).then([&key, &id_getter] (auto prep_ptr) {
return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(::make_shared<ResultMsgType>(id_getter(key), std::move(prep_ptr)));
}).handle_exception_type([&query_string] (typename prepared_statements_cache::statement_is_too_big&) {
return make_exception_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(prepared_statement_is_too_big(query_string));
});
});
};
future<::shared_ptr<transport::messages::result_message::prepared>>
store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, ::shared_ptr<statements::prepared_statement> prepared, bool for_thrift);
template <typename ResultMsgType, typename KeyGenerator, typename IdGetter>
::shared_ptr<cql_transport::messages::result_message::prepared>
get_stored_prepared_statement_one(const std::experimental::string_view& query_string, const sstring& keyspace, KeyGenerator&& key_gen, IdGetter&& id_getter)
{
auto cache_key = key_gen(query_string, keyspace);
auto it = _prepared_cache.find(cache_key);
if (it == _prepared_cache.end()) {
return ::shared_ptr<cql_transport::messages::result_message::prepared>();
}
// Erases the statements for which filter returns true.
template <typename Pred>
void invalidate_prepared_statements(Pred filter) {
static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value,
"bad Pred signature");
for (auto it = _prepared_statements.begin(); it != _prepared_statements.end(); ) {
if (filter(it->second->statement)) {
it = _prepared_statements.erase(it);
} else {
++it;
}
}
for (auto it = _thrift_prepared_statements.begin(); it != _thrift_prepared_statements.end(); ) {
if (filter(it->second->statement)) {
it = _thrift_prepared_statements.erase(it);
} else {
++it;
}
}
return ::make_shared<ResultMsgType>(id_getter(cache_key), *it);
}
::shared_ptr<cql_transport::messages::result_message::prepared>
get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);
#if 0
public ResultMessage processPrepared(CQLStatement statement, QueryState queryState, QueryOptions options)
throws RequestExecutionException, RequestValidationException
@@ -497,10 +531,10 @@ private:
#endif
public:
future<::shared_ptr<transport::messages::result_message>> process_batch(::shared_ptr<statements::batch_statement>,
future<::shared_ptr<cql_transport::messages::result_message>> process_batch(::shared_ptr<statements::batch_statement>,
service::query_state& query_state, query_options& options);
::shared_ptr<statements::prepared_statement> get_statement(const std::experimental::string_view& query,
std::unique_ptr<statements::prepared_statement> get_statement(const std::experimental::string_view& query,
const service::client_state& client_state);
static ::shared_ptr<statements::raw::parsed_statement> parse_statement(const std::experimental::string_view& query);


@@ -94,6 +94,26 @@ public:
return true;
}
/**
* Checks whether the specified row satisfies this restriction.
* Assumes the row itself is live, but not necessarily all of its cells:
* if a cell isn't live and there's a restriction on its column,
* the function returns false.
*
* @param schema the schema the row belongs to
* @param key the partition key
* @param ckey the clustering key
* @param cells the remaining row columns
* @return true if the row satisfies this restriction, false otherwise
*/
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const = 0;
protected:
#if 0
protected static ByteBuffer validateIndexedValue(ColumnSpecification columnSpec,
@@ -113,7 +133,7 @@ protected:
* @param function_name the function name
* @return <code>true</code> if the specified term is using the specified function, <code>false</code> otherwise.
*/
static bool uses_function(::shared_ptr<term> term, const sstring& ks_name, const sstring& function_name) {
static bool term_uses_function(::shared_ptr<term> term, const sstring& ks_name, const sstring& function_name) {
return bool(term) && term->uses_function(ks_name, function_name);
}
@@ -125,9 +145,9 @@ protected:
* @param function_name the function name
* @return <code>true</code> if one of the specified terms uses the specified function, <code>false</code> otherwise.
*/
static bool uses_function(const std::vector<::shared_ptr<term>>& terms, const sstring& ks_name, const sstring& function_name) {
static bool term_uses_function(const std::vector<::shared_ptr<term>>& terms, const sstring& ks_name, const sstring& function_name) {
for (auto&& value : terms) {
if (uses_function(value, ks_name, function_name)) {
if (term_uses_function(value, ks_name, function_name)) {
return true;
}
}


@@ -85,6 +85,20 @@ public:
do_merge_with(as_pkr);
}
bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override {
for (auto&& range : bounds_ranges(options)) {
if (!range.contains(ckey, clustering_key_prefix::prefix_equal_tri_compare(schema))) {
return false;
}
}
return true;
}
protected:
virtual void do_merge_with(::shared_ptr<primary_key_restrictions<clustering_key_prefix>> other) = 0;
@@ -155,7 +169,7 @@ public:
{ }
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_value, ks_name, function_name);
return abstract_restriction::term_uses_function(_value, ks_name, function_name);
}
virtual sstring to_string() const override {
@@ -304,11 +318,11 @@ public:
{ }
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_values, ks_name, function_name);
return abstract_restriction::term_uses_function(_values, ks_name, function_name);
}
virtual sstring to_string() const override {
return sprint("IN(%s)", ::to_string(_values));
return sprint("IN(%s)", std::to_string(_values));
}
protected:
@@ -428,8 +442,8 @@ public:
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return (_slice.has_bound(statements::bound::START) && abstract_restriction::uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
|| (_slice.has_bound(statements::bound::END) && abstract_restriction::uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
return (_slice.has_bound(statements::bound::START) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
|| (_slice.has_bound(statements::bound::END) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
}
virtual bool is_inclusive(statements::bound b) const override {


@@ -46,6 +46,7 @@
#include "cartesian_product.hh"
#include "cql3/restrictions/primary_key_restrictions.hh"
#include "cql3/restrictions/single_column_restrictions.hh"
#include <boost/algorithm/cxx11/all_of.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/adaptor/filtered.hpp>
@@ -96,6 +97,14 @@ public:
return _in;
}
virtual bool has_bound(statements::bound b) const override {
return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
}
virtual bool is_inclusive(statements::bound b) const override {
return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->is_inclusive(b); });
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return _restrictions->uses_function(ks_name, function_name);
}
@@ -115,7 +124,7 @@ public:
if (restriction->is_slice()) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
_restrictions->next_column(new_column)->name_as_text(), new_column.name_as_text()));
last_column.name_as_text(), new_column.name_as_text()));
}
}
@@ -331,6 +340,17 @@ public:
sstring to_string() const override {
return sprint("Restrictions(%s)", join(", ", get_column_defs()));
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override {
return boost::algorithm::all_of(
_restrictions->restrictions() | boost::adaptors::map_values,
[&] (auto&& r) { return r->is_satisfied_by(schema, key, ckey, cells, options, now); });
}
};
template<>


@@ -49,6 +49,8 @@
#include "schema.hh"
#include "to_string.hh"
#include "exceptions/exceptions.hh"
#include "keys.hh"
#include "mutation_partition.hh"
namespace cql3 {
@@ -105,6 +107,13 @@ public:
class slice;
class contains;
protected:
bytes_view_opt get_value(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
gc_clock::time_point now) const;
};
class single_column_restriction::EQ final : public single_column_restriction {
@@ -117,7 +126,7 @@ public:
{ }
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_value, ks_name, function_name);
return abstract_restriction::term_uses_function(_value, ks_name, function_name);
}
virtual bool is_EQ() const override {
@@ -143,6 +152,13 @@ public:
"%s cannot be restricted by more than one relation if it includes an Equal", _column_def.name_as_text()));
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
#if 0
@Override
protected boolean isSupportedBy(SecondaryIndex index)
@@ -167,6 +183,13 @@ public:
"%s cannot be restricted by more than one relation if it includes a IN", _column_def.name_as_text()));
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
#if 0
@Override
protected final boolean isSupportedBy(SecondaryIndex index)
@@ -186,7 +209,7 @@ public:
{ }
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_values, ks_name, function_name);
return abstract_restriction::term_uses_function(_values, ks_name, function_name);
}
virtual std::vector<bytes_opt> values(const query_options& options) const override {
@@ -198,7 +221,7 @@ public:
}
virtual sstring to_string() const override {
return sprint("IN(%s)", ::to_string(_values));
return sprint("IN(%s)", std::to_string(_values));
}
};
@@ -237,8 +260,8 @@ public:
{ }
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return (_slice.has_bound(statements::bound::START) && abstract_restriction::uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
|| (_slice.has_bound(statements::bound::END) && abstract_restriction::uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
return (_slice.has_bound(statements::bound::START) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
|| (_slice.has_bound(statements::bound::END) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
}
virtual bool is_slice() const override {
@@ -310,6 +333,13 @@ public:
virtual sstring to_string() const override {
return sprint("SLICE%s", _slice);
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
};
// This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
@@ -403,15 +433,15 @@ public:
}
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_values, ks_name, function_name)
|| abstract_restriction::uses_function(_keys, ks_name, function_name)
|| abstract_restriction::uses_function(_entry_keys, ks_name, function_name)
|| abstract_restriction::uses_function(_entry_values, ks_name, function_name);
return abstract_restriction::term_uses_function(_values, ks_name, function_name)
|| abstract_restriction::term_uses_function(_keys, ks_name, function_name)
|| abstract_restriction::term_uses_function(_entry_keys, ks_name, function_name)
|| abstract_restriction::term_uses_function(_entry_values, ks_name, function_name);
}
virtual sstring to_string() const override {
return sprint("CONTAINS(values=%s, keys=%s, entryKeys=%s, entryValues=%s)",
::to_string(_values), ::to_string(_keys), ::to_string(_entry_keys), ::to_string(_entry_values));
std::to_string(_values), std::to_string(_keys), std::to_string(_entry_keys), std::to_string(_entry_values));
}
virtual bool has_bound(statements::bound b) const override {
@@ -426,6 +456,13 @@ public:
throw exceptions::unsupported_operation_exception();
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
#if 0
private List<ByteBuffer> keys(const query_options& options) {
return bindAndGet(keys, options);


@@ -75,7 +75,7 @@ private:
* The _restrictions per column.
*/
public:
using restrictions_map = std::map<const column_definition*, ::shared_ptr<restriction>, column_definition_comparator>;
using restrictions_map = std::map<const column_definition*, ::shared_ptr<single_column_restriction>, column_definition_comparator>;
private:
restrictions_map _restrictions;
bool _is_all_eq = true;


@@ -31,6 +31,8 @@
#include "cql3/single_column_relation.hh"
#include "cql3/constants.hh"
#include "stdx.hh"
namespace cql3 {
namespace restrictions {
@@ -88,6 +90,14 @@ public:
sstring to_string() const override {
return "Initial restrictions";
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override {
return true;
}
};
template<>
@@ -144,6 +154,7 @@ to_column_definition(const schema_ptr& schema, const ::shared_ptr<column_identif
statement_restrictions::statement_restrictions(database& db,
schema_ptr schema,
statements::statement_type type,
const std::vector<::shared_ptr<relation>>& where_clause,
::shared_ptr<variable_specifications> bound_names,
bool selects_only_static_columns,
@@ -199,7 +210,7 @@ statement_restrictions::statement_restrictions(database& db,
|| nonprimary_key_restrictions->has_supporting_index(secondaryIndexManager);*/
// At this point, the select statement is fully constructed, but we still have a few things to validate
process_partition_key_restrictions(has_queriable_index);
process_partition_key_restrictions(has_queriable_index, for_view);
// Some but not all of the partition key columns have been specified;
// hence we need turn these restrictions into index expressions.
@@ -208,11 +219,18 @@ statement_restrictions::statement_restrictions(database& db,
}
if (selects_only_static_columns && has_clustering_columns_restriction()) {
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns when selecting only static columns");
if (type.is_update() || type.is_delete()) {
throw exceptions::invalid_request_exception(sprint(
"Invalid restrictions on clustering columns since the %s statement modifies only static columns", type));
}
if (type.is_select()) {
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns when selecting only static columns");
}
}
process_clustering_columns_restrictions(has_queriable_index, select_a_collection);
process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view);
// Covers indexes on the first clustering column (among others).
if (_is_key_range && has_queriable_clustering_column_index)
@@ -254,7 +272,7 @@ statement_restrictions::statement_restrictions(database& db,
_index_restrictions.push_back(_nonprimary_key_restrictions);
}
if (_uses_secondary_indexing) {
if (_uses_secondary_indexing && !for_view) {
fail(unimplemented::cause::INDEXES);
#if 0
validate_secondary_index_selections(selects_only_static_columns);
@@ -289,7 +307,7 @@ bool statement_restrictions::uses_function(const sstring& ks_name, const sstring
|| _nonprimary_key_restrictions->uses_function(ks_name, function_name);
}
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index) {
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view) {
// If there is a queriable index, no special conditions are required on the other restrictions.
// But we still need to know 2 things:
// - If we don't have a queriable index, is the query ok
@@ -299,7 +317,7 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
if (_partition_key_restrictions->is_on_token()) {
_is_key_range = true;
} else if (has_partition_key_unrestricted_components()) {
if (!_partition_key_restrictions->empty()) {
if (!_partition_key_restrictions->empty() && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint("Partition key parts: %s must be restricted as other parts are",
join(", ", get_partition_key_unrestricted_components())));
@@ -315,7 +333,11 @@ bool statement_restrictions::has_partition_key_unrestricted_components() const {
return _partition_key_restrictions->size() < _schema->partition_key_size();
}
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection) {
bool statement_restrictions::has_unrestricted_clustering_columns() const {
return _clustering_columns_restrictions->size() < _schema->clustering_key_size();
}
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view) {
if (!has_clustering_columns_restriction()) {
return;
}
@@ -335,7 +357,7 @@ void statement_restrictions::process_clustering_columns_restrictions(bool has_qu
const column_definition* clustering_column = &(*clustering_columns_iter);
++clustering_columns_iter;
if (clustering_column != restricted_column) {
if (clustering_column != restricted_column && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
@@ -392,5 +414,274 @@ void statement_restrictions::validate_secondary_index_selections(bool selects_on
}
}
static bytes_view_opt do_get_value(const schema& schema,
const column_definition& cdef,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
gc_clock::time_point now) {
switch(cdef.kind) {
case column_kind::partition_key:
return key.get_component(schema, cdef.component_index());
case column_kind::clustering_key:
return ckey.get_component(schema, cdef.component_index());
default:
auto cell = cells.find_cell(cdef.id);
if (!cell) {
return stdx::nullopt;
}
assert(cdef.is_atomic());
auto c = cell->as_atomic_cell();
return c.is_dead(now) ? stdx::nullopt : bytes_view_opt(c.value());
}
}
bytes_view_opt single_column_restriction::get_value(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
gc_clock::time_point now) const {
return do_get_value(schema, _column_def, key, ckey, cells, std::move(now));
}
bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operand = value(options);
if (operand) {
auto cell_value = get_value(schema, key, ckey, cells, now);
return cell_value && _column_def.type->compare(*operand, *cell_value) == 0;
}
return false;
}
bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto cell_value = get_value(schema, key, ckey, cells, now);
if (!cell_value) {
return false;
}
auto operands = values(options);
return std::any_of(operands.begin(), operands.end(), [&] (auto&& operand) {
return operand && _column_def.type->compare(*operand, *cell_value) == 0;
});
}
static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
using range_type = query::range<bytes_view>;
auto extract_bound = [&] (statements::bound bound) -> stdx::optional<range_type::bound> {
if (!slice.has_bound(bound)) {
return { };
}
auto value = slice.bound(bound)->bind_and_get(options);
if (!value) {
return { };
}
return { range_type::bound(*value, slice.is_inclusive(bound)) };
};
return range_type(
extract_bound(statements::bound::START),
extract_bound(statements::bound::END));
}
bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto cell_value = get_value(schema, key, ckey, cells, now);
if (!cell_value) {
return false;
}
return to_range(_slice, options).contains(*cell_value, _column_def.type->as_tri_comparator());
}
bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
if (!_column_def.type->is_collection()) {
return false;
}
auto col_type = static_pointer_cast<const collection_type_impl>(_column_def.type);
if ((!_keys.empty() || !_entry_keys.empty()) && !col_type->is_map()) {
return false;
}
assert(_entry_keys.size() == _entry_values.size());
auto&& map_key_type = col_type->name_comparator();
auto&& element_type = col_type->is_set() ? col_type->name_comparator() : col_type->value_comparator();
if (_column_def.type->is_multi_cell()) {
auto cell = cells.find_cell(_column_def.id);
auto&& elements = col_type->deserialize_mutation_form(cell->as_collection_mutation()).cells;
auto end = std::remove_if(elements.begin(), elements.end(), [now] (auto&& element) {
return element.second.is_dead(now);
});
for (auto&& value : _values) {
auto val = value->bind_and_get(options);
if (!val) {
continue;
}
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
return element_type->compare(element.second.value(), *val) == 0;
});
if (found == end) {
return false;
}
}
for (auto&& key : _keys) {
auto k = key->bind_and_get(options);
if (!k) {
continue;
}
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
return map_key_type->compare(element.first, *k) == 0;
});
if (found == end) {
return false;
}
}
for (uint32_t i = 0; i < _entry_keys.size(); ++i) {
auto map_key = _entry_keys[i]->bind_and_get(options);
auto map_value = _entry_values[i]->bind_and_get(options);
if (!map_key || !map_value) {
continue;
}
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
return map_key_type->compare(element.first, *map_key) == 0;
});
if (found == end || element_type->compare(found->second.value(), *map_value) != 0) {
return false;
}
}
} else {
auto cell_value = get_value(schema, key, ckey, cells, now);
if (!cell_value) {
return false;
}
auto deserialized = _column_def.type->deserialize(*cell_value);
for (auto&& value : _values) {
auto val = value->bind_and_get(options);
if (!val) {
continue;
}
auto exists_in = [&](auto&& range) {
auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
return element_type->compare(element.serialize(), *val) == 0;
});
return found != range.end();
};
if (col_type->is_list()) {
if (!exists_in(value_cast<list_type_impl::native_type>(deserialized))) {
return false;
}
} else if (col_type->is_set()) {
if (!exists_in(value_cast<set_type_impl::native_type>(deserialized))) {
return false;
}
} else {
auto data_map = value_cast<map_type_impl::native_type>(deserialized);
if (!exists_in(data_map | boost::adaptors::transformed([] (auto&& p) { return p.second; }))) {
return false;
}
}
}
if (col_type->is_map()) {
auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
for (auto&& key : _keys) {
auto k = key->bind_and_get(options);
if (!k) {
continue;
}
auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
return map_key_type->compare(element.first.serialize(), *k) == 0;
});
if (found == data_map.end()) {
return false;
}
}
for (uint32_t i = 0; i < _entry_keys.size(); ++i) {
auto map_key = _entry_keys[i]->bind_and_get(options);
auto map_value = _entry_values[i]->bind_and_get(options);
if (!map_key || !map_value) {
continue;
}
auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
return map_key_type->compare(element.first.serialize(), *map_key) == 0;
});
if (found == data_map.end() || element_type->compare(found->second.serialize(), *map_value) != 0) {
return false;
}
}
}
}
return true;
}
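The frozen-map branch above checks three kinds of CONTAINS restrictions against a deserialized native map. A minimal sketch of those three checks, using a plain `std::map` in place of the deserialized collection (all names here are hypothetical):

```cpp
#include <algorithm>
#include <map>
#include <string>

// Stand-in for the deserialized native map form of a frozen map column.
using native_map = std::map<std::string, std::string>;

// CONTAINS v: the operand must appear among the map's values.
bool contains_value(const native_map& m, const std::string& v) {
    return std::any_of(m.begin(), m.end(),
                       [&](const auto& e) { return e.second == v; });
}

// CONTAINS KEY k: the operand must appear among the map's keys.
bool contains_key(const native_map& m, const std::string& k) {
    return m.count(k) != 0;
}

// m[k] = v: the key must be present and its value must compare equal.
bool contains_entry(const native_map& m, const std::string& k,
                    const std::string& v) {
    auto it = m.find(k);
    return it != m.end() && it->second == v;
}
```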
bool token_restriction::EQ::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
bool satisfied = false;
auto cdef = _column_definitions.begin();
for (auto&& operand : values(options)) {
if (operand) {
auto cell_value = do_get_value(schema, **cdef, key, ckey, cells, now);
satisfied = cell_value && (*cdef)->type->compare(*operand, *cell_value) == 0;
}
if (!satisfied) {
break;
}
}
return satisfied;
}
bool token_restriction::slice::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
bool satisfied = false;
auto range = to_range(_slice, options);
for (auto* cdef : _column_definitions) {
auto cell_value = do_get_value(schema, *cdef, key, ckey, cells, now);
if (!cell_value) {
return false;
}
satisfied = range.contains(*cell_value, cdef->type->as_tri_comparator());
if (!satisfied) {
break;
}
}
return satisfied;
}
}
}


@@ -49,6 +49,7 @@
#include "cql3/restrictions/single_column_restrictions.hh"
#include "cql3/relation.hh"
#include "cql3/variable_specifications.hh"
#include "cql3/statements/statement_type.hh"
namespace cql3 {
@@ -111,6 +112,7 @@ public:
statement_restrictions(database& db,
schema_ptr schema,
statements::statement_type type,
const std::vector<::shared_ptr<relation>>& where_clause,
::shared_ptr<variable_specifications> bound_names,
bool selects_only_static_columns,
@@ -150,8 +152,13 @@ public:
return _uses_secondary_indexing;
}
private:
void process_partition_key_restrictions(bool has_queriable_index);
::shared_ptr<primary_key_restrictions<partition_key>> get_partition_key_restrictions() const {
return _partition_key_restrictions;
}
::shared_ptr<primary_key_restrictions<clustering_key_prefix>> get_clustering_columns_restrictions() const {
return _clustering_columns_restrictions;
}
/**
* Checks if the partition key has some unrestricted components.
@@ -159,6 +166,14 @@ private:
*/
bool has_partition_key_unrestricted_components() const;
/**
* Checks if the clustering key has some unrestricted components.
* @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
*/
bool has_unrestricted_clustering_columns() const;
private:
void process_partition_key_restrictions(bool has_queriable_index, bool for_view);
/**
* Returns the partition key components that are not restricted.
* @return the partition key components that are not restricted.
@@ -172,7 +187,7 @@ private:
* @param select_a_collection <code>true</code> if the query should return a collection column
* @throws InvalidRequestException if the request is invalid
*/
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection);
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view);
/**
* Returns the <code>Restrictions</code> for the specified type of columns.
@@ -378,6 +393,13 @@ public:
auto&& restricted = get_restrictions(cdef->kind).get()->get_column_defs();
return std::find(restricted.begin(), restricted.end(), cdef) != restricted.end();
}
/**
* @return the non-primary key restrictions.
*/
const single_column_restrictions::restrictions_map& get_non_pk_restriction() const {
return _nonprimary_key_restrictions->restrictions();
}
};
}


@@ -157,7 +157,7 @@ public:
}
bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
return abstract_restriction::uses_function(_value, ks_name, function_name);
return abstract_restriction::term_uses_function(_value, ks_name, function_name);
}
void merge_with(::shared_ptr<restriction>) override {
@@ -173,6 +173,13 @@ public:
sstring to_string() const override {
return sprint("EQ(%s)", _value->to_string());
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
};
class token_restriction::slice final : public token_restriction {
@@ -203,11 +210,11 @@ public:
bool uses_function(const sstring& ks_name,
const sstring& function_name) const override {
return (_slice.has_bound(statements::bound::START)
&& abstract_restriction::uses_function(
&& abstract_restriction::term_uses_function(
_slice.bound(statements::bound::START), ks_name,
function_name))
|| (_slice.has_bound(statements::bound::END)
&& abstract_restriction::uses_function(
&& abstract_restriction::term_uses_function(
_slice.bound(statements::bound::END),
ks_name, function_name));
}
@@ -246,6 +253,13 @@ public:
sstring to_string() const override {
return sprint("SLICE%s", _slice);
}
virtual bool is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
};
}


@@ -44,8 +44,10 @@
namespace cql3 {
metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
: metadata(flag_enum_set(), std::move(names_), names_.size(), {})
{ }
: _flags(flag_enum_set())
, names(std::move(names_)) {
_column_count = names.size();
}
metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
::shared_ptr<const service::pager::paging_state> paging_state)


@@ -75,7 +75,7 @@ public:
std::vector<::shared_ptr<column_specification>> names;
private:
const uint32_t _column_count;
uint32_t _column_count;
::shared_ptr<const service::pager::paging_state> _paging_state;
public:
@@ -153,8 +153,8 @@ public:
void trim(size_t limit);
template<typename RowComparator>
void sort(RowComparator&& cmp) {
std::sort(_rows.begin(), _rows.end(), std::forward<RowComparator>(cmp));
void sort(const RowComparator& cmp) {
std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
}
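The `sort()` change above is worth a note: taking the comparator by const reference and passing `std::ref(cmp)` to `std::sort` avoids copying the comparator (`std::sort` takes its comparator by value), which matters when the comparator is expensive to copy or not copyable at all. A standalone sketch of the same pattern, with a hypothetical free function in place of the member:

```cpp
#include <algorithm>
#include <functional>
#include <vector>

// Sketch of the revised signature: cmp is taken by const reference, and
// std::ref wraps it in a cheap-to-copy std::reference_wrapper, so std::sort
// copies only the wrapper, never the comparator itself.
template <typename RowComparator>
void sort_rows(std::vector<int>& rows, const RowComparator& cmp) {
    std::sort(rows.begin(), rows.end(), std::ref(cmp));
}
```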
metadata& get_metadata();


@@ -39,6 +39,8 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/transformed.hpp>
#include "cql3/selection/selection.hh"
#include "cql3/selection/selector_factories.hh"
#include "cql3/result_set.hh"
@@ -203,14 +205,10 @@ protected:
};
::shared_ptr<selection> selection::wildcard(schema_ptr schema) {
std::vector<const column_definition*> cds;
auto& columns = schema->all_columns_in_select_order();
cds.reserve(columns.size());
for (auto& c : columns) {
if (!schema->is_dense() || !c.is_regular() || !c.name().empty()) {
cds.emplace_back(&c);
}
}
auto columns = schema->all_columns_in_select_order();
auto cds = boost::copy_range<std::vector<const column_definition*>>(columns | boost::adaptors::transformed([](const column_definition& c) {
return &c;
}));
return simple_selection::make(schema, std::move(cds), true);
}


@@ -224,7 +224,7 @@ sets::marker::bind(const query_options& options) {
}
void
sets::setter::execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) {
sets::setter::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
const auto& value = _t->bind(params._options);
if (value == constants::UNSET_VALUE) {
return;
@@ -241,7 +241,7 @@ sets::setter::execute(mutation& m, const exploded_clustering_prefix& row_key, co
}
void
sets::adder::execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) {
sets::adder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
const auto& value = _t->bind(params._options);
if (value == constants::UNSET_VALUE) {
return;
@@ -251,7 +251,7 @@ sets::adder::execute(mutation& m, const exploded_clustering_prefix& row_key, con
}
void
sets::adder::do_add(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params,
sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params,
shared_ptr<term> value, const column_definition& column) {
auto set_value = dynamic_pointer_cast<sets::value>(std::move(value));
auto set_type = dynamic_pointer_cast<const set_type_impl>(column.type);
@@ -281,7 +281,7 @@ sets::adder::do_add(mutation& m, const exploded_clustering_prefix& row_key, cons
}
void
sets::discarder::execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) {
sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";
auto&& value = _t->bind(params._options);
@@ -305,7 +305,7 @@ sets::discarder::execute(mutation& m, const exploded_clustering_prefix& row_key,
ctype->serialize_mutation_form(mut)));
}
void sets::element_discarder::execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params)
void sets::element_discarder::execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params)
{
assert(column.type->is_multi_cell() && "Attempted to remove items from a frozen set");
auto elt = _t->bind(params._options);


@@ -112,7 +112,7 @@ public:
setter(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
};
class adder : public operation {
@@ -120,8 +120,8 @@ public:
adder(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) override;
static void do_add(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params,
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
static void do_add(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params,
shared_ptr<term> value, const column_definition& column);
};
@@ -131,14 +131,14 @@ public:
discarder(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) {
}
virtual void execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
};
class element_discarder : public operation {
public:
element_discarder(const column_definition& column, shared_ptr<term> t)
: operation(column, std::move(t)) { }
virtual void execute(mutation& m, const exploded_clustering_prefix& row_key, const update_parameters& params) override;
virtual void execute(mutation& m, const clustering_key_prefix& row_key, const update_parameters& params) override;
};
};


@@ -139,7 +139,7 @@ protected:
}
if (is_IN()) {
return sprint("%s IN %s", entity_as_string, ::to_string(_in_values));
return sprint("%s IN (%s)", entity_as_string, join(", ", _in_values));
}
return sprint("%s %s %s", entity_as_string, _relation_type, _value->to_string());
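The hunk above fixes `relation::to_string()` for IN relations: the value list must be rendered inside parentheses (`x IN (1, 2)`), which the old `::to_string(_in_values)` rendering did not guarantee. A minimal standalone sketch of the corrected formatting, with a simplified `join()` standing in for the real helper:

```cpp
#include <cassert>
#include <sstream>
#include <string>
#include <vector>

// Simplified stand-in for the join() helper used in the patch:
// concatenates the values with the given separator.
std::string join(const std::string& sep, const std::vector<std::string>& vs) {
    std::ostringstream out;
    for (size_t i = 0; i < vs.size(); ++i) {
        if (i) {
            out << sep;
        }
        out << vs[i];
    }
    return out.str();
}

// Renders an IN relation the way the fixed code does: values are
// comma-joined and wrapped in parentheses.
std::string in_relation_to_string(const std::string& entity,
                                  const std::vector<std::string>& values) {
    return entity + " IN (" + join(", ", values) + ")";
}
```

Names here are illustrative; the real code formats via `sprint("%s IN (%s)", ...)`.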


@@ -63,7 +63,7 @@ void cql3::statements::alter_keyspace_statement::validate(distributed<service::s
service::get_local_storage_proxy().get_db().local().find_keyspace(_name); // throws on failure
auto tmp = _name;
std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower);
if (tmp == db::system_keyspace::NAME) {
if (is_system_keyspace(tmp)) {
throw exceptions::invalid_request_exception("Cannot alter system keyspace");
}
@@ -89,21 +89,18 @@ void cql3::statements::alter_keyspace_statement::validate(distributed<service::s
}
}
future<bool> cql3::statements::alter_keyspace_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) {
future<shared_ptr<cql_transport::event::schema_change>> cql3::statements::alter_keyspace_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) {
auto old_ksm = service::get_local_storage_proxy().get_db().local().find_keyspace(_name).metadata();
return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm), is_local_only).then([] {
return true;
return service::get_local_migration_manager().announce_keyspace_update(_attrs->as_ks_metadata_update(old_ksm), is_local_only).then([this] {
using namespace cql_transport;
return make_shared<event::schema_change>(
event::schema_change::change_type::UPDATED,
keyspace());
});
}
shared_ptr<transport::event::schema_change> cql3::statements::alter_keyspace_statement::change_event() {
return make_shared<transport::event::schema_change>(
transport::event::schema_change::change_type::UPDATED,
keyspace());
}
shared_ptr<cql3::statements::prepared_statement>
std::unique_ptr<cql3::statements::prepared_statement>
cql3::statements::alter_keyspace_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_keyspace_statement>(*this));
return std::make_unique<prepared_statement>(make_shared<alter_keyspace_statement>(*this));
}
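This diff illustrates a refactor repeated across all the schema-altering statements: the two-step API (`announce_migration() -> future<bool>` followed by a separate `change_event()` virtual) is collapsed so the migration itself resolves to the client notification. A minimal synchronous sketch of the new shape, with hypothetical simplified types standing in for `seastar::future` and `cql_transport::event::schema_change`:

```cpp
#include <cassert>
#include <memory>
#include <string>

// Hypothetical, simplified stand-in for cql_transport::event::schema_change.
struct schema_change {
    std::string change_type;  // e.g. "UPDATED"
    std::string keyspace;
};

// Old shape: announce the migration, return bool, and let the server call a
// separate change_event() afterwards. New shape: the announcement resolves
// directly to the event, so the two steps can no longer drift apart.
std::shared_ptr<schema_change> announce_keyspace_update(std::string ks) {
    // ... announce the schema mutation to the cluster here ...
    return std::make_shared<schema_change>(schema_change{"UPDATED", std::move(ks)});
}
```

In the real code this return value is produced inside a `.then([this] { ... })` continuation on the migration future.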


@@ -61,9 +61,8 @@ public:
future<> check_access(const service::client_state& state) override;
void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
future<shared_ptr<cql_transport::event::schema_change>> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -40,6 +40,7 @@
*/
#include "cql3/statements/alter_table_statement.hh"
#include "index/secondary_index_manager.hh"
#include "prepared_statement.hh"
#include "service/migration_manager.hh"
#include "validation.hh"
@@ -47,6 +48,7 @@
#include <boost/range/adaptor/filtered.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include "cql3/util.hh"
#include "view_info.hh"
namespace cql3 {
@@ -151,12 +153,20 @@ static void validate_column_rename(const schema& schema, const column_identifier
throw exceptions::invalid_request_exception(sprint("Cannot rename non PRIMARY KEY part %s", from));
}
if (def->is_indexed()) {
throw exceptions::invalid_request_exception(sprint("Cannot rename column %s because it is secondary indexed", from));
if (!schema.indices().empty()) {
auto& sim = secondary_index::get_secondary_index_manager();
auto dependent_indices = sim.local().get_dependent_indices(*def);
if (!dependent_indices.empty()) {
auto index_names = ::join(", ", dependent_indices | boost::adaptors::transformed([](const index_metadata& im) {
return im.name();
}));
throw exceptions::invalid_request_exception(
sprint("Cannot rename column %s because it has dependent secondary indexes (%s)", from, index_names));
}
}
}
future<bool> alter_table_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
auto& db = proxy.local().get_db().local();
auto schema = validation::validate_column_family(db, keyspace(), column_family());
@@ -178,7 +188,7 @@ future<bool> alter_table_statement::announce_migration(distributed<service::stor
}
auto& cf = db.find_column_family(schema);
std::vector<schema_ptr> view_updates;
std::vector<view_ptr> view_updates;
switch (_type) {
case alter_table_statement::type::add:
@@ -219,8 +229,13 @@ future<bool> alter_table_statement::announce_migration(distributed<service::stor
throw exceptions::invalid_request_exception("Cannot use non-frozen collections with super column families");
}
auto it = schema->collections().find(column_name->name());
if (it != schema->collections().end() && !type->is_compatible_with(*it->second)) {
// If there used to be a non-frozen collection column with the same name (that has been dropped),
// we could still have some data using the old type, and so we can't allow adding a collection
// with the same name unless the types are compatible (see #6276).
auto& dropped = schema->dropped_columns();
auto i = dropped.find(column_name->text());
if (i != dropped.end() && !type->is_compatible_with(*i->second.type)) {
throw exceptions::invalid_request_exception(sprint("Cannot add a collection with the name %s "
"because a collection with the same name and a different type has already been used in the past", column_name));
}
@@ -235,7 +250,7 @@ future<bool> alter_table_statement::announce_migration(distributed<service::stor
if (view->view_info()->include_all_columns()) {
schema_builder builder(view);
builder.with_column(column_name->name(), type);
view_updates.push_back(builder.build());
view_updates.push_back(view_ptr(builder.build()));
}
}
}
@@ -261,7 +276,7 @@ future<bool> alter_table_statement::announce_migration(distributed<service::stor
schema_builder builder(view);
auto view_type = validate_alter(view, *view_def, *validator);
builder.with_altered_column_type(column_name->name(), std::move(view_type));
view_updates.push_back(builder.build());
view_updates.push_back(view_ptr(builder.build()));
}
}
break;
@@ -347,32 +362,26 @@ future<bool> alter_table_statement::announce_migration(distributed<service::stor
builder.with_view_info(view->view_info()->base_id(), view->view_info()->base_name(),
view->view_info()->include_all_columns(), std::move(new_where));
view_updates.push_back(builder.build());
view_updates.push_back(view_ptr(builder.build()));
}
}
}
break;
}
auto f = service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, is_local_only);
return f.then([is_local_only, view_updates = std::move(view_updates)] {
return parallel_for_each(view_updates, [is_local_only] (auto&& view) {
return service::get_local_migration_manager().announce_view_update(view_ptr(std::move(view)), is_local_only);
});
}).then([] {
return true;
return service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, std::move(view_updates), is_local_only).then([this] {
using namespace cql_transport;
return make_shared<event::schema_change>(
event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TABLE,
keyspace(),
column_family());
});
}
shared_ptr<transport::event::schema_change> alter_table_statement::change_event()
{
return make_shared<transport::event::schema_change>(transport::event::schema_change::change_type::UPDATED,
transport::event::schema_change::target_type::TABLE, keyspace(), column_family());
}
shared_ptr<cql3::statements::prepared_statement>
std::unique_ptr<cql3::statements::prepared_statement>
cql3::statements::alter_table_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_table_statement>(*this));
return std::make_unique<prepared_statement>(make_shared<alter_table_statement>(*this));
}
}
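The collection-column change above replaces a lookup in the live `collections()` map with a lookup in `dropped_columns()`: re-adding a collection is rejected only when a *dropped* column of the same name had an incompatible type, since old data may still be on disk in the old format (see the referenced #6276). A minimal sketch of that check, where a type is just a name and "compatible" is plain equality standing in for `abstract_type::is_compatible_with()`:

```cpp
#include <cassert>
#include <map>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the schema's dropped-columns record.
struct dropped_column {
    std::string type;
};

// Rejects re-adding a collection column whose previously dropped
// incarnation had an incompatible type, mirroring the patched logic.
void check_readd(const std::map<std::string, dropped_column>& dropped,
                 const std::string& name, const std::string& new_type) {
    auto i = dropped.find(name);
    if (i != dropped.end() && i->second.type != new_type) {
        throw std::invalid_argument(
            "Cannot add a collection with the name " + name +
            " because a collection with the same name and a different type"
            " has already been used in the past");
    }
}
```

A column never dropped, or re-added with the same type, passes; only the incompatible re-add throws.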


@@ -78,9 +78,8 @@ public:
virtual future<> check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}
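The `prepare()` signature change from `shared_ptr<prepared>` to `std::unique_ptr<prepared>` recurs in nearly every file in this series: a freshly prepared statement has exactly one owner (the caller, which may later hand it to a cache), so `unique_ptr` documents that and drops reference-counting overhead. A tiny sketch of the pattern, with an illustrative `prepared_statement` type:

```cpp
#include <cassert>
#include <memory>

// Illustrative stand-in for cql3::statements::prepared_statement.
struct prepared_statement {
    int bound_terms = 0;
};

// New shape: single ownership is expressed in the return type; callers
// that need shared ownership can still convert explicitly later.
std::unique_ptr<prepared_statement> prepare() {
    return std::make_unique<prepared_statement>();
}
```

A caller can promote to shared ownership with `std::shared_ptr<prepared_statement> sp = prepare();` when caching requires it, but the default stays exclusive.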


@@ -43,6 +43,7 @@
#include "schema_builder.hh"
#include "service/migration_manager.hh"
#include "boost/range/adaptor/map.hpp"
#include "stdx.hh"
namespace cql3 {
@@ -71,29 +72,19 @@ void alter_type_statement::validate(distributed<service::storage_proxy>& proxy,
// It doesn't really change anything anyway.
}
shared_ptr<transport::event::schema_change> alter_type_statement::change_event()
{
using namespace transport;
return make_shared<transport::event::schema_change>(event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TYPE,
keyspace(),
_name.get_string_type_name());
}
const sstring& alter_type_statement::keyspace() const
{
return _name.get_keyspace();
}
static int32_t get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
static stdx::optional<uint32_t> get_idx_of_field(user_type type, shared_ptr<column_identifier> field)
{
for (uint32_t i = 0; i < type->field_names().size(); ++i) {
if (field->name() == type->field_names()[i]) {
return i;
return {i};
}
}
return -1;
return {};
}
void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, bool is_local_only)
@@ -114,19 +105,19 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
for (auto&& schema : ks.metadata()->cf_meta_data() | boost::adaptors::map_values) {
auto cfm = schema_builder(schema);
bool modified = false;
for (auto&& column : schema->all_columns() | boost::adaptors::map_values) {
auto t_opt = column->type->update_user_type(updated);
for (auto&& column : schema->all_columns()) {
auto t_opt = column.type->update_user_type(updated);
if (t_opt) {
modified = true;
// We need to update this column
cfm.with_altered_column_type(column->name(), *t_opt);
cfm.with_altered_column_type(column.name(), *t_opt);
}
}
if (modified) {
if (schema->is_view()) {
service::get_local_migration_manager().announce_view_update(view_ptr(cfm.build()), is_local_only).get();
} else {
service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, is_local_only).get();
service::get_local_migration_manager().announce_column_family_update(cfm.build(), false, {}, is_local_only).get();
}
}
}
@@ -144,14 +135,19 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
}
}
future<bool> alter_type_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
future<shared_ptr<cql_transport::event::schema_change>> alter_type_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
return seastar::async([this, &proxy, is_local_only] {
auto&& db = proxy.local().get_db().local();
try {
auto&& ks = db.find_keyspace(keyspace());
do_announce_migration(db, ks, is_local_only);
return true;
using namespace cql_transport;
return make_shared<event::schema_change>(
event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TYPE,
keyspace(),
_name.get_string_type_name());
} catch (no_such_keyspace& e) {
throw exceptions::invalid_request_exception(sprint("Cannot alter type in unknown keyspace %s", keyspace()));
}
@@ -168,7 +164,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
if (get_idx_of_field(to_update, _field_name) >= 0) {
if (get_idx_of_field(to_update, _field_name)) {
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
}
@@ -185,19 +181,19 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type to_update) const
{
uint32_t idx = get_idx_of_field(to_update, _field_name);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
}
auto previous = to_update->field_types()[idx];
auto previous = to_update->field_types()[*idx];
auto new_type = _field_type->prepare(db, keyspace())->get_type();
if (!new_type->is_compatible_with(*previous)) {
throw exceptions::invalid_request_exception(sprint("Type %s is incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
}
std::vector<data_type> new_types(to_update->field_types());
new_types[idx] = new_type;
new_types[*idx] = new_type;
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, to_update->field_names(), std::move(new_types));
}
@@ -221,25 +217,25 @@ user_type alter_type_statement::renames::make_updated_type(database& db, user_ty
std::vector<bytes> new_names(to_update->field_names());
for (auto&& rename : _renames) {
auto&& from = rename.first;
int32_t idx = get_idx_of_field(to_update, from);
if (idx < 0) {
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, from);
if (!idx) {
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", from->to_string(), _name.to_string()));
}
new_names[idx] = rename.second->name();
new_names[*idx] = rename.second->name();
}
auto&& updated = user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), to_update->field_types());
create_type_statement::check_for_duplicate_names(updated);
return updated;
}
shared_ptr<cql3::statements::prepared_statement>
std::unique_ptr<cql3::statements::prepared_statement>
alter_type_statement::add_or_alter::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_type_statement::add_or_alter>(*this));
return std::make_unique<prepared_statement>(make_shared<alter_type_statement::add_or_alter>(*this));
}
shared_ptr<cql3::statements::prepared_statement>
std::unique_ptr<cql3::statements::prepared_statement>
alter_type_statement::renames::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_type_statement::renames>(*this));
return std::make_unique<prepared_statement>(make_shared<alter_type_statement::renames>(*this));
}
}
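The `get_idx_of_field()` change above is more than style: the old caller in `do_alter` stored the result in a `uint32_t` and tested `if (idx < 0)`, a comparison that is always false for an unsigned type, so the "Unknown field" branch was unreachable. Returning an empty optional instead of a `-1` sentinel makes the not-found case impossible to ignore. A runnable sketch, with `std::optional` standing in for `stdx::optional`:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Returns the index of `field` in `field_names`, or an empty optional if
// absent. Unlike a -1 sentinel, the empty state survives unsigned storage.
std::optional<uint32_t>
get_idx_of_field(const std::vector<std::string>& field_names,
                 const std::string& field) {
    for (uint32_t i = 0; i < field_names.size(); ++i) {
        if (field == field_names[i]) {
            return i;
        }
    }
    return std::nullopt;
}
```

Callers now write `if (!idx) { throw ...; }` and dereference with `*idx`, exactly as the patched `do_alter` and `renames` code does.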


@@ -61,11 +61,9 @@ public:
virtual void validate(distributed<service::storage_proxy>& proxy, const service::client_state& state) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual const sstring& keyspace() const override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
class add_or_alter;
class renames;
@@ -84,7 +82,7 @@ public:
const shared_ptr<column_identifier> field_name,
const shared_ptr<cql3_type::raw> field_type);
virtual user_type make_updated_type(database& db, user_type to_update) const override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
virtual std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
private:
user_type do_add(database& db, user_type to_update) const;
user_type do_alter(database& db, user_type to_update) const;
@@ -101,7 +99,7 @@ public:
void add_rename(shared_ptr<column_identifier> previous_name, shared_ptr<column_identifier> new_name);
virtual user_type make_updated_type(database& db, user_type to_update) const override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
virtual std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -92,7 +92,7 @@ future<> cql3::statements::alter_user_statement::check_access(const service::cli
});
}
future<::shared_ptr<transport::messages::result_message>>
future<::shared_ptr<cql_transport::messages::result_message>>
cql3::statements::alter_user_statement::execute(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) {
return auth::auth::is_existing_user(_username).then([this](bool exists) {
if (!exists) {
@@ -104,7 +104,7 @@ cql3::statements::alter_user_statement::execute(distributed<service::storage_pro
return auth::auth::insert_user(_username, *_superuser);
});
}
return f.then([] { return make_ready_future<::shared_ptr<transport::messages::result_message>>(); });
return f.then([] { return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>(); });
});
}


@@ -62,7 +62,7 @@ public:
void validate(distributed<service::storage_proxy>&, const service::client_state&) override;
future<> check_access(const service::client_state&) override;
future<::shared_ptr<transport::messages::result_message>> execute(distributed<service::storage_proxy>&
future<::shared_ptr<cql_transport::messages::result_message>> execute(distributed<service::storage_proxy>&
, service::query_state&
, const query_options&) override;
};


@@ -43,6 +43,7 @@
#include "cql3/statements/prepared_statement.hh"
#include "service/migration_manager.hh"
#include "validation.hh"
#include "view_info.hh"
namespace cql3 {
@@ -72,7 +73,7 @@ void alter_view_statement::validate(distributed<service::storage_proxy>&, const
// validated in announce_migration()
}
future<bool> alter_view_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
future<shared_ptr<cql_transport::event::schema_change>> alter_view_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
auto&& db = proxy.local().get_db().local();
schema_ptr schema = validation::validate_column_family(db, keyspace(), column_family());
@@ -96,24 +97,27 @@ future<bool> alter_view_statement::announce_migration(distributed<service::stora
"low might cause undelivered updates to expire before being replayed.");
}
return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build()), is_local_only).then([] {
return true;
if (builder.default_time_to_live().count() > 0) {
throw exceptions::invalid_request_exception(
"Cannot set or alter default_time_to_live for a materialized view. "
"Data in a materialized view always expires at the same time as "
"the corresponding data in the parent table.");
}
return service::get_local_migration_manager().announce_view_update(view_ptr(builder.build()), is_local_only).then([this] {
using namespace cql_transport;
return make_shared<event::schema_change>(
event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TABLE,
keyspace(),
column_family());
});
}
shared_ptr<transport::event::schema_change> alter_view_statement::change_event()
{
using namespace transport;
return make_shared<event::schema_change>(event::schema_change::change_type::UPDATED,
event::schema_change::target_type::TABLE,
keyspace(),
column_family());
}
shared_ptr<cql3::statements::prepared_statement>
std::unique_ptr<cql3::statements::prepared_statement>
alter_view_statement::prepare(database& db, cql_stats& stats) {
return make_shared<prepared_statement>(make_shared<alter_view_statement>(*this));
return std::make_unique<prepared_statement>(make_shared<alter_view_statement>(*this));
}
}


@@ -63,11 +63,9 @@ public:
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual future<shared_ptr<cql_transport::event::schema_change>> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
virtual std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
};
}


@@ -46,9 +46,9 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() {
return 0;
}
::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authentication_statement::prepare(
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::authentication_statement::prepare(
database& db, cql_stats& stats) {
return ::make_shared<prepared>(this->shared_from_this());
return std::make_unique<prepared>(this->shared_from_this());
}
bool cql3::statements::authentication_statement::uses_function(
@@ -75,7 +75,7 @@ future<> cql3::statements::authentication_statement::check_access(const service:
return make_ready_future<>();
}
future<::shared_ptr<transport::messages::result_message>> cql3::statements::authentication_statement::execute_internal(
future<::shared_ptr<cql_transport::messages::result_message>> cql3::statements::authentication_statement::execute_internal(
distributed<service::storage_proxy>& proxy,
service::query_state& state, const query_options& options) {
// Internal queries are exclusively on the system keyspace and makes no sense here


@@ -54,7 +54,7 @@ class authentication_statement : public raw::parsed_statement, public cql_statem
public:
uint32_t get_bound_terms() override;
::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
@@ -66,7 +66,7 @@ public:
void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
future<::shared_ptr<transport::messages::result_message>>
future<::shared_ptr<cql_transport::messages::result_message>>
execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override;
};


@@ -46,9 +46,9 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() {
return 0;
}
::shared_ptr<cql3::statements::prepared_statement> cql3::statements::authorization_statement::prepare(
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::authorization_statement::prepare(
database& db, cql_stats& stats) {
return ::make_shared<parsed_statement::prepared>(this->shared_from_this());
return std::make_unique<parsed_statement::prepared>(this->shared_from_this());
}
bool cql3::statements::authorization_statement::uses_function(
@@ -75,7 +75,7 @@ future<> cql3::statements::authorization_statement::check_access(const service::
return make_ready_future<>();
}
future<::shared_ptr<transport::messages::result_message>> cql3::statements::authorization_statement::execute_internal(
future<::shared_ptr<cql_transport::messages::result_message>> cql3::statements::authorization_statement::execute_internal(
distributed<service::storage_proxy>& proxy,
service::query_state& state, const query_options& options) {
// Internal queries are exclusively on the system keyspace and makes no sense here


@@ -58,7 +58,7 @@ class authorization_statement : public raw::parsed_statement, public cql_stateme
public:
uint32_t get_bound_terms() override;
::shared_ptr<prepared> prepare(database& db, cql_stats& stats) override;
std::unique_ptr<prepared> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
@@ -70,7 +70,7 @@ public:
void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
future<::shared_ptr<transport::messages::result_message>>
future<::shared_ptr<cql_transport::messages::result_message>>
execute_internal(distributed<service::storage_proxy>& proxy, service::query_state& state, const query_options& options) override;
protected:


@@ -40,6 +40,7 @@
#include "batch_statement.hh"
#include "raw/batch_statement.hh"
#include "db/config.hh"
#include <seastar/core/execution_stage.hh>
namespace {
@@ -77,6 +78,14 @@ batch_statement::batch_statement(int bound_terms, type type_,
{
}
batch_statement::batch_statement(type type_,
std::vector<shared_ptr<modification_statement>> statements,
std::unique_ptr<attributes> attrs,
cql_stats& stats)
: batch_statement(-1, type_, std::move(statements), std::move(attrs), stats)
{
}
bool batch_statement::uses_function(const sstring& ks_name, const sstring& function_name) const
{
return _attrs->uses_function(ks_name, function_name)
@@ -149,6 +158,13 @@ void batch_statement::validate()
| boost::adaptors::uniqued) != 1))) {
throw exceptions::invalid_request_exception("Batch with conditions cannot span multiple tables");
}
std::experimental::optional<bool> raw_counter;
for (auto& s : _statements) {
if (raw_counter && s->is_raw_counter_shard_write() != *raw_counter) {
throw exceptions::invalid_request_exception("Cannot mix raw and regular counter statements in batch");
}
raw_counter = s->is_raw_counter_shard_write();
}
}
void batch_statement::validate(distributed<service::storage_proxy>& proxy, const service::client_state& state)
@@ -200,7 +216,12 @@ future<std::vector<mutation>> batch_statement::get_mutations(distributed<service
}
void batch_statement::verify_batch_size(const std::vector<mutation>& mutations) {
if (mutations.size() <= 1) {
return; // We only warn for batch spanning multiple mutations
}
size_t warn_threshold = service::get_local_storage_proxy().get_db().local().get_config().batch_size_warn_threshold_in_kb() * 1024;
size_t fail_threshold = service::get_local_storage_proxy().get_db().local().get_config().batch_size_fail_threshold_in_kb() * 1024;
class my_partition_visitor : public mutation_partition_visitor {
public:
@@ -212,7 +233,7 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
size += v.data.size();
}
void accept_row_tombstone(const range_tombstone&) override {}
void accept_row(clustering_key_view, tombstone, const row_marker&) override {}
void accept_row(position_in_partition_view, const row_tombstone&, const row_marker&, is_dummy, is_continuous) override {}
void accept_row_cell(column_id, atomic_cell_view v) override {
size += v.value().size();
}
@@ -230,24 +251,36 @@ void batch_statement::verify_batch_size(const std::vector<mutation>& mutations)
}
if (v.size > warn_threshold) {
std::unordered_set<sstring> ks_cf_pairs;
for (auto&& m : mutations) {
ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
auto error = [&] (const char* type, size_t threshold) -> sstring {
std::unordered_set<sstring> ks_cf_pairs;
for (auto&& m : mutations) {
ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
}
return sprint("Batch of prepared statements for %s is of size %d, exceeding specified %s threshold of %d by %d.",
join(", ", ks_cf_pairs), v.size, type, threshold, v.size - threshold);
};
if (v.size > fail_threshold) {
_logger.error(error("FAIL", fail_threshold).c_str());
throw exceptions::invalid_request_exception("Batch too large");
} else {
_logger.warn(error("WARN", warn_threshold).c_str());
}
_logger.warn(
"Batch of prepared statements for {} is of size {}, exceeding specified threshold of {} by {}.{}",
join(", ", ks_cf_pairs), v.size, warn_threshold,
v.size - warn_threshold, "");
}
}
future<shared_ptr<transport::messages::result_message>> batch_statement::execute(
struct batch_statement_executor {
static auto get() { return &batch_statement::do_execute; }
};
static thread_local auto batch_stage = seastar::make_execution_stage("cql3_batch", batch_statement_executor::get());
future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute(
distributed<service::storage_proxy>& storage, service::query_state& state, const query_options& options) {
++_stats.batches;
return execute(storage, state, options, false, options.get_timestamp(state));
return batch_stage(this, seastar::ref(storage), seastar::ref(state),
seastar::cref(options), false, options.get_timestamp(state));
}
future<shared_ptr<transport::messages::result_message>> batch_statement::execute(
future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_execute(
distributed<service::storage_proxy>& storage,
service::query_state& query_state, const query_options& options,
bool local, api::timestamp_type now)
@@ -266,8 +299,8 @@ future<shared_ptr<transport::messages::result_message>> batch_statement::execute
return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
}).then([] {
return make_ready_future<shared_ptr<transport::messages::result_message>>(
make_shared<transport::messages::result_message::void_message>());
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
make_shared<cql_transport::messages::result_message::void_message>());
});
}
@@ -305,7 +338,7 @@ future<> batch_statement::execute_without_conditions(
return storage.local().mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
}
future<shared_ptr<transport::messages::result_message>> batch_statement::execute_with_conditions(
future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_with_conditions(
distributed<service::storage_proxy>& storage,
const query_options& options,
service::query_state& state)
@@ -358,7 +391,7 @@ future<shared_ptr<transport::messages::result_message>> batch_statement::execute
#endif
}
future<shared_ptr<transport::messages::result_message>> batch_statement::execute_internal(
future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_internal(
distributed<service::storage_proxy>& proxy,
service::query_state& query_state, const query_options& options)
{
@@ -377,12 +410,22 @@ future<shared_ptr<transport::messages::result_message>> batch_statement::execute
namespace raw {
shared_ptr<prepared_statement>
std::unique_ptr<prepared_statement>
batch_statement::prepare(database& db, cql_stats& stats) {
auto&& bound_names = get_bound_variables();
stdx::optional<sstring> first_ks;
stdx::optional<sstring> first_cf;
bool have_multiple_cfs = false;
std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
for (auto&& parsed : _parsed_statements) {
if (!first_ks) {
first_ks = parsed->keyspace();
first_cf = parsed->column_family();
} else {
have_multiple_cfs = first_ks.value() != parsed->keyspace() || first_cf.value() != parsed->column_family();
}
statements.push_back(parsed->prepare(db, bound_names, stats));
}
@@ -392,8 +435,13 @@ batch_statement::prepare(database& db, cql_stats& stats) {
     cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs), stats);
     batch_statement_.validate();
-    return ::make_shared<prepared>(make_shared(std::move(batch_statement_)),
-                                                     bound_names->get_specifications());
+    std::vector<uint16_t> partition_key_bind_indices;
+    if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
+        partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(batch_statement_.get_statements()[0]->s);
+    }
+    return std::make_unique<prepared>(make_shared(std::move(batch_statement_)),
+            bound_names->get_specifications(),
+            std::move(partition_key_bind_indices));
 }
 }

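The `prepare()` change above only computes partition-key bind indices when every statement in the batch targets the same keyspace and table. A minimal standalone sketch of that single-table check follows; the type and function names are simplified stand-ins, not Scylla's, and the flag here is made sticky so a later matching statement cannot clear a mismatch seen earlier:

```cpp
#include <cassert>
#include <optional>
#include <string>
#include <vector>

// Simplified stand-in for a parsed batch statement: just the table it targets.
struct parsed_statement {
    std::string keyspace;
    std::string column_family;
};

// Mirrors the single-table check in batch_statement::prepare(): remember the
// first keyspace/table seen and flag any statement that targets a different one.
bool targets_multiple_tables(const std::vector<parsed_statement>& stmts) {
    std::optional<std::string> first_ks;
    std::optional<std::string> first_cf;
    bool have_multiple_cfs = false;
    for (const auto& parsed : stmts) {
        if (!first_ks) {
            first_ks = parsed.keyspace;
            first_cf = parsed.column_family;
        } else if (first_ks.value() != parsed.keyspace || first_cf.value() != parsed.column_family) {
            have_multiple_cfs = true;
        }
    }
    return have_multiple_cfs;
}
```

Only when this check fails does `prepare()` ask `bound_names` for the first statement's partition-key bind indexes; a driver can then route the whole prepared batch to a replica that owns the partition.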
cql3/statements/batch_statement.hh

@@ -87,6 +87,11 @@ public:
                     std::unique_ptr<attributes> attrs,
                     cql_stats& stats);
+    batch_statement(type type_,
+                    std::vector<shared_ptr<modification_statement>> statements,
+                    std::unique_ptr<attributes> attrs,
+                    cql_stats& stats);
     virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
     virtual bool depends_on_keyspace(const sstring& ks_name) const override;
@@ -115,10 +120,11 @@ public:
      */
     static void verify_batch_size(const std::vector<mutation>& mutations);
-    virtual future<shared_ptr<transport::messages::result_message>> execute(
+    virtual future<shared_ptr<cql_transport::messages::result_message>> execute(
             distributed<service::storage_proxy>& storage, service::query_state& state, const query_options& options) override;
 private:
-    future<shared_ptr<transport::messages::result_message>> execute(
+    friend class batch_statement_executor;
+    future<shared_ptr<cql_transport::messages::result_message>> do_execute(
             distributed<service::storage_proxy>& storage,
             service::query_state& query_state, const query_options& options,
             bool local, api::timestamp_type now);
@@ -129,12 +135,12 @@ private:
             db::consistency_level cl,
             tracing::trace_state_ptr tr_state);
-    future<shared_ptr<transport::messages::result_message>> execute_with_conditions(
+    future<shared_ptr<cql_transport::messages::result_message>> execute_with_conditions(
             distributed<service::storage_proxy>& storage,
             const query_options& options,
             service::query_state& state);
 public:
-    virtual future<shared_ptr<transport::messages::result_message>> execute_internal(
+    virtual future<shared_ptr<cql_transport::messages::result_message>> execute_internal(
             distributed<service::storage_proxy>& proxy,
             service::query_state& query_state, const query_options& options) override;

cql3/statements/cf_prop_defs.cc

@@ -41,6 +41,8 @@
 #include "cql3/statements/cf_prop_defs.hh"
+#include <boost/algorithm/string/predicate.hpp>
 namespace cql3 {
 namespace statements {
@@ -61,9 +63,12 @@ const sstring cf_prop_defs::KW_MEMTABLE_FLUSH_PERIOD = "memtable_flush_period_in
 const sstring cf_prop_defs::KW_COMPACTION = "compaction";
 const sstring cf_prop_defs::KW_COMPRESSION = "compression";
+const sstring cf_prop_defs::KW_CRC_CHECK_CHANCE = "crc_check_chance";
 const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class";
+const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled";
 void cf_prop_defs::validate() {
     // Skip validation if the comapction strategy class is already set as it means we've alreayd
     // prepared (and redoing it would set strategyClass back to null, which we don't want)
@@ -76,7 +81,7 @@ void cf_prop_defs::validate() {
             KW_GCGRACESECONDS, KW_CACHING, KW_DEFAULT_TIME_TO_LIVE,
             KW_MIN_INDEX_INTERVAL, KW_MAX_INDEX_INTERVAL, KW_SPECULATIVE_RETRY,
             KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION,
-            KW_COMPRESSION,
+            KW_COMPRESSION, KW_CRC_CHECK_CHANCE
     });
     static std::set<sstring> obsolete_keywords({
         sstring("index_interval"),
@@ -187,6 +192,13 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder) {
     builder.set_min_compaction_threshold(min_compaction_threshold);
     builder.set_max_compaction_threshold(max_compaction_threshold);
+    if (has_property(KW_COMPACTION)) {
+        if (get_compaction_options().count(COMPACTION_ENABLED_KEY)) {
+            auto enabled = boost::algorithm::iequals(get_compaction_options().at(COMPACTION_ENABLED_KEY), "true");
+            builder.set_compaction_enabled(enabled);
+        }
+    }
     builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
     if (has_property(KW_SPECULATIVE_RETRY)) {

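The `apply_to_builder()` hunk above only toggles compaction when the options map actually carries the `enabled` key, and it compares the value against "true" case-insensitively via `boost::algorithm::iequals`. A small standalone sketch of that decision, using a plain `std::map` and a local `iequals` stand-in rather than Scylla's property classes:

```cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <map>
#include <string>

// Stand-in for boost::algorithm::iequals as used in the diff:
// case-insensitive string equality.
bool iequals(const std::string& a, const std::string& b) {
    return a.size() == b.size() &&
           std::equal(a.begin(), a.end(), b.begin(), [](unsigned char x, unsigned char y) {
               return std::tolower(x) == std::tolower(y);
           });
}

// Mirrors the new logic: compaction stays enabled unless the options map
// carries an "enabled" key whose value is not (case-insensitively) "true".
bool compaction_enabled(const std::map<std::string, std::string>& compaction_options) {
    auto it = compaction_options.find("enabled");
    if (it == compaction_options.end()) {
        return true;  // key absent: the builder default is left untouched
    }
    return iequals(it->second, "true");
}
```

This is the plumbing behind disabling automatic compaction per table from CQL, e.g. a schema option along the lines of `compaction = {'class': ..., 'enabled': 'false'}`.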
cql3/statements/cf_prop_defs.hh

@@ -70,8 +70,10 @@ public:
     static const sstring KW_COMPACTION;
     static const sstring KW_COMPRESSION;
+    static const sstring KW_CRC_CHECK_CHANCE;
     static const sstring COMPACTION_STRATEGY_CLASS_KEY;
+    static const sstring COMPACTION_ENABLED_KEY;
     // FIXME: In origin the following consts are in CFMetaData.
     static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0;

Some files were not shown because too many files have changed in this diff.