Compare commits

...

666 Commits

Author SHA1 Message Date
Pekka Enberg
1915521974 release: prepare for 1.0.4 2016-05-29 10:41:38 +03:00
Tomasz Grabiec
ef9974e723 tests: Add unit tests for schema_registry
(cherry picked from commit 90c31701e3)
2016-05-18 14:52:45 +03:00
Tomasz Grabiec
93ac6a584a schema_registry: Fix possible hang in maybe_sync() if syncer doesn't defer
Spotted during code review.

If it doesn't defer, we may execute the then_wrapped() body before we
change the state. Fix by moving the then_wrapped() body after the state changes.

(cherry picked from commit 443e5aef5a)
2016-05-18 13:53:14 +03:00
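The ordering bug above can be reproduced with a toy future whose continuation runs immediately when the future is already resolved: if the continuation is attached before the state change and the syncer didn't defer, the continuation observes stale state. A minimal sketch (names illustrative, not Scylla's actual schema_registry code):

```python
class ToyFuture:
    """A future whose continuation runs immediately if already resolved."""
    def __init__(self, resolved=False):
        self._resolved = resolved
        self._callbacks = []

    def then(self, cb):
        if self._resolved:
            cb()                      # syncer didn't defer: runs right now
        else:
            self._callbacks.append(cb)

    def resolve(self):
        self._resolved = True
        for cb in self._callbacks:
            cb()

def maybe_sync(state, syncer, fixed):
    """Return the phase the continuation observed."""
    observed = []
    f = syncer()
    if fixed:
        state['phase'] = 'syncing'    # the fix: change state first...
        f.then(lambda: observed.append(state['phase']))
    else:
        f.then(lambda: observed.append(state['phase']))
        state['phase'] = 'syncing'    # buggy: too late if f already resolved
    return observed[0]
```

With a synchronous syncer (`lambda: ToyFuture(resolved=True)`), the buggy path observes the stale phase while the fixed path observes the new one.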
Tomasz Grabiec
2457a16d23 migration_manager: Fix schema syncing with older version
The problem was that "s" would not be marked as synced-with if it came from
shard != 0.

As a result, mutation using that schema would fail to apply with an exception:

  "attempted to mutate using not synced schema of ..."

The problem could surface when altering schema without changing
columns and restarting one of the nodes so that it forgets past
versions.

Fixes #1258.

Will be covered by dtest:

  SchemaManagementTest.test_prepared_statements_work_after_node_restart_after_altering_schema_without_changing_columns

(cherry picked from commit 8703136a4f)
2016-05-18 13:52:24 +03:00
Tomasz Grabiec
daabc8777d migration_manager: Invalidate prepared statements on every schema change
Currently we only do that when the column set changes. When prepared
statements are executed, parameters like read repair chance are read
from the schema version stored in the statement. Not invalidating prepared
statements on changes to such parameters makes it appear as if the alter
had no effect.

Fixes #1255.
Message-Id: <1462985495-9767-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit 13d8cd0ae9)
(cherry picked from commit 734cfa949a)
2016-05-15 13:36:39 +03:00
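The invalidation policy can be sketched with a toy cache (a hypothetical sketch, not Scylla's actual code): every schema change drops all cached statements, because a statement captures more from its pinned schema version than just the column set.

```python
class PreparedStatementCache:
    """Toy cache keyed by query string; each entry pins a schema version."""
    def __init__(self):
        self._cache = {}

    def prepare(self, query, schema_version):
        self._cache[query] = schema_version
        return schema_version

    def lookup(self, query):
        return self._cache.get(query)

    def on_schema_change(self):
        # Invalidate on *every* schema change, not only when the column
        # set changes: parameters such as read repair chance also live
        # in the pinned schema version.
        self._cache.clear()
```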
Raphael S. Carvalho
b259e1b0bc tests: test that leveled strategy was fixed
L1 wasn't being compacted into L2.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <1a357896a448eafa7da4d28bc56fa02b89d4193e.1460508373.git.raphaelsc@scylladb.com>
(cherry picked from commit beaacbda2e)
2016-05-09 08:17:59 +03:00
Raphael S. Carvalho
322f194032 sstables: Fix leveled compaction strategy
There is a problem in the implementation of leveled compaction strategy that
prevents level 1 from being compacted into level 2, and so forth. As a result,
all sstables will only belong to either level 0 or 1. One of the consequences
is level 1 being overwhelmed by a huge amount of sstables.

The root of the problem is a conditional statement in the code that prevents a
single sstable, with level > 0, from being compacted into a subsequent level
that is empty or has no overlapping sstables.

Fixes #1180.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <9a4bffdb0368dea77b49c23687015ff5832299ab.1460508373.git.raphaelsc@scylladb.com>
(cherry picked from commit c7b728e716)
2016-05-09 08:17:39 +03:00
Glauber Costa
c51b05efb3 throttle: always release at least one request if we are below the limit
Our current throttling code releases one request per 1MB of memory available
that we have. If we are below the memory limit, but not by 1MB or more, then
we will keep trying to unthrottle, but never actually release anything.

If another memtable is close to the flushing point, those requests may be
exactly the ones that would make it flush. Without them, we'll freeze the
database.

In general, we need to always release at least one request to make sure that
progress is always achieved.

This fixes #1144

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 9c87ae3496)
2016-05-09 08:14:37 +03:00
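The release rule amounts to rounding up to at least one request whenever any memory is free. A sketch of the arithmetic (the 1MB granularity is from the commit message; the function name is illustrative):

```python
RELEASE_GRANULARITY = 1 << 20  # 1MB of freed memory per released request

def requests_to_release(bytes_below_limit):
    """Number of throttled requests to wake up.

    bytes_below_limit // RELEASE_GRANULARITY alone rounds down to zero
    when we are under the limit by less than 1MB, stalling progress
    forever; always release at least one request in that case."""
    if bytes_below_limit <= 0:
        return 0                       # still at or over the limit
    return max(1, bytes_below_limit // RELEASE_GRANULARITY)
```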
Glauber Costa
2e41a09631 memtable_list: make sure at least two memtables are available
This is usually not a problem for the main memtable list - although it can be,
depending on settings - but it shows up easily for the streaming memtables list.

We would like to have at least two memtables, even if we have to cut them
short. If we don't do that, one memtable will use all available memory and
we'll force throttling until that memtable is completely flushed.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 2c5dfe08c1)
2016-05-09 08:14:37 +03:00
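The intent can be sketched as a cap on each memtable's size so that a second one always fits (a hypothetical sketch; the real sizing logic lives in memtable_list):

```python
def memtable_size_limit(region_group_memory, preferred_size):
    """Cap a memtable's size so at least two fit in the region group.

    Without the cap, a single memtable can consume all available memory,
    forcing writes to throttle until that memtable is fully flushed; a
    second memtable lets writes proceed while the first one flushes."""
    return min(preferred_size, region_group_memory // 2)
```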
Glauber Costa
44cfbc15d0 unnest throttle_state
throttle_state is currently a nested member of database, but there is no
particular reason for us to do so - aside from the fact that it is currently
only ever referenced by the database.

We'll soon want some interaction between this and the column family, to
allow us to flush during throttle. To make that easier, let's unnest it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 1daede7396)
2016-05-09 08:14:37 +03:00
Glauber Costa
c9bd954237 move information about memtables' region group inside memtable list
This is a preparation patch so we can move the throttling infrastructure inside
the memtable_list. To do that, the region group will have to be passed to the
throttler so let's just go ahead and store it.

As a consequence, all the CF has to tell us is the current schema - no
longer how to create a new memtable.

Also, with a new parameter passed to the memtable_list, the creation code
gets quite big and hard to follow, so let's move the creation functions
into a helper.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 39def369ce)
2016-05-09 08:14:37 +03:00
Calle Wilund
6c4d7223fe database.cc: Fix compilation error with boost 1.55
Message-Id: <1461067254-526-1-git-send-email-calle@scylladb.com>
(cherry picked from commit 9130b0de16)
2016-05-04 08:42:21 +03:00
Calle Wilund
c1a5488993 sstables: Fix compilation error on boost 1.55
Message-Id: <1461067254-526-2-git-send-email-calle@scylladb.com>
(cherry picked from commit 49d3d79dfe)
2016-05-04 08:42:15 +03:00
Pekka Enberg
9c9f62e30b release: prepare for 1.0.3 2016-05-02 14:29:15 +03:00
Pekka Enberg
c147676ccb dist/docker/redhat: Make sure image builds against latest Scylla
Use "yum clean expire-cache" to make sure we build against the latest
Scylla release.
Message-Id: <1460374418-27315-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 355c3ea331)
2016-04-27 15:07:38 +03:00
Raphael S. Carvalho
07adedf28a tests: fix use-after-free in sstable test
After commit a843aea547, a gate was introduced to make sure that
an asynchronous operation is finished before the column family is
destroyed. An sstable test case was not stopping the column family;
it just removed the column family from the compaction manager.
That could cause a use-after-free if the column family is destroyed
while the asynchronous operation is running. Let's fix it by
stopping the column family in the test.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <ed910ec459c1752148099e6dc503e7f3adee54da.1461177411.git.raphaelsc@scylladb.com>
(cherry picked from commit eb51c93a5a)
2016-04-26 10:37:40 +03:00
Pekka Enberg
8ca530b6d3 Merge "Backport atomic sstable deletion to 1.0" from Avi
"This patchset is a backport of the atomic sstable deletion patchset, which
 waits until all shards agree to delete an sstable set before deleting it,
 avoiding the resurrecting data problem.

 The first four patches are identical to master, the last patch is new.

 Fixes #1181"
2016-04-25 14:12:33 +03:00
Avi Kivity
e5a123ea80 sstables: avoid long-duration smp calls in delete_atomically()
Since seastar is limited to 128 cross-shard calls per shard-pair,
long-duration smp calls can lead to deadlocks.

Prevent such calls by returning immediately from shard 0 (which manages
the deletions), and calling back to the requesting shard when the deletion
completes.
2016-04-25 13:21:00 +03:00
Avi Kivity
9bfce3255a db: delete compacted sstables atomically
If sstables A, B are compacted, A and B must be deleted atomically.
Otherwise, if A has data that is covered by a tombstone in B, and that
tombstone is deleted, and if B is deleted while A is not, then the data
in A is resurrected.

Fixes #1181.

(cherry picked from commit a843aea547)
2016-04-25 11:41:50 +03:00
Avi Kivity
d2251199b2 sstables: convert sstable::mark_for_deletion() to atomic deletion infrastructure
All deletions must go through the same data structure, or some atomic
deletions will never be satisfied.

(cherry picked from commit 3798d04ae8)
2016-04-25 11:41:39 +03:00
Avi Kivity
bed6437b38 main: cancel pending atomic deletions on shutdown
A shared sstable must be compacted by all shards before it can be deleted.
Since we're stopping, that's not going to happen.  Cancel those pending
deletions so that anyone waiting on them can continue.

(cherry picked from commit e43dbac836)
2016-04-25 11:41:28 +03:00
Avi Kivity
70508734a5 sstables: add delete_atomically(), for atomically deleting multiple sstables
When we compact a set of sstables, we have to remove the set atomically,
otherwise we can resurrect data if the following happens:

 insert data to sstable A
 insert tombstone to sstable B
 compact A+B -> C (removing both data and tombstone)
 delete B only
 read data from A

Since an sstable may be shared by multiple shards, and each shard performs
compaction at a different time, we need to defer deletion of an sstable
set until all shards agree that the set can be deleted.

An additional atomicity issue exists because posix does not provide a way
to atomically delete multiple files.  This issue is not addressed by this
patch.

(cherry picked from commit 2ba584db8d)
2016-04-25 11:41:20 +03:00
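The "wait until all shards agree" scheme above can be sketched as per-set bookkeeping: each shard records its agreement when it has compacted the set away, and the files are only removed, together, once the last shard agrees (illustrative names, not the actual Scylla infrastructure):

```python
class AtomicDeletionTracker:
    """Sketch of deferred, all-shards-agree sstable set deletion."""
    def __init__(self, n_shards):
        self._n_shards = n_shards
        self._pending = {}          # sstable set -> shards that agreed
        self.deleted = []

    def request_deletion(self, shard, sstables):
        """Record one shard's agreement; delete the whole set only once
        every shard has compacted it away. Returns True when deleted."""
        key = frozenset(sstables)
        agreed = self._pending.setdefault(key, set())
        agreed.add(shard)
        if len(agreed) < self._n_shards:
            return False            # some shard still reads these files
        del self._pending[key]
        self.deleted.append(key)    # all files of the set go together
        return True
```

Deleting the whole set in one step is what prevents the resurrection scenario: the tombstone-holding sstable can never disappear while its data-holding peer remains.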
Pekka Enberg
60307f62fe release: prepare for 1.0.2 2016-04-20 22:10:57 +03:00
Gleb Natapov
8006a15e3b udt: fix error generation if accessed type is not udt
Fixes #1198
Message-Id: <1460884314-3717-2-git-send-email-gleb@scylladb.com>

(cherry picked from commit f3b515052b)
2016-04-19 11:28:53 +03:00
Duarte Nunes
1cfbc29f01 udt: Implement to_string() for selectable
Signed-off-by: Duarte Nunes <duarte@scylladb.com>
Message-Id: <1460884314-3717-1-git-send-email-gleb@scylladb.com>
(cherry picked from commit ece89069dd)
2016-04-19 11:28:46 +03:00
Tomasz Grabiec
c665455b71 tests: Add test for query of collection with deleted item
(cherry picked from commit 89bc32b020)
2016-04-18 11:30:28 +03:00
Tomasz Grabiec
b09c91d1c8 mutation_partition: Fix collection emptiness check
Broken by f15c380a4f.

This resulted in an empty collection being returned in the results
instead of no collection.

Fixes org.apache.cassandra.cql3.validation.entities.CollectionsTest
from cassandra-unit-tests.

(cherry picked from commit c69d0a8e87)
2016-04-18 11:30:22 +03:00
Tomasz Grabiec
776ae831e6 types: Add default argument values to is_any_live()
(cherry picked from commit b0d4782016)
2016-04-18 11:30:16 +03:00
Pekka Enberg
2ad3c7532f Merge "Summary backport" from Glauber
This series contains 1.0 backports of the following series:

 * Commit 9b98278 ("Merge "Be able to boot without a Summary" from Glauber")
 * Commit 60352f8 ("Merge "Fixes for the reading of missing Summary" from Glauber")

The backport was done by Glauber because the original commits don't work
as-is due to I/O error handling differences in master and 1.0.

Fixes #1170
2016-04-13 22:02:40 +03:00
Glauber Costa
91c35c3e19 sstable_tests: make sure the generation of the Summary is sane
When we regenerate a missing Summary, we should make sure it is
generated sanely, and that it resembles the Summary that would
otherwise have been there.

In this test we take one of the Summary tests we've been running
and apply it to the non-existent Summary file, expecting the same
results. Plus, a new test is added with some sanity checking.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:18:03 -04:00
Glauber Costa
4f0cc195dc be robust against broken summary files
Now that we can boot without a Summary file, we can just as easily boot
with a broken one.

Suggested by Nadav, and it is actually very easy to do, so do it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:17:54 -04:00
Glauber Costa
c9f7986be4 review fixes for generate_summary
Spotted by Avi post-merge
1) Need to close the file
2) Should be using the parameter pc instead of the default_class

1.0 backport: general_disk_error is non-existent. Replace it with just
propagating the exception

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:17:15 -04:00
Glauber Costa
4feaf1372b clear components if reading toc fails
This shouldn't be a problem in practice, because if read_toc() fails,
the users will just tend to discard the sstable object altogether, and
not insist on using it.

However, if somebody does try to keep using it, a subsequent read_toc() could
theoretically find some components already filled in, leading the new reader
to believe the toc was populated successfully.

It is easier to just clear the _components set and never worry about it, than
trying to reason about whether or not that could happen.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:14:04 -04:00
Glauber Costa
3ebfecc88e index_reader: avoid misleading parent name
Also add comments about the expected signature of IndexConsumer

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:13:56 -04:00
Glauber Costa
c841d87fe3 summary: generate one if it is not present
There are cases in which a Summary file will not be present, and imported
SSTables will have just the Index and Data files. In earlier versions of
Cassandra, a Summary didn't exist, so one may not be generated when migrating.

In Issue #1170, we can see an example of tables generated by CQLSSTableWriter,
and they lack a Summary. Cassandra is robust against this and can cope
perfectly with the Summary not existing. I will argue that we should do the
same.

1.0 backport: open_checked_file_dma -> open_file_dma

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:13:11 -04:00
Glauber Costa
7a887ea2ea sstables: allow read_toc to be called more than once
We do that by bailing out immediately if we detect that the components
map is already populated. This allows us to call read_toc() earlier
if we need to - for instance, to inquire about the existence of the
Summary - without needing to re-read the components later.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:10:52 -04:00
Glauber Costa
bc4d63c802 sstables: avoid passing schema unnecessarily
For prepare_summary we can just pass the min interval as a parameter and
avoid having the schema do yet another hop. For sealing the summary, the
schema is completely unused and we can do away with it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:10:41 -04:00
Glauber Costa
616196b543 index reader: make index_consumer a template parameter
This is done so we can use other consumers. An example of that is the
regeneration of the Summary from an existing Index.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:10:32 -04:00
Glauber Costa
a04f462904 make get_sstable_key_range an instance method
Because just creating an SSTable object does not generate any I/O,
get_sstable_key_range should be an instance method. The main advantage
of doing that is that we won't have to read the summary twice. The way
we do it currently, if it happens to be a shard-relevant table we'll
call load() - which reads the summary again.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:10:18 -04:00
Glauber Costa
ebf8fb802e do not re-read the summary
There are times in which we read the Summary file twice. That actually happens
every time during normal boot (it doesn't during refresh). First during
get_sstable_key_range and then again during load().

Every summary will have at least one entry, so we can easily test for whether
or not this is properly initialized.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-04-13 14:10:00 -04:00
Avi Kivity
8d1374e911 sstables: filter sstables single-row read using first_key/last_key
Using leveled compaction strategy, only a few sstables will contain a
given key, so we need to filter out the rest.  Using the summary entries
to filter keys works if the key is before the first summary entry,
but does not work if it is after the last summary entry, because the last
summary entry does not represent the last key; so sstables that are
towards the beginning of the ring are read even if they do not contain
the key, greatly reducing read performance.

Fix by consulting the summary's first_key/last_key entries before consulting
the summary entry array.

(cherry picked from commit 715794cce6)
2016-04-13 09:25:07 +03:00
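The filter described above amounts to a simple range check per sstable before the (more expensive) summary lookup. A minimal sketch, with illustrative names and plain-string keys standing in for partition keys:

```python
def may_contain(key, first_key, last_key):
    """Cheap containment filter for a single-row read.

    An sstable can hold `key` only if first_key <= key <= last_key; the
    summary's last entry is not the sstable's last key, so it cannot
    serve as the upper bound."""
    return first_key <= key <= last_key

def sstables_to_read(key, sstables):
    """Filter a list of (name, first_key, last_key) descriptors."""
    return [name for name, lo, hi in sstables if may_contain(key, lo, hi)]
```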
Avi Kivity
bacc769328 Update seastar submodule (branch-1.0)
* seastar aa281bd...0225940 (10):
  > memory: avoid exercising the reclaimers for oversized requests
  > memory: fix live objects counter underflow due to cross-cpu free
  > core/reactor: Don't abort in allocate_aligned_buffer() on allocation failure
  > scripts/posix_net_conf.sh: added a support for bonding interfaces
  > scripts/posix_net_conf.sh: move the NIC configuration code into a separate function
  > scripts/posix_net_conf.sh: implement the logic for selecting default MQ mode
  > scripts/posix_net_conf.sh: forward the interface name as a parameter
  > http/routes: Remove request failure logging to stderr
  > lowres_clock: Initialize _now when the clock is created
  > apps/iotune: fix broken URL
2016-04-11 09:18:47 +03:00
Avi Kivity
241eb9e199 Update seastar submodule to point to scylla-seastar
This allows us to cherry-pick seastar fixes.
2016-04-10 18:25:31 +03:00
Pekka Enberg
58fdfe5bc9 release: prepare for 1.0.1 2016-04-09 19:21:21 +03:00
Tomasz Grabiec
f45cc1b229 tests: cql_query_test: Add test for slicing in reverse
(cherry picked from commit 3e0c24934b)
2016-04-09 18:42:53 +03:00
Tomasz Grabiec
14f9eeaafd mutation_partition: Fix static row being returned when paginating
Reproduced by dtest paging_test.py:TestPagingData.static_columns_paging_test.

Broken by f15c380a4f, where the
calculation of has_ck_selector got broken, in such a way that present
clustering restrictions were treated as if not present, which resulted
in the static row being returned when it shouldn't be.

While at it, unify the check between query_compacted() and
do_compact() by extracting it to a function.

(cherry picked from commit c2b955d40b)
2016-04-09 18:42:53 +03:00
Tomasz Grabiec
05df90ad4b mutation_partition: Fix reversed trim_rows()
The first erase_and_dispose(), which removes rows between the last
position and the beginning of the next range, can invalidate the end()
iterator of the range. Fix by looking up the end after erasing.

mutation_partition::range() was split into lower_bound() and
upper_bound() to allow for that.

This affects for example queries with descending order where the
selected clustering range is empty and falls before all rows.

Exposed by f15c380a4f, which is now
calling do_compact() during query.

Reproduced by dtest paging_test.py:TestPagingData.static_columns_paging_test

(cherry picked from commit a1539fed95)
2016-04-09 18:42:53 +03:00
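The same pitfall can be shown with plain list indices: a bound computed before an erase is stale afterwards, just as the range's end() iterator was. A sketch of the fixed ordering (look up the upper bound only after erasing), using a sorted list of rows:

```python
import bisect

def trim_rows(rows, range_start, range_end):
    """Keep only the rows inside [range_start, range_end] (rows sorted).

    Erase the rows before the range first; only then compute the upper
    bound. A bound captured before the erase would be shifted by the
    number of erased rows - the index analogue of the invalidated
    end() iterator."""
    lo = bisect.bisect_left(rows, range_start)
    del rows[:lo]                        # shifts every later position
    hi = bisect.bisect_right(rows, range_end)   # looked up after erasing
    del rows[hi:]
    return rows
```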
Tomasz Grabiec
5646faba18 tests: Add test for query digest calculation
(cherry picked from commit 474a35ba6b)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
814df06245 tests: mutation_source: Include random mutations in generate_mutation_sets() result
Probably increases coverage.

(cherry picked from commit 4418da77e6)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
5ac9e2501c tests: mutation_test: Move mutation generator to mutation_source_test.hh
So that it can be reused.

(cherry picked from commit 5d768d0681)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
34ddfb4498 tests: mutation_test: Add test case for querying of expired cells
(cherry picked from commit 30d25bc47a)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
e4d4d0b31c partition_slice_builder: Add new setters
(cherry picked from commit 58bbd4203f)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
4125f279c0 tests: result_set_assertions: Add and_only_that()
(cherry picked from commit 7cd8e61429)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
e276e7b1e3 database: Compact mutations when executing data queries
Currently the data query digest includes cells and tombstones which may have
expired or be covered by higher-level tombstones. This causes a digest
mismatch between replicas if some elements are compacted on one of the
nodes and not on others. This mismatch triggers read-repair, which doesn't
resolve because the mutations received by mutation queries do not differ -
they are already compacted.

The fix adds a compacting step before writing and digesting query results,
by reusing the algorithm used by the mutation query. This is not the optimal
way to fix this: the compaction step could be folded into the query writing,
as there is redundancy between the two steps. However, such a change carries
more risk, and was thus postponed.

perf_simple_query test (cassandra-stress-like partitions) shows regression
from 83k to 77k (7%) ops/s.

Fixes #1165.

(cherry picked from commit f15c380a4f)
2016-04-09 18:42:52 +03:00
Tomasz Grabiec
a516b24111 mutation_query: Extract main part of mutation_query() into more generic querying_reader
So that it can be reused in query()

(cherry picked from commit e4e8acc946)
2016-04-09 18:42:52 +03:00
Gleb Natapov
4642c706c1 commitlog, sstables: enlarge XFS extent allocation for large files
With big rows I see contention in XFS allocations which causes the reactor
thread to sleep. Commitlog is the main offender, so enlarge the extent to
the commitlog segment size for big files (commitlog and sstable Data files).

Message-Id: <20160404110952.GP20957@scylladb.com>
(cherry picked from commit 70575699e4)
2016-04-07 09:52:15 +03:00
Nadav Har'El
4666c095bc sstables: overhaul range tombstone reading
Until recently, we believed that range tombstones read from sstables would
always be for entire rows (or, more generally, clustering-key prefixes),
not for arbitrary ranges. But as we found out, because Cassandra insists
that range tombstones do not overlap, it may take two overlapping row
tombstones and convert them into three range tombstones which look like
general ranges (see the patch for a more detailed example).

Not only do we need to accept such "split" range tombstones, we also need
to convert them back to our internal representation which, in the above
example, involves two overlapping tombstones. This is what this patch does.

This patch also contains a test for this case: we created in Cassandra
an sstable with two overlapping deletions, and verify that when we read
it into Scylla, we get these two overlapping deletions - despite the
sstable file actually having contained three non-overlapping tombstones.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <b7c07466074bf0db6457323af8622bb5210bb86a.1459399004.git.glauber@scylladb.com>
(cherry picked from commit 99ecda3c96)
2016-03-31 12:58:07 +03:00
Nadav Har'El
507e6ec75a sstables: merge range tombstones if possible
This is a rewrite of Glauber's earlier patch to do the same thing, taking
into account Avi's comments (do not use a class, do not throw from the
constructor, etc.). I also verified that the actual use case which was
broken in #1136 was fixed by this patch.

Currently, we have no support for range tombstones because CQL will not
generate them as of version 2.x. Thrift will, but we can safely leave this for
the future.

However, we have seen cases during a real migration in which a pure-CQL
Cassandra would generate range tombstones in its SSTables.

Although we are not sure how and why, those range tombstones were of a special
kind: their end and next's start range were adjacent, which means that in
reality, they could very well have been written as a single range tombstone for
an entire clustering key - which we support just fine.

This code will attempt to fix this problem temporarily by merging such ranges
if possible. Care must be taken so that we don't end up accepting a true
generic range tombstone by accident.

Fixes #1136

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1459333972-20345-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 0fc9a5ee4d)
2016-03-31 12:57:56 +03:00
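The merge described above can be sketched over sorted (start, end) pairs. This is a simplified sketch treating adjacency as exact equality of bounds; the real check inspects clustering prefixes and end-of-component markers:

```python
def merge_adjacent_tombstones(ranges):
    """Merge range tombstones whose end equals the next range's start.

    Cassandra splits overlapping row tombstones into non-overlapping
    ranges; when one range's end is adjacent to the next range's start,
    the pair is really a single row tombstone and can be re-merged.
    Non-adjacent ranges are kept as-is, so a truly generic range
    tombstone is never merged by accident."""
    merged = []
    for start, end in ranges:
        if merged and merged[-1][1] == start:
            last_start, _ = merged[-1]
            merged[-1] = (last_start, end)   # extend the previous range
        else:
            merged.append((start, end))
    return merged
```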
Glauber Costa
29d6952ddd sstables: fix exception printouts in check_marker
As Nadav noticed in his bug report, check_marker is creating its error messages
using characters instead of numbers - which is what we intended here in the
first place.

That happens because sprint(), when faced with an 8-bit type, interprets it
as a character.  To avoid that we'll use uint16_t types, taking care not to
sign-extend them.

The bug also noted that one of the error messages is missing a parameter, and
that is also fixed.

Fixes #1122

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <74f825bbff8488ffeb1911e626db51eed88629b1.1459266115.git.glauber@scylladb.com>
(cherry picked from commit 23808ba184)
2016-03-31 12:56:02 +03:00
Pekka Enberg
9fae641099 release: prepare for 1.0.0 2016-03-30 12:19:12 +03:00
Pekka Enberg
ccd1fe4348 Revert "sanity check Seastar's I/O queue configuration"
This reverts commit 7b88ba8882, it's too
late for it.
2016-03-29 16:44:55 +03:00
Glauber Costa
7b88ba8882 sanity check Seastar's I/O queue configuration
While Seastar in general can accept any parameter for its I/O queues, Scylla
in particular shouldn't run with them disabled. That will be the case when
the max-io-requests parameter is not set.

On top of that, we would like to have enough depth per I/O queue to allow
for shard-local parallelism. Therefore, we will require a minimum per-queue
capacity of 4. On machines where the disk iodepth is not enough to allow 4
concurrent requests per shard, one should reduce the number of I/O queues.

For --max-io-requests, we will check the parameter itself. However, the
--num-io-queues parameter is not mandatory, and given enough concurrent
requests, Seastar's default configuration may well do the right thing. So
for that, we will check the final result for each I/O queue.

As is the case with other checks of this sort, this can be overridden by
the --developer-mode switch.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <63bf7e91ac10c95810351815bb8f5e94d75592a5.1458836000.git.glauber@scylladb.com>
(cherry picked from commit e750a94300)
2016-03-29 16:37:16 +03:00
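The check itself is simple arithmetic. A hypothetical sketch of the validation (function and constant names are illustrative, not Scylla's actual flags-handling code):

```python
MIN_PER_QUEUE_CAPACITY = 4  # minimum concurrent requests per I/O queue

def check_io_queues(max_io_requests, num_io_queues, developer_mode=False):
    """Return the per-queue capacity, or raise when it falls below the
    minimum and developer mode doesn't override the check."""
    per_queue = max_io_requests // num_io_queues
    if per_queue < MIN_PER_QUEUE_CAPACITY and not developer_mode:
        raise ValueError(
            "per-queue capacity %d < %d; reduce --num-io-queues"
            % (per_queue, MIN_PER_QUEUE_CAPACITY))
    return per_queue
```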
Pekka Enberg
46825a5e07 release: prepare for 1.0.rc3 2016-03-29 16:22:31 +03:00
Benoît Canet
740d98901f collectd: Write to the network to get rid of spurious log messages
Closes #1018

Suggested-by: Avi Kivity <avi@scylladb.com>
Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <1458759378-4935-1-git-send-email-benoit@scylladb.com>
(cherry picked from commit 4ac1126677)
2016-03-29 11:47:19 +03:00
Tomasz Grabiec
ceff8b9b41 schema_tables: Wait for notifications to be processed.
Listeners may defer since:

 93015bcc54 "migration_manager: Make the migration callbacks runs inside seastar thread"

Not all places were adjusted to wait for them. Fix that.

Message-Id: <1458837613-27616-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 53bbcf4a1e)
2016-03-29 11:18:32 +03:00
Gleb Natapov
1b2dbcc26e config: enable truncate_request_timeout_in_ms option
Option truncate_request_timeout_in_ms is used by truncate. Mark it as
used.

Message-Id: <20160323162649.GH2282@scylladb.com>
(cherry picked from commit 0afd1c6f0a)
2016-03-29 11:16:53 +03:00
Raphael Carvalho
75b2db7862 sstables: fix deletion of sstable with temporary TOC
After 4e52b41a4, remove_by_toc_name() became aware of temporary TOC
files; however, it doesn't consider that some components may be
missing if a temporary TOC is present.
When creating a new sstable, the first thing we do is write all
components into the temporary TOC, so the content of a temporary TOC
isn't reliable until it is renamed.

Solution is about implementing the following flow (described by Avi):
"Flow should be:

  - remove all components in parallel
  - forgive ENOENT, since the component may not have been written;
otherwise deletion error should be raised
  - fsync the directory
  - delete the temporary TOC
"

This problem can be reproduced by running compaction without disk
space, so compaction would fail and leave a partial sstable marked
for deletion. Afterwards, remove_by_toc_name() would try to delete
a component that doesn't exist because it looked at the content of
the temporary TOC.

Fixes #1095.

Signed-off-by: Raphael Carvalho <raphaelsc@scylladb.com>
Message-Id: <0cfcaacb43cc5bad3a8a7ea6c1fa6f325c5de97d.1459194263.git.raphaelsc@scylladb.com>
(cherry picked from commit d515a7fd85)
2016-03-29 10:56:49 +03:00
Tomasz Grabiec
789c1297dd storage_service: Fix typos
Message-Id: <1458837390-26634-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit d1db23e353)
2016-03-29 10:29:28 +03:00
Pekka Enberg
afeaaab034 Update scylla-ami submodule
* dist/ami/files/scylla-ami 89e7436...7019088 (1):
  > Re-enable clocksource=tsc on AMI
2016-03-29 09:59:34 +03:00
Takuya ASADA
80242ff443 dist: re-enable clocksource=tsc on AMI
The clocksource=tsc boot parameter was mistakenly dropped in b3c85aea89; it needs to be re-enabled.

[ penberg: Manual backport of commit 050fb911d5 to 1.0. ]
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1459180643-4389-1-git-send-email-syuu@scylladb.com>
2016-03-29 09:56:10 +03:00
Nadav Har'El
0b456578c0 sstable: fix read failure of certain sstables
We had a problem reading certain existing Cassandra sstables into
Scylla.

Our consume_range_tombstone() function assumes that the start and end
columns have certain "end of component" markers, and wants to verify
that assumption. But because of bugs in older versions of Cassandra,
see https://issues.apache.org/jira/browse/CASSANDRA-7593, sometimes the
"end of component" was missing (set to 0). CASSANDRA-7593 suggested
this problem might exist on the start column, so we allowed for that,
but we have now discovered a case where the end column is also set to
0 - causing the check in consume_range_tombstone() to fail, the sstable
read to fail, and Scylla to be unable to import that sstable from
Cassandra. Allowing for a 0 on the end column as well made it possible
to read that sstable, compact it, and so on.

Fixes #1125.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1459173964-23242-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit a05577ca41)
2016-03-28 17:10:10 +03:00
Pekka Enberg
3b5a55c6fc release: prepare for 1.0.rc2 2016-03-27 10:19:53 +03:00
Raphael Carvalho
4f1d37c3c9 Fix corner-case in refresh
Problem found by a dtest which loads sstables with generations 1 and 2 into an
empty column family. The root of the problem is that the reshuffle procedure
changes new sstables to start from generation 2 at least, so reshuffle could
try to set generation 1 to 2 when generation 2 already exists.
This can be fixed by starting from generation 1 instead, so reshuffle
handles this case properly.

Fixes #1099.

Signed-off-by: Raphael Carvalho <raphaelsc@scylladb.com>
Message-Id: <88c51fbda9557a506ad99395aeb0a91cd550ede4.1458917237.git.raphaelsc@scylladb.com>
(cherry picked from commit e6e5999282)
2016-03-27 10:04:28 +03:00
Avi Kivity
8422a42381 dist: ami: fix AMI_OPT receiving no value
We assign AMI=0 and AMI_OPT=1, so in the true case, AMI_OPT has no value,
and a later compare fails.

(cherry picked from commit 077c0d1022)
2016-03-26 21:17:49 +03:00
Takuya ASADA
c0f31fac48 dist/ami: use tilde for release candidate builds
Sync with ubuntu package versioning rule

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458882718-29317-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 2582dbe4a0)
2016-03-26 16:51:24 +02:00
Calle Wilund
6fe88a663f database: Use disk-marking delete function in discard_sstables
Fixes #797

To make sure an inopportune crash after truncate does not leave
sstables on disk to be considered live, and thus resurrect data,
after a truncate, use delete function that renames the TOC file to
make sure we've marked sstables as dead on disk when we finish
this discard call.
Message-Id: <1458575440-505-2-git-send-email-calle@scylladb.com>

Rebase to 1.0:
Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-24 09:16:24 -04:00
Calle Wilund
5f76f3d445 sstables: Add delete func to rename TOC ensuring table is marked dead
Note: the "normal" remove_by_toc_name must now be prepared for, and check,
whether the TOC of the sstable has already been moved to a temp file when we
get to the juicy delete parts.
Message-Id: <1458575440-505-1-git-send-email-calle@scylladb.com>

For the rebase to 1.0:

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-24 09:05:03 -04:00
Asias He
6676d126aa streaming: Complete receive task after the flush
A STREAM_MUTATION_DONE message signals the receiver that the sender
has completed sending the stream mutations. When the receiver finds
it has zero tasks to send and zero tasks to receive, it finishes the
stream_session, and in turn finishes the stream_plan once all the
stream_sessions are finished. We should call receive_task_completed only
after the flush finishes, so that when the stream_plan is finished all the
data is on disk.

Fixes the repair_disjoint_data_test issue with Glauber's "[PATCH v4 0/9] Make
sure repairs do not cripple incoming load" series

======================================================================
FAIL: repair_disjoint_data_test
(repair_additional_test.RepairAdditionalTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "scylla-dtest/repair_additional_test.py",
line 102, in repair_disjoint_data_test
    self.check_rows_on_node(node1, 3000)
  File "scylla-dtest/repair_additional_test.py",
line 33, in check_rows_on_node
    self.assertEqual(len(result), rows, len(result))
AssertionError: 2461

(cherry picked from commit c2eff7e824)
2016-03-24 10:26:00 +02:00
Glauber Costa
38343ccbfe repair: rework repair code so we can limit parallelism
The repair code as it is right now is a bit convoluted: it resorts to detached
continuations + do_for_each when calling sync_ranges, and deals with the
problem of excessive parallelism by employing a semaphore inside that range.

Even so, we still generate a great number of checksum requests, because the
ranges themselves are processed in parallel.

It would be better to have a single semaphore to limit the overall parallelism
for all requests.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit f49e965d78)
2016-03-24 10:26:00 +02:00
Glauber Costa
f1272933fd database: keep streaming memtables in their own region group
Theoretically, because we can have a lot of pending streaming memtables, the
database can start throttling and incoming connections can slow down during
streaming.

It turns out this is actually a very easy condition to trigger, basically
because the other side of the wire in this case is quite efficient in sending
us work. This situation is alleviated a bit by reducing parallelism, but not
only does it not go away completely, once we have the tools to start increasing
parallelism again it will become commonplace.

The solution for this is to limit the streaming memtables to a fraction of the
total allowed dirty memory. Using the nesting capability built into the LSA
regions, we make the streaming region group a child of the main region
group.  With that, we can throttle streaming requests separately, while at the
same time being able to control the total amount of dirty memory as well.

Because of this property, it can still be the case that incoming requests will
throttle earlier due to streaming - unless we allow for more dirty memory to be
used during repairs - but at least that effect will be limited.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 34a9fc106f)
2016-03-24 10:26:00 +02:00
Glauber Costa
ccd623aa87 streaming memtables: coalesce incoming writes
The repair process will potentially send ranges containing few mutations,
definitely not enough to fill a memtable. It wants to know whether each of
those ranges individually succeeded or failed, so we need a future for each.

Small memtables being flushed are bad, and we would like to write bigger
memtables so we can better utilize our disks.

One way to fix that is to change the repair itself to send more mutations in
a single batch. But relying on that is a bad idea for two reasons:

First, the goals of the SSTable writer and the repair sender are at odds. The
SSTable writer wants to write as few SSTables as possible, while the repair
sender wants to break down the range into pieces as small as it can and
checksum them individually, so it doesn't have to send a lot of mutations for
no reason.

Second, even if the repair process wants to process larger ranges at once, some
ranges themselves may be small. So while most ranges would be large, we would
still have potentially some fairly small SSTables lying around.

The best course of action in this case is to coalesce the incoming streams on
the write side.  Repair can now choose whatever strategy it wants - small or
big ranges - resting assured that the incoming memtables will be coalesced
together.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 455d5a57d2)
2016-03-24 10:26:00 +02:00
Glauber Costa
8176fa8379 streaming: add incoming streaming mutations to a different sstable
Keeping the mutations coming from the streaming process as mutations like any
other has a number of advantages - and that's why we do it.

However, this makes it impossible for Seastar's I/O scheduler to differentiate
between incoming requests from clients and those arriving from peers in the
streaming process.

As a result, if the streaming mutations consume a significant fraction of the
total mutations, and we happen to be using the disk at its limits, we are in no
position to provide any guarantees - defeating the whole purpose of the
scheduler.

To implement that, we'll keep a separate set of memtables that will contain
only streaming mutations. We don't have to do it this way, but doing so
makes life a lot easier. In particular, to write an SSTable, our API requires
(because the filter requires it) that a good estimate of the number of
partitions be provided in advance. The partitions also need to be sorted.

We could write mutations directly to disk, but the above conditions couldn't be
met without significant effort. In particular, because mutations can be
arriving from multiple peer nodes, we can't really sort them without keeping a
staging area anyway.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 5fa866223d)
2016-03-24 10:26:00 +02:00
Glauber Costa
d03910f46d priority manager: separate streaming reads from writes
Streaming currently has one class, which contains the read operations
generated by the streaming process. Those reads come from two
places:

- checksums (if doing repair)
- reading mutations to be sent over the wire.

Depending on the amount of data we're dealing with, that can generate a
significant chunk of data, with seconds' worth of backlog, and if we need to
have the incoming writes intertwined with those reads, they can take a long
time.

Even if a node is only acting as a receiver, it may still read a lot - if
we're talking about repairs, those reads are coming from the checksums.

However, in more complicated failure scenarios, it is not hard to imagine a
node that will be both sending and receiving a lot of data.

The best way to guarantee progress on both fronts, is to put both kinds of
operations into different classes.

This patch introduces a new write class, and renames the old read class so it
can have a more meaningful name.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 10c8ca6ace)
2016-03-24 10:26:00 +02:00
Glauber Costa
0c75700d8c database: make seal_on_overflow a method of the memtable_list
Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 78189de57f)
2016-03-24 10:26:00 +02:00
Glauber Costa
478975b3fa database: move add_memtable as a method of the memtable_list
The column family still has to teach the memtable list how to allocate a new memtable,
since it uses CF parameters to do so.

After that, the memtable_list's constructor takes a seal and a create function, and is complete.
The copy constructor can now go, since there are no users left.
The behavior of keeping a reference to the underlying memtables can also go, since we can now
guarantee that nobody is keeping references to the list (it is not even a shared pointer anymore).
Individual memtables still are, and users may keep references to them individually.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 635bb942b2)
2016-03-24 10:26:00 +02:00
Glauber Costa
5ce76258c8 database: move active_memtable to memtable_list
Each list can have a different active memtable. The column family method keeps
existing, since the two separate sets of memtables are just an implementation
detail to deal with the problem of streaming QoS: *the* active memtable keeps
being the one from the main list.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 6ba95d450f)
2016-03-24 10:26:00 +02:00
Glauber Costa
4cf8791d56 database: create a class for memtable_list
memtable_list is currently just an alias for a vector of memtables.  Let's
move it to a class of its own, exporting the relevant methods to keep user
code unchanged as much as possible.

This will help us keep separate lists of memtables.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit af6c7a5192)
2016-03-24 10:26:00 +02:00
Pekka Enberg
ccd51010f1 Merge seastar upstream
* seastar 9f2b868...aa281bd (7):
  > shared_promise: Add move assignment operator
  > lowres_clock: Fix stretched time
  > scripts: Delete tap with ip instead of tunctl
  > vla: Actually be exception-safe
  > vla: Ensure memory is freed if ctor throws
  > vla: Ensure memory is correctly freed
  > net: Improve error message when parsing invalid ipv4 address
2016-03-24 10:25:42 +02:00
Shlomi Livne
8e78cbfc2d fix a collision between --ami command line param and env
The scylla-server sysconfig includes an AMI variable, and the script also used
an AMI variable; fix this by renaming the script variable.

6a18634f9f introduced this issue, since it
started importing the scylla-server sysconfig.

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <0bc472bb885db2f43702907e3e40d871f1385972.1458767984.git.shlomi@scylladb.com>
(cherry picked from commit d3a91e737b)
2016-03-24 08:18:45 +02:00
Shlomi Livne
c6c176b1be scylla_io_setup import scylla-server env args
scylla_io_setup requires the scylla-server env to be set up to run
correctly. Previously, scylla_io_setup was encapsulated in
scylla-io.service, which assured this.

Extract CPUSET and SMP from SCYLLA_ARGS, as CPUSET is needed for invoking
io_tune.

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <d49af9cb54ae327c38e451ff76fe0322e64a5f00.1458747527.git.shlomi@scylladb.com>
(cherry picked from commit 6a18634f9f)
2016-03-23 17:55:33 +02:00
Shlomi Livne
9795edbe04 dist/ami: Use the actual number of disks instead of AWS meta service
We have seen in some cases that, when using the boto api to start
instances, the aws metadata service
http://169.254.169.254/latest/meta-data/block-device-mapping/ returns an
incorrect number of disks - work around that by checking the actual
number of disks using lsblk.

Also add a validation at the end verifying that, after all computations, the
NR_IO_QUEUES will not be greater than the number of shards (we had an
issue with i2.8x).

Fixes: #1062

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <54c51cd94dd30577a3fe23aef3ce916c01e05504.1458721659.git.shlomi@scylladb.com>
(cherry picked from commit 4ecc37111f)
2016-03-23 11:22:25 +02:00
Shlomi Livne
1539c8b136 fix centos local ami creation (revert some changes)
On CentOS we do not have a version file created - revert the changes
introduced when adding Ubuntu AMI creation.

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <69c80dcfa7afe4f5db66dde2893d9253a86ac430.1458578004.git.shlomi@scylladb.com>
(cherry picked from commit b7e338275b)
2016-03-23 11:22:25 +02:00
Takuya ASADA
0396a94eaf dist: allow more requests for i2 instances
i2 instances have better performance than others, so allow more requests.
Fixes #921

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458251067-1533-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 769204d41e)
2016-03-23 11:22:25 +02:00
Raphael Carvalho
3c40c1be71 service: fix refresh
Vlad and I were working on finding the root of the problems with
refresh. We found that refresh was deleting existing sstable files
because of a bug in a function that was supposed to return the maximum
generation of a column family.
The intention of this function is to get the generation from the last element
of column_family::_sstables, which is of type std::map.
However, we were incorrectly using std::map::end() to get the last element,
so garbage was being read instead of the maximum generation.
If the garbage value is lower than the minimum generation of a column
family, then reshuffle_sstables() would set the generation of all existing
sstables to a lower value. That would confuse our mechanism used to
delete sstables, because sstables loaded at boot stage were touched.
The solution to this problem is to use rbegin() instead of end() to
get the last element from column_family::_sstables.

The other problem is that refresh will only load generations that are
larger than or equal to X, so new sstables with a lower generation will
not be loaded. The solution is to create a set with the generations of
live SSTables from all shards, and use this set to determine whether
a generation is new or not.

The last change provides an unused generation to the reshuffle
procedure by adding one to the maximum generation. That's important to
prevent reshuffle from touching an existing SSTable.

Tested 'refresh' under the following scenarios:
1) Existing generations: 1, 2, 3, 4. New ones: 5, 6.
2) Existing generations: 3, 4, 5, 6. New ones: 1, 2.
3) Existing generations: 1, 2, 3, 4. New ones: 7, 8.
4) No existing generation. No new generation.
5) No existing generation. New ones: 1, 2.
I also had to adapt existing testcase for reshuffle procedure.

Fixes #1073.

Signed-off-by: Raphael Carvalho <raphaelsc@scylladb.com>
Message-Id: <1c7b8b7f94163d5cd00d90247598dd7d26442e70.1458694985.git.raphaelsc@scylladb.com>
(cherry picked from commit 370b1336fe)
2016-03-23 11:22:25 +02:00
Benoît Canet
de969a5d6f dist/ubuntu: Fix the init script variable sourcing
The variable sourcing was crashing the init script on ubuntu.
Fix it with the suggestion from Avi.

Signed-off-by: Benoît Canet <benoit@scylladb.com>
Message-Id: <1458685099-1160-1-git-send-email-benoit@scylladb.com>
(cherry picked from commit 1594bdd5bb)
2016-03-23 11:22:25 +02:00
Takuya ASADA
0ade2894f7 dist: stop using '-p' option on lsblk since Ubuntu doesn't support it
In scylla_setup's interactive mode we use lsblk to list candidate block
devices for RAID, and the -p option prints full device paths.

Since the Ubuntu 14.04 LTS version of lsblk doesn't support this option, we
need to use non-full path names and complete the paths before passing them to
scylla_raid_setup.

Fixes #1030

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458325411-9870-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6edd909b00)
2016-03-23 09:16:04 +02:00
Takuya ASADA
6b36315040 dist: allow running 'sudo scylla_ami_setup' for Ubuntu AMI
Allow running scylla_ami_setup from scylla-server.conf.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit a6cd085c38)
2016-03-23 09:14:49 +02:00
Takuya ASADA
edc5f8f2f7 dist: launch scylla_ami_setup on Ubuntu AMI
Since upstart does not have the same behavior as systemd, we need to run scylla_io_setup and scylla_ami_setup in scylla-server.conf's pre-start stanza.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 7828023599)
2016-03-23 09:14:49 +02:00
Takuya ASADA
066149ad46 dist: fix broken scylla_install_pkg --local-pkg and --unstable on Ubuntu
The --local-pkg and --unstable arguments weren't handled on Ubuntu; support them.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 93bf7bff8e)
2016-03-23 09:14:49 +02:00
Takuya ASADA
1f07468195 dist: prevent apt-get from showing a dialog in scylla_raid_setup
"apt-get -y install mdadm" shows a dialog to select the install mode of postfix; this would block scylla-ami-setup.service forever, since it runs as a background task, so we need to prevent it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit 0c83b34d0c)
2016-03-23 09:14:49 +02:00
Takuya ASADA
0577ae5a61 dist: Ubuntu based AMI support
This introduces an Ubuntu AMI.
Both the CentOS AMI and the Ubuntu AMI need to be built on the same distribution, so the build_ami.sh script automatically detects the current distribution and selects the base AMI image.

Fixes #998

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
(cherry picked from commit b097ed6d75)
2016-03-23 09:14:49 +02:00
Pekka Enberg
054cf13cd0 Update scylla-ami submodule
* dist/ami/files/scylla-ami 84bcd0d...89e7436 (3):
  > Merge "iotune packaging fix for scylla-ami" from Takuya
  > Ubuntu AMI support on scylla_install_ami
  > scylla_ami_setup is not POSIX sh compatible, change shebang to /bin/bash
2016-03-23 09:07:07 +02:00
Takuya ASADA
71446edc97 dist: on scylla_io_setup, SMP and CPUSET should be empty when the parameter is not present
Fixes #1060

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458659928-2050-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit dac2bc3055)
2016-03-23 09:06:00 +02:00
Takuya ASADA
c1d8a62b5b dist: remove scylla-io-setup.service and make it standalone script
(cherry picked from commit 9889712d43)
2016-03-23 09:06:00 +02:00
Takuya ASADA
a3baef6b45 dist: on scylla_io_setup print out message both for stdout and syslog
(cherry picked from commit 2cedab07f2)
2016-03-23 09:06:00 +02:00
Takuya ASADA
feaba177e2 dist: introduce dev-mode.conf and scylla_dev_mode_setup
(cherry picked from commit 83112551bb)
2016-03-23 09:06:00 +02:00
Tomasz Grabiec
83a289bdcd cql3: batch_statement: Execute statements sequentially
Currently we execute all statements in parallel, but some statements
depend on order, in particular list append/prepend. Fix by executing
sequentially.

Fixes cql_additional_tests.py:TestCQL.batch_and_list_test dtest.

Fixes #1075.

Message-Id: <1458672874-4749-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 5f44afa311)
2016-03-22 21:06:21 +02:00
Tomasz Grabiec
382e7e63b3 Fix assertion in row_cache_alloc_stress
Fixes the following assertion failure:

  row_cache_alloc_stress: tests/row_cache_alloc_stress.cc:120: main(int, char**)::<lambda()>::<lambda()>: Assertion `mt->occupancy().used_space() < memory::stats().free_memory()' failed.

memory::stats().free_memory() may be much lower than the actual
amount of reclaimable memory in the system, since LSA zones will try to
keep a lot of free segments to themselves. Fix by using the actual amount
of reclaimable memory in the check.

(cherry picked from commit a4e3adfbec)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
deeed904f4 logalloc: Introduce tracker::occupancy()
Returns occupancy information for all memory allocated by LSA, including
segment pools / zones.

(cherry picked from commit a0cba3c86f)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
d927053b3b logalloc: Rename tracker::occupancy() to region_occupancy()
(cherry picked from commit 529c8b8858)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
8b8923b5af managed_bytes: Make operator[] work for large blobs as well
Fixes assertion in mutation_test:

mutation_test: ./utils/managed_bytes.hh:349: blob_storage::char_type* managed_bytes::data(): Assertion `!_u.ptr->next'

Introduced in ea7c2dd085

Message-Id: <1458648786-9127-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit ca08db504b)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
48ec129595 perf_simple_query: Make duration configurable
(cherry picked from commit 6e73c3f3dc)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
a4757a6737 mutation_test: Add allocation failure stress test for apply()
The test injects allocation failures at every allocation site during
apply(). Only allocations through allocation_strategy are instrumented,
but currently those should include all allocations in the apply() path.

The target and source mutations are randomized.

(cherry picked from commit 2fbb55929d)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
223b73849d mutation_test: Add more apply() tests
(cherry picked from commit 8ede27f9c6)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
ba4b1eac45 mutation_test: Hoist make_blob() to a function
(cherry picked from commit 36575d9f01)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
9cf5fabfdf mutation_test: Make make_blob() return different blob each time
random_bytes was constructed with the same seed each time.

(cherry picked from commit 4c85d06df7)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
5723c664ad mutation_test: Fix use-after-free
The problem was that verify_row() was returning a future which was not
waited on. Fix by running the code in a thread.

(cherry picked from commit 19b3df9f0f)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
9635a83edd mutation_partition: Fix friend declarations
Missing "class" confuses CLion IDE.

(cherry picked from commit a7966e9b71)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
24c68e48a5 mutation_partition: Make apply() atomic even in case of exception
We cannot leave a partially applied mutation behind when the write
fails. It may fail if memory allocation fails in the middle of
apply(). That, for example, would violate write atomicity: readers
should either see the whole write or none of it.

This fix makes apply() revert partially applied data upon failure, by
means of the ReversiblyMergeable concept. In a nutshell, the idea is
to store the old state in the source mutation as we apply it, and swap it
back in case of an exception. At the cell level this swapping is
inexpensive, just rewiring pointers. For this to work, the source mutation
needs to be brought into mutable form, so frozen mutations need to be
unfrozen. In practice this doesn't increase the amount of cell allocations
in the memtable apply path, because incoming data will usually be newer and
we will have to copy it into LSA anyway. There are extra allocations,
though, for the data structures which hold cells.

I didn't see significant change in performance of:

  build/release/tests/perf/perf_simple_query -c1 -m1G --write --duration 13

The score fluctuates around ~77k ops/s.

Fixes #283.

(cherry picked from commit dc290f0af7)
2016-03-22 19:59:16 +02:00
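The store-and-swap-back idea described in the commit above can be sketched with ordinary containers; `apply_reversibly`, the `row` alias, and the injected `fail_after` failure are hypothetical illustrations, not Scylla's actual types (which do the swap by rewiring pointers inside the LSA):

```cpp
#include <cassert>
#include <map>
#include <new>
#include <string>
#include <utility>
#include <vector>

// In this sketch a row is a map from column id to value, and an empty
// string stands in for "cell did not exist before the merge".
using row = std::map<int, std::string>;

// Merge src into dst. The displaced old values are stashed back into src,
// so an exception mid-way can swap them back and leave dst untouched.
void apply_reversibly(row& dst, row& src, int fail_after = -1) {
    std::vector<int> applied;        // keys merged so far, in order
    int count = 0;
    try {
        for (auto& [key, value] : src) {
            std::swap(dst[key], value);  // dst takes the new value, src keeps the old
            applied.push_back(key);
            if (++count == fail_after) {
                throw std::bad_alloc();  // injected allocation failure
            }
        }
    } catch (...) {
        // Revert in reverse order: put the old values back into dst.
        for (auto it = applied.rbegin(); it != applied.rend(); ++it) {
            std::swap(dst[*it], src[*it]);
            if (dst[*it].empty()) {
                dst.erase(*it);      // the key did not exist before the merge
            }
        }
        throw;
    }
}
```

On success the write is fully applied; on failure `dst` is exactly what it was before the call, which is the atomicity property the commit describes.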
Tomasz Grabiec
80cb0a28e1 mutation_partition: Make intrusive sets ReversiblyMergeable
(cherry picked from commit e09d186c7c)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
95a9f66b75 mutation_partition: Make row_tombstones_entry ReversiblyMergeable
(cherry picked from commit f1a4feb1fc)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
58448d4b05 mutation_partition: Make rows_entry ReversiblyMergeable
(cherry picked from commit e4a576a90f)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
0a4d0e95f2 mutation_partition: Make row_marker ReversiblyMergeable
(cherry picked from commit aadcd75d89)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
2c73e1c2e8 mutation_partition: Make row ReversiblyMergeable
(cherry picked from commit ea7c2dd085)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
0ebd1ae62a atomic_cell_or_collection: Introduce as_atomic_cell_ref()
Needed for setting the REVERT flag on an existing cell.

(cherry picked from commit c9d4f5a49c)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
14f616de3f atomic_cell_hash: Specialize appending_hash<> for atomic_cell and collection_mutation
(cherry picked from commit 1ffe06165d)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
827c0f68c3 atomic_cell: Add REVERT flag
Needed to make atomic cells ReversiblyMergeable.

(cherry picked from commit bfc6413414)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
e3607a4c16 tombstone: Make ReversiblyMergeable
(cherry picked from commit 7fcfa97916)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
59270c6d00 Introduce the concept of ReversiblyMergeable
(cherry picked from commit 1407173186)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
3be5d3a7c9 mutation_partition: row: Add empty()
(cherry picked from commit 9fc7f8a5ed)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
cd6697b506 mutation_partition: row: Allow storing empty cells internally
Currently only the "set" storage could store empty cells, but not the
"vector" one, because there an empty cell has the meaning of being
missing. To implement rollback, we need to be able to distinguish empty
cells from missing ones. Solve this by making the vector storage use a
bitmap for presence checking instead of emptiness. This adds 4 bytes to
the vector storage.

(cherry picked from commit d5e66a5b0d)
2016-03-22 19:59:16 +02:00
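The empty-versus-missing distinction described above can be sketched with a hypothetical `vector_row` using a 32-bit presence bitmap (matching the 4 extra bytes mentioned in the commit; names are illustrative, not Scylla's actual types):

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Vector storage indexed by column id. Presence is tracked in a bitmap,
// so an explicitly-set empty cell is distinct from a missing one.
struct vector_row {
    std::vector<std::string> cells;
    uint32_t present = 0;            // 1 bit per column id, up to 32 columns here

    void set(size_t id, std::string value) {
        if (cells.size() <= id) {
            cells.resize(id + 1);
        }
        cells[id] = std::move(value);
        present |= uint32_t(1) << id;    // present even if the value is empty
    }
    bool has(size_t id) const { return (present >> id) & 1; }
    void erase(size_t id) { present &= ~(uint32_t(1) << id); }
};
```

With presence inferred from emptiness instead, `set(id, "")` would be indistinguishable from a cell that was never written, which is exactly what rollback needs to avoid.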
Tomasz Grabiec
acc9849e2b mutation_partition: Make row::merge() tolerate empty row
The row may be empty and still have a set storage, in which case
dereferencing rbegin() is undefined behavior.

(cherry picked from commit ed1e6515db)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
a445f6a7be managed_bytes: Mark move-assignment noexcept
(cherry picked from commit 184e2831e7)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
88ed9c53a6 managed_bytes: Make copy assignment exception-safe
(cherry picked from commit 92d4cfc3ab)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
50f98ff90a managed_bytes: Make linearization_context::forget() noexcept
It is needed for noexcept destruction, which we need for exception
safety in higher layers.

According to [1], erase() only throws if key comparison throws, and in
our case it doesn't.

[1] http://en.cppreference.com/w/cpp/container/unordered_map/erase

(cherry picked from commit 22d193ba9f)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
30ffb2917f mutation: Add copy assignment operator
We already have a copy constructor, so we can have copy assignment as
well.

(cherry picked from commit 87d7279267)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
6ef8b45bf4 mutation_partition: Add cell_entry constructor which makes an empty cell
(cherry picked from commit 8134992024)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
144829606a mutation_partition: Make row::vector_to_set() exception-safe
Currently allocation failure can leave the old row in a
half-moved-from state and leak cell_entry objects.

(cherry picked from commit 518e956736)
2016-03-22 19:59:16 +02:00
Tomasz Grabiec
2eb54bb068 mutation_partition: Unmark cell_entry's copy constructor as noexcept
It was a mistake; it certainly may throw, because it copies cells.

(cherry picked from commit c91eefa183)
2016-03-22 19:59:16 +02:00
Pekka Enberg
a133e48515 Merge seastar upstream
* seastar 6a207e1...9f2b868 (10):
  > memory: set free memory to non-zero value in debug mode
  > Merge "Increase IOTune's robustness by including a timeout" from Glauber
  > shared_future: add companion class, shared_promise
  > rpc: fix client connection stopping
  > semaphore: allow wait() and signal() after broken()
  > run reactor::stop() only once
  > sharded: fix start with reference parameter
  > core: add asserts to rwlock
  > util/defer: Fix cancel() not being respected
  > tcp: Do not return accept until the connection is connected
2016-03-22 15:49:51 +02:00
Asias He
5db0049d99 gossip: Sync gossip_digest.idl.hh and application_state.hh
We did the cleanup in idl/gossip_digest.idl.hh, but the patch to clean
up gms/application_state.hh was never merged.

To maintain compatibility with previous versions of scylla, we can not
change application_state.hh; instead, change the idl to be in sync with
application_state.hh.

Message-Id: <3a78b159d5cb60bc65b354d323d163ce8528b36d.1458557948.git.asias@scylladb.com>
(cherry picked from commit 39992dd559)
2016-03-22 15:22:12 +02:00
Takuya ASADA
ac80445bd9 dist: enable collectd on scylla_setup by default, to make scyllatop usable
Fixes #1037

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458324769-9152-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6b2a8a2f70)
2016-03-22 15:16:54 +02:00
Asias He
0c3ffba5c8 messaging_service: Take reference of ms in send_message_timeout_and_retry
Take a reference to the messaging_service object inside
send_message_timeout_and_retry to make sure it is not freed during the
lifetime of the send_message_timeout_and_retry operation.

(cherry picked from commit b8abd88841)
2016-03-22 13:20:47 +02:00
Gleb Natapov
7ca3d22c7d messaging: do not admit new requests during messaging service shutdown.
Sending a message may open a new client connection, which will never be
closed if the messaging service is already shutting down.

Fixes #1059

Message-Id: <1458639452-29388-3-git-send-email-gleb@scylladb.com>
(cherry picked from commit 1e6352e398)
2016-03-22 13:18:12 +02:00
Gleb Natapov
9b1d2dad89 messaging: do not delete client during messaging service shutdown
The messaging service's stop() method calls stop() on all clients. If
remove_rpc_client_one() is called while those stops are running,
client::stop() will be called twice, which is not supposed to happen. Fix it
by ignoring client remove requests during messaging service shutdown.

Fixes #1059

Message-Id: <1458639452-29388-2-git-send-email-gleb@scylladb.com>
(cherry picked from commit 357c91a076)
2016-03-22 13:18:05 +02:00
Pekka Enberg
7e6a7a6cb5 release: prepare for 1.0.rc1 2016-03-22 12:19:03 +02:00
Pekka Enberg
ec7f637384 dist/ubuntu: Use tilde for release candidate builds
The version number ordering rules are different for rpm and deb. Use
tilde ('~') for the latter to ensure a release candidate is ordered
_before_ a final version.

Message-Id: <1458627524-23030-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit ae33e9fe76)
2016-03-22 12:18:52 +02:00
Nadav Har'El
eecfb2e4ef sstable: fix use-after-free of temporary ioclass copy
Commit 6a3872b355 fixed some use-after-free
bugs but introduced a new one because of a typo:

Instead of capturing a reference to the long-living io-class object, as
all the code does, one place in the code accidentally captured a *copy*
of this object. This copy had a very temporary life, and when a reference
to that *copy* was passed to sstable reading code which assumed that it
lives at least as long as the read call, a use-after-free resulted.

Fixes #1072

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1458595629-9314-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 2eb0627665)
2016-03-22 08:08:49 +02:00
Pekka Enberg
1f6476351a build: Invoke Seastar build only once
Make sure we invoke the Seastar ninja build only once from our own build
process so that we don't have multiple ninjas racing with each other.

Refs #1061.

Message-Id: <1458563076-29502-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 4892a6ded9)
2016-03-22 08:08:02 +02:00
Pekka Enberg
0d95dd310a Revert "build: prepare for 1.0 release series"
This reverts commit 80d2b72068. It breaks
the RPM build which does not allow the "-" character to appear in
version numbers.
2016-03-22 08:03:22 +02:00
Avi Kivity
80d2b72068 build: prepare for 1.0 release series 2016-03-21 18:44:05 +02:00
Asias He
ac95f04ff9 gossip: Handle unknown application_state when printing
In case an unknown application_state is received, we should be able to
handle it when printing.

Message-Id: <98d2307359292e90c8925f38f67a74b69e45bebe.1458553057.git.asias@scylladb.com>
(cherry picked from commit 7acc9816d2)
2016-03-21 11:59:35 +02:00
Pekka Enberg
08a8a4a1b4 main: Defer API server hooks until commitlog replay
Defer registering services to the API server until commitlog has been
replayed to ensure that nobody is able to trigger sstable operations via
'nodetool' before we are ready for them.
Message-Id: <1458116227-4671-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 972fc6e014)
2016-03-18 09:20:49 +02:00
Pekka Enberg
b7e9924299 main: Fix broadcast_address and listen_address validation errors
Fix the validation error message to look like this:

  Scylla version 666.development-20160316.49af399 starting ...
  WARN  2016-03-17 12:24:15,137 [shard 0] config - Option partitioner is not (yet) used.
  WARN  2016-03-17 12:24:15,138 [shard 0] init - NOFILE rlimit too low (recommended setting 200000, minimum setting 10000; you may run out of file descriptors.
  ERROR 2016-03-17 12:24:15,138 [shard 0] init - Bad configuration: invalid 'listen_address': eth0: boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<boost::system::system_error> > (Invalid argument)
  Exiting on unhandled exception of type 'bad_configuration_error': std::exception

Instead of:

  Exiting on unhandled exception of type 'boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<boost::system::system_error> >': Invalid argument

Fixes #1051.

Message-Id: <1458210329-4488-1-git-send-email-penberg@scylladb.com>
(cherry picked from commit 69dacf9063)
2016-03-18 09:00:23 +02:00
Takuya ASADA
19ed269cc7 dist: follow sysconfig setting when counting number of cpus on scylla_io_setup
When NR_CPU >= 8, we disabled cpu0 for AMI on scylla_sysconfig_setup.
But scylla_io_setup doesn't know that and tries to assign NR_CPU queues, so scylla fails to start because queues > cpus.
With this fix, scylla_io_setup checks the sysconfig settings: if '--smp <n>' is specified in SCYLLA_ARGS, n is used to limit the queue count.
Also, when the instance type has no pre-configured parameters, we need to pass --cpuset to iotune. Otherwise iotune will run on a different set of CPUs, which may have different performance characteristics.

Fixes #996, #1043, #1046

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458221762-10595-2-git-send-email-syuu@scylladb.com>
(cherry picked from commit 4cc589872d)
2016-03-18 08:58:00 +02:00
Takuya ASADA
a223450a56 dist: On scylla_sysconfig_setup, don't disable cpu0 on non-AMI environments
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458221762-10595-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit 6f71173827)
2016-03-18 08:57:56 +02:00
Paweł Dziepak
8f4800b30e lsa: update _closed_occupancy after freeing all segments
_closed_occupancy will be used when a region is removed from its region
group, so make sure that it is accurate.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
(cherry picked from commit 338fd34770)
2016-03-18 08:11:31 +02:00
Pekka Enberg
7d13d115c6 dist: Fix '--developer-mode' parsing in scylla_io_setup
We need to support the following variations:

   --developer-mode true
   --developer-mode 1
   --developer-mode=true
   --developer-mode=1

Fixes #1026.
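The accepted spellings can be sketched in Python; the real fix lives in the scylla_io_setup shell script, and parse_developer_mode is an illustrative name, not the script's actual code:

```python
# Hypothetical sketch of the flag parsing the commit describes: accept both
# "--developer-mode <val>" and "--developer-mode=<val>", where <val> may be
# "true" or "1". The real fix is implemented in shell, not Python.
def parse_developer_mode(args):
    for i, arg in enumerate(args):
        if arg == "--developer-mode" and i + 1 < len(args):
            value = args[i + 1]          # space-separated form
        elif arg.startswith("--developer-mode="):
            value = arg.split("=", 1)[1]  # equals-sign form
        else:
            continue
        return value in ("true", "1")
    return False  # flag absent
```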
Message-Id: <1458203393-26658-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 0434bc3d33)
2016-03-17 11:00:14 +02:00
Glauber Costa
c9c52235a1 stream_session: print debug message for STREAM_MUTATION
For this verb(), we don't call get_session - and it doesn't look like we will.
We currently have no debug message for this one, which makes it harder to debug
the stream of messages. Print it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit a3ebf640c6)
2016-03-17 08:18:54 +02:00
Glauber Costa
52eeab089c stream_session: remove duplicated debug message
Whenever we call get_session, that will print a debug message about the arrival
of this new verb. Because we also print that explicitly in PREPARE_DONE, that
message gets duplicated.

That confuses poor developers who are, for a while, left wondering why it is
that the sender is sending the message twice.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 0ab4275893)
2016-03-17 08:18:49 +02:00
Glauber Costa
49af399a2e sstables: do not assume mutation_reader will be kept alive
Our sstables::mutation_reader has a specialization in which start and end
ranges are passed as futures. That is needed because we may have to read the
index file for those.

This works well under the assumption that every time a mutation_reader will be
created it will be used, since whoever is using it will surely keep the state
of the reader alive.

However, that assumption has not been true for a while. We use a reader
interface for reading everything from mutations and sstables to cache entries,
and when we create an sstable mutation_reader, that does not mean we'll use it.
In fact we won't, if the read can be serviced first by a higher level entity.

If that happens to be the case, the reader will be destructed. However, since
it may take more time than that for the start and end futures to resolve, by
the time they are resolved the state of the mutation reader will no longer be
valid.

The proposed fix for that is to only resolve the future inside
mutation_reader's read() function. If that function is called,  we can have a
reasonable expectation that the caller object is being kept alive.

A second way to fix this would be to force the mutation reader to be kept alive
by transforming it into a shared pointer and acquiring a reference to itself.
However, because the reader may turn out not to be used, the delayed read
actually has the advantage of not even reading anything from the disk if there
is no need for it.

Also, because sstables can be compacted, we can't guarantee that the sst object
itself, used in the resolution of start and end, stays alive, and that has the
same problem. If we delay the calling of those, we will also solve a similar
problem. We assume here that the outer reader is keeping the SSTable object
alive.

I must note that I have not reproduced this problem. What goes above is the
result of the analysis we have made in #1036. That being the case, a thorough
review is appreciated.

Fixes #1036

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <a7e4e722f76774d0b1f263d86c973061fb7fe2f2.1458135770.git.glauber@scylladb.com>
(cherry picked from commit 6a3872b355)
2016-03-16 19:41:06 +02:00
Nadav Har'El
d915370e3f Allow uncompression at end of file
Asking to read from byte 100 when a file has 50 bytes is an obvious error.
But what if we ask to read from byte 50? What if we ask to read 0 bytes at
byte 50? :-)

Before this patch, code which asked to read from the EOF position would
get an exception. After this patch, it would simply read nothing, without
error. This allows, for example, reading 0 bytes from position 0 on a file
with 0 bytes, which apparently happened in issue #1039...

A read which starts at a position higher than the EOF position still
generates an exception.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1458137867-10998-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 02ba8ffbe8)
2016-03-16 19:40:59 +02:00
Nadav Har'El
a6d5e67923 Fix out-of-range exception when uncompressing 0 bytes
The uncompression code reads the compressed chunks containing the bytes
pos through pos + len - 1. This, however, is not correct when len==0,
and pos + len - 1 may even be -1, causing an out-of-range exception when
calling locate() to find the chunks containing this byte position.

So we need to treat len==0 specially, and in this case we don't read
anything, and don't need to locate() the chunks to read.

Refs #1039.
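A minimal Python sketch of the boundary rules described here and in the previous commit: reads starting past EOF fail, while zero-length reads (including at the EOF position) touch no chunks. chunks_for_range and its parameters are hypothetical, not Scylla's actual API:

```python
# Illustrative sketch: which compressed chunks cover bytes [pos, pos+length-1].
def chunks_for_range(pos, length, chunk_size, file_len):
    if pos > file_len:
        raise ValueError("read starts past EOF")
    if length == 0 or pos == file_len:
        # Special-case len == 0: computing pos + length - 1 would otherwise
        # ask locate() for byte -1, an out-of-range position.
        return []
    last = min(pos + length, file_len) - 1
    return list(range(pos // chunk_size, last // chunk_size + 1))
```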

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1458135987-10200-1-git-send-email-nyh@scylladb.com>
(cherry picked from commit 73297c7872)
2016-03-16 15:55:12 +02:00
Takuya ASADA
f885750f90 dist: do not auto-start scylla-server job on Ubuntu package install time
Fixes #1017

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1458122424-22889-1-git-send-email-syuu@scylladb.com>
(cherry picked from commit f1d18e9980)
2016-03-16 13:55:30 +02:00
Pekka Enberg
36f55e409d tests/gossip_test: Fix messaging service stop
This fixes gossip test shutdown similar to what commit 13ce48e ("tests:
Fix stop of storage_service in cql_test_env") did for CQL tests:

  gossip_test: /home/penberg/scylla/seastar/core/sharded.hh:439: Service& seastar::sharded<Service>::local() [with Service = net::messaging_service]: Assertion `local_is_initialized()' failed.
  Running 1 test case...

  [snip]

  unknown location(0): fatal error in "test_boot_shutdown": signal: SIGABRT (application abort requested)
  seastar/tests/test-utils.cc(32): last checkpoint
Message-Id: <1458126520-20025-1-git-send-email-penberg@scylladb.com>

(cherry picked from commit 2f519b9b34)
2016-03-16 13:15:39 +02:00
Asias He
c436fb5892 streaming: Handle cf is deleted after the deletion check
The cf can be deleted after the cf deletion check. Handle this case as
well.

Use "warn" level to log if the cf is missing. Although we can handle the
case, it is good to distinguish whether the receiver of streaming
applied all the stream mutations or not. We believe that the cf is
missing because it was dropped, but it could be missing because of a bug
or something we didn't anticipate here.

Related patch: "streaming: Handle cf is deleted when sending
STREAM_MUTATION_DONE"

Fixes simple_add_new_node_while_schema_changes_test failure.
Message-Id: <c4497e0500f50e0a3422efb37e73130765c88c57.1458090598.git.asias@scylladb.com>

(cherry picked from commit 2d50c71ca3)
2016-03-16 11:47:03 +02:00
Asias He
950bcd3e38 tests: Fix stop of storage_service in cql_test_env
In stop() of storage_service, it unregisters the verb handler. In the
test, we stop messaging_service before storage_service. Fix it by
deferring stop of messaging_service.
Message-Id: <c71f7b5b46e475efe2fac4c1588460406f890176.1458086329.git.asias@scylladb.com>

(cherry picked from commit 13ce48e775)
2016-03-16 11:36:36 +02:00
Asias He
83ffae1568 storage_service: Drop block_until_update_pending_ranges_finished
It is a legacy API from c*. Since we can wait for update_pending_ranges
to complete directly, there is no need to call
block_until_update_pending_ranges_finished to do so.

Also, change do_update_pending_ranges to be private.

Message-Id: <ac79b2879ec08fdcd3b2278ff68962cc71492f12.1458040608.git.asias@scylladb.com>
2016-03-15 15:18:45 +02:00
Avi Kivity
cc3e49e16f Merge seastar upstream
* seastar 0739576...6a207e1 (3):
  > file: allow custom file_impl implementations
  > Dockerfile update
  > tcp: Fix a typo in input_handle_other_state
2016-03-15 15:06:35 +02:00
Gleb Natapov
c6157dd99e enable rpc_keepalive parameter
Fixes #1044

Message-Id: <20160315104609.GV6117@scylladb.com>
2016-03-15 12:51:12 +02:00
Paweł Dziepak
9f3893980a move SCHEMA_CHECK registration to migration_manager
The verb is just for reporting and debugging purposes, but it is better
not to register it until it can return a meaningful value. Besides, it
really belongs to the migration manager subsystem anyway.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1458037053-14836-1-git-send-email-pdziepak@scylladb.com>
2016-03-15 12:24:37 +02:00
Asias He
d79dbfd4e8 main: Defer initialization of streaming
Streaming is used by bootstrap and repair. Streaming uses the storage_proxy
class to apply the frozen_mutation and the db/column_family class to
invalidate the row cache. Defer the initialization to just before repair and
bootstrap init.
Message-Id: <8e99cf443239dd8e17e6b6284dab171f7a12365c.1458034320.git.asias@scylladb.com>
2016-03-15 11:56:34 +02:00
Pekka Enberg
eb13f65949 main: Defer REPAIR_CHECKSUM_RANGE RPC verb registration after commitlog replay
Register the REPAIR_CHECKSUM_RANGE messaging service verb handler after
we have replayed the commitlog to avoid responding with bogus checksums.
Message-Id: <1458027934-8546-1-git-send-email-penberg@scylladb.com>
2016-03-15 11:56:18 +02:00
Pekka Enberg
917ed4adbe Merge "verb init/handler for gossip and storage_service" from Asias
"- ignore ack2 msg if gossip is not enabled
 - move REPLICATION_FINISHED to where it belongs to
 - add comments for gossip runtime dependency"
2016-03-15 11:12:10 +02:00
Avi Kivity
ad26e81444 Merge "Update pending ranges when ks is changed" from Asias
"At the moment, the migration_listener callbacks return void, so it is
impossible to wait for the callbacks to complete. Make the callbacks run
inside a seastar thread, so if we need to wait for a callback, we can make it
call foo_operation().get() in the callback. That is easier than making the
callbacks return future<>.

Fixes #1000."
2016-03-15 10:50:07 +02:00
Asias He
883d8cb8fd storage_service: Move REPLICATION_FINISHED verb to storage_service
It belongs to storage_service not storage_proxy.
2016-03-15 16:13:22 +08:00
Asias He
fb4d292d5c storage_service: Drop unused debug code 2016-03-15 16:13:21 +08:00
Asias He
16af12ca47 gossip: Add comments on external runtime dependency needed by gossip 2016-03-15 16:13:13 +08:00
Asias He
1034dd0aff gossip: Ignore ack2 message if gossip is not enabled yet 2016-03-15 16:09:43 +08:00
Asias He
1bf0412e7a gossip: Introduce handle_shutdown_msg helper 2016-03-15 16:09:43 +08:00
Asias He
54d8ac16b5 gossip: Introduce handle_echo_msg helper 2016-03-15 16:09:42 +08:00
Asias He
1f64f4bfcb gossip: Introduce handle_ack2_msg helper 2016-03-15 16:09:42 +08:00
Asias He
d63281b256 storage_service: Update pending ranges when keyspace is changed
If a keyspace is created after we calculate the pending ranges during
bootstrap, we will ignore that keyspace in the pending ranges when handling
write requests for it, which will cause data loss if rf = 1.

Fixes #1000
2016-03-15 15:41:23 +08:00
Asias He
93015bcc54 migration_manager: Make the migration callbacks runs inside seastar thread
At the moment, the callbacks return void, so it is impossible to wait for
the callbacks to complete. Make the callbacks run inside a seastar
thread, so if we need to wait for a callback, we can make it call
foo_operation().get() in the callback. That is easier than making the
callbacks return future<>.
2016-03-15 15:41:23 +08:00
Gleb Natapov
5076f4878b main: Defer storage proxy RPC verb registration after commitlog replay
Message-Id: <20160315071229.GM6117@scylladb.com>
2016-03-15 09:18:12 +02:00
Gleb Natapov
e228ef1bd9 messaging: enable keepalive tcp option for inter-node communication
Some network equipment that does TCP session tracking tends to drop TCP
sessions after a period of inactivity. Use the keepalive mechanism to
prevent this from happening for our inter-node communication.

Message-Id: <20160314173344.GI31837@scylladb.com>
2016-03-14 19:39:39 +02:00
Avi Kivity
7ae2298081 Merge seastar upstream
* seastar 88cc232...0739576 (4):
  > rpc: allow configuring keepalive for rpc client
  > net: add keepalive configuration to socket interface
  > iotune: refuse to run if there is not enough space available
  > rpc: make client connection error more clear
2016-03-14 19:38:54 +02:00
Pekka Enberg
1429213b4c main: Defer migration manager RPC verb registration after commitlog replay
Defer registering migration manager RPC verbs after commitlog has
been replayed so that our own schema is fully loaded before other
nodes start querying it or sending schema updates.
Message-Id: <1457971028-7325-1-git-send-email-penberg@scylladb.com>
2016-03-14 18:03:16 +01:00
Pekka Enberg
16f947dcb3 message/messaging_service: Remove init_messaging_service() declaration
The function no longer exists so drop the function declaration.
Message-Id: <1457694134-25600-1-git-send-email-penberg@scylladb.com>
2016-03-14 13:54:53 +02:00
Vlad Zolotarov
ce47fcb1ba sstables: properly account removal requests
The same shard may more than once create an sstables::sstable object for the
same SStable that doesn't belong to it, and mark it
for deletion (e.g. in a 'nodetool refresh' flow).

In that case the destructor of sstables::sstable accounted
deletion requests from the same shard more than once, since it was a simple
counter incremented on each deletion request, while all requests from the
same shard should count as a single request. This matters because
the removal logic waited for all shards to agree on the removal of a specific
SStable by comparing the counter mentioned above to the total
number of shards; once they were equal, the SStable files were actually removed.

This patch fixes this by replacing the counter with an std::unordered_set<unsigned>
that stores the ids of the shards requesting the deletion
of the sstable object, and compares the size() of this set
to smp::count in order to decide whether to actually delete the corresponding
SStable files.

Fixes #1004
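The accounting change can be sketched in Python; the class and the smp_count parameter are illustrative stand-ins for the C++ code and seastar's smp::count:

```python
# Sketch: a per-sstable set of shard ids replaces a plain counter, so
# repeated deletion requests from the same shard count only once.
class SstableDeletionTracker:
    def __init__(self, smp_count):
        self._smp_count = smp_count
        self._requesting_shards = set()  # was: a simple counter

    def mark_for_deletion(self, shard_id):
        self._requesting_shards.add(shard_id)  # idempotent per shard

    def all_shards_agree(self):
        return len(self._requesting_shards) == self._smp_count
```

With the old counter, two requests from shard 0 on a two-shard system would wrongly look like full agreement and trigger file removal.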

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1457886812-32345-1-git-send-email-vladz@cloudius-systems.com>
2016-03-14 11:45:08 +02:00
Raphael S. Carvalho
1ff7d32272 sstables: make write_simple() safer by using exclusive flag
We should guarantee that write_simple() will not try to overwrite
an existing file.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <194bd055f1f2dc1bb9766a67225ec38c88e7b005.1457818073.git.raphaelsc@scylladb.com>
2016-03-14 11:45:00 +02:00
Raphael S. Carvalho
0af786f3ea sstables: fix race condition when writing to the same sstable in parallel
When we are about to write a new sstable, we check if the sstable exists
by checking if the respective TOC exists. That check was added to handle a
possible attempt to write a new sstable with a generation already in use.
Gleb was worried that a TOC could appear after the check, and that's indeed
possible if there is an ongoing sstable write that uses the same generation
(running in parallel).
If the TOC appears after the check, we would again clobber an existing sstable
with a temporary one, and the user wouldn't be able to boot scylla anymore
without manual intervention.

Then Nadav proposed the following solution:
"We could do this by the following variant of Raphael's idea:

   1. create .txt.tmp unconditionally, as before the commit 031bf57c1
(if we can't create it, fail).
   2. Now confirm that .txt does not exist. If it does, delete the .txt.tmp
we just created and fail.
   3. continue as usual
   4. and at the end, as before, rename .txt.tmp to .txt.

The key to solving the race is step 1: Since we created .txt.tmp in step 1
and know this creation succeeded, we know that we cannot be running in
parallel with another writer - because such a writer too would have tried to
create the same file, and kept it existing until the very last step of its
work (step 4)."

This patch implements the solution described above.
Let me also say that the race is theoretical and scylla wasn't affected by
it so far.
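A Python sketch of the four-step protocol above, under the assumption that exclusive file creation (O_EXCL) is the atomic step; the paths and the write_sstable helper are illustrative, not the actual sstable code:

```python
import os

def write_sstable(dirname, generation, write_components):
    tmp = os.path.join(dirname, f"{generation}-TOC.txt.tmp")
    toc = os.path.join(dirname, f"{generation}-TOC.txt")
    # Step 1: create the temporary TOC unconditionally and exclusively.
    # O_EXCL fails if the file exists, i.e. if another writer is mid-flight.
    fd = os.open(tmp, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    os.close(fd)
    # Step 2: only now check for an existing final TOC.
    if os.path.exists(toc):
        os.unlink(tmp)
        raise FileExistsError(f"generation {generation} already exists")
    # Step 3: write the remaining components as usual.
    write_components()
    # Step 4: publish by renaming the temporary TOC to the final name.
    os.rename(tmp, toc)
```

The temporary file exists from step 1 until step 4, so a racing writer's step 1 fails for the whole duration of the write.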

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <ef630f5ac1bd0d11632c343d9f77a5f6810d18c1.1457818331.git.raphaelsc@scylladb.com>
2016-03-14 11:44:51 +02:00
Avi Kivity
7278d0343b Merge seastar upstream
* seastar 906b562...88cc232 (2):
  > reactor: fix work item leak in syscall work queue
  > rpc_test: add missing header
2016-03-14 11:15:42 +02:00
Asias He
9f64c36a08 storage_service: Fix pending_range_calculator_service
Since calculate_pending_ranges will modify token_metadata, we need to
replicate it to other shards. With this patch, when we call
calculate_pending_ranges, token_metadata will be replicated to the other
non-zero shards.

In addition, it is not useful as a standalone class. We can merge it
into storage_service and kill one singleton class.

Fixes #1033
Refs #962
Message-Id: <fb5b26311cafa4d315eb9e72d823c5ade2ab4bda.1457943074.git.asias@scylladb.com>
2016-03-14 10:14:22 +02:00
Pekka Enberg
d4b4baad98 Merge "Add more information to query result digest" from Paweł
"This series adds more information (i.e. keys and tombstones) to the
query result digest in order to ensure correctness and increase the
chances of early detection of disagreement between replicas.

The digest is no longer computed by hashing query::result but build
using the query result builder. That is necessary since the query
result itself doesn't contain all information required to compute
the digest. Another consequence of this is that now replicas asked
for a result need to send both the result and the digest to
the coordinator as it won't be able to compute the digest itself.

Unfortunately, these patches change our on wire communication:
 1) hash computation is different
 2) format of query::result is changed (and it is made non-final)

Fixes #182."
2016-03-14 08:22:05 +02:00
Paweł Dziepak
72970c9c90 query: add query::result::_digest to pretty printer
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:17 +00:00
Paweł Dziepak
82d2a2dccb specify whether query::result, result_digest or both are needed
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
21e2ebcf8c query: build only result, only digest or both
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
46079f763b query: add keys and tombstones to result digest
Query result digest is used to verify that all replicas have the same
data. Therefore, it needs to contain more information than the query
result itself in order to ensure proper detection of disagreements.

Generally, adding clustering keys to the digest regardless of whether
the client asked for them will guarantee correctness. However, adding
tombstones as well improves the chances of early detection of nodes
containing stale data.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
15fd3e96ff md5_hasher: add finalize_array()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
3efb10bd08 result.idl: keep digest together with result
The result digest is going to be computed in the query result builder and
requires information not available in the query result. That's why the
digest now needs to be sent to the other nodes together with the result,
as they won't be able to compute it on their own.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
86ba96622e atomic_cell: do not require type to hash collection cell
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
23ee493d91 types: make collection_type_impl::deserialize_mutation_form static
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
c1f7f11d54 mutation_partition: do not add ck to result when not asked to
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:27:13 +00:00
Paweł Dziepak
77dbe3c12f storage_proxy: fix reconciliation with limits
Currently, if there is a disagreement between replicas we get mutations
from all of them, merge these mutations and send the result to the
client; the difference between the result and the mutation sent by a
particular replica is sent back to repair it.
Unfortunately, that may not suffice to provide the user with correct results
in case of disagreements.

Consider the following scenario:

create table cf(p int, c int, r int, primary key(p, c));

node1:
p=0, c=1, r=1 (timestamp = 1)
p=0, c=2, r=2 (timestamp = 2)

node2:
p=0, c=1, r=tombstone (timestamp = 2)
p=0, c=2, r=1 (timestamp = 1)

query:
select r from cf limit 1;

Let's assume there are no row markers. node1 will send only outdated
cell (p=0, c=1, r=1) while node2 will send both tombstone for c=1 and
outdated cell (p=0, c=2, r=1). A disagreement will be detected, the
replies will be merged and the coordinator will respond to the client
with result r=1, while the correct answer is r=2.

The solution proposed in this patch is to attempt to detect cases when
the problem may occur and retry queries with a larger limit, which results
in replicas providing more information.

The detection logic is simple: the partition key and clustering key of
the last row in the reconciled result are compared with the partition
keys and clustering keys of the last rows of replies from replicas
(except short reads). If the (pk, ck) of the replica last row is smaller
than the (pk, ck) of the reconciled result the query is retried.
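The detection logic can be sketched in Python, representing the last row of each reply as a (pk, ck) tuple compared lexicographically; needs_retry and the reply representation are illustrative, not the storage_proxy code:

```python
# Sketch: retry if any non-short-read replica reply ends before the
# reconciled result does, since that replica may be hiding newer rows.
def needs_retry(reconciled_last, replica_replies):
    """replica_replies: list of (last_row, is_short_read) pairs."""
    for last_row, is_short_read in replica_replies:
        if is_short_read:
            continue  # short reads are excluded from the check
        if last_row is not None and last_row < reconciled_last:
            return True  # retry the query with a larger limit
    return False
```

In the scenario above, node1's reply ends at (p=0, c=1) while the reconciled result ends at (p=0, c=2), so the query is retried.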

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-11 18:26:33 +00:00
Asias He
f747df2aff streaming: Fix rethrow in stream_transfer_task
Fix bootstrap_test.py:TestBootstrap.failed_bootstap_wiped_node_can_join_test

Logs on node 1:
 INFO  2016-03-11 15:53:43,287 [shard 0] gossip - FatClient 127.0.0.2 has been silent for 30000ms, removing from gossip
 INFO  2016-03-11 15:53:43,287 [shard 0] stream_session - stream_manager: Close all stream_session with peer = 127.0.0.2 in on_remove
 WARN  2016-03-11 15:53:43,498 [shard 0] stream_session - [Stream #4e411ba0-e75e-11e5-81f8-000000000000] stream_transfer_task: Fail to send STREAM_MUTATION_DONE to 127.0.0.2:0: std::runtime_error ([Stream #4e411ba0-e75e-11e5-81f8-000000000000] GOT STREAM_MUTATION_DONE 127.0.0.1: Can not find stream_manager)

 terminate called without an active exception

Backtrace on node 1:

 #0  0x00007fb74723da98 in raise () from /lib64/libc.so.6
 #1  0x00007fb74723f69a in abort () from /lib64/libc.so.6
 #2  0x00007fb74ab84aed in __gnu_cxx::__verbose_terminate_handler() () from /lib64/libstdc++.so.6
 #3  0x00007fb74ab82936 in ?? () from /lib64/libstdc++.so.6
 #4  0x00007fb74ab82981 in std::terminate() () from /lib64/libstdc++.so.6
 #5  0x00007fb74ab82be9 in __cxa_rethrow () from /lib64/libstdc++.so.6
 #6  0x0000000000f3521e in streaming::stream_transfer_task::<lambda()>::<lambda(auto:44)>::operator()<std::__exception_ptr::exception_ptr> (ep=..., __closure=0x7ffce74d8630) at streaming/stream_transfer_task.cc:169
 #7  do_void_futurize_apply<const streaming::stream_transfer_task::start()::<lambda()>::<lambda(auto:44)>&, std::__exception_ptr::exception_ptr> (func=...) at /home/asias/src/cloudius-systems/scylla/seastar/core/future.hh:1142
 #8  futurize<void>::apply<const streaming::stream_transfer_task::start()::<lambda()>::<lambda(auto:44)>&, std::__exception_ptr::exception_ptr> (func=...) at /home/asias/src/cloudius-systems/scylla/seastar/core/future.hh:1190
 #9  future<>::<lambda(auto:7&&)>::operator()<future<> > ( fut=fut@entry=<unknown type in /home/asias/src/cloudius-systems/scylla/build/release/scylla, CU 0xec84d00, DIE 0xee2561d>, __closure=__closure@entry=0x7ffce74d8630) at /home/asias/src/cloudius-systems/scylla/seastar/core/future.hh:1014

Message-Id: <1457684884-4776-2-git-send-email-asias@scylladb.com>
2016-03-11 11:14:05 +02:00
Asias He
bcdd3dbb3e messaging_service: Add missed throw
It was missed somehow.
Message-Id: <1457684884-4776-1-git-send-email-asias@scylladb.com>
2016-03-11 11:01:24 +02:00
Raphael S. Carvalho
031bf57c19 sstables: bail out if toc exists for generation used by write_components
Currently, if sstable::write_components() is called to write a new sstable
using the same generation as an sstable that exists, a temporary TOC will
be unconditionally created. Afterwards, the same sstable::write_components()
will fail when it reaches sstable::create_data(), for the obvious reason
that the data component exists for that generation (in this scenario).
After that, the user will not be able to boot scylla anymore because there is
a generation with both a TOC and a temporary TOC. We cannot simply remove a
generation with a TOC and a temporary TOC because user data would be lost
(again, in this scenario). After all, the temporary TOC was only created
because sstable::write_components() was wrongly called with the generation
of an sstable that exists.

Solution proposed by this patch is to trigger exception if a TOC file
exists for the generation used.

Some SSTable unit tests were also changed to guarantee that we don't try
to overwrite components of an existing sstable.

Refs #1014.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <caffc4e19cdcf25e4c6b9dd277d115422f8246c4.1457643565.git.raphaelsc@scylladb.com>
2016-03-11 09:22:51 +02:00
Nadav Har'El
1b4f8842ee sstable: fix compressed data file overread
Since commit 2f56577 ("sstables: more efficient read of compressed data
file"), the compressed_file_input_stream uses a file_input_stream to
efficiently read the compressed data at chunks some desired size (128 KB
is our default) instead of at smaller compressed chunks.

However, I had a bug where I mis-calculated the desired length of the
read (giving the *end byte* instead of the length!) and as a result
file_input_stream did not know where the read was supposed to stop, and
always read 128 KB buffers. The results were not incorrect, because the
sstable reader stops when it needs to, even if given too much data. But
it was inefficient because too much data was read in the last buffer.

With this patch, the length is correctly given to the input stream, and
it can read a much smaller buffer at the end of the read, not the full
128 KB. I tested that this actually happens.
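A small Python sketch of the distinction between the end byte and the length; compressed_read_extent and the chunk-offset layout are illustrative, not the actual reader code:

```python
# Sketch: the stream that reads the compressed data must be given a *length*,
# not the end offset. chunk_offsets[i] is the file offset of chunk i.
def compressed_read_extent(chunk_offsets, file_end, first_chunk, last_chunk):
    """Return (start, length) of the read covering the requested chunks."""
    start = chunk_offsets[first_chunk]
    if last_chunk + 1 < len(chunk_offsets):
        end = chunk_offsets[last_chunk + 1]
    else:
        end = file_end
    # The bug was returning `end` as the second value: the stream then had no
    # idea where to stop and kept reading full 128 KB buffers.
    return start, end - start
```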

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1457633616-15193-1-git-send-email-nyh@scylladb.com>
2016-03-11 09:17:50 +02:00
Pekka Enberg
987e8579d7 Merge "general robustness improvements for SSTables code" from Glauber
"As described in issue #1014, we have found ourselves in a situation where
 SSTables can be written too early, and that causes problems for the existing
 SSTables. While this shouldn't happen - and Pekka's recent patch to move
 populate() a lot earlier in initialization should fix that - when it did
 happen, what we had was not enough to prevent it from overwriting existing
 tables.

 We should do a much better job protecting against that.

 Also, some of the exceptions that are generated are totally inconclusive. This
 series also aims at making some of the exceptions more descriptive."
2016-03-11 09:03:05 +02:00
Glauber Costa
a339296385 database: turn sstable generation number into an optional
This patch makes sure that every time we need to create a new generation number
(the very first step in the creation of a new SSTable), the respective CF is
already initialized and populated. Failure to do so can lead to data being
overwritten. Extensive details about why this is important can be found
in Scylla's Github Issue #1014.

Nothing should be writing to SSTables before we have had the chance to populate
the existing SSTables and calculate what the next generation number should be.

However, if that happens, we want to protect against it in a way that does not
involve overwriting existing tables. This is one of the ways to do it: every
column family starts in an unwriteable state, and when it can finally be written
to, we mark it as writeable.

Note that this *cannot* be a part of add_column_family. That adds a column family
to a db in memory only, and if anybody is about to write to a CF, that was most
likely already called. We need to call this explicitly when we are sure we're ready
to issue disk operations safely.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:06:05 -05:00
Glauber Costa
f2a8bcabc2 sstables: improve error messages
The standard C++ exception messages thrown when anything goes wrong writing
the file are suboptimal: they barely tell us the name of the failing file.

Use a specialized create function so that we can capture that better.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:06:05 -05:00
Glauber Costa
6c4e31bbdb main: when scanning SSTables, run shard 0 first
Deletion of previous stale, temporary SSTables is done by Shard0. Therefore,
let's run Shard0 first. Technically, we could just have all shards agree on the
deletion and just delete it later, but that is prone to races.

Those races are not supposed to happen during normal operation, but if we have
bugs, they can. Scylla's Github Issue #1014 is an example of a situation where
that can happen, making existing problems worse. So running a single shard
first and making sure that all temporary tables are deleted provides
extra protection against such situations.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:06:05 -05:00
Glauber Costa
8eb4e69053 database: remove unused parameter
We are no longer using the in_flight_seals gate, but forgot to remove it.
To guarantee that all seal operations will have finished when we're done,
we are using the memtable_flush_queue, which also guarantees order. But
that gate was never removed.

The FIXME code should also be removed, since such an interface now exists.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:05:54 -05:00
Glauber Costa
94e90d4a17 column_family: do not open code generation calculation
We already have a function that wraps this, re-use it.  This FIXME is still
relevant, so just move it there. Let's not lose it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:05:47 -05:00
Glauber Costa
46fdeec60a column_family: remove mutation_count
We use memory usage as a threshold these days, and nowhere is _mutation_count
checked. Get rid of it.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-03-10 21:05:47 -05:00
Gleb Natapov
16135c2084 make initialization run in a thread
While looking at the initialization code I felt like my head was going to
explode. Moving initialization into a thread makes things a little bit
better. Only lightly tested.

Message-Id: <20160310163142.GE28529@scylladb.com>
2016-03-10 17:42:05 +01:00
Gleb Natapov
176aa25d35 fix developer-mode parameter application on SMP
I am almost sure we want to apply it once on each shard, and not multiple
times on a single shard.

Message-Id: <20160310155804.GB28529@scylladb.com>
2016-03-10 17:17:48 +01:00
Pekka Enberg
97bef4fb7c build: Fix http/http_response_parser.hh dependency
Make sure http_response.hh that is pulled by locator/ec2_snitch.hh is
built. The commit is similar to what commit 6ccf8f8 ("build: make sure
to ask seastar to build http/request_parser.hh, and depend on it") did
for request_parser.hh.

Fixes the following build error on CentOS:

  In file included from ./locator/ec2_multi_region_snitch.hh:41:0,
                   from locator/ec2_multi_region_snitch.cc:39:
  ./locator/ec2_snitch.hh:24:40: fatal error: http/http_response_parser.hh: No such file or directory

Spotted by Shlomi.
Message-Id: <1457612266-315-1-git-send-email-penberg@scylladb.com>
2016-03-10 14:46:41 +01:00
Gleb Natapov
51ca3122cf cleanup forward declaration for key types
Message-Id: <20160310075138.GC6117@scylladb.com>
2016-03-10 10:52:19 +01:00
Pekka Enberg
5dd1fda6cf main: Initialize system keyspace earlier
We start services like gossiper before system keyspace is initialized
which means we can start writing too early. Shuffle code so that system
keyspace is initialized earlier.

Refs #1014
Message-Id: <1457593758-9444-1-git-send-email-penberg@scylladb.com>
2016-03-10 10:39:27 +01:00
Pekka Enberg
f2f35a2f50 Merge "fix shutdown and improve logging" from Asias
"Fixes #1005 and probably fixes #1013."
2016-03-10 08:21:48 +02:00
Asias He
a9ec752939 streaming: Reduce STREAM_MUTATION error logging
There might be a large number of STREAM_MUTATION messages in flight. Log one
error per column_family per range to avoid spamming the log.
2016-03-10 10:56:48 +08:00
Asias He
134b814cde gossip: Log status info when stopping gossip 2016-03-10 10:56:48 +08:00
Asias He
7c4c99d7c7 streaming: Fix a log level in get_column_family_stores
It is supposed to be debug level instead of info level.
2016-03-10 10:56:48 +08:00
Asias He
cb90ff2709 storage_service: Make decommission log info instead of debug level
The log is just a few lines, and it is very useful for telling which step
fails when a decommission runs into an error.
2016-03-10 10:56:48 +08:00
Asias He
ed723665df gossip: Do not stop gossip more than once
If we do
   - Decommission a node
   - Stop a node
we will shut down gossip more than once in:
   - storage_service::decommission
   - storage_service::drain_on_shutdown

Fix by checking if it is already stopped and backing off if so.
2016-03-10 10:56:48 +08:00
Asias He
138c5f5834 storage_service: Do not stop messaging_service more than once
If we do
   - Decommission a node
   - Stop a node
we will shut down messaging_service more than once in:
   - storage_service::decommission
   - storage_service::drain_on_shutdown

Fixes #1005
Refs  #1013

This fixes a dtest failure in the debug build.

update_cluster_layout_tests.TestUpdateClusterLayout.simple_decommission_node_1_test/

/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:802:35:
runtime error: member call on null pointer of type 'struct
future_state'
core/future.hh:334:49: runtime error: member access within null
pointer of type 'const struct future_state'
ASAN:SIGSEGV
=================================================================
==4557==ERROR: AddressSanitizer: SEGV on unknown address
0x000000000000 (pc 0x00000065923e bp 0x7fbf6ffac430 sp 0x7fbf6ffac420
T0)
    #0 0x65923d in future_state<>::available() const
/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:334
    #1 0x41458f1 in future<>::available()
/data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:802
    #2 0x41458f1 in then_wrapped<parallel_for_each(Iterator, Iterator,
Func&&)::<lambda(parallel_for_each_state&)> [with Iterator =
std::__detail::_Node_iterator<std::pair<const net::msg_addr,
net::messaging_service::shard_info>, false, true>; Func =
net::messaging_service::stop()::<lambda(auto:39&)> [with auto:39 =
std::unordered_map<net::msg_addr, net::messaging_service::shard_info,
net::msg_addr::hash>]::<lambda(std::pair<const net::msg_addr,
net::messaging_service::shard_info>&)>]::<lambda(future<>)>, future<>
> /data/jenkins/workspace/urchin-dtest/label/monster/mode/debug/scylla/seastar/core/future.hh:878
2016-03-10 10:56:48 +08:00
Tomasz Grabiec
838a038cbd log: Fix operator<<(std::ostream&, const std::exception_ptr&)
Attempting to print a std::nested_exception currently results in an exception
leaking outside the printer. Fix by catching all exceptions in the final
catch block.

For a nested exception, the logger will now print just
"std::nested_exception". For nested exceptions specifically we should
log more, but that is a separate problem to solve.
Message-Id: <1457532215-7498-1-git-send-email-tgrabiec@scylladb.com>
2016-03-09 16:05:03 +02:00
Pekka Enberg
2566f8dc18 configure: Remove 'scylla_libs' variable
It's not actually used by anyone so drop it.
Message-Id: <1457531753-27891-2-git-send-email-penberg@scylladb.com>
2016-03-09 14:56:54 +01:00
Pekka Enberg
9bfb6a0c5b configure: Add boost date_time library as a dependency
It's needed to fix the debug build.
Message-Id: <1457531753-27891-1-git-send-email-penberg@scylladb.com>
2016-03-09 14:56:51 +01:00
Takuya ASADA
0ab3d0fd52 dist: use SEASTAR_IO instead of SCYLLA_IO
sync with iotune, fixes #1010

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1457530910-1273-1-git-send-email-syuu@scylladb.com>
2016-03-09 15:45:34 +02:00
Gleb Natapov
f242c6395c storage_proxy: add counter for retries reads
Message-Id: <20160309130453.GF2253@scylladb.com>
2016-03-09 14:09:42 +01:00
Pekka Enberg
ab502bcfa8 types: Implement to_string for timestamps and dates
The to_string() function is used for logging purposes, so use boost's
to_iso_extended_string() to format both timestamps and dates.

Fixes #968 (showstopper)
Message-Id: <1457528755-6164-1-git-send-email-penberg@scylladb.com>
2016-03-09 14:08:33 +01:00
Pekka Enberg
8eedaca948 Merge "streaming: handle cf is deleted" from Asias
"Fixes #979
 Fixes #976"
2016-03-09 14:52:27 +02:00
Asias He
3a4ea227d8 storage_service: Fix effective_ownership
Now, get_ranges_for_endpoint will unwrap the first range. With t0 t1 t2
t3, the first range (t3,t0] will be split into (min,t0] and (t3,max].
Skipping the range (t3,max], we get the correct ownership number, as
if the first range had not been split.

Fixes #928
Message-Id: <2e30ebd53f3dba3cc5e0cf36d5541c354b0e30ca.1457506704.git.asias@scylladb.com>
2016-03-09 13:26:01 +01:00
Asias He
d9ead889f3 streaming: Handle cf is deleted when sending STREAM_MUTATION_DONE
In the preparation phase of streaming, we check that the remote node has all
the cf_ids needed for the entire streaming process, including the
cf_ids which the local node will send to the remote node and vice versa.

So, at a later time, if a cf_id is missing, it must be that the cf was
deleted, and it is fine to ignore the no_such_column_family exception. In
this patch, we change the code to ignore it on the server side, to avoid
sending the exception back and having to handle the exception in an
IDL-compatible way.

One thing we could improve is that the sender might learn that the cf is
deleted later than the receiver does. In that case, the sender will send
some more mutations than if we sent the no_such_column_family back to the
sender. However, since we do not throw exceptions in the receiver's stream
mutation handler, this does not cause much overhead: the receiver will
just ignore the mutations received.

Fixes #979
2016-03-09 16:50:38 +08:00
Asias He
efa74dbae0 streaming: Do not send if the cf is deleted
It is possible that a cf is deleted after we create the cf reader. Avoid
sending its mutations, to spare the unnecessary overhead of putting them on
the wire only for the peer node to drop them.
2016-03-09 16:50:38 +08:00
Asias He
4abaacfc61 db: Introduce column_family_exists
It is cheaper than throwing a no_such_column_family exception when testing
whether a cf is gone, e.g., deleted.
2016-03-09 16:50:38 +08:00
Asias He
dca9e594cc streaming: Remove the unused test code
It was introduced in the early development of streaming. We have dtests
for streaming now, so drop it.
Message-Id: <1457499303-21163-1-git-send-email-asias@scylladb.com>
2016-03-09 10:31:42 +02:00
Pekka Enberg
4f3d6977f1 Merge "Abort stream_session if peer is removed or restarted" from Asias
"Hook streaming with the gossip callback so we can abort
the stream_session in the following cases:

- a node is restarted
- a node is removed from the cluster

Fixes #1001."
2016-03-09 10:18:42 +02:00
Nadav Har'El
2f56577794 sstables: more efficient read of compressed data file
Before this patch, reading large ranges from a compressed data file involved
two inefficiencies:

 1.  The compressed data file was read one compressed chunk at a time.
     Such a chunk is around 30 KB in size, well below our desired sstable
     read-ahead size (sstable_buffer_size = 128 KB).

 2.  Because the compressed chunks have variable length (the uncompressed
     chunk has a fixed length) they are not aligned to disk blocks, so
     consecutive chunks have overlapping blocks which were unnecessarily
     read twice.

The fix for both issues is to build the compressed_file_input_stream on
an existing file_input_stream, instead of using direct file IO to read the
individual chunks. file_input_stream takes care of doing the appropriate
amount of read-ahead, and the compressed_file_input_stream layer does the
decompression of the data read from the underlying layer.

Fixes #992.

Historical note: Implementing compressed_file_input_stream on top of
file_input_stream was already tried in the past, and rejected. The problem
at that time was that compressed_file_input_stream's constructor did not
specify the *end* of the range to read, so that when we wanted to read
only a small range we got too much read-ahead beyond the exactly one
compressed chunk that we needed to read.  Following the fix to issue #964,
we now know on every streaming read also the intended *end* of the stream,
so we can now use this to stop reading at the end of the last required
chunk, even when we use a read-ahead buffer much larger than a chunk.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1457304335-8507-1-git-send-email-nyh@scylladb.com>
2016-03-09 10:14:15 +02:00
Glauber Costa
8260b8fc6f touch CF directories during startup
We try to be robust against files disappearing (due to any kind of corruption)
inside the data directory.

But if the data directory itself goes missing, that's a situation that we don't
handle correctly.  We will keep accepting writes normally, but when we try to
flush the memtable to disk, we'll fail with a system error.

Having the CF directory disappear is not a common thing. But it is also one
that we can easily protect against, by touching all CF directories we know
about on startup.

Fixes #999

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <ed66373dccca11742150a6d08e21ece3980227d3.1457379853.git.glauber@scylladb.com>
2016-03-09 09:06:51 +02:00
Asias He
bf3507d093 messaging_service: Stop retrying if node is removed from gossip
- Start a node
- Inject data
- Start another node to bootstrap
- Before the second node finishes streaming, kill the second node
- After a while the node will be removed from the cluster because it does
  not manage to join the cluster.
- At this time, messaging_service might keep retrying the
  stream_mutations unnecessarily.

To fix, check if the peer node is still a known node in the gossip.
2016-03-09 07:35:20 +08:00
Asias He
1f3928c321 streaming: Hook streaming with gossip callback
If the peer node of a stream_session is restarted or removed, we should
abort the streaming. It is better to hook the gossip callback in the stream
manager than in each stream_session.
2016-03-09 07:35:20 +08:00
Glauber Costa
2cd756ae5e repair: replace a magic number with another magic number
In due time we will have to fix this, but as an interim step, let's use
a "better" magic number.

The problem with 100 is that as soon as partitions start to get bigger,
we're using too much memory. Since this is multiplied by the number of token
ranges, and happens on every shard, the final number can become really big,
and the amount of resources we use goes up proportionally.

This means that even if we are mistaken about the new number (we probably are),
in this case it is better to err on the side of a more conservative resource
usage.

Reviewed-by: Nadav Har'El <nyh@scylladb.com>
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <97158f3db5734916cee4ccf12eaa66e7402570bb.1457448855.git.glauber@scylladb.com>
2016-03-08 17:29:00 +02:00
Nadav Har'El
b7e29691c2 sstables: avoid index and data file over-reads
When we do a streaming read that knows the expected *end* position of the
read, we can use a large read-ahead buffer, and at the same time, stop
reading at exactly the intended end (or small rounding of it to the DMA
block size) and not waste resources blindly reading a large amount of data
after the end just to fill the read-ahead buffer.

The sstable reading code, both for reading the data file and the index file,
created a file input stream without specifying its end, thereby losing
this optimization - so when a large buffer was used, we would get a large
over-read. This patch fixes this, so the sstable data file and index file are
read using a file input stream which is aware of its end.

Fixes #964.

Note that this patch does not change the behavior when reading a
*compressed* data file. For compressed read, we did not have the problem
of over-read in the first place, because chunks are read one by one.
But we do have other sources of inefficiencies there (stemming, again,
from the fact that the compressed chunks are read one by one), and I
opened a separate issue #992 for that.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1457219304-12680-1-git-send-email-nyh@scylladb.com>
2016-03-08 17:26:10 +02:00
Calle Wilund
8575f1391f lists.cc: fix update insert of frozen list
Fixes #967

Frozen lists are just atomic cells. However, old code inserted the
frozen data directly as an atomic_cell_or_collection, which in turn
meant it lacked the header data of a cell. When in turn it was
handled by internal serialization (freeze), since the schema said
it was not a (non-frozen) collection, we tried to interpret the frozen
list data as a cell header -> most likely considered dead.
Message-Id: <1457432538-28836-1-git-send-email-calle@scylladb.com>
2016-03-08 13:48:45 +01:00
Pekka Enberg
81af486b69 Update scylla-ami submodule
* dist/ami/files/scylla-ami d4a0e18...84bcd0d (1):
  > Add --ami parameter
2016-03-08 13:49:31 +02:00
Takuya ASADA
254b0fa676 dist: show message to use XFS for scylla data directory and also notify about developer mode, when iotune fails
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1457426286-15925-1-git-send-email-syuu@scylladb.com>
2016-03-08 12:20:33 +02:00
Pekka Enberg
83d82ea901 Merge "Fix Ubuntu package issues on AMI" from Takuya
"This fixes bugs on Ubuntu package and AMI scripts, closes #991."
2016-03-08 11:51:30 +02:00
Takuya ASADA
18a27de3c8 dist: export all entries on /etc/default/scylla-server on Ubuntu
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-03-08 18:18:30 +09:00
Gleb Natapov
ce6d1a242a storage_proxy: fix background_reads counter
background_reads collectd counter was not always properly decremented.
Fix it and streamline background read repair error handling.

Message-Id: <20160307182255.GI4849@scylladb.com>
2016-03-07 19:41:09 +01:00
Yoav Kleinberger
1cd01cd2ab tools/scyllatop: defend against curses "out of screen bounds" error
Fixes issue #945 (hopefully)
This issue was probably the result of trying to write outside the
confines of the window. The views.Base class now defends against this.

Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <9735806b211567f3239e187d87437c484f532291.1457265435.git.yoav@scylladb.com>
2016-03-07 18:02:26 +01:00
Raphael S. Carvalho
0f4239d63a service: improve logging of storage_service::load_new_sstables
Closes #952.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <2402f387c32d2d1221e740edb67e56c1593c1936.1457366098.git.raphaelsc@scylladb.com>
2016-03-07 18:01:52 +01:00
Raphael S. Carvalho
e850c1406e sstables: update comment
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <8abc1c6c66ed8d3bb35ecfb6d8251de3f61a97ae.1457093016.git.raphaelsc@scylladb.com>
2016-03-07 17:36:34 +01:00
Raphael S. Carvalho
822759eee0 compaction_manager: update stat pending_tasks properly
The sizes of both _cfs_to_cleanup and _cfs_to_compact must be added when
calculating a new value for _stats.pending_tasks.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <b601e24d0631922798575f39d00fb54fe00d4971.1457093016.git.raphaelsc@scylladb.com>
2016-03-07 17:36:03 +01:00
Gleb Natapov
2d092bbd32 storage_proxy: send read requests with timeout
No need to wait for replies long after request is timed out.
Message-Id: <1457351304-28721-2-git-send-email-gleb@scylladb.com>
2016-03-07 14:00:11 +01:00
Gleb Natapov
4122422d19 storage_proxy: always wait for digest read resolver done future
Currently it is waited upon only if the background read repair check is
needed, which causes an unhandled exception warning to be printed if
it enters the failed state. Fix this by always waiting on it, but doing
anything beyond ignoring an exception only if the check is needed.
Message-Id: <1457351304-28721-1-git-send-email-gleb@scylladb.com>
2016-03-07 14:00:09 +01:00
Gleb Natapov
626c9d046b fix EACH_QUORUM handling during bootstrapping
Currently, write acknowledgement handling does not take the bootstrapping
node into account for CL=EACH_QUORUM. The patch fixes it.

Fixes #994

Message-Id: <20160307121620.GR2253@scylladb.com>
2016-03-07 13:56:34 +01:00
Raphael S. Carvalho
d65642cee8 fix storage_service::load_new_sstables() to not disable write permanently
Avi says:
"If an exception happens, then enable_sstable_writes won't be called."

The problem is fixed by catching a possible exception and enabling sstable
write for the relevant column family if it wasn't enabled already.

Closes #953.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <32c1bcb2c60c7b9e5514eb0a95062f40ca92093a.1457119308.git.raphaelsc@scylladb.com>
2016-03-07 13:56:02 +01:00
Gleb Natapov
f59415b3c6 Take pending endpoints into account while checking for sufficient live nodes
During bootstrapping, additional copies of data have to be made to ensure
that the CL level is met (see CASSANDRA-833 for details). Our code does
that, but it does not take into account that the bootstrapping node can be
dead, which may cause a request to proceed even though there are not
enough live nodes for it to be completed. In such a case the request neither
completes nor times out, so it appears to be stuck from the CQL layer's POV.
The patch fixes this by taking pending nodes into account while checking
that there are sufficient live nodes for the operation to proceed.

Fixes #965

Message-Id: <20160303165250.GG2253@scylladb.com>
2016-03-07 13:30:13 +01:00
Gleb Natapov
8dad399256 log: add space between log level and date in the output
It was dropped by 6dc51027a3

Message-Id: <20160306125313.GI2253@scylladb.com>
2016-03-07 13:06:06 +01:00
Tomasz Grabiec
9deb036e4e Merge branch 'dev/issue-845-set-incremental-backup-config-v1' from seastar-dev.git
From Vlad:

This series modifies the 'database' class to use the internal
_enable_incremental_backups value (initialized with
'incremental_backups' configuration value) instead of using the
'incremental_backups' configuration value directly.

Then we update this internal value at runtime from 'nodetool
enable/disablebackup' API callback so that newly created keyspaces and
column families use the newly configured incremental backup
configuration.
2016-03-07 10:47:20 +01:00
Tomasz Grabiec
b3e56549ca Merge branch 'dev/issue-909-synchronization-part-v2' from seastar-dev.git
From Vlad:

This series fixes the first part of issue #909 (the second part has a
separate github issue #965) which is a discrepancy between a
storage_service::token_metadata and a gossiper::endpoint_state_map
contents on non-zero shards.
2016-03-07 10:20:15 +01:00
Paweł Dziepak
99b61d3944 lsa: set _active to nullptr in region destructor
In the region destructor, after the active segment is freed, the pointer to
it is left unchanged. This confuses the remaining parts of the destructor
logic (namely, removal from the region group), which may rely on the
information in region_impl::_active.

In this particular case the problem was that the code removing the region
from the region group called region_impl::occupancy(), which was
dereferencing _active if not null.

Fixes #993.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1457341670-18266-1-git-send-email-pdziepak@scylladb.com>
2016-03-07 10:15:28 +01:00
Takuya ASADA
9ee14abf24 dist: export sysconfig for scylla-io-setup.service
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-03-07 18:13:30 +09:00
Takuya ASADA
3d9dc52f5f Revert "Revert "dist: align ami option with others (-a --> --ami)""
This reverts commit 66c5feb9e9.

Conflicts:
	dist/common/scripts/scylla_sysconfig_setup

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-03-07 18:13:30 +09:00
Takuya ASADA
c9882bc2c4 Revert "Revert "Revert "dist: remove AMI entry from sysconfig, since there is no script refering it"""
This reverts commit 643beefc8c.

Conflicts:
	dist/common/scripts/scylla_sysconfig_setup
	dist/common/sysconfig/scylla-server

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-03-07 17:15:42 +09:00
Takuya ASADA
c888eaac74 dist: add /etc/scylla.d/io.conf on Ubuntu
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-03-07 17:15:42 +09:00
Vlad Zolotarov
2cd836a02e api::set_storage_service(): fix the 'nodetool enablebackup' API
'nodetool enable/disablebackup' callback was modifying only the
existing keyspaces and column families configurations.
However new keyspaces/column families were using
the original 'incremental_backups' configuration value which could
be different from the value configured by 'nodetool enable/disablebackup'
user command.

This patch updates the database::_enable_incremental_backups per-shard
value in addition to updating the existing keyspaces and column families
configurations.

Fixes #845

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-03-06 17:26:31 +02:00
Vlad Zolotarov
a45ecaf336 database: store "incremental backup" configuration value in per-shard instance
Store the "incremental_backups" configuration value in the database
class (and use it when creating a keyspace::config) in order to be
able to modify it at runtime.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-03-06 17:22:48 +02:00
Vlad Zolotarov
87e6efcdab storage_service: distribute gossiper::endpoint_state_map together with token_metadata
If storage_service::token_metadata is not distributed together with
gossiper::endpoint_state_map there may be a situation when a non-zero
shard sees a new value in token_metadata (e.g. newly added node's
token ranges) while still seeing an old gossiper::endpoint_state_map
contents (e.g. a mentioned above newly added node may not be present,
thus causing gossiper::is_alive() to return FALSE for that node, while
the node is actually alive and kicking).

To avoid this discrepancy we will always update a token_metadata together
with an endpoint_state_map when we distribute new token_metadata data
among shards.

Fixes #909

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-03-06 13:15:19 +02:00
Vlad Zolotarov
3a72ef87f2 gossiper: make _shadow_endpoint_state_map public and rename
We will need to access it from the storage_service class when replicating
token_metadata.

Rename _shadow_endpoint_state_map -> shadow_endpoint_state_map
according to our coding convention.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-03-06 11:16:44 +02:00
Vlad Zolotarov
4a21d48cc5 gossiper: use a semaphore instead of a future<> for serializing a timer callback
Use a semaphore to allow serializing with a gossiper's timer callback.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
2016-03-06 11:16:44 +02:00
Takuya ASADA
6dc51027a3 log: make log.cc able to compile with g++-4.9
std::put_time() is not implemented on g++-4.9, so replace it with strftime().
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1457024183-893-1-git-send-email-syuu@scylladb.com>
2016-03-04 12:48:43 +01:00
Avi Kivity
6c2e57b003 Merge seastar upstream
* seastar ba615c7...906b562 (1):
  > rpc: prepare some more for feature negotiation
2016-03-03 18:22:57 +02:00
Gleb Natapov
b89b6f442b storage_proxy: fix race between read cl completion and timeout in digest resolver
If the timeout happens after the cl promise is fulfilled, but before the
continuation runs, it removes all the data that the cl continuation needs
to calculate the result. Fix this by calculating the result immediately and
returning it in the cl promise instead of delaying this work until the
continuation runs. This has a nice side effect of simplifying digest
mismatch handling and making it exception free.

Fixes #977.

Message-Id: <1457015870-2106-3-git-send-email-gleb@scylladb.com>
2016-03-03 16:48:28 +02:00
Gleb Natapov
e4ac5157bc storage_proxy: store only one data reply in digest resolver.
The read executor may ask for more than one data reply during the digest
resolving stage, but only one result is actually needed to satisfy
a query, so there is no need to store all of them.

Message-Id: <1457015870-2106-2-git-send-email-gleb@scylladb.com>
2016-03-03 16:47:53 +02:00
Gleb Natapov
69b61b81ce storage_proxy: fix cl achieved condition in digest resolver timeout handler
In the digest resolver, for cl to be achieved it is not enough to get the
correct number of replies; there must also be a data reply among them. The
condition in the digest timeout does not check that; fortunately, we have a
variable that we set to true when cl is achieved, so use it instead.

Message-Id: <1457015870-2106-1-git-send-email-gleb@scylladb.com>
2016-03-03 16:47:11 +02:00
Tomasz Grabiec
2abd62b5cb bytes_ostream: Drop methods which serialize integers
This will make bytes_ostream completely agnostic to the serialization
format, which should be determined by the layer above it.

Message-Id: <1457004221-8345-2-git-send-email-tgrabiec@scylladb.com>
2016-03-03 13:27:27 +02:00
Tomasz Grabiec
aaac2a3cec serializer: Add missing include
Message-Id: <1457004221-8345-1-git-send-email-tgrabiec@scylladb.com>
2016-03-03 13:27:22 +02:00
Pekka Enberg
9c930d88a0 db/system_keyspace: Remove ifdef'd code
We have our own implementations of all three ifdef'd functions.

Message-Id: <1456926917-12594-1-git-send-email-penberg@scylladb.com>
2016-03-03 12:26:50 +02:00
Takuya ASADA
da56325f69 configure.py: add support --static-stdc++ for seastar binaries (iotune)
The Ubuntu 14.04 LTS package is currently broken because iotune is not
statically linked against libstdc++; this patch fixes that.
Requires a seastar patch to add --static-stdc++ to configure.py.

Fixes #982

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456995050-22007-1-git-send-email-syuu@scylladb.com>
2016-03-03 12:18:47 +02:00
Avi Kivity
d4c92c7e27 Merge seastar upstream
* seastar b3fc7c5...ba615c7 (1):
  > configure.py: add --static-stdc++ to link libstdc++ statically
2016-03-03 12:18:23 +02:00
Asias He
01cb6b0d42 gossip: Send syn message in parallel and do not wait for it
1) As explained in commit 697b16414a (gossip: Make gossip message
handling async), in each gossip round we can talk to the 1-3
peer nodes in parallel to reduce the latency of the gossip round.

2) The gossip syn message uses a one-way rpc message, but the returned
future of the one-way message becomes ready only when the message is
dequeued for some reason (sent or dropped). If we wait for the one-way syn
message to return, it might block the gossip round for an unbounded time.
To fix, do not wait for it in the gossip round. The downside is there will
be no back pressure to bound the syn messages, but since the messages are
sent once per second, I think it is fine.
Message-Id: <ea4655f121213702b3f58185378bb8899e422dd1.1456991561.git.asias@scylladb.com>
2016-03-03 11:17:50 +02:00
Takuya ASADA
e545013e47 Revert "dist: downgrade g++ to 4.9 on Ubuntu"
This reverts commit 01bd4959ac.

Fixes #983

Conflicts:
	dist/ubuntu/build_deb.sh
	dist/ubuntu/control.in
	dist/ubuntu/rules.in

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456996244-19889-1-git-send-email-syuu@scylladb.com>
2016-03-03 11:12:18 +02:00
Tomasz Grabiec
04f2482d74 schema_tables: Log results of schema merge
Currently schema changes are only logged at coordinator node which
initiates the change. It would be helpful in post morten analysis to
also see when and how schema changes are resolved when applied on
other nodes.
Message-Id: <1456953095-1982-1-git-send-email-tgrabiec@scylladb.com>
2016-03-03 11:12:15 +02:00
Nadav Har'El
2cf09147b5 Repair: don't use freeze() to calculate mutation checksums
Use the existing "feed_hash" mechanism to find a checksum of the
content of a mutation, instead of serializing the mutation (with freeze())
and then finding the checksum of that string.

The serialized form is more prone to future changes, and not really
guaranteed to provide equal hashes for mutations which are considered
"equal".

Fixes #971

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1456958676-27121-1-git-send-email-nyh@scylladb.com>
2016-03-03 09:58:24 +01:00
Avi Kivity
bec30ccf25 build: add order-only dependency between building antlr .o and IDL headers
This ensures that if an antlr generated .cpp file depends on an
IDL-generated .hh file, then that .hh is generated before the .o is
built.
2016-03-03 09:52:25 +02:00
Tomasz Grabiec
b42d3a90b3 cql3: create_table_statement: Sort _defined_names by text
Currently they are sorted by address in memory, which breaks the
check for column name duplicates, which assumes sorting by text.

Fixes #975.

Message-Id: <1456937400-20475-1-git-send-email-tgrabiec@scylladb.com>
2016-03-02 18:53:43 +02:00
Avi Kivity
dda77d14b9 Merge seastar upstream
* seastar 9964cbf...b3fc7c5 (2):
  > Introduce util/indirect.hh
  > reactor: new counters for the io queue
2016-03-02 18:52:36 +02:00
Calle Wilund
0c3322befd commitlog: Ensure segment survives whole flush call
Must keep the shared pointer alive.
Likewise, though, the shared pointer copy in the cycle main continuation
is not needed.

Message-Id: <1456931988-5876-3-git-send-email-calle@scylladb.com>
2016-03-02 18:22:13 +02:00
Calle Wilund
f1c4e3eb3d commitlog: Clear reserve segments in orphan_all
Otherwise they will keep the segment_manager alive (leak).
Fixes jenkins ASan errors.

Message-Id: <1456931988-5876-2-git-send-email-calle@scylladb.com>
2016-03-02 18:22:09 +02:00
Calle Wilund
a556f665c0 commitlog: Take segment_manager locks first in write/flush
While it is formally better to take a local lock first and only
then contend for the global one, in this case it is arguably
better to ensure we get a gate exception synchronously (early)
instead of potentially in a continuation. The old version might
cause us to do a gate::leave even though we never entered.

And since we should really only have one active (contending)
segment per shard anyway, it should not matter.

Message-Id: <1456931988-5876-1-git-send-email-calle@scylladb.com>
2016-03-02 18:22:05 +02:00
Calle Wilund
e79ca557ed managed_bytes: Change init of small object to silence error on gcc5
Fixes #865

(Some) gcc 5 (5.3.0 for me) on ubuntu will generate errors on
compilation of this code (compiling logalloc_test). The memcpy
to inline storage seems to confuse the compiler.
Simply change to std::copy, which quiets the compiler.
Any decent STL should convert a primitive std::copy to memcpy
anyway, and since this is the inline (small) storage,
it should not matter either way.

Message-Id: <1456931988-5876-4-git-send-email-calle@scylladb.com>
2016-03-02 18:21:51 +02:00
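The change described above can be sketched as follows; `small_blob` is a hypothetical stand-in, not managed_bytes' actual inline-storage layout:

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical small-object layout; managed_bytes' real layout differs.
struct small_blob {
    uint8_t data[15];
    uint8_t size;
};

// std::copy over the byte range is equivalent to memcpy here (and a decent
// standard library lowers it to memcpy), but it avoids the diagnostics some
// gcc 5 builds emit for a raw memcpy into inline storage.
inline small_blob make_blob(const uint8_t* src, uint8_t n) {
    small_blob b{};
    b.size = n;
    std::copy(src, src + n, b.data);
    return b;
}
```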
Pekka Enberg
6d7e14a53a Merge "Implement describe_schema_versions" from Paweł
"This series implements describe_schema_versions so that nodetool
 describecluster can return proper schema information for the whole
 cluster. It involves adding a new verb, SCHEMA_CHECK, which is used to
 get the schema version for a given node, and a simple map-reduce that,
 using that verb, gets info from the whole cluster.

 This fixes #677, fixes #684, and fixes #472."
2016-03-02 16:02:53 +02:00
Paweł Dziepak
5396042f06 api: use proper describe_schema_versions implementation
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 12:49:55 +00:00
Paweł Dziepak
723b3ae7ed storage_service: implement describe_schema_versions
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 12:49:55 +00:00
Paweł Dziepak
b5eee2e5d4 gms: add inet_address::to_sstring()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 12:49:55 +00:00
Paweł Dziepak
ca68c36c8c storage_proxy: handle SCHEMA_CHECK verb
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 12:49:54 +00:00
Paweł Dziepak
b92f8a6d2b messaging_service: add SCHEMA_CHECK verb
SCHEMA_CHECK is used to get node schema version.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 12:49:54 +00:00
Tomasz Grabiec
9a5d7c6388 log: Prepend log lines with timestamp when printed to stdout
Useful for determining order of events in logs of different nodes, or
for estimating how much time passed between two events.

Fixes #941.

Example log:

INFO  2016-03-01 18:30:37,688 [shard 0] gossip - Waiting for gossip to settle before accepting client requests...
INFO  2016-03-01 18:30:45,689 [shard 0] gossip - No gossip backlog; proceeding
INFO  2016-03-01 18:30:45,689 [shard 0] storage_service - Starting listening for CQL clients on localhost:9042...

Message-Id: <1456853532-28800-1-git-send-email-tgrabiec@scylladb.com>
2016-03-02 13:49:39 +02:00
Avi Kivity
431e1fd379 Merge "Drop db::serializer<>s" from Paweł
"This series removes old-style db::serializer<>s which were replaced by
the IDL-based serialization."
2016-03-02 13:16:36 +02:00
Asias He
a41bcad585 storage_service: Fix run with api lock
Start with coarse control:

1) converting the run_with_write_api_lock operations:

join_ring, start_gossiping, stop_gossiping, start_rpc_server,
stop_rpc_server, start_native_transport, stop_native_transport,
decommission, remove_node, drain, move, rebuild

to use run_with_api_lock, which uses a flag to indicate the current
operation in progress.

If one of the above operations is in progress when the admin issues
another operation, we return a "try again" exception to avoid running
two operations in parallel.

2) converting the run_with_read_api_lock to use no lock.

Fixes #850.

Message-Id: <00782b601028ed87437e5decae382f72dff634f6.1456758391.git.asias@scylladb.com>
2016-03-02 11:32:02 +02:00
Paweł Dziepak
d50594351b db: remove old-style serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 09:09:30 +00:00
Paweł Dziepak
bdc23ae5b5 remove db/serializer.hh includes
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 09:07:09 +00:00
Paweł Dziepak
53858ed9cd keys: remove old-style serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 09:05:25 +00:00
Paweł Dziepak
e1a4b992c5 mutation_partition_serializer: remove read() and read_as_view()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-03-02 09:04:02 +00:00
Tomasz Grabiec
4a4d288bba query_pagers: Fix dereference of potentially disengaged _last_ckey optional
Message-Id: <1456855674-1984-3-git-send-email-tgrabiec@scylladb.com>
2016-03-02 10:49:15 +02:00
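The class of bug fixed above: dereferencing a disengaged optional is undefined behavior, so the access has to be guarded. A generic sketch of the guarded form, using std::optional (the tree at the time used std::experimental::optional; the function name is hypothetical):

```cpp
#include <optional>

// Guard the dereference: *o on a disengaged optional is undefined behavior,
// so check engagement first and fall back otherwise.
inline int value_or_fallback(const std::optional<int>& o, int fallback) {
    return o ? *o : fallback;
}
```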
Tomasz Grabiec
307c7676da to_string: Make std::experimental::optional printable
Message-Id: <1456855674-1984-2-git-send-email-tgrabiec@scylladb.com>
2016-03-02 10:49:14 +02:00
Takuya ASADA
6ae41a71c9 dist: fix initctl start scylla-server failed on Ubuntu
scylla_io_setup is executed via sudo, so we need to add it to sudoers

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456906634-14504-1-git-send-email-syuu@scylladb.com>
2016-03-02 10:36:47 +02:00
Tomasz Grabiec
4279ab40c5 cql_serialization_format: Print version as integer instead of char
Currently prints ^C instead of 3.

Message-Id: <1456856287-3681-1-git-send-email-tgrabiec@scylladb.com>
2016-03-01 20:47:48 +02:00
Tomasz Grabiec
f4a86729f9 query: Move implementation of result_merger to .cc file
Message-Id: <1456855396-1563-1-git-send-email-tgrabiec@scylladb.com>
2016-03-01 20:06:42 +02:00
Tomasz Grabiec
d0ae2e3607 idl-compiler: Make sub-streams stored in views be properly bounded
Currently, when reading a view to an object, the stored stream has the
same bounds as the containing stream, not the bounds of the object
itself. The serializer of the view assumes that the stream has the
bounds of the object itself.

Fixes dtest failure in
paging_test.py:TestPagingSize.test_undefined_page_size_default

Fixes #963.

Message-Id: <1456854556-32088-1-git-send-email-tgrabiec@scylladb.com>
2016-03-01 19:50:42 +02:00
Calle Wilund
e667dcc3d0 commitlog: Make segment->segment_manager relation shared pointer
The segment->segment_manager pointer has, until now, been a raw pointer,
which in a way is sensible, since making circular shared pointer
relations is in general bad. However, since the code and life cycle
of segments has evolved quite a bit since that initial relation
was defined, becoming both more and then suddenly, in a sense,
less, asynchronous over time, the usage of the relation is in fact
more consistent with a shared pointer, in that a segment needs to
access its manager to properly do things like write and flush.

These two ops in particular depend on accessing the segment manager
in a way that might be fine even using raw pointers, if it was not
again for that little annoying thing of continuation reordering.

So, let's just make the relation a shared pointer, solving the issue
of whether the manager is alive when a segment accesses it. If it
has been "released" (shut down), the existing mechanisms (gate)
will then trigger and prevent any actual _actions_ from taking
place. And we don't have to complicate anything else even more.

Only "big" change is that we need to explicitly orphan all
segments in commitlog destructor (segment_manager is essentially
a p-impl).

This fixes some spurious crashes in nightly unit tests.

Fixes #966.

Message-Id: <1456838735-17108-1-git-send-email-calle@scylladb.com>
2016-03-01 16:48:28 +02:00
Pekka Enberg
3a6d43c784 cql3: Fix duplicate column definition check
We cannot use shared_ptr *instances* for checking duplicate column
definitions because they are never equal. Store column definition name
in the unordered_map instead.

Fixes cql_additional_tests.py:TestCQL.identifier_test.

Spotted by Shlomi.

Message-Id: <1456840506-13941-1-git-send-email-penberg@scylladb.com>
2016-03-01 16:46:33 +02:00
Asias He
50bf65db8d streaming: Fix keep alive timer progress checking
The first time the keep alive timer fires, _last_stream_bytes
will be zero since it is the first time we update it. The keep
alive timer will be rearmed and fired again. The second time, we find
there is no progress and close the session. The total idle time will
thus be 2 * the keep alive interval.

To make the idle time before closing the session more precise, we reduce
the interval used to check progress, and close the session based on the
last time progress was actually made.

Message-Id: <c959cffce0cc738a3d73caaf71d2adb709d46863.1456831616.git.asias@scylladb.com>
2016-03-01 16:46:08 +02:00
Paweł Dziepak
92f9c9428e cql3: don't insert row marker if schema is_cql3_table()
Checking schema::is_dense() is not enough to know whether a row marker
should be inserted or not, as there may be compact storage tables that
are not considered dense (namely, a table with no clustering key).

A row marker should only be inserted if schema::is_cql3_table() is true.

Fixes #931.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1456834937-1630-1-git-send-email-pdziepak@scylladb.com>
2016-03-01 13:29:53 +01:00
Paweł Dziepak
6a6c12f8c4 tests/commitlog: use unaligned_cast instead of reinterpret_cast
corrupt_segment() is meant to write some garbage at an arbitrary position
in the commitlog segment. That position is not necessarily properly
aligned for uint32_t.

Silences ubsan complaints about unaligned write.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1456827726-21288-1-git-send-email-pdziepak@scylladb.com>
2016-03-01 12:57:06 +02:00
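The underlying rule here: forming a `uint32_t*` to a misaligned address and writing through it is undefined behavior (what ubsan flagged), while copying the value's bytes with memcpy is the well-defined equivalent. A sketch of the pattern (scylla's `unaligned_cast` wraps this differently; these helper names are hypothetical):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Well-defined unaligned write: memcpy the value's bytes instead of
// dereferencing a possibly misaligned uint32_t*.
inline void write_u32_unaligned(uint8_t* buf, std::size_t off, uint32_t v) {
    std::memcpy(buf + off, &v, sizeof(v));
}

// The matching well-defined unaligned read.
inline uint32_t read_u32_unaligned(const uint8_t* buf, std::size_t off) {
    uint32_t v;
    std::memcpy(&v, buf + off, sizeof(v));
    return v;
}
```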
Takuya ASADA
fd7eb7d1e5 dist: add support scylla_io_setup for Ubuntu
Unlike CentOS/Fedora, scylla_io_setup is called from the pre-start section of the scylla-server upstart job, not from a separate job.
This is because Upstart does not provide the same behavior as the After / Requires directives of systemd.

Fixes #954.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456825805-4195-1-git-send-email-syuu@scylladb.com>
2016-03-01 12:56:44 +02:00
Avi Kivity
e295b9b4e4 Merge 2016-03-01 09:52:04 +02:00
Amnon Heiman
1c7bc28d35 idl-compiler: change optional vector implementation
This patch changes the way optional vectors are implemented.

Now a vector of optionals is handled like any other non-primitive
type, with a single add() method that returns a writer to the
optional.

The writer to the optional has skip and write methods, like a
simple optional field.

For basic types the write method takes the value as a parameter; for
composite types, it returns a writer to the type.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1456796143-3366-2-git-send-email-amnon@scylladb.com>
2016-03-01 09:41:30 +02:00
Raphael S. Carvalho
34ed930aa4 sstables: fix lack of accuracy in disk usage report
To report disk usage, scylla was only taking into account the size of
the sstable data component. Other components such as index and filter
may be relatively big too. Therefore, 'nodetool status' would
report an inaccurate disk usage. That can be fixed by taking into
account the size of all sstable components.

Fixes #943.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <08453585223570006ac4d25fe5fb909ad6c140a5.1456762244.git.raphaelsc@scylladb.com>
2016-03-01 08:58:42 +02:00
Paweł Dziepak
e194835d8a tests/idl: add test for stdx::optional<> serialization
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1456761055-23916-1-git-send-email-pdziepak@scylladb.com>
2016-02-29 18:12:59 +02:00
Paweł Dziepak
dec63eac6e commitlog: add commitlog entry move constructor
Default move constructor and assignment didn't handle reference to
mutation (_mutation) properly.

Fixes #935.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1456760905-23478-1-git-send-email-pdziepak@scylladb.com>
2016-02-29 18:10:15 +02:00
Calle Wilund
0de8f6d24f cql_test_env: Shutdown auth on test stop
Ensures no spurious timer tasks try to touch stopped distributed
objects.

Message-Id: <1456753987-6914-4-git-send-email-calle@scylladb.com>
2016-02-29 16:06:33 +02:00
Calle Wilund
fafcb8cc1e storage_service: Explicitly shutdown "auth" on system drain
I.e. cancels auth object setup tasks if not already run.

Message-Id: <1456753987-6914-3-git-send-email-calle@scylladb.com>
2016-02-29 16:06:30 +02:00
Calle Wilund
2ba738b555 auth: make scheduled tasks explicitly cancellable
Adds a shutdown method which explicitly cancels all waiting tasks
(both of them!).

Message-Id: <1456753987-6914-2-git-send-email-calle@scylladb.com>
2016-02-29 16:06:25 +02:00
Calle Wilund
dc136a6a1c commitlog: Fix reserve counter overflow
Fixes #482

See code comment. The reserve segment allocation count sum can temporarily
overflow due to continuation delay/reordering, if we manage to reach the
on_timer code before the finally clauses from a previous reserve allocation
invocation have run. However, since these are benign overflows
(just indicating even more that we don't need to do anything right now),
simply capping the count should be fine.
Avoids an assert in boost irange.

Message-Id: <1456740679-4537-1-git-send-email-calle@scylladb.com>
2016-02-29 14:56:24 +02:00
Avi Kivity
5cc1b39cc9 Merge "Store gossip generation in system table" from Asias
"Kill one FIXME."
2016-02-29 14:53:06 +02:00
Avi Kivity
0bababedc3 Merge "Fix scylla-io-setup.service" from Takuya
"This patchset fixes #950, runs scylla-io-setup before scylla-server in any case, and installs an example /etc/scylla.d/io.conf by default to prevent an error on 'EnvironmentFile=/etc/scylla.d/*.conf'."
2016-02-29 14:14:42 +02:00
Takuya ASADA
6e55ed96d6 dist: add user-defined prefix for AMI name
With this change, you can define your own prefix of AMI name in variable.json.

example:
{
	"access_key": "xxx",
	"secret_key": "xxx",
	"subnet_id": "xxx",
	"security_group_id": "xxx",
	"region": "us-east-1",
	"associate_public_ip_address": "true",
	"instance_type": "c4.xlarge",
	"ami_prefix": "takuya-"
}

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456329247-5109-1-git-send-email-syuu@scylladb.com>
2016-02-29 13:49:11 +02:00
Takuya ASADA
48d72c01d1 dist: don't run iotune in developer mode 2016-02-29 20:13:46 +09:00
Takuya ASADA
e8a107de43 dist: install example io.conf by default
Prevents an error on 'EnvironmentFile=/etc/scylla.d/*.conf'.
Parameters are commented out, and the file will be replaced when scylla starts by scylla-io-setup.service.
2016-02-29 20:12:54 +09:00
Avi Kivity
69fdbf6a6e Merge "Use IDL for query results" from Tomek
"The series includes Amnon's unmerged support for optional<> in idl-compiler.

Depends on seastar patch "[PATCH seastar] simple_input_stream: Introduce begin()".

The query result footprint for cassandra-stress mutation as reported
by tests/memory-footprint increased by 18% from 285 B to 337 B.

perf_simple_query shows slight regression in throughput (-8%):

  build/release/tests/perf/perf_simple_query -c4 -m1G --partitions 100000

Before: ~433k tps
After:  ~400k tps"
2016-02-29 12:52:44 +02:00
Takuya ASADA
a281b10210 dist: run scylla-io-setup.service before scylla-server.service in any case
Previously, "systemctl enable scylla-io-setup.service; systemctl start scylla-server.service" was needed to launch scylla-io-setup.service before scylla-server.service.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-29 18:59:10 +09:00
Takuya ASADA
11a616d4d6 dist: add scylla-io-setup.service to %systemd_post and %systemd_preun in the .rpm
This is needed to initialize the service correctly

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-29 18:59:10 +09:00
Pekka Enberg
3919878a32 service/storage_service: Use logger for CQL listening report
Message-Id: <1456739417-11909-1-git-send-email-penberg@scylladb.com>
2016-02-29 11:52:06 +02:00
Avi Kivity
a1ff21f6ea main: sanity check cpu support
We require SSE 4.2 (for commitlog CRC32), verify it exists early and bail
out if it does not.

We need to check early, because the compiler may use newer instructions
in the generated code; the earlier we check, the lower the probability
we hit an undefined opcode exception.

Message-Id: <1456665401-18252-1-git-send-email-avi@scylladb.com>
2016-02-29 11:41:54 +02:00
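One way to perform such an early feature check with gcc/clang is the `__builtin_cpu_supports` builtin; whether scylla's actual check uses this builtin or raw cpuid is not stated above, so treat this sketch as an assumption:

```cpp
// Early sanity check for required CPU features -- here SSE 4.2, which the
// commitlog CRC32 path needs. On non-x86 targets the check trivially passes.
inline bool cpu_supports_required_features() {
#if defined(__x86_64__) || defined(__i386__)
    return __builtin_cpu_supports("sse4.2");
#else
    return true;
#endif
}
```

Calling this as the first thing in main() minimizes the window in which compiler-generated newer instructions could trap with an undefined-opcode exception.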
Asias He
cc1e1a567c storage_service: Make replace-node error msg more friendly
Before:

ERROR [shard 0] storage_service - Format of host-id =
marshal_exception (marshalling error) is incorrect ???
Exiting on unhandled exception of type 'marshal_exception': marshalling error

After:

ERROR [shard 0] storage_service - Unable to parse 127.0.0.3 as host-id
Exiting on unhandled exception of type 'std::runtime_error': Unable to
parse 127.0.0.3 as host-id

Message-Id: <1456737987-32353-1-git-send-email-asias@scylladb.com>
2016-02-29 11:40:13 +02:00
Asias He
e36a99ef23 storage_service: Do not take api lock for get_load_map
It is used by

nodetool status

If an API operation inside storage_service takes a long time to finish
while holding the lock, it will block nodetool status for a long time.

I think it is safe to get the load map even if other operations are in-flight.

Refs: #850

Message-Id: <1456737987-32353-2-git-send-email-asias@scylladb.com>
2016-02-29 11:39:09 +02:00
Takuya ASADA
84447fd7b0 dist: fix permission error in scylla_io_setup
Can't run as the scylla user since we are writing to /etc

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456330209-5828-1-git-send-email-syuu@scylladb.com>
2016-02-29 11:35:38 +02:00
Asias He
1061bf0854 storage_service: Use increment_and_get_generation to get generation
The gossip generation is now stored in system.local table.

Before:
cqlsh> SELECT gossip_generation from system.local;

 gossip_generation
-------------------
              null

(1 rows)

After:
cqlsh> SELECT gossip_generation from system.local;

 gossip_generation
-------------------
        1456733559

(1 rows)
2016-02-29 16:31:42 +08:00
Asias He
abafec99a5 system_keyspace: Implement increment_and_get_generation 2016-02-29 16:31:42 +08:00
Gleb Natapov
22d2b9a2dc Yield execution in mutation_result_merger
mutation_result_merger::get can run for a long time. Make it yield
execution from time to time.

Message-Id: <1456674046-14502-1-git-send-email-gleb@scylladb.com>
2016-02-28 17:55:33 +02:00
Avi Kivity
182e6eb89b Merge seastar upstream
* seastar fbb4b01...9964cbf (4):
  > Allow map_reduce reducer to return future
  > Workaround for gcc 4.9 optional bug
  > add convert() to future<> futurizer specification
  > tests: fix rpc_test build
2016-02-28 17:55:03 +02:00
Gleb Natapov
32e9f1ecd4 Fix read_timeouts storage_proxy counter
Read timeouts are not counted now. The patch fixes it.

Message-Id: <20160228133315.GN6705@scylladb.com>
2016-02-28 15:34:42 +02:00
Avi Kivity
31b42a2574 Merge seastar upstream
* seastar 769cb8b...fbb4b01 (5):
  > simple_input_stream: Introduce begin()
  > tests: add rpc unit testing
  > tests: add loopback sockets
  > packet: introduce release() and release_into()
  > temporary_buffer: add make_copy() named constructor
2016-02-28 15:32:38 +02:00
Yoav Kleinberger
651aa06c32 tools/scyllatop: fix mistake in help message
Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <ae5f11d7df954dc7561db94cda2d73bd8233f1e5.1456510513.git.yoav@scylladb.com>
2016-02-28 12:42:12 +02:00
Avi Kivity
ed365c2779 Merge "Fix row_cache::update()" from Tomasz
"Fixes recent regression in row_cache_test.cc:test_update_failure"
2016-02-28 11:17:32 +02:00
Pekka Enberg
81bc5dab77 Merge "streaming progress info fix" from Asias
"This series:

1) Log total bytes sent/received when a stream plan completes.
It is useful in test code.

2) Fix http://scylla_ip:10000/stream_manager API"
2016-02-27 16:13:04 +02:00
Tomasz Grabiec
3997421b2c row_cache: Let the cleanup guard do invalidation of unmerged partitions 2016-02-26 16:57:31 +01:00
Tomasz Grabiec
aa15268249 row_cache: Delete the entry even if invalidation failed
Otherwise we will leak it, and region destructor will fail:

row_cache_test: utils/logalloc.cc:1211: virtual logalloc::region_impl::~region_impl(): Assertion `seg->is_empty()' failed.

Fixes regression in row_cache_test.
2016-02-26 16:57:31 +01:00
Tomasz Grabiec
be24816c8a row_cache: Clear partitions with region locked
Since invalidate() may allocate, we need to take the region lock to
keep m.partitions references valid around the whole clear_and_dispose()
call, which relies on that.
2016-02-26 16:57:31 +01:00
Tomasz Grabiec
6cec131432 query: Switch to IDL-generated views and writers
The query result footprint for cassandra-stress mutation as reported
by tests/memory-footprint increased by 18% from 285 B to 337 B.

perf_simple_query shows slight regression in throughput (-8%):

  build/release/tests/perf/perf_simple_query -c4 -m1G --partitions 100000

Before: ~433k tps
After:  ~400k tps
2016-02-26 12:26:13 +01:00
Tomasz Grabiec
ee8509cf36 idl-compiler: Introduce add(*_view) on vector 2016-02-26 12:26:13 +01:00
Tomasz Grabiec
1ecf9a7427 query: result_view: Introduce do_with()
Encapsulates linearization. Abstracts away the fact that result_view
can't work with discontiguous storage yet.
2016-02-26 12:26:13 +01:00
Tomasz Grabiec
135c1fa306 tests: memory_footprint: Report size in query results 2016-02-26 12:26:13 +01:00
Tomasz Grabiec
6c89e3d2ea serializer: Fix wrong size_type being serialized into the placeholder 2016-02-26 12:26:13 +01:00
Tomasz Grabiec
4ab0ca07f1 idl-compiler: Catch un-closed frame errors sooner
By initializing them to 0 we can catch unclosed frames at
deserialization time. It's better than leaving the frame size undefined,
which may cause errors much later in the deserialization process and thus
would make it harder to identify the real cause.
2016-02-26 12:26:13 +01:00
Tomasz Grabiec
697d9bfa56 serializer: Introduce as_input_stream(bytes_view) 2016-02-26 12:26:13 +01:00
Tomasz Grabiec
85fb4eba32 Add missing includes 2016-02-26 12:26:13 +01:00
Tomasz Grabiec
4284715ddf Relax includes 2016-02-26 12:26:13 +01:00
Amnon Heiman
9ea3ffe527 idl-compiler: Add optional support
This patch adds optional writer support: an optional field can be either
skipped or set.

For a vector of optionals, a write_empty method will
add 1 to the vector count and mark the optional as false.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-26 12:25:08 +01:00
Asias He
fd5f3cff47 streaming: Fix stream_manager progress api
For each stream_session, we pretend we are sending/receiving one file,
to make it compatible with nodetool. For receiving_files, the file name
is "rxnofile". For sending_files, the file name is "txnofile".

stream_manager::update_all_progress_info is introduced to update the
progress info of all the stream_sessions in the node. We need this
because streaming mutations are received on all the cores, but the
stream_session object is only on one of the cores. It would add overhead
to update the progress info in the stream_session object whenever we
receive a streaming mutation. So instead, we update the progress info
in the stream_session object only when we really need it.

With http://127.0.0.$i:10000/stream_manager/, it looks like below when
decommission node 3 in a 3 nodes cluster.

=========== GET NODE 1
[{"plan_id": "935a2cc0-dc6b-11e5-bdbf-000000000000", "description":
"Unbootstrap", "sessions": [{"receiving_files": [{"value": {"direction":
"IN", "file_name": "rxnofile", "session_index": 0, "total_bytes":
16876296, "peer": "127.0.0.3", "current_bytes": 16876296}, "key":
"rxnofile"}], "receiving_summaries": [{"files": 1, "total_size": 0,
"cf_id": "869d8630-dc6b-11e5-bdbf-000000000000"}], "session_index": 0,
"state": "PREPARING", "connecting": "127.0.0.3", "peer": "127.0.0.3"}]}]

=========== GET NODE 2

[{"plan_id": "935a2cc0-dc6b-11e5-bdbf-000000000000", "description":
"Unbootstrap", "sessions": [{"receiving_files": [{"value": {"direction":
"IN", "file_name": "rxnofile", "session_index": 0, "total_bytes":
16755552, "peer": "127.0.0.3", "current_bytes": 16755552}, "key":
"rxnofile"}], "receiving_summaries": [{"files": 1, "total_size": 0,
"cf_id": "869d8630-dc6b-11e5-bdbf-000000000000"}], "session_index": 0,
"state": "PREPARING", "connecting": "127.0.0.3", "peer": "127.0.0.3"}]}]

=========== GET NODE 3
[{"plan_id": "935a2cc0-dc6b-11e5-bdbf-000000000000", "description":
"Unbootstrap", "sessions": [{"sending_files": [{"value": {"direction":
"OUT", "file_name": "txnofile", "session_index": 0, "total_bytes":
16876296, "peer": "127.0.0.1", "current_bytes": 16876296}, "key":
"txnofile"}], "sending_summaries": [{"files": 1, "total_size": 0,
"cf_id": "869d8630-dc6b-11e5-bdbf-000000000000"}], "session_index": 0,
"state": "PREPARING", "connecting": "127.0.0.1", "peer":
"127.0.0.1"},{"sending_files": [{"value": {"direction": "OUT",
"file_name": "txnofile", "session_index": 0, "total_bytes": 16755552,
"peer": "127.0.0.2", "current_bytes": 16755552}, "key": "txnofile"}],
"sending_summaries": [{"files": 1, "total_size": 0, "cf_id":
"869d8630-dc6b-11e5-bdbf-000000000000"}], "session_index": 0, "state":
"PREPARING", "connecting": "127.0.0.2", "peer": "127.0.0.2"}]}]
2016-02-26 17:38:37 +08:00
Asias He
37f52d632f streaming: Remove unused progress() function 2016-02-26 17:38:37 +08:00
Asias He
8060b97d67 streaming: Log number of bytes sent and received when stream_plan completes
It is useful for test code to verify the number of bytes sent/received.

It looks like below in the log.

/tmp/out1:INFO  [shard 0] stream_session - \
[Stream #1f3e23f0-db9e-11e5-9cfb-000000000000] bytes_sent = 0, bytes_received = 15760704

/tmp/out2:INFO  [shard 0] stream_session - \
[Stream #1f3e23f0-db9e-11e5-9cfb-000000000000] bytes_sent = 0, bytes_received = 18203964

/tmp/out3:INFO  [shard 0] stream_session - \
[Stream #1f3e23f0-db9e-11e5-9cfb-000000000000] bytes_sent = 33964668, bytes_received = 0
2016-02-26 17:38:37 +08:00
Asias He
9dede89e07 streaming: Add get_progress_on_all_shards for plan_id
Get stream_bytes for a specific plan_id.
2016-02-26 17:38:37 +08:00
Tomasz Grabiec
97558b2cfe idl-compiler: Put serializers inside template class specializations
The problem is that generic functions (e.g. skip()) which call
deserialize() overloads based on their template parameter only see
deserialize() overloads which were declared at the time skip() was
declared, and not those which are available at the time of
instantiation. This forces all serializers to be declared before
serialization_visitors.hh is first included. Serializers included
later will fail to compile. This becomes problematic to ensure when
serializers are included from headers.

Template class specialization lookup doesn't suffer from this
limitation. We can use that to solve the problem. The IDL compiler
will now generate template class specializations with read/write
static methods. In addition to that, default serializer() and
deserialize() implementations are delegating to serializer<>
specialization so that API and existing code doesn't have to change.

Message-Id: <1456423066-6979-1-git-send-email-tgrabiec@scylladb.com>
2016-02-25 20:00:49 +02:00
Takuya ASADA
aa3f6ad462 dist: add scyllatop on .rpm/.deb
Fixes #933

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456420768-15921-1-git-send-email-syuu@scylladb.com>
2016-02-25 19:24:11 +02:00
Avi Kivity
a74f68eeb2 Merge "Properly tag readers" from Glauber
"Gleb has recently noted that our query reads are not even being registered
with the I/O queue.

Investigating what is happening, I found out that the priority that
make_reader receives was not being properly passed downwards to the SSTable
reader. The reader code is also used by the compaction class, and that one is fine.
But the CQL reads are not.

On top of that, there are also some other places where the tag was not properly
propagated, and those are patched."
2016-02-25 18:35:58 +02:00
Raphael S. Carvalho
fc4cbcde72 Revert "Revert "database: Fix use and assumptions about pending compactions""
This reverts commit a4d92750eb.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <8a405e7c1daf94c4d70d8084f59ce7205d56fe52.1456415398.git.raphaelsc@scylladb.com>
2016-02-25 18:02:01 +02:00
Raphael S. Carvalho
7f0371129c tests: sstable_test: submit compaction request through column family
That's needed for reverted commit 9586793c to work. It's also the
correct thing to do, i.e. column family submits itself to manager.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <2a1d141ad929c1957933f57412083dd52af0390b.1456415398.git.raphaelsc@scylladb.com>
2016-02-25 18:02:00 +02:00
Avi Kivity
c269527f42 Merge "Get rid of assert in gossip and storage_service" from Asias
"Make the error handling more robust."
2016-02-25 17:38:21 +02:00
Pekka Enberg
a4d92750eb Revert "database: Fix use and assumptions about pending compactions"
This reverts commit 9586793c70. It breaks
sstable_test as follows:

  [penberg@nero scylla]$ build/release/tests/sstable_test --smp 1
  Running 81 test cases...
  INFO  [shard 0] compaction_manager - Asked to stop
  INFO  [shard 0] compaction_manager - Stopped
  sstable_test: database.cc:878: future<> column_family::run_compaction(sstables::compaction_descriptor): Assertion `_stats.pending_compactions > 0' failed.
  unknown location(0): fatal error in "compaction_manager_test": signal: SIGABRT (application abort requested)
  tests/sstable_datafile_test.cc(1023): last checkpoint
2016-02-25 15:28:06 +02:00
Asias He
32eaaecf36 gossip: Get rid of assert
Log the error and throw the exception, instead of aborting the whole
process. This makes the code more robust.
2016-02-25 21:19:52 +08:00
Asias He
699fd25467 storage_service: Get rid of assert
We can recover from most of the errors. Log the error and throw the
exception, instead of aborting the whole process. This makes the code
more robust.
2016-02-25 21:19:52 +08:00
Asias He
59564591d5 storage_service: Use get_gossip_status to get status
The helper was introduced recently; use it instead of open-coding the logic.
2016-02-25 21:19:52 +08:00
Pekka Enberg
8e2c924de3 cql3: Fix quadratic behavior in update_statement::parsed_insert::prepare_internal()
This fixes a quadratic search for duplicate columns in prepare_internal().

Refs #822.

Message-Id: <1456405104-16482-1-git-send-email-penberg@scylladb.com>
2016-02-25 15:06:56 +02:00
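A generic sketch of the shape of such a fix (hypothetical names, not the actual prepare_internal() code): replace the per-column rescan of all previously seen columns with a hash-set membership test, turning the duplicate check from O(n²) into O(n):

```cpp
#include <string>
#include <unordered_set>
#include <vector>

// Linear-time duplicate detection: each name is checked against a hash set
// of names seen so far, instead of rescanning the whole prefix for every
// new name (the quadratic version).
inline bool find_duplicate(const std::vector<std::string>& names,
                           std::string& dup) {
    std::unordered_set<std::string> seen;
    for (const auto& n : names) {
        if (!seen.insert(n).second) {  // insert fails => already present
            dup = n;
            return true;
        }
    }
    return false;
}
```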
Yoav Kleinberger
872079d999 tools/scyllatop: correct mistake in help text
Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <01844d90f2d942a051d128b03ae12578ac0bb69c.1456324697.git.yoav@scylladb.com>
2016-02-25 12:49:48 +02:00
Asias He
94cb7f22d4 gossip: Make add_local_application_state safe to call on any cpu
add_local_application_state is used in various places. Before this
patch, it could only be called on cpu zero. To make it safer to use, use
invoke_on() to forward the code to run on cpu zero, so that the caller can
call it on any cpu.

Refs: #795
Message-Id: <d69b81c5561622078dbe887d87209c4ea2e3bf46.1456315043.git.asias@scylladb.com>
2016-02-25 12:45:54 +02:00
Asias He
4e931c2453 gossip: Log the error when fails to add local application state
Gleb saw once:

scylla: gms/gossiper.cc:1393:
gms::gossiper::add_local_application_state(gms::application_state,
gms::versioned_value):: mutable: Assertion
`endpoint_state_map.count(ep_addr)' failed.

The assert means we could not find the entry for the node itself in
endpoint_state_map. I cannot really find any place where we could call
add_local_application_state before we call gossiper::start_gossiping(),
which inserts the broadcast address into endpoint_state_map.

I cannot reproduce the issue, so let's log the error so we can narrow down
which application state triggered the assert.

Refs: #795
Message-Id: <f4433be0a0d4f23470a5e24e528afdb67b74c7ef.1456315043.git.asias@scylladb.com>
2016-02-25 12:45:17 +02:00
Takuya ASADA
b250a3b116 dist: Add collectd configuration support on .rpm/.deb
Depends on collectd; adds /etc/collectd.d/scylla.conf on scylla-server package installation.
Fixes #946

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456336200-11876-1-git-send-email-syuu@scylladb.com>
2016-02-25 10:35:47 +02:00
Takuya ASADA
28dd202613 scyllatop: add --logfile argument to specify path to log file
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456333116-7389-2-git-send-email-syuu@scylladb.com>
2016-02-25 10:33:41 +02:00
Takuya ASADA
af3a8ead21 scyllatop: output error message both on log file and stdout
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1456333116-7389-1-git-send-email-syuu@scylladb.com>
2016-02-25 10:33:40 +02:00
Calle Wilund
9586793c70 database: Fix use and assumptions about pending compactions
Fixes #934 - faulty assert in discard_sstables

run_with_compaction_disabled clears a CF out of the compaction
manager queue. discard_sstables wants to assert on this, but looks
at the wrong counters.

pending_compactions is an indicator of how much interested parties
want a CF compacted (again and again). It should not be considered
an indicator of compactions actually being done.

This modifies the usage slightly so that:
1.) The counter is always incremented, even if compaction is disallowed.
    The counter's value at the end of run_with_compaction_disabled is then
    used as an indicator of whether a compaction should be
    re-triggered. (If compactions finished, it will be zero.)
2.) Document the use and purpose of the pending counter, and add
    method to re-add CF to compaction for r_w_c_d above.
3.) discard_sstables now asserts on the right things.

Message-Id: <1456332824-23349-1-git-send-email-calle@scylladb.com>
2016-02-25 08:57:04 +02:00
Glauber Costa
6f1d0dce00 mutation_query: attach the query priority read when reading mutations
We call a mutation source during the query path without any consideration
for attaching a priority. This is incorrect, and queries called through this
facility will end up in the default class.

Fix this by attaching the query priority class here.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-02-24 18:00:34 -05:00
Glauber Costa
336babfcb8 database: add a priority class to a few SSTable readers
Not all SSTable readers end up getting the right tag for a priority
class. In particular, the range reader, also used for the memtables,
completely ignores any priority class.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-02-24 18:00:34 -05:00
Glauber Costa
2816bc6fed database: use a reference instead of a pointer to store the priority classes
We will always initialize it, so don't use a pointer.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-02-24 18:00:34 -05:00
Glauber Costa
80ab41a715 memtable reader: also include a priority class
There are situations when a memtable is already flushed but the memtable
reader will continue to be in place, relaying reads to the underlying
table.

For that reason, the "memtables don't need a priority class" argument
is clearly broken. We need to pass a priority class to their reader
as well.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2016-02-24 18:00:34 -05:00
Calle Wilund
590ec1674b truncate: Require timestamp join-function to ensure equal values
Fixes #937

In fixing #884 (truncation not truncating memtables properly),
timestamping in truncate was made shard-local. This, however,
breaks the snapshot logic, since for all shards in a truncate
the sstables should snapshot to the same location.

This patch adds a required function argument to truncate (and,
by extension, drop_column_family) that produces a timestamp in
a "join" fashion (i.e. the same on all shards), and uses the
joinpoint type in the caller to do so.

Message-Id: <1456332856-23395-2-git-send-email-calle@scylladb.com>
2016-02-24 18:59:31 +02:00
Calle Wilund
43ea1f5945 utils::jointpoint: Helper type to generate a singular value for all shards
Lets operations running on all shards "join" and acquire
the same value of something, with that value generated
when all shards reach the join.

Obvious use case: a timestamp taken after one set of per-shard ops,
but before the final ones.

The generation of the value is guaranteed to happen on the shards
that created the join point.

Based on the join-ops in CF::snapshot, but abstracted and made
caller responsibility. Primary use case is to help deal with
the join-problem of truncation.

Message-Id: <1456332856-23395-1-git-send-email-calle@scylladb.com>
2016-02-24 18:59:25 +02:00
Yoav Kleinberger
c3ce9e53cb tools/scyllatop: support glob patterns to specify metrics
Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <42f84cdeeb75c3719230028a13a1dd8499673d4c.1456319441.git.yoav@scylladb.com>
2016-02-24 15:35:45 +02:00
Raphael S. Carvalho
bb48f1b06c sstables: use system clock's epoch for timestamp in compaction history
As pointed out by Tomek, the type of column used is timestamp, therefore
system's clock epoch (db_clock) should be used instead.

Fixes #817.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <f80f9f411d673cf2d653e193ccb8ebaa36bc891b.1456317766.git.raphaelsc@scylladb.com>
2016-02-24 14:49:21 +02:00
Pekka Enberg
dfcc48d82a transport: Add result metadata to PREPARED message
The gocql driver assumes that there's a result metadata section in the
PREPARED message. Technically, Scylla is not at fault here as the CQL
specification explicitly states in Section 4.2.5.4. ("Prepared") that the
section may be empty:

   - <result_metadata> is defined exactly as <metadata> but correspond to the
      metadata for the resultSet that execute this query will yield. Note that
      <result_metadata> may be empty (have the No_metadata flag and 0 columns, See
      section 4.2.5.2) and will be for any query that is not a Select. There is
      in fact never a guarantee that this will non-empty so client should protect
      themselves accordingly. The presence of this information is an

However, Cassandra always populates the section, so let's do that as well.

Fixes #912.

Message-Id: <1456317082-31688-1-git-send-email-penberg@scylladb.com>
2016-02-24 14:43:24 +02:00
Avi Kivity
fedba9d6cd Merge "reduce gossip round latency" from Asias
"This series makes gossip message handling async to reduce gossip round
latency. The commit log of patch 3 explains the issue in detail.

Refs: #900"
2016-02-24 13:44:06 +02:00
Avi Kivity
b42a32efc7 Update scylla-ami submodule
* dist/ami/files/scylla-ami 398b1aa...d4a0e18 (3):
  > Sort service running order (scylla-ami-setup.service -> scylla-io-setup.service -> scylla-server.service)
  > Drop --ami and --disk-count parameters
  > dist: pass the number of disks to set io params
2016-02-24 13:38:05 +02:00
Avi Kivity
cda29c0324 Merge seastar upstream
* seastar 8c560f2...769cb8b (4):
  > temporary_buffer: make operator bool explicit (and const)
  > iotune: use SEASTAR_IO instead of SCYLLA_IO
  > iotune: add --format option, to use EnvironmentFile on systemd
  > sstring: add data() methods
2016-02-24 13:38:05 +02:00
Avi Kivity
efabb1a1d8 commitlog: fix buffer size calculation
We were adding bool(buffer), instead of buffer.size(); exposed by making
temporary_buffer::operator bool explicit.
2016-02-24 13:38:05 +02:00
Asias He
697b16414a gossip: Make gossip message handling async
In each gossip round, i.e., gossiper::run(), we do:

1) send syn message
2)                           peer node: receive syn message, send back ack message
3) process ack message in handle_ack_msg
   apply_state_locally
     mark_alive
       send_gossip_echo
     handle_major_state_change
       on_restart
       mark_alive
         send_gossip_echo
       mark_dead
         on_dead
       on_join
     apply_new_states
       do_on_change_notifications
          on_change
4) send back ack2 message
5)                            peer node: process ack2 message
   			      apply_state_locally

At the moment, syn is a "wait" message; it times out in 3 seconds. In step
3, all the registered gossip callbacks are called, which might take a
significant amount of time to complete.

In order to reduce the gossip round latency, we make syn "no-wait" and
do not run handle_ack_msg inside gossip::run(). As a result, we
will no longer get an ack message as the return value of a syn message,
so a GOSSIP_DIGEST_ACK message verb is introduced.

With this patch, the gossip message exchange is now async. This is useful
when some nodes in the cluster are down: we will no longer delay the gossip
round, which is supposed to run every second, by 3*n seconds (n = 1-3,
since we talk to 1-3 peer nodes in each gossip round) or even
longer (considering the time to run gossip callbacks).

Later, we can talk to the 1-3 peer nodes in parallel to reduce
latency even more.

Refs: #900
2016-02-24 19:33:39 +08:00
Asias He
63df54b368 messaging_service: Add GOSSIP_DIGEST_ACK
We will soon switch to use no-wait message for gossip. GOSSIP_DIGEST_SYN
will no longer return GOSSIP_DIGEST_ACK message. So we need a standalone
verb for GOSSIP_DIGEST_ACK.
2016-02-24 19:31:14 +08:00
Asias He
022c7e50a1 failure_detector: Fix false alarm of "Not marking nodes down due to local pause of"
The problem is that we initialize _last_interpret when the failure_detector
object is constructed. When interpret() runs for the first time, the
_last_interpret value is not the last time we ran interpret() but the
time we initialized the failure_detector object.

Fix by initializing _last_interpret inside interpret().

[Thu Feb 18 02:40:04 2016] INFO  [shard 0] storage_service - Node 127.0.0.1 state jump to normal
[Thu Feb 18 02:40:04 2016] INFO  [shard 0] storage_service - NORMAL: node is now in normal status
[Thu Feb 18 02:40:04 2016] INFO  [shard 0] gossip - Waiting for gossip to settle before accepting client requests...
[Thu Feb 18 02:40:12 2016] INFO  [shard 0] gossip - No gossip backlog; proceeding
Starting listening for CQL clients on 127.0.0.1:9042...
[Thu Feb 18 02:40:12 2016] INFO  [shard 0] gossip - Node 127.0.0.2 is now part of the cluster
[Thu Feb 18 02:40:12 2016] INFO  [shard 0] gossip - InetAddress 127.0.0.2 is now UP
[Thu Feb 18 02:40:13 2016] INFO  [shard 0] gossip - do_gossip_to_live_member: Favor newly added node 127.0.0.2
[Thu Feb 18 02:40:13 2016] WARN  [shard 0] failure_detector - Not marking nodes down due to local pause of 9091 > 5000 (milliseconds)
2016-02-24 19:31:14 +08:00
Avi Kivity
e993102cb5 Merge "introduce scylla-io-setup.service" from Takuya
"Add scylla-io-setup.service to configure max-io-requests and num-io-queues on first boot.
Moved the SCYLLA_IO configuration code from scylla_sysconfig_setup to scylla-io-setup.service, and reverted the commits related to it.
In scylla-io-setup.service, autodetect Amazon EC2 instead of using the AMI variable in sysconfig."
2016-02-24 10:13:23 +02:00
Takuya ASADA
c4035a0a13 dist: add comment about /etc/scylla.d/io.conf on sysconfig 2016-02-24 04:00:52 +09:00
Takuya ASADA
0f20abb365 Revert "dist: introduce SCYLLA_IO"
This reverts commit 5cae2560a3.

Conflicts:
	dist/common/sysconfig/scylla-server
2016-02-24 03:46:14 +09:00
Takuya ASADA
b79a1a77da Revert "dist: update SCYLLA_IO with params for AMI"
This reverts commit 5494135ddd.

Conflicts:
	dist/common/scripts/scylla_sysconfig_setup
2016-02-24 03:45:11 +09:00
Takuya ASADA
643beefc8c Revert "Revert "dist: remove AMI entry from sysconfig, since there is no script refering it""
This reverts commit 21e6720988.
2016-02-24 03:33:50 +09:00
Takuya ASADA
66c5feb9e9 Revert "dist: align ami option with others (-a --> --ami)"
This reverts commit 312f1c9d98.
2016-02-24 03:33:41 +09:00
Takuya ASADA
a9926f1cea dist: introduce scylla-io-setup.service to setup io parameters on first startup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-24 03:33:03 +09:00
Tomasz Grabiec
79bcb5a616 tests: Fix build of memory_footprint 2016-02-23 19:12:54 +01:00
Amnon Heiman
f461ebc411 idl-compiler: Add pos and rollback to serialize vector
This adds the ability to store a position of a serialized vector and to
rollback to that stored position afterwards.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1456041750-1505-3-git-send-email-amnon@scylladb.com>
2016-02-23 17:49:51 +01:00
Amnon Heiman
ea97e07ed7 serialization_visitors: Adding vector_position struct
While serializing a vector it is sometimes necessary to roll back some of
the serialized elements.

vector_position is the equivalent of the bytes_ostream position struct.
It holds information about the current position in a serialized vector:
the position in the buffer and the number of elements serialized
so far.

It allows rolling back to a stored point.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1456041750-1505-2-git-send-email-amnon@scylladb.com>
2016-02-23 17:49:51 +01:00
Tomasz Grabiec
f72fd9eefd Merge branch 'pdziepak/canonical-mutation-idl/v1' from sesastar-dev.git 2016-02-23 17:02:43 +01:00
Tomasz Grabiec
995b638d96 mutation_partition_visitor: Fix crash for large blobs
Fixes #927.

The new visiting code builds cell instances using
atomic_cell::make_*() factory methods, which won't work in LSA context
because they depend on managed_bytes storage to be linearized. It may
not be since large blob support. This worked before because we created
cells from views before which works in all contexts.

Fix by constructing them in standard allocator context.

Message-Id: <1456234064-13608-2-git-send-email-tgrabiec@scylladb.com>
2016-02-23 16:41:39 +02:00
Tomasz Grabiec
33cf65c2aa mutation_partition_view: Fix use-after-move on visitor instance
The line:

  boost::apply_visitor(atomic_cell_or_collection_visitor(std::move(visitor), id, col), cell);

is executed in a loop, so the visitor could be used after being
moved from. This may not always be allowed for some visitors. Also,
visitors may keep state, which should be preserved for the whole
visitation.

This doesn't fix any issue right now.

Message-Id: <1456234064-13608-1-git-send-email-tgrabiec@scylladb.com>
2016-02-23 16:41:39 +02:00
Yoav Kleinberger
f822359d96 bugfix: fixed broken --print-config option
Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <57b452106cdcd9ceb09da4c63781650cefe48040.1456234464.git.yoav@scylladb.com>
2016-02-23 15:35:44 +02:00
Asias He
f7fccc6efb locator: Fix get token from a range<token>
With a range{t1, t2}, if t2 == {}, then range.end() will contain no
value. Fix getting t2 in this case.

Fixes #911.
Message-Id: <4462e499d706d275c03b116c4645e8aaee7821e1.1456128310.git.asias@scylladb.com>
2016-02-23 14:29:26 +01:00
Pekka Enberg
4a4074ad21 tools/scyllatop: Sort metrics by name
This makes the output much easier to read, especially if you have tons
of metrics specified.

Message-Id: <1456230377-3149-1-git-send-email-penberg@scylladb.com>
2016-02-23 14:35:57 +02:00
Takuya ASADA
0f87922aa6 main: notify service start completion earlier, to reduce systemd unit startup time
Fixes #910

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455830245-11782-1-git-send-email-syuu@scylladb.com>
2016-02-23 14:33:16 +02:00
Pekka Enberg
1f6cac8839 tools/scyllatop: Use 'erase' to clear the screen
The 'clear' function explicitly clears the screen and repaints it which
causes really annoying flicker. Use 'erase' to make scyllatop more
pleasant on the eyes.

Message-Id: <1456229348-2194-1-git-send-email-penberg@scylladb.com>
2016-02-23 14:12:48 +02:00
Tomasz Grabiec
2b5253927f test.py: Print output on timeout as well
It is often the case that there is useful debugging information
printed by the test before it hangs. It is annoying to see just "TIMED
OUT" in jenkins. Always print the output when it is available.

In addition, we should not interpret all exceptions thrown
from communicate() as timeouts. For example, currently ^C sent to the
script misleadingly results in "TIMED OUT" being printed.
Message-Id: <1456174992-21909-1-git-send-email-tgrabiec@scylladb.com>
2016-02-23 13:41:11 +02:00
Pekka Enberg
78c6fdf429 cql3/functions: Fix is_pure() for native scalar functions
Every native scalar function is already tagged as pure or not, but
because we don't implement the is_pure() function, all functions
end up being advertised as pure. This means that functions like now(),
which are *not* pure, end up being evaluated only once.

Fixes #571.
Message-Id: <1456227171-461-1-git-send-email-penberg@scylladb.com>
2016-02-23 12:37:32 +01:00
Yoav Kleinberger
74fbc62129 ScyllaTop: top-like tool to see live scylla metrics
requires a local collectd configured with the unix-sock plugin;
use the --help option for more. Run it with:

        $ scyllatop.py --help

Signed-off-by: Yoav Kleinberger <yoav@scylladb.com>
Message-Id: <bd3f8c7e120996fc464f41f60130c82e3fb55ac6.1456164703.git.yoav@scylladb.com>
2016-02-23 12:32:47 +02:00
Avi Kivity
8ba474f1c9 Merge "Drop empty partitions from mutation query results" from Tomasz 2016-02-23 11:18:47 +02:00
Tomasz Grabiec
c591157755 tests: mutation_query: Add test for dropping partitions with expired tombstones 2016-02-22 20:23:29 +01:00
Tomasz Grabiec
41d475d9c0 schema_builder: Fluentize property setters 2016-02-22 20:23:29 +01:00
Tomasz Grabiec
6fdaf110d6 mutation_query: Don't include empty partitions
In some cases we may have a lot of empty partitions whose tombstones
have expired, and there is no point in including them in the results.

This was found to cause performance issues for workloads using batch
updates. system.batchlog table would accumulate a lot of deletes over
time. It has gc_grace_seconds set to 0 so most of the tombstones would
be expired. Mutation queries done by the batchlog manager were still
returning all partitions present in memtables, which caused the
mutation query results to be inflated. This in turn was causing
mutation_result_merger to take a long time to process them.
2016-02-22 20:21:23 +01:00
Pekka Enberg
4ff1692248 cql3: Make 'CREATE TYPE' error message human readable
We don't support the 'CREATE TYPE' statement for now. The user-visible
error message, however, is unreadable because our CQL parser doesn't
even recognize the statement.

  cqlsh:ks1> CREATE TYPE config (url text);
  SyntaxException: <ErrorMessage code=2000 [Syntax error in CQL query] message=" : cannot match to any predicted input...

Implement just enough of 'CREATE TYPE' parsing to be able to report a
human readable error message if someone tries to execute such
statements:

  cqlsh:ks1> CREATE TYPE config (url text);
  ServerError: <ErrorMessage code=0000 [Server error] message="User-defined types are not supported yet">
Message-Id: <1456148719-9473-2-git-send-email-penberg@scylladb.com>
2016-02-22 14:50:25 +01:00
Pekka Enberg
d1bbd0271a cql3: Return const reference from ut_name::get_keyspace()
There's no need to copy the string but it does make it more difficult to
use get_keyspace() from other places that already return a const
reference.

Signed-off-by: Pekka Enberg <penberg@scylladb.com>
Message-Id: <1456148719-9473-1-git-send-email-penberg@scylladb.com>
2016-02-22 14:50:25 +01:00
Pekka Enberg
a15cbf0968 transport: Remove read_unsigned_short() variant
As explained in commit 0ff0c55 ("transport: server: 'short' should be
unsigned"), "short" type is always unsigned in the CQL binary protocol.
Therefore, drop the read_unsigned_short() variant altogether and just
use read_short() everywhere.

Message-Id: <1456133171-1433-1-git-send-email-penberg@scylladb.com>
2016-02-22 11:39:33 +02:00
Tomasz Grabiec
fb3344eba1 sstables: Do not write corrupted sstables when column names are too large
This may result in errors during reading like the following one:

  runtime error: Unexpected marker. Found k, expected \x01\n)'

The error above happened when executing limits.py:max_key_length_test dtest.

After this change the exception will happen during writing and will be clearer.

Refs #807.

This patch doesn't deal with the problem of ensuring that we will
never hit those errors, which is very desirable. We shouldn't ack a
write if we can't persist it to sstables.

Message-Id: <1456130045-2364-1-git-send-email-tgrabiec@scylladb.com>
2016-02-22 11:03:16 +02:00
Vlad Zolotarov
f2c6f16a50 locator: everywhere_replication_strategy: change the class_registrator name to "EverywhereStrategy"
Change the name used with class_registrator from "EverywhereReplicationStrategy"
(used in the initial patch from CASSANDRA-826 JIRA) to "EverywhereStrategy"
as it is in the current DCE code.

With this change one will be able to create an instance of
everywhere_replication_strategy class by giving either
an "org.apache.cassandra.locator.EverywhereStrategy" (full name) or
an "EverywhereStrategy" (short name) as a replication strategy name.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1456081258-937-1-git-send-email-vladz@cloudius-systems.com>
2016-02-22 09:18:47 +02:00
Vlad Zolotarov
cc30956c56 locator: added EverywhereReplicationStrategy
This strategy ignores the RF configuration and always
tries to replicate to all cluster nodes.

This means that its get_replication_factor() returns the
number of currently "known" nodes in the cluster, and
if a cluster is currently bootstrapping this value may obviously
change over time for the same key. Therefore this strategy
should be used with caution.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1456074333-15014-3-git-send-email-vladz@cloudius-systems.com>
2016-02-21 19:29:29 +02:00
Vlad Zolotarov
ec14fb2a70 locator: token_metadata: add get_all_endpoints_count()
Return the number of currently known endpoints when
it's needed in a fast-path flow.

Calling get_all_endpoints().size() for that
would not be fast enough because of the unordered_set->vector
transformation we don't need.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Message-Id: <1456074333-15014-2-git-send-email-vladz@cloudius-systems.com>
2016-02-21 19:29:28 +02:00
Avi Kivity
63841b425d Merge seastar upstream
* seastar c829b69...8c560f2 (2):
  > iotune: add missing static variable definitions
  > prevent futures ignored by parallel_for_each from generating warnings
2016-02-21 18:39:28 +02:00
Shlomi Livne
312f1c9d98 dist: align ami option with others (-a --> --ami)
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <c159aac7f0478aba34d4398a2eb8ea71285ede21.1456052976.git.shlomi@scylladb.com>
2016-02-21 15:06:20 +02:00
Shlomi Livne
21e6720988 Revert "dist: remove AMI entry from sysconfig, since there is no script refering it"
This reverts commit 54f9e59006.

AMI is needed for setting up io params

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <4154a000f019059f740319cfa2fbf875568770b7.1456052976.git.shlomi@scylladb.com>
2016-02-21 15:06:20 +02:00
Tomasz Grabiec
d376167bd4 cql: create_table_statement: Optimize duplicate column names detection
The current algorithm is O(N^2), where N is the column count. This causes
limits.py:TestLimits.max_columns_and_query_parameters_test to time out
because the CREATE TABLE statement takes too long.

This change replaces it with an algorithm of O(N)
complexity. _defined_names is already sorted, so if any duplicates
exist, they must be next to each other.

Message-Id: <1456058447-5080-1-git-send-email-tgrabiec@scylladb.com>
2016-02-21 14:55:03 +02:00
Tomasz Grabiec
d3b7e143dc db: Fix error handling in populate_keyspace()
When find_uuid() fails, Scylla would terminate with:

  Exiting on unhandled exception of type 'std::out_of_range': _Map_base::at

But we are supposed to ignore directories for unknown column
families. The try {} catch block is doing just that when
no_such_column_family is thrown from the find_column_family() call
which follows find_uuid(). Fix by converting std::out_of_range to
no_such_column_family.

Message-Id: <1456056280-3933-1-git-send-email-tgrabiec@scylladb.com>
2016-02-21 14:19:31 +02:00
Tomasz Grabiec
0c8db777b1 bytes_ostream: Avoid recursion when freeing chunks
When there are a lot of chunks we may get a stack overflow.

This seems to fix issue #906, a memory corruption during schema
merge. I suspect that what causes corruption there is overflowing of
the stack allocated for the seastar thread. Those stacks don't have
red zones which would catch overflow.

Message-Id: <1456056288-3983-1-git-send-email-tgrabiec@scylladb.com>
2016-02-21 14:18:49 +02:00
Raphael S. Carvalho
b1cc0490f5 sstables: make compaction manager shutdown less verbose
before:

^CINFO  [shard 0] compaction_manager - Asked to stop
INFO  [shard 0] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 0] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 1] compaction_manager - Asked to stop
INFO  [shard 2] compaction_manager - Asked to stop
INFO  [shard 1] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 2] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 3] compaction_manager - Asked to stop
INFO  [shard 1] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 2] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 3] compaction_manager - compaction task handler stopped due to shutdown
INFO  [shard 3] compaction_manager - compaction task handler stopped due to shutdown

after:

^CINFO  [shard 0] compaction_manager - Asked to stop
INFO  [shard 0] compaction_manager - Stopped
INFO  [shard 1] compaction_manager - Asked to stop
INFO  [shard 2] compaction_manager - Asked to stop
INFO  [shard 3] compaction_manager - Asked to stop
INFO  [shard 1] compaction_manager - Stopped
INFO  [shard 2] compaction_manager - Stopped
INFO  [shard 3] compaction_manager - Stopped

`compaction_manager - compaction task handler stopped due to shutdown` is still printed
at debug level

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <535d5ad40102571a3d5d36257342827989e8f0f4.1455835407.git.raphaelsc@scylladb.com>
2016-02-21 11:55:17 +02:00
Raphael S. Carvalho
55be1830ff database: make column_family::rebuild_sstable_list safer
If any of the allocations in rebuild_sstable_list fails, the system
may be left with an incorrect set of sstables.
It's probably safer to assign the new set of sstables as the last
step.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <52b188262dcc06730dc9220b54ff6810d7dca1ae.1455835030.git.raphaelsc@scylladb.com>
2016-02-21 11:55:15 +02:00
Raphael S. Carvalho
9cb8a43684 start using notation ks.cf everywhere
Some places were using the notation ks/cf to represent a keyspace
and column family pair. ks.cf is the notation used by C*, so we
should use it everywhere.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <939449af92565b79d1823890784dc4d1dc3cdc84.1455830989.git.raphaelsc@scylladb.com>
2016-02-21 11:15:09 +02:00
Avi Kivity
69ac1a3229 Merge seastar upstream
* seastar cf1716f...c829b69 (1):
  > iotune: limit generate() concurrency to 128

Fixes #922.
2016-02-21 11:12:10 +02:00
Takuya ASADA
5a213341a4 dist: restart scylla on abnormal termination (Ubuntu)
Fixes #907

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455833659-12652-1-git-send-email-syuu@scylladb.com>
2016-02-21 10:33:28 +02:00
Takuya ASADA
7a6f9c9bb4 dist: use /usr/bin/python3.4 for idl-compiler.py on CentOS
Fixes #923

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455826201-11333-1-git-send-email-syuu@scylladb.com>
2016-02-20 19:15:21 +02:00
Avi Kivity
bba2034957 Merge "IDL-ize mutations" from Paweł
"This series switches mutation_partition_serializer, mutation_partition_view
and frozen_mutation to the IDL-based serialization format.
canonical_mutations and frozen_schemas are still not converted.

Quick test with 4 node ccm cluster and cassandra-stress doesn't show any
problem, unsurprisingly, as frozen_mutation_test obviously still passes."
2016-02-20 18:49:23 +02:00
Avi Kivity
889bf1aef2 dist: build only scylla and iotune binaries, not all the tests 2016-02-20 18:45:42 +02:00
Takuya ASADA
ca9f7bf72e dist: add iotune on .rpm/.deb packages
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455929805-28987-1-git-send-email-syuu@scylladb.com>
2016-02-20 18:44:47 +02:00
Paweł Dziepak
8e7a2fa557 schema_mutations: drop old serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
061dd111b5 canonical_mutation: drop old serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
351c69b476 frozen_schema: use IDL-based serialization
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
81f42415d4 schema_mutations: prepare for auto-generated serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
1b52264dfd batchlog_manager: use new canonical_mutation serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
6c8b298ccd canonical_mutation: prepare for auto-generated serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:12:00 +00:00
Paweł Dziepak
7dda3977c6 column_mapping: drop old-style serializers
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:59 +00:00
Paweł Dziepak
89b75a02d4 commitlog: use IDL-based serialization for entries
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:59 +00:00
Paweł Dziepak
d5c794d5e4 data_output: add reserve()
Allows mixing data_output with other output streams, like
seastar::simple_output_stream, which is useful when switching to the new
IDL-based serializers.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:59 +00:00
Paweł Dziepak
f548c75200 commitlog: move implementation to *.cc file
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:59 +00:00
Paweł Dziepak
5a353486c6 canonical_mutation: switch to IDL-based serialization
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:31 +00:00
Paweł Dziepak
c55fa9e4c2 schema: make column_mapping serializer-friendly
- unnested column_mapping::column
- more accessors

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 23:11:16 +00:00
Paweł Dziepak
4f3ee7abbc frozen_mutation: use IDL-based serialization
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:51:17 +00:00
Paweł Dziepak
28fa2a6493 idl-compiler: add serialization callback interface
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:50:29 +00:00
Paweł Dziepak
186061adef mutation_partition: switch serialization to IDL-based one
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:49:08 +00:00
Paweł Dziepak
ccd29bf7a7 frozen_mutation: switch to bytes_ostream
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:54 +00:00
Paweł Dziepak
5127321866 column_mapping: add column_at()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:54 +00:00
Paweł Dziepak
e332f95960 types: make serialize_mutation_form() static
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:42 +00:00
Paweł Dziepak
7586e47004 idl-compiler: avoid copy of basic types
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:42 +00:00
Paweł Dziepak
2ea735f5ed idl-compiler: accept both bytes and bytes_view
bytes can always be trivially converted to bytes_view. Conversion in the
other direction requires a copy.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:42 +00:00
Paweł Dziepak
18b1c66287 idl-compiler: allow auto-generated serializers in writers
This patch allows using either auto-generated serializers or writer-based
serialization for non-stub [[writable]] types.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:47:21 +00:00
Paweł Dziepak
af2241686f idl-compiler: add reindent() function
This helps keep C++ code properly indented both in the compiler source
code and in the auto-generated files.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:20:09 +00:00
Paweł Dziepak
597ed15dfd tests: add idl unit test
Test auto-generated and writer-based serialization, as well as
deserialization of a simple compound type, vectors, and variants.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:19:30 +00:00
Paweł Dziepak
7d1a66d3a0 idl-compiler: move writers and view to *.impl.hh
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:17:00 +00:00
Paweł Dziepak
f1f14631f4 add set_size() overload for bytes_ostream()
Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:16:55 +00:00
Paweł Dziepak
340d0cccbc serializer: fix duration deserializer
The deserializer is supposed to update the input stream.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
2016-02-19 21:11:57 +00:00
Avi Kivity
6330743775 Merge seastar upstream
* seastar 8679033...cf1716f (1):
  > iotune: qualify filesystems for aio
2016-02-18 17:16:46 +02:00
Nadav Har'El
f9ee74f56f repair: options for repairing only a subrange
To implement nodetool's "--start-token"/"--end-token" feature, we need
to be able to repair only *part* of the ranges held by this node.
Our REST API already had a "ranges" option where the tool can list the
specific ranges to repair, but using this interface in the JMX
implementation is inconvenient, because it requires the *Java* code
to be able to intersect the given start/end token range with the actual
ranges held by the repaired node.

A more reasonable approach, which this patch uses, is to add new
"startToken"/"endToken" options to the repair's REST API. What these
options do is find the node's token ranges as usual, and only
then *intersect* them with the user-specified token range. The JMX
implementation becomes much simpler (in a separate patch for scylla-jmx)
and the real work is done in the C++ code, where it belongs, not in
Java code.

With the additional scylla-jmx patch to use the new REST API options
provided here, this fixes #917.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1455807739-25581-1-git-send-email-nyh@scylladb.com>
2016-02-18 17:13:56 +02:00
Raphael S. Carvalho
a53cfc8127 compaction manager: add support to wait for termination of cleanup
'nodetool cleanup' must wait for termination of cleanup; however,
cleanup is handled asynchronously. To solve that, a mechanism is
added here to wait for termination of a cleanup. This mechanism
uses a promise to notify the waiter of cleanup completion.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <6dc0a39170f3f51487fb8858eb443573548d8bce.1455655016.git.raphaelsc@scylladb.com>
2016-02-18 17:01:18 +02:00
Paweł Dziepak
763f6e1dc0 storage_service: don't drain twice
If drain was explicitly requested by the user, there is no need to do it
again during shutdown.

Fixes a segmentation fault when shutting down an already drained Scylla.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1455794465-16670-1-git-send-email-pdziepak@scylladb.com>
2016-02-18 14:58:02 +02:00
Avi Kivity
1b49c0ce19 dist: Restart scylla on abnormal termination
Restarting the service can recover from a transient failure.

Fixes #904 (on systemd systems only)
Message-Id: <1455441103-11963-1-git-send-email-avi@scylladb.com>
2016-02-17 21:30:40 +01:00
Avi Kivity
62e96de48a Merge "Adding writers and visitor to IDL" from Amnon
"Writers are used to stream objects programmatically rather than from existing objects.
Views (visitors) are used to retrieve information from serialized objects without
deserializing them entirely, skipping to the position in the buffer with the
relevant information and deserializing only that."
2016-02-17 22:06:46 +02:00
Amnon Heiman
fbc6770837 idl-compiler: Verify member type
This patch adds a static assert to the generated code that verifies that a
declared type in the IDL matches the parameter type.

The type comparison ignores references and const qualifiers.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 21:44:53 +02:00
Amnon Heiman
38cd55e9cf Adding the mutation idl
This adds the mutation definition IDL and adds it to the compilation.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 18:42:09 +02:00
Amnon Heiman
714d4927d6 idl-compiler: add writers and view to classes.
This patch adds a writer object to classes in the IDL.

It adds attribute support to classes. A writer will be created for
classes that are marked as writable.

For the writers, the code generator creates two kinds of structs:
states, which hold the write state (mainly the placeholders for all
current objects and vectors), and nodes, which represent the current
position in the writing state machine.

To write an object, create a writer. For example, to create a writer
for a mutation, where out is a bytes_ostream:

writer_of_mutation w(out);

Views are used to read from a buffer without deserializing an entire
object.
This patch adds view creation to the idl-compiler. For each view a
read_size function is created that will be used when skipping through
buffers.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 18:42:09 +02:00
Amnon Heiman
33d5c95b90 serialization_visitors: Add skip template
The skip template function is used when skipping data types.
By default it uses a deserializer to calculate the size.

A specific implementation saves unneeded deserialization. For fixed-size
objects the skip function becomes a constant expression, allowing the
compiler to drop the function altogether.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 18:42:09 +02:00
Amnon Heiman
64c097422d Adding the serialization_visitors.hh file
serialization_visitors.hh contains helper classes for the reader and
writer visitor classes.

place_holder is a wrapper around a bytes_ostream place holder.
frame is used to store a size in bytes.
empty_frame is used with final objects (which do not store their size);
from the code that uses it, it looks the same, but in practice it does
not store any data.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 18:42:08 +02:00
Amnon Heiman
ca72d637f9 bytes_ostream: Allow place holder to return a stream
Reader and writer can use the bytes_ostream as a raw bytes stream,
handling the bytes encoding and streaming on their own.

To fully support this functionality, place holder should support it as
well.

This patch adds a get_stream method that returns a simple_output_stream;
writers can use it with their own serialization functions.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 18:42:04 +02:00
Tomasz Grabiec
eedc1548e7 transport: server: Fix typo
Spotted-by: Vitaly Davidovich <vitalyd@gmail.com>
Signed-off-by: Tomasz Grabiec <tgrabiec@scylladb.com>
2016-02-17 15:55:10 +01:00
Avi Kivity
0363a0efe6 Merge "Fix limits handling in CQL server" from Tomasz
"Fixes the following issues:
 #807 Wrong maximum key length
 #809 Scylla assert on returning result when max column size is overflow"
2016-02-17 15:06:51 +02:00
Tomasz Grabiec
6e7bac14b3 transport: server: Throw instead of abort on bounds check failures
Instead of crashing the server we will respond with a "Server Error"
to the requestor.

Fixes #809.
2016-02-17 13:12:11 +01:00
Tomasz Grabiec
0ff0c5555a transport: server: 'short' should be unsigned
According to CQL binary protocol v3 [1], "short" fields are unsigned:

   [short]        A 2 bytes unsigned integer

[1] https://git-wip-us.apache.org/repos/asf?p=cassandra.git;a=blob_plain;f=doc/native_protocol_v3.spec

C* code agrees as well.

Fixes #807.
2016-02-17 13:12:11 +01:00
Tomasz Grabiec
9375b7df7b transport: server: Size check should allow max value 2016-02-17 13:12:11 +01:00
Tomasz Grabiec
48e9d67525 compound: Extract size_type alias 2016-02-17 13:12:11 +01:00
Amnon Heiman
719b8e1e4d serializer: Add boost::variant, chrono::time_point and unknown variant
This patch adds stub support for boost::variant. Currently variants are
not serialized; this is added just so non-stub classes will be able to
compile.

It also adds deserialization for chrono::time_point and a deserializer
for chrono::duration.

Unknown variant:
planning for situations where the variant could be expanded, there may
be situations where a variant returns an unknown value.

In those cases the data and index will be passed to the reader, which
can decide what to do with it.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2016-02-17 11:43:50 +02:00
Avi Kivity
69183be6f0 Update scylla-ami submodule
* dist/ami/files/scylla-ami b3b85be...398b1aa (3):
  > Import AMI initialization code from scylla-server repo
  > Use long options on scylla_raid_setup and scylla_sysconfig_setup
  > Wait more longer to finishing AMI setup
2016-02-17 10:45:50 +02:00
Takuya ASADA
e640b42081 dist: On Ubuntu, log coredump before creating the core
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455664279-32157-1-git-send-email-syuu@scylladb.com>
2016-02-17 10:43:56 +02:00
Avi Kivity
00f40881d6 Merge "Interactive scylla_setup and refactoring setup scripts" from Takuya 2016-02-17 10:42:26 +02:00
Takuya ASADA
553c2ca523 dist: call scylla_install_ami directly from scylla.json
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
2d429e602a dist: interactive scylla_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
b9ae4ff272 dist: delete build_ami_local.sh, merge it to build_ami.sh
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
6b93505952 dist: long options for build_rpm.sh
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
d754bbe122 dist: add --unstable on build_ami.sh
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
70f397911b dist: long options for build_ami.sh
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
a860e4baae dist: remove AMI initialization code from scylla_setup, move to scylla-ami
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
08038e3f42 dist: long options for scylla_install_pkg
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
d7a03676e3 dist: don't ignore posix_net_conf.sh error
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
684447d3ab dist: long options for scylla_sysconfig_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
7861871c1b dist: long options for scylla_raid_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
889c4706fc dist: long options for scylla_ntp_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
2804e394a6 dist: long options for scylla_coredump_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:08 +09:00
Takuya ASADA
c12d95afe0 dist: add options to skip running setups on scylla_setup, support long options
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:34:05 +09:00
Takuya ASADA
dc9012d5a4 dist: move selinux setup code to scylla_selinux_setup
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:33:01 +09:00
Takuya ASADA
54f9e59006 dist: remove AMI entry from sysconfig, since there is no script referring to it
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:33:01 +09:00
Takuya ASADA
9b8f45d5b7 dist: don't use -a option for scylla_bootparam_setup since it was removed
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:33:01 +09:00
Takuya ASADA
5b742ff447 dist: generalize scylla_ntp_setup, drop '-a' option (means AMI) from it
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-02-17 07:28:38 +09:00
Takuya ASADA
51c497527c dist: support unstable repository on scylla_install_pkg 2016-02-17 07:28:36 +09:00
Amnon Heiman
1e4d227b20 managed_bytes: don't return auto from non-member function
gcc 4.9 does not allow a non-static data member to be declared auto.

This patch replaces the auto declaration with std::result_of_t.

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Message-Id: <1455652166-16860-1-git-send-email-amnon@scylladb.com>
2016-02-16 21:50:55 +02:00
Tomasz Grabiec
7af65e45b2 compound: Throw exception when key is too large rather than abort
Abort is too big of a hammer.

Refs #809.

Message-Id: <1455650129-9202-1-git-send-email-tgrabiec@scylladb.com>
2016-02-16 21:36:25 +02:00
Avi Kivity
bd3a08fd19 Merge seastar upstream
* seastar 1bbb02f...8679033 (1):
  > net: fix compilation problem introduced after e5cbee3
2016-02-16 19:46:32 +02:00
Avi Kivity
e1828b82b5 Merge seastar upstream
* seastar b25a958...1bbb02f (6):
  > native-stack: fix arp request missing under loopback connection
  > apps: iotune: fix compilation with g++ 4.9
  > simple-stream: Add copy constructor
  > tcp: don't need to choose another core since only one core
  > Merge "Fix undefined behaviors related to reactor shutdown" from Tomasz
  > rpc: do not wait for data to be send before reporting timeout
2016-02-16 18:06:50 +02:00
Tomasz Grabiec
a921479e71 Merge tag '807-v3' from https://github.com/avikivity/scylla
From Avi:

This patchset introduces a linearization context for managed_bytes objects.

Within this context, any scattered managed_bytes (found only in lsa regions,
so limited to memtable and cache) are auto-linearized for the lifetime of
the context.   This ensures that key and value lookups can use fast
contiguous iterators instead of using slow discontiguous iterators (or
crashing, as is the case now).
2016-02-16 14:29:48 +01:00
Avi Kivity
13144ea9eb managed_bytes: get rid of explicit linearize/scatter
Now that everything is in a linearization context, we don't need to explicitly
gather data.
2016-02-16 14:37:46 +02:00
Avi Kivity
d415167496 memtable: use managed_bytes linearization context when applying mutations
Ensures that we don't access scattered keys when looking up stuff.
2016-02-16 14:37:46 +02:00
Avi Kivity
fbe6961827 row_cache: run partition-touching operations of row_cache::update in a linearization context
To avoid scattered keys (and values, though those are already protected)
from being accessed, run the update procedure in a managed_bytes linearization
context.

Fixes #807.
2016-02-16 14:37:44 +02:00
Avi Kivity
47ea1237ed build: build seastar's iotune
Target name is build/{mode}/iotune.
2016-02-16 12:13:29 +02:00
Avi Kivity
84ede4c14c Merge seastar upstream
* seastar 0f759f0...b25a958 (1):
  > Merge "IOTune: a tool to tune Seastar's I/O parameters" from Glauber
2016-02-16 12:12:34 +02:00
Asias He
d146045bc5 Revert "Revert "streaming: Send mutations on all shards""
This brings back streaming on all shards. The bug in
locator/abstract_replication_strategy is now fixed.

This reverts commit 9f3061ade8.

Message-Id: <a79ce9cdd6f4af1c6088b89e1911b4b2ed1c10ae.1455589460.git.asias@scylladb.com>
2016-02-16 11:16:51 +02:00
Avi Kivity
ce74718950 Merge "Preparation for specifying query result format in IDL" from Tomasz 2016-02-15 19:41:18 +02:00
Raphael S. Carvalho
59bbe98c21 sstables: keep track of compacting sstables in compaction manager itself
Avi says:
"Something like unordered_set<unsigned long> is error prone, because ints
tend to mix up (also, need to use a sized type, unsigned long varies among
machines)."

With that in mind, it's better if we keep track of compacting sstables in
an unordered_set<shared_sstable>.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <249f0fd4cfcf786cf3c37a79978f7743d07f48ad.1455120811.git.raphaelsc@scylladb.com>
2016-02-15 18:35:43 +02:00
Nadav Har'El
3a2885e1e3 repair: use seastar::gate
Switch to use seastar::gate (and its new gate::check() method) instead
of a similar implementation in repair.cc.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1455553063-13488-1-git-send-email-nyh@scylladb.com>
2016-02-15 18:22:36 +02:00
Avi Kivity
54f145b666 Merge seastar upstream
* seastar 353b1a1...0f759f0 (11):
  > tutorial: add a link to future API documentation
  > sleep: document
  > tutorial: fix typos
  > gate: add check() method
  > tutorial: introduce seastar::gate
  > doc: explain how to test the native stack without dpdk
  > doc: separate the mini-tutorial into its own file
  > doc: move DPDK build instructions to its own file
  > doc: split building instructions into separate files
  > doc: fix/modernize git commands in contributing.md
  > doc: how-to on contributing & guidelines
2016-02-15 18:22:02 +02:00
Tomasz Grabiec
09dc79f245 cql3: select_statement: Set desired serialization format 2016-02-15 17:05:55 +01:00
Tomasz Grabiec
63006e5dd2 query: Serialize collection cells using CQL format
We want the format of query results to be eventually defined in the
IDL and be independent of the format we use in memory to represent
collections. This change is a step in this direction.

The change decouples the format of collection cells in query results from
our in-memory representation. We currently use collection_mutation_view;
after the change we will use the CQL binary protocol format. We use that
because it requires fewer transformations on the coordinator side.

One complication is that some list operations need to retrieve keys
used in list cells, not only values. To satisfy this need, a new query
option called "collections_as_maps" was added, which causes lists
and sets to be reinterpreted as maps matching their underlying
representation. This allows the coordinator to generate mutations
referencing existing items in lists.
2016-02-15 17:05:55 +01:00
Tomasz Grabiec
383296c05b cql3: Fix handling of lists with static columns
List operations and prefetching were not handling static columns
correctly. One issue was that prefetching was attaching static column
data to row data using ids which might overlap with clustered columns.

Another problem was that list operations were always constructing
clustering key even if they worked on a static column. For static
columns the key would be always empty and lookup would fail.

The effect was that list operations which depend on the current state had
no effect. A similar problem could be observed on C* 2.1.9, but not on 2.2.3.

Fixes #903.
2016-02-15 17:05:55 +01:00
Tomasz Grabiec
e65fddc14b types: Introduce data_value::serialize() 2016-02-15 17:05:55 +01:00
Tomasz Grabiec
5f756fcbe5 query: Add cql_format property to partition_slice
It will specify in which format CQL values should be serialized. Will
allow for rolling out new CQL binary protocol versions without
stalling reads.
2016-02-15 17:05:55 +01:00
Tomasz Grabiec
6709c0ac15 cql_serialization_format: Make it CQL protocol version aware
We want to serialize it as a single number, the CQL binary protocol
version to which it corresponds, so it needs to be aware of the
version number.
2016-02-15 17:05:55 +01:00
Tomasz Grabiec
81fdd12f07 cql_serialization_version: Abstract away collection format changes
This puts knowledge about which cql_serialization_formats have the
same collection format into one place,
cql_serialization_format::collection_format_unchanged().
2016-02-15 17:03:53 +01:00
Tomasz Grabiec
9d11968ad8 Rename serialization_format to cql_serialization_format 2016-02-15 16:53:56 +01:00
Tomasz Grabiec
916a91c913 query: Split send_timestamp_and_expiry into two separate options
It's cleaner that way. They don't need to come together.
2016-02-15 16:53:56 +01:00
Tomasz Grabiec
100b540a53 validation: Fix validation of empty partition key
The validation was wrongly assuming that an empty thrift key, which
the original C* code guards against, can only correspond to an empty
representation of our partition_key. This no longer holds after:

   commit 095efd01d6
   "keys: Make from_exploded() and components() work without schema"

This was responsible for dtest failure:
cql_additional_tests.TestCQL:column_name_validation_test
2016-02-15 16:53:56 +01:00
Tomasz Grabiec
f4e3bd0c00 keys: Introduce partition_key::validate()
So that the user doesn't have to play with low-level representations.
2016-02-15 16:53:56 +01:00
Tomasz Grabiec
df5f8e4bfc keys: Avoid unnecessary construction of temporary 'bytes' object
We're now using managed_bytes as the main storage, so the conversion from
bytes_view to bytes is redundant; we need to convert to managed_bytes
eventually.
2016-02-15 16:53:56 +01:00
Tomasz Grabiec
6d00e473ac keys: Make constructor from bytes private 2016-02-15 16:53:55 +01:00
Tomasz Grabiec
e061eb02df cql3: Avoid using partition_key::from_bytes()
serialize() and from_bytes() is a low level interface, which in this
case can be replaced with a partition_key static factory method
resulting in cleaner code.
2016-02-15 16:53:55 +01:00
Paweł Dziepak
dbb878d16e Revert "do not use boost::multiprecision::msb()"
This reverts commit dadd097f9c.

That commit caused serialized forms of varint and decimal to have some
excess leading zeros. They didn't affect deserialization in any way but
caused computed tokens to differ from the Cassandra ones.

Fixes #898.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1455537278-20106-1-git-send-email-pdziepak@scylladb.com>
2016-02-15 14:24:37 +02:00
Avi Kivity
1f752446d2 Merge "Truncation format & fixes" from Calle
"Fixes #884
Fixes #895

Also at seastar-dev: calle/truncate_more

1.) Change truncation records to be stored with IDL serialization
2.) Fix db::serializers encoding of replay_position
3.) Detect attempted reading of Origin truncation records, and instead
    of crashing, ignore and warn.
4.) Change truncation time stamps to be generated per-shard, _after_
    CF flush is done, otherwise data in memtables at flush would be
    retained/replayed on next start. Retain the highest time stamp
    generated.

Note for (3): This patch set does _not_ clear out origin records
automatically. This is because I feel that is a somewhat drastic and
irreversible thing to do. If we want to avail the user of a means
to get rid of the (3) warning, we should probably tell him to either
use cqlsh, or add an API call for this, so he can do it explicitly.
"
2016-02-15 11:39:56 +02:00
Takuya ASADA
fb3f4cc148 dist: add posix_net_conf.sh on Ubuntu package
Fixes #881

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455522990-32044-1-git-send-email-syuu@scylladb.com>
2016-02-15 11:37:30 +02:00
Nadav Har'El
7dc843fc1c repair: stop ongoing repairs during shutdown
When shutting down a node gracefully, this patch asks all ongoing repairs
started on this node to stop as soon as possible (without completing
their work), and then waits for these repairs to finish (with failure,
usually, because they didn't complete).

We need to do this, because if the repair loop continues to run while we
start destructing the various services it relies on, it can crash (as
reported in #699, although the specific crash reported there no longer
occurs after some changes in the streaming code). Additionally, it is
important to stop the ongoing repair, and not wait for it to complete
its normal operation, because that can take a very long time, and shutdown
is supposed to not take more than a few seconds.

Fixes #699.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1455218873-6201-1-git-send-email-nyh@scylladb.com>
2016-02-14 16:52:41 +02:00
Raphael S. Carvalho
a487ef1ff3 sstables: improve log message when a sstable is sealed
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <e391243212d83347b1b50c728bee24f6a2ecc950.1455230788.git.raphaelsc@scylladb.com>
2016-02-14 12:05:16 +02:00
Tomasz Grabiec
456275e06a storage_proxy: Simplify condition
Message-Id: <1455288472-30538-1-git-send-email-tgrabiec@scylladb.com>
2016-02-14 11:22:15 +02:00
Tomasz Grabiec
321287dd7c cql3: Fix crash when parsing collection condition
Happened when parsing a statement like this:

 DELETE FROM tmap WHERE k=0 IF m[null] = 'foo'

Message-Id: <1455294896-15184-1-git-send-email-tgrabiec@scylladb.com>
2016-02-14 11:21:10 +02:00
Takuya ASADA
3697cee76d dist: switch AMI base image to 'CentOS7-Base2', uses CentOS official kernel
The previous CentOS base image accidentally used a non-standard kernel from elrepo.
This replaces the base image with a new one that contains the CentOS default kernel.

Fixes #890

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455398903-2865-1-git-send-email-syuu@scylladb.com>
2016-02-14 10:15:27 +02:00
Tomasz Grabiec
efdbc3d6d7 abstract_replication_strategy: Fix generation of token ranges
We can't move-from in the loop because the subject will be empty in
all but the first iteration.

Fixes a crash during node startup:

  "Exiting on unhandled exception of type 'runtime_exception': runtime error: Invalid token. Should have size 8, has size 0"

Fixes update_cluster_layout_tests.py:TestUpdateClusterLayout.simple_add_node_1_test (and probably others)

Signed-off-by: Tomasz Grabiec <tgrabiec@scylladb.com>
2016-02-12 19:38:36 +01:00
Shlomi Livne
f938e1d303 dist: start scylla with SCYLLA_IO
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <d93a7b41a285fcde796c5681479a328f1efac0c3.1455188901.git.shlomi@scylladb.com>
2016-02-11 17:01:03 +02:00
Shlomi Livne
5494135ddd dist: update SCYLLA_IO with params for AMI
Add setting of --num-io-queues, --max-io-requests for AMI

Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <b94a63154a91c8568e194d7221b9ffc7d7813ebc.1455188901.git.shlomi@scylladb.com>
2016-02-11 17:01:02 +02:00
Shlomi Livne
5cae2560a3 dist: introduce SCYLLA_IO
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <6490d049fd23a335bb0a95cac3e8a4c08c61166e.1455188901.git.shlomi@scylladb.com>
2016-02-11 17:01:02 +02:00
Shlomi Livne
d8cdf76e70 dist: change setting of scylla home from "-d" to "-r"
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <53dcd9d1daa0194de3f889b67788d9c21d1e474d.1455188901.git.shlomi@scylladb.com>
2016-02-11 17:00:37 +02:00
Avi Kivity
3c4f67f3e6 build: require boost > 1.55
See #898.

Add checks both for boost being installed, and for the correct version.
Message-Id: <1455193574-24959-1-git-send-email-avi@scylladb.com>
2016-02-11 15:15:49 +02:00
Avi Kivity
9249d45ae1 Update scylla-ami submodule
* dist/ami/files/scylla-ami b2724be...b3b85be (1):
  > adding --stop-services
2016-02-11 12:24:17 +02:00
Avi Kivity
5834815ed9 Merge seastar upstream
* seastar 14c9991...353b1a1 (2):
  > scripts: posix_net_conf.sh: Change the way we learn NIC's IRQ numbers
  > gate: protect against calling close() more than once
2016-02-11 12:23:51 +02:00
Takuya ASADA
09b1ec6103 dist: attach ephemeral disks on AMI by default
To attach the maximum number of ephemeral disks available on the instance, specify 8.
On AMI creation, it will be reduced to the available number.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1454439628-2882-1-git-send-email-syuu@scylladb.com>
2016-02-11 12:21:09 +02:00
Takuya ASADA
16e6db42e1 dist: don't start scylla-server when it's disabled from AMI userdata
Support the AMI's --stop-services option, preventing startup of scylla-server (and scylla-jmx, since it depends on scylla-server)

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1454492729-11876-1-git-send-email-syuu@scylladb.com>
2016-02-11 12:21:08 +02:00
Takuya ASADA
f227b3faac dist: On AMI, mark root disk with delete_on_termination
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1454513308-12384-1-git-send-email-syuu@scylladb.com>
2016-02-11 12:19:28 +02:00
Takuya ASADA
33309f667e dist: enable enhanced networking on AMI
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1454971289-21369-1-git-send-email-syuu@scylladb.com>
2016-02-11 12:18:48 +02:00
Raphael S. Carvalho
ed61fe5831 sstables: make compaction stop report user-friendly
When scylla stopped an ongoing compaction, the event was reported
as an error. This patch introduces a specialized exception for
compaction stop so that the event can be handled appropriately.

Before:
ERROR [shard 0] compaction_manager - compaction failed: read exception:
std::runtime_error (Compaction for keyspace1/standard1 was deliberately
stopped.)

After:
INFO  [shard 0] compaction_manager - compaction info: Compaction for
keyspace1/standard1 was stopped due to shutdown.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <1f85d4e5c24d23a1b4e7e0370a2cffc97cbc6d44.1455034236.git.raphaelsc@scylladb.com>
2016-02-11 12:16:53 +02:00
Takuya ASADA
8d8130f9c9 dist: fix typo on build_ami.sh
We should always run scylla_setup, not just for locally built rpm

Fixes #897

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1455103519-13780-1-git-send-email-syuu@scylladb.com>
2016-02-11 11:56:11 +02:00
Shlomi Livne
64f8d5a50e dist: update packer location
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Message-Id: <3c33ea073f702e00b789930fce9befef03ad9e88.1455178900.git.shlomi@scylladb.com>
2016-02-11 11:52:56 +02:00
Avi Kivity
bfbf89ee31 Merge "Serialize keys in a form independent of in-memory representation" from Tomasz
"This series changes the on-wire definitions of keys to be of the following form:

  class partition_key {
     std::vector<bytes> exploded();
  };

Keys are therefore collections of components. The components are serialized according
to the format specified in the CQL binary protocol. No bit now depends on how we store keys in memory.

Constructing keys from components currently requires a schema reference,
which makes it not possible to deserialize or serialize the keys automatically
by RPC. To avoid those complications, compound_type was changed so that
it can be constructed and components can be iterated over without schema.
Because of this, partition_key size increased by 2 bytes."
2016-02-10 17:54:42 +02:00
Tomasz Grabiec
b74301302c tests: Add test for key serialization 2016-02-10 15:22:56 +01:00
Tomasz Grabiec
3e2c1840d8 idl: Make key definitions independent of in-memory representation 2016-02-10 15:22:56 +01:00
Tomasz Grabiec
428fce3828 compound: Optimize serialize_single() 2016-02-10 15:22:56 +01:00
Tomasz Grabiec
0cc2832a76 keys: Allow constructing from a range 2016-02-10 15:22:56 +01:00
Tomasz Grabiec
3ffcb998fb keys: Enable serialization from a range not just a vector 2016-02-10 14:35:14 +01:00
Tomasz Grabiec
095efd01d6 keys: Make from_exploded() and components() work without schema
For simplicity, we want to have keys serializable and deserializable
without a schema for now. We will serialize keys in a generic form of a
vector of components, where the format of components is specified by
the CQL binary protocol. So conversion between keys and a vector of
components needs to be possible without a schema.

We may want to make keys schema-dependent back in the future to apply
space optimizations specific to column types. Existing code should
still pass schema& to construct and access the key when possible.

One optimization had to be reverted in this change - avoidance of
storing key length (2 bytes) for single-component partition keys. One
consequence of this, in addition to slightly larger keys, is that we can
no longer avoid a copy when constructing single-component partition keys
from a ready "bytes" object.

I haven't noticed any significant performance difference in:

  tests/perf/perf_simple_query -c1 --write

It does ~130K tps on my machine.
2016-02-10 14:35:13 +01:00
Tomasz Grabiec
31312722d1 compound: Reduce duplication 2016-02-10 14:35:13 +01:00
Tomasz Grabiec
085d148d6f compound: Remove unused methods 2016-02-10 14:35:13 +01:00
Tomasz Grabiec
b777cc9565 tests: Fix tests to not rely on key representation 2016-02-10 14:35:13 +01:00
Asias He
6d0407503b locator: Do not generate wrap-around ranges
Like we did in commit d54c77d5d0,
make the remaining functions in abstract_replication_strategy return
non-wrap-around ranges.

This fixes:

ERROR [shard 0] stream_session - [Stream #f0b7fda0-cf3e-11e5-b6c4-000000000000]
stream_transfer_task: Fail to send to 127.0.0.4:0: std::runtime_error (Not implemented: WRAP_AROUND)

in streaming.
Message-Id: <514d2a9a1d3b868d213464c8858ac5162c0338d8.1455093643.git.asias@scylladb.com>
2016-02-10 10:03:31 +01:00
Avi Kivity
fc6159e2b9 key: tighten partition_key::representation() to return a const managed_bytes&
The conversion to bytes_view can fail if the key is scattered, so defer that
conversion until later.  In a later patch we will intervene before the
conversion to ensure the data is linearized.
2016-02-09 19:55:13 +02:00
Avi Kivity
3c60310e38 key: relax some APIs to accept partition_key_view instead of const partition_key&
Using a partition_key_view can save an allocation in some cases.  We will
make use of it when we linearize a partition_key; during the process we
are given a simple byte pointer, and constructing a partition_key from that
requires an allocation.
2016-02-09 19:55:13 +02:00
Avi Kivity
af8ef54d5a managed_bytes: introduce with_linearized_managed_bytes()
A large managed_bytes blob can be scattered in LSA memory.  Usually this is
fine, but sometimes we want to examine it in place, without copying it out,
while using contiguous iterators for efficiency.

For this use case, introduce with_linearized_managed_bytes(Func),
which runs a function in a "linearization context".  Within the linearization
context, reads of managed_bytes objects will see temporarily linearized copies
instead of scattered data.
2016-02-09 19:55:13 +02:00
Avi Kivity
9f3061ade8 Revert "streaming: Send mutations on all shards"
This reverts commit 31d439213c.

Fixes #894.

Conflicts:
    streaming/stream_manager.cc

(may have undone part of 63a5aa6122)
2016-02-09 18:26:14 +02:00
Calle Wilund
18203a4244 database::truncate/drop: Move time stamp generation to shard
Fixes #884

Time stamps for truncation must be generated after flush, either by
splitting the truncate into two (or more) for-each-shard operations,
or simply by doing time stamping per shard (this solution).

We generate TS on each shard after flushing, and then rely on the
actual stored value to be the highest time point generated.

This should, however, be functionally equivalent from a batch-replay
point of view, and not a problem.
2016-02-09 15:45:37 +00:00
Calle Wilund
ce66acc771 system_keyspace: Always retain highest truncation time stamp
Since the table is written from all shards, and we possibly might
have conflicting time stamps, we define the truncated_at time
as the highest time point, i.e. conservative.
2016-02-09 15:45:37 +00:00
Calle Wilund
22a38f0025 db/serializer: Fix db::serializer<replay_position> format
Should match struct/"official" serial format. (64+32)
This serializer is, however, not really used any more and could
be removed.
2016-02-09 15:45:37 +00:00
Calle Wilund
1c213e1f38 system_keyspace: Use IDL types + better verification of truncation record
Truncation records are not portable between us and Origin.
We need to detect this and ensure we neither try to use Origin
records from a migrated system nor, more to the point, crash
because of a data format error when loading them.

This problem was seen by Tzach when doing a migration from an origin
setup.

Updated record storage to use IDL-serialized types + added versioning
and magic marking + odd-size-checking to ensure we load only correct
data. The code will also deal with records from an older version of
scylla.
2016-02-09 15:45:37 +00:00
Calle Wilund
4d7289b275 serializer_impl: Add convenience wrapper for one-obj deserialization
Akin to serialize_to_buffer.
2016-02-09 13:55:33 +00:00
Calle Wilund
dff89fffcd IDL: Add idl definitions for replay_position and truncation_record 2016-02-09 13:55:33 +00:00
Calle Wilund
873f87430d database: Check sstable dir name UUID part when populating CF
Fixes #870
Only load sstables from CF directories that match the current
CF uuid.
Message-Id: <1454938450-4338-1-git-send-email-calle@scylladb.com>
2016-02-08 14:48:19 +01:00
Avi Kivity
e5b72aedf1 managed_bytes: don't copy data during hashing 2016-02-08 12:43:05 +02:00
Avi Kivity
5d958db869 managed_bytes: fix operator== for fragmented blobs
Must compare fragment by fragment.
2016-02-08 12:43:05 +02:00
Calle Wilund
2ffd7d7b99 stream_manager: Change construction to make gcc 4.9 happy
gcc 4.9 complains about the type{ val, val } construction of a
type with an implicit default constructor, i.e. member = initializer
declarations. gcc 5 does not (and possibly rightly so).
However, we still (implicitly) claim to support gcc 4.9, so
why not just change this particular instance.

Message-Id: <1454921328-1106-1-git-send-email-calle@scylladb.com>
2016-02-08 10:54:48 +02:00
Paweł Dziepak
c90ec731c8 transport: do not close gate at connection shutdown
connection::_pending_requests_gate is responsible for keeping connection
objects alive as long as there are outstanding requests and is closed
in connection::process() when needed. Closing it in connection::shutdown()
as well may cause the gate to be closed twice, which is a bug.

Fixes #690.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1454596390-23239-1-git-send-email-pdziepak@scylladb.com>
2016-02-07 20:07:23 +02:00
Avi Kivity
8b0a26f06d build: support for alternative versions of libsystemd pkgconfig
While pkgconfig is supposed to be a distribution and version neutral way
of detecting packages, it doesn't always work this way.  The sd_notify()
manual page documents that sd_notify is available via the libsystemd
package, but on centos 7.0 it is only available via the libsystemd-daemon
package (on centos 7.1+ it works as expected).

Fix by allowing for alternate versions of package names, testing each one
until a match is found.

Fixes #879.

Message-Id: <1454858862-5239-1-git-send-email-avi@scylladb.com>
2016-02-07 17:36:57 +02:00
Avi Kivity
ad58663c96 row_cache: reindent 2016-02-07 13:25:29 +02:00
Asias He
31d439213c streaming: Send mutations on all shards
Currently, only the shard the stream_plan is created on will send
streaming mutations. To utilize all the available cores, we can make each
shard send the mutations it is responsible for. On the receiver side,
we do not forward the mutations to the shard where the stream_session was
created, so we avoid unnecessary forwarding.

Note: the downside is that it is now harder:

1) to track the number of bytes sent and received
2) to update the keep-alive timer upon receipt of STREAM_MUTATION

To fix, we now store the sent/received bytes info on all shards. When
the keep alive timer expires, we check if any progress has been made.

Hopefully, this patch will make streaming much faster and in turn
make repair, decommission, and adding a node faster.

Refs: https://github.com/scylladb/scylla/issues/849

Tested with decommission/repair dtest.

Message-Id: <96b419ab11b736a297edd54a0b455ffdc2511ac5.1454645370.git.asias@scylladb.com>
2016-02-07 10:57:51 +02:00
Gleb Natapov
63a5aa6122 prevent superfluous frozen_mutation copying
Sometimes frozen_mutation is copied while it can be moved instead. Fix
those cases.

Message-Id: <20160204165708.GI6705@scylladb.com>
2016-02-07 10:54:16 +02:00
Erich Keane
4197ceeedb raw_statement::is_reversed rewrite to avoid VLA
The is_reversed function uses a variable length array, which isn't
spec-abiding C++.  Additionally, the Clang compiler doesn't allow them
with non-POD types, so this function wouldn't compile.

After reading through the function it seems that the array wasn't
necessary, as the check could be calculated inline rather than
separately.  This version should be more performant (since it no longer
requires the VLA lookup) while taking up less memory in
all but the smallest of edge cases (when clustering_key_size *
sizeof(optional<bool>) < sizeof(size_type) - sizeof(uint32_t) +
sizeof(bool)).

This patch uses relation_order_unsupported to ensure that the exception
order is consistent with the previous version.  The throw would
otherwise be moved into the initial for-loop.

There are two deviations in behavior:
The first is the initial assert.  It however should not change the apparent
behavior besides causing orderings() to be looked up 2x in debug
situations.

The second is the conversion of is_reversed_ from an optional to a bool.
The result is that the final return value is now well-defined to be
false in the release-condition where orderings().size() == 0, rather
than be the ill-defined *is_reversed_ that was there previously.

Signed-off-by: Erich Keane <erich.keane@verizon.net>
Message-Id: <1454546285-16076-4-git-send-email-erich.keane@verizon.net>
2016-02-07 10:38:17 +02:00
Erich Keane
49842aacd9 managed_vector: maybe_constructed ctor to non-constexpr
Clang enforces that a union's constexpr CTOR must initialize
one of the members.  The spec is seemingly silent as to what
the rule is here; however, making this non-constexpr results in clang
accepting the constructor.

Signed-off-by: Erich Keane <erich.keane@verizon.net>
Message-Id: <1454604300-1673-1-git-send-email-erich.keane@verizon.net>
2016-02-07 10:30:45 +02:00
Erich Keane
e87019843f Fix PHI_FACTOR definition to be spec compliant
PHI_FACTOR is a constexpr variable that is defined using std::log.
Though G++ has a constexpr version of std::log, this itself is not spec
compliant (in fact, Clang enforces this).  See C++ Spec 26.8 for the
definition of std::log and 17.6.5.6 for the rule regarding adding
constexpr where it isn't specified.

This patch replaces the std::log statement with a version from math.h
that contains the exact value (M_LOG10El).

Signed-off-by: Erich Keane <erich.keane@verizon.net>
Message-Id: <1454603285-32677-1-git-send-email-erich.keane@verizon.net>
2016-02-04 18:33:44 +02:00
Avi Kivity
c85f6c4df1 Merge seastar upstream
* seastar 661ccd9...14c9991 (1):
  > reactor: use correct open_flags when opening a file without DMA support

Fixes #871.
2016-02-04 18:17:04 +02:00
Gleb Natapov
77d47c0c4b optimize serialization of array/vector of integral types
Array of integral types on little endian machine can be memcpyed into/out
of a buffer instead of serialized/deserialized element by element.

Message-Id: <20160204155425.GC6705@scylladb.com>
2016-02-04 18:01:14 +02:00
Avi Kivity
91fbb81477 Merge seastar upstream
* seastar f8beab9...661ccd9 (1):
  > Merge "Use swapcontext() with AddressSanitizer" from Paweł
2016-02-04 17:30:15 +02:00
Paweł Dziepak
ababdfc9e2 tests/batchlog: use proper batchlog version
Since 42e3999a00 "Check batchlog version
before replaying" there is a version check in batchlog replay.
However, the test wasn't updated and still used some arbitrary version
number which caused it to fail.

Signed-off-by: Paweł Dziepak <pdziepak@scylladb.com>
Message-Id: <1454595368-21670-1-git-send-email-pdziepak@scylladb.com>
2016-02-04 16:50:45 +02:00
Gleb Natapov
049ae37d08 storage_proxy: change collectd to show foreground mutation instead of overall mutation count
It is much easier to see what is going on this way; otherwise, the graphs
for background mutations and overall mutations are very close at the
usual scaling for many workloads.

Message-Id: <20160204083452.GH6705@scylladb.com>
2016-02-04 14:58:56 +02:00
Gleb Natapov
a9e4afd8d2 Drop query-result.hh from database.hh
It is not needed there but causes a lot of recompilation when changed.

Message-Id: <1454496142-14537-3-git-send-email-gleb@scylladb.com>
2016-02-04 13:22:27 +02:00
Gleb Natapov
2ae1ae2d18 Cleanup messaging_service.hh includes a bit.
Forward declare some classes instead.

Message-Id: <1454496142-14537-2-git-send-email-gleb@scylladb.com>
2016-02-04 13:22:24 +02:00
Avi Kivity
f3ca597a01 Merge "Sstable cleanup fixes" from Tomasz
"  - Added waiting for async cleanup on clean shutdown

  - Crash in the middle of sstable removal doesn't leave system in a non-bootable state"
2016-02-04 12:36:13 +02:00
Tomasz Grabiec
c7ef3703cc sstable: Make sstable deletion never leave sstable set in a non-bootable state
Refs #860
Refs #802

An sstable file set with any component missing is interpreted as a
critical error during boot. Currently the sstable removal procedure could
leave the files in a non-bootable state if the process crashed after
TOC was removed but before all components were removed as well.

To solve this problem, start the removal by renaming the TOC file to a
so called "temporary TOC". Upon boot such kind of TOC file is
interpreted as an sstable which is safe to remove. This kind of TOC
was added before to deal with a similar scenario but in the opposite
direction - when writing a new sstable.
2016-02-03 17:36:17 +01:00
Tomasz Grabiec
c8a98b487c sstables: Remove coupling-hiding duplication 2016-02-03 17:36:17 +01:00
Tomasz Grabiec
355874281a sstables: Do not register exit hooks from static initializer
Fixes #868.

Registering exit hooks while the reactor is already iterating over exit
hooks is not allowed and currently leads to undefined behavior
observed in #868. While we should make the failure more user friendly,
registering exit hooks concurrently with shutdown will not be allowed.

We don't expect exit hooks to be registered after exit starts because
this would violate the guarantee which says that exit hooks are
executed in reverse order of registration. Starting exit sequence in
the middle of initialization sequence would result in use after free
errors. By the way, I'm not sure if currently there's anything which
prevents this.

To solve this problem, move the exit hook to the initialization
sequence. In case of tests, the cleanup has to be called explicitly.
2016-02-03 17:35:50 +01:00
Tomasz Grabiec
136c9d9247 sstables: Improve error message in case of generation duplication
Refs #870.
2016-02-03 17:35:50 +01:00
Calle Wilund
a00ff015f4 transport::server: read cqlv2 batch options correctly
Fixes #563.
Refs #584

CQLv2 encodes batch query_options in v1 format, not v2+.
CQLv1, on the other hand, has no batch support at all.
Make read_options use explicit version format if needed.

v2: Ensure we preserve cql protocol version in query_opts
Message-Id: <1454514510-21706-1-git-send-email-calle@scylladb.com>
2016-02-03 16:55:07 +01:00
Gleb Natapov
b4b560e0fc change result_digest to hold std::array instead of a std::vector
Digest size is fixed, so there is no need to use std::vector to hold it.

Message-Id: <20160203102530.GU6705@scylladb.com>
2016-02-03 12:27:39 +02:00
Raphael S. Carvalho
4041f8cffc compaction: stop all ongoing compaction during shutdown
Currently, we wait for ongoing compaction during shutdown, but
that may take 'forever' if compacting huge sstables with a slow
disk. Compaction of huge sstables will take a considerable amount
of time even with fast disks. Therefore, all ongoing compaction
should be stopped during shutdown.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <3370f17ce4274df417ea60651f33fc5d4de91199.1454441286.git.raphaelsc@scylladb.com>
2016-02-03 10:18:51 +02:00
Raphael S. Carvalho
cf22c827f9 compaction_manager: fix assertion when stopping task
Task is stopped by closing gate and forcing it to exit via gate
exception. The problem is that task->compacting_cf may be set to
the column family being compacted, and compaction_manager::remove
would see it and try to stop the same task again, which would
lead to problems. The fix is to clean task->compacting_cf when
stopping task.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <3473e93c1a107a619322769d65fa020529b5501b.1454441286.git.raphaelsc@scylladb.com>
2016-02-03 10:18:15 +02:00
Asias He
c67538009c streaming: Fix assert in update_progress
The problem is that on the follower side, we set up _session_info too
late, after receiving the PREPARE_DONE_MESSAGE message. The initiator can
send STREAM_MUTATION before sending the PREPARE_DONE_MESSAGE message.

To fix, we set up _session_info after we received the prepare_message on
both initiator and follower.

Fixes #869

scylla: streaming/session_info.cc:44: void
streaming::session_info::update_progress(streaming::progress_info):
Assertion `peer == new_progress.peer' failed.
Message-Id: <6d945ba1e8c4fc0949c3f0a72800c9448ba27761.1454476876.git.asias@scylladb.com>
2016-02-03 10:15:45 +02:00
Asias He
46c392eb17 messaging_service: Stop retrying if messaging_service is being shutdown
If we are shutting down the messaging_service, we should not retry the
message again.

Refs #862

Message-Id: <7c3afb646ba8254eca69096d80dd5ea007e416a7.1454418053.git.asias@scylladb.com>
2016-02-02 19:50:54 +02:00
Gleb Natapov
c509e48674 Parallelize batchlog replay
Current code is serialized by get_truncated_at(). Use map_reduce to make
it run in parallel.
Message-Id: <1454421603-13080-4-git-send-email-gleb@scylladb.com>
2016-02-02 17:08:54 +01:00
Gleb Natapov
42e3999a00 Check batchlog version before replaying
In case the batchlog serialization format changes, check the version
before trying to interpret raw data.
Message-Id: <1454421603-13080-3-git-send-email-gleb@scylladb.com>
2016-02-02 17:08:54 +01:00
Gleb Natapov
116ad5a603 Use net::messaging_service::current_version for serialization format versioning
Message-Id: <1454421603-13080-2-git-send-email-gleb@scylladb.com>
2016-02-02 17:08:53 +01:00
Avi Kivity
b14d39bfb1 Merge "Move last bits to IDL serializer and get rid of old one" from Gleb 2016-02-02 12:33:18 +02:00
Gleb Natapov
19067db642 remove old serializer 2016-02-02 12:15:50 +02:00
Gleb Natapov
4e440ebf8e Remove old inet_address and uuid serializers 2016-02-02 12:15:50 +02:00
Gleb Natapov
31bb194c21 Remove old result_digest serializer 2016-02-02 12:15:50 +02:00
Gleb Natapov
10cd4d948c Move result_digest to idl 2016-02-02 12:15:50 +02:00
Gleb Natapov
775cc93880 remove unused range and token serializers 2016-02-02 12:15:49 +02:00
Gleb Natapov
e3a40254e6 Remove old partition_checksum serializer 2016-02-02 12:15:49 +02:00
Gleb Natapov
e6f7b12b51 Move partition_checksum to use idl 2016-02-02 12:15:49 +02:00
Gleb Natapov
8cc1d1a445 Add std:array serializer 2016-02-02 12:15:49 +02:00
Gleb Natapov
a8902ccb4a Remove old frozen_schema serializer 2016-02-02 12:15:49 +02:00
Gleb Natapov
60e3637efc Move frozen_schema to idl 2016-02-02 12:15:49 +02:00
Nadav Har'El
b95c15f040 repair: change checksum structure to be better suited for serializer
Change the partition_checksum structure to be better suited for the
new serializers:

 1. Use std::array<> instead of a C array, as the latter is not
    supported by the new serializers.

 2. Use an array of 32 bytes, instead of 4 8-byte integers. This will
    guarantee that no byte-swapping monkey-business will be done on
    these checksums.
    The checksum XOR and equality-checking methods still temporarily
    cast the bytes to 8-byte chunks, for (hopefully) better performance.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <1454364900-3076-1-git-send-email-nyh@scylladb.com>
2016-02-02 11:58:25 +02:00
Calle Wilund
c67e7e4ce4 cql3::sets: Make insert/update frozen set handle null/empty correctly
Fixes #578

Message-Id: <1454345878-1977-1-git-send-email-calle@scylladb.com>
2016-02-01 19:15:28 +02:00
Takuya ASADA
5fe82ce555 dist: fix build error on Ubuntu 15.10
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <1454345982-5899-1-git-send-email-syuu@scylladb.com>
2016-02-01 19:14:49 +02:00
Avi Kivity
1f245e3bcb mutation_partition: fix use of boost::intrusive::set<>::comp()
Seems like boost::intrusive::set<>::comp() is not accessible on some
versions of boost.  Replace by the equivalent
boost::intrusive::set<>::key_comp().

Fixes #858.
Message-Id: <1454326483-29780-1-git-send-email-avi@scylladb.com>
2016-02-01 13:54:52 +01:00
Calle Wilund
159dbe3a64 sstable_datafile_tests: Replace '---' with auto
Fixes compilation issues on some g++.
Message-Id: <1454323749-21933-1-git-send-email-calle@scylladb.com>
2016-02-01 12:58:33 +02:00
Avi Kivity
2b84bd3b75 Merge "standalone tcp connection for streaming" from Asias
"Make streaming use a standalone TCP connection and send more mutations in
parallel.

It is supposed to help: "Decommission not fully utilizing hardware #849""
2016-02-01 09:54:11 +02:00
Asias He
c618c699b3 streaming: Increase mutation_send_limiter
The idea behind the current limit of 10 stream_mutations per core is
to avoid streaming overwhelming the TCP connection and starving normal CQL
verbs when the streamed mutations are big and take a long time to
complete.

Now that we use a standalone connection for streaming verbs, we can
increase the limit.

Hopefully, this will fix #849.
2016-02-01 11:01:56 +08:00
Asias He
fbf796b812 messaging_service: Use standalone connection for stream verbs
In streaming, the amount of data that needs to be streamed to peer nodes
might be large.

To avoid streaming overwhelming the TCP connection used by
user CQL verbs and starving the user CQL queries, we use a standalone TCP
connection for streaming verbs.
2016-02-01 11:01:56 +08:00
Avi Kivity
1146e3796d Merge "streaming refactor" from Asias
"- Wire up session progress
- Refactor stream_coordinator::host_streaming_data
- Introduce get_session helper to simplify verb handling
- Remove unused code

Tested with streaming in update_cluster_layout_tests.py"
2016-01-31 20:17:53 +02:00
Tomasz Grabiec
945ae5d1ea Move std::hash<range<T>> definition to range.hh
Message-Id: <1454008052-5152-1-git-send-email-tgrabiec@scylladb.com>
2016-01-31 20:11:30 +02:00
Avi Kivity
f6e7dbf080 Merge seastar upstream
* seastar 6623379...f8beab9 (2):
  > json_base_element: do not assign the element name
  > io_queue: change visibility of internal function
2016-01-31 16:34:39 +02:00
Raphael S. Carvalho
a46aa47ab1 make sstables::compact_sstables return list of created sstables
Now, sstables::compact_sstables() receives as input a list of sstables
to be compacted, and outputs a list of sstables generated by compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <0d8397f0395ce560a7c83cccf6e897a7f464d030.1454110234.git.raphaelsc@scylladb.com>
2016-01-31 12:39:20 +02:00
Raphael S. Carvalho
ee84f310d9 move deletion of sstables generated by interrupted compaction
This deletion should be handled by sstables::compact_sstables, which
is responsible for the creation of new sstables.
This also simplifies the code.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <541206be2e910ab4edb1500b098eb5ebf29c6509.1454110234.git.raphaelsc@scylladb.com>
2016-01-31 12:39:20 +02:00
Glauber Costa
7214649b8a sstables: const where const is due
Some SSTable methods are not marked const, but they should be.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <72cd3ef0157eb38e7fd48d0c989f2342cbc42f3c.1454103008.git.glauber@scylladb.com>
2016-01-31 12:36:36 +02:00
Avi Kivity
3434b8e7c6 Merge seastar upstream
* seastar fbd9b30...6623379 (1):
  > fstream: improve make_file_input_stream() for a subrange of a file
2016-01-31 12:01:46 +02:00
Avi Kivity
a3fa123070 Update scylla-ami submodule
* dist/ami/files/scylla-ami e284bcd...b2724be (2):
  > Revert "Run scylla.yaml construction only once"
  > Move AMI dependent part of scylla_prepare to scylla-ami-setup.service
2016-01-31 12:01:15 +02:00
Avi Kivity
f08f5858a8 Merge "Introduce scylla-ami-setup.service, fix bugs" from Takuya
"This moves the AMI-dependent part of scylla_prepare to the scylla-ami repo, making it scylla-ami-setup.service, an independent systemd unit.
Also, it stops calling scylla_sysconfig_setup from scylla_setup (called at AMI creation time), calling it from scylla-ami-setup instead."
2016-01-31 12:00:32 +02:00
Takuya ASADA
111dc19942 dist: construct scylla.yaml on first startup of AMI instance, not AMI image creation time
Install scylla-ami-setup.service, stop calling scylla_sysconfig_setup on AMI.
scylla-ami-setup.service will call it instead.
Only works with scylla-ami fix.
Fixes #857

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-01-30 15:48:45 -05:00
Takuya ASADA
71a26e1412 dist: don't need AMI_KEEP_VERSION anymore, since we fixed the issue that 'yum update' mistakenly replaces scylla development version with release version
It actually isn't called now (since $LOCAL_PKG is always empty), so we can safely remove this.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-01-30 15:47:05 -05:00
Takuya ASADA
f9d32346ef dist: scylla_sysconfig_setup uses current sysconfig values as default value
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-01-30 15:46:21 -05:00
Takuya ASADA
4d5baef3e3 dist: keep original SCYLLA_ARGS when updating sysconfig
Since we dropped scylla_run, the default SCYLLA_ARGS parameter is no
longer empty, so we need to support it.

Signed-off-by: Takuya ASADA <syuu@scylladb.com>
2016-01-30 15:44:03 -05:00
Asias He
f07cd30c81 streaming: Remove unused create_message_for_retry 2016-01-29 16:31:07 +08:00
Asias He
cb92fe75e6 streaming: Introduce get_session helper
To simplify streaming verb handler.

- Use get_session instead of open coded logic to get get_coordinator and
  stream_session in all the verb handlers

- Use throw instead of assert for error handling

- init_receiving_side now returns a shared_ptr<stream_result_future>
2016-01-29 16:31:07 +08:00
Asias He
360df6089c streaming: Remove unused stream_session::retry 2016-01-29 16:31:07 +08:00
Asias He
2f48d402e2 streaming: Remove unused commented code 2016-01-29 16:31:07 +08:00
Asias He
ed3da7b04c streaming: Drop flush_tables option for add_transfer_ranges
We do not stream sstable files, so there is no need to flush them.
2016-01-29 16:31:07 +08:00
Asias He
aa69d5ffb2 streaming: Drop update_progress in stream_coordinator
Since we have session_info inside stream_session now, we can call
update_progress directly in stream_session.
2016-01-29 16:31:07 +08:00
Asias He
30c745f11a streaming: Get rid of stream_coordinator::host_streaming_data
Now that host_streaming_data only holds a shared_ptr<stream_session>, we can
get rid of it and put the shared_ptr<stream_session> inside _peer_sessions.
2016-01-29 16:31:07 +08:00
Asias He
46bec5980b streaming: Put session_info inside stream_session
There is a 1:1 mapping between session_info and stream_session. By putting
session_info inside stream_session, we can get rid of the
stream_coordinator::host_streaming_data class.
2016-01-29 16:31:07 +08:00
Asias He
91e245edac streaming: Initialize total_size in stream_transfer_task
Also rename the private members to _total_size and _files.
2016-01-29 16:31:07 +08:00
Asias He
c4bdb6f782 streaming: Wire up session progress
The progress info is needed by JMX api.
2016-01-29 16:31:07 +08:00
Avi Kivity
3e4ce609ee Merge seastar upstream
* seastar ec468ba...fbd9b30 (9):
  > Add implementation of count_leading_zeros<LL>
  > Fix htonl usage for clang
  > Fix gnutls_error_category ctor for clang
  > Add header files required for libc++
  > Add clang warning suppressions
  > Switch to correct usage of std::abs
  > Fix the do_marshall(sic) function to init_list
  > Corrected sockaddr_in initialization
  > Remove unused const char misc_strings
2016-01-28 18:24:46 +02:00
322 changed files with 11741 additions and 6299 deletions

.gitignore

@@ -8,3 +8,4 @@ cscope.*
dist/ami/files/*.rpm
dist/ami/variables.json
dist/ami/scylla_deploy.sh
*.pyc

.gitmodules

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui


@@ -1,6 +1,6 @@
#!/bin/sh
VERSION=666.development
VERSION=1.0.4
if test -f version
then


@@ -836,6 +836,22 @@
"type":"string",
"paramType":"query"
},
{
"name":"startToken",
"description":"Token on which to begin repair",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"endToken",
"description":"Token on which to end repair",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"columnFamilies",
"description":"Which column families to repair in the given keyspace. Multiple columns families can be named separated by commas. If this option is missing, all column families in the keyspace are repaired.",


@@ -214,16 +214,16 @@ void set_storage_proxy(http_context& ctx, routes& r) {
});
sp::get_schema_versions.set(r, [](std::unique_ptr<request> req) {
//TBD
// FIXME
// describe_schema_versions is not implemented yet
// this is a work around
std::vector<sp::mapper_list> res;
sp::mapper_list entry;
entry.key = boost::lexical_cast<std::string>(utils::fb_utilities::get_broadcast_address());
entry.value.push(service::get_local_storage_service().get_schema_version());
res.push_back(entry);
return make_ready_future<json::json_return_type>(res);
return service::get_local_storage_service().describe_schema_versions().then([] (auto result) {
std::vector<sp::mapper_list> res;
for (auto e : result) {
sp::mapper_list entry;
entry.key = std::move(e.first);
entry.value = std::move(e.second);
res.emplace_back(std::move(entry));
}
return make_ready_future<json::json_return_type>(std::move(res));
});
});
sp::get_cas_read_timeouts.set(r, [](std::unique_ptr<request> req) {


@@ -280,10 +280,12 @@ void set_storage_service(http_context& ctx, routes& r) {
return ctx.db.invoke_on_all([keyspace, column_families] (database& db) {
std::vector<column_family*> column_families_vec;
auto& cm = db.get_compaction_manager();
for (auto entry : column_families) {
column_family* cf = &db.find_column_family(keyspace, entry);
cm.submit_cleanup_job(cf);
for (auto cf : column_families) {
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
}
return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
return cm.perform_cleanup(cf);
});
}).then([]{
return make_ready_future<json::json_return_type>(0);
});
@@ -326,7 +328,8 @@ void set_storage_service(http_context& ctx, routes& r) {
ss::repair_async.set(r, [&ctx](std::unique_ptr<request> req) {
static std::vector<sstring> options = {"primaryRange", "parallelism", "incremental",
"jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace"};
"jobThreads", "ranges", "columnFamilies", "dataCenters", "hosts", "trace",
"startToken", "endToken" };
std::unordered_map<sstring, sstring> options_map;
for (auto o : options) {
auto s = req->get_query_param(o);
@@ -585,6 +588,8 @@ void set_storage_service(http_context& ctx, routes& r) {
auto val_str = req->get_query_param("value");
bool value = (val_str == "True") || (val_str == "true") || (val_str == "1");
return service::get_local_storage_service().db().invoke_on_all([value] (database& db) {
db.set_enable_incremental_backups(value);
// Change both KS and CF, so they are in sync
for (auto& pair: db.get_keyspaces()) {
auto& ks = pair.second;


@@ -32,11 +32,16 @@ namespace hs = httpd::stream_manager_json;
static void set_summaries(const std::vector<streaming::stream_summary>& from,
json::json_list<hs::stream_summary>& to) {
for (auto sum : from) {
if (!from.empty()) {
hs::stream_summary res;
res.cf_id = boost::lexical_cast<std::string>(sum.cf_id);
res.files = sum.files;
res.total_size = sum.total_size;
res.cf_id = boost::lexical_cast<std::string>(from.front().cf_id);
// For each stream_session, we pretend we are sending/receiving one
// file, to make it compatible with nodetool.
res.files = 1;
// We cannot estimate the total number of bytes the stream_session will
// send or receive since we don't know the size of the frozen_mutation
// until we read it.
res.total_size = 0;
to.push(res);
}
}
@@ -85,18 +90,22 @@ static hs::stream_state get_state(
void set_stream_manager(http_context& ctx, routes& r) {
hs::get_current_streams.set(r,
[] (std::unique_ptr<request> req) {
return streaming::get_stream_manager().map_reduce0([](streaming::stream_manager& stream) {
std::vector<hs::stream_state> res;
for (auto i : stream.get_initiated_streams()) {
res.push_back(get_state(*i.second.get()));
}
for (auto i : stream.get_receiving_streams()) {
res.push_back(get_state(*i.second.get()));
}
return res;
}, std::vector<hs::stream_state>(),concat<hs::stream_state>).
then([](const std::vector<hs::stream_state>& res) {
return make_ready_future<json::json_return_type>(res);
return streaming::get_stream_manager().invoke_on_all([] (auto& sm) {
return sm.update_all_progress_info();
}).then([] {
return streaming::get_stream_manager().map_reduce0([](streaming::stream_manager& stream) {
std::vector<hs::stream_state> res;
for (auto i : stream.get_initiated_streams()) {
res.push_back(get_state(*i.second.get()));
}
for (auto i : stream.get_receiving_streams()) {
res.push_back(get_state(*i.second.get()));
}
return res;
}, std::vector<hs::stream_state>(),concat<hs::stream_state>).
then([](const std::vector<hs::stream_state>& res) {
return make_ready_future<json::json_return_type>(res);
});
});
});
@@ -111,17 +120,9 @@ void set_stream_manager(http_context& ctx, routes& r) {
hs::get_total_incoming_bytes.set(r, [](std::unique_ptr<request> req) {
gms::inet_address peer(req->param["peer"]);
return streaming::get_stream_manager().map_reduce0([peer](streaming::stream_manager& sm) {
int64_t res = 0;
for (auto sr : sm.get_all_streams()) {
if (sr) {
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
if (session->peer == peer) {
res += session->get_bytes_received();
}
}
}
}
return res;
return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
return sbytes.bytes_received;
});
}, 0, std::plus<int64_t>()).then([](int64_t res) {
return make_ready_future<json::json_return_type>(res);
});
@@ -129,15 +130,9 @@ void set_stream_manager(http_context& ctx, routes& r) {
hs::get_all_total_incoming_bytes.set(r, [](std::unique_ptr<request> req) {
return streaming::get_stream_manager().map_reduce0([](streaming::stream_manager& sm) {
int64_t res = 0;
for (auto sr : sm.get_all_streams()) {
if (sr) {
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
res += session->get_bytes_received();
}
}
}
return res;
return sm.get_progress_on_all_shards().then([] (auto sbytes) {
return sbytes.bytes_received;
});
}, 0, std::plus<int64_t>()).then([](int64_t res) {
return make_ready_future<json::json_return_type>(res);
});
@@ -145,18 +140,10 @@ void set_stream_manager(http_context& ctx, routes& r) {
hs::get_total_outgoing_bytes.set(r, [](std::unique_ptr<request> req) {
gms::inet_address peer(req->param["peer"]);
return streaming::get_stream_manager().map_reduce0([peer](streaming::stream_manager& sm) {
int64_t res = 0;
for (auto sr : sm.get_all_streams()) {
if (sr) {
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
if (session->peer == peer) {
res += session->get_bytes_sent();
}
}
}
}
return res;
return streaming::get_stream_manager().map_reduce0([peer] (streaming::stream_manager& sm) {
return sm.get_progress_on_all_shards(peer).then([] (auto sbytes) {
return sbytes.bytes_sent;
});
}, 0, std::plus<int64_t>()).then([](int64_t res) {
return make_ready_future<json::json_return_type>(res);
});
@@ -164,15 +151,9 @@ void set_stream_manager(http_context& ctx, routes& r) {
hs::get_all_total_outgoing_bytes.set(r, [](std::unique_ptr<request> req) {
return streaming::get_stream_manager().map_reduce0([](streaming::stream_manager& sm) {
int64_t res = 0;
for (auto sr : sm.get_all_streams()) {
if (sr) {
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
res += session->get_bytes_sent();
}
}
}
return res;
return sm.get_progress_on_all_shards().then([] (auto sbytes) {
return sbytes.bytes_sent;
});
}, 0, std::plus<int64_t>()).then([](int64_t res) {
return make_ready_future<json::json_return_type>(res);
});

View File

@@ -54,9 +54,9 @@ class atomic_cell_or_collection;
*/
class atomic_cell_type final {
private:
static constexpr int8_t DEAD_FLAGS = 0;
static constexpr int8_t LIVE_FLAG = 0x01;
static constexpr int8_t EXPIRY_FLAG = 0x02; // When set, the expiry field is present. Set only for live cells
static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
static constexpr unsigned flags_size = 1;
static constexpr unsigned timestamp_offset = flags_size;
static constexpr unsigned timestamp_size = 8;
@@ -67,14 +67,21 @@ private:
static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
static constexpr unsigned ttl_size = 4;
private:
static bool is_revert_set(bytes_view cell) {
return cell[0] & REVERT_FLAG;
}
template<typename BytesContainer>
static void set_revert(BytesContainer& cell, bool revert) {
cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
}
static bool is_live(const bytes_view& cell) {
return cell[0] != DEAD_FLAGS;
return cell[0] & LIVE_FLAG;
}
static bool is_live_and_has_ttl(const bytes_view& cell) {
return cell[0] & EXPIRY_FLAG;
}
static bool is_dead(const bytes_view& cell) {
return cell[0] == DEAD_FLAGS;
return !is_live(cell);
}
// Can be called on live and dead cells
static api::timestamp_type timestamp(const bytes_view& cell) {
@@ -106,7 +113,7 @@ private:
}
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
b[0] = DEAD_FLAGS;
b[0] = 0;
set_field(b, timestamp_offset, timestamp);
set_field(b, deletion_time_offset, deletion_time.time_since_epoch().count());
return b;
@@ -140,8 +147,11 @@ protected:
ByteContainer _data;
protected:
atomic_cell_base(ByteContainer&& data) : _data(std::forward<ByteContainer>(data)) { }
atomic_cell_base(const ByteContainer& data) : _data(data) { }
friend class atomic_cell_or_collection;
public:
bool is_revert_set() const {
return atomic_cell_type::is_revert_set(_data);
}
bool is_live() const {
return atomic_cell_type::is_live(_data);
}
@@ -187,10 +197,13 @@ public:
bytes_view serialize() const {
return _data;
}
void set_revert(bool revert) {
atomic_cell_type::set_revert(_data, revert);
}
};
class atomic_cell_view final : public atomic_cell_base<bytes_view> {
atomic_cell_view(bytes_view data) : atomic_cell_base(data) {}
atomic_cell_view(bytes_view data) : atomic_cell_base(std::move(data)) {}
public:
static atomic_cell_view from_bytes(bytes_view data) { return atomic_cell_view(data); }
@@ -198,6 +211,11 @@ public:
friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
};
class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
public:
atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
};
class atomic_cell final : public atomic_cell_base<managed_bytes> {
atomic_cell(managed_bytes b) : atomic_cell_base(std::move(b)) {}
public:

View File

@@ -27,16 +27,18 @@
#include "atomic_cell.hh"
#include "hashing.hh"
template<typename Hasher>
void feed_hash(collection_mutation_view cell, Hasher& h, const data_type& type) {
auto&& ctype = static_pointer_cast<const collection_type_impl>(type);
auto m_view = ctype->deserialize_mutation_form(cell);
::feed_hash(h, m_view.tomb);
for (auto&& key_and_value : m_view.cells) {
::feed_hash(h, key_and_value.first);
::feed_hash(h, key_and_value.second);
template<>
struct appending_hash<collection_mutation_view> {
template<typename Hasher>
void operator()(Hasher& h, collection_mutation_view cell) const {
auto m_view = collection_type_impl::deserialize_mutation_form(cell);
::feed_hash(h, m_view.tomb);
for (auto&& key_and_value : m_view.cells) {
::feed_hash(h, key_and_value.first);
::feed_hash(h, key_and_value.second);
}
}
}
};
template<>
struct appending_hash<atomic_cell_view> {
@@ -55,3 +57,19 @@ struct appending_hash<atomic_cell_view> {
}
}
};
template<>
struct appending_hash<atomic_cell> {
template<typename Hasher>
void operator()(Hasher& h, const atomic_cell& cell) const {
feed_hash(h, static_cast<atomic_cell_view>(cell));
}
};
template<>
struct appending_hash<collection_mutation> {
template<typename Hasher>
void operator()(Hasher& h, const collection_mutation& cm) const {
feed_hash(h, static_cast<collection_mutation_view>(cm));
}
};

View File

@@ -27,11 +27,10 @@
// A variant type that can hold either an atomic_cell, or a serialized collection.
// Which type is stored is determined by the schema.
// Has an "empty" state.
// Objects moved-from are left in an empty state.
class atomic_cell_or_collection final {
managed_bytes _data;
template<typename T>
friend class db::serializer;
private:
atomic_cell_or_collection(managed_bytes&& data) : _data(std::move(data)) {}
public:
@@ -39,6 +38,7 @@ public:
atomic_cell_or_collection(atomic_cell ac) : _data(std::move(ac._data)) {}
static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
explicit operator bool() const {
return !_data.empty();
@@ -63,11 +63,5 @@ public:
::feed_hash(as_collection_mutation(), h, def.type);
}
}
void linearize() {
_data.linearize();
}
void unlinearize() {
_data.scatter();
}
friend std::ostream& operator<<(std::ostream&, const atomic_cell_or_collection&);
};

View File

@@ -103,35 +103,41 @@ static auth_migration_listener auth_migration;
* Should be abstracted to some sort of global server function
* probably.
*/
struct waiter {
promise<> done;
timer<> tmr;
waiter() : tmr([this] {done.set_value();})
{
tmr.arm(auth::auth::SUPERUSER_SETUP_DELAY);
}
~waiter() {
if (tmr.armed()) {
tmr.cancel();
done.set_exception(std::runtime_error("shutting down"));
}
logger.trace("Deleting scheduled task");
}
void kill() {
}
};
typedef std::unique_ptr<waiter> waiter_ptr;
static std::vector<waiter_ptr> & thread_waiters() {
static thread_local std::vector<waiter_ptr> the_waiters;
return the_waiters;
}
void auth::auth::schedule_when_up(scheduled_func f) {
struct waiter {
promise<> done;
timer<> tmr;
waiter() : tmr([this] {done.set_value();})
{
tmr.arm(SUPERUSER_SETUP_DELAY);
}
~waiter() {
if (tmr.armed()) {
tmr.cancel();
done.set_exception(std::runtime_error("shutting down"));
}
logger.trace("Deleting scheduled task");
}
void kill() {
}
};
typedef std::unique_ptr<waiter> waiter_ptr;
static thread_local std::vector<waiter_ptr> waiters;
logger.trace("Adding scheduled task");
auto & waiters = thread_waiters();
waiters.emplace_back(std::make_unique<waiter>());
auto* w = waiters.back().get();
w->done.get_future().finally([w] {
auto & waiters = thread_waiters();
auto i = std::find_if(waiters.begin(), waiters.end(), [w](const waiter_ptr& p) {
return p.get() == w;
});
@@ -146,7 +152,6 @@ void auth::auth::schedule_when_up(scheduled_func f) {
});
}
bool auth::auth::is_class_type(const sstring& type, const sstring& classname) {
if (type == classname) {
return true;
@@ -205,6 +210,15 @@ future<> auth::auth::setup() {
});
}
future<> auth::auth::shutdown() {
// just make sure we don't have pending tasks.
// this is mostly relevant for test cases where
// db-env-shutdown != process shutdown
return smp::invoke_on_all([] {
thread_waiters().clear();
});
}
static db::consistency_level consistency_for_user(const sstring& username) {
if (username == auth::auth::DEFAULT_SUPERUSER_NAME) {
return db::consistency_level::QUORUM;

View File

@@ -102,6 +102,7 @@ public:
* Sets up Authenticator and Authorizer.
*/
static future<> setup();
static future<> shutdown();
/**
* Set up table from given CREATE TABLE statement under system_auth keyspace, if not already done so.

View File

@@ -21,11 +21,12 @@
#pragma once
#include "types.hh"
#include "net/byteorder.hh"
#include <boost/range/iterator_range.hpp>
#include "bytes.hh"
#include "core/unaligned.hh"
#include "hashing.hh"
#include "seastar/core/simple-stream.hh"
/**
* Utility for writing data into a buffer when its final size is not known up front.
*
@@ -42,6 +43,14 @@ private:
struct chunk {
// FIXME: group fragment pointers to reduce pointer chasing when packetizing
std::unique_ptr<chunk> next;
~chunk() {
auto p = std::move(next);
while (p) {
// Avoid recursion when freeing chunks
auto p_next = std::move(p->next);
p = std::move(p_next);
}
}
size_type offset; // Also means "size" after chunk is closed
size_type size;
value_type data[0];
@@ -163,16 +172,12 @@ public:
template <typename T>
struct place_holder {
value_type* ptr;
// makes the place_holder look like a stream
seastar::simple_output_stream get_stream() {
return seastar::simple_output_stream{reinterpret_cast<char*>(ptr)};
}
};
// Writes given values in big-endian format
template <typename T>
inline
std::enable_if_t<std::is_fundamental<T>::value, void>
write(T val) {
*reinterpret_cast<unaligned<T>*>(alloc(sizeof(T))) = net::hton(val);
}
// Returns a place holder for a value to be written later.
template <typename T>
inline
@@ -210,19 +215,6 @@ public:
write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
}
// Writes given sequence of bytes with a preceding length component encoded in big-endian format
inline void write_blob(bytes_view v) {
assert((size_type)v.size() == v.size());
write<size_type>(v.size());
write(v);
}
// Writes given value into the place holder in big-endian format
template <typename T>
inline void set(place_holder<T> ph, T val) {
*reinterpret_cast<unaligned<T>*>(ph.ptr) = net::hton(val);
}
bool is_linearized() const {
return !_begin || !_begin->next;
}

View File

@@ -24,80 +24,66 @@
#include "mutation_partition_serializer.hh"
#include "converting_mutation_partition_applier.hh"
#include "hashing_partition_visitor.hh"
template class db::serializer<canonical_mutation>;
//
// Representation layout:
//
// <canonical_mutation> ::= <column_family_id> <table_schema_version> <partition_key> <column-mapping> <partition>
//
// For <partition> see mutation_partition_serializer.cc
// For <column-mapping> see db::serializer<column_mapping>
//
#include "utils/UUID.hh"
#include "serializer.hh"
#include "idl/uuid.dist.hh"
#include "idl/keys.dist.hh"
#include "idl/mutation.dist.hh"
#include "serializer_impl.hh"
#include "serialization_visitors.hh"
#include "idl/uuid.dist.impl.hh"
#include "idl/keys.dist.impl.hh"
#include "idl/mutation.dist.impl.hh"
canonical_mutation::canonical_mutation(bytes data)
: _data(std::move(data))
{ }
canonical_mutation::canonical_mutation(const mutation& m)
: _data([&m] {
bytes_ostream out;
db::serializer<utils::UUID>(m.column_family_id()).write(out);
db::serializer<table_schema_version>(m.schema()->version()).write(out);
db::serializer<partition_key_view>(m.key()).write(out);
db::serializer<column_mapping>(m.schema()->get_column_mapping()).write(out);
mutation_partition_serializer ser(*m.schema(), m.partition());
ser.write(out);
return to_bytes(out.linearize());
}())
{ }
{
mutation_partition_serializer part_ser(*m.schema(), m.partition());
bytes_ostream out;
ser::writer_of_canonical_mutation wr(out);
std::move(wr).write_table_id(m.schema()->id())
.write_schema_version(m.schema()->version())
.write_key(m.key())
.write_mapping(m.schema()->get_column_mapping())
.partition([&] (auto wr) {
part_ser.write(std::move(wr));
}).end_canonical_mutation();
_data = to_bytes(out.linearize());
}
utils::UUID canonical_mutation::column_family_id() const {
data_input in(_data);
return db::serializer<utils::UUID>::read(in);
auto in = ser::as_input_stream(_data);
auto mv = ser::deserialize(in, boost::type<ser::canonical_mutation_view>());
return mv.table_id();
}
mutation canonical_mutation::to_mutation(schema_ptr s) const {
data_input in(_data);
auto in = ser::as_input_stream(_data);
auto mv = ser::deserialize(in, boost::type<ser::canonical_mutation_view>());
auto cf_id = db::serializer<utils::UUID>::read(in);
auto cf_id = mv.table_id();
if (s->id() != cf_id) {
throw std::runtime_error(sprint("Attempted to deserialize canonical_mutation of table %s with schema of table %s (%s.%s)",
cf_id, s->id(), s->ks_name(), s->cf_name()));
}
auto version = db::serializer<table_schema_version>::read(in);
auto pk = partition_key(db::serializer<partition_key_view>::read(in));
auto version = mv.schema_version();
auto pk = mv.key();
mutation m(std::move(pk), std::move(s));
if (version == m.schema()->version()) {
db::serializer<column_mapping>::skip(in);
auto partition_view = mutation_partition_serializer::read_as_view(in);
auto partition_view = mutation_partition_view::from_view(mv.partition());
m.partition().apply(*m.schema(), partition_view, *m.schema());
} else {
column_mapping cm = db::serializer<column_mapping>::read(in);
column_mapping cm = mv.mapping();
converting_mutation_partition_applier v(cm, *m.schema(), m.partition());
auto partition_view = mutation_partition_serializer::read_as_view(in);
auto partition_view = mutation_partition_view::from_view(mv.partition());
partition_view.accept(cm, v);
}
return m;
}
template<>
db::serializer<canonical_mutation>::serializer(const canonical_mutation& v)
: _item(v)
, _size(db::serializer<bytes>(v._data).size())
{ }
template<>
void
db::serializer<canonical_mutation>::write(output& out, const canonical_mutation& v) {
db::serializer<bytes>(v._data).write(out);
}
template<>
canonical_mutation db::serializer<canonical_mutation>::read(input& in) {
return canonical_mutation(db::serializer<bytes>::read(in));
}

View File

@@ -24,7 +24,6 @@
#include "bytes.hh"
#include "schema.hh"
#include "database_fwd.hh"
#include "db/serializer.hh"
#include "mutation_partition_visitor.hh"
#include "mutation_partition_serializer.hh"
@@ -33,8 +32,8 @@
// Safe to pass serialized across nodes.
class canonical_mutation {
bytes _data;
canonical_mutation(bytes);
public:
explicit canonical_mutation(bytes);
explicit canonical_mutation(const mutation&);
canonical_mutation(canonical_mutation&&) = default;
@@ -51,15 +50,6 @@ public:
utils::UUID column_family_id() const;
friend class db::serializer<canonical_mutation>;
const bytes& representation() const { return _data; }
};
namespace db {
template<> serializer<canonical_mutation>::serializer(const canonical_mutation&);
template<> void serializer<canonical_mutation>::write(output&, const canonical_mutation&);
template<> canonical_mutation serializer<canonical_mutation>::read(input&);
extern template class serializer<canonical_mutation>;
}

View File

@@ -26,29 +26,10 @@
#include <algorithm>
#include <vector>
#include <boost/range/iterator_range.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include "utils/serialization.hh"
#include "unimplemented.hh"
// value_traits is meant to abstract away whether we are working on 'bytes'
// elements or 'bytes_opt' elements. We don't support optional values, but
// there are some generic layers which use this code which provide us with
// data in that format. In order to avoid allocation and rewriting that data
// into a new vector just to throw it away soon after that, we accept that
// format too.
template <typename T>
struct value_traits {
static const T& unwrap(const T& t) { return t; }
};
template<>
struct value_traits<bytes_opt> {
static const bytes& unwrap(const bytes_opt& t) {
assert(t);
return *t;
}
};
enum class allow_prefixes { no, yes };
template<allow_prefixes AllowPrefixes = allow_prefixes::no>
@@ -62,13 +43,14 @@ public:
static constexpr bool is_prefixable = AllowPrefixes == allow_prefixes::yes;
using prefix_type = compound_type<allow_prefixes::yes>;
using value_type = std::vector<bytes>;
using size_type = uint16_t;
compound_type(std::vector<data_type> types)
: _types(std::move(types))
, _byte_order_equal(std::all_of(_types.begin(), _types.end(), [] (auto t) {
return t->is_byte_order_equal();
}))
, _byte_order_comparable(!is_prefixable && _types.size() == 1 && _types[0]->is_byte_order_comparable())
, _byte_order_comparable(false)
, _is_reversed(_types.size() == 1 && _types[0]->is_reversed())
{ }
@@ -85,79 +67,54 @@ public:
prefix_type as_prefix() {
return prefix_type(_types);
}
private:
/*
* Format:
* <len(value1)><value1><len(value2)><value2>...<len(value_n-1)><value_n-1>(len(value_n))?<value_n>
* <len(value1)><value1><len(value2)><value2>...<len(value_n)><value_n>
*
* For non-prefixable compounds, the value corresponding to the last component of types doesn't
* have its length encoded, its length is deduced from the input range.
*
* serialize_value() and serialize_optionals() for single element rely on the fact that for a single-element
* compounds their serialized form is equal to the serialized form of the component.
*/
template<typename Wrapped>
void serialize_value(const std::vector<Wrapped>& values, bytes::iterator& out) {
if (AllowPrefixes == allow_prefixes::yes) {
assert(values.size() <= _types.size());
} else {
assert(values.size() == _types.size());
}
size_t n_left = _types.size();
for (auto&& wrapped : values) {
auto&& val = value_traits<Wrapped>::unwrap(wrapped);
assert(val.size() <= std::numeric_limits<uint16_t>::max());
if (--n_left || AllowPrefixes == allow_prefixes::yes) {
write<uint16_t>(out, uint16_t(val.size()));
}
template<typename RangeOfSerializedComponents>
static void serialize_value(RangeOfSerializedComponents&& values, bytes::iterator& out) {
for (auto&& val : values) {
assert(val.size() <= std::numeric_limits<size_type>::max());
write<size_type>(out, size_type(val.size()));
out = std::copy(val.begin(), val.end(), out);
}
}
template <typename Wrapped>
size_t serialized_size(const std::vector<Wrapped>& values) {
template <typename RangeOfSerializedComponents>
static size_t serialized_size(RangeOfSerializedComponents&& values) {
size_t len = 0;
size_t n_left = _types.size();
for (auto&& wrapped : values) {
auto&& val = value_traits<Wrapped>::unwrap(wrapped);
assert(val.size() <= std::numeric_limits<uint16_t>::max());
if (--n_left || AllowPrefixes == allow_prefixes::yes) {
len += sizeof(uint16_t);
}
len += val.size();
for (auto&& val : values) {
len += sizeof(size_type) + val.size();
}
return len;
}
public:
bytes serialize_single(bytes&& v) {
if (AllowPrefixes == allow_prefixes::no) {
assert(_types.size() == 1);
return std::move(v);
} else {
// FIXME: Optimize
std::vector<bytes> vec;
vec.reserve(1);
vec.emplace_back(std::move(v));
return ::serialize_value(*this, vec);
}
return serialize_value({std::move(v)});
}
bytes serialize_value(const std::vector<bytes>& values) {
return ::serialize_value(*this, values);
}
bytes serialize_value(std::vector<bytes>&& values) {
if (AllowPrefixes == allow_prefixes::no && _types.size() == 1 && values.size() == 1) {
return std::move(values[0]);
template<typename RangeOfSerializedComponents>
static bytes serialize_value(RangeOfSerializedComponents&& values) {
auto size = serialized_size(values);
if (size > std::numeric_limits<size_type>::max()) {
throw std::runtime_error(sprint("Key size too large: %d > %d", size, std::numeric_limits<size_type>::max()));
}
return ::serialize_value(*this, values);
bytes b(bytes::initialized_later(), size);
auto i = b.begin();
serialize_value(values, i);
return b;
}
template<typename T>
static bytes serialize_value(std::initializer_list<T> values) {
return serialize_value(boost::make_iterator_range(values.begin(), values.end()));
}
bytes serialize_optionals(const std::vector<bytes_opt>& values) {
return ::serialize_value(*this, values);
}
bytes serialize_optionals(std::vector<bytes_opt>&& values) {
if (AllowPrefixes == allow_prefixes::no && _types.size() == 1 && values.size() == 1) {
assert(values[0]);
return std::move(*values[0]);
}
return ::serialize_value(*this, values);
return serialize_value(values | boost::adaptors::transformed([] (const bytes_opt& bo) -> bytes_view {
if (!bo) {
throw std::logic_error("attempted to create key component from empty optional");
}
return *bo;
}));
}
bytes serialize_value_deep(const std::vector<data_value>& values) {
// TODO: Optimize
@@ -171,37 +128,21 @@ public:
return serialize_value(partial);
}
bytes decompose_value(const value_type& values) {
return ::serialize_value(*this, values);
return serialize_value(values);
}
class iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
private:
ssize_t _types_left;
bytes_view _v;
value_type _current;
private:
void read_current() {
if (_types_left == 0) {
if (!_v.empty()) {
throw marshal_exception();
}
_v = bytes_view(nullptr, 0);
return;
}
--_types_left;
uint16_t len;
if (_types_left == 0 && AllowPrefixes == allow_prefixes::no) {
len = _v.size();
} else {
size_type len;
{
if (_v.empty()) {
if (AllowPrefixes == allow_prefixes::yes) {
_types_left = 0;
_v = bytes_view(nullptr, 0);
return;
} else {
throw marshal_exception();
}
_v = bytes_view(nullptr, 0);
return;
}
len = read_simple<uint16_t>(_v);
len = read_simple<size_type>(_v);
if (_v.size() < len) {
throw marshal_exception();
}
@@ -211,10 +152,10 @@ public:
}
public:
struct end_iterator_tag {};
iterator(const compound_type& t, const bytes_view& v) : _types_left(t._types.size()), _v(v) {
iterator(const bytes_view& v) : _v(v) {
read_current();
}
iterator(end_iterator_tag, const bytes_view& v) : _types_left(0), _v(nullptr, 0) {}
iterator(end_iterator_tag, const bytes_view& v) : _v(nullptr, 0) {}
iterator& operator++() {
read_current();
return *this;
@@ -226,21 +167,18 @@ public:
}
const value_type& operator*() const { return _current; }
const value_type* operator->() const { return &_current; }
bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin() || _types_left != i._types_left; }
bool operator==(const iterator& i) const { return _v.begin() == i._v.begin() && _types_left == i._types_left; }
bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
};
iterator begin(const bytes_view& v) const {
return iterator(*this, v);
static iterator begin(const bytes_view& v) {
return iterator(v);
}
iterator end(const bytes_view& v) const {
static iterator end(const bytes_view& v) {
return iterator(typename iterator::end_iterator_tag(), v);
}
boost::iterator_range<iterator> components(const bytes_view& v) const {
static boost::iterator_range<iterator> components(const bytes_view& v) {
return { begin(v), end(v) };
}
auto iter_items(const bytes_view& v) {
return boost::iterator_range<iterator>(begin(v), end(v));
}
value_type deserialize_value(bytes_view v) {
std::vector<bytes> result;
result.reserve(_types.size());
@@ -258,7 +196,7 @@ public:
}
auto t = _types.begin();
size_t h = 0;
for (auto&& value : iter_items(v)) {
for (auto&& value : components(v)) {
h ^= (*t)->hash(value);
++t;
}
@@ -277,12 +215,6 @@ public:
return type->compare(v1, v2);
});
}
bytes from_string(sstring_view s) {
throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
}
sstring to_string(const bytes& b) {
throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
}
// Returns true iff the given prefix has no missing components
bool is_full(bytes_view v) const {
assert(AllowPrefixes == allow_prefixes::yes);

View File

@@ -25,6 +25,31 @@ from distutils.spawn import find_executable
configure_args = str.join(' ', [shlex.quote(x) for x in sys.argv[1:]])
for line in open('/etc/os-release'):
key, _, value = line.partition('=')
value = value.strip().strip('"')
if key == 'ID':
os_ids = [value]
if key == 'ID_LIKE':
os_ids += value.split(' ')
# distribution "internationalization", converting package names.
# The Fedora name is the key; the value is a distro -> package-name dict.
i18n_xlat = {
'boost-devel': {
'debian': 'libboost-dev',
'ubuntu': 'libboost-dev (libboost1.55-dev on 14.04)',
},
}
def pkgname(name):
if name in i18n_xlat:
dict = i18n_xlat[name]
for id in os_ids:
if id in dict:
return dict[id]
return name
def get_flags():
with open('/proc/cpuinfo') as f:
for line in f:
@@ -137,6 +162,7 @@ modes = {
scylla_tests = [
'tests/mutation_test',
'tests/schema_registry_test',
'tests/canonical_mutation_test',
'tests/range_test',
'tests/types_test',
@@ -167,7 +193,6 @@ scylla_tests = [
'tests/commitlog_test',
'tests/cartesian_product_test',
'tests/hash_test',
'tests/serializer_test',
'tests/map_difference_test',
'tests/message',
'tests/gossip',
@@ -190,6 +215,7 @@ scylla_tests = [
'tests/flush_queue_test',
'tests/dynamic_bitset_test',
'tests/auth_test',
'tests/idl_test',
]
apps = [
@@ -198,7 +224,11 @@ apps = [
tests = scylla_tests
all_artifacts = apps + tests
other = [
'iotune',
]
all_artifacts = apps + tests + other
arg_parser = argparse.ArgumentParser('Configure scylla')
arg_parser.add_argument('--static', dest = 'static', action = 'store_const', default = '',
@@ -235,7 +265,6 @@ add_tristate(arg_parser, name = 'xen', dest = 'xen', help = 'Xen support')
args = arg_parser.parse_args()
defines = []
scylla_libs = '-llz4 -lsnappy -lz -lboost_thread -lcryptopp -lrt -lyaml-cpp -lboost_date_time'
extra_cxxflags = {}
@@ -289,6 +318,7 @@ scylla_core = (['database.cc',
'cql3/statements/cf_statement.cc',
'cql3/statements/create_keyspace_statement.cc',
'cql3/statements/create_table_statement.cc',
'cql3/statements/create_type_statement.cc',
'cql3/statements/drop_keyspace_statement.cc',
'cql3/statements/drop_table_statement.cc',
'cql3/statements/schema_altering_statement.cc',
@@ -343,7 +373,7 @@ scylla_core = (['database.cc',
'db/schema_tables.cc',
'db/commitlog/commitlog.cc',
'db/commitlog/commitlog_replayer.cc',
'db/serializer.cc',
'db/commitlog/commitlog_entry.cc',
'db/config.cc',
'db/index/secondary_index.cc',
'db/marshal/type_parser.cc',
@@ -357,6 +387,7 @@ scylla_core = (['database.cc',
'utils/rate_limiter.cc',
'utils/file_lock.cc',
'utils/dynamic_bitset.cc',
'utils/managed_bytes.cc',
'gms/version_generator.cc',
'gms/versioned_value.cc',
'gms/gossiper.cc',
@@ -378,6 +409,7 @@ scylla_core = (['database.cc',
'locator/simple_strategy.cc',
'locator/local_strategy.cc',
'locator/network_topology_strategy.cc',
'locator/everywhere_replication_strategy.cc',
'locator/token_metadata.cc',
'locator/locator.cc',
'locator/snitch_base.cc',
@@ -391,7 +423,6 @@ scylla_core = (['database.cc',
'service/client_state.cc',
'service/migration_task.cc',
'service/storage_service.cc',
'service/pending_range_calculator_service.cc',
'service/load_broadcaster.cc',
'service/pager/paging_state.cc',
'service/pager/query_pagers.cc',
@@ -471,6 +502,14 @@ idls = ['idl/gossip_digest.idl.hh',
'idl/reconcilable_result.idl.hh',
'idl/streaming.idl.hh',
'idl/paging_state.idl.hh',
'idl/frozen_schema.idl.hh',
'idl/partition_checksum.idl.hh',
'idl/replay_position.idl.hh',
'idl/truncation_record.idl.hh',
'idl/mutation.idl.hh',
'idl/query.idl.hh',
'idl/idl_test.idl.hh',
'idl/commitlog.idl.hh',
]
scylla_tests_dependencies = scylla_core + api + idls + [
@@ -514,6 +553,7 @@ tests_not_using_seastar_test_framework = set([
'tests/perf/perf_sstable',
'tests/managed_vector_test',
'tests/dynamic_bitset_test',
'tests/idl_test',
])
for t in tests_not_using_seastar_test_framework:
@@ -556,16 +596,44 @@ else:
args.pie = ''
args.fpie = ''
optional_packages = ['libsystemd']
# a list element means a list of alternative packages to consider
# the first element becomes the HAVE_pkg define
# a string element is a package name with no alternatives
optional_packages = [['libsystemd', 'libsystemd-daemon']]
pkgs = []
for pkg in optional_packages:
if have_pkg(pkg):
pkgs.append(pkg)
upkg = pkg.upper().replace('-', '_')
defines.append('HAVE_{}=1'.format(upkg))
else:
print('Missing optional package {pkg}'.format(**locals()))
def setup_first_pkg_of_list(pkglist):
# The HAVE_pkg symbol is taken from the first alternative
upkg = pkglist[0].upper().replace('-', '_')
for pkg in pkglist:
if have_pkg(pkg):
pkgs.append(pkg)
defines.append('HAVE_{}=1'.format(upkg))
return True
return False
for pkglist in optional_packages:
if isinstance(pkglist, str):
pkglist = [pkglist]
if not setup_first_pkg_of_list(pkglist):
if len(pkglist) == 1:
print('Missing optional package {pkglist[0]}'.format(**locals()))
else:
alternatives = ':'.join(pkglist[1:])
print('Missing optional package {pkglist[0]} (or alternatives {alternatives})'.format(**locals()))
if not try_compile(compiler=args.cxx, source='#include <boost/version.hpp>'):
print('Boost not installed. Please install {}.'.format(pkgname("boost-devel")))
sys.exit(1)
if not try_compile(compiler=args.cxx, source='''\
#include <boost/version.hpp>
#if BOOST_VERSION < 105500
#error Boost version too low
#endif
'''):
print('Installed boost version too old. Please update {}.'.format(pkgname("boost-devel")))
sys.exit(1)
defines = ' '.join(['-D' + d for d in defines])
@@ -595,6 +663,8 @@ if args.dpdk:
seastar_flags += ['--enable-dpdk']
elif args.dpdk_target:
seastar_flags += ['--dpdk-target', args.dpdk_target]
if args.staticcxx:
seastar_flags += ['--static-stdc++']
seastar_cflags = args.user_cflags + " -march=nehalem"
seastar_flags += ['--compiler', args.cxx, '--cflags=%s' % (seastar_cflags)]
@@ -628,7 +698,7 @@ for mode in build_modes:
seastar_deps = 'practically_anything_can_change_so_lets_run_it_every_time_and_restat.'
args.user_cflags += " " + pkg_config("--cflags", "jsoncpp")
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt'
libs = "-lyaml-cpp -llz4 -lz -lsnappy " + pkg_config("--libs", "jsoncpp") + ' -lboost_filesystem' + ' -lcrypt' + ' -lboost_date_time'
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config('--cflags', pkg)
libs += ' ' + pkg_config('--libs', pkg)
@@ -664,12 +734,15 @@ with open(buildfile, 'w') as f:
command = seastar/json/json2code.py -f $in -o $out
description = SWAGGER $out
rule serializer
command = ./idl-compiler.py --ns ser -f $in -o $out
command = {python} ./idl-compiler.py --ns ser -f $in -o $out
description = IDL compiler $out
rule ninja
command = {ninja} -C $subdir $target
restat = 1
description = NINJA $out
rule copy
command = cp $in $out
description = COPY $out
''').format(**globals()))
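The rule text above is emitted by formatting a dedented template, so `{python}` and `{ninja}` are substituted at generation time while ninja's own `$in`/`$out`/`$subdir` variables pass through untouched. A minimal sketch (the `ninja` value here is a hypothetical placeholder):

```python
import textwrap

ninja = 'ninja-build'  # hypothetical tool name for illustration
rule = textwrap.dedent('''\
    rule ninja
      command = {ninja} -C $subdir $target
      restat = 1
''').format(ninja=ninja)
```

Only brace-wrapped names are touched by `.format()`, which is why the `$`-prefixed ninja variables survive verbatim.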
for mode in build_modes:
modeval = modes[mode]
@@ -706,6 +779,8 @@ with open(buildfile, 'w') as f:
thrifts = set()
antlr3_grammars = set()
for binary in build_artifacts:
if binary in other:
continue
srcs = deps[binary]
objs = ['$builddir/' + mode + '/' + src.replace('.cc', '.o')
for src in srcs
@@ -771,7 +846,8 @@ with open(buildfile, 'w') as f:
for obj in compiles:
src = compiles[obj]
gen_headers = list(ragels.keys())
gen_headers += ['seastar/build/{}/http/request_parser.hh'.format(mode)]
gen_headers += ['seastar/build/{}/gen/http/request_parser.hh'.format(mode)]
gen_headers += ['seastar/build/{}/gen/http/http_response_parser.hh'.format(mode)]
for th in thrifts:
gen_headers += th.headers('$builddir/{}/gen'.format(mode))
for g in antlr3_grammars:
@@ -802,10 +878,14 @@ with open(buildfile, 'w') as f:
grammar.source.rsplit('.', 1)[0]))
for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
obj = cc.replace('.cpp', '.o')
f.write('build {}: cxx.{} {}\n'.format(obj, mode, cc))
f.write('build seastar/build/{}/libseastar.a: ninja {}\n'.format(mode, seastar_deps))
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
.format(**locals()))
f.write(' subdir = seastar\n')
f.write(' target = build/{}/libseastar.a\n'.format(mode))
f.write(' target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune build/{mode}/gen/http/request_parser.hh build/{mode}/gen/http/http_response_parser.hh\n'.format(**locals()))
f.write(textwrap.dedent('''\
build build/{mode}/iotune: copy seastar/build/{mode}/apps/iotune/iotune
''').format(**locals()))
f.write('build {}: phony\n'.format(seastar_deps))
f.write(textwrap.dedent('''\
rule configure
@@ -816,10 +896,6 @@ with open(buildfile, 'w') as f:
command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
description = CSCOPE
build cscope: cscope
rule request_parser_hh
command = {ninja} -C seastar build/release/gen/http/request_parser.hh build/debug/gen/http/request_parser.hh
description = GEN seastar/http/request_parser.hh
build seastar/build/release/http/request_parser.hh seastar/build/debug/http/request_parser.hh: request_parser_hh
rule clean
command = rm -rf build
description = CLEAN


@@ -75,7 +75,7 @@ public:
}
virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
const column_mapping::column& col = _visited_column_mapping.static_column_at(id);
const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
if (def) {
accept_cell(_p._static_row, column_kind::static_column, *def, col.type(), cell);
@@ -83,7 +83,7 @@ public:
}
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
const column_mapping::column& col = _visited_column_mapping.static_column_at(id);
const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
if (def) {
accept_cell(_p._static_row, column_kind::static_column, *def, col.type(), collection);
@@ -102,7 +102,7 @@ public:
}
virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
const column_mapping::column& col = _visited_column_mapping.regular_column_at(id);
const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
if (def) {
accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), cell);
@@ -110,7 +110,7 @@ public:
}
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
const column_mapping::column& col = _visited_column_mapping.regular_column_at(id);
const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
if (def) {
accept_cell(_current_row->cells(), column_kind::regular_column, *def, col.type(), collection);


@@ -36,6 +36,7 @@ options {
#include "cql3/statements/drop_keyspace_statement.hh"
#include "cql3/statements/create_index_statement.hh"
#include "cql3/statements/create_table_statement.hh"
#include "cql3/statements/create_type_statement.hh"
#include "cql3/statements/property_definitions.hh"
#include "cql3/statements/drop_table_statement.hh"
#include "cql3/statements/truncate_statement.hh"
@@ -283,7 +284,9 @@ cqlStatement returns [shared_ptr<parsed_statement> stmt]
| st22=listUsersStatement { $stmt = st22; }
| st23=createTriggerStatement { $stmt = st23; }
| st24=dropTriggerStatement { $stmt = st24; }
#endif
| st25=createTypeStatement { $stmt = st25; }
#if 0
| st26=alterTypeStatement { $stmt = st26; }
| st27=dropTypeStatement { $stmt = st27; }
| st28=createFunctionStatement { $stmt = st28; }
@@ -695,7 +698,6 @@ cfamOrdering[shared_ptr<cql3::statements::create_table_statement::raw_statement>
;
#if 0
/**
* CREATE TYPE foo (
* <name1> <type1>,
@@ -703,17 +705,16 @@ cfamOrdering[shared_ptr<cql3::statements::create_table_statement::raw_statement>
* ....
* )
*/
createTypeStatement returns [CreateTypeStatement expr]
@init { boolean ifNotExists = false; }
: K_CREATE K_TYPE (K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
tn=userTypeName { $expr = new CreateTypeStatement(tn, ifNotExists); }
createTypeStatement returns [::shared_ptr<create_type_statement> expr]
@init { bool if_not_exists = false; }
: K_CREATE K_TYPE (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
tn=userTypeName { $expr = ::make_shared<create_type_statement>(tn, if_not_exists); }
'(' typeColumns[expr] ( ',' typeColumns[expr]? )* ')'
;
typeColumns[CreateTypeStatement expr]
: k=ident v=comparatorType { $expr.addDefinition(k, v); }
typeColumns[::shared_ptr<create_type_statement> expr]
: k=ident v=comparatorType { $expr->add_definition(k, v); }
;
#endif
/**


@@ -737,7 +737,7 @@ public:
/** A condition on a collection element. For example: "IF col['key'] = 'foo'" */
static ::shared_ptr<raw> collection_condition(::shared_ptr<term::raw> value, ::shared_ptr<term::raw> collection_element,
const operator_type& op) {
return ::make_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{}, ::shared_ptr<abstract_marker::in_raw>{}, std::move(collection_element), operator_type::IN);
return ::make_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{}, ::shared_ptr<abstract_marker::in_raw>{}, std::move(collection_element), op);
}
/** An IN condition on a collection element. For example: "IF col['key'] IN ('foo', 'bar', ...)" */


@@ -121,3 +121,7 @@ column_identifier::new_selector_factory(database& db, schema_ptr schema, std::ve
}
}
bool cql3::column_identifier::text_comparator::operator()(const cql3::column_identifier& c1, const cql3::column_identifier& c2) const {
return c1.text() < c2.text();
}


@@ -61,6 +61,11 @@ public:
private:
sstring _text;
public:
// less-than comparator sorting by text
struct text_comparator {
bool operator()(const column_identifier& c1, const column_identifier& c2) const;
};
column_identifier(sstring raw_text, bool keep_case);
column_identifier(bytes bytes_, data_type type);


@@ -58,10 +58,10 @@ public:
virtual void reset() override {
_count = 0;
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
return long_type->decompose(_count);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
++_count;
}
};
@@ -83,10 +83,10 @@ public:
virtual void reset() override {
_sum = {};
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
return data_type_for<Type>()->decompose(_sum);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -120,14 +120,14 @@ public:
_sum = {};
_count = 0;
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
Type ret = 0;
if (_count) {
ret = _sum / _count;
}
return data_type_for<Type>()->decompose(ret);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -159,13 +159,13 @@ public:
virtual void reset() override {
_max = {};
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
if (!_max) {
return {};
}
return data_type_for<Type>()->decompose(*_max);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -206,13 +206,13 @@ public:
virtual void reset() override {
_min = {};
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
if (!_min) {
return {};
}
return data_type_for<Type>()->decompose(*_min);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
@@ -255,10 +255,10 @@ public:
virtual void reset() override {
_count = 0;
}
virtual opt_bytes compute(serialization_format sf) override {
virtual opt_bytes compute(cql_serialization_format sf) override {
return long_type->decompose(_count);
}
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) override {
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
return;
}
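The aggregates above share one contract: `reset()` clears state, `add_input()` ignores null inputs, and `compute()` serializes the current value (with avg dividing only when the count is non-zero). A minimal sketch of that contract, using plain Python values in place of serialized bytes:

```python
class AvgAggregate:
    # Sketch of the avg aggregate contract: null inputs are skipped,
    # and compute() returns 0 when nothing was accumulated.
    def __init__(self):
        self.reset()

    def reset(self):
        self._sum = 0
        self._count = 0

    def add_input(self, values):
        if values[0] is None:
            return
        self._sum += values[0]
        self._count += 1

    def compute(self):
        return self._sum // self._count if self._count else 0

agg = AvgAggregate()
for v in [4, None, 8]:
    agg.add_input([v])
```

The null-skipping matters: a column with many nulls should average only the present values, not treat nulls as zeros.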


@@ -77,7 +77,7 @@ public:
* @param protocol_version native protocol version
* @param values the values to add to the aggregate.
*/
virtual void add_input(serialization_format sf, const std::vector<opt_bytes>& values) = 0;
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) = 0;
/**
* Computes and returns the aggregate current value.
@@ -85,7 +85,7 @@ public:
* @param protocol_version native protocol version
* @return the aggregate current value.
*/
virtual opt_bytes compute(serialization_format sf) = 0;
virtual opt_bytes compute(cql_serialization_format sf) = 0;
/**
* Reset this aggregate.


@@ -58,7 +58,7 @@ shared_ptr<function>
make_to_blob_function(data_type from_type) {
auto name = from_type->as_cql3_type()->to_string() + "asblob";
return make_native_scalar_function<true>(name, bytes_type, { from_type },
[] (serialization_format sf, const std::vector<bytes_opt>& parameters) {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) {
return parameters[0];
});
}
@@ -68,7 +68,7 @@ shared_ptr<function>
make_from_blob_function(data_type to_type) {
sstring name = sstring("blobas") + to_type->as_cql3_type()->to_string();
return make_native_scalar_function<true>(name, to_type, { bytes_type },
[name, to_type] (serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[name, to_type] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
auto&& val = parameters[0];
if (!val) {
return val;
@@ -89,7 +89,7 @@ inline
shared_ptr<function>
make_varchar_as_blob_fct() {
return make_native_scalar_function<true>("varcharasblob", bytes_type, { utf8_type },
[] (serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
return parameters[0];
});
}
@@ -98,7 +98,7 @@ inline
shared_ptr<function>
make_blob_as_varchar_fct() {
return make_native_scalar_function<true>("blobasvarchar", utf8_type, { bytes_type },
[] (serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
return parameters[0];
});
}


@@ -61,11 +61,11 @@ public:
virtual shared_ptr<terminal> bind(const query_options& options) override;
virtual bytes_view_opt bind_and_get(const query_options& options) override;
private:
static bytes_opt execute_internal(serialization_format sf, scalar_function& fun, std::vector<bytes_opt> params);
static bytes_opt execute_internal(cql_serialization_format sf, scalar_function& fun, std::vector<bytes_opt> params);
public:
virtual bool contains_bind_marker() const override;
private:
static shared_ptr<terminal> make_terminal(shared_ptr<function> fun, bytes_opt result, serialization_format sf);
static shared_ptr<terminal> make_terminal(shared_ptr<function> fun, bytes_opt result, cql_serialization_format sf);
public:
class raw : public term::raw {
function_name _name;


@@ -299,7 +299,7 @@ function_call::collect_marker_specification(shared_ptr<variable_specifications>
shared_ptr<terminal>
function_call::bind(const query_options& options) {
return make_terminal(_fun, to_bytes_opt(bind_and_get(options)), options.get_serialization_format());
return make_terminal(_fun, to_bytes_opt(bind_and_get(options)), options.get_cql_serialization_format());
}
bytes_view_opt
@@ -315,12 +315,12 @@ function_call::bind_and_get(const query_options& options) {
}
buffers.push_back(std::move(to_bytes_opt(val)));
}
auto result = execute_internal(options.get_serialization_format(), *_fun, std::move(buffers));
auto result = execute_internal(options.get_cql_serialization_format(), *_fun, std::move(buffers));
return options.make_temporary(result);
}
bytes_opt
function_call::execute_internal(serialization_format sf, scalar_function& fun, std::vector<bytes_opt> params) {
function_call::execute_internal(cql_serialization_format sf, scalar_function& fun, std::vector<bytes_opt> params) {
bytes_opt result = fun.execute(sf, params);
try {
// Check the method didn't lie about its declared return type
@@ -347,7 +347,7 @@ function_call::contains_bind_marker() const {
}
shared_ptr<terminal>
function_call::make_terminal(shared_ptr<function> fun, bytes_opt result, serialization_format sf) {
function_call::make_terminal(shared_ptr<function> fun, bytes_opt result, cql_serialization_format sf) {
if (!dynamic_pointer_cast<const collection_type_impl>(fun->return_type())) {
return ::make_shared<constants::value>(std::move(result));
}
@@ -413,7 +413,7 @@ function_call::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<
// If all parameters are terminal and the function is pure, we can
// evaluate it now, otherwise we'd have to wait execution time
if (all_terminal && scalar_fun->is_pure()) {
return make_terminal(scalar_fun, execute(*scalar_fun, parameters), query_options::DEFAULT.get_serialization_format());
return make_terminal(scalar_fun, execute(*scalar_fun, parameters), query_options::DEFAULT.get_cql_serialization_format());
} else {
return ::make_shared<function_call>(scalar_fun, parameters);
}
@@ -429,7 +429,7 @@ function_call::raw::execute(scalar_function& fun, std::vector<shared_ptr<term>>
buffers.push_back(std::move(param));
}
return execute_internal(serialization_format::internal(), fun, buffers);
return execute_internal(cql_serialization_format::internal(), fun, buffers);
}
assignment_testable::test_result


@@ -74,7 +74,10 @@ public:
: native_scalar_function(std::move(name), std::move(return_type), std::move(arg_types))
, _func(std::forward<Func>(func)) {
}
virtual bytes_opt execute(serialization_format sf, const std::vector<bytes_opt>& parameters) override {
virtual bool is_pure() override {
return Pure;
}
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
return _func(sf, parameters);
}
};


@@ -58,7 +58,7 @@ public:
* @return the result of applying this function to the parameter
* @throws InvalidRequestException if this function cannot not be applied to the parameter
*/
virtual bytes_opt execute(serialization_format sf, const std::vector<bytes_opt>& parameters) = 0;
virtual bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) = 0;
};


@@ -56,7 +56,7 @@ inline
shared_ptr<function>
make_now_fct() {
return make_native_scalar_function<false>("now", timeuuid_type, {},
[] (serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
return {to_bytes(utils::UUID_gen::get_time_UUID())};
});
}
@@ -65,7 +65,7 @@ inline
shared_ptr<function>
make_min_timeuuid_fct() {
return make_native_scalar_function<true>("mintimeuuid", timeuuid_type, { timestamp_type },
[] (serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
auto& bb = values[0];
if (!bb) {
return {};
@@ -84,7 +84,7 @@ inline
shared_ptr<function>
make_max_timeuuid_fct() {
return make_native_scalar_function<true>("maxtimeuuid", timeuuid_type, { timestamp_type },
[] (serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
// FIXME: should values be a vector<optional<bytes>>?
auto& bb = values[0];
if (!bb) {
@@ -104,7 +104,7 @@ inline
shared_ptr<function>
make_date_of_fct() {
return make_native_scalar_function<true>("dateof", timestamp_type, { timeuuid_type },
[] (serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {
@@ -119,7 +119,7 @@ inline
shared_ptr<function>
make_unix_timestamp_of_fcf() {
return make_native_scalar_function<true>("unixtimestampof", long_type, { timeuuid_type },
[] (serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& values) -> bytes_opt {
using namespace utils;
auto& bb = values[0];
if (!bb) {


@@ -61,10 +61,9 @@ public:
, _schema(s) {
}
bytes_opt execute(serialization_format sf, const std::vector<bytes_opt>& parameters) override {
auto buf = _schema->partition_key_type()->serialize_optionals(parameters);
auto view = partition_key_view::from_bytes(std::move(buf));
auto tok = dht::global_partitioner().get_token(*_schema, view);
bytes_opt execute(cql_serialization_format sf, const std::vector<bytes_opt>& parameters) override {
auto key = partition_key::from_optional_exploded(*_schema, parameters);
auto tok = dht::global_partitioner().get_token(*_schema, key);
warn(unimplemented::cause::VALIDATION);
return dht::global_partitioner().token_to_bytes(tok);
}


@@ -53,7 +53,7 @@ inline
shared_ptr<function>
make_uuid_fct() {
return make_native_scalar_function<false>("uuid", uuid_type, {},
[] (serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
[] (cql_serialization_format sf, const std::vector<bytes_opt>& parameters) -> bytes_opt {
return {uuid_type->decompose(utils::make_random_uuid())};
});
}


@@ -108,7 +108,7 @@ lists::literal::to_string() const {
}
lists::value
lists::value::from_serialized(bytes_view v, list_type type, serialization_format sf) {
lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_format sf) {
try {
// Collections have this small hack that validate cannot be called on a serialized object,
// but compose does the validation (so we're fine).
@@ -128,11 +128,11 @@ lists::value::from_serialized(bytes_view v, list_type type, serialization_format
bytes_opt
lists::value::get(const query_options& options) {
return get_with_protocol_version(options.get_serialization_format());
return get_with_protocol_version(options.get_cql_serialization_format());
}
bytes
lists::value::get_with_protocol_version(serialization_format sf) {
lists::value::get_with_protocol_version(cql_serialization_format sf) {
// Can't use boost::indirect_iterator, because optional is not an iterator
auto deref = [] (bytes_opt& x) { return *x; };
return collection_type_impl::pack(
@@ -212,7 +212,7 @@ lists::marker::bind(const query_options& options) {
if (!value) {
return nullptr;
} else {
return make_shared(value::from_serialized(*value, std::move(ltype), options.get_serialization_format()));
return make_shared(value::from_serialized(*value, std::move(ltype), options.get_cql_serialization_format()));
}
}
@@ -259,7 +259,10 @@ lists::setter_by_index::execute(mutation& m, const exploded_clustering_prefix& p
// we should not get here for frozen lists
assert(column.type->is_multi_cell()); // "Attempted to set an individual element on a frozen list";
auto row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto index = _idx->bind_and_get(params._options);
auto value = _t->bind_and_get(params._options);
@@ -269,32 +272,30 @@ lists::setter_by_index::execute(mutation& m, const exploded_clustering_prefix& p
}
auto idx = net::ntoh(int32_t(*unaligned_cast<int32_t>(index->begin())));
auto existing_list_opt = params.get_prefetched_list(m.key(), row_key, column);
auto&& existing_list_opt = params.get_prefetched_list(m.key(), std::move(row_key), column);
if (!existing_list_opt) {
throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
}
collection_mutation_view existing_list_ser = *existing_list_opt;
auto ltype = dynamic_pointer_cast<const list_type_impl>(column.type);
collection_type_impl::mutation_view existing_list = ltype->deserialize_mutation_form(existing_list_ser);
auto&& existing_list = *existing_list_opt;
// we verified that index is an int32_type
if (idx < 0 || size_t(idx) >= existing_list.cells.size()) {
if (idx < 0 || size_t(idx) >= existing_list.size()) {
throw exceptions::invalid_request_exception(sprint("List index %d out of bound, list has size %d",
idx, existing_list.cells.size()));
idx, existing_list.size()));
}
bytes_view eidx = existing_list.cells[idx].first;
const bytes& eidx = existing_list[idx].key;
list_type_impl::mutation mut;
mut.cells.reserve(1);
if (!value) {
mut.cells.emplace_back(to_bytes(eidx), params.make_dead_cell());
mut.cells.emplace_back(eidx, params.make_dead_cell());
} else {
if (value->size() > std::numeric_limits<uint16_t>::max()) {
throw exceptions::invalid_request_exception(
sprint("List value is too long. List values are limited to %d bytes but %d bytes value provided",
std::numeric_limits<uint16_t>::max(), value->size()));
}
mut.cells.emplace_back(to_bytes(eidx), params.make_cell(*value));
mut.cells.emplace_back(eidx, params.make_cell(*value));
}
auto smut = ltype->serialize_mutation_form(mut);
m.set_cell(prefix, column, atomic_cell_or_collection::from_collection_mutation(std::move(smut)));
@@ -337,13 +338,8 @@ lists::do_append(shared_ptr<term> t,
if (!value) {
m.set_cell(prefix, column, params.make_dead_cell());
} else {
auto&& to_add = list_value->_elements;
auto deref = [] (const bytes_opt& v) { return *v; };
auto&& newv = collection_mutation{list_type_impl::pack(
boost::make_transform_iterator(to_add.begin(), deref),
boost::make_transform_iterator(to_add.end(), deref),
to_add.size(), serialization_format::internal())};
m.set_cell(prefix, column, atomic_cell_or_collection::from_collection_mutation(std::move(newv)));
auto newv = list_value->get_with_protocol_version(cql_serialization_format::internal());
m.set_cell(prefix, column, params.make_cell(std::move(newv)));
}
}
}
@@ -383,8 +379,13 @@ lists::discarder::requires_read() {
void
lists::discarder::execute(mutation& m, const exploded_clustering_prefix& prefix, const update_parameters& params) {
assert(column.type->is_multi_cell()); // "Attempted to delete from a frozen list";
auto&& row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
auto&& existing_list = params.get_prefetched_list(m.key(), row_key, column);
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto&& existing_list = params.get_prefetched_list(m.key(), std::move(row_key), column);
// We want to call bind before possibly returning to reject queries where the value provided is not a list.
auto&& value = _t->bind(params._options);
@@ -394,9 +395,9 @@ lists::discarder::execute(mutation& m, const exploded_clustering_prefix& prefix,
return;
}
auto&& elist = ltype->deserialize_mutation_form(*existing_list);
auto&& elist = *existing_list;
if (elist.cells.empty()) {
if (elist.empty()) {
return;
}
@@ -413,14 +414,14 @@ lists::discarder::execute(mutation& m, const exploded_clustering_prefix& prefix,
// toDiscard will be small and keeping a list will be more efficient.
auto&& to_discard = lvalue->_elements;
collection_type_impl::mutation mnew;
for (auto&& cell : elist.cells) {
for (auto&& cell : elist) {
auto have_value = [&] (bytes_view value) {
return std::find_if(to_discard.begin(), to_discard.end(),
[ltype, value] (auto&& v) { return ltype->get_elements_type()->equal(*v, value); })
!= to_discard.end();
};
if (cell.second.is_live() && have_value(cell.second.value())) {
mnew.cells.emplace_back(bytes(cell.first.begin(), cell.first.end()), params.make_dead_cell());
if (have_value(cell.value)) {
mnew.cells.emplace_back(cell.key, params.make_dead_cell());
}
}
auto mnew_ser = ltype->serialize_mutation_form(mnew);
@@ -444,18 +445,21 @@ lists::discarder_by_index::execute(mutation& m, const exploded_clustering_prefix
auto cvalue = dynamic_pointer_cast<constants::value>(index);
assert(cvalue);
auto row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
auto&& existing_list = params.get_prefetched_list(m.key(), row_key, column);
std::experimental::optional<clustering_key> row_key;
if (!column.is_static()) {
row_key = clustering_key::from_clustering_prefix(*params._schema, prefix);
}
auto&& existing_list_opt = params.get_prefetched_list(m.key(), std::move(row_key), column);
int32_t idx = read_simple_exactly<int32_t>(*cvalue->_bytes);
if (!existing_list) {
if (!existing_list_opt) {
throw exceptions::invalid_request_exception("Attempted to delete an element from a list which is null");
}
auto&& deserialized = ltype->deserialize_mutation_form(*existing_list);
if (idx < 0 || size_t(idx) >= deserialized.cells.size()) {
throw exceptions::invalid_request_exception(sprint("List index %d out of bound, list has size %d", idx, deserialized.cells.size()));
auto&& existing_list = *existing_list_opt;
if (idx < 0 || size_t(idx) >= existing_list.size()) {
throw exceptions::invalid_request_exception(sprint("List index %d out of bound, list has size %d", idx, existing_list.size()));
}
collection_type_impl::mutation mut;
mut.cells.emplace_back(to_bytes(deserialized.cells[idx].first), params.make_dead_cell());
mut.cells.emplace_back(existing_list[idx].key, params.make_dead_cell());
m.set_cell(prefix, column, ltype->serialize_mutation_form(mut));
}
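The index-based delete above hinges on one detail: the tombstone is keyed by the storage key of the cell currently at position `idx`, not by the index itself. A sketch of that shape, modeling the list as ordered `(key, value)` cells:

```python
def discard_by_index(existing_list, idx, dead_cell='DEAD'):
    # Sketch of discarder_by_index: the dead cell reuses the key of the
    # cell at position idx; the index is never a storage key itself.
    if existing_list is None:
        raise ValueError(
            'Attempted to delete an element from a list which is null')
    if idx < 0 or idx >= len(existing_list):
        raise IndexError('List index {} out of bound, list has size {}'
                         .format(idx, len(existing_list)))
    key, _old_value = existing_list[idx]
    return [(key, dead_cell)]

mut = discard_by_index([('k0', 'a'), ('k1', 'b'), ('k2', 'c')], 1)
```

This is also why the delete requires a prior read (`requires_read`): the key at `idx` is only known once the existing list has been prefetched.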


@@ -78,9 +78,9 @@ public:
explicit value(std::vector<bytes_opt> elements)
: _elements(std::move(elements)) {
}
static value from_serialized(bytes_view v, list_type type, serialization_format sf);
static value from_serialized(bytes_view v, list_type type, cql_serialization_format sf);
virtual bytes_opt get(const query_options& options) override;
virtual bytes get_with_protocol_version(serialization_format sf) override;
virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
bool equals(shared_ptr<list_type_impl> lt, const value& v);
virtual std::vector<bytes_opt> get_elements() override;
virtual sstring to_string() const;


@@ -152,7 +152,7 @@ maps::literal::to_string() const {
}
maps::value
maps::value::from_serialized(bytes_view value, map_type type, serialization_format sf) {
maps::value::from_serialized(bytes_view value, map_type type, cql_serialization_format sf) {
try {
// Collections have this small hack that validate cannot be called on a serialized object,
// but compose does the validation (so we're fine).
@@ -171,11 +171,11 @@ maps::value::from_serialized(bytes_view value, map_type type, serialization_form
bytes_opt
maps::value::get(const query_options& options) {
return get_with_protocol_version(options.get_serialization_format());
return get_with_protocol_version(options.get_cql_serialization_format());
}
bytes
maps::value::get_with_protocol_version(serialization_format sf) {
maps::value::get_with_protocol_version(cql_serialization_format sf) {
//FIXME: share code with serialize_partially_deserialized_form
size_t len = collection_value_len(sf) * map.size() * 2 + collection_size_len(sf);
for (auto&& e : map) {
@@ -257,7 +257,7 @@ maps::marker::bind(const query_options& options) {
maps::value::from_serialized(*val,
static_pointer_cast<const map_type_impl>(
_receiver->type),
options.get_serialization_format())) :
options.get_cql_serialization_format())) :
nullptr;
}
@@ -333,7 +333,7 @@ maps::do_put(mutation& m, const exploded_clustering_prefix& prefix, const update
m.set_cell(prefix, column, params.make_dead_cell());
} else {
auto v = map_type_impl::serialize_partially_deserialized_form({map_value->map.begin(), map_value->map.end()},
serialization_format::internal());
cql_serialization_format::internal());
m.set_cell(prefix, column, params.make_cell(std::move(v)));
}
}


@@ -81,9 +81,9 @@ public:
value(std::map<bytes, bytes, serialized_compare> map)
: map(std::move(map)) {
}
static value from_serialized(bytes_view value, map_type type, serialization_format sf);
static value from_serialized(bytes_view value, map_type type, cql_serialization_format sf);
virtual bytes_opt get(const query_options& options) override;
virtual bytes get_with_protocol_version(serialization_format sf);
virtual bytes get_with_protocol_version(cql_serialization_format sf);
bool equals(map_type mt, const value& v);
virtual sstring to_string() const;
};


@@ -47,7 +47,7 @@ namespace cql3 {
thread_local const query_options::specific_options query_options::specific_options::DEFAULT{-1, {}, {}, api::missing_timestamp};
thread_local query_options query_options::DEFAULT{db::consistency_level::ONE, std::experimental::nullopt,
{}, false, query_options::specific_options::DEFAULT, version::native_protocol(), serialization_format::use_32_bit()};
{}, false, query_options::specific_options::DEFAULT, cql_serialization_format::latest()};
query_options::query_options(db::consistency_level consistency,
std::experimental::optional<std::vector<sstring_view>> names,
@@ -55,16 +55,14 @@ query_options::query_options(db::consistency_level consistency,
std::vector<bytes_view_opt> value_views,
bool skip_metadata,
specific_options options,
int32_t protocol_version,
serialization_format sf)
cql_serialization_format sf)
: _consistency(consistency)
, _names(std::move(names))
, _values(std::move(values))
, _value_views(std::move(value_views))
, _skip_metadata(skip_metadata)
, _options(std::move(options))
, _protocol_version(protocol_version)
, _serialization_format(sf)
, _cql_serialization_format(sf)
{
}
@@ -73,8 +71,7 @@ query_options::query_options(db::consistency_level consistency,
std::vector<bytes_view_opt> value_views,
bool skip_metadata,
specific_options options,
int32_t protocol_version,
serialization_format sf)
cql_serialization_format sf)
: query_options(
consistency,
std::move(names),
@@ -82,7 +79,6 @@ query_options::query_options(db::consistency_level consistency,
std::move(value_views),
skip_metadata,
std::move(options),
protocol_version,
sf
)
{
@@ -94,7 +90,7 @@ query_options::query_options(query_options&& o, std::vector<std::vector<bytes_vi
std::vector<query_options> tmp;
tmp.reserve(value_views.size());
std::transform(value_views.begin(), value_views.end(), std::back_inserter(tmp), [this](auto& vals) {
return query_options(_consistency, {}, vals, _skip_metadata, _options, _protocol_version, _serialization_format);
return query_options(_consistency, {}, vals, _skip_metadata, _options, _cql_serialization_format);
});
_batch_options = std::move(tmp);
}
@@ -107,8 +103,7 @@ query_options::query_options(db::consistency_level cl, std::vector<bytes_opt> va
{},
false,
query_options::specific_options::DEFAULT,
version::native_protocol(),
serialization_format::use_32_bit()
cql_serialization_format::latest()
)
{
for (auto&& value : _values) {
@@ -178,12 +173,12 @@ api::timestamp_type query_options::get_timestamp(service::query_state& state) co
int query_options::get_protocol_version() const
{
return _protocol_version;
return _cql_serialization_format.protocol_version();
}
serialization_format query_options::get_serialization_format() const
cql_serialization_format query_options::get_cql_serialization_format() const
{
return _serialization_format;
return _cql_serialization_format;
}
const query_options::specific_options& query_options::get_specific_options() const

View File
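The hunk above drops the separate `_protocol_version` member: a CQL serialization format is tied to the protocol version it was created for, so the format object can answer `get_protocol_version()` itself and one field replaces two. A minimal illustration of that refactoring, with hypothetical names:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical sketch: the format remembers the protocol version it was
// built for, instead of query_options carrying both redundantly.
class cql_serialization_format_sketch {
    uint8_t _version;
public:
    explicit cql_serialization_format_sketch(uint8_t v) : _version(v) {}
    static cql_serialization_format_sketch latest() {
        return cql_serialization_format_sketch(4);
    }
    uint8_t protocol_version() const { return _version; }
};

class query_options_sketch {
    cql_serialization_format_sketch _sf;
public:
    explicit query_options_sketch(cql_serialization_format_sketch sf) : _sf(sf) {}
    // No dedicated _protocol_version member any more: derive it.
    int get_protocol_version() const { return _sf.protocol_version(); }
    cql_serialization_format_sketch get_cql_serialization_format() const { return _sf; }
};
```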

@@ -48,7 +48,7 @@
#include "service/pager/paging_state.hh"
#include "cql3/column_specification.hh"
#include "cql3/column_identifier.hh"
#include "serialization_format.hh"
#include "cql_serialization_format.hh"
namespace cql3 {
@@ -74,8 +74,7 @@ private:
mutable std::vector<std::vector<int8_t>> _temporaries;
const bool _skip_metadata;
const specific_options _options;
const int32_t _protocol_version; // transient
serialization_format _serialization_format;
cql_serialization_format _cql_serialization_format;
std::experimental::optional<std::vector<query_options>> _batch_options;
public:
query_options(query_options&&) = default;
@@ -87,22 +86,19 @@ public:
std::vector<bytes_view_opt> value_views,
bool skip_metadata,
specific_options options,
int32_t protocol_version,
serialization_format sf);
cql_serialization_format sf);
explicit query_options(db::consistency_level consistency,
std::experimental::optional<std::vector<sstring_view>> names,
std::vector<bytes_view_opt> value_views,
bool skip_metadata,
specific_options options,
int32_t protocol_version,
serialization_format sf);
cql_serialization_format sf);
explicit query_options(db::consistency_level consistency,
std::vector<std::vector<bytes_view_opt>> value_views,
bool skip_metadata,
specific_options options,
int32_t protocol_version,
serialization_format sf);
cql_serialization_format sf);
// Batch query_options constructor
explicit query_options(query_options&&, std::vector<std::vector<bytes_view_opt>> value_views);
@@ -131,7 +127,7 @@ public:
* a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
*/
int get_protocol_version() const;
serialization_format get_serialization_format() const;
cql_serialization_format get_cql_serialization_format() const;
// Mainly for the sake of BatchQueryOptions
const specific_options& get_specific_options() const;
const query_options& for_statement(size_t i) const;

View File

@@ -423,10 +423,9 @@ void query_processor::migration_subscriber::on_update_keyspace(const sstring& ks
void query_processor::migration_subscriber::on_update_column_family(const sstring& ks_name, const sstring& cf_name, bool columns_changed)
{
if (columns_changed) {
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
}
// #1255: Ignoring columns_changed deliberately.
log.info("Column definitions for {}.{} changed, invalidating related prepared statements", ks_name, cf_name);
remove_invalid_prepared_statements(ks_name, cf_name);
}
void query_processor::migration_subscriber::on_update_user_type(const sstring& ks_name, const sstring& type_name)

View File
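The change above (for #1255) deliberately ignores `columns_changed` and always invalidates: a prepared statement caches the schema version it was prepared against, so altering non-column parameters (for example, read repair chance) would otherwise appear to take no effect. A toy sketch of that eviction policy, using hypothetical structures rather than the Scylla implementation:

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Hypothetical prepared statement: holds the table it touches plus a
// schema snapshot taken at prepare time.
struct prepared_sketch {
    std::string table;   // "ks.cf"
    int schema_version;
};

struct statement_cache_sketch {
    std::unordered_map<std::string, prepared_sketch> cache;

    void on_update_column_family(const std::string& ks, const std::string& cf,
                                 bool /*columns_changed*/) {
        // #1255: ignore columns_changed and evict unconditionally, so that
        // parameter-only ALTERs are picked up on the next prepare.
        for (auto it = cache.begin(); it != cache.end(); ) {
            if (it->second.table == ks + "." + cf) {
                it = cache.erase(it);
            } else {
                ++it;
            }
        }
    }
};
```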

@@ -287,6 +287,13 @@ public:
};
inline ::shared_ptr<cql3::metadata> make_empty_metadata()
{
auto result = ::make_shared<cql3::metadata>(std::vector<::shared_ptr<cql3::column_specification>>{});
result->set_skip_metadata();
return result;
}
class result_set {
#if 0
private static final ColumnIdentifier COUNT_COLUMN = new ColumnIdentifier("count", false);

View File

@@ -53,7 +53,7 @@ public:
return true;
}
virtual void add_input(serialization_format sf, result_set_builder& rs) override {
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
// Aggregation of aggregation is not supported
size_t m = _arg_selectors.size();
for (size_t i = 0; i < m; ++i) {
@@ -65,7 +65,7 @@ public:
_aggregate->add_input(sf, _args);
}
virtual bytes_opt get_output(serialization_format sf) override {
virtual bytes_opt get_output(cql_serialization_format sf) override {
return _aggregate->compute(sf);
}

View File

@@ -87,11 +87,11 @@ public:
return false;
}
virtual void add_input(serialization_format sf, result_set_builder& rs) override {
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
_selected->add_input(sf, rs);
}
virtual bytes_opt get_output(serialization_format sf) override {
virtual bytes_opt get_output(cql_serialization_format sf) override {
auto&& value = _selected->get_output(sf);
if (!value) {
return std::experimental::nullopt;

View File

@@ -57,7 +57,7 @@ public:
return _arg_selectors[0]->is_aggregate();
}
virtual void add_input(serialization_format sf, result_set_builder& rs) override {
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
size_t m = _arg_selectors.size();
for (size_t i = 0; i < m; ++i) {
auto&& s = _arg_selectors[i];
@@ -68,7 +68,7 @@ public:
virtual void reset() override {
}
virtual bytes_opt get_output(serialization_format sf) override {
virtual bytes_opt get_output(cql_serialization_format sf) override {
size_t m = _arg_selectors.size();
for (size_t i = 0; i < m; ++i) {
auto&& s = _arg_selectors[i];

View File

@@ -52,6 +52,11 @@ selectable::writetime_or_ttl::new_selector_factory(database& db, schema_ptr s, s
return writetime_or_ttl_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), _is_writetime);
}
sstring
selectable::writetime_or_ttl::to_string() const {
return sprint("%s(%s)", _is_writetime ? "writetime" : "ttl", _id->to_string());
}
shared_ptr<selectable>
selectable::writetime_or_ttl::raw::prepare(schema_ptr s) {
return make_shared<writetime_or_ttl>(_id->prepare_column_identifier(s), _is_writetime);
@@ -78,6 +83,11 @@ selectable::with_function::new_selector_factory(database& db, schema_ptr s, std:
return abstract_function_selector::new_factory(std::move(fun), std::move(factories));
}
sstring
selectable::with_function::to_string() const {
return sprint("%s(%s)", _function_name.name, join(", ", _args));
}
shared_ptr<selectable>
selectable::with_function::raw::prepare(schema_ptr s) {
std::vector<shared_ptr<selectable>> prepared_args;
@@ -101,7 +111,7 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
if (!ut) {
throw exceptions::invalid_request_exception(
sprint("Invalid field selection: %s of type %s is not a user type",
"FIXME: selectable" /* FIMXME: _selected */, ut->as_cql3_type()));
_selected->to_string(), factory->new_instance()->get_type()->as_cql3_type()));
}
for (size_t i = 0; i < ut->size(); ++i) {
if (ut->field_name(i) != _field->bytes_) {
@@ -110,7 +120,12 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
return field_selector::new_factory(std::move(ut), i, std::move(factory));
}
throw exceptions::invalid_request_exception(sprint("%s of type %s has no field %s",
"FIXME: selectable" /* FIXME: _selected */, ut->as_cql3_type(), _field));
_selected->to_string(), ut->as_cql3_type(), _field));
}
sstring
selectable::with_field_selection::to_string() const {
return sprint("%s.%s", _selected->to_string(), _field->to_string());
}
shared_ptr<selectable>
@@ -126,6 +141,10 @@ selectable::with_field_selection::raw::processes_selection() const {
return true;
}
std::ostream & operator<<(std::ostream &os, const selectable& s) {
return os << s.to_string();
}
}
}

View File

@@ -55,6 +55,7 @@ class selectable {
public:
virtual ~selectable() {}
virtual ::shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr schema, std::vector<const column_definition*>& defs) = 0;
virtual sstring to_string() const = 0;
protected:
static size_t add_and_get_index(const column_definition& def, std::vector<const column_definition*>& defs) {
auto i = std::find(defs.begin(), defs.end(), &def);
@@ -84,6 +85,8 @@ public:
class with_field_selection;
};
std::ostream & operator<<(std::ostream &os, const selectable& s);
class selectable::with_function : public selectable {
functions::function_name _function_name;
std::vector<shared_ptr<selectable>> _args;
@@ -92,17 +95,7 @@ public:
: _function_name(std::move(fname)), _args(std::move(args)) {
}
#if 0
@Override
public String toString()
{
return new StrBuilder().append(functionName)
.append("(")
.appendWithSeparators(args, ", ")
.append(")")
.toString();
}
#endif
virtual sstring to_string() const override;
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
class raw : public selectable::raw {

View File

@@ -59,13 +59,7 @@ public:
: _selected(std::move(selected)), _field(std::move(field)) {
}
#if 0
@Override
public String toString()
{
return String.format("%s.%s", selected, field);
}
#endif
virtual sstring to_string() const override;
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;

View File

@@ -63,7 +63,8 @@ selection::selection(schema_ptr schema,
query::partition_slice::option_set selection::get_query_options() {
query::partition_slice::option_set opts;
opts.set_if<query::partition_slice::option::send_timestamp_and_expiry>(_collect_timestamps || _collect_TTLs);
opts.set_if<query::partition_slice::option::send_timestamp>(_collect_timestamps);
opts.set_if<query::partition_slice::option::send_expiry>(_collect_TTLs);
opts.set_if<query::partition_slice::option::send_partition_key>(
std::any_of(_columns.begin(), _columns.end(),
@@ -112,11 +113,11 @@ protected:
_current.clear();
}
virtual std::vector<bytes_opt> get_output_row(serialization_format sf) override {
virtual std::vector<bytes_opt> get_output_row(cql_serialization_format sf) override {
return std::move(_current);
}
virtual void add_input_row(serialization_format sf, result_set_builder& rs) override {
virtual void add_input_row(cql_serialization_format sf, result_set_builder& rs) override {
_current = std::move(*rs.current);
}
@@ -180,7 +181,7 @@ protected:
return _factories->contains_only_aggregate_functions();
}
virtual std::vector<bytes_opt> get_output_row(serialization_format sf) override {
virtual std::vector<bytes_opt> get_output_row(cql_serialization_format sf) override {
std::vector<bytes_opt> output_row;
output_row.reserve(_selectors.size());
for (auto&& s : _selectors) {
@@ -189,7 +190,7 @@ protected:
return output_row;
}
virtual void add_input_row(serialization_format sf, result_set_builder& rs) {
virtual void add_input_row(cql_serialization_format sf, result_set_builder& rs) {
for (auto&& s : _selectors) {
s->add_input(sf, rs);
}
@@ -252,11 +253,11 @@ selection::collect_metadata(schema_ptr schema, const std::vector<::shared_ptr<ra
return r;
}
result_set_builder::result_set_builder(const selection& s, db_clock::time_point now, serialization_format sf)
result_set_builder::result_set_builder(const selection& s, db_clock::time_point now, cql_serialization_format sf)
: _result_set(std::make_unique<result_set>(::make_shared<metadata>(*(s.get_result_metadata()))))
, _selectors(s.new_selectors())
, _now(now)
, _serialization_format(sf)
, _cql_serialization_format(sf)
{
if (s._collect_timestamps) {
_timestamps.resize(s._columns.size(), 0);
@@ -295,17 +296,16 @@ void result_set_builder::add(const column_definition& def, const query::result_a
}
}
void result_set_builder::add(const column_definition& def, collection_mutation_view c) {
auto&& ctype = static_cast<const collection_type_impl*>(def.type.get());
current->emplace_back(ctype->to_value(c, _serialization_format));
void result_set_builder::add_collection(const column_definition& def, bytes_view c) {
current->emplace_back(to_bytes(c));
// timestamps, ttls meaningless for collections
}
void result_set_builder::new_row() {
if (current) {
_selectors->add_input_row(_serialization_format, *this);
_selectors->add_input_row(_cql_serialization_format, *this);
if (!_selectors->is_aggregate()) {
_result_set->add_row(_selectors->get_output_row(_serialization_format));
_result_set->add_row(_selectors->get_output_row(_cql_serialization_format));
_selectors->reset();
}
current->clear();
@@ -319,13 +319,13 @@ void result_set_builder::new_row() {
std::unique_ptr<result_set> result_set_builder::build() {
if (current) {
_selectors->add_input_row(_serialization_format, *this);
_result_set->add_row(_selectors->get_output_row(_serialization_format));
_selectors->add_input_row(_cql_serialization_format, *this);
_result_set->add_row(_selectors->get_output_row(_cql_serialization_format));
_selectors->reset();
current = std::experimental::nullopt;
}
if (_result_set->empty() && _selectors->is_aggregate()) {
_result_set->add_row(_selectors->get_output_row(_serialization_format));
_result_set->add_row(_selectors->get_output_row(_cql_serialization_format));
}
return std::move(_result_set);
}
@@ -344,7 +344,7 @@ void result_set_builder::visitor::add_value(const column_definition& def,
_builder.add_empty();
return;
}
_builder.add(def, *cell);
_builder.add_collection(def, *cell);
} else {
auto cell = i.next_atomic_cell();
if (!cell) {

View File
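The first hunk above splits the combined `send_timestamp_and_expiry` flag into independent `send_timestamp` and `send_expiry` options, so a query selecting only `writetime()` no longer forces expiry data to be fetched (and vice versa). A simplified model of that option set, with illustrative names:

```cpp
#include <bitset>
#include <cassert>
#include <cstddef>

// Simplified stand-in for the partition_slice option set: two independent
// flags instead of one combined send_timestamp_and_expiry bit.
enum option : size_t { send_timestamp = 0, send_expiry = 1 };

std::bitset<2> query_options_for(bool collect_timestamps, bool collect_ttls) {
    std::bitset<2> opts;
    // Each attribute is requested only when the selection actually needs it.
    opts.set(send_timestamp, collect_timestamps);
    opts.set(send_expiry, collect_ttls);
    return opts;
}
```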

@@ -69,9 +69,9 @@ public:
* @param rs the <code>result_set_builder</code>
* @throws InvalidRequestException
*/
virtual void add_input_row(serialization_format sf, result_set_builder& rs) = 0;
virtual void add_input_row(cql_serialization_format sf, result_set_builder& rs) = 0;
virtual std::vector<bytes_opt> get_output_row(serialization_format sf) = 0;
virtual std::vector<bytes_opt> get_output_row(cql_serialization_format sf) = 0;
virtual void reset() = 0;
};
@@ -236,13 +236,13 @@ private:
std::vector<api::timestamp_type> _timestamps;
std::vector<int32_t> _ttls;
const db_clock::time_point _now;
serialization_format _serialization_format;
cql_serialization_format _cql_serialization_format;
public:
result_set_builder(const selection& s, db_clock::time_point now, serialization_format sf);
result_set_builder(const selection& s, db_clock::time_point now, cql_serialization_format sf);
void add_empty();
void add(bytes_opt value);
void add(const column_definition& def, const query::result_atomic_cell_view& c);
void add(const column_definition& def, collection_mutation_view c);
void add_collection(const column_definition& def, bytes_view c);
void new_row();
std::unique_ptr<result_set> build();
api::timestamp_type timestamp_of(size_t idx);

View File

@@ -71,7 +71,7 @@ public:
* @param rs the <code>result_set_builder</code>
* @throws InvalidRequestException if a problem occurs while adding the input value
*/
virtual void add_input(serialization_format sf, result_set_builder& rs) = 0;
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) = 0;
/**
* Returns the selector output.
@@ -80,7 +80,7 @@ public:
* @return the selector output
* @throws InvalidRequestException if a problem occurs while computing the output value
*/
virtual bytes_opt get_output(serialization_format sf) = 0;
virtual bytes_opt get_output(cql_serialization_format sf) = 0;
/**
* Returns the <code>selector</code> output type.

View File

@@ -88,12 +88,12 @@ public:
, _type(type)
{ }
virtual void add_input(serialization_format sf, result_set_builder& rs) override {
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
// TODO: can we steal it?
_current = (*rs.current)[_idx];
}
virtual bytes_opt get_output(serialization_format sf) override {
virtual bytes_opt get_output(cql_serialization_format sf) override {
return std::move(_current);
}

View File

@@ -58,13 +58,7 @@ public:
: _id(std::move(id)), _is_writetime(is_writetime) {
}
#if 0
@Override
public String toString()
{
return (isWritetime ? "writetime" : "ttl") + "(" + id + ")";
}
#endif
virtual sstring to_string() const override;
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;

View File

@@ -86,7 +86,7 @@ public:
return make_shared<wtots_factory>(std::move(column_name), idx, is_writetime);
}
virtual void add_input(serialization_format sf, result_set_builder& rs) override {
virtual void add_input(cql_serialization_format sf, result_set_builder& rs) override {
if (_is_writetime) {
int64_t ts = rs.timestamp_of(_idx);
if (ts != api::missing_timestamp) {
@@ -108,7 +108,7 @@ public:
}
}
virtual bytes_opt get_output(serialization_format sf) override {
virtual bytes_opt get_output(cql_serialization_format sf) override {
return _current;
}

View File

@@ -120,7 +120,7 @@ sets::literal::to_string() const {
}
sets::value
sets::value::from_serialized(bytes_view v, set_type type, serialization_format sf) {
sets::value::from_serialized(bytes_view v, set_type type, cql_serialization_format sf) {
try {
// Collections have this small hack that validate cannot be called on a serialized object,
// but compose does the validation (so we're fine).
@@ -138,11 +138,11 @@ sets::value::from_serialized(bytes_view v, set_type type, serialization_format s
bytes_opt
sets::value::get(const query_options& options) {
return get_with_protocol_version(options.get_serialization_format());
return get_with_protocol_version(options.get_cql_serialization_format());
}
bytes
sets::value::get_with_protocol_version(serialization_format sf) {
sets::value::get_with_protocol_version(cql_serialization_format sf) {
return collection_type_impl::pack(_elements.begin(), _elements.end(),
_elements.size(), sf);
}
@@ -215,7 +215,7 @@ sets::marker::bind(const query_options& options) {
return nullptr;
} else {
auto as_set_type = static_pointer_cast<const set_type_impl>(_receiver->type);
return make_shared(value::from_serialized(*value, as_set_type, options.get_serialization_format()));
return make_shared(value::from_serialized(*value, as_set_type, options.get_cql_serialization_format()));
}
}
@@ -258,16 +258,14 @@ sets::adder::do_add(mutation& m, const exploded_clustering_prefix& row_key, cons
auto smut = set_type->serialize_mutation_form(mut);
m.set_cell(row_key, column, std::move(smut));
} else {
} else if (set_value != nullptr) {
// for frozen sets, we're overwriting the whole cell
auto v = set_type->serialize_partially_deserialized_form(
{set_value->_elements.begin(), set_value->_elements.end()},
serialization_format::internal());
if (set_value->_elements.empty()) {
m.set_cell(row_key, column, params.make_dead_cell());
} else {
m.set_cell(row_key, column, params.make_cell(std::move(v)));
}
cql_serialization_format::internal());
m.set_cell(row_key, column, params.make_cell(std::move(v)));
} else {
m.set_cell(row_key, column, params.make_dead_cell());
}
}

View File
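The `do_add` hunk above corrects the frozen-set branch: only a null bound value now produces a dead cell, while an empty set is serialized as a live, empty collection value. A synchronous toy model of that decision, with illustrative types:

```cpp
#include <cassert>
#include <cstddef>
#include <optional>
#include <set>
#include <string>

// Hypothetical cell: live flag plus element count of the stored value.
struct cell_sketch {
    bool live;
    size_t elements;
};

// Mirrors the fixed branch order for frozen sets: null deletes the cell,
// anything else (including an empty set) overwrites it with a live value.
cell_sketch frozen_set_cell(const std::optional<std::set<std::string>>& v) {
    if (!v) {
        return {false, 0};      // null -> tombstone (dead cell)
    }
    return {true, v->size()};   // empty or not, write the serialized set
}
```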

@@ -78,9 +78,9 @@ public:
value(std::set<bytes, serialized_compare> elements)
: _elements(std::move(elements)) {
}
static value from_serialized(bytes_view v, set_type type, serialization_format sf);
static value from_serialized(bytes_view v, set_type type, cql_serialization_format sf);
virtual bytes_opt get(const query_options& options) override;
virtual bytes get_with_protocol_version(serialization_format sf) override;
virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
bool equals(set_type st, const value& v);
virtual sstring to_string() const override;
};

View File

@@ -169,26 +169,21 @@ public:
}
private:
future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now) {
struct collector {
std::vector<mutation> _result;
std::vector<mutation> get() && { return std::move(_result); }
void operator()(std::vector<mutation> more) {
std::move(more.begin(), more.end(), std::back_inserter(_result));
}
};
auto get_mutations_for_statement = [this, &storage, &options, now, local] (size_t i) {
auto&& statement = _statements[i];
auto&& statement_options = options.for_statement(i);
auto timestamp = _attrs->get_timestamp(now, statement_options);
return statement->get_mutations(storage, statement_options, local, timestamp);
};
// FIXME: origin tries hard to merge mutations to same keyspace, for
// some reason.
return map_reduce(
boost::make_counting_iterator<size_t>(0),
boost::make_counting_iterator<size_t>(_statements.size()),
get_mutations_for_statement,
collector());
// Do not process in parallel because operations like list append/prepend depend on execution order.
return do_with(std::vector<mutation>(), [this, &storage, &options, now, local] (auto&& result) {
return do_for_each(boost::make_counting_iterator<size_t>(0),
boost::make_counting_iterator<size_t>(_statements.size()),
[this, &storage, &options, now, local, &result] (size_t i) {
auto&& statement = _statements[i];
auto&& statement_options = options.for_statement(i);
auto timestamp = _attrs->get_timestamp(now, statement_options);
return statement->get_mutations(storage, statement_options, local, timestamp).then([&result] (auto&& more) {
std::move(more.begin(), more.end(), std::back_inserter(result));
});
}).then([&result] {
return std::move(result);
});
});
}
public:

View File
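The rewrite above replaces a parallel `map_reduce` over the batch's statements with a sequential `do_for_each`, because operations like list append/prepend depend on execution order. A plain synchronous analogue of the sequential collection loop (the real code threads this through seastar futures):

```cpp
#include <cassert>
#include <functional>
#include <iterator>
#include <string>
#include <vector>

// Collect each statement's mutations strictly in statement order and
// append them to one result vector -- no reordering, unlike map_reduce.
std::vector<std::string> collect_mutations(
        const std::vector<std::function<std::vector<std::string>()>>& statements) {
    std::vector<std::string> result;
    for (auto& stmt : statements) {
        auto more = stmt();
        result.insert(result.end(),
                      std::make_move_iterator(more.begin()),
                      std::make_move_iterator(more.end()));
    }
    return result;
}
```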

@@ -44,6 +44,7 @@
#include <regex>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/algorithm/adjacent_find.hpp>
#include "cql3/statements/create_table_statement.hh"
@@ -173,13 +174,12 @@ create_table_statement::raw_statement::raw_statement(::shared_ptr<cf_name> name,
throw exceptions::invalid_request_exception(sprint("Table names shouldn't be more than %d characters long (got \"%s\")", schema::NAME_LENGTH, cf_name.c_str()));
}
for (auto&& entry : _defined_names) {
auto c = std::count_if(_defined_names.begin(), _defined_names.end(), [&entry] (auto e) {
return entry->text() == e->text();
});
if (c > 1) {
throw exceptions::invalid_request_exception(sprint("Multiple definition of identifier %s", entry->text().c_str()));
}
// Check for duplicate column names
auto i = boost::range::adjacent_find(_defined_names, [] (auto&& e1, auto&& e2) {
return e1->text() == e2->text();
});
if (i != _defined_names.end()) {
throw exceptions::invalid_request_exception(sprint("Multiple definition of identifier %s", (*i)->text()));
}
properties->validate();

View File

@@ -51,6 +51,7 @@
#include "core/shared_ptr.hh"
#include <seastar/util/indirect.hh>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -139,7 +140,8 @@ private:
create_table_statement::column_set_type _static_columns;
bool _use_compact_storage = false;
std::multiset<::shared_ptr<column_identifier>> _defined_names;
std::multiset<::shared_ptr<column_identifier>,
indirect_less<::shared_ptr<column_identifier>, column_identifier::text_comparator>> _defined_names;
bool _if_not_exists;
public:
raw_statement(::shared_ptr<cf_name> name, bool if_not_exists);

View File
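The two hunks above work together: `_defined_names` becomes an ordered multiset compared by the identifier's text, so duplicate names sort next to each other and a single `adjacent_find` pass replaces the quadratic `count_if`-per-entry check. A minimal sketch of the same idea:

```cpp
#include <algorithm>
#include <cassert>
#include <set>
#include <string>

// In an ordered multiset, equal elements are adjacent, so one linear
// adjacent_find scan detects any duplicate definition.
bool has_duplicate(const std::multiset<std::string>& names) {
    return std::adjacent_find(names.begin(), names.end()) != names.end();
}
```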

@@ -0,0 +1,156 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cql3/statements/create_type_statement.hh"
namespace cql3 {
namespace statements {
create_type_statement::create_type_statement(const ut_name& name, bool if_not_exists)
: _name{name}
, _if_not_exists{if_not_exists}
{
}
void create_type_statement::prepare_keyspace(const service::client_state& state)
{
if (!_name.has_keyspace()) {
_name.set_keyspace(state.get_keyspace());
}
}
void create_type_statement::add_definition(::shared_ptr<column_identifier> name, ::shared_ptr<cql3_type::raw> type)
{
_column_names.emplace_back(name);
_column_types.emplace_back(type);
}
void create_type_statement::check_access(const service::client_state& state)
{
warn(unimplemented::cause::PERMISSIONS);
#if 0
state.hasKeyspaceAccess(keyspace(), Permission.CREATE);
#endif
}
void create_type_statement::validate(distributed<service::storage_proxy>&, const service::client_state& state)
{
#if 0
KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
if (ksm == null)
throw new InvalidRequestException(String.format("Cannot add type in unknown keyspace %s", name.getKeyspace()));
if (ksm.userTypes.getType(name.getUserTypeName()) != null && !ifNotExists)
throw new InvalidRequestException(String.format("A user type of name %s already exists", name));
for (CQL3Type.Raw type : columnTypes)
if (type.isCounter())
throw new InvalidRequestException("A user type cannot contain counters");
#endif
}
#if 0
public static void checkForDuplicateNames(UserType type) throws InvalidRequestException
{
for (int i = 0; i < type.size() - 1; i++)
{
ByteBuffer fieldName = type.fieldName(i);
for (int j = i+1; j < type.size(); j++)
{
if (fieldName.equals(type.fieldName(j)))
throw new InvalidRequestException(String.format("Duplicate field name %s in type %s",
UTF8Type.instance.getString(fieldName),
UTF8Type.instance.getString(type.name)));
}
}
}
#endif
shared_ptr<transport::event::schema_change> create_type_statement::change_event()
{
using namespace transport;
return make_shared<transport::event::schema_change>(event::schema_change::change_type::CREATED,
event::schema_change::target_type::TYPE,
keyspace(),
_name.get_string_type_name());
}
const sstring& create_type_statement::keyspace() const
{
return _name.get_keyspace();
}
#if 0
private UserType createType() throws InvalidRequestException
{
List<ByteBuffer> names = new ArrayList<>(columnNames.size());
for (ColumnIdentifier name : columnNames)
names.add(name.bytes);
List<AbstractType<?>> types = new ArrayList<>(columnTypes.size());
for (CQL3Type.Raw type : columnTypes)
types.add(type.prepare(keyspace()).getType());
return new UserType(name.getKeyspace(), name.getUserTypeName(), names, types);
}
#endif
future<bool> create_type_statement::announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only)
{
throw std::runtime_error("User-defined types are not supported yet");
#if 0
KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
assert ksm != null; // should not have passed validation otherwise
// Can happen with ifNotExists
if (ksm.userTypes.getType(name.getUserTypeName()) != null)
return false;
UserType type = createType();
checkForDuplicateNames(type);
MigrationManager.announceNewType(type, isLocalOnly);
return true;
#endif
}
}
}

View File

@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "cql3/statements/schema_altering_statement.hh"
#include "cql3/cql3_type.hh"
#include "cql3/ut_name.hh"
namespace cql3 {
namespace statements {
class create_type_statement : public schema_altering_statement {
ut_name _name;
std::vector<::shared_ptr<column_identifier>> _column_names;
std::vector<::shared_ptr<cql3_type::raw>> _column_types;
bool _if_not_exists;
public:
create_type_statement(const ut_name& name, bool if_not_exists);
virtual void prepare_keyspace(const service::client_state& state) override;
void add_definition(::shared_ptr<column_identifier> name, ::shared_ptr<cql3_type::raw> type);
virtual void check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;
virtual shared_ptr<transport::event::schema_change> change_event() override;
virtual const sstring& keyspace() const override;
virtual future<bool> announce_migration(distributed<service::storage_proxy>& proxy, bool is_local_only) override;
};
}
}


@@ -186,11 +186,30 @@ modification_statement::make_update_parameters(
class prefetch_data_builder {
update_parameters::prefetch_data& _data;
const query::partition_slice& _ps;
schema_ptr _schema;
std::experimental::optional<partition_key> _pkey;
private:
void add_cell(update_parameters::prefetch_data::row& cells, const column_definition& def, const std::experimental::optional<bytes_view>& cell) {
if (cell) {
auto ctype = static_pointer_cast<const collection_type_impl>(def.type);
if (!ctype->is_multi_cell()) {
throw std::logic_error(sprint("cannot prefetch frozen collection: %s", def.name_as_text()));
}
auto map_type = map_type_impl::get_instance(ctype->name_comparator(), ctype->value_comparator(), true);
update_parameters::prefetch_data::cell_list list;
// FIXME: Iterate over a range instead of fully exploded collection
auto dv = map_type->deserialize(*cell);
for (auto&& el : value_cast<map_type_impl::native_type>(dv)) {
list.emplace_back(update_parameters::prefetch_data::cell{el.first.serialize(), el.second.serialize()});
}
cells.emplace(def.id, std::move(list));
}
};
public:
prefetch_data_builder(update_parameters::prefetch_data& data, const query::partition_slice& ps)
prefetch_data_builder(schema_ptr s, update_parameters::prefetch_data& data, const query::partition_slice& ps)
: _data(data)
, _ps(ps)
, _schema(std::move(s))
{ }
void accept_new_partition(const partition_key& key, uint32_t row_count) {
@@ -205,20 +224,9 @@ public:
const query::result_row_view& row) {
update_parameters::prefetch_data::row cells;
auto add_cell = [&cells] (column_id id, std::experimental::optional<collection_mutation_view>&& cell) {
if (cell) {
cells.emplace(id, collection_mutation{to_bytes(cell->data)});
}
};
auto static_row_iterator = static_row.iterator();
for (auto&& id : _ps.static_columns) {
add_cell(id, static_row_iterator.next_collection_cell());
}
auto row_iterator = row.iterator();
for (auto&& id : _ps.regular_columns) {
add_cell(id, row_iterator.next_collection_cell());
add_cell(cells, _schema->regular_column_at(id), row_iterator.next_collection_cell());
}
_data.rows.emplace(std::make_pair(*_pkey, key), std::move(cells));
@@ -228,7 +236,16 @@ public:
assert(0);
}
void accept_partition_end(const query::result_row_view& static_row) {}
void accept_partition_end(const query::result_row_view& static_row) {
update_parameters::prefetch_data::row cells;
auto static_row_iterator = static_row.iterator();
for (auto&& id : _ps.static_columns) {
add_cell(cells, _schema->static_column_at(id), static_row_iterator.next_collection_cell());
}
_data.rows.emplace(std::make_pair(*_pkey, std::experimental::nullopt), std::move(cells));
}
};
future<update_parameters::prefetched_rows_type>
@@ -265,7 +282,8 @@ modification_statement::read_required_rows(
std::move(regular_cols),
query::partition_slice::option_set::of<
query::partition_slice::option::send_partition_key,
query::partition_slice::option::send_clustering_key>());
query::partition_slice::option::send_clustering_key,
query::partition_slice::option::collections_as_maps>());
std::vector<query::partition_range> pr;
for (auto&& pk : *keys) {
pr.emplace_back(dht::global_partitioner().decorate_key(*s, pk));
@@ -278,7 +296,7 @@ modification_statement::read_required_rows(
bytes_ostream buf(result->buf());
query::result_view v(buf.linearize());
auto prefetched_rows = update_parameters::prefetched_rows_type({update_parameters::prefetch_data(s)});
v.consume(ps, prefetch_data_builder(prefetched_rows.value(), ps));
v.consume(ps, prefetch_data_builder(s, prefetched_rows.value(), ps));
return prefetched_rows;
});
}


@@ -117,6 +117,11 @@ select_statement::for_selection(schema_ptr schema, ::shared_ptr<selection::selec
::shared_ptr<term>{});
}
::shared_ptr<cql3::metadata> select_statement::get_result_metadata() const {
// FIXME: COUNT needs special result metadata handling.
return _selection->get_result_metadata();
}
uint32_t select_statement::get_bound_terms() {
return _bound_terms;
}
@@ -170,7 +175,7 @@ select_statement::make_partition_slice(const query_options& options) {
if (_parameters->is_distinct()) {
_opts.set(query::partition_slice::option::distinct);
return query::partition_slice({ query::clustering_range::make_open_ended_both_sides() },
std::move(static_columns), {}, _opts);
std::move(static_columns), {}, _opts, nullptr, options.get_cql_serialization_format());
}
auto bounds = _restrictions->get_clustering_bounds(options);
@@ -179,7 +184,7 @@ select_statement::make_partition_slice(const query_options& options) {
std::reverse(bounds.begin(), bounds.end());
}
return query::partition_slice(std::move(bounds),
std::move(static_columns), std::move(regular_columns), _opts);
std::move(static_columns), std::move(regular_columns), _opts, nullptr, options.get_cql_serialization_format());
}
int32_t select_statement::get_limit(const query_options& options) const {
@@ -246,7 +251,7 @@ select_statement::execute(distributed<service::storage_proxy>& proxy, service::q
if (aggregate) {
return do_with(
cql3::selection::result_set_builder(*_selection, now,
options.get_serialization_format()),
options.get_cql_serialization_format()),
[p, page_size, now](auto& builder) {
return do_until([p] {return p->is_exhausted();},
[p, &builder, page_size, now] {
@@ -338,8 +343,8 @@ shared_ptr<transport::messages::result_message> select_statement::process_result
db_clock::time_point now) {
cql3::selection::result_set_builder builder(*_selection, now,
options.get_serialization_format());
query::result_view::consume(results->buf(), cmd->slice,
options.get_cql_serialization_format());
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection));
auto rs = builder.build();
@@ -529,9 +534,12 @@ select_statement::raw_statement::get_ordering_comparator(schema_ptr schema,
}
bool select_statement::raw_statement::is_reversed(schema_ptr schema) {
std::experimental::optional<bool> reversed_map[schema->clustering_key_size()];
uint32_t i = 0;
assert(_parameters->orderings().size() > 0);
parameters::orderings_type::size_type i = 0;
bool is_reversed_ = false;
bool relation_order_unsupported = false;
for (auto&& e : _parameters->orderings()) {
::shared_ptr<column_identifier> column = e.first->prepare_column_identifier(schema);
bool reversed = e.second;
@@ -551,32 +559,23 @@ bool select_statement::raw_statement::is_reversed(schema_ptr schema) {
"Order by currently only support the ordering of columns following their declared order in the PRIMARY KEY");
}
reversed_map[i] = std::experimental::make_optional(reversed != def->type->is_reversed());
bool current_reverse_status = (reversed != def->type->is_reversed());
if (i == 0) {
is_reversed_ = current_reverse_status;
}
if (is_reversed_ != current_reverse_status) {
relation_order_unsupported = true;
}
++i;
}
// GCC incorrectly complains about "*is_reversed_" below
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
// Check that all bools in reversed_map, if set, agree
std::experimental::optional<bool> is_reversed_{};
for (auto&& b : reversed_map) {
if (b) {
if (!is_reversed_) {
is_reversed_ = b;
} else {
if ((*is_reversed_) != *b) {
throw exceptions::invalid_request_exception("Unsupported order by relation");
}
}
}
if (relation_order_unsupported) {
throw exceptions::invalid_request_exception("Unsupported order by relation");
}
assert(is_reversed_);
return *is_reversed_;
#pragma GCC diagnostic pop
return is_reversed_;
}
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */

View File

@@ -121,6 +121,7 @@ public:
static ::shared_ptr<select_statement> for_selection(
schema_ptr schema, ::shared_ptr<selection::selection> selection);
::shared_ptr<cql3::metadata> get_result_metadata() const;
virtual uint32_t get_bound_terms() override;
virtual void check_access(const service::client_state& state) override;
virtual void validate(distributed<service::storage_proxy>&, const service::client_state& state) override;


@@ -78,7 +78,7 @@ void update_statement::add_update_for_key(mutation& m, const exploded_clustering
// If there are static columns, there also must be clustering columns, in which
// case empty prefix can only refer to the static row.
bool is_static_prefix = s->has_static_columns() && !prefix;
if (type == statement_type::INSERT && !is_static_prefix) {
if (type == statement_type::INSERT && !is_static_prefix && s->is_cql3_table()) {
auto& row = m.partition().clustered_row(clustering_key::from_clustering_prefix(*s, prefix));
row.apply(row_marker(params.timestamp(), params.ttl(), params.expiry()));
}
@@ -137,19 +137,17 @@ update_statement::parsed_insert::prepare_internal(database& db, schema_ptr schem
throw exceptions::invalid_request_exception("No columns provided to INSERT");
}
std::unordered_set<bytes> column_ids;
for (size_t i = 0; i < _column_names.size(); i++) {
auto id = _column_names[i]->prepare_column_identifier(schema);
auto def = get_column_definition(schema, *id);
if (!def) {
throw exceptions::invalid_request_exception(sprint("Unknown identifier %s", *id));
}
for (size_t j = 0; j < i; j++) {
auto other_id = _column_names[j]->prepare_column_identifier(schema);
if (*id == *other_id) {
throw exceptions::invalid_request_exception(sprint("Multiple definitions found for column %s", *id));
}
if (column_ids.count(id->name())) {
throw exceptions::invalid_request_exception(sprint("Multiple definitions found for column %s", *id));
}
column_ids.emplace(id->name());
auto&& value = _column_values[i];


@@ -205,7 +205,7 @@ class collection_terminal {
public:
virtual ~collection_terminal() {}
/** Gets the value of the collection when serialized with the given protocol version format */
virtual bytes get_with_protocol_version(serialization_format sf) = 0;
virtual bytes get_with_protocol_version(cql_serialization_format sf) = 0;
};
/**


@@ -202,12 +202,12 @@ public:
buffers[i] = to_bytes_opt(_elements[i]->bind_and_get(options));
// Inside tuples, we must force the serialization of collections to v3 whatever protocol
// version is in use since we're going to store directly that serialized value.
if (options.get_serialization_format() != serialization_format::internal()
if (options.get_cql_serialization_format() != cql_serialization_format::internal()
&& _type->type(i)->is_collection()) {
if (buffers[i]) {
buffers[i] = static_pointer_cast<const collection_type_impl>(_type->type(i))->reserialize(
options.get_serialization_format(),
serialization_format::internal(),
options.get_cql_serialization_format(),
cql_serialization_format::internal(),
bytes_view(*buffers[i]));
}
}
@@ -251,7 +251,7 @@ public:
try {
// Collections have this small hack that validate cannot be called on a serialized object,
// but the deserialization does the validation (so we're fine).
auto l = value_cast<list_type_impl::native_type>(type->deserialize(value, options.get_serialization_format()));
auto l = value_cast<list_type_impl::native_type>(type->deserialize(value, options.get_cql_serialization_format()));
auto ttype = dynamic_pointer_cast<const tuple_type_impl>(type->get_elements_type());
assert(ttype);


@@ -43,17 +43,17 @@
namespace cql3 {
std::experimental::optional<collection_mutation_view>
const update_parameters::prefetch_data::cell_list*
update_parameters::get_prefetched_list(
const partition_key& pkey,
const clustering_key& row_key,
partition_key pkey,
std::experimental::optional<clustering_key> ckey,
const column_definition& column) const
{
if (!_prefetched) {
return {};
}
auto i = _prefetched->rows.find(std::make_pair(pkey, row_key));
auto i = _prefetched->rows.find(std::make_pair(std::move(pkey), std::move(ckey)));
if (i == _prefetched->rows.end()) {
return {};
}
@@ -63,7 +63,7 @@ update_parameters::get_prefetched_list(
if (j == row.end()) {
return {};
}
return {j->second};
return &j->second;
}
update_parameters::prefetch_data::prefetch_data(schema_ptr schema)


@@ -58,8 +58,9 @@ namespace cql3 {
*/
class update_parameters final {
public:
// Holder for data needed by CQL list updates which depend on current state of the list.
struct prefetch_data {
using key = std::pair<partition_key, clustering_key>;
using key = std::pair<partition_key, std::experimental::optional<clustering_key>>;
struct key_hashing {
partition_key::hashing pk_hash;
clustering_key::hashing ck_hash;
@@ -70,7 +71,7 @@ public:
{ }
size_t operator()(const key& k) const {
return pk_hash(k.first) ^ ck_hash(k.second);
return pk_hash(k.first) ^ (k.second ? ck_hash(*k.second) : 0);
}
};
struct key_equality {
@@ -83,10 +84,16 @@ public:
{ }
bool operator()(const key& k1, const key& k2) const {
return pk_eq(k1.first, k2.first) && ck_eq(k1.second, k2.second);
return pk_eq(k1.first, k2.first)
&& bool(k1.second) == bool(k2.second) && (!k1.second || ck_eq(*k1.second, *k2.second));
}
};
using row = std::unordered_map<column_id, collection_mutation>;
struct cell {
bytes key;
bytes value;
};
using cell_list = std::vector<cell>;
using row = std::unordered_map<column_id, cell_list>;
public:
std::unordered_map<key, row, key_hashing, key_equality> rows;
schema_ptr schema;
@@ -183,8 +190,11 @@ public:
return _timestamp;
}
std::experimental::optional<collection_mutation_view> get_prefetched_list(
const partition_key& pkey, const clustering_key& row_key, const column_definition& column) const;
const prefetch_data::cell_list*
get_prefetched_list(
partition_key pkey,
std::experimental::optional<clustering_key> ckey,
const column_definition& column) const;
};
}


@@ -161,15 +161,15 @@ void user_types::delayed_value::collect_marker_specification(shared_ptr<variable
}
std::vector<bytes_opt> user_types::delayed_value::bind_internal(const query_options& options) {
auto sf = options.get_serialization_format();
auto sf = options.get_cql_serialization_format();
std::vector<bytes_opt> buffers;
for (size_t i = 0; i < _type->size(); ++i) {
buffers.push_back(to_bytes_opt(_values[i]->bind_and_get(options)));
// Inside UDT values, we must force the serialization of collections to v3 whatever protocol
// version is in use since we're going to store directly that serialized value.
if (sf != serialization_format::use_32_bit() && _type->field_type(i)->is_collection() && buffers.back()) {
if (!sf.collection_format_unchanged() && _type->field_type(i)->is_collection() && buffers.back()) {
auto&& ctype = static_pointer_cast<const collection_type_impl>(_type->field_type(i));
buffers.back() = ctype->reserialize(sf, serialization_format::use_32_bit(), bytes_view(*buffers.back()));
buffers.back() = ctype->reserialize(sf, cql_serialization_format::latest(), bytes_view(*buffers.back()));
}
}
return buffers;


@@ -56,7 +56,7 @@ void ut_name::set_keyspace(sstring keyspace) {
_ks_name = std::experimental::optional<sstring>{keyspace};
}
sstring ut_name::get_keyspace() const {
const sstring& ut_name::get_keyspace() const {
return _ks_name.value();
}


@@ -58,7 +58,7 @@ public:
void set_keyspace(sstring keyspace);
sstring get_keyspace() const;
const sstring& get_keyspace() const;
bytes get_user_type_name() const;


@@ -0,0 +1,52 @@
/*
* Copyright (C) 2015 Cloudius Systems, Ltd.
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <iostream>
using cql_protocol_version_type = uint8_t;
// Abstraction of transport protocol-dependent serialization format
// Protocols v1, v2 used 16 bits for collection sizes, while v3 and
// above use 32 bits. But letting every bit of the code know what
// transport protocol we're using (and in some cases, we aren't using
// any transport -- it's for internal storage) is bad, so abstract it
// away here.
class cql_serialization_format {
cql_protocol_version_type _version;
public:
static constexpr cql_protocol_version_type latest_version = 3;
explicit cql_serialization_format(cql_protocol_version_type version) : _version(version) {}
static cql_serialization_format latest() { return cql_serialization_format{latest_version}; }
static cql_serialization_format internal() { return latest(); }
bool using_32_bits_for_collections() const { return _version >= 3; }
bool operator==(cql_serialization_format x) const { return _version == x._version; }
bool operator!=(cql_serialization_format x) const { return !operator==(x); }
cql_protocol_version_type protocol_version() const { return _version; }
friend std::ostream& operator<<(std::ostream& out, const cql_serialization_format& sf) {
return out << static_cast<int>(sf._version);
}
bool collection_format_unchanged(cql_serialization_format other = cql_serialization_format::latest()) const {
return using_32_bits_for_collections() == other.using_32_bits_for_collections();
}
};

File diff suppressed because it is too large


@@ -41,6 +41,7 @@
#include <set>
#include <iostream>
#include <boost/functional/hash.hpp>
#include <boost/range/algorithm/find.hpp>
#include <experimental/optional>
#include <string.h>
#include "types.hh"
@@ -56,7 +57,6 @@
#include "tombstone.hh"
#include "atomic_cell.hh"
#include "query-request.hh"
#include "query-result.hh"
#include "keys.hh"
#include "mutation.hh"
#include "memtable.hh"
@@ -71,6 +71,7 @@
#include "sstables/compaction.hh"
#include "key_reader.hh"
#include <seastar/core/rwlock.hh>
#include <seastar/core/shared_future.hh>
class frozen_mutation;
class reconcilable_result;
@@ -97,9 +98,132 @@ void make(database& db, bool durable, bool volatile_testing_only);
}
}
class throttle_state {
size_t _max_space;
logalloc::region_group& _region_group;
throttle_state* _parent;
circular_buffer<promise<>> _throttled_requests;
timer<> _throttling_timer{[this] { unthrottle(); }};
void unthrottle();
bool should_throttle() const {
if (_region_group.memory_used() > _max_space) {
return true;
}
if (_parent) {
return _parent->should_throttle();
}
return false;
}
public:
throttle_state(size_t max_space, logalloc::region_group& region, throttle_state* parent = nullptr)
: _max_space(max_space)
, _region_group(region)
, _parent(parent)
{}
future<> throttle();
};
class replay_position_reordered_exception : public std::exception {};
using memtable_list = std::vector<lw_shared_ptr<memtable>>;
// We could just add all memtables, regardless of types, to a single list, and
// then filter them out when we read them. Here's why I have chosen not to do
// it:
//
// First, some of the methods in which a memtable is involved (like seal)
// assume a commitlog, and take great care to update the replay
// position, flush the log, etc. We want to bypass those, and that has to
// be done either by sprinkling the seal code with conditionals, or having a
// separate method for each seal.
//
// Also, if we ever want to put some of the memtables in a separate allocator
// region group to provide for extra QoS, having the classes properly wrapped
// will make that trivial: just pass a version of new_memtable() that puts it
// in a different region, while the list approach would require a lot of
// conditionals as well.
//
// If we are going to have different methods, better have different instances
// of a common class.
class memtable_list {
using shared_memtable = lw_shared_ptr<memtable>;
std::vector<shared_memtable> _memtables;
std::function<future<> ()> _seal_fn;
std::function<schema_ptr()> _current_schema;
size_t _max_memtable_size;
logalloc::region_group* _dirty_memory_region_group;
public:
memtable_list(std::function<future<> ()> seal_fn, std::function<schema_ptr()> cs, size_t max_memtable_size, logalloc::region_group* region_group)
: _memtables({})
, _seal_fn(seal_fn)
, _current_schema(cs)
, _max_memtable_size(max_memtable_size)
, _dirty_memory_region_group(region_group) {
add_memtable();
}
shared_memtable back() {
return _memtables.back();
}
// The caller has to make sure the element exists before calling this.
void erase(const shared_memtable& element) {
_memtables.erase(boost::range::find(_memtables, element));
}
void clear() {
_memtables.clear();
}
size_t size() const {
return _memtables.size();
}
future<> seal_active_memtable() {
return _seal_fn();
}
auto begin() noexcept {
return _memtables.begin();
}
auto begin() const noexcept {
return _memtables.begin();
}
auto end() noexcept {
return _memtables.end();
}
auto end() const noexcept {
return _memtables.end();
}
memtable& active_memtable() {
return *_memtables.back();
}
void add_memtable() {
_memtables.emplace_back(new_memtable());
}
bool should_flush() {
return active_memtable().occupancy().total_space() >= _max_memtable_size;
}
void seal_on_overflow() {
if (should_flush()) {
// FIXME: if sparse, do some in-memory compaction first
// FIXME: maybe merge with other in-memory memtables
_seal_fn();
}
}
private:
lw_shared_ptr<memtable> new_memtable() {
return make_lw_shared<memtable>(_current_schema(), _dirty_memory_region_group);
}
};
using sstable_list = sstables::sstable_list;
// The CF has a "stats" structure. But we don't want all fields here,
@@ -122,7 +246,9 @@ public:
bool enable_commitlog = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
::cf_stats* cf_stats = nullptr;
};
struct no_commitlog {};
@@ -154,14 +280,43 @@ private:
config _config;
stats _stats;
lw_shared_ptr<memtable_list> _memtables;
// In older incarnations, we simply committed the mutations to memtables.
// However, doing that makes it harder for us to provide QoS within the
// disk subsystem. Keeping them in separate memtables allows us to properly
// classify those streams into their own I/O class
//
// We could write those directly to disk, but we still want the mutations
// coming through the wire to go to a memtable staging area. This has two
// major advantages:
//
// first, it will allow us to properly order the partitions. They are
// hopefully sent in order but we can't really guarantee that without
// sacrificing sender-side parallelism.
//
// second, we will be able to coalesce writes from multiple plan_id's and
// even multiple senders, as well as automatically tapping into the dirty
// memory throttling mechanism, guaranteeing we will not overload the
// server.
lw_shared_ptr<memtable_list> _streaming_memtables;
lw_shared_ptr<memtable_list> make_memtable_list();
lw_shared_ptr<memtable_list> make_streaming_memtable_list();
// generation -> sstable. Ordered by key so we can easily get the most recent.
lw_shared_ptr<sstable_list> _sstables;
// sstables that have been compacted (so don't look up in query) but
// have not been deleted yet, so must not GC any tombstones in other sstables
// that may delete data in these sstables:
std::vector<sstables::shared_sstable> _sstables_compacted_but_not_deleted;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
mutable row_cache _cache; // Cache covers only sstables.
int64_t _sstable_generation = 1;
unsigned _mutation_count = 0;
std::experimental::optional<int64_t> _sstable_generation = {};
db::replay_position _highest_flushed_rp;
// Provided by the database that owns this commitlog
db::commitlog* _commitlog;
@@ -172,30 +327,43 @@ private:
int _compaction_disabled = 0;
class memtable_flush_queue;
std::unique_ptr<memtable_flush_queue> _flush_queue;
// Store generation of sstables being compacted at the moment. That's needed to prevent a
// sstable from being compacted twice.
std::unordered_set<unsigned long> _compacting_generations;
// Because streaming mutations bypass the commitlog, there is
// no need for the complications of the flush queue. Besides, it
// is easier to just use a common gate than it is to modify the flush_queue
// to work both with and without a replay position.
//
// Last but not least, we seldom need to guarantee any ordering here: as long
// as all data is waited for, we're good.
seastar::gate _streaming_flush_gate;
private:
void update_stats_for_new_sstable(uint64_t new_sstable_data_size);
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable);
void add_sstable(sstables::sstable&& sstable);
void add_sstable(lw_shared_ptr<sstables::sstable> sstable);
void add_memtable();
lw_shared_ptr<memtable> new_memtable();
lw_shared_ptr<memtable> new_streaming_memtable();
future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables);
struct merge_comparator;
// update the sstable generation, making sure that new sstables don't overwrite this one.
void update_sstables_known_generation(unsigned generation) {
_sstable_generation = std::max<uint64_t>(_sstable_generation, generation / smp::count + 1);
if (!_sstable_generation) {
_sstable_generation = 1;
}
_sstable_generation = std::max<uint64_t>(*_sstable_generation, generation / smp::count + 1);
}
uint64_t calculate_generation_for_new_table() {
return _sstable_generation++ * smp::count + engine().cpu_id();
assert(_sstable_generation);
// FIXME: better way of ensuring we don't attempt to
// overwrite an existing table.
return (*_sstable_generation)++ * smp::count + engine().cpu_id();
}
// Rebuild existing _sstables with new_sstables added to it and sstables_to_remove removed from it.
void rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
const std::vector<sstables::shared_sstable>& sstables_to_remove);
void rebuild_statistics();
private:
// Creates a mutation reader which covers sstables.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
@@ -207,7 +375,29 @@ private:
key_source sstables_as_key_source() const;
partition_presence_checker make_partition_presence_checker(lw_shared_ptr<sstable_list> old_sstables);
std::chrono::steady_clock::time_point _sstable_writes_disabled_at;
void do_trigger_compaction();
public:
// This function should be called when this column family is ready for writes, IOW,
// to produce SSTables. Extensive details about why this is important can be found
// in Scylla's Github Issue #1014
//
// Nothing should be writing to SSTables before we have the chance to populate the
// existing SSTables and calculate what the next generation number should be.
//
// However, if that happens, we want to protect against it in a way that does not
// involve overwriting existing tables. This is one of the ways to do it: every
// column family starts in an unwriteable state, and when it can finally be written
// to, we mark it as writeable.
//
// Note that this *cannot* be a part of add_column_family. That adds a column family
// to a db in memory only, and if anybody is about to write to a CF, that was most
// likely already called. We need to call this explicitly when we are sure we're ready
// to issue disk operations safely.
void mark_ready_for_writes() {
update_sstables_known_generation(0);
}
// Creates a mutation reader which covers all data sources for this column family.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// Note: for data queries use query() instead.
@@ -227,7 +417,7 @@ public:
// FIXME: in case a query is satisfied from a single memtable, avoid a copy
using const_mutation_partition_ptr = std::unique_ptr<const mutation_partition>;
using const_row_ptr = std::unique_ptr<const row>;
memtable& active_memtable() { return *_memtables->back(); }
memtable& active_memtable() { return _memtables->active_memtable(); }
const row_cache& get_row_cache() const {
return _cache;
}
@@ -252,10 +442,11 @@ public:
// The mutation is always upgraded to current schema.
void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
void apply(const mutation& m, const db::replay_position& = db::replay_position());
void apply_streaming_mutation(schema_ptr, const frozen_mutation&);
// Returns at most "cmd.limit" rows
future<lw_shared_ptr<query::result>> query(schema_ptr,
const query::read_command& cmd,
const query::read_command& cmd, query::result_request request,
const std::vector<query::partition_range>& ranges);
future<> populate(sstring datadir);
@@ -264,6 +455,7 @@ public:
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
future<> flush_streaming_mutations(std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
void clear(); // discards memtable(s) without flushing them to disk.
future<db::replay_position> discard_sstables(db_clock::time_point);
@@ -274,14 +466,19 @@ public:
future<int64_t> disable_sstable_write() {
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
return _sstables_lock.write_lock().then([this] {
return make_ready_future<int64_t>((*_sstables->end()).first);
if (_sstables->empty()) {
return make_ready_future<int64_t>(0);
}
return make_ready_future<int64_t>((*_sstables->rbegin()).first);
});
}
// SSTable writes are now allowed again, and generation is updated to new_generation
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
// returns the amount of microseconds elapsed since we disabled writes.
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
update_sstables_known_generation(new_generation);
if (new_generation != -1) {
update_sstables_known_generation(new_generation);
}
_sstables_lock.write_unlock();
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
}
@@ -295,9 +492,11 @@ public:
// very dangerous to do that with live SSTables. This is meant to be used with SSTables
// that are not yet managed by the system.
//
// Parameter all_generations stores the generation of all SSTables in the system, so it
// will be easy to determine which SSTable is new.
// An example usage would be to query all shards for the highest SSTable number known
// to them, and then pass that + 1 as "start".
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(int64_t start);
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);
// FIXME: this is just an example, should be changed to something more
// general. compact_all_sstables() starts a compaction of all sstables.
@@ -331,6 +530,7 @@ public:
}
lw_shared_ptr<sstable_list> get_sstables();
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted();
size_t sstables_count();
int64_t get_unleveled_sstables() const;
@@ -362,15 +562,15 @@ public:
Result run_with_compaction_disabled(Func && func) {
++_compaction_disabled;
return _compaction_manager.remove(this).then(std::forward<Func>(func)).finally([this] {
if (--_compaction_disabled == 0) {
trigger_compaction();
// #934. The pending counter is actually a great indicator into whether we
// actually need to trigger a compaction again.
if (--_compaction_disabled == 0 && _stats.pending_compactions > 0) {
// we're turning it on again; use the function that does not increment
// the counter further.
do_trigger_compaction();
}
});
}
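The #934 gating above can be modeled in isolation (field and method names are hypothetical; the counter stands in for `_compaction_disabled` and the trigger for `do_trigger_compaction()`):

```cpp
#include <cassert>

// Illustrative model of the gating above: re-trigger compaction only when
// the *last* disabler re-enables AND compactions are actually pending.
struct compaction_gate {
    int disabled = 0;   // nesting count of run_with_compaction_disabled
    int pending = 0;    // stands in for _stats.pending_compactions
    int triggered = 0;  // counts calls to do_trigger_compaction()
    void disable() { ++disabled; }
    void enable() {
        if (--disabled == 0 && pending > 0) {
            ++triggered;
        }
    }
};
```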
std::unordered_set<unsigned long>& compacting_generations() {
return _compacting_generations;
}
private:
// One does not need to wait on this future if all we are interested in, is
// initiating the write. The writes initiated here will eventually
@@ -380,23 +580,42 @@ private:
// But it is possible to synchronously wait for the seal to complete by
// waiting on this future. This is useful in situations where we want to
// synchronously flush data to disk.
//
// FIXME: A better interface would guarantee that all writes before this
// one are also complete
future<> seal_active_memtable();
// I am assuming here that the repair process will potentially send ranges containing
// few mutations, definitely not enough to fill a memtable. It wants to know whether or
// not each of those ranges individually succeeded or failed, so we need a future for
// each.
//
// One of the ways to fix that, is changing the repair itself to send more mutations at
// a single batch. But relying on that is a bad idea for two reasons:
//
// First, the goals of the SSTable writer and the repair sender are at odds. The SSTable
// writer wants to write as few SSTables as possible, while the repair sender wants to
// break down the range in pieces as small as it can and checksum them individually, so
// it doesn't have to send a lot of mutations for no reason.
//
// Second, even if the repair process wants to process larger ranges at once, some ranges
// themselves may be small. So while most ranges would be large, we would still have
// potentially some fairly small SSTables lying around.
//
// The best course of action in this case is to coalesce the incoming streams write-side.
// repair can now choose whatever strategy - small or big ranges - it wants, rest assured
// that the incoming memtables will be coalesced together.
shared_promise<> _waiting_streaming_flushes;
timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable(); }};
future<> seal_active_streaming_memtable();
future<> seal_active_streaming_memtable_delayed();
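The write-side coalescing argued for in the comment above can be sketched without Seastar (purely illustrative; `streaming_coalescer` and its members are hypothetical):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Hypothetical sketch of write-side coalescing: small incoming ranges
// accumulate in one buffer and are sealed together as a single unit,
// instead of producing one tiny sstable per range.
struct streaming_coalescer {
    std::vector<std::string> pending;   // small incoming ranges
    std::vector<std::size_t> sealed;    // ranges per sealed "sstable"
    void add_range(std::string r) { pending.push_back(std::move(r)); }
    void seal() {   // models seal_active_streaming_memtable()
        if (!pending.empty()) {
            sealed.push_back(pending.size());
            pending.clear();
        }
    }
};
```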
// filter manifest.json files out
static bool manifest_json_filter(const sstring& fname);
seastar::gate _in_flight_seals;
// Iterate over all partitions. Protocol is the same as std::all_of(),
// so that iteration can be stopped by returning false.
// Func signature: bool (const decorated_key& dk, const mutation_partition& mp)
template <typename Func>
future<bool> for_all_partitions(schema_ptr, Func&& func) const;
future<sstables::entry_descriptor> probe_file(sstring sstdir, sstring fname);
void seal_on_overflow();
void check_valid_rp(const db::replay_position&) const;
public:
// Iterate over all partitions. Protocol is the same as std::all_of(),
@@ -499,7 +718,9 @@ public:
bool enable_cache = true;
bool enable_incremental_backups = false;
size_t max_memtable_size = 5'000'000;
size_t max_streaming_memtable_size = 5'000'000;
logalloc::region_group* dirty_memory_region_group = nullptr;
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
::cf_stats* cf_stats = nullptr;
};
private:
@@ -561,18 +782,19 @@ public:
class database {
::cf_stats _cf_stats;
logalloc::region_group _dirty_memory_region_group;
logalloc::region_group _streaming_dirty_memory_region_group;
std::unordered_map<sstring, keyspace> _keyspaces;
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
std::unique_ptr<db::commitlog> _commitlog;
std::unique_ptr<db::config> _cfg;
size_t _memtable_total_space = 500 << 20;
size_t _streaming_memtable_total_space = 500 << 20;
utils::UUID _version;
// compaction_manager object is referenced by all column families of a database.
compaction_manager _compaction_manager;
std::vector<scollectd::registration> _collectd;
timer<> _throttling_timer{[this] { unthrottle(); }};
circular_buffer<promise<>> _throttled_requests;
bool _enable_incremental_backups = false;
future<> init_commitlog();
future<> apply_in_memory(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position&);
@@ -586,12 +808,16 @@ private:
void create_in_memory_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm);
friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
void setup_collectd();
future<> throttle();
throttle_state _memtables_throttler;
throttle_state _streaming_throttler;
future<> do_apply(schema_ptr, const frozen_mutation&);
void unthrottle();
public:
static utils::UUID empty_version;
void set_enable_incremental_backups(bool val) { _enable_incremental_backups = val; }
future<> parse_system_tables(distributed<service::storage_proxy>&);
database();
database(const db::config&);
@@ -618,8 +844,6 @@ public:
void add_column_family(schema_ptr schema, column_family::config cfg);
future<> drop_column_family(db_clock::time_point changed_at, const sstring& ks_name, const sstring& cf_name);
/* throws std::out_of_range if missing */
const utils::UUID& find_uuid(const sstring& ks, const sstring& cf) const throw (std::out_of_range);
const utils::UUID& find_uuid(const schema_ptr&) const throw (std::out_of_range);
@@ -644,6 +868,7 @@ public:
const column_family& find_column_family(const utils::UUID&) const throw (no_such_column_family);
column_family& find_column_family(const schema_ptr&) throw (no_such_column_family);
const column_family& find_column_family(const schema_ptr&) const throw (no_such_column_family);
bool column_family_exists(const utils::UUID& uuid) const;
schema_ptr find_schema(const sstring& ks_name, const sstring& cf_name) const throw (no_such_column_family);
schema_ptr find_schema(const utils::UUID&) const throw (no_such_column_family);
bool has_schema(const sstring& ks_name, const sstring& cf_name) const;
@@ -652,9 +877,10 @@ public:
unsigned shard_of(const dht::token& t);
unsigned shard_of(const mutation& m);
unsigned shard_of(const frozen_mutation& m);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, const std::vector<query::partition_range>& ranges);
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges);
future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range);
future<> apply(schema_ptr, const frozen_mutation&);
future<> apply_streaming_mutation(schema_ptr, const frozen_mutation&);
keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
const sstring& get_snitch_name() const;
future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
@@ -686,9 +912,16 @@ public:
}
future<> flush_all_memtables();
// See #937. Truncation now requires a callback to get a time stamp
// that must be guaranteed to be the same for all shards.
typedef std::function<future<db_clock::time_point>()> timestamp_func;
/** Truncates the given column family */
future<> truncate(db_clock::time_point truncated_at, sstring ksname, sstring cfname);
future<> truncate(db_clock::time_point truncated_at, const keyspace& ks, column_family& cf);
future<> truncate(sstring ksname, sstring cfname, timestamp_func);
future<> truncate(const keyspace& ks, column_family& cf, timestamp_func);
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func);
const logalloc::region_group& dirty_memory_region_group() const {
return _dirty_memory_region_group;


@@ -59,6 +59,12 @@
#include "gms/failure_detector.hh"
#include "service/storage_service.hh"
#include "schema_registry.hh"
#include "idl/uuid.dist.hh"
#include "idl/frozen_schema.dist.hh"
#include "serializer_impl.hh"
#include "serialization_visitors.hh"
#include "idl/uuid.dist.impl.hh"
#include "idl/frozen_schema.dist.impl.hh"
static logging::logger logger("batchlog_manager");
@@ -119,15 +125,11 @@ mutation db::batchlog_manager::get_batch_log_mutation_for(const std::vector<muta
auto timestamp = api::new_timestamp();
auto data = [this, &mutations] {
std::vector<canonical_mutation> fm(mutations.begin(), mutations.end());
const auto size = std::accumulate(fm.begin(), fm.end(), size_t(0), [](size_t s, auto& m) {
return s + serializer<canonical_mutation>{m}.size();
});
bytes buf(bytes::initialized_later(), size);
data_output out(buf);
bytes_ostream out;
for (auto& m : fm) {
serializer<canonical_mutation>{m}(out);
ser::serialize(out, m);
}
return buf;
return to_bytes(out.linearize());
}();
mutation m(key, schema);
@@ -155,47 +157,58 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
auto written_at = row.get_as<db_clock::time_point>("written_at");
auto id = row.get_as<utils::UUID>("id");
// enough time for the actual write + batchlog entry mutation delivery (two separate requests).
auto timeout = get_batch_log_timeout();
if (db_clock::now() < written_at + timeout) {
logger.debug("Skipping replay of {}, too fresh", id);
return make_ready_future<>();
}
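The freshness check above reduces to a simple time comparison; a minimal sketch (the function name is hypothetical):

```cpp
#include <cassert>
#include <chrono>

// Sketch of the check above: a batch is only replayed once enough time
// has passed for the original write plus the batchlog entry delivery
// (two separate requests) to have completed.
template <typename Clock>
bool should_replay(typename Clock::time_point written_at,
                   typename Clock::time_point now,
                   typename Clock::duration timeout) {
    return now >= written_at + timeout;
}
```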
// not used currently. ever?
//auto version = row.has("version") ? row.get_as<uint32_t>("version") : /*MessagingService.VERSION_12*/6u;
// check version of serialization format
if (!row.has("version")) {
logger.warn("Skipping logged batch because of unknown version");
return make_ready_future<>();
}
auto version = row.get_as<int32_t>("version");
if (version != net::messaging_service::current_version) {
logger.warn("Skipping logged batch because of incorrect version");
return make_ready_future<>();
}
auto data = row.get_blob("data");
logger.debug("Replaying batch {}", id);
auto fms = make_lw_shared<std::deque<canonical_mutation>>();
data_input in(data);
while (in.has_next()) {
fms->emplace_back(serializer<canonical_mutation>::read(in));
auto in = ser::as_input_stream(data);
while (in.size()) {
fms->emplace_back(ser::deserialize(in, boost::type<canonical_mutation>()));
}
auto mutations = make_lw_shared<std::vector<mutation>>();
auto size = data.size();
return repeat([this, fms = std::move(fms), written_at, mutations]() mutable {
if (fms->empty()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
auto& fm = fms->front();
auto mid = fm.column_family_id();
return system_keyspace::get_truncated_at(mid).then([this, mid, &fm, written_at, mutations](db_clock::time_point t) {
schema_ptr s = _qp.db().local().find_schema(mid);
return map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
return system_keyspace::get_truncated_at(fm.column_family_id()).then([written_at, &fm] (db_clock::time_point t) ->
std::experimental::optional<std::reference_wrapper<canonical_mutation>> {
if (written_at > t) {
mutations->emplace_back(fm.to_mutation(s));
return { std::ref(fm) };
} else {
return {};
}
}).then([fms] {
fms->pop_front();
return make_ready_future<stop_iteration>(stop_iteration::no);
});
}).then([this, id, mutations, limiter, written_at, size] {
if (mutations->empty()) {
},
std::vector<mutation>(),
[this] (std::vector<mutation> mutations, std::experimental::optional<std::reference_wrapper<canonical_mutation>> fm) {
if (fm) {
schema_ptr s = _qp.db().local().find_schema(fm.value().get().column_family_id());
mutations.emplace_back(fm.value().get().to_mutation(s));
}
return mutations;
}).then([this, id, limiter, written_at, size, fms] (std::vector<mutation> mutations) {
if (mutations.empty()) {
return make_ready_future<>();
}
const auto ttl = [this, mutations, written_at]() -> clock_type {
const auto ttl = [this, &mutations, written_at]() -> clock_type {
/*
* Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
* This ensures that deletes aren't "undone" by an old batch replay.
@@ -217,8 +230,8 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
// Our normal write path does not add much redundancy to the dispatch, and rate is handled after send
// in both cases.
// FIXME: verify that the above is reasonably true.
return limiter->reserve(size).then([this, mutations, id] {
return _qp.proxy().local().mutate(std::move(*mutations), db::consistency_level::ANY);
return limiter->reserve(size).then([this, mutations = std::move(mutations), id] {
return _qp.proxy().local().mutate(mutations, db::consistency_level::ANY);
});
}).then([this, id] {
// delete batch


@@ -67,6 +67,9 @@
#include "commitlog_entry.hh"
#include "service/priority_manager.hh"
#include <boost/range/numeric.hpp>
#include <boost/range/adaptor/transformed.hpp>
static logging::logger logger("commitlog");
class crc32_nbo {
@@ -145,7 +148,7 @@ const std::string db::commitlog::descriptor::FILENAME_PREFIX(
"CommitLog" + SEPARATOR);
const std::string db::commitlog::descriptor::FILENAME_EXTENSION(".log");
class db::commitlog::segment_manager {
class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_manager> {
public:
config cfg;
const uint64_t max_size;
@@ -275,6 +278,8 @@ public:
scollectd::registrations create_counters();
void orphan_all();
void discard_unused_segments();
void discard_completed_segments(const cf_id_type& id,
const replay_position& pos);
@@ -372,7 +377,7 @@ private:
*/
class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
segment_manager* _segment_manager;
::shared_ptr<segment_manager> _segment_manager;
descriptor _desc;
file _file;
@@ -404,7 +409,7 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
// This is maintaining the semantics of only using the write-lock
// as a gate for flushing, i.e. once we've begun a flush for position X
// we are ok with writes to positions > X
return _dwrite.write_lock().then(std::bind(&segment_manager::begin_flush, _segment_manager)).finally([this] {
return _segment_manager->begin_flush().then(std::bind(&rwlock::write_lock, &_dwrite)).finally([this] {
_dwrite.write_unlock();
});
}
@@ -417,12 +422,12 @@ class db::commitlog::segment: public enable_lw_shared_from_this<segment> {
// This is maintaining the semantics of only using the write-lock
// as a gate for flushing, i.e. once we've begun a flush for position X
// we are ok with writes to positions > X
return _dwrite.read_lock().then(std::bind(&segment_manager::begin_write, _segment_manager));
return _segment_manager->begin_write().then(std::bind(&rwlock::read_lock, &_dwrite));
}
void end_write() {
_segment_manager->end_write();
_dwrite.read_unlock();
_segment_manager->end_write();
}
public:
@@ -444,8 +449,8 @@ public:
// TODO : tune initial / default size
static constexpr size_t default_size = align_up<size_t>(128 * 1024, alignment);
segment(segment_manager* m, const descriptor& d, file && f, bool active)
: _segment_manager(m), _desc(std::move(d)), _file(std::move(f)), _sync_time(
segment(::shared_ptr<segment_manager> m, const descriptor& d, file && f, bool active)
: _segment_manager(std::move(m)), _desc(std::move(d)), _file(std::move(f)), _sync_time(
clock_type::now()), _queue(0)
{
++_segment_manager->totals.segments_created;
@@ -553,7 +558,7 @@ public:
throw;
}
});
}).finally([this] {
}).finally([this, me] {
end_flush();
});
}
@@ -642,7 +647,7 @@ public:
forget_schema_versions();
// acquire read lock
return begin_write().then([this, size, off, buf = std::move(buf), me]() mutable {
return begin_write().then([this, size, off, buf = std::move(buf)]() mutable {
auto written = make_lw_shared<size_t>(0);
auto p = buf.get();
return repeat([this, size, off, written, p]() mutable {
@@ -1038,10 +1043,12 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
descriptor d(next_id());
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
file_open_options opt;
opt.extent_allocation_size_hint = max_size;
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f] () mutable {
auto s = make_lw_shared<segment>(this, d, std::move(f), active);
auto s = make_lw_shared<segment>(this->shared_from_this(), d, std::move(f), active);
return make_ready_future<sseg_ptr>(s);
});
});
@@ -1155,6 +1162,10 @@ future<> db::commitlog::segment_manager::shutdown() {
return make_ready_future<>();
}
void db::commitlog::segment_manager::orphan_all() {
_segments.clear();
_reserve_segments.clear();
}
/*
* Sync all segments, then clear them out. To ensure all ops are done.
@@ -1168,7 +1179,7 @@ future<> db::commitlog::segment_manager::clear() {
for (auto& s : _segments) {
s->mark_clean();
}
_segments.clear();
orphan_all();
});
}
/**
@@ -1202,7 +1213,15 @@ void db::commitlog::segment_manager::on_timer() {
// take outstanding allocations into regard. This is paranoid,
// but if for some reason the file::open takes longer than timer period,
// we could flood the reserve list with new segments
auto n = _reserve_segments.size() + _reserve_allocating;
//
// #482 - _reserve_allocating is decremented in the finally clause below.
// This is needed because if either allocate_segment _or_ emplacing into
// _reserve_segments should throw, we still need the counter reset
// However, because of this, it might be that emplace was done, but not decrement,
// when we get here again. So occasionally we might get a sum of the two that is
// not consistent. It should however always just potentially be _too much_, i.e.
// just an indicator that we don't need to do anything. So let's do that.
auto n = std::min(_reserve_segments.size() + _reserve_allocating, _num_reserve_segments);
return parallel_for_each(boost::irange(n, _num_reserve_segments), [this, n](auto i) {
++_reserve_allocating;
return this->allocate_segment(false).then([this](sseg_ptr s) {
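The #482 clamp above can be isolated into a pure function (name and decomposition are hypothetical):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

// Sketch of the clamp above: queued plus in-flight reserve segments can
// transiently overshoot the target, so clamp the sum before deciding how
// many more segments to allocate.
std::size_t segments_to_allocate(std::size_t reserved, std::size_t allocating,
                                 std::size_t target) {
    std::size_t n = std::min(reserved + allocating, target);
    return target - n;   // never underflows, thanks to the clamp
}
```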
@@ -1283,8 +1302,9 @@ void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
logger.trace("Deleting {} buffers", _temp_buffers.size() - max_temp_buffers);
_temp_buffers.erase(_temp_buffers.begin() + max_temp_buffers, _temp_buffers.end());
}
totals.buffer_list_bytes = std::accumulate(_temp_buffers.begin(),
_temp_buffers.end(), size_t(0), std::plus<size_t>());
totals.buffer_list_bytes = boost::accumulate(
_temp_buffers | boost::adaptors::transformed(std::mem_fn(&buffer_type::size)),
size_t(0), std::plus<size_t>());
}
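The hunk above switches from summing the buffer elements themselves to summing their sizes; a standard-library equivalent of the Boost expression (with `std::string` standing in for the real `buffer_type`):

```cpp
#include <cassert>
#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

// Sum the *sizes* of the retained buffers, as the refactored
// buffer_list_bytes accounting above does.
std::size_t buffer_list_bytes(const std::vector<std::string>& bufs) {
    return std::accumulate(bufs.begin(), bufs.end(), std::size_t(0),
        [](std::size_t s, const std::string& b) { return s + b.size(); });
}
```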
/**
@@ -1334,7 +1354,7 @@ future<db::replay_position> db::commitlog::add_entry(const cf_id_type& id, const
}
db::commitlog::commitlog(config cfg)
: _segment_manager(new segment_manager(std::move(cfg))) {
: _segment_manager(::make_shared<segment_manager>(std::move(cfg))) {
}
db::commitlog::commitlog(commitlog&& v) noexcept
@@ -1342,6 +1362,9 @@ db::commitlog::commitlog(commitlog&& v) noexcept
}
db::commitlog::~commitlog() {
if (_segment_manager != nullptr) {
_segment_manager->orphan_all();
}
}
future<db::commitlog> db::commitlog::create_commitlog(config cfg) {


@@ -98,7 +98,7 @@ public:
class segment;
private:
std::unique_ptr<segment_manager> _segment_manager;
::shared_ptr<segment_manager> _segment_manager;
public:
enum class sync_mode {
PERIODIC, BATCH


@@ -0,0 +1,86 @@
/*
* Copyright 2016 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "commitlog_entry.hh"
#include "idl/uuid.dist.hh"
#include "idl/keys.dist.hh"
#include "idl/frozen_mutation.dist.hh"
#include "idl/mutation.dist.hh"
#include "idl/commitlog.dist.hh"
#include "serializer_impl.hh"
#include "serialization_visitors.hh"
#include "idl/uuid.dist.impl.hh"
#include "idl/keys.dist.impl.hh"
#include "idl/frozen_mutation.dist.impl.hh"
#include "idl/mutation.dist.impl.hh"
#include "idl/commitlog.dist.impl.hh"
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping))
, _mutation_storage(std::move(mutation))
, _mutation(*_mutation_storage)
{ }
commitlog_entry::commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation)
: _mapping(std::move(mapping))
, _mutation(mutation)
{ }
commitlog_entry::commitlog_entry(commitlog_entry&& ce)
: _mapping(std::move(ce._mapping))
, _mutation_storage(std::move(ce._mutation_storage))
, _mutation(_mutation_storage ? *_mutation_storage : ce._mutation)
{
}
commitlog_entry& commitlog_entry::operator=(commitlog_entry&& ce)
{
if (this != &ce) {
this->~commitlog_entry();
new (this) commitlog_entry(std::move(ce));
}
return *this;
}
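The move-assignment above uses the destroy-and-placement-new idiom because `commitlog_entry` holds a reference member (`_mutation`) that ordinary assignment cannot reseat; a standalone sketch of the same idiom (`ref_holder` is hypothetical):

```cpp
#include <cassert>
#include <new>
#include <string>
#include <utility>

// Sketch of the destroy-and-placement-new move-assignment idiom: the
// reference member can only be bound at construction, so assignment
// tears the object down and rebuilds it via the move constructor.
struct ref_holder {
    std::string storage;
    const std::string& ref;
    explicit ref_holder(std::string s) : storage(std::move(s)), ref(storage) {}
    ref_holder(ref_holder&& o) : storage(std::move(o.storage)), ref(storage) {}
    ref_holder& operator=(ref_holder&& o) {
        if (this != &o) {
            this->~ref_holder();                 // destroy current state
            new (this) ref_holder(std::move(o)); // rebuild via move ctor
        }
        return *this;
    }
};
```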
commitlog_entry commitlog_entry_writer::get_entry() const {
if (_with_schema) {
return commitlog_entry(_schema->get_column_mapping(), _mutation);
} else {
return commitlog_entry({}, _mutation);
}
}
void commitlog_entry_writer::compute_size() {
_size = ser::get_sizeof(get_entry());
}
void commitlog_entry_writer::write(data_output& out) const {
seastar::simple_output_stream str(out.reserve(size()));
ser::serialize(str, get_entry());
}
commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)
: _ce([&] {
seastar::simple_input_stream in(buffer.get(), buffer.size());
return ser::deserialize(in, boost::type<commitlog_entry>());
}())
{
}


@@ -25,21 +25,43 @@
#include "frozen_mutation.hh"
#include "schema.hh"
#include "utils/data_output.hh"
namespace stdx = std::experimental;
class commitlog_entry {
stdx::optional<column_mapping> _mapping;
stdx::optional<frozen_mutation> _mutation_storage;
const frozen_mutation& _mutation;
public:
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation);
commitlog_entry(stdx::optional<column_mapping> mapping, const frozen_mutation& mutation);
commitlog_entry(commitlog_entry&&);
commitlog_entry(const commitlog_entry&) = delete;
commitlog_entry& operator=(commitlog_entry&&);
commitlog_entry& operator=(const commitlog_entry&) = delete;
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
const frozen_mutation& mutation() const { return _mutation; }
};
class commitlog_entry_writer {
schema_ptr _schema;
db::serializer<column_mapping> _column_mapping_serializer;
const frozen_mutation& _mutation;
bool _with_schema = true;
size_t _size;
private:
void compute_size();
commitlog_entry get_entry() const;
public:
commitlog_entry_writer(schema_ptr s, const frozen_mutation& fm)
: _schema(std::move(s)), _column_mapping_serializer(_schema->get_column_mapping()), _mutation(fm)
{ }
: _schema(std::move(s)), _mutation(fm)
{
compute_size();
}
void set_with_schema(bool value) {
_with_schema = value;
compute_size();
}
bool with_schema() {
return _with_schema;
@@ -49,40 +71,17 @@ public:
}
size_t size() const {
size_t size = data_output::serialized_size<bool>();
if (_with_schema) {
size += _column_mapping_serializer.size();
}
size += _mutation.representation().size();
return size;
return _size;
}
void write(data_output& out) const {
out.write(_with_schema);
if (_with_schema) {
_column_mapping_serializer.write(out);
}
auto bv = _mutation.representation();
out.write(bv.begin(), bv.end());
}
void write(data_output& out) const;
};
class commitlog_entry_reader {
frozen_mutation _mutation;
stdx::optional<column_mapping> _column_mapping;
commitlog_entry _ce;
public:
commitlog_entry_reader(const temporary_buffer<char>& buffer)
: _mutation(bytes())
{
data_input in(buffer);
bool has_column_mapping = in.read<bool>();
if (has_column_mapping) {
_column_mapping = db::serializer<::column_mapping>::read(in);
}
auto bv = in.read_view(in.avail());
_mutation = frozen_mutation(bytes(bv.begin(), bv.end()));
}
commitlog_entry_reader(const temporary_buffer<char>& buffer);
const stdx::optional<column_mapping>& get_column_mapping() const { return _column_mapping; }
const frozen_mutation& mutation() const { return _mutation; }
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
const frozen_mutation& mutation() const { return _ce.mutation(); }
};


@@ -53,7 +53,6 @@
#include "database.hh"
#include "sstables/sstables.hh"
#include "db/system_keyspace.hh"
#include "db/serializer.hh"
#include "cql3/query_processor.hh"
#include "log.hh"
#include "converting_mutation_partition_applier.hh"


@@ -487,7 +487,7 @@ public:
val(cas_contention_timeout_in_ms, uint32_t, 5000, Unused, \
"The time that the coordinator continues to retry a CAS (compare and set) operation that contends with other proposals for the same row." \
) \
val(truncate_request_timeout_in_ms, uint32_t, 10000, Unused, \
val(truncate_request_timeout_in_ms, uint32_t, 10000, Used, \
"The time that the coordinator waits for truncates (remove all data from a table) to complete. The long default value allows for a snapshot to be taken before removing the data. If auto_snapshot is disabled (not recommended), you can reduce this time." \
) \
val(write_request_timeout_in_ms, uint32_t, 2000, Used, \
@@ -556,7 +556,7 @@ public:
val(start_rpc, bool, false, Used, \
"Starts the Thrift RPC server" \
) \
val(rpc_keepalive, bool, true, Unused, \
val(rpc_keepalive, bool, true, Used, \
"Enable or disable keepalive on client connections (RPC or native)." \
) \
val(rpc_max_threads, uint32_t, 0, Invalid, \


@@ -241,7 +241,7 @@ is_sufficient_live_nodes(consistency_level cl,
if (rs.get_type() == replication_strategy_type::network_topology) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints)) {
if (entry.second < local_quorum_for(ks, entry.first)) {
if (entry.second.live < local_quorum_for(ks, entry.first)) {
return false;
}
}


@@ -88,10 +88,16 @@ filter_for_query(consistency_level cl,
std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints);
template <typename Range>
inline std::unordered_map<sstring, size_t> count_per_dc_endpoints(
struct dc_node_count {
size_t live = 0;
size_t pending = 0;
};
template <typename Range, typename PendingRange = std::array<gms::inet_address, 0>>
inline std::unordered_map<sstring, dc_node_count> count_per_dc_endpoints(
keyspace& ks,
Range& live_endpoints) {
Range& live_endpoints,
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
using namespace locator;
auto& rs = ks.get_replication_strategy();
@@ -100,9 +106,9 @@ inline std::unordered_map<sstring, size_t> count_per_dc_endpoints(
network_topology_strategy* nrs =
static_cast<network_topology_strategy*>(&rs);
std::unordered_map<sstring, size_t> dc_endpoints;
std::unordered_map<sstring, dc_node_count> dc_endpoints;
for (auto& dc : nrs->get_datacenters()) {
dc_endpoints.emplace(dc, 0);
dc_endpoints.emplace(dc, dc_node_count());
}
//
@@ -111,7 +117,11 @@ inline std::unordered_map<sstring, size_t> count_per_dc_endpoints(
// nrs->get_datacenters().
//
for (auto& endpoint : live_endpoints) {
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)]);
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)].live);
}
for (auto& endpoint : pending_endpoints) {
++(dc_endpoints[snitch_ptr->get_datacenter(endpoint)].pending);
}
return dc_endpoints;
@@ -122,21 +132,23 @@ is_sufficient_live_nodes(consistency_level cl,
keyspace& ks,
const std::vector<gms::inet_address>& live_endpoints);
template<typename Range>
template<typename Range, typename PendingRange>
inline bool assure_sufficient_live_nodes_each_quorum(
consistency_level cl,
keyspace& ks,
Range& live_endpoints) {
Range& live_endpoints,
const PendingRange& pending_endpoints) {
using namespace locator;
auto& rs = ks.get_replication_strategy();
if (rs.get_type() == replication_strategy_type::network_topology) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints)) {
for (auto& entry : count_per_dc_endpoints(ks, live_endpoints, pending_endpoints)) {
auto dc_block_for = local_quorum_for(ks, entry.first);
auto dc_live = entry.second;
auto dc_live = entry.second.live;
auto dc_pending = entry.second.pending;
if (dc_live < dc_block_for) {
if (dc_live < dc_block_for + dc_pending) {
throw exceptions::unavailable_exception(cl, dc_block_for, dc_live);
}
}
@@ -147,11 +159,12 @@ inline bool assure_sufficient_live_nodes_each_quorum(
return false;
}
template<typename Range>
template<typename Range, typename PendingRange = std::array<gms::inet_address, 0>>
inline void assure_sufficient_live_nodes(
consistency_level cl,
keyspace& ks,
Range& live_endpoints) {
Range& live_endpoints,
const PendingRange& pending_endpoints = std::array<gms::inet_address, 0>()) {
size_t need = block_for(ks, cl);
switch (cl) {
@@ -159,13 +172,13 @@ inline void assure_sufficient_live_nodes(
// local hint is acceptable, and local node is always live
break;
case consistency_level::LOCAL_ONE:
if (count_local_endpoints(live_endpoints) == 0) {
if (count_local_endpoints(live_endpoints) < count_local_endpoints(pending_endpoints) + 1) {
throw exceptions::unavailable_exception(cl, 1, 0);
}
break;
case consistency_level::LOCAL_QUORUM: {
size_t local_live = count_local_endpoints(live_endpoints);
if (local_live < need) {
if (local_live < need + count_local_endpoints(pending_endpoints)) {
#if 0
if (logger.isDebugEnabled())
{
@@ -184,14 +197,15 @@ inline void assure_sufficient_live_nodes(
break;
}
case consistency_level::EACH_QUORUM:
if (assure_sufficient_live_nodes_each_quorum(cl, ks, live_endpoints)) {
if (assure_sufficient_live_nodes_each_quorum(cl, ks, live_endpoints, pending_endpoints)) {
break;
}
// Fallthrough on purpose for SimpleStrategy
default:
size_t live = live_endpoints.size();
if (live < need) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required)", live, need);
size_t pending = pending_endpoints.size();
if (live < need + pending) {
cl_logger.debug("Live nodes {} do not satisfy ConsistencyLevel ({} required, {} pending)", live, need, pending);
throw exceptions::unavailable_exception(cl, need, live);
}
break;
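The availability rule in the hunk above reduces to one comparison; a hedged sketch (the free function is hypothetical — the real check also raises `unavailable_exception`):

```cpp
#include <cassert>
#include <cstddef>

// Sketch of the rule above: pending endpoints raise the effective
// requirement, because a pending (joining) node also receives the write
// yet cannot count toward the consistency level.
bool sufficient_live_nodes(std::size_t live, std::size_t need,
                           std::size_t pending) {
    return live >= need + pending;
}
```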


@@ -65,6 +65,7 @@
#include <boost/range/adaptor/map.hpp>
#include "compaction_strategy.hh"
#include "utils/joinpoint.hh"
using namespace db::system_keyspace;
@@ -415,16 +416,16 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
if (partition_key == system_keyspace::NAME) {
continue;
}
results.emplace_back(p.mut());
results.emplace_back(std::move(p.mut()));
}
return results;
});
};
auto reduce = [] (auto&& result, auto&& mutations) {
std::copy(mutations.begin(), mutations.end(), std::back_inserter(result));
std::move(mutations.begin(), mutations.end(), std::back_inserter(result));
return std::move(result);
};
return map_reduce(ALL.begin(), ALL.end(), map, std::move(std::vector<frozen_mutation>{}), reduce);
return map_reduce(ALL.begin(), ALL.end(), map, std::vector<frozen_mutation>{}, reduce);
}
future<schema_result>
@@ -606,10 +607,10 @@ future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std::vector
#endif
proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
// it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
for (auto&& keyspace_to_drop : keyspaces_to_drop) {
return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
db.drop_keyspace(keyspace_to_drop);
service::get_local_migration_manager().notify_drop_keyspace(keyspace_to_drop);
}
return service::get_local_migration_manager().notify_drop_keyspace(keyspace_to_drop);
});
}).get0();
});
}
@@ -649,7 +650,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
return do_for_each(created, [&db](auto&& val) {
auto ksm = create_keyspace_from_schema_partition(val);
return db.create_keyspace(ksm).then([ksm] {
service::get_local_migration_manager().notify_create_keyspace(ksm);
return service::get_local_migration_manager().notify_create_keyspace(ksm);
});
}).then([&altered, &db] () mutable {
for (auto&& name : altered) {
@@ -662,7 +663,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
});
}
static void update_column_family(database& db, schema_ptr new_schema) {
static future<> update_column_family(database& db, schema_ptr new_schema) {
column_family& cfm = db.find_column_family(new_schema->id());
bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
@@ -671,7 +672,7 @@ static void update_column_family(database& db, schema_ptr new_schema) {
s->registry_entry()->mark_synced();
cfm.set_schema(std::move(s));
service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
return service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
}
// see the comments for merge_keyspaces()
@@ -679,7 +680,6 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& before,
std::map<qualified_name, schema_mutations>&& after)
{
auto changed_at = db_clock::now();
std::vector<global_schema_ptr> created;
std::vector<global_schema_ptr> altered;
std::vector<global_schema_ptr> dropped;
@@ -687,34 +687,44 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
auto diff = difference(before, after);
for (auto&& key : diff.entries_only_on_left) {
auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
logger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
dropped.emplace_back(s);
}
for (auto&& key : diff.entries_only_on_right) {
created.emplace_back(create_table_from_mutations(after.at(key)));
auto s = create_table_from_mutations(after.at(key));
logger.info("Creating {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
created.emplace_back(s);
}
for (auto&& key : diff.entries_differing) {
altered.emplace_back(create_table_from_mutations(after.at(key)));
auto s = create_table_from_mutations(after.at(key));
logger.info("Altering {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
altered.emplace_back(s);
}
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered, changed_at] (database& db) {
return seastar::async([&] {
for (auto&& gs : created) {
schema_ptr s = gs.get();
auto& ks = db.find_keyspace(s->ks_name());
auto cfg = ks.make_column_family_config(*s);
db.add_column_family(s, cfg);
ks.make_directory_for_column_family(s->cf_name(), s->id()).get();
service::get_local_migration_manager().notify_create_column_family(s);
}
for (auto&& gs : altered) {
update_column_family(db, gs.get());
}
parallel_for_each(dropped.begin(), dropped.end(), [changed_at, &db](auto&& gs) {
schema_ptr s = gs.get();
return db.drop_column_family(changed_at, s->ks_name(), s->cf_name()).then([s] {
service::get_local_migration_manager().notify_drop_column_family(s);
});
}).get();
do_with(utils::make_joinpoint([] { return db_clock::now();})
, [&created, &dropped, &altered, &proxy](auto& tsf) {
return proxy.local().get_db().invoke_on_all([&created, &dropped, &altered, &tsf] (database& db) {
return seastar::async([&] {
for (auto&& gs : created) {
schema_ptr s = gs.get();
auto& ks = db.find_keyspace(s->ks_name());
auto cfg = ks.make_column_family_config(*s);
db.add_column_family(s, cfg);
auto& cf = db.find_column_family(s);
cf.mark_ready_for_writes();
ks.make_directory_for_column_family(s->cf_name(), s->id()).get();
service::get_local_migration_manager().notify_create_column_family(s).get();
}
for (auto&& gs : altered) {
update_column_family(db, gs.get()).get();
}
parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
schema_ptr s = gs.get();
return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
return service::get_local_migration_manager().notify_drop_column_family(s);
});
}).get();
});
});
}).get();
}


@@ -1,194 +0,0 @@
/*
* Copyright 2015 Cloudius Systems
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "serializer.hh"
#include "database.hh"
#include "types.hh"
#include "utils/serialization.hh"
typedef uint32_t count_type; // Me thinks 32-bits are enough for "normal" count purposes.
template<>
db::serializer<utils::UUID>::serializer(const utils::UUID& uuid)
: _item(uuid), _size(2 * sizeof(uint64_t)) {
}
template<>
void db::serializer<utils::UUID>::write(output& out,
const type& t) {
out.write(t.get_most_significant_bits());
out.write(t.get_least_significant_bits());
}
template<>
void db::serializer<utils::UUID>::read(utils::UUID& uuid, input& in) {
uuid = read(in);
}
template<>
void db::serializer<utils::UUID>::skip(input& in) {
in.skip(2 * sizeof(uint64_t));
}
template<> utils::UUID db::serializer<utils::UUID>::read(input& in) {
auto msb = in.read<uint64_t>();
auto lsb = in.read<uint64_t>();
return utils::UUID(msb, lsb);
}
template<>
db::serializer<bytes>::serializer(const bytes& b)
: _item(b), _size(output::serialized_size(b)) {
}
template<>
void db::serializer<bytes>::write(output& out, const type& t) {
out.write(t);
}
template<>
void db::serializer<bytes>::read(bytes& b, input& in) {
b = in.read<bytes>();
}
template<>
void db::serializer<bytes>::skip(input& in) {
in.read<bytes>(); // FIXME: Avoid reading
}
template<>
db::serializer<bytes_view>::serializer(const bytes_view& v)
: _item(v), _size(output::serialized_size(v)) {
}
template<>
void db::serializer<bytes_view>::write(output& out, const type& t) {
out.write(t);
}
template<>
void db::serializer<bytes_view>::read(bytes_view& v, input& in) {
v = in.read<bytes_view>();
}
template<>
bytes_view db::serializer<bytes_view>::read(input& in) {
return in.read<bytes_view>();
}
template<>
db::serializer<sstring>::serializer(const sstring& s)
: _item(s), _size(output::serialized_size(s)) {
}
template<>
void db::serializer<sstring>::write(output& out, const type& t) {
out.write(t);
}
template<>
void db::serializer<sstring>::read(sstring& s, input& in) {
s = in.read<sstring>();
}
template<>
void db::serializer<sstring>::skip(input& in) {
in.read<sstring>(); // FIXME: avoid reading
}
template<>
db::serializer<tombstone>::serializer(const tombstone& t)
: _item(t), _size(sizeof(t.timestamp) + sizeof(decltype(t.deletion_time.time_since_epoch().count()))) {
}
template<>
void db::serializer<tombstone>::write(output& out, const type& t) {
out.write(t.timestamp);
out.write(t.deletion_time.time_since_epoch().count());
}
template<>
void db::serializer<tombstone>::read(tombstone& t, input& in) {
t.timestamp = in.read<decltype(t.timestamp)>();
auto deletion_time = in.read<decltype(t.deletion_time.time_since_epoch().count())>();
t.deletion_time = gc_clock::time_point(gc_clock::duration(deletion_time));
}
template<>
db::serializer<atomic_cell_view>::serializer(const atomic_cell_view& c)
: _item(c), _size(bytes_view_serializer(c.serialize()).size()) {
}
template<>
void db::serializer<atomic_cell_view>::write(output& out, const atomic_cell_view& t) {
bytes_view_serializer::write(out, t.serialize());
}
template<>
void db::serializer<atomic_cell_view>::read(atomic_cell_view& c, input& in) {
c = atomic_cell_view::from_bytes(bytes_view_serializer::read(in));
}
template<>
atomic_cell_view db::serializer<atomic_cell_view>::read(input& in) {
return atomic_cell_view::from_bytes(bytes_view_serializer::read(in));
}
template<>
db::serializer<collection_mutation_view>::serializer(const collection_mutation_view& c)
: _item(c), _size(bytes_view_serializer(c.serialize()).size()) {
}
template<>
void db::serializer<collection_mutation_view>::write(output& out, const collection_mutation_view& t) {
bytes_view_serializer::write(out, t.serialize());
}
template<>
void db::serializer<collection_mutation_view>::read(collection_mutation_view& c, input& in) {
c = collection_mutation_view::from_bytes(bytes_view_serializer::read(in));
}
template<>
db::serializer<db::replay_position>::serializer(const db::replay_position& rp)
: _item(rp), _size(sizeof(uint64_t) * 2) {
}
template<>
void db::serializer<db::replay_position>::write(output& out, const db::replay_position& rp) {
out.write<uint64_t>(rp.id);
out.write<uint64_t>(rp.pos);
}
template<>
void db::serializer<db::replay_position>::read(db::replay_position& rp, input& in) {
rp.id = in.read<uint64_t>();
rp.pos = in.read<uint64_t>();
}
template class db::serializer<tombstone> ;
template class db::serializer<bytes> ;
template class db::serializer<bytes_view> ;
template class db::serializer<sstring> ;
template class db::serializer<atomic_cell_view> ;
template class db::serializer<collection_mutation_view> ;
template class db::serializer<utils::UUID> ;
template class db::serializer<db::replay_position> ;


@@ -1,235 +0,0 @@
/*
* Copyright 2015 Cloudius Systems
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef DB_SERIALIZER_HH_
#define DB_SERIALIZER_HH_
#include <experimental/optional>
#include "utils/data_input.hh"
#include "utils/data_output.hh"
#include "bytes_ostream.hh"
#include "bytes.hh"
#include "database_fwd.hh"
#include "db/commitlog/replay_position.hh"
namespace db {
/**
* Serialization objects for various types and using "internal" format. (Not CQL, origin whatnot).
* The design rationale is that a "serializer" can be instantiated for an object, and will contain
* the obj + size, and is usable as a functor.
*
* Serialization can also be done "explicitly" through the static method "write"
* (Not using "serialize", because writing "serializer<apa>::serialize" all the time is tiring and redundant)
* though care should be taken that data will fit, of course.
*/
template<typename T>
class serializer {
public:
typedef T type;
typedef data_output output;
typedef data_input input;
typedef serializer<T> _MyType;
serializer(const type&);
// apply to memory, must be at least size() large.
const _MyType& operator()(output& out) const {
write(out, _item);
return *this;
}
static void write(output&, const type&);
static void read(type&, input&);
static type read(input&);
static void skip(input& in);
size_t size() const {
return _size;
}
void write(bytes_ostream& out) const {
auto buf = out.write_place_holder(_size);
data_output data_out((char*)buf, _size);
write(data_out, _item);
}
void write(data_output& out) const {
write(out, _item);
}
bytes to_bytes() const {
bytes b(bytes::initialized_later(), _size);
data_output out(b);
write(out);
return b;
}
static type from_bytes(bytes_view v) {
data_input in(v);
return read(in);
}
private:
const type& _item;
size_t _size;
};
template<typename T>
class serializer<std::experimental::optional<T>> {
public:
typedef std::experimental::optional<T> type;
typedef data_output output;
typedef data_input input;
typedef serializer<T> _MyType;
serializer(const type& t)
: _item(t)
, _size(output::serialized_size<bool>() + (t ? serializer<T>(*t).size() : 0))
{}
// apply to memory, must be at least size() large.
const _MyType& operator()(output& out) const {
write(out, _item);
return *this;
}
static void write(output& out, const type& v) {
bool en = v;
out.write<bool>(en);
if (en) {
serializer<T>::write(out, *v);
}
}
static void read(type& dst, input& in) {
auto en = in.read<bool>();
if (en) {
dst = serializer<T>::read(in);
} else {
dst = {};
}
}
static type read(input& in) {
type t;
read(t, in);
return t;
}
static void skip(input& in) {
auto en = in.read<bool>();
if (en) {
serializer<T>::skip(in);
}
}
size_t size() const {
return _size;
}
void write(bytes_ostream& out) const {
auto buf = out.write_place_holder(_size);
data_output data_out((char*)buf, _size);
write(data_out, _item);
}
void write(data_output& out) const {
write(out, _item);
}
bytes to_bytes() const {
bytes b(bytes::initialized_later(), _size);
data_output out(b);
write(out);
return b;
}
static type from_bytes(bytes_view v) {
data_input in(v);
return read(in);
}
private:
const std::experimental::optional<T> _item;
size_t _size;
};
template<> serializer<utils::UUID>::serializer(const utils::UUID &);
template<> void serializer<utils::UUID>::write(output&, const type&);
template<> void serializer<utils::UUID>::read(utils::UUID&, input&);
template<> void serializer<utils::UUID>::skip(input&);
template<> utils::UUID serializer<utils::UUID>::read(input&);
template<> serializer<bytes>::serializer(const bytes &);
template<> void serializer<bytes>::write(output&, const type&);
template<> void serializer<bytes>::read(bytes&, input&);
template<> void serializer<bytes>::skip(input&);
template<> serializer<bytes_view>::serializer(const bytes_view&);
template<> void serializer<bytes_view>::write(output&, const type&);
template<> void serializer<bytes_view>::read(bytes_view&, input&);
template<> bytes_view serializer<bytes_view>::read(input&);
template<> serializer<sstring>::serializer(const sstring&);
template<> void serializer<sstring>::write(output&, const type&);
template<> void serializer<sstring>::read(sstring&, input&);
template<> void serializer<sstring>::skip(input&);
template<> serializer<tombstone>::serializer(const tombstone &);
template<> void serializer<tombstone>::write(output&, const type&);
template<> void serializer<tombstone>::read(tombstone&, input&);
template<> serializer<atomic_cell_view>::serializer(const atomic_cell_view &);
template<> void serializer<atomic_cell_view>::write(output&, const type&);
template<> void serializer<atomic_cell_view>::read(atomic_cell_view&, input&);
template<> atomic_cell_view serializer<atomic_cell_view>::read(input&);
template<> serializer<collection_mutation_view>::serializer(const collection_mutation_view &);
template<> void serializer<collection_mutation_view>::write(output&, const type&);
template<> void serializer<collection_mutation_view>::read(collection_mutation_view&, input&);
template<> serializer<db::replay_position>::serializer(const db::replay_position&);
template<> void serializer<db::replay_position>::write(output&, const db::replay_position&);
template<> void serializer<db::replay_position>::read(db::replay_position&, input&);
template<typename T>
T serializer<T>::read(input& in) {
type t;
read(t, in);
return t;
}
extern template class serializer<tombstone>;
extern template class serializer<bytes>;
extern template class serializer<bytes_view>;
extern template class serializer<sstring>;
extern template class serializer<utils::UUID>;
extern template class serializer<db::replay_position>;
typedef serializer<tombstone> tombstone_serializer;
typedef serializer<bytes> bytes_serializer; // Compatible with bytes_view_serializer
typedef serializer<bytes_view> bytes_view_serializer; // Compatible with bytes_serializer
typedef serializer<sstring> sstring_serializer;
typedef serializer<atomic_cell_view> atomic_cell_view_serializer;
typedef serializer<collection_mutation_view> collection_mutation_view_serializer;
typedef serializer<utils::UUID> uuid_serializer;
typedef serializer<db::replay_position> replay_position_serializer;
}
#endif /* DB_SERIALIZER_HH_ */


@@ -58,14 +58,16 @@
#include "thrift/server.hh"
#include "exceptions/exceptions.hh"
#include "cql3/query_processor.hh"
#include "db/serializer.hh"
#include "query_context.hh"
#include "partition_slice_builder.hh"
#include "db/config.hh"
#include "schema_builder.hh"
#include "md5_hasher.hh"
#include "release.hh"
#include "log.hh"
#include "serializer.hh"
#include <core/enum.hh>
#include "service/storage_proxy.hh"
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
@@ -75,6 +77,7 @@ std::unique_ptr<query_context> qctx = {};
namespace system_keyspace {
static logging::logger logger("system_keyspace");
static const api::timestamp_type creation_timestamp = api::new_timestamp();
api::timestamp_type schema_creation_timestamp() {
@@ -438,7 +441,7 @@ static future<> setup_version() {
version::release(),
cql3::query_processor::CQL_VERSION,
org::apache::cassandra::thrift_version,
to_sstring(version::native_protocol()),
to_sstring(cql_serialization_format::latest_version),
snitch->get_datacenter(utils::fb_utilities::get_broadcast_address()),
snitch->get_rack(utils::fb_utilities::get_broadcast_address()),
sstring(dht::global_partitioner().name()),
@@ -546,31 +549,44 @@ future<> setup(distributed<database>& db, distributed<cql3::query_processor>& qp
});
}
typedef std::pair<replay_positions, db_clock::time_point> truncation_entry;
typedef utils::UUID truncation_key;
typedef std::unordered_map<truncation_key, truncation_entry> truncation_map;
struct truncation_record {
static constexpr uint32_t current_magic = 0x53435452; // 'S' 'C' 'T' 'R'
uint32_t magic;
std::vector<db::replay_position> positions;
db_clock::time_point time_stamp;
};
}
}
#include "idl/replay_position.dist.hh"
#include "idl/truncation_record.dist.hh"
#include "serializer_impl.hh"
#include "idl/replay_position.dist.impl.hh"
#include "idl/truncation_record.dist.impl.hh"
namespace db {
namespace system_keyspace {
typedef utils::UUID truncation_key;
typedef std::unordered_map<truncation_key, truncation_record> truncation_map;
static constexpr uint8_t current_version = 1;
static thread_local std::experimental::optional<truncation_map> truncation_records;
future<> save_truncation_records(const column_family& cf, db_clock::time_point truncated_at, replay_positions positions) {
auto size =
sizeof(db_clock::rep)
+ positions.size()
* db::serializer<replay_position>(
db::replay_position()).size();
bytes buf(bytes::initialized_later(), size);
data_output out(buf);
truncation_record r;
// Old version would write a single RP. We write N. Resulting blob size
// will determine how many.
// An external entity reading this blob would get a "correct" RP
// and a garbled time stamp. But an external entity has no business
// reading this data anyway, since it is meaningless outside this
// machine instance.
for (auto& rp : positions) {
db::serializer<replay_position>::write(out, rp);
}
out.write<db_clock::rep>(truncated_at.time_since_epoch().count());
r.magic = truncation_record::current_magic;
r.time_stamp = truncated_at;
r.positions = std::move(positions);
auto buf = ser::serialize_to_buffer<bytes>(r, sizeof(current_version));
buf[0] = current_version;
static_assert(sizeof(current_version) == 1, "using this as mark");
assert(buf.size() & 1); // verify we've created an odd-sized buffer
map_type_impl::native_type tmp;
tmp.emplace_back(cf.schema()->id(), data_value(buf));
@@ -594,7 +610,7 @@ future<> remove_truncation_record(utils::UUID id) {
});
}
static future<truncation_entry> get_truncation_record(utils::UUID cf_id) {
static future<truncation_record> get_truncation_record(utils::UUID cf_id) {
if (!truncation_records) {
sstring req = sprint("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL, LOCAL);
return qctx->qp().execute_internal(req).then([cf_id](::shared_ptr<cql3::untyped_result_set> rs) {
@@ -605,22 +621,56 @@ static future<truncation_entry> get_truncation_record(utils::UUID cf_id) {
auto uuid = p.first;
auto buf = p.second;
truncation_entry e;
try {
truncation_record e;
data_input in(buf);
if (buf.size() & 1) {
// new record.
if (buf[0] != current_version) {
logger.warn("Found truncation record of unknown version {}. Ignoring.", int(buf[0]));
continue;
}
e = ser::deserialize_from_buffer(buf, boost::type<truncation_record>(), 1);
if (e.magic == truncation_record::current_magic) {
tmp[uuid] = e;
continue;
}
} else {
// old scylla records. (We hope)
// Read 64+64-bit RPs, even though the
// struct (and official serial size) is 64+32.
data_input in(buf);
while (in.avail() > sizeof(db_clock::rep)) {
e.first.emplace_back(db::serializer<replay_position>::read(in));
logger.debug("Reading old type record");
while (in.avail() > sizeof(db_clock::rep)) {
auto id = in.read<uint64_t>();
auto pos = in.read<uint64_t>();
e.positions.emplace_back(id, position_type(pos));
}
if (in.avail() == sizeof(db_clock::rep)) {
e.time_stamp = db_clock::time_point(db_clock::duration(in.read<db_clock::rep>()));
tmp[uuid] = e;
continue;
}
}
} catch (std::out_of_range &) {
}
e.second = db_clock::time_point(db_clock::duration(in.read<db_clock::rep>()));
tmp[uuid] = e;
// Trying to load an origin table.
// This is useless to us, because the only usage for this
// data is commit log and batch replay, and we cannot replay
// either from origin anyway.
logger.warn("Error reading truncation record for {}. "
"Most likely this is data from a cassandra instance. "
"Make sure you have cleared commit and batch logs before upgrading.",
uuid
);
}
}
truncation_records = std::move(tmp);
return get_truncation_record(cf_id);
});
}
return make_ready_future<truncation_entry>((*truncation_records)[cf_id]);
return make_ready_future<truncation_record>((*truncation_records)[cf_id]);
}
future<> save_truncation_record(const column_family& cf, db_clock::time_point truncated_at, db::replay_position rp) {
@@ -628,16 +678,16 @@ future<> save_truncation_record(const column_family& cf, db_clock::time_point tr
// once, for each core (calling us). But right now, redesigning so that calling here (or, rather,
// save_truncation_records), is done from "somewhere higher, once per machine, not shard" is tricky.
// Mainly because drop_tables also uses truncate. And is run per-core as well. Gah.
return get_truncated_position(cf.schema()->id()).then([&cf, truncated_at, rp](replay_positions positions) {
auto i = std::find_if(positions.begin(), positions.end(), [rp](auto& p) {
return get_truncation_record(cf.schema()->id()).then([&cf, truncated_at, rp](truncation_record e) {
auto i = std::find_if(e.positions.begin(), e.positions.end(), [rp](replay_position& p) {
return p.shard_id() == rp.shard_id();
});
if (i == positions.end()) {
positions.emplace_back(rp);
if (i == e.positions.end()) {
e.positions.emplace_back(rp);
} else {
*i = rp;
}
return save_truncation_records(cf, truncated_at, positions);
return save_truncation_records(cf, std::max(truncated_at, e.time_stamp), e.positions);
});
}
@@ -653,14 +703,14 @@ future<db::replay_position> get_truncated_position(utils::UUID cf_id, uint32_t s
}
future<replay_positions> get_truncated_position(utils::UUID cf_id) {
return get_truncation_record(cf_id).then([](truncation_entry e) {
return make_ready_future<replay_positions>(e.first);
return get_truncation_record(cf_id).then([](truncation_record e) {
return make_ready_future<replay_positions>(e.positions);
});
}
future<db_clock::time_point> get_truncated_at(utils::UUID cf_id) {
return get_truncation_record(cf_id).then([](truncation_entry e) {
return make_ready_future<db_clock::time_point>(e.second);
return get_truncation_record(cf_id).then([](truncation_record e) {
return make_ready_future<db_clock::time_point>(e.time_stamp);
});
}
@@ -1096,5 +1146,36 @@ future<std::vector<compaction_history_entry>> get_compaction_history()
});
}
future<int> increment_and_get_generation() {
auto req = sprint("SELECT gossip_generation FROM system.%s WHERE key='%s'", LOCAL, LOCAL);
return qctx->qp().execute_internal(req).then([] (auto rs) {
int generation;
if (rs->empty() || !rs->one().has("gossip_generation")) {
// seconds-since-epoch isn't a foolproof new generation
// (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
// but it's as close as sanely possible
generation = service::get_generation_number();
} else {
// Other nodes will ignore gossip messages about a node that has a lower generation than previously seen.
int stored_generation = rs->one().template get_as<int>("gossip_generation") + 1;
int now = service::get_generation_number();
if (stored_generation >= now) {
logger.warn("Using stored Gossip Generation {} as it is greater than current system time {}. "
"See CASSANDRA-3654 if you experience problems", stored_generation, now);
generation = stored_generation;
} else {
generation = now;
}
}
auto req = sprint("INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', ?)", LOCAL, LOCAL);
return qctx->qp().execute_internal(req, {generation}).then([generation] (auto rs) {
return force_blocking_flush(LOCAL);
}).then([generation] {
return make_ready_future<int>(generation);
});
});
}
} // namespace system_keyspace
} // namespace db


@@ -401,127 +401,9 @@ enum class bootstrap_state {
*/
future<std::unordered_map<gms::inet_address, utils::UUID>> load_host_ids();
#if 0
/**
* Get preferred IP for given endpoint if it is known. Otherwise this returns given endpoint itself.
*
* @param ep endpoint address to check
* @return Preferred IP for given endpoint if present, otherwise returns given ep
*/
public static InetAddress getPreferredIP(InetAddress ep)
{
String req = "SELECT preferred_ip FROM system.%s WHERE peer=?";
UntypedResultSet result = executeInternal(String.format(req, PEERS), ep);
if (!result.isEmpty() && result.one().has("preferred_ip"))
return result.one().getInetAddress("preferred_ip");
return ep;
}
/**
* Return a map of IP addresses containing a map of dc and rack info
*/
public static Map<InetAddress, Map<String,String>> loadDcRackInfo()
{
Map<InetAddress, Map<String, String>> result = new HashMap<>();
for (UntypedResultSet.Row row : executeInternal("SELECT peer, data_center, rack from system." + PEERS))
{
InetAddress peer = row.getInetAddress("peer");
if (row.has("data_center") && row.has("rack"))
{
Map<String, String> dcRack = new HashMap<>();
dcRack.put("data_center", row.getString("data_center"));
dcRack.put("rack", row.getString("rack"));
result.put(peer, dcRack);
}
}
return result;
}
/**
* One of three things will happen if you try to read the system keyspace:
* 1. files are present and you can read them: great
* 2. no files are there: great (new node is assumed)
* 3. files are present but you can't read them: bad
* @throws ConfigurationException
*/
public static void checkHealth() throws ConfigurationException
{
Keyspace keyspace;
try
{
keyspace = Keyspace.open(NAME);
}
catch (AssertionError err)
{
// this happens when a user switches from OPP to RP.
ConfigurationException ex = new ConfigurationException("Could not read system keyspace!");
ex.initCause(err);
throw ex;
}
ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(LOCAL);
String req = "SELECT cluster_name FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL, LOCAL));
if (result.isEmpty() || !result.one().has("cluster_name"))
{
// this is a brand new node
if (!cfs.getSSTables().isEmpty())
throw new ConfigurationException("Found system keyspace files, but they couldn't be loaded!");
// no system files. this is a new node.
req = "INSERT INTO system.%s (key, cluster_name) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL, LOCAL), DatabaseDescriptor.getClusterName());
return;
}
String savedClusterName = result.one().getString("cluster_name");
if (!DatabaseDescriptor.getClusterName().equals(savedClusterName))
throw new ConfigurationException("Saved cluster name " + savedClusterName + " != configured name " + DatabaseDescriptor.getClusterName());
}
#endif
future<std::unordered_set<dht::token>> get_saved_tokens();
#if 0
public static int incrementAndGetGeneration()
{
String req = "SELECT gossip_generation FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL, LOCAL));
int generation;
if (result.isEmpty() || !result.one().has("gossip_generation"))
{
// seconds-since-epoch isn't a foolproof new generation
// (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
// but it's as close as sanely possible
generation = (int) (System.currentTimeMillis() / 1000);
}
else
{
// Other nodes will ignore gossip messages about a node that have a lower generation than previously seen.
final int storedGeneration = result.one().getInt("gossip_generation") + 1;
final int now = (int) (System.currentTimeMillis() / 1000);
if (storedGeneration >= now)
{
logger.warn("Using stored Gossip Generation {} as it is greater than current system time {}. See CASSANDRA-3654 if you experience problems",
storedGeneration, now);
generation = storedGeneration;
}
else
{
generation = now;
}
}
req = "INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL, LOCAL), generation);
forceBlockingFlush(LOCAL);
return generation;
}
#endif
future<int> increment_and_get_generation();
bool bootstrap_complete();
bool bootstrap_in_progress();
bootstrap_state get_bootstrap_state();


@@ -263,29 +263,6 @@ int token_comparator::operator()(const token& t1, const token& t2) const {
return tri_compare(t1, t2);
}
void token::serialize(bytes::iterator& out) const {
uint8_t kind = _kind == dht::token::kind::before_all_keys ? 0 :
_kind == dht::token::kind::key ? 1 : 2;
serialize_int8(out, kind);
serialize_int16(out, _data.size());
out = std::copy(_data.begin(), _data.end(), out);
}
token token::deserialize(bytes_view& in) {
uint8_t kind = read_simple<uint8_t>(in);
size_t size = read_simple<uint16_t>(in);
return token(kind == 0 ? dht::token::kind::before_all_keys :
kind == 1 ? dht::token::kind::key :
dht::token::kind::after_all_keys,
to_bytes(read_simple_bytes(in, size)));
}
size_t token::serialized_size() const {
return serialize_int8_size // token::kind;
+ serialize_int16_size // token size
+ _data.size();
}
bool ring_position::equal(const schema& s, const ring_position& other) const {
return tri_compare(s, other) == 0;
}


@@ -97,11 +97,6 @@ public:
bool is_maximum() const {
return _kind == kind::after_all_keys;
}
void serialize(bytes::iterator& out) const;
static token deserialize(bytes_view& in);
size_t serialized_size() const;
};
token midpoint_unsigned(const token& t1, const token& t2);

dist/ami/build_ami.sh

@@ -6,22 +6,100 @@ if [ ! -e dist/ami/build_ami.sh ]; then
fi
print_usage() {
echo "build_ami.sh -l"
echo " -l deploy locally built rpms"
echo "build_ami.sh --localrpm --unstable"
echo " --localrpm deploy locally built rpms"
echo " --unstable use unstable branch"
exit 1
}
LOCALRPM=0
while getopts lh OPT; do
case "$OPT" in
"l")
while [ $# -gt 0 ]; do
case "$1" in
"--localrpm")
LOCALRPM=1
INSTALL_ARGS="$INSTALL_ARGS --localrpm"
shift 1
;;
"h")
"--unstable")
INSTALL_ARGS="$INSTALL_ARGS --unstable"
shift 1
;;
*)
print_usage
;;
esac
done
. /etc/os-release
case "$ID" in
"centos")
AMI=ami-f3102499
REGION=us-east-1
SSH_USERNAME=centos
;;
"ubuntu")
AMI=ami-ff427095
REGION=us-east-1
SSH_USERNAME=ubuntu
;;
*)
echo "build_ami.sh does not support this distribution."
exit 1
;;
esac
if [ $LOCALRPM -eq 1 ]; then
if [ "$ID" = "centos" ]; then
rm -rf build/*
sudo yum -y install git
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
dist/redhat/build_rpm.sh
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/redhat/build_rpm.sh $*
cd ../..
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
fi
else
sudo apt-get install -y git
if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
if [ ! -f ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb ]; then
echo "Build .deb before running build_ami.sh"
exit 1
fi
cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
fi
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/ubuntu/build_deb.sh $*
cd ../..
cp build/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-jmx_all.deb
fi
if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/ubuntu/build_deb.sh $*
cd ../..
cp build/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-tools_all.deb
fi
fi
fi
cd dist/ami
if [ ! -f variables.json ]; then
@@ -30,19 +108,11 @@ if [ ! -f variables.json ]; then
fi
if [ ! -d packer ]; then
wget https://dl.bintray.com/mitchellh/packer/packer_0.8.6_linux_amd64.zip
wget https://releases.hashicorp.com/packer/0.8.6/packer_0.8.6_linux_amd64.zip
mkdir packer
cd packer
unzip -x ../packer_0.8.6_linux_amd64.zip
cd -
fi
if [ $LOCALRPM = 0 ]; then
echo "sudo yum remove -y abrt; sudo sh -x -e /home/centos/scylla_install_pkg; sudo sh -x -e /usr/lib/scylla/scylla_setup -a" > scylla_deploy.sh
else
echo "sudo yum remove -y abrt; sudo sh -x -e /home/centos/scylla_install_pkg -l /home/centos; sudo sh -x -e /usr/lib/scylla/scylla_setup -a" > scylla_deploy.sh
fi
chmod a+rx scylla_deploy.sh
packer/packer build -var-file=variables.json scylla.json
packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json


@@ -1,31 +0,0 @@
#!/bin/sh -e
if [ ! -e dist/ami/build_ami_local.sh ]; then
echo "run build_ami_local.sh in top of scylla dir"
exit 1
fi
rm -rf build/*
sudo yum -y install git
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
dist/redhat/build_rpm.sh
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
fi
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
cd scylla-jmx
sh -x -e dist/redhat/build_rpm.sh $*
cd ../..
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
fi
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
cd build
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
cd scylla-tools-java
sh -x -e dist/redhat/build_rpm.sh
cd ../..
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
fi
exec dist/ami/build_ami.sh -l


@@ -30,7 +30,21 @@ echo 'More documentation available at: '
echo ' http://www.scylladb.com/doc/'
echo
if [ "`systemctl is-active scylla-server`" = "active" ]; then
. /etc/os-release
if [ "$ID" = "ubuntu" ]; then
if [ "`initctl status ssh|grep "running, process"`" != "" ]; then
STARTED=1
else
STARTED=0
fi
else
if [ "`systemctl is-active scylla-server`" = "active" ]; then
STARTED=1
else
STARTED=0
fi
fi
if [ $STARTED -eq 1 ]; then
tput setaf 4
tput bold
echo " ScyllaDB is active."
@@ -42,6 +56,13 @@ else
echo " ScyllaDB is not started!"
tput sgr0
echo "Please wait for startup. To see status of ScyllaDB, run "
echo " 'systemctl status scylla-server'"
echo
if [ "$ID" = "ubuntu" ]; then
echo " 'initctl status scylla-server'"
echo "and"
echo " 'cat /var/log/upstart/scylla-server.log'"
echo
else
echo " 'systemctl status scylla-server'"
echo
fi
fi

dist/ami/scylla.json

@@ -8,16 +8,52 @@
"security_group_id": "{{user `security_group_id`}}",
"region": "{{user `region`}}",
"associate_public_ip_address": "{{user `associate_public_ip_address`}}",
"source_ami": "ami-8ef1d6e4",
"source_ami": "{{user `source_ami`}}",
"user_data_file": "user_data.txt",
"instance_type": "{{user `instance_type`}}",
"ssh_username": "centos",
"ssh_username": "{{user `ssh_username`}}",
"ssh_timeout": "5m",
"ami_name": "scylla_{{isotime | clean_ami_name}}",
"ami_name": "{{user `ami_prefix`}}scylla_{{isotime | clean_ami_name}}",
"enhanced_networking": true,
"launch_block_device_mappings": [
{
"device_name": "/dev/sda1",
"volume_size": 10
"volume_size": 10,
"delete_on_termination": true
}
],
"ami_block_device_mappings": [
{
"device_name": "/dev/sdb",
"virtual_name": "ephemeral0"
},
{
"device_name": "/dev/sdc",
"virtual_name": "ephemeral1"
},
{
"device_name": "/dev/sdd",
"virtual_name": "ephemeral2"
},
{
"device_name": "/dev/sde",
"virtual_name": "ephemeral3"
},
{
"device_name": "/dev/sdf",
"virtual_name": "ephemeral4"
},
{
"device_name": "/dev/sdg",
"virtual_name": "ephemeral5"
},
{
"device_name": "/dev/sdh",
"virtual_name": "ephemeral6"
},
{
"device_name": "/dev/sdi",
"virtual_name": "ephemeral7"
}
]
}
@@ -26,16 +62,18 @@
{
"type": "file",
"source": "files/",
"destination": "/home/centos/"
"destination": "/home/{{user `ssh_username`}}/"
},
{
"type": "file",
"source": "../../scripts/scylla_install_pkg",
"destination": "/home/centos/scylla_install_pkg"
"destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
},
{
"type": "shell",
"script": "scylla_deploy.sh"
"inline": [
"sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
]
}
],
"variables": {
@@ -45,6 +83,10 @@
"security_group_id": "",
"region": "",
"associate_public_ip_address": "",
"instance_type": ""
"instance_type": "",
"install_args": "",
"ami_prefix": "",
"source_ami": "",
"ssh_username": ""
}
}

dist/common/bin/scyllatop (new executable file)

@@ -0,0 +1,5 @@
#!/bin/sh -e
#
# Copyright (C) 2016 ScyllaDB
exec python /usr/lib/scylla/scyllatop/scyllatop.py "$@"

dist/common/collectd.d/scylla.conf (new file)

@@ -0,0 +1,16 @@
LoadPlugin network
LoadPlugin unixsock
# dummy network write target to silence a noisy warning
LoadPlugin network
<Plugin "network">
Server "127.0.0.1 65534"
</Plugin>
<Plugin network>
Listen "127.0.0.1" "25826"
</Plugin>
<Plugin unixsock>
SocketFile "/var/run/collectd-unixsock"
SocketPerms "0666"
</Plugin>


@@ -2,6 +2,25 @@
#
# Copyright (C) 2015 ScyllaDB
print_usage() {
echo "scylla_bootparam_setup --ami"
echo " --ami setup AMI instance"
exit 1
}
AMI_OPT=0
while [ $# -gt 0 ]; do
case "$1" in
"--ami")
AMI_OPT=1
shift 1
;;
*)
print_usage
;;
esac
done
. /etc/os-release
if [ ! -f /etc/default/grub ]; then
@@ -14,7 +33,11 @@ if [ "`grep hugepagesz /etc/default/grub`" != "" ] || [ "`grep hugepages /etc/de
sed -e "s#hugepages=[0-9]* ##" /etc/default/grub > /tmp/grub
mv /tmp/grub /etc/default/grub
fi
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
if [ $AMI_OPT -eq 1 ]; then
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"clocksource=tsc tsc=reliable hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
else
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
fi
mv /tmp/grub /etc/default/grub
if [ "$ID" = "ubuntu" ]; then
grub-mkconfig -o /boot/grub/grub.cfg
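The `--ami` branch above prepends `clocksource=tsc tsc=reliable` ahead of the hugepage settings on the kernel command line. A Python sketch of the same substitution, with an illustrative hugepage count:

```python
import re

line = 'GRUB_CMDLINE_LINUX="quiet splash"'
nr_hugepages = 1024  # illustrative value; the script computes NR_HUGEPAGES

# Prepend TSC clocksource and hugepage settings, as the --ami branch does
new = re.sub(r'^GRUB_CMDLINE_LINUX="',
             f'GRUB_CMDLINE_LINUX="clocksource=tsc tsc=reliable '
             f'hugepagesz=2M hugepages={nr_hugepages} ',
             line)
assert new == ('GRUB_CMDLINE_LINUX="clocksource=tsc tsc=reliable '
               'hugepagesz=2M hugepages=1024 quiet splash"')
```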


@@ -3,18 +3,19 @@
# Copyright (C) 2015 ScyllaDB
print_usage() {
echo "scylla_coredump_setup -s"
echo " -s store coredump to /var/lib/scylla"
echo "scylla_coredump_setup --dump-to-raiddir"
echo " --dump-to-raiddir store coredump to /var/lib/scylla"
exit 1
}
SYMLINK=0
while getopts sh OPT; do
case "$OPT" in
"s")
while [ $# -gt 0 ]; do
case "$1" in
"--dump-to-raiddir")
SYMLINK=1
shift 1
;;
"h")
*)
print_usage
;;
esac

dist/common/scripts/scylla_dev_mode_setup (new executable file)

@@ -0,0 +1,31 @@
#!/bin/sh -e
#
# Copyright (C) 2015 ScyllaDB
print_usage() {
echo "scylla_dev_mode_setup --developer-mode=[0|1]"
echo " --developer-mode enable/disable developer mode"
exit 1
}
DEV_MODE=
while [ $# -gt 0 ]; do
case "$1" in
"--developer-mode")
DEV_MODE=$2
shift 2
;;
*)
print_usage
;;
esac
done
if [ "$DEV_MODE" = "" ]; then
print_usage
fi
if [ "$DEV_MODE" != "0" ] && [ "$DEV_MODE" != "1" ]; then
print_usage
fi
echo "DEV_MODE=--developer-mode=$DEV_MODE" > /etc/scylla.d/dev-mode.conf

dist/common/scripts/scylla_io_setup (new executable file)

@@ -0,0 +1,80 @@
#!/bin/sh
print_usage() {
echo "scylla_io_setup --ami"
echo " --ami setup AMI instance"
exit 1
}
AMI_OPT=0
while [ $# -gt 0 ]; do
case "$1" in
"--ami")
AMI_OPT=1
shift 1
;;
*)
print_usage
;;
esac
done
is_developer_mode() {
cat /etc/scylla.d/dev-mode.conf|egrep -c "\-\-developer-mode(\s+|=)(1|true)"
}
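`is_developer_mode()` counts lines in `dev-mode.conf` matching an egrep pattern, so only `--developer-mode` followed by `1` or `true` (separated by `=` or whitespace) enables it. A Python transcription of that match, for illustration:

```python
import re

# egrep pattern from is_developer_mode(), transcribed to Python syntax
pat = re.compile(r'--developer-mode(\s+|=)(1|true)')

assert pat.search('DEV_MODE=--developer-mode=1')      # matched: dev mode on
assert pat.search('--developer-mode true')            # whitespace form also matches
assert not pat.search('--developer-mode=0')           # 0 does not enable dev mode
```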
output_to_user()
{
echo "$1"
logger -p user.err "$1"
}
. /etc/os-release
if [ "$NAME" = "Ubuntu" ]; then
. /etc/default/scylla-server
else
. /etc/sysconfig/scylla-server
fi
if [ `is_developer_mode` -eq 0 ]; then
SMP=`echo $SCYLLA_ARGS|grep smp|sed -e "s/^.*smp\(\s\+\|=\)\([0-9]*\).*$/\2/"`
CPUSET=`echo $SCYLLA_ARGS|grep cpuset|sed -e "s/^.*\(--cpuset\(\s\+\|=\)[0-9\-]*\).*$/\1/"`
if [ $AMI_OPT -eq 1 ]; then
NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
NR_DISKS=`lsblk --list --nodeps --noheadings | grep -v xvda | grep xvd | wc -l`
TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
if [ "$SMP" != "" ]; then
NR_CPU=$SMP
fi
NR_SHARDS=$NR_CPU
if [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
NR_SHARDS=$((NR_CPU - 1))
fi
if [ $NR_DISKS -lt 2 ]; then NR_DISKS=2; fi
NR_REQS=$((32 * $NR_DISKS / 2))
NR_IO_QUEUES=$NR_SHARDS
if [ $(($NR_REQS/$NR_IO_QUEUES)) -lt 4 ]; then
NR_IO_QUEUES=$(($NR_REQS / 4))
fi
NR_IO_QUEUES=$((NR_IO_QUEUES>NR_SHARDS?NR_SHARDS:NR_IO_QUEUES))
NR_REQS=$(($(($NR_REQS / $NR_IO_QUEUES)) * $NR_IO_QUEUES))
if [ "$TYPE" = "i2" ]; then
NR_REQS=$(($NR_REQS * 2))
fi
echo "SEASTAR_IO=\"--num-io-queues $NR_IO_QUEUES --max-io-requests $NR_REQS\"" > /etc/scylla.d/io.conf
else
iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf $CPUSET
if [ $? -ne 0 ]; then
output_to_user "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
output_to_user "This is a non-supported setup, and performance is expected to be very bad."
output_to_user "For better performance, placing your data on XFS-formatted directories is required."
output_to_user " To override this error, see the developer_mode configuration option."
fi
fi
fi
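On the AMI path above, the io-queue settings are derived arithmetically from the CPU and disk counts rather than measured by iotune. A Python sketch of that computation (hypothetical function name, written to follow the shell arithmetic above):

```python
def io_settings(nr_cpu, nr_disks, instance_family, set_nic='yes'):
    # Reserve one shard for the NIC on >=8-CPU instances when SET_NIC=no
    nr_shards = nr_cpu - 1 if nr_cpu >= 8 and set_nic == 'no' else nr_cpu
    nr_disks = max(nr_disks, 2)               # floor the disk count at 2
    nr_reqs = 32 * nr_disks // 2
    nr_io_queues = nr_shards
    if nr_reqs // nr_io_queues < 4:           # keep at least 4 requests per queue
        nr_io_queues = nr_reqs // 4
    nr_io_queues = min(nr_io_queues, nr_shards)
    nr_reqs = (nr_reqs // nr_io_queues) * nr_io_queues  # round to a multiple of queues
    if instance_family == 'i2':               # i2 instances get double the requests
        nr_reqs *= 2
    return nr_io_queues, nr_reqs

assert io_settings(4, 2, 'c3') == (4, 32)
assert io_settings(8, 2, 'i2', set_nic='no') == (7, 56)
```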

Some files were not shown because too many files have changed in this diff.