Compare commits


22 Commits

Rafael Ávila de Espíndola
b7c5a918cb mutation_reader_test: Wait for a future
Nothing was waiting for this future. Found while testing another
patch.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200630183929.1704908-1-espindola@scylladb.com>
(cherry picked from commit 6fe7706fce)

Fixes #6858.
2020-07-16 14:44:31 +03:00
Asias He
fb2ae9e66b repair: Relax node selection in bootstrap when nodes are less than RF
Consider a cluster with two nodes:

 - n1 (dc1)
 - n2 (dc2)

A third node is bootstrapped:

 - n3 (dc2)

Node n3 fails to bootstrap as follows:

 [shard 0] init - Startup failed: std::runtime_error
 (bootstrap_with_repair: keyspace=system_distributed,
 range=(9183073555191895134, 9196226903124807343], no existing node in
 local dc)

The system_distributed keyspace uses SimpleStrategy with RF 3. For
keyspaces that do not use NetworkTopologyStrategy, we should not
require the source node to be in the same DC.
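The relaxed selection rule can be sketched in a few lines of Python (a hypothetical sketch; `dc_of` and the strategy flag stand in for the real locator types):

```python
def pick_bootstrap_sources(old_endpoints, local_dc, dc_of, uses_network_topology):
    # Only NetworkTopologyStrategy keyspaces are DC-aware, so only they
    # should insist on finding a source replica in the local DC.
    if not uses_network_topology:
        # SimpleStrategy ignores DCs; any existing replica is a valid source.
        return list(old_endpoints)
    return [node for node in old_endpoints if dc_of[node] == local_dc]
```

With the two-node cluster above (n1 in dc1, n2 in dc2), a SimpleStrategy keyspace yields both nodes as candidate sources, so the bootstrap of n3 no longer fails with "no existing node in local dc".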

Fixes: #6744
Backports: 4.0, 4.1, 4.2
(cherry picked from commit 38d964352d)
2020-07-16 12:02:38 +03:00
Asias He
7a7ed8c65d repair: Relax size check of get_row_diff and set_diff
In case of a row hash conflict, a single hash in set_diff can map to more
than one row from get_row_diff.

For example,

Node1 (Repair master):
row1  -> hash1
row2  -> hash2
row3  -> hash3
row3' -> hash3

Node2 (Repair follower):
row1  -> hash1
row2  -> hash2

We will have set_diff = {hash3} between node1 and node2, while
get_row_diff({hash3}) will return two rows: row3 and row3'. And the
error below was observed:

   repair - Got error in row level repair: std::runtime_error
   (row_diff.size() != set_diff.size())

In this case, node1 should send both row3 and row3' to the peer node
instead of failing the whole repair. This is safe because node2 has
neither row3 nor row3'; otherwise node1 wouldn't have sent the rows with
hash3 to node2 in the first place.
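The size relaxation can be illustrated with a small sketch (plain Python sets and lists standing in for the repair containers):

```python
def get_set_diff(master_hashes, follower_hashes):
    # Hashes the master has but the follower is missing.
    return master_hashes - follower_hashes

def get_row_diff(rows_with_hashes, set_diff):
    # A hash conflict means one hash can map to several rows, so
    # len(row_diff) may legitimately exceed len(set_diff).
    return [row for row, h in rows_with_hashes if h in set_diff]

master = [("row1", "hash1"), ("row2", "hash2"),
          ("row3", "hash3"), ("row3'", "hash3")]
follower = {"hash1", "hash2"}

set_diff = get_set_diff({h for _, h in master}, follower)
row_diff = get_row_diff(master, set_diff)
```

Here `set_diff` is `{"hash3"}` but `row_diff` holds both row3 and row3', which is exactly why an exact `row_diff.size() != set_diff.size()` check is too strict.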

Refs: #6252
(cherry picked from commit a00ab8688f)
2020-07-15 14:48:49 +03:00
Nadav Har'El
7b9be752ec alternator test: configurable temporary directory
The test/alternator/run script creates a temporary directory for the Scylla
database in /tmp. The assumption was that this is the fastest disk (usually
even a ramdisk) on the test machine, and we didn't need anything else from
it.

But it turns out that on some systems, /tmp is actually a slow disk, so
this patch adds a way to configure the temporary directory - if the TMPDIR
environment variable exists, it is used instead of /tmp. As before this
patch, a temporary subdirectory is created in $TMPDIR, and this subdirectory
is automatically deleted when the test ends.

The test.py script already passes an appropriate TMPDIR (testlog/$mode),
which after this patch the Alternator test will use instead of /tmp.
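The lookup described above matches the usual environment-variable fallback pattern; a minimal sketch (the directory prefix is illustrative, not the script's actual name):

```python
import os
import shutil
import tempfile

def make_scratch_dir():
    # Prefer $TMPDIR when it is set, otherwise fall back to /tmp,
    # mirroring how the test script picks its temporary directory.
    base = os.environ.get("TMPDIR", "/tmp")
    return tempfile.mkdtemp(prefix="alternator-test-", dir=base)

def cleanup(path):
    # The subdirectory is deleted when the test ends.
    shutil.rmtree(path, ignore_errors=True)
```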

Fixes #6750

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200713193023.788634-1-nyh@scylladb.com>
(cherry picked from commit 8e3be5e7d6)
2020-07-14 12:34:26 +03:00
Konstantin Osipov
903e967a16 Export TMPDIR pointing at subdir of testlog/
Export a TMPDIR environment variable pointing at a subdir of testlog.
This variable is used by seastar/scylla tests to create a
subdirectory with temporary test data. Normally a test cleans
up the temporary directory, but if it crashes or is killed the
directory remains.

By resetting the default location from /tmp to testlog/{mode}
we allow test.py to consolidate all test artefacts in a single
place.

Fixes #6062, "test.py uses tmpfs"

(cherry picked from commit e628da863d)
2020-07-14 12:34:06 +03:00
Avi Kivity
b84946895c Update seastar submodule
* seastar 1e762652c4...8aad24a5f8 (2):
  > futures: Add a test for a broken promise in a parallel_for_each
  > future: Call set_to_broken_promise earlier

Fixes #6749 (probably).
2020-07-13 20:08:16 +03:00
Asias He
a27188886a repair: Switch to btree_set for repair_hash.
In one of the longevity tests, we observed a 1.3s reactor stall that came
from repair_meta::get_full_row_hashes_source_op. It traced back to a call
to std::unordered_set::insert() which triggered a big memory allocation
and reclaim.

I measured std::unordered_set, absl::flat_hash_set, absl::node_hash_set
and absl::btree_set. The absl::btree_set was the only one that seastar's
oversized-allocation checker did not warn about in my tests, where around
300K repair hashes were inserted into the container.

- unordered_set:
hash_sets=295634, time=333029199 ns

- flat_hash_set:
hash_sets=295634, time=312484711 ns

- node_hash_set:
hash_sets=295634, time=346195835 ns

- btree_set:
hash_sets=295634, time=341379801 ns

The btree_set is a bit slower than unordered_set, but it does not cause
huge memory allocations. I did not measure the real difference in total
time to finish a repair of the same dataset with unordered_set versus
btree_set.

To fix, switch to the absl::btree_set container.

Fixes #6190

(cherry picked from commit 67f6da6466)
2020-07-13 10:09:23 +03:00
Dmitry Kropachev
51d4efc321 dist/common/scripts/scylla-housekeeping: wrap urllib.request with try ... except
We could hit "cannot serialize '_io.BufferedReader' object" when the request
gets a 404 error from the server.
Now you will get a legitimate error message in that case.

Fixes #6690
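The fix pattern amounts to running the fetch inside the worker and returning a `(status, payload)` tuple instead of letting a non-picklable exception cross the process boundary (a sketch of the approach, not the shipped script):

```python
import urllib.request

def get_url(path):
    # HTTPError carries an _io.BufferedReader, which multiprocessing
    # cannot pickle; converting it to a string keeps the result
    # serializable and preserves a legible error message.
    try:
        return 0, urllib.request.urlopen(path).read().decode("utf-8")
    except Exception as exc:
        return 1, str(exc)
```

The caller then checks the status and raises a plain RuntimeError carrying the stringified error, which serializes fine.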

(cherry picked from commit de82b3efae)
2020-07-09 18:24:55 +03:00
Avi Kivity
0847eea8d6 Update seastar submodule
* seastar 11e86172ba...1e762652c4 (1):
  > sharded: Do not hang on never set freed promise

Fixes #6606.
2020-07-09 15:52:26 +03:00
Avi Kivity
35ad57cb9c Point seastar submodule at scylla-seastar.git
This allows us to backport seastar patches to 4.2.
2020-07-09 15:50:25 +03:00
Hagit Segev
42b0b9ad08 release: prepare for 4.2.rc1 2020-07-08 23:01:10 +03:00
Dejan Mircevski
68b95bf2ac cql/restrictions: Handle WHERE a>0 AND a<0
WHERE clauses with start point above the end point were handled
incorrectly.  When the slice bounds are transformed to interval
bounds, the resulting interval is interpreted as wrap-around (because
start > end), so it contains all values above 0 and all values below
0.  This is clearly incorrect, as the user's intent was to filter out
all possible values of a.

Fix it by explicitly short-circuiting to false when start > end.  Add
a test case.
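The bug and the fix can be sketched with exclusive integer bounds (a simplification of the serialized ring intervals):

```python
def is_wrap_around(start, end):
    # In the interval model, start > end encodes a ring-wrapping range.
    return start > end

def interval_contains(start, end, value):
    if is_wrap_around(start, end):
        # The wrap-around reading: "everything above start or below end".
        return value > start or value < end
    return start < value < end

def contains_without_wraparound(start, end, value):
    # A WHERE a > start AND a < end slice can never wrap, so a start
    # above the end simply means the empty set.
    return not is_wrap_around(start, end) and interval_contains(start, end, value)
```

For the slice a > 5 AND a < 2, `interval_contains(5, 2, 7)` is true under the wrap-around reading, while `contains_without_wraparound(5, 2, 7)` is correctly false.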

Fixes #5799.

Tests: unit (dev)

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 921dbd0978)
2020-07-08 13:20:10 +03:00
Botond Dénes
fea83f6ae0 db/view: view_update_generator: re-balance wait/signal on the register semaphore
The view update generator has a semaphore to limit concurrency. This
semaphore is waited on in `register_staging_sstable()` and later the
unit is returned after the sstable is processed in the loop inside
`start()`.
This was broken by 4e64002, which changed the loop inside `start()` to
process sstables in per-table batches but didn't change the
`signal()` call to return units according to the number of
sstables processed. This can cause the semaphore units to dry up, as the
loop can process multiple sstables per table but return just a single
unit. It can also block callers of `register_staging_sstable()`
indefinitely: some waiters will never be released, because under the right
circumstances the units on the semaphore can permanently go below 0.
In addition, 4e64002 introduced another bug: table entries are never
removed from `_sstables_with_tables`, so they are processed on every
turn. If the sstable list is empty, no update will be generated, but due
to the unconditional `signal()` described above, this can cause the
units on the semaphore to grow without bound, allowing future
staging-sstable producers to register a huge number of sstables, causing
memory problems due to the number of sstable readers that have to be
opened (#6603, #6707).
Both outcomes are equally bad. This patch fixes both issues and modifies
the `test_view_update_generator` unit test to reproduce them, verifying
that they don't happen in the future.
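The imbalance is easy to reproduce with a toy counting semaphore (a sketch, not the seastar primitive; negative units stand in for blocked waiters):

```python
class ToyCountingSemaphore:
    def __init__(self, units):
        self.units = units

    def wait(self, n=1):
        self.units -= n  # negative units model blocked waiters

    def signal(self, n=1):
        self.units += n

def process_batches(sem, batches, per_sstable_signal):
    # Each registered sstable took one unit; the batched loop must
    # return one unit per sstable, not one per table.
    for batch in batches:
        sem.signal(len(batch) if per_sstable_signal else 1)

sem = ToyCountingSemaphore(5)
for _ in range(4):          # four sstables registered, one unit each
    sem.wait()
process_batches(sem, [["a", "b"], ["c", "d"]], per_sstable_signal=False)
leaked = 5 - sem.units      # two units were lost to the batching bug

sem = ToyCountingSemaphore(5)
for _ in range(4):
    sem.wait()
process_batches(sem, [["a", "b"], ["c", "d"]], per_sstable_signal=True)
balanced = sem.units        # back to the initial 5 units
```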

Fixes: #6774
Refs: #6707
Refs: #6603

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200706135108.116134-1-bdenes@scylladb.com>
(cherry picked from commit 5ebe2c28d1)
2020-07-08 11:13:24 +03:00
Takuya ASADA
76618a7e06 scylla_setup: don't add same disk device twice
We shouldn't accept adding the same disk twice at the RAID prompt.

Fixes #6711

(cherry picked from commit 835e76fdfc)
2020-07-07 13:07:59 +03:00
Takuya ASADA
189a08ac72 scylla_setup: follow hugepages package name change on Ubuntu 20.04LTS
The hugepages package has been renamed to libhugetlbfs-bin; we need to
follow the change.

Fixes #6673

(cherry picked from commit 03ce19d53a)
2020-07-05 14:41:33 +03:00
Takuya ASADA
a3e9915a83 dist/debian: apply generated package version for .orig.tar.gz file
We are currently unable to apply the version number fixup for the .orig.tar.gz
file, even when we apply the correct fixup in debian/changelog, because it
just reads SCYLLA-VERSION-FILE.
We should parse debian/{changelog,control} instead.

Fixes #6736

(cherry picked from commit a107f086bc)
2020-07-05 14:08:37 +03:00
Asias He
e4bc14ec1a boot_strapper: Ignore node to be replaced explicitly as stream source
After commit 7d86a3b208 (storage_service:
Make replacing node take writes), during a replace operation, the tokens in
_token_metadata for the node being replaced are updated only after the
replace operation is finished. As a result, in range_streamer::add_ranges,
the node being replaced will be considered as a source to stream data from.

Before commit 7d86a3b208, the node being
replaced was not considered as a source node, because it had already been
replaced by the replacing node before the replace operation finished.
This is why it worked in the past.

To fix, filter out the node being replaced as a source node explicitly.
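The explicit filter amounts to excluding the replaced endpoint alongside the unreachable members (a hypothetical sketch; the names are illustrative):

```python
def streaming_sources(all_endpoints, unreachable, replace_address=None):
    # The node being replaced still owns its tokens in _token_metadata
    # during the operation, so it must be excluded explicitly rather
    # than relying on token ownership alone.
    excluded = set(unreachable)
    if replace_address is not None:
        excluded.add(replace_address)
    return [ep for ep in all_endpoints if ep not in excluded]
```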

Tests: replace_first_boot_test and replace_stopped_node_test
Backports: 4.1
Fixes: #6728
(cherry picked from commit e338028b7e22b0a80be7f80c337c52f958bfe1d7)
2020-07-01 14:36:43 +03:00
Takuya ASADA
972acb6d56 scylla_swap_setup: handle <1GB environment
Show a better error message and exit with a non-zero status when the memory
size is <1GB.

Fixes #6659

(cherry picked from commit a9de438b1f)
2020-07-01 12:40:25 +03:00
Yaron Kaikov
7fbfedf025 dist/docker/redhat/Dockerfile: update 4.2 params
Set SCYLLA_REPO and VERSION values for scylla-4.2
2020-06-30 13:09:06 +03:00
Avi Kivity
5f175f8103 Merge "Fix handling of decimals with negative scales" from Rafael
"
Before this series scylla would effectively infinite loop when, for
example, casting a decimal with a negative scale to float.

Fixes #6720
"

* 'espindola/fix-decimal-issue' of https://github.com/espindola/scylla:
  big_decimal: Add a test for a corner case
  big_decimal: Correctly handle negative scales
  big_decimal: Add a as_rational member function
  big_decimal: Move constructors out of line
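The as_rational fix can be mirrored with Python's `Fraction`: a decimal value is unscaled_value * 10**(-scale), and treating it as an exact rational handles a negative scale directly (a sketch of the idea, not the C++ code):

```python
from fractions import Fraction

def decimal_as_rational(unscaled, scale):
    # value = unscaled * 10**(-scale); Fraction copes with a negative
    # scale naturally, where naive repeated division by ten would not.
    return Fraction(unscaled) * Fraction(10) ** -scale

def decimal_to_float(unscaled, scale):
    # Casting to float goes through the exact rational, so a negative
    # scale (e.g. 12 * 10**2 = 1200) terminates immediately.
    return float(decimal_as_rational(unscaled, scale))
```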

(cherry picked from commit 3e2eeec83a)
2020-06-29 12:05:17 +03:00
Benny Halevy
674ad6656a compaction: restore % in compaction completion message
The % sign fell off in c4841fa735

Fixes #6727.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200625151352.736561-1-bhalevy@scylladb.com>
(cherry picked from commit a843945115)
2020-06-28 12:10:21 +03:00
Hagit Segev
58498b4b6c release: prepare for 4.2.rc0 2020-06-26 13:06:07 +03:00
31 changed files with 267 additions and 98 deletions

.gitmodules

@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui


@@ -1,7 +1,7 @@
 #!/bin/sh
 PRODUCT=scylla
-VERSION=666.development
+VERSION=4.2.rc1
 if test -f version
 then


@@ -88,16 +88,13 @@ static data_value castas_fctn_simple(data_value from) {
 template<typename ToType>
 static data_value castas_fctn_from_decimal_to_float(data_value from) {
     auto val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    boost::multiprecision::cpp_rational r = val_from.unscaled_value();
-    r /= boost::multiprecision::pow(ten, val_from.scale());
-    return static_cast<ToType>(r);
+    return static_cast<ToType>(val_from.as_rational());
 }

 static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
     const auto& val_from = value_cast<big_decimal>(from);
-    boost::multiprecision::cpp_int ten(10);
-    return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
+    auto r = val_from.as_rational();
+    return utils::multiprecision_int(numerator(r)/denominator(r));
 }

 template<typename ToType>
template<typename ToType>


@@ -688,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
 		extract_bound(statements::bound::END));
 }

+static bool contains_without_wraparound(
+        const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
+    return !range.is_wrap_around(cmp) && range.contains(value, cmp);
+}
+
 bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
     const partition_key& key,
     const clustering_key_prefix& ckey,
@@ -702,13 +707,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
         return false;
     }
     return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
-        return to_range(_slice, options, _column_def.name_as_text()).contains(
+        return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
             cell_value_bv, _column_def.type->as_tri_comparator());
     });
 }

 bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
-    return to_range(_slice, options, _column_def.name_as_text()).contains(
+    return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
         data, _column_def.type->underlying_type()->as_tri_comparator());
 }


@@ -44,11 +44,16 @@ future<> view_update_generator::start() {
         // If we got here, we will process all tables we know about so far eventually so there
         // is no starvation
-        for (auto& t : _sstables_with_tables | boost::adaptors::map_keys) {
+        for (auto table_it = _sstables_with_tables.begin(); table_it != _sstables_with_tables.end(); table_it = _sstables_with_tables.erase(table_it)) {
+            auto& [t, t_sstables] = *table_it;
             schema_ptr s = t->schema();
+            vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), t_sstables.size());
             // Copy what we have so far so we don't miss new updates
-            auto sstables = std::exchange(_sstables_with_tables[t], {});
+            auto sstables = std::exchange(t_sstables, {});
+            const auto num_sstables = sstables.size();
             try {
                 // temporary: need an sstable set for the flat mutation reader, but the
@@ -89,7 +94,7 @@ future<> view_update_generator::start() {
             // Move from staging will be retried upon restart.
             vug_logger.warn("Moving {} from staging failed: {}:{}. Ignoring...", s->ks_name(), s->cf_name(), std::current_exception());
         }
-        _registration_sem.signal();
+        _registration_sem.signal(num_sstables);
     }
     // For each table, move the processed staging sstables into the table's base dir.
     for (auto it = _sstables_to_move.begin(); it != _sstables_to_move.end(); ) {


@@ -32,7 +32,10 @@
 namespace db::view {

 class view_update_generator {
+public:
+    static constexpr size_t registration_queue_size = 5;
+private:
     database& _db;
     seastar::abort_source _as;
     future<> _started = make_ready_future<>();
@@ -51,6 +54,8 @@ public:
     future<> start();
     future<> stop();
     future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
+    ssize_t available_register_units() const { return _registration_sem.available_units(); }
 private:
     bool should_throttle() const;
 };


@@ -59,7 +59,12 @@ future<> boot_strapper::bootstrap(streaming::stream_reason reason) {
         return make_exception_future<>(std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap"));
     }
     auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, description, reason);
-    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_gossiper().get_unreachable_members()));
+    auto nodes_to_filter = gms::get_local_gossiper().get_unreachable_members();
+    if (reason == streaming::stream_reason::replace && _db.local().get_replace_address()) {
+        nodes_to_filter.insert(_db.local().get_replace_address().value());
+    }
+    blogger.debug("nodes_to_filter={}", nodes_to_filter);
+    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(nodes_to_filter));
     auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
     return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
         auto& ks = _db.local().find_keyspace(keyspace_name);


@@ -61,7 +61,15 @@ def sh_command(*args):
return out
 def get_url(path):
-    return urllib.request.urlopen(path).read().decode('utf-8')
+    # If the server returns any error, like 403 or 500, urllib.request throws an exception, which is not serializable.
+    # When the multiprocessing routines fail to serialize it, an ambiguous serialization exception is thrown
+    # from get_json_from_url.
+    # In order to see the legitimate error, we catch it inside the process, convert it to a string and
+    # pass it as part of the return value.
+    try:
+        return 0, urllib.request.urlopen(path).read().decode('utf-8')
+    except Exception as exc:
+        return 1, str(exc)
def get_json_from_url(path):
pool = mp.Pool(processes=1)
@@ -71,13 +79,16 @@ def get_json_from_url(path):
# to enforce a wallclock timeout.
result = pool.apply_async(get_url, args=(path,))
try:
-        retval = result.get(timeout=5)
+        status, retval = result.get(timeout=5)
     except mp.TimeoutError as err:
         pool.terminate()
         pool.join()
         raise
+    if status == 1:
+        raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
return json.loads(retval)
def get_api(path):
return get_json_from_url("http://" + api_address + path)


@@ -27,6 +27,7 @@ import glob
 import shutil
 import io
 import stat
+import distro
 from scylla_util import *
interactive = False
@@ -385,6 +386,9 @@ if __name__ == '__main__':
         if not stat.S_ISBLK(os.stat(dsk).st_mode):
             print('{} is not block device'.format(dsk))
             continue
+        if dsk in selected:
+            print(f'{dsk} is already added')
+            continue
         selected.append(dsk)
         devices.remove(dsk)
     disks = ','.join(selected)
@@ -468,5 +472,10 @@ if __name__ == '__main__':
print('Please restart your machine before using ScyllaDB, as you have disabled')
print(' SELinux.')
-    if dist_name() == 'Ubuntu':
-        run('apt-get install -y hugepages')
+    if distro.id() == 'ubuntu':
+        # Ubuntu version is 20.04 or later
+        if int(distro.major_version()) >= 20:
+            hugepkg = 'libhugetlbfs-bin'
+        else:
+            hugepkg = 'hugepages'
+        run(f'apt-get install -y {hugepkg}')


@@ -40,6 +40,10 @@ if __name__ == '__main__':
sys.exit(1)
memtotal = get_memtotal_gb()
if memtotal == 0:
print('memory too small: {} KB'.format(get_memtotal()))
sys.exit(1)
# Scylla document says 'swap size should be set to either total_mem/3 or
# 16GB - lower of the two', so we need to compare 16g vs memtotal/3 and
# choose lower one


@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
ENV container docker
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
-ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
-ARG VERSION=666.development
+ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/scylla-4.2/latest/scylla.repo
+ARG VERSION=4.2
ADD scylla_bashrc /scylla_bashrc


@@ -98,6 +98,7 @@ fedora_packages=(
     debhelper
     fakeroot
     file
+    dpkg-dev
 )
centos_packages=(

lua.cc

@@ -262,14 +262,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {
 template <typename Func>
 static auto visit_decimal(const big_decimal &v, Func&& f) {
-    boost::multiprecision::cpp_int ten(10);
-    const auto& dividend = v.unscaled_value();
-    auto divisor = boost::multiprecision::pow(ten, v.scale());
+    boost::multiprecision::cpp_rational r = v.as_rational();
+    const boost::multiprecision::cpp_int& dividend = numerator(r);
+    const boost::multiprecision::cpp_int& divisor = denominator(r);
     if (dividend % divisor == 0) {
-        return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
+        return f(utils::multiprecision_int(dividend/divisor));
     }
-    boost::multiprecision::cpp_rational r = dividend;
-    r /= divisor;
     return f(r.convert_to<double>());
 }


@@ -1199,14 +1199,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
}
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
}
future<> messaging_service::unregister_repair_get_full_row_hashes() {
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
}
future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
}
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1231,13 +1231,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
}
// Wrapper for REPAIR_GET_ROW_DIFF
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
}
future<> messaging_service::unregister_repair_get_row_diff() {
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
}
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
}


@@ -339,9 +339,9 @@ public:
future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
future<> unregister_repair_get_full_row_hashes();
future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -354,9 +354,9 @@ public:
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);
// Wrapper for REPAIR_GET_ROW_DIFF
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
future<> unregister_repair_get_row_diff();
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);
// Wrapper for REPAIR_PUT_ROW_DIFF
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);


@@ -44,15 +44,15 @@ mkdir -p $BUILDDIR/scylla-package
 tar -C $BUILDDIR/scylla-package -xpf $RELOC_PKG
 cd $BUILDDIR/scylla-package
 PRODUCT=$(cat scylla/SCYLLA-PRODUCT-FILE)
-SCYLLA_VERSION=$(cat scylla/SCYLLA-VERSION-FILE)
-SCYLLA_RELEASE=$(cat scylla/SCYLLA-RELEASE-FILE)
-ln -fv $RELOC_PKG ../$PRODUCT-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
 if $DIST; then
     export DEB_BUILD_OPTIONS="housekeeping"
 fi
 mv scylla/debian debian
+PKG_NAME=$(dpkg-parsechangelog --show-field Source)
+# XXX: Drop revision number from version string.
+# Since it always '1', this should be okay for now.
+PKG_VERSION=$(dpkg-parsechangelog --show-field Version |sed -e 's/-1$//')
+ln -fv $RELOC_PKG ../"$PKG_NAME"_"$PKG_VERSION".orig.tar.gz
 debuild -rfakeroot -us -uc


@@ -1633,6 +1633,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
     auto& ks = db.local().find_keyspace(keyspace_name);
     auto& strat = ks.get_replication_strategy();
     dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tm, tokens, myip);
+    bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
     //Active ranges
     auto metadata_clone = tm.clone_only_token_map();
@@ -1719,6 +1720,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
         mandatory_neighbors = get_node_losing_the_ranges(old_endpoints, new_endpoints);
         neighbors = mandatory_neighbors;
     } else if (old_endpoints.size() < strat.get_replication_factor()) {
+        if (!find_node_in_local_dc_only) {
+            neighbors = old_endpoints;
+        } else {
         if (old_endpoints_in_local_dc.size() == rf_in_local_dc) {
             // Local DC has enough replica nodes.
             mandatory_neighbors = get_node_losing_the_ranges(old_endpoints_in_local_dc, new_endpoints);
@@ -1746,6 +1750,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
             throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints_in_local_dc={}, rf_in_local_dc={}",
                 keyspace_name, desired_range, old_endpoints_in_local_dc.size(), rf_in_local_dc));
         }
+        }
     } else {
         throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints={}, rf={}",
             keyspace_name, desired_range, old_endpoints, strat.get_replication_factor()));


@@ -23,6 +23,7 @@
 #include <unordered_map>
 #include <exception>
+#include <absl/container/btree_set.h>
 #include <seastar/core/sstring.hh>
 #include <seastar/core/sharded.hh>
@@ -339,6 +340,8 @@ public:
     }
 };

+using repair_hash_set = absl::btree_set<repair_hash>;
+
 enum class repair_row_level_start_status: uint8_t {
     ok,
     no_such_column_family,


@@ -666,7 +666,7 @@ private:
// Tracks current sync boundary
std::optional<repair_sync_boundary> _current_sync_boundary;
// Contains the hashes of rows in the _working_row_buffor for all peer nodes
std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
std::vector<repair_hash_set> _peer_row_hash_sets;
// Gate used to make sure pending operation of meta data is done
seastar::gate _gate;
sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -886,9 +886,9 @@ public:
}
// Must run inside a seastar thread
static std::unordered_set<repair_hash>
get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
std::unordered_set<repair_hash> set_diff;
static repair_hash_set
get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
repair_hash_set set_diff;
// Note std::set_difference needs x and y are sorted.
std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
[&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -906,14 +906,14 @@ public:
}
std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
return _peer_row_hash_sets[node_idx];
}
// Get a list of row hashes in _working_row_buf
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
working_row_hashes() {
return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
hashes.emplace(r.hash());
}).then([&hashes] {
@@ -1199,9 +1199,9 @@ private:
}
future<std::list<repair_row>>
copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
return do_with(std::list<repair_row>(), std::move(set_diff),
[this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
[this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
if (set_diff.count(r.hash()) > 0) {
rows.push_back(r);
@@ -1216,7 +1216,7 @@ private:
// Give a set of row hashes, return the corresponding rows
// If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
future<std::list<repair_row>>
get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
if (needs_all_rows) {
if (!_repair_master || _nr_peer_nodes == 1) {
return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1266,7 +1266,7 @@ private:
[this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
}
if (update_hash_set) {
_peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
_peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
}
do_apply_rows(row_diff, node_idx, update_buf).get();
@@ -1360,13 +1360,13 @@ private:
public:
// RPC API
// Return the hashes of the rows in _working_row_buf
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
get_full_row_hashes(gms::inet_address remote_node) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
_repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
_repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
_metrics.rx_hashes_nr += hashes.size();
stats().rx_hashes_nr += hashes.size();
@@ -1377,7 +1377,7 @@ public:
private:
future<> get_full_row_hashes_source_op(
-lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
+lw_shared_ptr<repair_hash_set> current_hashes,
gms::inet_address remote_node,
unsigned node_idx,
rpc::source<repair_hash_with_cmd>& source) {
@@ -1415,12 +1415,12 @@ private:
}
public:
-future<std::unordered_set<repair_hash>>
+future<repair_hash_set>
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
-auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
+auto current_hashes = make_lw_shared<repair_hash_set>();
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
[this, current_hashes, remote_node, node_idx]
(rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1435,7 +1435,7 @@ public:
}
// RPC handler
-future<std::unordered_set<repair_hash>>
+future<repair_hash_set>
get_full_row_hashes_handler() {
return with_gate(_gate, [this] {
return working_row_hashes();
@@ -1585,7 +1585,7 @@ public:
// RPC API
// Return rows in the _working_row_buf with hash within the given set_diff
// Must run inside a seastar thread
-void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
+void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
if (needs_all_rows || !set_diff.empty()) {
if (remote_node == _myip) {
return;
@@ -1654,11 +1654,11 @@ private:
}
future<> get_row_diff_sink_op(
-std::unordered_set<repair_hash> set_diff,
+repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
rpc::sink<repair_hash_with_cmd>& sink,
gms::inet_address remote_node) {
-return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
+return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
if (inject_rpc_stream_error) {
return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
}
@@ -1685,7 +1685,7 @@ private:
public:
// Must run inside a seastar thread
void get_row_diff_with_rpc_stream(
-std::unordered_set<repair_hash> set_diff,
+repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
update_peer_row_hash_sets update_hash_set,
gms::inet_address remote_node,
@@ -1711,7 +1711,7 @@ public:
}
// RPC handler
-future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
+future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
return to_repair_rows_on_wire(std::move(row_diff));
@@ -1721,15 +1721,16 @@ public:
// RPC API
// Send rows in the _working_row_buf with hash within the given set_diff
-future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
+future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
if (!set_diff.empty()) {
if (remote_node == _myip) {
return make_ready_future<>();
}
-auto sz = set_diff.size();
+size_t sz = set_diff.size();
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
if (row_diff.size() != sz) {
-throw std::runtime_error("row_diff.size() != set_diff.size()");
+rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
}
return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
@@ -1796,17 +1797,18 @@ private:
public:
future<> put_row_diff_with_rpc_stream(
-std::unordered_set<repair_hash> set_diff,
+repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
gms::inet_address remote_node, unsigned node_idx) {
if (!set_diff.empty()) {
if (remote_node == _myip) {
return make_ready_future<>();
}
-auto sz = set_diff.size();
+size_t sz = set_diff.size();
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
if (row_diff.size() != sz) {
-throw std::runtime_error("row_diff.size() != set_diff.size()");
+rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
+_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
}
return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
@@ -1845,7 +1847,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source,
bool &error,
-std::unordered_set<repair_hash>& current_set_diff,
+repair_hash_set& current_set_diff,
std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1858,7 +1860,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
}
bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
_metrics.rx_hashes_nr += current_set_diff.size();
-auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
+auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -1936,12 +1938,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
if (status == repair_stream_cmd::get_full_row_hashes) {
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
_metrics.tx_hashes_nr += hashes.size();
return hashes;
});
-}).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
-return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
+}).then([sink] (repair_hash_set hashes) mutable {
+return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
}).then([sink] () mutable {
@@ -1964,7 +1966,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
uint32_t repair_meta_id,
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source) {
-return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
+return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
if (hash_cmd_opt) {
@@ -2107,7 +2109,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
-return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
+return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
_metrics.tx_hashes_nr += hashes.size();
return hashes;
});
@@ -2135,11 +2137,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
});
});
ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
-std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
+repair_hash_set set_diff, bool needs_all_rows) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
_metrics.rx_hashes_nr += set_diff.size();
-auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
+auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2439,7 +2441,7 @@ private:
// sequentially because the rows from repair follower 1 to
// repair master might reduce the amount of missing data
// between repair master and repair follower 2.
-std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
+repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
// Request missing sets from peer node
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2462,9 +2464,9 @@ private:
// So we can figure out which rows the peer nodes are missing and send the missing rows to them
check_in_shutdown();
_ri.check_in_abort();
-std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
+repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
auto sz = _all_live_peer_nodes.size();
-std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
+std::vector<repair_hash_set> set_diffs(sz);
for (size_t idx : boost::irange(size_t(0), sz)) {
set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
}

Submodule seastar updated: 11e86172ba...8aad24a5f8


@@ -25,6 +25,7 @@
#include <seastar/util/bool_class.hh>
#include <boost/range/algorithm/for_each.hpp>
#include "utils/small_vector.hh"
+#include <absl/container/btree_set.h>
namespace ser {
@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
template<typename Container>
struct container_traits;
template<typename T>
+struct container_traits<absl::btree_set<T>> {
+struct back_emplacer {
+absl::btree_set<T>& c;
+back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
+void operator()(T&& v) {
+c.emplace(std::move(v));
+}
+};
+};
+template<typename T>
struct container_traits<std::unordered_set<T>> {
struct back_emplacer {
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
}
};
template<typename T>
+struct serializer<absl::btree_set<T>> {
+template<typename Input>
+static absl::btree_set<T> read(Input& in) {
+auto sz = deserialize(in, boost::type<uint32_t>());
+absl::btree_set<T> v;
+deserialize_array_helper<false, T>::doit(in, v, sz);
+return v;
+}
+template<typename Output>
+static void write(Output& out, const absl::btree_set<T>& v) {
+safe_serialize_as_uint32(out, v.size());
+serialize_array_helper<false, T>::doit(out, v);
+}
+template<typename Input>
+static void skip(Input& in) {
+auto sz = deserialize(in, boost::type<uint32_t>());
+skip_array<T>(in, sz);
+}
+};
+template<typename T>
struct serializer<std::unordered_set<T>> {
template<typename Input>


@@ -602,7 +602,7 @@ private:
// - add support to merge summary (message: Partition merge counts were {%s}.).
// - there is no easy way, currently, to know the exact number of total partitions.
// For the time being, using estimated key count.
-sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{} of original) in {}ms = {}. " \
+sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. " \
"~{} total partitions merged to {}.",
_info->sstables, new_sstables_msg, pretty_printed_data_size(_info->start_size), pretty_printed_data_size(_info->end_size), int(ratio * 100),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_info->end_size, duration),


@@ -447,6 +447,9 @@ async def run_test(test, options, gentle_kill=False, env=dict()):
env=dict(os.environ,
UBSAN_OPTIONS=":".join(filter(None, UBSAN_OPTIONS)),
ASAN_OPTIONS=":".join(filter(None, ASAN_OPTIONS)),
+# TMPDIR env variable is used by any seastar/scylla
+# test for directory to store test temporary data.
+TMPDIR=os.path.join(options.tmpdir, test.mode),
**env,
),
preexec_fn=os.setsid,


@@ -28,8 +28,8 @@ fi
SCYLLA_IP=127.1.$(($$ >> 8 & 255)).$(($$ & 255))
echo "Running Scylla on $SCYLLA_IP"
-tmp_dir=/tmp/alternator-test-$$
-mkdir $tmp_dir
+tmp_dir="$(readlink -e ${TMPDIR-/tmp})"/alternator-test-$$
+mkdir "$tmp_dir"
# We run the cleanup() function on exit for any reason - successful finish
# of the script, an error (since we have "set -e"), or a signal.
@@ -76,7 +76,7 @@ done
# argv[0] isn't good enough - because killall inspects the actual executable
# filename in /proc/<pid>/stat. So we need to name the executable differently.
# Luckily, using a symbolic link is good enough.
-SCYLLA_LINK=$tmp_dir/test_scylla
+SCYLLA_LINK="$tmp_dir"/test_scylla
ln -s "$SCYLLA" "$SCYLLA_LINK"
"$SCYLLA_LINK" --options-file "$source_path/conf/scylla.yaml" \


@@ -157,6 +157,13 @@ BOOST_AUTO_TEST_CASE(test_big_decimal_div) {
test_div("-0.25", 10, "-0.02");
test_div("-0.26", 10, "-0.03");
test_div("-10E10", 3, "-3E10");
+// Document a small oddity, 1e1 has -1 decimal places, so dividing
+// it by 2 produces 0. This is not the behavior in cassandra, but
+// scylla doesn't expose arithmetic operations, so this doesn't
+// seem to be visible from CQL.
+test_div("10", 2, "5");
+test_div("1e1", 2, "0e1");
}
BOOST_AUTO_TEST_CASE(test_big_decimal_assignadd) {


@@ -142,6 +142,19 @@ SEASTAR_TEST_CASE(test_decimal_to_bigint) {
});
}
+SEASTAR_TEST_CASE(test_decimal_to_float) {
+return do_with_cql_env_thread([&](auto& e) {
+e.execute_cql("CREATE TABLE test (key text primary key, value decimal)").get();
+e.execute_cql("INSERT INTO test (key, value) VALUES ('k1', 10)").get();
+e.execute_cql("INSERT INTO test (key, value) VALUES ('k2', 1e1)").get();
+auto v = e.execute_cql("SELECT key, CAST(value as float) from test").get0();
+assert_that(v).is_rows().with_rows_ignore_order({
+{{serialized("k1")}, {serialized(float(10))}},
+{{serialized("k2")}, {serialized(float(10))}},
+});
+});
+}
SEASTAR_TEST_CASE(test_varint_to_bigint) {
return do_with_cql_env_thread([&](auto& e) {
e.execute_cql("CREATE TABLE test (key text primary key, value varint)").get();


@@ -4583,3 +4583,21 @@ SEASTAR_TEST_CASE(test_internal_alter_table_on_a_distributed_table) {
});
});
}
+SEASTAR_TEST_CASE(test_impossible_where) {
+return do_with_cql_env_thread([] (cql_test_env& e) {
+cquery_nofail(e, "CREATE TABLE t(p int PRIMARY KEY, r int)");
+cquery_nofail(e, "INSERT INTO t(p,r) VALUES (0, 0)");
+cquery_nofail(e, "INSERT INTO t(p,r) VALUES (1, 10)");
+cquery_nofail(e, "INSERT INTO t(p,r) VALUES (2, 20)");
+require_rows(e, "SELECT * FROM t WHERE r>10 AND r<10 ALLOW FILTERING", {});
+require_rows(e, "SELECT * FROM t WHERE r>=10 AND r<=0 ALLOW FILTERING", {});
+cquery_nofail(e, "CREATE TABLE t2(p int, c int, PRIMARY KEY(p, c)) WITH CLUSTERING ORDER BY (c DESC)");
+cquery_nofail(e, "INSERT INTO t2(p,c) VALUES (0, 0)");
+cquery_nofail(e, "INSERT INTO t2(p,c) VALUES (1, 10)");
+cquery_nofail(e, "INSERT INTO t2(p,c) VALUES (2, 20)");
+require_rows(e, "SELECT * FROM t2 WHERE c>10 AND c<10 ALLOW FILTERING", {});
+require_rows(e, "SELECT * FROM t2 WHERE c>=10 AND c<=0 ALLOW FILTERING", {});
+});
+}


@@ -2588,6 +2588,7 @@ SEASTAR_THREAD_TEST_CASE(test_queue_reader) {
BOOST_REQUIRE_THROW(handle.push(partition_end{}).get(), std::runtime_error);
BOOST_REQUIRE_THROW(handle.push_end_of_stream(), std::runtime_error);
+BOOST_REQUIRE_THROW(fill_buffer_fut.get(), broken_promise);
}
// Abandoned handle aborts, move-assignment


@@ -421,23 +421,49 @@ SEASTAR_TEST_CASE(test_view_update_generator) {
auto& view_update_generator = e.local_view_update_generator();
auto s = test_table_schema();
std::vector<shared_sstable> ssts;
lw_shared_ptr<table> t = e.local_db().find_column_family("ks", "t").shared_from_this();
auto write_to_sstable = [&] (mutation m) {
auto sst = t->make_streaming_staging_sstable();
sstables::sstable_writer_config sst_cfg = test_sstables_manager.configure_writer();
auto& pc = service::get_local_streaming_priority();
sst->write_components(flat_mutation_reader_from_mutations({m}), 1ul, s, sst_cfg, {}, pc).get();
sst->open_data().get();
t->add_sstable_and_update_cache(sst).get();
return sst;
};
auto key = partition_key::from_exploded(*s, {to_bytes(key1)});
mutation m(s, key);
auto col = s->get_column_definition("v");
for (int i = 1024; i < 1280; ++i) {
auto& row = m.partition().clustered_row(*s, clustering_key::from_exploded(*s, {to_bytes(fmt::format("c{}", i))}));
row.cells().apply(*col, atomic_cell::make_live(*col->type, 2345, col->type->decompose(sstring(fmt::format("v{}", i)))));
// Scatter the data in a bunch of different sstables, so we
// can test the registration semaphore of the view update
// generator
if (!(i % 10)) {
ssts.push_back(write_to_sstable(std::exchange(m, mutation(s, key))));
}
}
lw_shared_ptr<table> t = e.local_db().find_column_family("ks", "t").shared_from_this();
ssts.push_back(write_to_sstable(std::move(m)));
auto sst = t->make_streaming_staging_sstable();
sstables::sstable_writer_config sst_cfg = test_sstables_manager.configure_writer();
auto& pc = service::get_local_streaming_priority();
BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);
sst->write_components(flat_mutation_reader_from_mutations({m}), 1ul, s, sst_cfg, {}, pc).get();
sst->open_data().get();
t->add_sstable_and_update_cache(sst).get();
view_update_generator.register_staging_sstable(sst, t).get();
parallel_for_each(ssts.begin(), ssts.begin() + 10, [&] (shared_sstable& sst) {
return view_update_generator.register_staging_sstable(sst, t);
}).get();
BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);
parallel_for_each(ssts.begin() + 10, ssts.end(), [&] (shared_sstable& sst) {
return view_update_generator.register_staging_sstable(sst, t);
}).get();
BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);
eventually([&, key1, key2] {
auto msg = e.execute_cql(fmt::format("SELECT * FROM t WHERE p = '{}'", key1)).get0();
@@ -464,5 +490,7 @@ SEASTAR_TEST_CASE(test_view_update_generator) {
}
});
BOOST_REQUIRE_EQUAL(view_update_generator.available_register_units(), db::view::view_update_generator::registration_queue_size);
});
}


@@ -36,6 +36,9 @@ uint64_t from_varint_to_integer(const utils::multiprecision_int& varint) {
return static_cast<uint64_t>(~static_cast<uint64_t>(0) & boost::multiprecision::cpp_int(varint));
}
+big_decimal::big_decimal() : big_decimal(0, 0) {}
+big_decimal::big_decimal(int32_t scale, boost::multiprecision::cpp_int unscaled_value)
+: _scale(scale), _unscaled_value(std::move(unscaled_value)) {}
big_decimal::big_decimal(sstring_view text)
{
@@ -82,6 +85,20 @@ big_decimal::big_decimal(sstring_view text)
_scale += fraction.size();
}
+boost::multiprecision::cpp_rational big_decimal::as_rational() const {
+boost::multiprecision::cpp_int ten(10);
+auto unscaled_value = static_cast<const boost::multiprecision::cpp_int&>(_unscaled_value);
+boost::multiprecision::cpp_rational r = unscaled_value;
+int32_t abs_scale = std::abs(_scale);
+auto pow = boost::multiprecision::pow(ten, abs_scale);
+if (_scale < 0) {
+r *= pow;
+} else {
+r /= pow;
+}
+return r;
+}
sstring big_decimal::to_string() const
{
if (!_unscaled_value) {


@@ -39,13 +39,12 @@ public:
};
explicit big_decimal(sstring_view text);
-big_decimal() : big_decimal(0, 0) {}
-big_decimal(int32_t scale, boost::multiprecision::cpp_int unscaled_value)
-: _scale(scale), _unscaled_value(unscaled_value)
-{ }
+big_decimal();
+big_decimal(int32_t scale, boost::multiprecision::cpp_int unscaled_value);
int32_t scale() const { return _scale; }
const boost::multiprecision::cpp_int& unscaled_value() const { return _unscaled_value; }
boost::multiprecision::cpp_rational as_rational() const;
sstring to_string() const;