gossip: Fix tokens assignment in assassinate_endpoint

The tokens vector is defined a few lines above and is needed outsie the if block. Do not redefine it again in the if block, otherwise the tokens will be empty. Found by code inspection. Fixes #3551. Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com> (cherry picked from commit c3b5a2ecd5)
locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting()
2018-06-27 12:01:19 +03:00 · 2018-06-12 19:02:48 +03:00 · 2018-05-24 12:02:15 +03:00 · 2018-05-24 11:14:20 +03:00 · 2018-05-24 11:08:13 +03:00 · 2018-05-24 15:24:29 +08:00
7 changed files with 46 additions and 29 deletions
--- a/dist/common/modprobe.d/scylla-raid0.conf
+++ b/dist/common/modprobe.d/scylla-raid0.conf
@@ -1 +0,0 @@
-options raid0 devices_discard_performance=Y
--- a/dist/redhat/scylla.spec.in
+++ b/dist/redhat/scylla.spec.in
@@ -92,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
-%if 0%{?rhel}
-mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
 mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
 mkdir -p $RPM_BUILD_ROOT%{_unitdir}
@@ -105,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
 install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
 install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
 install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
-%if 0%{?rhel}
-install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
-%endif
 install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
 install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
 install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
@@ -271,18 +265,9 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
-# Write modprobe.d params when module already loaded
-%if 0%{?rhel}
-if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
-    echo Y > /sys/module/raid0/parameters/devices_discard_performance
-fi
-%endif

 %files kernel-conf
 %defattr(-,root,root)
-%if 0%{?rhel}
-%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
-%endif
 %{_sysctldir}/*.conf

 %changelog
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -984,7 +984,7 @@ future<> gossiper::assassinate_endpoint(sstring address) {
            logger.warn("Assassinating {} via gossip", endpoint);
            if (es) {
                auto& ss = service::get_local_storage_service();
-                auto tokens = ss.get_token_metadata().get_tokens(endpoint);
+                tokens = ss.get_token_metadata().get_tokens(endpoint);
                if (tokens.empty()) {
                    logger.warn("Unable to calculate tokens for {}.  Will use a random one", address);
                    throw std::runtime_error(sprint("Unable to calculate tokens for %s", endpoint));
--- a/locator/ec2_multi_region_snitch.cc
+++ b/locator/ec2_multi_region_snitch.cc
@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
    // Note: currently gossiper "main" instance always runs on CPU0 therefore
    // this function will be executed on CPU0 only.
    //
-    ec2_snitch::gossiper_starting();

    using namespace gms;
    auto& g = get_local_gossiper();
--- a/2
+++ b/2
--- a/streaming/stream_session.cc
+++ b/streaming/stream_session.cc
@@ -181,8 +181,8 @@ void stream_session::init_messaging_service_handler() {
        if (failed && *failed) {
            return smp::submit_to(dst_cpu_id, [plan_id, from, dst_cpu_id] () {
                auto session = get_session(plan_id, from, "COMPLETE_MESSAGE");
-                sslog.warn("[Stream #{}] COMPLETE_MESSAGE with error flag from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
-                session->on_error();
+                sslog.debug("[Stream #{}] COMPLETE_MESSAGE with error flag from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
+                session->received_failed_complete_message();
                return make_ready_future<>();
            });
        } else {
@@ -236,7 +236,9 @@ future<> stream_session::on_initialization_complete() {
            for (auto& summary : msg.summaries) {
                this->prepare_receiving(summary);
            }
-            _stream_result->handle_session_prepared(this->shared_from_this());
+            if (_stream_result) {
+                _stream_result->handle_session_prepared(this->shared_from_this());
+            }
        } catch (...) {
            sslog.warn("[Stream #{}] Fail to send PREPARE_MESSAGE to {}, {}", this->plan_id(), id, std::current_exception());
            throw;
@@ -257,9 +259,19 @@ future<> stream_session::on_initialization_complete() {
    });
 }

+void stream_session::received_failed_complete_message() {
+    sslog.info("[Stream #{}] Received failed complete message, peer={}", plan_id(), peer);
+    _received_failed_complete_message = true;
+    close_session(stream_session_state::FAILED);
+}
+
+void stream_session::abort() {
+    sslog.info("[Stream #{}] Aborted stream session={}, peer={}, is_initialized={}", plan_id(), this, peer, is_initialized());
+    close_session(stream_session_state::FAILED);
+}
+
 void stream_session::on_error() {
-    sslog.warn("[Stream #{}] Streaming error occurred", plan_id());
-    // fail session
+    sslog.warn("[Stream #{}] Streaming error occurred, peer={}", plan_id(), peer);
    close_session(stream_session_state::FAILED);
 }

@@ -309,7 +321,9 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
        }
    }
    prepare.dst_cpu_id = engine().cpu_id();;
-    _stream_result->handle_session_prepared(shared_from_this());
+    if (_stream_result) {
+        _stream_result->handle_session_prepared(shared_from_this());
+    }
    return make_ready_future<prepare_message>(std::move(prepare));
 }

@@ -352,20 +366,27 @@ void stream_session::transfer_task_completed_all() {
 }

 void stream_session::send_failed_complete_message() {
+    if (!is_initialized()) {
+        return;
+    }
+    auto plan_id = this->plan_id();
+    if (_received_failed_complete_message) {
+        sslog.debug("[Stream #{}] Skip sending failed message back to peer", plan_id);
+        return;
+    }
    if (!_complete_sent) {
        _complete_sent = true;
    } else {
        return;
    }
    auto id = msg_addr{this->peer, this->dst_cpu_id};
-    auto plan_id = this->plan_id();
    sslog.debug("[Stream #{}] SEND COMPLETE_MESSAGE to {}", plan_id, id);
    auto session = shared_from_this();
    bool failed = true;
    this->ms().send_complete_message(id, plan_id, this->dst_cpu_id, failed).then([session, id, plan_id] {
        sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE Reply from {}", plan_id, id.addr);
    }).handle_exception([session, id, plan_id] (auto ep) {
-        sslog.warn("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
+        sslog.debug("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
    });
 }

@@ -481,7 +502,9 @@ void stream_session::close_session(stream_session_state final_state) {
        // Note that we shouldn't block on this close because this method is called on the handler
        // incoming thread (so we would deadlock).
        //handler.close();
-        _stream_result->handle_session_complete(shared_from_this());
+        if (_stream_result) {
+            _stream_result->handle_session_complete(shared_from_this());
+        }

        sslog.debug("[Stream #{}] close_session session={}, state={}, cancel keep_alive timer", plan_id(), this, final_state);
        _keep_alive.cancel();
@@ -505,6 +528,10 @@ void stream_session::start() {
    });
 }

+bool stream_session::is_initialized() const {
+    return bool(_stream_result);
+}
+
 void stream_session::init(shared_ptr<stream_result_future> stream_result_) {
    _stream_result = stream_result_;
    _keep_alive.set_callback([this] {
--- a/streaming/stream_session.hh
+++ b/streaming/stream_session.hh
@@ -151,7 +151,7 @@ public:
     * Each {@code StreamSession} is identified by this InetAddress which is broadcast address of the node streaming.
     */
    inet_address peer;
-    unsigned dst_cpu_id;
+    unsigned dst_cpu_id = 0;
 private:
    // should not be null when session is started
    shared_ptr<stream_result_future> _stream_result;
@@ -174,6 +174,7 @@ private:

    stream_session_state _state = stream_session_state::INITIALIZED;
    bool _complete_sent = false;
+    bool _received_failed_complete_message = false;

    // If the session is idle for 10 minutes, close the session
    std::chrono::seconds _keep_alive_timeout{60 * 10};
@@ -231,6 +232,8 @@ public:

    void start();

+    bool is_initialized() const;
+
    /**
     * Request data fetch task to this session.
     *
@@ -299,6 +302,10 @@ public:
     */
    void on_error();

+    void abort();
+
+    void received_failed_complete_message();
+
    /**
     * Prepare this session for sending/receiving files.
     */
Author	SHA1	Message	Date
Asias He	f19fbc3058	gossip: Fix tokens assignment in assassinate_endpoint The tokens vector is defined a few lines above and is needed outsie the if block. Do not redefine it again in the if block, otherwise the tokens will be empty. Found by code inspection. Fixes #3551. Message-Id: <c7a06375c65c950e94236571127f533e5a60cbfd.1530002177.git.asias@scylladb.com> (cherry picked from commit `c3b5a2ecd5`)	2018-06-27 12:01:19 +03:00
Vlad Zolotarov	8eddb28954	locator::ec2_multi_region_snitch: don't call for ec2_snitch::gossiper_starting() ec2_snitch::gossiper_starting() calls for the base class (default) method that sets _gossip_started to TRUE and thereby prevents to following reconnectable_snitch_helper registration. Fixes #3454 Signed-off-by: Vlad Zolotarov <vladz@scylladb.com> Message-Id: <1528208520-28046-1-git-send-email-vladz@scylladb.com> (cherry picked from commit `2dde372ae6`)	2018-06-12 19:02:48 +03:00
Avi Kivity	5aaa8031a2	Update seastar submodule * seastar f5162dc...da2e1af (1): > net/tls: Wait for output to be sent when shutting down Fixes #3459.	2018-05-24 12:02:15 +03:00
Avi Kivity	3d50e7077a	Merge "Backport fixes for streaming segfault with bogus dst_cpu_id for 2.0" from Asias " The minimum changes that makes the backport of "streaming: Do send failed message for uninitialized session" without backport conflits. Fixes simliar issue we saw: https://github.com/scylladb/scylla/issues/3115 " * tag 'asias/backport_issue_3115_for_2.0/v1' of github.com:scylladb/seastar-dev: streaming: Do send failed message for uninitialized session streaming: Introduce streaming::abort() streaming: Log peer address in on_error streaming: Check if _stream_result is valid streaming: Introduce received_failed_complete_message	2018-05-24 11:14:20 +03:00
Avi Kivity	4063e92f57	dist: redhat: get rid of raid0.devices_discard_performance This parameter is not available on recent Red Hat kernels or on non-Red Hat kernels (it was removed on 3.10.0-772.el7, RHBZ 1455932). The presence of the parameter on kernels that don't support it cause the module load to fail, with the result that the storage is not available. Fix by removing the parameter. For someone running an older Red Hat kernel the effect will be that discard is disabled, but they can fix that by updating the kernel. For someone running a newer kernel, the effect will be that they can access their data. Fixes #3437. Message-Id: <20180516134913.6540-1-avi@scylladb.com> (cherry picked from commit `3b8118d4e5`)	2018-05-24 11:08:13 +03:00
Asias He	b6de30bb87	streaming: Do send failed message for uninitialized session The uninitialized session has no peer associated with it yet. There is no point sending the failed message when abort the session. Sending the failed message in this case will send to a peer with uninitialized dst_cpu_id which will casue the receiver to pass a bogus shard id to smp::submit_to which cases segfault. In addition, to be safe, initialize the dst_cpu_id to zero. So that uninitialized session will send message to shard zero instead of random bogus shard id. Fixes the segfault issue found by repair_additional_test.py:RepairAdditionalTest.repair_abort_test Fixes #3115 Message-Id: <9f0f7b44c7d6d8f5c60d6293ab2435dadc3496a9.1515380325.git.asias@scylladb.com> (cherry picked from commit `774307b3a7`)	2018-05-24 15:24:29 +08:00
Asias He	c23e3a1eda	streaming: Introduce streaming::abort() It will be used soon by stream_plan::abort() to abort a stream session. (cherry picked from commit `fad34801bf`)	2018-05-24 15:21:54 +08:00
Asias He	2732b6cf1d	streaming: Log peer address in on_error (cherry picked from commit `8a3f6acdd2`)	2018-05-24 15:20:43 +08:00
Asias He	49722e74da	streaming: Check if _stream_result is valid If on_error() was called before init() was executed, the _stream_result can be invalid. (cherry picked from commit `be573bcafb`)	2018-05-24 15:20:02 +08:00
Asias He	ba7623ac55	streaming: Introduce received_failed_complete_message It is the handler for the failed complete message. Add a flag to remember if we received a such message from peer, if so, do not send back the failed complete message back to the peer when running close_session with failed status. (cherry picked from commit `eace5fc6e8`)	2018-05-24 15:19:26 +08:00
				`@@ -1 +0,0 @@`
				`options raid0 devices_discard_performance=Y`