mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-31 20:16:43 +00:00
Fixes repair_additional_test.py:RepairAdditionalTest.repair_kill_3_test
The test does:
- Insert data on node1 only
- Insert data on node2 only
- Run repair on node1 and stop node1
once "starting user-requested repair" is seen
The repair shutdown code may wait a very long time for the stream session
to complete. If node1 has finished sending data to node2 and is still
waiting for node2 to send data back at the moment node1 is stopped, the
stream session is not closed until the stream session _keep_alive_timeout
(10 minutes) expires. Instead of waiting for the stream_session keep-alive
timer to expire, we can fail all the stream sessions during shutdown.
Before 1 - The bad case (repair shutdown will last for 10 minutes):
INFO 2016-09-21 16:23:56,617 [shard 0] stream_session - [Stream #bd34fea1-7fd4-11e6-8020-000000000001] Executing streaming plan for repair-in
INFO 2016-09-21 16:23:56,617 [shard 0] stream_session - [Stream #bd34fea1-7fd4-11e6-8020-000000000001] Starting streaming to 127.0.0.2
INFO 2016-09-21 16:23:56,617 [shard 0] stream_session - [Stream #bd34fea1-7fd4-11e6-8020-000000000001] Beginning stream session with 127.0.0.2
INFO 2016-09-21 16:23:56,618 [shard 0] stream_session - [Stream #bd34fea1-7fd4-11e6-8020-000000000001] Prepare completed with 127.0.0.2. Receiving 1, sending 0
INFO 2016-09-21 16:23:58,625 [shard 0] storage_service - Stop transport: stop_gossiping done
INFO 2016-09-21 16:23:58,625 [shard 0] storage_service - Thrift server stopped
INFO 2016-09-21 16:23:58,625 [shard 0] storage_service - CQL server stopped
INFO 2016-09-21 16:23:58,625 [shard 0] storage_service - Stop transport: shutdown rpc and cql server done
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - messaging_service stopped
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - Stop transport: shutdown messaging_service done
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - Stop transport: auth shutdown
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - Stop transport: done
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - Drain on shutdown: stop_transport done
INFO 2016-09-21 16:23:58,626 [shard 0] tracing - Asked to shut down
INFO 2016-09-21 16:23:58,626 [shard 0] tracing - Tracing is down
INFO 2016-09-21 16:23:58,626 [shard 1] tracing - Asked to shut down
INFO 2016-09-21 16:23:58,626 [shard 1] tracing - Tracing is down
INFO 2016-09-21 16:23:58,626 [shard 0] storage_service - Drain on shutdown: tracing is stopped
INFO 2016-09-21 16:23:58,669 [shard 0] storage_service - Drain on shutdown: flush column_families done
INFO 2016-09-21 16:23:58,669 [shard 0] storage_service - Drain on shutdown: shutdown commitlog done
INFO 2016-09-21 16:23:58,669 [shard 0] storage_service - Drain on shutdown: done
INFO 2016-09-21 16:23:58,669 [shard 0] repair - Starting shutdown of repair
INFO 2016-09-21 16:25:56,624 [shard 0] stream_session - [Stream #bd34fea1-7fd4-11e6-8020-000000000001] The session 0x600021516c00 made no progress with peer 127.0.0.2
Before 2 - The good case:
INFO 2016-09-21 16:18:32,087 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Executing streaming plan for repair-in
INFO 2016-09-21 16:18:32,087 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Starting streaming to 127.0.0.2
INFO 2016-09-21 16:18:32,087 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Beginning stream session with 127.0.0.2
INFO 2016-09-21 16:18:32,087 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Prepare completed with 127.0.0.2. Receiving 1, sending 0
INFO 2016-09-21 16:18:34,098 [shard 0] storage_service - Stop transport: stop_gossiping done
INFO 2016-09-21 16:18:34,098 [shard 0] storage_service - Thrift server stopped
INFO 2016-09-21 16:18:34,098 [shard 0] storage_service - CQL server stopped
INFO 2016-09-21 16:18:34,098 [shard 0] storage_service - Stop transport: shutdown rpc and cql server done
INFO 2016-09-21 16:18:34,155 [shard 0] messaging_service - Retry verb=19 to 127.0.0.2:0, retry=10: rpc::closed_error (connection is closed)
WARN 2016-09-21 16:18:34,155 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] COMPLETE_MESSAGE for 127.0.0.2 has failed: rpc::closed_error (connection is closed)
WARN 2016-09-21 16:18:34,155 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Streaming error occurred
INFO 2016-09-21 16:18:34,155 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Session with 127.0.0.2 is complete, state=FAILED
INFO 2016-09-21 16:18:34,155 [shard 0] storage_service - messaging_service stopped
INFO 2016-09-21 16:18:34,155 [shard 0] storage_service - Stop transport: shutdown messaging_service done
INFO 2016-09-21 16:18:34,155 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] bytes_sent = 0, bytes_received = 245000
WARN 2016-09-21 16:18:34,155 [shard 0] stream_session - [Stream #fbc668d1-7fd3-11e6-bc54-000000000001] Stream failed, peers={127.0.0.2}
WARN 2016-09-21 16:18:34,155 [shard 0] repair - repair's stream failed: streaming::stream_exception (Stream failed)
INFO 2016-09-21 16:18:34,155 [shard 0] repair - repair 1 failed - streaming::stream_exception (Stream failed)
INFO 2016-09-21 16:18:34,155 [shard 0] storage_service - Stop transport: auth shutdown
INFO 2016-09-21 16:18:34,155 [shard 0] storage_service - Stop transport: done
INFO 2016-09-21 16:18:34,155 [shard 0] storage_service - Drain on shutdown: stop_transport done
INFO 2016-09-21 16:18:34,155 [shard 0] tracing - Asked to shut down
INFO 2016-09-21 16:18:34,155 [shard 0] tracing - Tracing is down
INFO 2016-09-21 16:18:34,156 [shard 1] tracing - Asked to shut down
INFO 2016-09-21 16:18:34,156 [shard 1] tracing - Tracing is down
INFO 2016-09-21 16:18:34,156 [shard 0] storage_service - Drain on shutdown: tracing is stopped
INFO 2016-09-21 16:18:34,199 [shard 0] storage_service - Drain on shutdown: flush column_families done
INFO 2016-09-21 16:18:34,199 [shard 0] storage_service - Drain on shutdown: shutdown commitlog done
INFO 2016-09-21 16:18:34,199 [shard 0] storage_service - Drain on shutdown: done
INFO 2016-09-21 16:18:34,199 [shard 0] repair - Starting shutdown of repair
INFO 2016-09-21 16:18:34,199 [shard 0] repair - Completed shutdown of repair
INFO 2016-09-21 16:18:34,199 [shard 0] compaction_manager - Asked to stop
INFO 2016-09-21 16:18:34,199 [shard 1] compaction_manager - Asked to stop
After:
INFO 2016-09-21 16:06:21,684 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Executing streaming plan for repair-in
INFO 2016-09-21 16:06:21,684 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Starting streaming to 127.0.0.2
INFO 2016-09-21 16:06:21,684 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Beginning stream session with 127.0.0.2
INFO 2016-09-21 16:06:21,685 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Prepare completed with 127.0.0.2. Receiving 1, sending 0
INFO 2016-09-21 16:06:23,687 [shard 0] storage_service - Stop transport: stop_gossiping done
INFO 2016-09-21 16:06:23,687 [shard 0] storage_service - Thrift server stopped
INFO 2016-09-21 16:06:23,687 [shard 0] storage_service - CQL server stopped
INFO 2016-09-21 16:06:23,687 [shard 0] storage_service - Stop transport: shutdown rpc and cql server done
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - messaging_service stopped
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Stop transport: shutdown messaging_service done
INFO 2016-09-21 16:06:23,688 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Session with 127.0.0.2 is complete, state=FAILED
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - stream_manager stopped
INFO 2016-09-21 16:06:23,688 [shard 1] storage_service - stream_manager stopped
INFO 2016-09-21 16:06:23,688 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] bytes_sent = 0, bytes_received = 25725
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Stop transport: shutdown stream_manager done
WARN 2016-09-21 16:06:23,688 [shard 0] stream_session - [Stream #48661c51-7fd2-11e6-8ba7-000000000001] Stream failed, peers={127.0.0.2}
WARN 2016-09-21 16:06:23,688 [shard 0] repair - repair's stream failed: streaming::stream_exception (Stream failed)
INFO 2016-09-21 16:06:23,688 [shard 0] repair - repair 1 failed - streaming::stream_exception (Stream failed)
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Stop transport: auth shutdown
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Stop transport: done
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Drain on shutdown: stop_transport done
INFO 2016-09-21 16:06:23,688 [shard 0] tracing - Asked to shut down
INFO 2016-09-21 16:06:23,688 [shard 0] tracing - Tracing is down
INFO 2016-09-21 16:06:23,688 [shard 1] tracing - Asked to shut down
INFO 2016-09-21 16:06:23,688 [shard 1] tracing - Tracing is down
INFO 2016-09-21 16:06:23,688 [shard 0] storage_service - Drain on shutdown: tracing is stopped
INFO 2016-09-21 16:06:23,774 [shard 0] storage_service - Drain on shutdown: flush column_families done
INFO 2016-09-21 16:06:23,774 [shard 0] storage_service - Drain on shutdown: shutdown commitlog done
INFO 2016-09-21 16:06:23,774 [shard 0] storage_service - Drain on shutdown: done
INFO 2016-09-21 16:06:23,774 [shard 0] repair - Starting shutdown of repair
INFO 2016-09-21 16:06:23,774 [shard 0] repair - Completed shutdown of repair
INFO 2016-09-21 16:06:23,774 [shard 0] compaction_manager - Asked to stop
INFO 2016-09-21 16:06:23,774 [shard 1] compaction_manager - Asked to stop
271 lines
8.6 KiB
C++
271 lines
8.6 KiB
C++
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one
|
|
* or more contributor license agreements. See the NOTICE file
|
|
* distributed with this work for additional information
|
|
* regarding copyright ownership. The ASF licenses this file
|
|
* to you under the Apache License, Version 2.0 (the
|
|
* "License"); you may not use this file except in compliance
|
|
* with the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
* Modified by ScyllaDB
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "core/distributed.hh"
|
|
#include "streaming/stream_manager.hh"
|
|
#include "streaming/stream_result_future.hh"
|
|
#include "log.hh"
|
|
#include "streaming/stream_session_state.hh"
|
|
|
|
namespace streaming {
|
|
|
|
// Logger used for all messages emitted by this file; only declared here, the
// definition is not part of this file.
extern logging::logger sslog;
|
|
|
|
// Distributed stream_manager service object.
// NOTE(review): presumably this is what get_stream_manager() returns — confirm in the header.
distributed<stream_manager> _the_stream_manager;
|
|
|
|
// Registers `result` in _initiated_streams under the plan id it carries.
// An entry already stored under the same plan id is replaced.
void stream_manager::register_sending(shared_ptr<stream_result_future> result) {
#if 0
    // Untranslated Java reference code; excluded from compilation.
    result.addEventListener(notifier);
    // Make sure we remove the stream on completion (whether successful or not)
    result.addListener(new Runnable()
    {
        public void run()
        {
            initiatedStreams.remove(result.planId);
        }
    }, MoreExecutors.sameThreadExecutor());
#endif
    // Resolve the map slot first: the key is read from `result` before the
    // pointer is moved into the slot.
    auto& slot = _initiated_streams[result->plan_id];
    slot = std::move(result);
}
|
|
|
|
// Registers `result` in _receiving_streams under the plan id it carries.
// An entry already stored under the same plan id is replaced.
void stream_manager::register_receiving(shared_ptr<stream_result_future> result) {
#if 0
    // Untranslated Java reference code; excluded from compilation.
    result->add_event_listener(notifier);
    // Make sure we remove the stream on completion (whether successful or not)
    result.addListener(new Runnable()
    {
        public void run()
        {
            receivingStreams.remove(result.planId);
        }
    }, MoreExecutors.sameThreadExecutor());
#endif
    // Resolve the map slot first: the key is read from `result` before the
    // pointer is moved into the slot.
    auto& slot = _receiving_streams[result->plan_id];
    slot = std::move(result);
}
|
|
|
|
// Looks up plan_id in _initiated_streams.
// Returns the stored stream, or a default-constructed pointer when the plan
// id is not present.
shared_ptr<stream_result_future> stream_manager::get_sending_stream(UUID plan_id) {
    auto found = _initiated_streams.find(plan_id);
    if (found == _initiated_streams.end()) {
        return {};
    }
    return found->second;
}
|
|
|
|
// Looks up plan_id in _receiving_streams.
// Returns the stored stream, or a default-constructed pointer when the plan
// id is not present.
shared_ptr<stream_result_future> stream_manager::get_receiving_stream(UUID plan_id) {
    auto found = _receiving_streams.find(plan_id);
    if (found == _receiving_streams.end()) {
        return {};
    }
    return found->second;
}
|
|
|
|
// Drops plan_id from both stream maps of this instance, then starts removal
// of the plan's progress counters via remove_progress_on_all_shards().
// The removal future is not waited for; a failure is only logged.
void stream_manager::remove_stream(UUID plan_id) {
    sslog.debug("stream_manager: removing plan_id={}", plan_id);
    _initiated_streams.erase(plan_id);
    _receiving_streams.erase(plan_id);

    auto log_failure = [plan_id] (auto ep) {
        sslog.info("stream_manager: Fail to remove progress for plan_id={}: {}", plan_id, ep);
    };
    // FIXME: Do not ignore the future
    remove_progress_on_all_shards(plan_id).handle_exception(std::move(log_failure));
}
|
|
|
|
void stream_manager::show_streams() {
|
|
for (auto& x : _initiated_streams) {
|
|
sslog.debug("stream_manager:initiated_stream: plan_id={}", x.first);
|
|
}
|
|
for (auto& x : _receiving_streams) {
|
|
sslog.debug("stream_manager:receiving_stream: plan_id={}", x.first);
|
|
}
|
|
}
|
|
|
|
// Returns a snapshot of every stream known to this instance: all entries of
// _initiated_streams followed by all entries of _receiving_streams.
// The returned vector holds its own pointer copies, so callers may iterate it
// while the maps are being modified.
std::vector<shared_ptr<stream_result_future>> stream_manager::get_all_streams() const {
    std::vector<shared_ptr<stream_result_future>> result;
    // The final size is known up front; reserve once instead of growing.
    result.reserve(_initiated_streams.size() + _receiving_streams.size());
    for (const auto& entry : _initiated_streams) {
        result.push_back(entry.second);
    }
    for (const auto& entry : _receiving_streams) {
        result.push_back(entry.second);
    }
    return result;
}
|
|
|
|
// Adds fm_size to the byte counter kept in _stream_bytes for (cf_id, peer):
// bytes_sent when dir is OUT, bytes_received otherwise. Missing map entries
// are created on demand.
// NOTE(review): the key is named cf_id here, while the other accessors of
// _stream_bytes in this file key it by plan_id — confirm which id callers pass.
void stream_manager::update_progress(UUID cf_id, gms::inet_address peer, progress_info::direction dir, size_t fm_size) {
    auto& counters = _stream_bytes[cf_id][peer];
    if (dir == progress_info::direction::OUT) {
        counters.bytes_sent += fm_size;
    } else {
        counters.bytes_received += fm_size;
    }
}
|
|
|
|
// Calls update_progress() on every session of every stream returned by
// get_all_streams(), one session at a time, inside seastar::async.
// Each update is waited for with get() before the next one starts.
future<> stream_manager::update_all_progress_info() {
    return seastar::async([this] {
        // Stream and session pointers are taken by value, as copies, for the
        // duration of each iteration.
        for (auto stream : get_all_streams()) {
            for (auto sess : stream->get_coordinator()->get_all_stream_sessions()) {
                sess->update_progress().get();
            }
        }
    });
}
|
|
|
|
// Erases the byte counters stored in this instance's _stream_bytes under
// plan_id; erasing a plan id that is not present changes nothing.
void stream_manager::remove_progress(UUID plan_id) {
    _stream_bytes.erase(plan_id);
}
|
|
|
|
// Returns the byte counters recorded in this instance's _stream_bytes for
// (plan_id, peer), or a default-constructed stream_bytes when either the plan
// or the peer has no entry.
//
// Uses find() rather than operator[] so that a query for an unknown plan or
// peer does not insert an empty entry into _stream_bytes.
stream_bytes stream_manager::get_progress(UUID plan_id, gms::inet_address peer) {
    auto plan_it = _stream_bytes.find(plan_id);
    if (plan_it == _stream_bytes.end()) {
        return stream_bytes();
    }
    auto peer_it = plan_it->second.find(peer);
    if (peer_it == plan_it->second.end()) {
        return stream_bytes();
    }
    return peer_it->second;
}
|
|
|
|
// Returns the sum of the byte counters of every peer recorded in this
// instance's _stream_bytes under plan_id, or a default-constructed
// stream_bytes when the plan has no entry.
//
// Uses find() rather than operator[] so that a query for an unknown plan does
// not insert an empty entry into _stream_bytes.
stream_bytes stream_manager::get_progress(UUID plan_id) {
    stream_bytes ret;
    auto plan_it = _stream_bytes.find(plan_id);
    if (plan_it != _stream_bytes.end()) {
        for (const auto& peer_entry : plan_it->second) {
            ret += peer_entry.second;
        }
    }
    return ret;
}
|
|
|
|
// Runs remove_progress(plan_id) on every stream_manager instance reachable
// through get_stream_manager(). The returned future is the one produced by
// invoke_on_all().
future<> stream_manager::remove_progress_on_all_shards(UUID plan_id) {
    auto drop_plan = [plan_id] (auto& manager) {
        manager.remove_progress(plan_id);
    };
    return get_stream_manager().invoke_on_all(std::move(drop_plan));
}
|
|
|
|
// Collects get_progress(plan_id, peer) from every stream_manager instance and
// adds the results together, starting from a default-constructed stream_bytes.
future<stream_bytes> stream_manager::get_progress_on_all_shards(UUID plan_id, gms::inet_address peer) {
    auto per_instance = [plan_id, peer] (auto& manager) {
        return manager.get_progress(plan_id, peer);
    };
    return get_stream_manager().map_reduce0(std::move(per_instance), stream_bytes(), std::plus<stream_bytes>());
}
|
|
|
|
// Collects get_progress(plan_id) from every stream_manager instance and adds
// the results together, starting from a default-constructed stream_bytes.
future<stream_bytes> stream_manager::get_progress_on_all_shards(UUID plan_id) {
    auto per_instance = [plan_id] (auto& manager) {
        return manager.get_progress(plan_id);
    };
    return get_stream_manager().map_reduce0(std::move(per_instance), stream_bytes(), std::plus<stream_bytes>());
}
|
|
|
|
// Sums, over every stream_manager instance and every plan recorded in its
// _stream_bytes, the byte counters stored for `peer`. Plans with no entry for
// the peer contribute nothing.
//
// Uses find() rather than operator[] so that the query does not insert an
// empty entry for `peer` into every plan's map.
future<stream_bytes> stream_manager::get_progress_on_all_shards(gms::inet_address peer) {
    return get_stream_manager().map_reduce0(
        [peer] (auto& sm) {
            stream_bytes ret;
            for (auto& sbytes : sm._stream_bytes) {
                auto peer_it = sbytes.second.find(peer);
                if (peer_it != sbytes.second.end()) {
                    ret += peer_it->second;
                }
            }
            return ret;
        },
        stream_bytes(),
        std::plus<stream_bytes>()
    );
}
|
|
|
|
// Sums every byte counter held in _stream_bytes (all plans, all peers) of
// every stream_manager instance, starting from a default-constructed
// stream_bytes.
future<stream_bytes> stream_manager::get_progress_on_all_shards() {
    auto per_instance = [] (auto& manager) {
        stream_bytes total;
        for (auto& plan_entry : manager._stream_bytes) {
            for (auto& peer_entry : plan_entry.second) {
                total += peer_entry.second;
            }
        }
        return total;
    };
    return get_stream_manager().map_reduce0(std::move(per_instance), stream_bytes(), std::plus<stream_bytes>());
}
|
|
|
|
// Reports whether any session of any stream returned by get_all_streams()
// has `endpoint` as its peer. Only this instance's streams are examined.
bool stream_manager::has_peer(inet_address endpoint) {
    for (const auto& stream : get_all_streams()) {
        for (auto sess : stream->get_coordinator()->get_all_stream_sessions()) {
            if (sess->peer != endpoint) {
                continue;
            }
            return true;
        }
    }
    return false;
}
|
|
|
|
// Calls close_session(stream_session_state::FAILED) on every session of this
// instance's streams whose peer equals `endpoint`; other sessions are left
// untouched.
void stream_manager::fail_sessions(inet_address endpoint) {
    // get_all_streams() returns a copy, so the outer loop does not iterate
    // the stream maps themselves.
    for (const auto& stream : get_all_streams()) {
        for (auto sess : stream->get_coordinator()->get_all_stream_sessions()) {
            if (sess->peer != endpoint) {
                continue;
            }
            sess->close_session(stream_session_state::FAILED);
        }
    }
}
|
|
|
|
void stream_manager::fail_all_sessions() {
|
|
for (auto sr : get_all_streams()) {
|
|
for (auto session : sr->get_coordinator()->get_all_stream_sessions()) {
|
|
session->close_session(stream_session_state::FAILED);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Handler for removal of `endpoint`: when this instance has a session with
// that peer, asks every stream_manager instance to fail its sessions with it.
//
// The invoke_on_all() future is not waited for; a failure is logged together
// with the exception that caused it.
// NOTE(review): has_peer() only inspects the streams of this instance, while
// fail_sessions() is then run on every instance — confirm that doing nothing
// when only another instance has a matching session is intended.
void stream_manager::on_remove(inet_address endpoint) {
    if (!has_peer(endpoint)) {
        return;
    }
    sslog.info("stream_manager: Close all stream_session with peer = {} in on_remove", endpoint);
    get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
        sm.fail_sessions(endpoint);
    }).handle_exception([endpoint] (auto ep) {
        // Include the exception so the failure can be diagnosed from the log.
        sslog.warn("stream_manager: Fail to close sessions peer = {} in on_remove: {}", endpoint, ep);
    });
}
|
|
|
|
// Handler for a restart of `endpoint`: when this instance has a session with
// that peer, asks every stream_manager instance to fail its sessions with it.
// ep_state is not used.
//
// The invoke_on_all() future is not waited for; a failure is logged together
// with the exception that caused it.
// NOTE(review): has_peer() only inspects the streams of this instance, while
// fail_sessions() is then run on every instance — confirm that doing nothing
// when only another instance has a matching session is intended.
void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state) {
    if (!has_peer(endpoint)) {
        return;
    }
    sslog.info("stream_manager: Close all stream_session with peer = {} in on_restart", endpoint);
    get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
        sm.fail_sessions(endpoint);
    }).handle_exception([endpoint] (auto ep) {
        // Include the exception so the failure can be diagnosed from the log.
        sslog.warn("stream_manager: Fail to close sessions peer = {} in on_restart: {}", endpoint, ep);
    });
}
|
|
|
|
} // namespace streaming
|