From 69c81bcc87b8656264a1384c92e652fca53d57a6 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 23 Aug 2017 10:02:29 +0800 Subject: [PATCH] repair: Do not allow repair until node is in NORMAL status The following backtrace was reported by a user who was running repair while repeatedly restarting the node. #0 0x00007eff077281d7 in raise () from /lib64/libc.so.6 #1 0x00007eff07729a08 in abort () from /lib64/libc.so.6 #2 0x00007eff07721146 in __assert_fail_base () from /lib64/libc.so.6 #3 0x00007eff077211f2 in __assert_fail () from /lib64/libc.so.6 #4 0x00000000010ef2c2 in locator::token_metadata::first_token_index (this=0x641000214e98, start=...) at locator/token_metadata.cc:133 #5 0x00000000010ef2d9 in locator::token_metadata::first_token (this=0x641000214e98, start=...) at locator/token_metadata.cc:143 #6 0x00000000010e329d in locator::abstract_replication_strategy::get_natural_endpoints (this=0x641000494000, search_token=...) at locator/abstract_replication_strategy.cc:66 #7 0x0000000001481186 in get_neighbors (hosts=std::vector of length 0, capacity 0, data_centers=std::vector of length 0, capacity 0, range=, ksname=..., db=...) at repair/repair.cc:196 #8 repair_range > (range=..., ri=...) at repair/repair.cc:781 #9 ::::::::operator() (__closure=0x7efec07f7460) at repair/repair.cc:1005 #10 futurize > >::apply:: It can be reproduced with: 1) while true; do curl -X POST --header "Content-Type: application/json" --header "Accept: application/json" "http://127.0.0.1:10000/storage_service/repair_async/ks3"; done 2) start node 127.0.0.1, stop node 127.0.0.1 in a loop. The problem is that, during boot up, the token_metadata is not replicated to all shards until the node goes into NORMAL status. To fix, check that the node is in NORMAL status before allowing repair, and reject the repair request otherwise. 
Fixes #2723 --- repair/repair.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/repair/repair.cc b/repair/repair.cc index bf42d4ff1e..1b7db4853c 100644 --- a/repair/repair.cc +++ b/repair/repair.cc @@ -1053,6 +1053,10 @@ static int do_repair_start(seastar::sharded& db, sstring keyspace, repair_tracker.start(id); auto fail = defer([id] { repair_tracker.done(id, false); }); + if (!gms::get_local_gossiper().is_normal(utils::fb_utilities::get_broadcast_address())) { + throw std::runtime_error("Node is not in NORMAL status yet!"); + } + // If the "ranges" option is not explicitly specified, we repair all the // local ranges (the token ranges for which this node holds a replica of). // Each of these ranges may have a different set of replicas, so the