mirror of
https://github.com/tendermint/tendermint.git
synced 2025-12-23 06:15:19 +00:00
consensus: double-sign risk reduction (ADR-51) (#5147)
Implementation spec of Double Signing Risk Reduction [ADR-51](https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-051-double-signing-risk-reduction.md) by B-Harvest - Add `DoubleSignCheckHeight` config variable to ConsensusConfig: "How many blocks to look back to check existence of the node's consensus votes before joining consensus" - Add `consensus.double_sign_check_height` to `config.toml` and to `tendermint node` as a flag to set `DoubleSignCheckHeight` - Set default `consensus.double_sign_check_height` to `0` (the default can be adjusted in this PR; the check is disabled when 0) Refs - [ADR-51](https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-051-double-signing-risk-reduction.md) - [https://github.com/tendermint/tendermint/issues/4059](https://github.com/tendermint/tendermint/issues/4059) - [https://github.com/tendermint/tendermint/pull/4262](https://github.com/tendermint/tendermint/pull/4262)
This commit is contained in:
2
.github/workflows/coverage.yml
vendored
2
.github/workflows/coverage.yml
vendored
@@ -104,7 +104,7 @@ jobs:
|
||||
if: "env.GIT_DIFF != ''"
|
||||
- name: test & coverage report creation
|
||||
run: |
|
||||
cat xac.txt | xargs go test -mod=readonly -timeout 5m -race -coverprofile=coverage.txt -covermode=atomic
|
||||
cat xac.txt | xargs go test -mod=readonly -timeout 10m -race -coverprofile=coverage.txt -covermode=atomic
|
||||
if: "env.GIT_DIFF != ''"
|
||||
- uses: codecov/codecov-action@v1.0.13
|
||||
with:
|
||||
|
||||
@@ -11,6 +11,7 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
|
||||
## FEATURES
|
||||
|
||||
- [privval] \#5239 Add `chainID` to requests from client. (@marbar3778)
|
||||
- [config] \#5147 Add `--consensus.double_sign_check_height` flag and `DoubleSignCheckHeight` config variable. See [ADR-51](https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-051-double-signing-risk-reduction.md)
|
||||
|
||||
## IMPROVEMENTS
|
||||
|
||||
|
||||
@@ -37,6 +37,9 @@ func AddNodeFlags(cmd *cobra.Command) {
|
||||
"genesis_hash",
|
||||
[]byte{},
|
||||
"Optional SHA-256 hash of the genesis file")
|
||||
cmd.Flags().Int64("consensus.double_sign_check_height", config.Consensus.DoubleSignCheckHeight,
|
||||
"How many blocks to look back to check existence of the node's "+
|
||||
"consensus votes before joining consensus")
|
||||
|
||||
// abci flags
|
||||
cmd.Flags().String(
|
||||
|
||||
@@ -830,6 +830,8 @@ type ConsensusConfig struct {
|
||||
// Reactor sleep duration parameters
|
||||
PeerGossipSleepDuration time.Duration `mapstructure:"peer_gossip_sleep_duration"`
|
||||
PeerQueryMaj23SleepDuration time.Duration `mapstructure:"peer_query_maj23_sleep_duration"`
|
||||
|
||||
DoubleSignCheckHeight int64 `mapstructure:"double_sign_check_height"`
|
||||
}
|
||||
|
||||
// DefaultConsensusConfig returns a default configuration for the consensus service
|
||||
@@ -848,6 +850,7 @@ func DefaultConsensusConfig() *ConsensusConfig {
|
||||
CreateEmptyBlocksInterval: 0 * time.Second,
|
||||
PeerGossipSleepDuration: 100 * time.Millisecond,
|
||||
PeerQueryMaj23SleepDuration: 2000 * time.Millisecond,
|
||||
DoubleSignCheckHeight: int64(0),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -864,6 +867,7 @@ func TestConsensusConfig() *ConsensusConfig {
|
||||
cfg.SkipTimeoutCommit = true
|
||||
cfg.PeerGossipSleepDuration = 5 * time.Millisecond
|
||||
cfg.PeerQueryMaj23SleepDuration = 250 * time.Millisecond
|
||||
cfg.DoubleSignCheckHeight = int64(0)
|
||||
return cfg
|
||||
}
|
||||
|
||||
@@ -945,6 +949,9 @@ func (cfg *ConsensusConfig) ValidateBasic() error {
|
||||
if cfg.PeerQueryMaj23SleepDuration < 0 {
|
||||
return errors.New("peer_query_maj23_sleep_duration can't be negative")
|
||||
}
|
||||
if cfg.DoubleSignCheckHeight < 0 {
|
||||
return errors.New("double_sign_check_height can't be negative")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -164,6 +164,7 @@ func TestConsensusConfig_ValidateBasic(t *testing.T) {
|
||||
"PeerGossipSleepDuration negative": {func(c *ConsensusConfig) { c.PeerGossipSleepDuration = -1 }, true},
|
||||
"PeerQueryMaj23SleepDuration": {func(c *ConsensusConfig) { c.PeerQueryMaj23SleepDuration = time.Second }, false},
|
||||
"PeerQueryMaj23SleepDuration negative": {func(c *ConsensusConfig) { c.PeerQueryMaj23SleepDuration = -1 }, true},
|
||||
"DoubleSignCheckHeight negative": {func(c *ConsensusConfig) { c.DoubleSignCheckHeight = -1 }, true},
|
||||
}
|
||||
for desc, tc := range testcases {
|
||||
tc := tc // appease linter
|
||||
|
||||
@@ -381,6 +381,12 @@ timeout_precommit = "{{ .Consensus.TimeoutPrecommit }}"
|
||||
timeout_precommit_delta = "{{ .Consensus.TimeoutPrecommitDelta }}"
|
||||
timeout_commit = "{{ .Consensus.TimeoutCommit }}"
|
||||
|
||||
# How many blocks to look back to check existence of the node's consensus votes before joining consensus
|
||||
# When non-zero, the node will panic upon restart
|
||||
# if the same consensus key was used to sign {double_sign_check_height} last blocks.
|
||||
# So, validators should stop the state machine, wait for some blocks, and then restart the state machine to avoid panic.
|
||||
double_sign_check_height = {{ .Consensus.DoubleSignCheckHeight }}
|
||||
|
||||
# Make progress as soon as we have all the precommits (as if TimeoutCommit = 0)
|
||||
skip_timeout_commit = {{ .Consensus.SkipTimeoutCommit }}
|
||||
|
||||
|
||||
@@ -34,9 +34,10 @@ import (
|
||||
// Errors
|
||||
|
||||
var (
|
||||
ErrInvalidProposalSignature = errors.New("error invalid proposal signature")
|
||||
ErrInvalidProposalPOLRound = errors.New("error invalid proposal POL round")
|
||||
ErrAddingVote = errors.New("error adding vote")
|
||||
ErrInvalidProposalSignature = errors.New("error invalid proposal signature")
|
||||
ErrInvalidProposalPOLRound = errors.New("error invalid proposal POL round")
|
||||
ErrAddingVote = errors.New("error adding vote")
|
||||
ErrSignatureFoundInPastBlocks = errors.New("found signature from the same key")
|
||||
|
||||
errPubKeyIsNotSet = errors.New("pubkey is not set. Look for \"Can't get private validator pubkey\" errors")
|
||||
)
|
||||
@@ -366,6 +367,11 @@ func (cs *State) OnStart() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Double Signing Risk Reduction
|
||||
if err := cs.checkDoubleSigningRisk(cs.Height); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// now start the receiveRoutine
|
||||
go cs.receiveRoutine(0)
|
||||
|
||||
@@ -2114,6 +2120,29 @@ func (cs *State) updatePrivValidatorPubKey() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// look back to check existence of the node's consensus votes before joining consensus
|
||||
func (cs *State) checkDoubleSigningRisk(height int64) error {
|
||||
if cs.privValidator != nil && cs.privValidatorPubKey != nil && cs.config.DoubleSignCheckHeight > 0 && height > 0 {
|
||||
valAddr := cs.privValidatorPubKey.Address()
|
||||
doubleSignCheckHeight := cs.config.DoubleSignCheckHeight
|
||||
if doubleSignCheckHeight > height {
|
||||
doubleSignCheckHeight = height
|
||||
}
|
||||
for i := int64(1); i < doubleSignCheckHeight; i++ {
|
||||
lastCommit := cs.blockStore.LoadSeenCommit(height - i)
|
||||
if lastCommit != nil {
|
||||
for sigIdx, s := range lastCommit.Signatures {
|
||||
if s.BlockIDFlag == types.BlockIDFlagCommit && bytes.Equal(s.ValidatorAddress, valAddr) {
|
||||
cs.Logger.Info("Found signature from the same key", "sig", s, "idx", sigIdx, "height", height-i)
|
||||
return ErrSignatureFoundInPastBlocks
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
//---------------------------------------------------------
|
||||
|
||||
func CompareHRS(h1 int64, r1 int32, s1 cstypes.RoundStepType, h2 int64, r2 int32, s2 cstypes.RoundStepType) int {
|
||||
|
||||
@@ -28,8 +28,8 @@ We would like to suggest a double signing risk reduction method.
|
||||
- Configuration
|
||||
- We suggest introducing a `double_sign_check_height` parameter in `config.toml` and the CLI that sets how many blocks the state machine looks back to check for the node's votes
|
||||
- <span v-pre>`double_sign_check_height = {{ .Consensus.DoubleSignCheckHeight }}`</span> in `config.toml`
|
||||
- `tendermint node --double_sign_check_height` in cli
|
||||
- State machine ignore checking procedure when `vote-check-height == 0`
|
||||
- `tendermint node --consensus.double_sign_check_height` in cli
|
||||
- The state machine skips the checking procedure when `double_sign_check_height == 0`
|
||||
|
||||
## Status
|
||||
|
||||
|
||||
@@ -334,6 +334,12 @@ timeout_precommit = "1s"
|
||||
timeout_precommit_delta = "500ms"
|
||||
timeout_commit = "1s"
|
||||
|
||||
# How many blocks to look back to check existence of the node's consensus votes before joining consensus
|
||||
# When non-zero, the node will panic upon restart
|
||||
# if the same consensus key was used to sign {double_sign_check_height} last blocks.
|
||||
# So, validators should stop the state machine, wait for some blocks, and then restart the state machine to avoid panic.
|
||||
double_sign_check_height = 0
|
||||
|
||||
# Make progress as soon as we have all the precommits (as if TimeoutCommit = 0)
|
||||
skip_timeout_commit = false
|
||||
|
||||
|
||||
@@ -61,16 +61,18 @@ When initializing nodes there are five parameters in the `config.toml` that may
|
||||
- `unconditional_peer_ids:` comma separated list of nodeID's. These nodes will be connected to no matter the limits of inbound and outbound peers. This is useful for when sentry nodes have full address books.
|
||||
- `private_peer_ids:` comma separated list of nodeID's. These nodes will not be gossiped to the network. This is an important field as you do not want your validator IP gossiped to the network.
|
||||
- `addr_book_strict:` boolean. By default nodes with a routable address will be considered for connection. If this setting is turned off (false), non-routable IP addresses, like addresses in a private network can be added to the address book.
|
||||
- `double_sign_check_height:` int64 height. How many blocks to look back to check for the node's consensus votes before joining consensus. When non-zero, the node will panic upon restart if the same consensus key was used to sign any of the last {double_sign_check_height} blocks. So, validators should stop the state machine, wait for some blocks, and then restart the state machine to avoid the panic.
|
||||
|
||||
#### Validator Node Configuration
|
||||
|
||||
| Config Option | Setting |
|
||||
| ---------------------- | -------------------------- |
|
||||
| pex | false |
|
||||
| persistent_peers | list of sentry nodes |
|
||||
| private_peer_ids | none |
|
||||
| unconditional_peer_ids | optionally sentry node IDs |
|
||||
| addr_book_strict | false |
|
||||
| Config Option | Setting |
|
||||
| ------------------------ | -------------------------- |
|
||||
| pex | false |
|
||||
| persistent_peers | list of sentry nodes |
|
||||
| private_peer_ids | none |
|
||||
| unconditional_peer_ids | optionally sentry node IDs |
|
||||
| addr_book_strict | false |
|
||||
| double_sign_check_height | 10 |
|
||||
|
||||
The validator node should have `pex=false` so it does not gossip to the entire network. The persistent peers will be your sentry nodes. Private peers can be left empty as the validator is not trying to hide who it is communicating with. Setting unconditional peers is optional for a validator because they will not have a full address book.
|
||||
|
||||
|
||||
@@ -19,3 +19,4 @@ and run the following tests in docker containers:
|
||||
- send a tx on each node and ensure the state root is updated on all of them
|
||||
- crash and restart nodes one at a time and ensure they can sync back up (via fastsync)
|
||||
- crash and restart all nodes at once and ensure they can sync back up
|
||||
- restart each node with `double_sign_check_height` set and ensure it panics if the same consensus key was used to sign within the last `double_sign_check_height` blocks
|
||||
|
||||
64
test/p2p/dsrr/check_peer.sh
Normal file
64
test/p2p/dsrr/check_peer.sh
Normal file
@@ -0,0 +1,64 @@
|
||||
#! /bin/bash
|
||||
set -eu
|
||||
set -o pipefail
|
||||
|
||||
IPV=$1
|
||||
ID=$2
|
||||
ASSERT_CASE=$3
|
||||
ASSERT_NODE_UP=1
|
||||
ASSERT_NODE_DOWN=0
|
||||
MAX_TRY=10
|
||||
|
||||
|
||||
###########################################
|
||||
#
|
||||
# Wait for peer to catch up to other peers
|
||||
#
|
||||
###########################################
|
||||
|
||||
addr=$(test/p2p/address.sh $IPV $ID 26657)
|
||||
peerID=$(( $(($ID % 4)) + 1 )) # 1->2 ... 3->4 ... 4->1
|
||||
peer_addr=$(test/p2p/address.sh $IPV $peerID 26657)
|
||||
|
||||
# get another peer's height
|
||||
h1=`curl -s $peer_addr/status | jq .result.sync_info.latest_block_height | jq fromjson`
|
||||
|
||||
# get another peer's state
|
||||
root1=`curl -s $peer_addr/status | jq .result.sync_info.latest_app_hash`
|
||||
|
||||
echo "Other peer is on height $h1 with state $root1"
|
||||
echo "Waiting for peer $ID to catch up"
|
||||
|
||||
# wait for it to sync to past its previous height
|
||||
set +e
|
||||
set +o pipefail
|
||||
h2="0"
|
||||
COUNT=0
|
||||
while [[ "$h2" -lt "$(($h1+1))" ]]; do
|
||||
sleep 1
|
||||
h2=`curl -s $addr/status --connect-timeout 1 | jq .result.sync_info.latest_block_height | jq fromjson`
|
||||
COUNT=$((COUNT+1))
|
||||
echo "... $h2, try $COUNT"
|
||||
if [ "$COUNT" -ge "$MAX_TRY" ]; then
|
||||
if [ $ASSERT_CASE -eq $ASSERT_NODE_DOWN ]; then
|
||||
echo "double sign risk reduction operates normally as expected"
|
||||
fi
|
||||
if [ $ASSERT_CASE -eq $ASSERT_NODE_UP ]; then
|
||||
echo "double sign risk reduction fail"
|
||||
exit 1
|
||||
fi
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
if [ $ASSERT_CASE -eq $ASSERT_NODE_UP ]; then
|
||||
# check the app hash
|
||||
root2=`curl -s $addr/status | jq .result.sync_info.latest_app_hash`
|
||||
|
||||
if [[ "$root1" != "$root2" ]]; then
|
||||
echo "App hash after restart does not match. Got $root2; expected $root1"
|
||||
exit 1
|
||||
fi
|
||||
echo "... double sign risk reduction test passed"
|
||||
fi
|
||||
13
test/p2p/dsrr/test.sh
Normal file
13
test/p2p/dsrr/test.sh
Normal file
@@ -0,0 +1,13 @@
|
||||
#! /bin/bash
|
||||
set -eu
|
||||
|
||||
DOCKER_IMAGE=$1
|
||||
NETWORK_NAME=$2
|
||||
IPV=$3
|
||||
N=$4
|
||||
PROXY_APP=$5
|
||||
|
||||
# run it on each of them
|
||||
for i in `seq 1 $N`; do
|
||||
bash test/p2p/dsrr/test_peer.sh $DOCKER_IMAGE $NETWORK_NAME $IPV $i $N $PROXY_APP
|
||||
done
|
||||
65
test/p2p/dsrr/test_peer.sh
Normal file
65
test/p2p/dsrr/test_peer.sh
Normal file
@@ -0,0 +1,65 @@
|
||||
#! /bin/bash
|
||||
set -eu
|
||||
set -o pipefail
|
||||
|
||||
DOCKER_IMAGE=$1
|
||||
NETWORK_NAME=$2
|
||||
IPV=$3
|
||||
ID=$4
|
||||
N=$5
|
||||
PROXY_APP=$6
|
||||
ASSERT_NODE_UP=1
|
||||
ASSERT_NODE_DOWN=0
|
||||
|
||||
###############################################################
|
||||
# this runs on each peer:
|
||||
# kill peer
|
||||
# bring it back online with double_sign_check_height 10
|
||||
# expect the node to be stopped by double sign risk reduction
|
||||
#
|
||||
# kill peer
|
||||
# bring it back online with double_sign_check_height 1
|
||||
# pass double sign risk reduction, wait for it to sync and check the app hash
|
||||
#
|
||||
# kill peer
|
||||
# bring it back online with double_sign_check_height 0
|
||||
# wait for it to sync and check the app hash
|
||||
###############################################################
|
||||
|
||||
echo "Testing double sign risk reduction on node $ID"
|
||||
|
||||
# kill peer
|
||||
set +e
|
||||
docker rm -vf local_testnet_$ID
|
||||
set -e
|
||||
PERSISTENT_PEERS="$(test/p2p/address.sh $IPV 1 26656 $DOCKER_IMAGE)"
|
||||
for j in `seq 2 $N`; do
|
||||
PERSISTENT_PEERS="$PERSISTENT_PEERS,$(test/p2p/address.sh $IPV $j 26656 $DOCKER_IMAGE)"
|
||||
done
|
||||
|
||||
# bring it back online with double_sign_check_height 10
|
||||
# expect the node to be stopped by double sign risk reduction
|
||||
DSCH=10
|
||||
bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $IPV $ID $PROXY_APP "--p2p.persistent_peers $PERSISTENT_PEERS --p2p.pex --rpc.unsafe --consensus.double_sign_check_height $DSCH"
|
||||
bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME $IPV fs_$ID "test/p2p/dsrr/check_peer.sh $IPV $ID $ASSERT_NODE_DOWN"
|
||||
|
||||
|
||||
docker stop local_testnet_$ID
|
||||
docker rm local_testnet_$ID
|
||||
# bring it back online with double_sign_check_height 1
|
||||
# pass double sign risk reduction, wait for it to sync and check the app hash
|
||||
DSCH=1
|
||||
bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $IPV $ID $PROXY_APP "--p2p.persistent_peers $PERSISTENT_PEERS --p2p.pex --rpc.unsafe --consensus.double_sign_check_height $DSCH"
|
||||
bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME $IPV fs_$ID "test/p2p/dsrr/check_peer.sh $IPV $ID $ASSERT_NODE_UP"
|
||||
|
||||
docker stop local_testnet_$ID
|
||||
docker rm local_testnet_$ID
|
||||
DSCH=0
|
||||
# bring it back online with double_sign_check_height 0
|
||||
# double sign risk reduction is not activated, wait for it to sync and check the app hash
|
||||
bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $IPV $ID $PROXY_APP "--p2p.persistent_peers $PERSISTENT_PEERS --p2p.pex --rpc.unsafe --consensus.double_sign_check_height $DSCH"
|
||||
bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME $IPV fs_$ID "test/p2p/dsrr/check_peer.sh $IPV $ID $ASSERT_NODE_UP"
|
||||
|
||||
echo ""
|
||||
echo "PASS"
|
||||
echo ""
|
||||
@@ -35,6 +35,9 @@ bash test/p2p/client.sh "$DOCKER_IMAGE" "$NETWORK_NAME" "$IPV" ab "test/p2p/atom
|
||||
# for each node, kill it and readd via fast sync
|
||||
bash test/p2p/fast_sync/test.sh "$DOCKER_IMAGE" "$NETWORK_NAME" "$IPV" "$N" "$PROXY_APP"
|
||||
|
||||
# test double sign risk reduction for each node
|
||||
bash test/p2p/dsrr/test.sh "$DOCKER_IMAGE" "$NETWORK_NAME" "$IPV" "$N" "$PROXY_APP"
|
||||
|
||||
# test killing all peers 3 times
|
||||
bash test/p2p/kill_all/test.sh "$DOCKER_IMAGE" "$NETWORK_NAME" "$IPV" "$N" 3
|
||||
|
||||
|
||||
Reference in New Issue
Block a user