From 9d1f77369f355a189a52fd5dfd96ed5a2664573d Mon Sep 17 00:00:00 2001 From: Anton Kaliaev Date: Tue, 2 Feb 2021 12:22:56 +0400 Subject: [PATCH 01/15] goreleaser: downcase archive and binary names (#6029) before: ``` Tendermint_0.34.3_darwin_amd64.tar.gz -rw-r--r-- 0 runner docker 192329 Jan 19 19:30 CHANGELOG.md -rw-r--r-- 0 runner docker 321 Jan 19 19:30 CHANGELOG_PENDING.md -rw-r--r-- 0 runner docker 11382 Jan 19 19:30 LICENSE -rw-r--r-- 0 runner docker 8165 Jan 19 19:30 README.md -rwxr-xr-x 0 runner docker 23224320 Jan 19 19:30 tendermint ``` after: ``` tendermint_0.34.3_darwin_amd64.tar.gz -rw-r--r-- 0 runner docker 192329 Jan 19 19:30 CHANGELOG.md -rw-r--r-- 0 runner docker 321 Jan 19 19:30 CHANGELOG_PENDING.md -rw-r--r-- 0 runner docker 11382 Jan 19 19:30 LICENSE -rw-r--r-- 0 runner docker 8165 Jan 19 19:30 README.md -rwxr-xr-x 0 runner docker 23224320 Jan 19 19:30 tendermint ``` --- .goreleaser.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.goreleaser.yml b/.goreleaser.yml index cf3a35f2b..267d4e4ac 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -1,4 +1,4 @@ -project_name: Tendermint +project_name: tendermint env: # Require use of Go modules. From 90d3f56797743eb615b72b0d529e040dab94785f Mon Sep 17 00:00:00 2001 From: Callum Waters Date: Tue, 2 Feb 2021 11:36:52 +0100 Subject: [PATCH 02/15] store: fix deadlock in pruning (#6007) --- evidence/pool.go | 108 ++++++++++++++++-------- evidence/pool_test.go | 25 +++++- light/store/db/db.go | 52 ++++++------ state/state.go | 5 -- state/store.go | 185 +++++++++++++++++++++++++----------------- state/store_test.go | 90 ++++++++++---------- store/store.go | 123 +++++++++++++++------------- 7 files changed, 343 insertions(+), 245 deletions(-) diff --git a/evidence/pool.go b/evidence/pool.go index 34c267c0c..bc09ea94e 100644 --- a/evidence/pool.go +++ b/evidence/pool.go @@ -23,8 +23,8 @@ import ( const ( // prefixes are unique across all tm db's - prefixCommitted = int64(8) - prefixPending = int64(9) + prefixCommitted = int64(9) + prefixPending = int64(10) ) // Pool maintains a pool of valid evidence to be broadcasted and committed @@ -132,7 +132,7 @@ func (evpool *Pool) Update(state sm.State, ev types.EvidenceList) { evpool.updateState(state) // move committed evidence out from the pending pool and into the committed pool - evpool.markEvidenceAsCommitted(ev) + evpool.markEvidenceAsCommitted(ev, state.LastBlockHeight) // Prune pending evidence when it has expired. This also updates when the next // evidence will expire. @@ -386,23 +386,18 @@ func (evpool *Pool) addPendingEvidence(ev types.Evidence) error { return nil } -func (evpool *Pool) removePendingEvidence(evidence types.Evidence) { - key := keyPending(evidence) - if err := evpool.evidenceStore.Delete(key); err != nil { - evpool.logger.Error("failed to delete pending evidence", "err", err) - } else { - atomic.AddUint32(&evpool.evidenceSize, ^uint32(0)) - evpool.logger.Debug("deleted pending evidence", "evidence", evidence) - } -} - // markEvidenceAsCommitted processes all the evidence in the block, marking it as // committed and removing it from the pending database. -func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { +func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList, height int64) { blockEvidenceMap := make(map[string]struct{}, len(evidence)) + batch := evpool.evidenceStore.NewBatch() + defer batch.Close() + for _, ev := range evidence { if evpool.isPending(ev) { - evpool.removePendingEvidence(ev) + if err := batch.Delete(keyPending(ev)); err != nil { + evpool.logger.Error("failed to batch pending evidence", "err", err) + } blockEvidenceMap[evMapKey(ev)] = struct{}{} } @@ -410,7 +405,7 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { // we only need to record the height that it was saved at. key := keyCommitted(ev) - h := gogotypes.Int64Value{Value: ev.Height()} + h := gogotypes.Int64Value{Value: height} evBytes, err := proto.Marshal(&h) if err != nil { evpool.logger.Error("failed to marshal committed evidence", "key(height/hash)", key, "err", err) @@ -424,10 +419,22 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { evpool.logger.Debug("marked evidence as committed", "evidence", ev) } - // remove committed evidence from the clist - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) + // check if we need to remove any pending evidence + if len(blockEvidenceMap) == 0 { + return } + + // remove committed evidence from pending bucket + if err := batch.WriteSync(); err != nil { + evpool.logger.Error("failed to batch delete pending evidence", "err", err) + return + } + + // remove committed evidence from the clist + evpool.removeEvidenceFromList(blockEvidenceMap) + + // update the evidence size + atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1)) } // listEvidence retrieves lists evidence from oldest to newest within maxBytes. @@ -481,44 +488,73 @@ func (evpool *Pool) listEvidence(prefixKey int64, maxBytes int64) ([]types.Evide } func (evpool *Pool) removeExpiredPendingEvidence() (int64, time.Time) { - iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending)) - if err != nil { - evpool.logger.Error("failed to iterate over pending evidence", "err", err) + batch := evpool.evidenceStore.NewBatch() + defer batch.Close() + + height, time, blockEvidenceMap := evpool.batchExpiredPendingEvidence(batch) + + // if we haven't removed any evidence then return early + if len(blockEvidenceMap) == 0 { + return height, time + } + + evpool.logger.Debug("removing expired evidence", + "height", evpool.State().LastBlockHeight, + "time", evpool.State().LastBlockTime, + "expired evidence", len(blockEvidenceMap), + ) + + // remove expired evidence from pending bucket + if err := batch.WriteSync(); err != nil { + evpool.logger.Error("failed to batch delete pending evidence", "err", err) return evpool.State().LastBlockHeight, evpool.State().LastBlockTime } - defer iter.Close() + // remove evidence from the clist + evpool.removeEvidenceFromList(blockEvidenceMap) + // update the evidence size + atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1)) + + return height, time +} + +func (evpool *Pool) batchExpiredPendingEvidence(batch dbm.Batch) (int64, time.Time, map[string]struct{}) { blockEvidenceMap := make(map[string]struct{}) + iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending)) + if err != nil { + evpool.logger.Error("failed to iterate over pending evidence", "err", err) + return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap + } + defer iter.Close() for ; iter.Valid(); iter.Next() { ev, err := bytesToEv(iter.Value()) if err != nil { - evpool.logger.Error("failed to transition evidence from protobuf", "err", err) + evpool.logger.Error("failed to transition evidence from protobuf", "err", err, "ev", ev) continue } + // if true, we have looped through all expired evidence if !evpool.isExpired(ev.Height(), ev.Time()) { - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) - } - // Return the height and time with which this evidence will have expired // so we know when to prune next. return ev.Height() + evpool.State().ConsensusParams.Evidence.MaxAgeNumBlocks + 1, - ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second) + ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second), + blockEvidenceMap } - evpool.removePendingEvidence(ev) + // else add to the batch + if err := batch.Delete(iter.Key()); err != nil { + evpool.logger.Error("failed to batch evidence", "err", err, "ev", ev) + continue + } + + // and add to the map to remove the evidence from the clist blockEvidenceMap[evMapKey(ev)] = struct{}{} } - // we either have no pending evidence or all evidence has expired - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) - } - - return evpool.State().LastBlockHeight, evpool.State().LastBlockTime + return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap } func (evpool *Pool) removeEvidenceFromList( diff --git a/evidence/pool_test.go b/evidence/pool_test.go index 3d17fef13..522419a08 100644 --- a/evidence/pool_test.go +++ b/evidence/pool_test.go @@ -172,7 +172,7 @@ func TestEvidencePoolUpdate(t *testing.T) { pool, val := defaultTestPool(t, height) state := pool.State() - // create new block (no need to save it to blockStore) + // create two lots of old evidence that we expect to be pruned when we update prunedEv := types.NewMockDuplicateVoteEvidenceWithValidator( 1, defaultEvidenceTime.Add(1*time.Minute), @@ -180,7 +180,15 @@ func TestEvidencePoolUpdate(t *testing.T) { evidenceChainID, ) + notPrunedEv := types.NewMockDuplicateVoteEvidenceWithValidator( + 2, + defaultEvidenceTime.Add(2*time.Minute), + val, + evidenceChainID, + ) + require.NoError(t, pool.AddEvidence(prunedEv)) + require.NoError(t, pool.AddEvidence(notPrunedEv)) ev := types.NewMockDuplicateVoteEvidenceWithValidator( height, @@ -195,14 +203,23 @@ func TestEvidencePoolUpdate(t *testing.T) { state.LastBlockHeight = height + 1 state.LastBlockTime = defaultEvidenceTime.Add(22 * time.Minute) + evList, _ := pool.PendingEvidence(2 * defaultEvidenceMaxBytes) + require.Equal(t, 2, len(evList)) + + require.Equal(t, uint32(2), pool.Size()) + require.NoError(t, pool.CheckEvidence(types.EvidenceList{ev})) + evList, _ = pool.PendingEvidence(3 * defaultEvidenceMaxBytes) + require.Equal(t, 3, len(evList)) + + require.Equal(t, uint32(3), pool.Size()) + pool.Update(state, block.Evidence.Evidence) // a) Update marks evidence as committed so pending evidence should be empty - evList, evSize := pool.PendingEvidence(defaultEvidenceMaxBytes) - require.Empty(t, evList) - require.Zero(t, evSize) + evList, _ = pool.PendingEvidence(defaultEvidenceMaxBytes) + require.Equal(t, []types.Evidence{notPrunedEv}, evList) // b) If we try to check this evidence again it should fail because it has already been committed err := pool.CheckEvidence(types.EvidenceList{ev}) diff --git a/light/store/db/db.go b/light/store/db/db.go index ee3973c8e..9ed121a80 100644 --- a/light/store/db/db.go +++ b/light/store/db/db.go @@ -14,8 +14,8 @@ import ( ) const ( - prefixLightBlock = int64(0x0a) - prefixSize = int64(0x0b) + prefixLightBlock = int64(11) + prefixSize = int64(12) ) type dbs struct { @@ -230,27 +230,11 @@ func (s *dbs) Prune(size uint16) error { } numToPrune := sSize - size - // 2) Iterate over headers and perform a batch operation. - itr, err := s.db.Iterator( - s.lbKey(1), - append(s.lbKey(1<<63-1), byte(0x00)), - ) - if err != nil { - panic(err) - } - defer itr.Close() - b := s.db.NewBatch() defer b.Close() - for itr.Valid() && numToPrune > 0 { - if err = b.Delete(itr.Key()); err != nil { - return err - } - itr.Next() - numToPrune-- - } - if err = itr.Error(); err != nil { + // 2) use an iterator to batch together all the blocks that need to be deleted + if err := s.batchDelete(b, numToPrune); err != nil { return err } @@ -261,12 +245,7 @@ func (s *dbs) Prune(size uint16) error { } // 4) write batch deletion to disk - err = b.WriteSync() - if err != nil { - return err - } - - return nil + return b.WriteSync() } // Size returns the number of header & validator set pairs. @@ -278,6 +257,27 @@ func (s *dbs) Size() uint16 { return s.size } +func (s *dbs) batchDelete(batch dbm.Batch, numToPrune uint16) error { + itr, err := s.db.Iterator( + s.lbKey(1), + append(s.lbKey(1<<63-1), byte(0x00)), + ) + if err != nil { + return err + } + defer itr.Close() + + for itr.Valid() && numToPrune > 0 { + if err = batch.Delete(itr.Key()); err != nil { + return err + } + itr.Next() + numToPrune-- + } + + return itr.Error() +} + func (s *dbs) sizeKey() []byte { key, err := orderedcode.Append(nil, prefixSize) if err != nil { diff --git a/state/state.go b/state/state.go index ffebaf3b4..430dbf88b 100644 --- a/state/state.go +++ b/state/state.go @@ -16,11 +16,6 @@ import ( "github.com/tendermint/tendermint/version" ) -// database keys -var ( - stateKey = []byte("stateKey") -) - //----------------------------------------------------------------------------- type Version struct { diff --git a/state/store.go b/state/store.go index 19ac01daa..4829a8b75 100644 --- a/state/store.go +++ b/state/store.go @@ -32,6 +32,7 @@ const ( prefixValidators = int64(5) prefixConsensusParams = int64(6) prefixABCIResponses = int64(7) + prefixState = int64(8) ) func encodeKey(prefix int64, height int64) []byte { @@ -54,6 +55,17 @@ func abciResponsesKey(height int64) []byte { return encodeKey(prefixABCIResponses, height) } +// stateKey should never change after being set in init() +var stateKey []byte + +func init() { + var err error + stateKey, err = orderedcode.Append(nil, prefixState) + if err != nil { + panic(err) + } +} + //---------------------- //go:generate mockery --case underscore --name Store @@ -239,11 +251,16 @@ func (store dbStore) PruneStates(retainHeight int64) error { return fmt.Errorf("height %v must be greater than 0", retainHeight) } - if err := store.pruneValidatorSets(retainHeight); err != nil { + // NOTE: We need to prune consensus params first because the validator + // sets have always one extra height. If validator sets were pruned first + // we could get a situation where we prune up to the last validator set + // yet don't have the respective consensus params at that height and thus + // return an error + if err := store.pruneConsensusParams(retainHeight); err != nil { return err } - if err := store.pruneConsensusParams(retainHeight); err != nil { + if err := store.pruneValidatorSets(retainHeight); err != nil { return err } @@ -257,37 +274,48 @@ func (store dbStore) PruneStates(retainHeight int64) error { // pruneValidatorSets calls a reverse iterator from base height to retain height (exclusive), deleting // all validator sets in between. Due to the fact that most validator sets stored reference an earlier // validator set, it is likely that there will remain one validator set left after pruning. -func (store dbStore) pruneValidatorSets(height int64) error { - valInfo, err := loadValidatorsInfo(store.db, height) +func (store dbStore) pruneValidatorSets(retainHeight int64) error { + valInfo, err := loadValidatorsInfo(store.db, retainHeight) if err != nil { - return fmt.Errorf("validators at height %v not found: %w", height, err) + return fmt.Errorf("validators at height %v not found: %w", retainHeight, err) } // We will prune up to the validator set at the given "height". As we don't save validator sets every // height but only when they change or at a check point, it is likely that the validator set at the height // we prune to is empty and thus dependent on the validator set saved at a previous height. We must find // that validator set and make sure it is not pruned. - lastRecordedValSetHeight := lastStoredHeightFor(height, valInfo.LastHeightChanged) + lastRecordedValSetHeight := lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged) lastRecordedValSet, err := loadValidatorsInfo(store.db, lastRecordedValSetHeight) if err != nil || lastRecordedValSet.ValidatorSet == nil { return fmt.Errorf("couldn't find validators at height %d (height %d was originally requested): %w", - lastStoredHeightFor(height, valInfo.LastHeightChanged), - height, + lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged), + retainHeight, err, ) } - // batch delete all the validators sets up to height - return store.batchDelete( + // if this is not equal to the retain height, prune from the retain height to the height above + // the last saved validator set. This way we can skip over the dependent validator set. + if lastRecordedValSetHeight < retainHeight { + err := store.pruneRange( + validatorsKey(lastRecordedValSetHeight+1), + validatorsKey(retainHeight), + ) + if err != nil { + return err + } + } + + // prune all the validators sets up to last saved validator set + return store.pruneRange( validatorsKey(1), - validatorsKey(height), validatorsKey(lastRecordedValSetHeight), ) } // pruneConsensusParams calls a reverse iterator from base height to retain height batch deleting // all consensus params in between. If the consensus params at the new base height is dependent -// on a prior height then this will keep that lower height to. +// on a prior height then this will keep that lower height too. func (store dbStore) pruneConsensusParams(retainHeight int64) error { paramsInfo, err := store.loadConsensusParamsInfo(retainHeight) if err != nil { @@ -298,21 +326,31 @@ func (store dbStore) pruneConsensusParams(retainHeight int64) error { // we must not prune (or save) the last consensus params that the consensus params info at height // is dependent on. if paramsInfo.ConsensusParams.Equal(&tmproto.ConsensusParams{}) { + // sanity check that the consensus params at the last height it was changed is there lastRecordedConsensusParams, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged) if err != nil || lastRecordedConsensusParams.ConsensusParams.Equal(&tmproto.ConsensusParams{}) { return fmt.Errorf( - "couldn't find consensus params at height %d as last changed from height %d: %w", + "couldn't find consensus params at height %d (height %d was originally requested): %w", paramsInfo.LastHeightChanged, retainHeight, err, ) } + + // prune the params above the height with which it last changed and below the retain height. + err = store.pruneRange( + consensusParamsKey(paramsInfo.LastHeightChanged+1), + consensusParamsKey(retainHeight), + ) + if err != nil { + return err + } } - // batch delete all the consensus params up to the retain height - return store.batchDelete( + // prune all the consensus params up to either the last height the params changed or if the params + // last changed at the retain height, then up to the retain height. + return store.pruneRange( consensusParamsKey(1), - consensusParamsKey(retainHeight), consensusParamsKey(paramsInfo.LastHeightChanged), ) } @@ -320,72 +358,69 @@ func (store dbStore) pruneConsensusParams(retainHeight int64) error { // pruneABCIResponses calls a reverse iterator from base height to retain height batch deleting // all abci responses in between func (store dbStore) pruneABCIResponses(height int64) error { - return store.batchDelete(abciResponsesKey(1), abciResponsesKey(height), nil) + return store.pruneRange(abciResponsesKey(1), abciResponsesKey(height)) } -// batchDelete is a generic function for deleting a range of keys in reverse order. It will -// skip keys that have been -func (store dbStore) batchDelete(start []byte, end []byte, exception []byte) error { - iter, err := store.db.ReverseIterator(start, end) - if err != nil { - return fmt.Errorf("iterator error: %w", err) - } - defer iter.Close() - +// pruneRange is a generic function for deleting a range of keys in reverse order. +// we keep filling up batches of at most 1000 keys, perform a deletion and continue until +// we have gone through all of keys in the range. This avoids doing any writes whilst +// iterating. +func (store dbStore) pruneRange(start []byte, end []byte) error { + var err error batch := store.db.NewBatch() defer batch.Close() - pruned := 0 - for iter.Valid() { - key := iter.Key() - if bytes.Equal(key, exception) { - iter.Next() - continue - } - - if err := batch.Delete(key); err != nil { - return fmt.Errorf("pruning error at key %X: %w", key, err) - } - - pruned++ - // avoid batches growing too large by flushing to disk regularly - if pruned%1000 == 0 { - if err := iter.Error(); err != nil { - return err - } - if err := iter.Close(); err != nil { - return err - } - - if err := batch.Write(); err != nil { - return fmt.Errorf("pruning error at key %X: %w", key, err) - } - if err := batch.Close(); err != nil { - return err - } - - iter, err = store.db.ReverseIterator(start, end) - if err != nil { - return fmt.Errorf("iterator error: %w", err) - } - defer iter.Close() - - batch = store.db.NewBatch() - defer batch.Close() - } else { - iter.Next() - } - } - - if err := iter.Error(); err != nil { - return fmt.Errorf("iterator error: %w", err) - } - - if err := batch.WriteSync(); err != nil { + end, err = store.reverseBatchDelete(batch, start, end) + if err != nil { return err } - return nil + // iterate until the last batch of the pruning range in which case we will perform a + // write sync + for !bytes.Equal(start, end) { + if err := batch.Write(); err != nil { + return err + } + + if err := batch.Close(); err != nil { + return err + } + + batch = store.db.NewBatch() + + // fill a new batch of keys for deletion over the remainding range + end, err = store.reverseBatchDelete(batch, start, end) + if err != nil { + return err + } + } + + return batch.WriteSync() +} + +// reverseBatchDelete runs a reverse iterator (from end to start) filling up a batch until either +// (a) the iterator reaches the start or (b) the iterator has added a 1000 keys (this avoids the +// batch from growing too large) +func (store dbStore) reverseBatchDelete(batch dbm.Batch, start, end []byte) ([]byte, error) { + iter, err := store.db.ReverseIterator(start, end) + if err != nil { + return end, fmt.Errorf("iterator error: %w", err) + } + defer iter.Close() + + size := 0 + for ; iter.Valid(); iter.Next() { + if err := batch.Delete(iter.Key()); err != nil { + return end, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) + } + + // avoid batches growing too large by capping them + size++ + if size == 1000 { + return iter.Key(), iter.Error() + } + } + return start, iter.Error() } //------------------------------------------------------------------------ @@ -584,7 +619,7 @@ func (store dbStore) LoadConsensusParams(height int64) (types.ConsensusParams, e paramsInfo2, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged) if err != nil { return empty, fmt.Errorf( - "couldn't find consensus params at height %d as last changed from height %d: %w", + "couldn't find consensus params at height %d (height %d was originally requested): %w", paramsInfo.LastHeightChanged, height, err, diff --git a/state/store_test.go b/state/store_test.go index b51a25739..927f3ff45 100644 --- a/state/store_test.go +++ b/state/store_test.go @@ -159,26 +159,28 @@ func TestStoreLoadConsensusParams(t *testing.T) { func TestPruneStates(t *testing.T) { testcases := map[string]struct { - makeHeights int64 - pruneHeight int64 - expectErr bool - expectVals []int64 - expectParams []int64 - expectABCI []int64 + startHeight int64 + endHeight int64 + pruneHeight int64 + expectErr bool + remainingValSetHeight int64 + remainingParamsHeight int64 }{ - "error when prune height is 0": {100, 0, true, nil, nil, nil}, - "error when prune height is negative": {100, -10, true, nil, nil, nil}, - "error when prune height does not exist": {100, 101, true, nil, nil, nil}, - "prune all": {100, 100, false, []int64{93, 100}, []int64{95, 100}, []int64{100}}, - "prune some": {10, 8, false, []int64{3, 8, 9, 10}, - []int64{5, 8, 9, 10}, []int64{8, 9, 10}}, - "prune across checkpoint": {100002, 100002, false, []int64{100000, 100002}, - []int64{99995, 100002}, []int64{100002}}, + "error when prune height is 0": {1, 100, 0, true, 0, 0}, + "error when prune height is negative": {1, 100, -10, true, 0, 0}, + "error when prune height does not exist": {1, 100, 101, true, 0, 0}, + "prune all": {1, 100, 100, false, 93, 95}, + "prune from non 1 height": {10, 50, 40, false, 33, 35}, + "prune some": {1, 10, 8, false, 3, 5}, + // we test this because we flush to disk every 1000 "states" + "prune more than 1000 state": {1, 1010, 1010, false, 1003, 1005}, + "prune across checkpoint": {99900, 100002, 100002, false, 100000, 99995}, } for name, tc := range testcases { tc := tc t.Run(name, func(t *testing.T) { db := dbm.NewMemDB() + stateStore := sm.NewStore(db) pk := ed25519.GenPrivKey().PubKey() @@ -192,7 +194,7 @@ func TestPruneStates(t *testing.T) { valsChanged := int64(0) paramsChanged := int64(0) - for h := int64(1); h <= tc.makeHeights; h++ { + for h := tc.startHeight; h <= tc.endHeight; h++ { if valsChanged == 0 || h%10 == 2 { valsChanged = h + 1 // Have to add 1, since NextValidators is what's stored } @@ -237,36 +239,44 @@ func TestPruneStates(t *testing.T) { } require.NoError(t, err) - expectVals := sliceToMap(tc.expectVals) - expectParams := sliceToMap(tc.expectParams) - expectABCI := sliceToMap(tc.expectABCI) - - for h := int64(1); h <= tc.makeHeights; h++ { + for h := tc.pruneHeight; h <= tc.endHeight; h++ { vals, err := stateStore.LoadValidators(h) - if expectVals[h] { - require.NoError(t, err, "validators height %v", h) - require.NotNil(t, vals) + require.NoError(t, err, h) + require.NotNil(t, vals, h) + + params, err := stateStore.LoadConsensusParams(h) + require.NoError(t, err, h) + require.NotNil(t, params, h) + + abci, err := stateStore.LoadABCIResponses(h) + require.NoError(t, err, h) + require.NotNil(t, abci, h) + } + + emptyParams := types.ConsensusParams{} + + for h := tc.startHeight; h < tc.pruneHeight; h++ { + vals, err := stateStore.LoadValidators(h) + if h == tc.remainingValSetHeight { + require.NoError(t, err, h) + require.NotNil(t, vals, h) } else { - require.Error(t, err, "validators height %v", h) - require.Equal(t, sm.ErrNoValSetForHeight{Height: h}, err) + require.Error(t, err, h) + require.Nil(t, vals, h) } params, err := stateStore.LoadConsensusParams(h) - if expectParams[h] { - require.NoError(t, err, "params height %v", h) - require.False(t, params.Equals(&types.ConsensusParams{}), "params should not be empty") + if h == tc.remainingParamsHeight { + require.NoError(t, err, h) + require.NotEqual(t, emptyParams, params, h) } else { - require.Error(t, err, "params height %v", h) + require.Error(t, err, h) + require.Equal(t, emptyParams, params, h) } abci, err := stateStore.LoadABCIResponses(h) - if expectABCI[h] { - require.NoError(t, err, "abci height %v", h) - require.NotNil(t, abci) - } else { - require.Error(t, err, "abci height %v", h) - require.Equal(t, sm.ErrNoABCIResponsesForHeight{Height: h}, err) - } + require.Error(t, err, h) + require.Nil(t, abci, h) } }) } @@ -293,11 +303,3 @@ func TestABCIResponsesResultsHash(t *testing.T) { require.NoError(t, err) assert.NoError(t, proof.Verify(root, bz)) } - -func sliceToMap(s []int64) map[int64]bool { - m := make(map[int64]bool, len(s)) - for _, i := range s { - m[i] = true - } - return m -} diff --git a/store/store.go b/store/store.go index 6d1ac870f..ed335c3c8 100644 --- a/store/store.go +++ b/store/store.go @@ -1,6 +1,7 @@ package store import ( + "bytes" "fmt" "strconv" @@ -315,99 +316,111 @@ func (bs *BlockStore) PruneBlocks(height int64) (uint64, error) { // remove block meta first as this is used to indicate whether the block exists. // For this reason, we also use ony block meta as a measure of the amount of blocks pruned - pruned, err := bs.batchDelete(blockMetaKey(0), blockMetaKey(height), removeBlockHash) + pruned, err := bs.pruneRange(blockMetaKey(0), blockMetaKey(height), removeBlockHash) if err != nil { return pruned, err } - if _, err := bs.batchDelete(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil { + if _, err := bs.pruneRange(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil { return pruned, err } - if _, err := bs.batchDelete(blockCommitKey(0), blockCommitKey(height), nil); err != nil { + if _, err := bs.pruneRange(blockCommitKey(0), blockCommitKey(height), nil); err != nil { return pruned, err } - if _, err := bs.batchDelete(seenCommitKey(0), seenCommitKey(height), nil); err != nil { + if _, err := bs.pruneRange(seenCommitKey(0), seenCommitKey(height), nil); err != nil { return pruned, err } return pruned, nil } -// batchDelete is a generic function for deleting a range of values based on the lowest +// pruneRange is a generic function for deleting a range of values based on the lowest // height up to but excluding retainHeight. For each key/value pair, an optional hook can be -// executed before the deletion itself is made -func (bs *BlockStore) batchDelete( +// executed before the deletion itself is made. pruneRange will use batch delete to delete +// keys in batches of at most 1000 keys. +func (bs *BlockStore) pruneRange( start []byte, end []byte, preDeletionHook func(key, value []byte, batch dbm.Batch) error, ) (uint64, error) { - iter, err := bs.db.Iterator(start, end) - if err != nil { - panic(err) - } - defer iter.Close() + var ( + err error + pruned uint64 + totalPruned uint64 = 0 + ) batch := bs.db.NewBatch() defer batch.Close() - pruned := uint64(0) - flushed := pruned - for iter.Valid() { + pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook) + if err != nil { + return totalPruned, err + } + + // loop until we have finished iterating over all the keys by writing, opening a new batch + // and incrementing through the next range of keys. + for !bytes.Equal(start, end) { + if err := batch.Write(); err != nil { + return totalPruned, err + } + + totalPruned += pruned + + if err := batch.Close(); err != nil { + return totalPruned, err + } + + batch = bs.db.NewBatch() + + pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook) + if err != nil { + return totalPruned, err + } + } + + // once we looped over all keys we do a final flush to disk + if err := batch.WriteSync(); err != nil { + return totalPruned, err + } + totalPruned += pruned + return totalPruned, nil +} + +// batchDelete runs an iterator over a set of keys, first preforming a pre deletion hook before adding it to the batch. +// The function ends when either 1000 keys have been added to the batch or the iterator has reached the end. +func (bs *BlockStore) batchDelete( + batch dbm.Batch, + start, end []byte, + preDeletionHook func(key, value []byte, batch dbm.Batch) error, +) (uint64, []byte, error) { + var pruned uint64 = 0 + iter, err := bs.db.Iterator(start, end) + if err != nil { + return pruned, start, err + } + defer iter.Close() + + for ; iter.Valid(); iter.Next() { key := iter.Key() if preDeletionHook != nil { if err := preDeletionHook(key, iter.Value(), batch); err != nil { - return flushed, err + return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) } } if err := batch.Delete(key); err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) + return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) } pruned++ - // avoid batches growing too large by flushing to database regularly - if pruned%1000 == 0 { - if err := iter.Error(); err != nil { - return flushed, err - } - if err := iter.Close(); err != nil { - return flushed, err - } - - err := batch.Write() - if err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) - } - if err := batch.Close(); err != nil { - return flushed, err - } - flushed = pruned - - iter, err = bs.db.Iterator(start, end) - if err != nil { - panic(err) - } - defer iter.Close() - - batch = bs.db.NewBatch() - defer batch.Close() - } else { - iter.Next() + if pruned == 1000 { + return pruned, iter.Key(), iter.Error() } } - flushed = pruned - if err := iter.Error(); err != nil { - return flushed, err - } - err = batch.WriteSync() - if err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) - } - - return flushed, nil + return pruned, end, iter.Error() } // SaveBlock persists the given block, blockParts, and seenCommit to the underlying db. From 0b953eb1907a6b345e26ae935d34f51b3582790d Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Tue, 2 Feb 2021 11:54:27 +0100 Subject: [PATCH 03/15] Revert "e2e: releases nightly (#5906)" (#6031) This reverts commit 64961e22673a796782b0db54dab7fd06bd3aa7b4, to see if it will make the workflow dispatch trigger reappear and fix our Slack notification link. --- .github/workflows/e2e-nightly.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/e2e-nightly.yml b/.github/workflows/e2e-nightly.yml index b4be46954..d32b54840 100644 --- a/.github/workflows/e2e-nightly.yml +++ b/.github/workflows/e2e-nightly.yml @@ -13,8 +13,6 @@ jobs: fail-fast: false matrix: group: ['00', '01', '02', '03'] - # todo: expand to multiple versions after 0.35 release - branch: ['master', 'v0.34.x'] runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -23,8 +21,6 @@ jobs: go-version: '1.15' - uses: actions/checkout@v2 - with: - ref: ${{ matrix.branch}} - name: Build working-directory: test/e2e From 3d01d98f67f3f9a803567e33cdebe08da54b196f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 2 Feb 2021 13:43:08 +0100 Subject: [PATCH 04/15] test/e2e: increase sign/propose tolerances (#6033) E2E tests often fail because validators miss signing or proposing blocks. Often this is because e.g. there's a lot of disruption in the network or it takes a long time to start up all the nodes. This changes the test criteria to only check for 3 signed/proposed blocks, rather than a fraction of the expected blocks. This should be enough to catch most issues, apart from performance problems causing nodes to miss signing/proposing, but we may want separate tests for those sorts of things. --- test/e2e/tests/validator_test.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/e2e/tests/validator_test.go b/test/e2e/tests/validator_test.go index f358296f2..247f2c586 100644 --- a/test/e2e/tests/validator_test.go +++ b/test/e2e/tests/validator_test.go @@ -77,8 +77,12 @@ func TestValidator_Propose(t *testing.T) { require.False(t, proposeCount == 0 && expectCount > 0, "node did not propose any blocks (expected %v)", expectCount) - require.Less(t, expectCount-proposeCount, 5, - "validator missed proposing too many blocks (proposed %v out of %v)", proposeCount, expectCount) + if expectCount > 3 { + require.GreaterOrEqual(t, proposeCount, 3, "validator didn't propose even 3 blocks") + } else { + require.Equal(t, proposeCount, expectCount, "validator missed proposing blocks (proposed %v, expected %v)", + proposeCount, expectCount) + } }) } @@ -116,8 +120,12 @@ func TestValidator_Sign(t *testing.T) { require.False(t, signCount == 0 && expectCount > 0, "validator did not sign any blocks (expected %v)", expectCount) - require.Less(t, float64(expectCount-signCount)/float64(expectCount), 0.33, - "validator missed signing too many blocks (signed %v out of %v)", signCount, expectCount) + if expectCount > 3 { + require.GreaterOrEqual(t, signCount, 3, "validator didn't sign even 3 blocks") + } else { + require.Equal(t, signCount, expectCount, "validator missed signing blocks (signed %v, expected %v)", + signCount, expectCount) + } }) } From c7b619188d42168e14cbd97e29ef3c097b1af185 Mon Sep 17 00:00:00 2001 From: Callum Waters Date: Tue, 2 Feb 2021 14:01:39 +0100 Subject: [PATCH 05/15] light: fix panic with RPC calls to commit and validator when height is nil (#6026) --- CHANGELOG_PENDING.md | 4 ++-- light/provider/http/http.go | 2 +- light/rpc/client.go | 40 ++++++++++++++++++++++----------- light/rpc/mocks/light_client.go | 25 ++++++++++++++++++++- 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index c6f45faa5..184df82cc 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -28,8 +28,7 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi - [proto/p2p] Renamed `DefaultNodeInfo` and `DefaultNodeInfoOther` to `NodeInfo` and `NodeInfoOther` (@erikgrinaker) - [proto/p2p] Rename `NodeInfo.default_node_id` to `node_id` (@erikgrinaker) - [libs/os] Kill() and {Must,}{Read,Write}File() functions have been removed. (@alessio) - - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters) - - [state] \#5864 Use an iterator when pruning state (@cmwaters) + - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters) - [state] \#5864 Use an iterator when pruning state (@cmwaters) - [types] \#6023 Remove `tm2pb.Header`, `tm2pb.BlockID`, `tm2pb.PartSetHeader` and `tm2pb.NewValidatorUpdate`. - Each of the above types has a `ToProto` and `FromProto` method or function which replaced this logic. - [rpc/client/http] \#6022 Change `timeout` type to `time.Duration` in `NewWithTimeout` @@ -66,3 +65,4 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi - [blockchain/v1] [\#5701](https://github.com/tendermint/tendermint/pull/5701) Handle peers without blocks (@melekes) - [blockchain/v1] \#5711 Fix deadlock (@melekes) - [light] \#6022 Fix a bug when the number of validators equals 100 (@melekes) +- [light] \#6026 Fix a bug when height isn't provided for the rpc calls: `/commit` and `/validators` (@cmwaters) diff --git a/light/provider/http/http.go b/light/provider/http/http.go index c28d732ca..009542971 100644 --- a/light/provider/http/http.go +++ b/light/provider/http/http.go @@ -70,7 +70,7 @@ func (p *http) LightBlock(ctx context.Context, height int64) (*types.LightBlock, return nil, err } - vs, err := p.validatorSet(ctx, h) + vs, err := p.validatorSet(ctx, &sh.Height) if err != nil { return nil, err } diff --git a/light/rpc/client.go b/light/rpc/client.go index b6a5c929c..7c403aba7 100644 --- a/light/rpc/client.go +++ b/light/rpc/client.go @@ -26,8 +26,10 @@ var errNegOrZeroHeight = errors.New("negative or zero height") type KeyPathFunc func(path string, key []byte) (merkle.KeyPath, error) // LightClient is an interface that contains functionality needed by Client from the light client. +//go:generate mockery --case underscore --name LightClient type LightClient interface { ChainID() string + Update(ctx context.Context, now time.Time) (*types.LightBlock, error) VerifyLightBlockAtHeight(ctx context.Context, height int64, now time.Time) (*types.LightBlock, error) TrustedLightBlock(height int64) (*types.LightBlock, error) } @@ -130,7 +132,8 @@ func (c *Client) ABCIQueryWithOptions(ctx context.Context, path string, data tmb // Update the light client if we're behind. // NOTE: AppHash for height H is in header H+1. - l, err := c.updateLightClientIfNeededTo(ctx, resp.Height+1) + nextHeight := resp.Height + 1 + l, err := c.updateLightClientIfNeededTo(ctx, &nextHeight) if err != nil { return nil, err } @@ -213,7 +216,7 @@ func (c *Client) ConsensusParams(ctx context.Context, height *int64) (*ctypes.Re } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.BlockHeight) + l, err := c.updateLightClientIfNeededTo(ctx, &res.BlockHeight) if err != nil { return nil, err } @@ -252,7 +255,7 @@ func (c *Client) BlockchainInfo(ctx context.Context, minHeight, maxHeight int64) // Update the light client if we're behind. if len(res.BlockMetas) > 0 { lastHeight := res.BlockMetas[len(res.BlockMetas)-1].Header.Height - if _, err := c.updateLightClientIfNeededTo(ctx, lastHeight); err != nil { + if _, err := c.updateLightClientIfNeededTo(ctx, &lastHeight); err != nil { return nil, err } } @@ -296,7 +299,7 @@ func (c *Client) Block(ctx context.Context, height *int64) (*ctypes.ResultBlock, } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height) if err != nil { return nil, err } @@ -330,7 +333,7 @@ func (c *Client) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBl } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height) if err != nil { return nil, err } @@ -371,7 +374,8 @@ func (c *Client) BlockResults(ctx context.Context, height *int64) (*ctypes.Resul } // Update the light client if we're behind. - trustedBlock, err := c.updateLightClientIfNeededTo(ctx, h+1) + nextHeight := h + 1 + trustedBlock, err := c.updateLightClientIfNeededTo(ctx, &nextHeight) if err != nil { return nil, err } @@ -409,7 +413,8 @@ func (c *Client) BlockResults(ctx context.Context, height *int64) (*ctypes.Resul func (c *Client) Commit(ctx context.Context, height *int64) (*ctypes.ResultCommit, error) { // Update the light client if we're behind and retrieve the light block at the requested height - l, err := c.updateLightClientIfNeededTo(ctx, *height) + // or at the latest height if no height is provided. + l, err := c.updateLightClientIfNeededTo(ctx, height) if err != nil { return nil, err } @@ -434,7 +439,7 @@ func (c *Client) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.Resul } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Height) if err != nil { return nil, err } @@ -451,8 +456,9 @@ func (c *Client) TxSearch(ctx context.Context, query string, prove bool, page, p // Validators fetches and verifies validators. func (c *Client) Validators(ctx context.Context, height *int64, pagePtr, perPagePtr *int) (*ctypes.ResultValidators, error) { - // Update the light client if we're behind and retrieve the light block at the requested height. - l, err := c.updateLightClientIfNeededTo(ctx, *height) + // Update the light client if we're behind and retrieve the light block at the requested height + // or at the latest height if no height is provided. + l, err := c.updateLightClientIfNeededTo(ctx, height) if err != nil { return nil, err } @@ -469,7 +475,7 @@ func (c *Client) Validators(ctx context.Context, height *int64, pagePtr, perPage v := l.ValidatorSet.Validators[skipCount : skipCount+tmmath.MinInt(perPage, totalCount-skipCount)] return &ctypes.ResultValidators{ - BlockHeight: *height, + BlockHeight: l.Height, Validators: v, Count: len(v), Total: totalCount}, nil @@ -492,8 +498,16 @@ func (c *Client) UnsubscribeAll(ctx context.Context, subscriber string) error { return c.next.UnsubscribeAll(ctx, subscriber) } -func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height int64) (*types.LightBlock, error) { - l, err := c.lc.VerifyLightBlockAtHeight(ctx, height, time.Now()) +func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height *int64) (*types.LightBlock, error) { + var ( + l *types.LightBlock + err error + ) + if height == nil { + l, err = c.lc.Update(ctx, time.Now()) + } else { + l, err = c.lc.VerifyLightBlockAtHeight(ctx, *height, time.Now()) + } if err != nil { return nil, fmt.Errorf("failed to update light client to %d: %w", height, err) } diff --git a/light/rpc/mocks/light_client.go b/light/rpc/mocks/light_client.go index 2f512d881..7bd0175c5 100644 --- a/light/rpc/mocks/light_client.go +++ b/light/rpc/mocks/light_client.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.3.0. DO NOT EDIT. +// Code generated by mockery v0.0.0-dev. DO NOT EDIT. package mocks @@ -54,6 +54,29 @@ func (_m *LightClient) TrustedLightBlock(height int64) (*types.LightBlock, error return r0, r1 } +// Update provides a mock function with given fields: ctx, now +func (_m *LightClient) Update(ctx context.Context, now time.Time) (*types.LightBlock, error) { + ret := _m.Called(ctx, now) + + var r0 *types.LightBlock + if rf, ok := ret.Get(0).(func(context.Context, time.Time) *types.LightBlock); ok { + r0 = rf(ctx, now) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*types.LightBlock) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(context.Context, time.Time) error); ok { + r1 = rf(ctx, now) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // VerifyLightBlockAtHeight provides a mock function with given fields: ctx, height, now func (_m *LightClient) VerifyLightBlockAtHeight(ctx context.Context, height int64, now time.Time) (*types.LightBlock, error) { ret := _m.Called(ctx, height, now) From b0dfbd18329b6cff3dee7e33d7aad278f01627f8 Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Tue, 2 Feb 2021 16:36:52 +0100 Subject: [PATCH 06/15] .github/workflows: try different e2e nightly test set (#6036) --- .github/workflows/e2e-nightly2.yml | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/e2e-nightly2.yml diff --git a/.github/workflows/e2e-nightly2.yml b/.github/workflows/e2e-nightly2.yml new file mode 100644 index 000000000..d829efa72 --- /dev/null +++ b/.github/workflows/e2e-nightly2.yml @@ -0,0 +1,70 @@ +# Runs randomly generated E2E testnets nightly. +# Copy of earlier nightly test file, for debugging +name: e2e-nightly2 +on: + workflow_dispatch: # allow running workflow manually + schedule: + - cron: '0 2 * * *' + +jobs: + e2e-nightly-test-2: + # Run parallel jobs for the listed testnet groups (must match the + # ./build/generator -g flag) + strategy: + fail-fast: false + matrix: + group: ['00', '01', '02', '03'] + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/setup-go@v2 + with: + go-version: '1.15' + + - uses: actions/checkout@v2 + + - name: Build + working-directory: test/e2e + # Run make jobs in parallel, since we can't run steps in parallel. + run: make -j2 docker generator runner + + - name: Generate testnets + working-directory: test/e2e + # When changing -g, also change the matrix groups above + run: ./build/generator -g 4 -d networks/nightly + + - name: Run testnets in group ${{ matrix.group }} + working-directory: test/e2e + run: ./run-multiple.sh networks/nightly/*-group${{ matrix.group }}-*.toml + + e2e-nightly-fail-2: + needs: e2e-nightly-test-2 + if: ${{ failure() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on failure + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':skull:' + SLACK_COLOR: danger + SLACK_MESSAGE: Nightly E2E (copy) tests failed + SLACK_FOOTER: '' + + e2e-nightly-success: + needs: e2e-nightly-test-2 + if: ${{ success() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on success + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':green_heart:' + SLACK_COLOR: good + SLACK_MESSAGE: Nightly E2E (copy) tests failed + SLACK_FOOTER: '' From 5f88d6aa44e2029034139b55a9d38ee98d7c522c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 2 Feb 2021 17:41:27 +0100 Subject: [PATCH 07/15] test/e2e: increase validator tolerances (#6037) --- test/e2e/tests/validator_test.go | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/e2e/tests/validator_test.go b/test/e2e/tests/validator_test.go index 247f2c586..847a8d388 100644 --- a/test/e2e/tests/validator_test.go +++ b/test/e2e/tests/validator_test.go @@ -77,11 +77,8 @@ func TestValidator_Propose(t *testing.T) { require.False(t, proposeCount == 0 && expectCount > 0, "node did not propose any blocks (expected %v)", expectCount) - if expectCount > 3 { + if expectCount > 5 { require.GreaterOrEqual(t, proposeCount, 3, "validator didn't propose even 3 blocks") - } else { - require.Equal(t, proposeCount, expectCount, "validator missed proposing blocks (proposed %v, expected %v)", - proposeCount, expectCount) } }) } @@ -120,11 +117,8 @@ func TestValidator_Sign(t *testing.T) { require.False(t, signCount == 0 && expectCount > 0, "validator did not sign any blocks (expected %v)", expectCount) - if expectCount > 3 { - require.GreaterOrEqual(t, signCount, 3, "validator didn't sign even 3 blocks") - } else { - require.Equal(t, signCount, expectCount, "validator missed signing blocks (signed %v, expected %v)", - signCount, expectCount) + if expectCount > 7 { + require.GreaterOrEqual(t, signCount, 3, "validator didn't sign even 3 blocks (expected %v)", expectCount) } }) } From 059364e840a3344538ffb21befe2518803ee9950 Mon Sep 17 00:00:00 2001 From: Marko Date: Tue, 2 Feb 2021 16:51:13 +0000 Subject: [PATCH 08/15] docs: external address (#6035) ## Description - add note to recommend setting external address --- docs/nodes/configuration.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/nodes/configuration.md b/docs/nodes/configuration.md index 7db0cae3d..1635780ae 100644 --- a/docs/nodes/configuration.md +++ b/docs/nodes/configuration.md @@ -385,7 +385,7 @@ peer-query-maj23-sleep-duration = "2s" # Options: # 1) "null" # 2) "kv" (default) - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). -# - When "kv" is chosen "tx.height" and "tx.hash" will always be indexed. +# - When "kv" is chosen "tx.height" and "tx.hash" will always be indexed. indexer = "kv" ####################################################### @@ -487,6 +487,7 @@ Here's a brief summary of the timeouts: This section will cover settings within the p2p section of the `config.toml`. - `external-address` = is the address that will be advertised for other nodes to use. We recommend setting this field with your public IP and p2p port. + - > We recommend setting external address. There is an open issue with Tendermint not advertising the public address, if being used within a private network. There is an ongoing effort to refactor the P2P layer where this will be fixed. - `seeds` = is a list of comma separated seed nodes that you will connect upon a start and ask for peers. A seed node is a node that does not participate in consensus but only helps propagate peers to nodes in the networks - `persistent-peers` = is a list of comma separated peers that you will always want to be connected to. If you're already connected to the maximum number of peers, persistent peers will not be added. - `max-num-inbound-peers` = is the maximum number of peers you will accept inbound connections from at one time (where they dial your address and initiate the connection). @@ -494,4 +495,4 @@ This section will cover settings within the p2p section of the `config.toml`. - `unconditional-peer-ids` = is similar to `persistent-peers` except that these peers will be connected to even if you are already connected to the maximum number of peers. This can be a validator node ID on your sentry node. - `pex` = turns the peer exchange reactor on or off. Validator node will want the `pex` turned off so it would not begin gossiping to unknown peers on the network. PeX can also be turned off for statically configured networks with fixed network connectivity. For full nodes on open, dynamic networks, it should be turned on. - `seed-mode` = is used for when node operators want to run their node as a seed node. Seed node's run a variation of the PeX protocol that disconnects from peers after sending them a list of peers to connect to. To minimize the servers usage, it is recommended to set the mempool's size to 0. -- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id. +- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id. From fd597dc72634230075886fb4921d3476fc6333fc Mon Sep 17 00:00:00 2001 From: Marko Date: Tue, 2 Feb 2021 17:03:06 +0000 Subject: [PATCH 09/15] docs: reword configuration (#6039) ## Description changes requested in https://github.com/tendermint/tendermint/pull/6035 but the bot merged the PR prior to applying them --- docs/nodes/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/nodes/configuration.md b/docs/nodes/configuration.md index 1635780ae..30d22234f 100644 --- a/docs/nodes/configuration.md +++ b/docs/nodes/configuration.md @@ -487,7 +487,7 @@ Here's a brief summary of the timeouts: This section will cover settings within the p2p section of the `config.toml`. - `external-address` = is the address that will be advertised for other nodes to use. We recommend setting this field with your public IP and p2p port. - - > We recommend setting external address. There is an open issue with Tendermint not advertising the public address, if being used within a private network. There is an ongoing effort to refactor the P2P layer where this will be fixed. + - > We recommend setting an external address. When used in a private network, Tendermint Core currently doesn't advertise the node's public address. There is active and ongoing work to improve the P2P system, but this is a helpful workaround for now. - `seeds` = is a list of comma separated seed nodes that you will connect upon a start and ask for peers. A seed node is a node that does not participate in consensus but only helps propagate peers to nodes in the networks - `persistent-peers` = is a list of comma separated peers that you will always want to be connected to. If you're already connected to the maximum number of peers, persistent peers will not be added. - `max-num-inbound-peers` = is the maximum number of peers you will accept inbound connections from at one time (where they dial your address and initiate the connection). @@ -495,4 +495,4 @@ This section will cover settings within the p2p section of the `config.toml`. - `unconditional-peer-ids` = is similar to `persistent-peers` except that these peers will be connected to even if you are already connected to the maximum number of peers. This can be a validator node ID on your sentry node. - `pex` = turns the peer exchange reactor on or off. Validator node will want the `pex` turned off so it would not begin gossiping to unknown peers on the network. PeX can also be turned off for statically configured networks with fixed network connectivity. For full nodes on open, dynamic networks, it should be turned on. - `seed-mode` = is used for when node operators want to run their node as a seed node. Seed node's run a variation of the PeX protocol that disconnects from peers after sending them a list of peers to connect to. To minimize the servers usage, it is recommended to set the mempool's size to 0. -- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id. +- `private-peer-ids` = is a comma-separated list of node ids that will _not_ be exposed to other peers (i.e., you will not tell other peers about the ids in this list). This can be filled with a validator's node id. From 2aad26e2f1de21136ae42e76c8fcced34e63f7c2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 3 Feb 2021 07:15:23 +0100 Subject: [PATCH 10/15] p2p: tighten up and test PeerManager (#6034) This tightens up the `PeerManager` and related code, adds a ton of tests, and fixes a bunch of inconsistencies and bugs. --- blockchain/v0/reactor.go | 36 +- blockchain/v0/reactor_test.go | 16 +- evidence/reactor.go | 21 +- evidence/reactor_test.go | 22 +- libs/sync/waker.go | 30 + libs/sync/waker_test.go | 47 + mempool/reactor.go | 25 +- mempool/reactor_test.go | 14 +- p2p/address.go | 3 +- p2p/address_test.go | 2 +- p2p/channel.go | 15 + p2p/peer.go | 1157 ------------------------ p2p/peermanager.go | 1275 ++++++++++++++++++++++++++ p2p/peermanager_test.go | 1580 +++++++++++++++++++++++++++++++++ p2p/pex/reactor.go | 13 +- p2p/router.go | 32 +- p2p/router_test.go | 13 +- p2p/shim.go | 12 +- p2p/shim_test.go | 4 +- statesync/reactor.go | 24 +- statesync/reactor_test.go | 6 +- 21 files changed, 3066 insertions(+), 1281 deletions(-) create mode 100644 libs/sync/waker.go create mode 100644 libs/sync/waker_test.go create mode 100644 p2p/peermanager.go create mode 100644 p2p/peermanager_test.go diff --git a/blockchain/v0/reactor.go b/blockchain/v0/reactor.go index 7dd4d4ea3..583bb6766 100644 --- a/blockchain/v0/reactor.go +++ b/blockchain/v0/reactor.go @@ -84,7 +84,7 @@ type Reactor struct { fastSync bool blockchainCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} requestsCh <-chan BlockRequest @@ -104,7 +104,7 @@ func NewReactor( store *store.BlockStore, consReactor consensusReactor, blockchainCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, fastSync bool, ) (*Reactor, error) { if state.LastBlockHeight != store.Height() { @@ -288,9 +288,8 @@ func (r *Reactor) processBlockchainCh() { if err := r.handleMessage(r.blockchainCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID(), "envelope", envelope, "err", err) r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -303,26 +302,26 @@ func (r *Reactor) processBlockchainCh() { // processPeerUpdate processes a PeerUpdate. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) // XXX: Pool#RedoRequest can sometimes give us an empty peer. - if len(peerUpdate.PeerID) == 0 { + if len(peerUpdate.NodeID) == 0 { return } switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: + case p2p.PeerStatusUp: // send a status update the newly added peer r.blockchainCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + To: peerUpdate.NodeID, Message: &bcproto.StatusResponse{ Base: r.store.Base(), Height: r.store.Height(), }, } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.pool.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.pool.RemovePeer(peerUpdate.NodeID) } } @@ -384,9 +383,8 @@ func (r *Reactor) requestRoutine() { case pErr := <-r.errorsCh: r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: pErr.peerID, - Err: pErr.err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: pErr.peerID, + Err: pErr.err, } case <-statusUpdateTicker.C: @@ -525,17 +523,15 @@ FOR_LOOP: // to clean up the rest. peerID := r.pool.RedoRequest(first.Height) r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: peerID, + Err: err, } peerID2 := r.pool.RedoRequest(second.Height) if peerID2 != peerID { r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID2, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: peerID2, + Err: err, } } diff --git a/blockchain/v0/reactor_test.go b/blockchain/v0/reactor_test.go index 372c689fb..9d955319a 100644 --- a/blockchain/v0/reactor_test.go +++ b/blockchain/v0/reactor_test.go @@ -36,7 +36,7 @@ type reactorTestSuite struct { blockchainPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -200,8 +200,8 @@ func simulateRouter(primary *reactorTestSuite, suites []*reactorTestSuite, dropC primary.reactor.Logger.Debug("dropped peer error", "err", pErr.Err) } else { primary.peerUpdatesCh <- p2p.PeerUpdate{ - PeerID: pErr.PeerID, - Status: p2p.PeerStatusRemoved, + NodeID: pErr.NodeID, + Status: p2p.PeerStatusDown, } } } @@ -229,7 +229,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -251,7 +251,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { // deadlocks or race conditions within the context of poolRoutine. testSuites[1].peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: testSuites[0].peerID, + NodeID: testSuites[0].peerID, } } @@ -276,7 +276,7 @@ func TestReactor_NoBlockResponse(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -341,7 +341,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -388,7 +388,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { for _, s := range testSuites[:len(testSuites)-1] { newSuite.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: s.peerID, + NodeID: s.peerID, } } diff --git a/evidence/reactor.go b/evidence/reactor.go index 643d9915a..cfa9b8989 100644 --- a/evidence/reactor.go +++ b/evidence/reactor.go @@ -55,7 +55,7 @@ type Reactor struct { evpool *Pool eventBus *types.EventBus evidenceCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} peerWG sync.WaitGroup @@ -70,7 +70,7 @@ type Reactor struct { func NewReactor( logger log.Logger, evidenceCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, evpool *Pool, ) *Reactor { r := &Reactor{ @@ -196,9 +196,8 @@ func (r *Reactor) processEvidenceCh() { if err := r.handleMessage(r.evidenceCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID(), "envelope", envelope, "err", err) r.evidenceCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -221,7 +220,7 @@ func (r *Reactor) processEvidenceCh() { // // REF: https://github.com/tendermint/tendermint/issues/4727 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -240,21 +239,21 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast evidence to that peer. - _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - go r.broadcastEvidenceLoop(peerUpdate.PeerID, closer) + go r.broadcastEvidenceLoop(peerUpdate.NodeID, closer) } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: + case p2p.PeerStatusDown: // Check if we've started an evidence broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer evidence broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } diff --git a/evidence/reactor_test.go b/evidence/reactor_test.go index 0958b260f..4462b851a 100644 --- a/evidence/reactor_test.go +++ b/evidence/reactor_test.go @@ -42,7 +42,7 @@ type reactorTestSuite struct { evidencePeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, logger log.Logger, pool *evidence.Pool, chBuf uint) *reactorTestSuite { @@ -224,18 +224,18 @@ func TestReactorMultiDisconnect(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure "disconnecting" the secondary peer from the primary more than once // is handled gracefully. primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } @@ -276,7 +276,7 @@ func TestReactorBroadcastEvidence(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -327,7 +327,7 @@ func TestReactorBroadcastEvidence_Lagging(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -378,7 +378,7 @@ func TestReactorBroadcastEvidence_Pending(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -438,7 +438,7 @@ func TestReactorBroadcastEvidence_Committed(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -487,7 +487,7 @@ func TestReactorBroadcastEvidence_FullyConnected(t *testing.T) { if suiteI.peerID != suiteJ.peerID { suiteI.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suiteJ.peerID, + NodeID: suiteJ.peerID, } } } @@ -530,7 +530,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // have the secondary reactor receive only half the evidence @@ -539,7 +539,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // disconnect the peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure the secondary only received half of the evidence before being diff --git a/libs/sync/waker.go b/libs/sync/waker.go new file mode 100644 index 000000000..0aff3ddf8 --- /dev/null +++ b/libs/sync/waker.go @@ -0,0 +1,30 @@ +package sync + +// Waker is used to wake up a sleeper when some event occurs. It debounces +// multiple wakeup calls occurring between each sleep, and wakeups are +// non-blocking to avoid having to coordinate goroutines. +type Waker struct { + wakeCh chan struct{} +} + +// NewWaker creates a new Waker. +func NewWaker() *Waker { + return &Waker{ + wakeCh: make(chan struct{}, 1), // buffer used for debouncing + } +} + +// Sleep returns a channel that blocks until Wake() is called. +func (w *Waker) Sleep() <-chan struct{} { + return w.wakeCh +} + +// Wake wakes up the sleeper. +func (w *Waker) Wake() { + // A non-blocking send with a size 1 buffer ensures that we never block, and + // that we queue up at most a single wakeup call between each Sleep(). + select { + case w.wakeCh <- struct{}{}: + default: + } +} diff --git a/libs/sync/waker_test.go b/libs/sync/waker_test.go new file mode 100644 index 000000000..dfd337e0e --- /dev/null +++ b/libs/sync/waker_test.go @@ -0,0 +1,47 @@ +package sync_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + tmsync "github.com/tendermint/tendermint/libs/sync" +) + +func TestWaker(t *testing.T) { + + // A new waker should block when sleeping. + waker := tmsync.NewWaker() + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } + + // Wakeups should not block, and should cause the next sleeper to awaken. + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + // Multiple wakeups should only wake a single sleeper. + waker.Wake() + waker.Wake() + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } +} diff --git a/mempool/reactor.go b/mempool/reactor.go index f6ae9dc9c..8b2a1c063 100644 --- a/mempool/reactor.go +++ b/mempool/reactor.go @@ -58,7 +58,7 @@ type Reactor struct { peerMgr PeerManager mempoolCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // peerWG is used to coordinate graceful termination of all peer broadcasting @@ -76,7 +76,7 @@ func NewReactor( peerMgr PeerManager, mempool *CListMempool, mempoolCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *Reactor { r := &Reactor{ @@ -225,9 +225,8 @@ func (r *Reactor) processMempoolCh() { if err := r.handleMessage(r.mempoolCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID(), "envelope", envelope, "err", err) r.mempoolCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -244,7 +243,7 @@ func (r *Reactor) processMempoolCh() { // removed peers, we remove the peer from the mempool peer ID set and signal to // stop the tx broadcasting goroutine. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -264,28 +263,28 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast txs to that peer. - _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - r.ids.ReserveForPeer(peerUpdate.PeerID) + r.ids.ReserveForPeer(peerUpdate.NodeID) // start a broadcast routine ensuring all txs are forwarded to the peer - go r.broadcastTxRoutine(peerUpdate.PeerID, closer) + go r.broadcastTxRoutine(peerUpdate.NodeID, closer) } } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.ids.Reclaim(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.ids.Reclaim(peerUpdate.NodeID) // Check if we've started a tx broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer tx broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } diff --git a/mempool/reactor_test.go b/mempool/reactor_test.go index d1163bf09..1f1b3be1e 100644 --- a/mempool/reactor_test.go +++ b/mempool/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { mempoolPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, cfg *cfg.MempoolConfig, logger log.Logger, chBuf uint) *reactorTestSuite { @@ -189,7 +189,7 @@ func TestReactorBroadcastTxs(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -295,7 +295,7 @@ func TestReactorNoBroadcastToSender(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } time.Sleep(100 * time.Millisecond) @@ -360,7 +360,7 @@ func TestReactor_MaxTxBytes(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Wait till all secondary suites (reactor) received all mempool txs from the @@ -406,7 +406,7 @@ func TestDontExhaustMaxActiveIDs(t *testing.T) { for i := 0; i < maxActiveIDs+1; i++ { reactor.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: peerID, + NodeID: peerID, } reactor.mempoolOutCh <- p2p.Envelope{ To: peerID, @@ -466,12 +466,12 @@ func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) { // connect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // disconnect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } diff --git a/p2p/address.go b/p2p/address.go index 8ff44515a..1e964b449 100644 --- a/p2p/address.go +++ b/p2p/address.go @@ -106,10 +106,9 @@ func ParseNodeAddress(urlString string) (NodeAddress, error) { Protocol: Protocol(strings.ToLower(url.Scheme)), } - // Opaque URLs are expected to contain only a node ID, also used as path. + // Opaque URLs are expected to contain only a node ID. if url.Opaque != "" { address.NodeID = NodeID(url.Opaque) - address.Path = url.Opaque return address, address.Validate() } diff --git a/p2p/address_test.go b/p2p/address_test.go index 83bfecf24..41b89eaee 100644 --- a/p2p/address_test.go +++ b/p2p/address_test.go @@ -158,7 +158,7 @@ func TestParseNodeAddress(t *testing.T) { }, { "memory:" + user, - p2p.NodeAddress{Protocol: "memory", NodeID: id, Path: user}, + p2p.NodeAddress{Protocol: "memory", NodeID: id}, true, }, diff --git a/p2p/channel.go b/p2p/channel.go index b2bcd156e..c13cd75d1 100644 --- a/p2p/channel.go +++ b/p2p/channel.go @@ -27,6 +27,21 @@ func (e Envelope) Strip() Envelope { return e } +// PeerError is a peer error reported via the Error channel. +// +// FIXME: This currently just disconnects the peer, which is too simplistic. +// For example, some errors should be logged, some should cause disconnects, +// and some should ban the peer. +// +// FIXME: This should probably be replaced by a more general PeerBehavior +// concept that can mark good and bad behavior and contributes to peer scoring. +// It should possibly also allow reactors to request explicit actions, e.g. +// disconnection or banning, in addition to doing this based on aggregates. +type PeerError struct { + NodeID NodeID + Err error +} + // Channel is a bidirectional channel for Protobuf message exchange with peers. // A Channel is safe for concurrent use by multiple goroutines. type Channel struct { diff --git a/p2p/peer.go b/p2p/peer.go index 07ffaa330..1aa923533 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -1,1175 +1,18 @@ package p2p import ( - "context" - "errors" "fmt" "io" - "math" - "math/rand" "net" "runtime/debug" - "sort" - "sync" "time" - "github.com/gogo/protobuf/proto" - "github.com/google/orderedcode" - dbm "github.com/tendermint/tm-db" - "github.com/tendermint/tendermint/libs/cmap" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/service" tmconn "github.com/tendermint/tendermint/p2p/conn" - p2pproto "github.com/tendermint/tendermint/proto/tendermint/p2p" ) -// PeerStatus specifies peer statuses. -type PeerStatus string - -const ( - PeerStatusNew = PeerStatus("new") // New peer which we haven't tried to contact yet. - PeerStatusUp = PeerStatus("up") // Peer which we have an active connection to. - PeerStatusDown = PeerStatus("down") // Peer which we're temporarily disconnected from. - PeerStatusRemoved = PeerStatus("removed") // Peer which has been removed. - PeerStatusBanned = PeerStatus("banned") // Peer which is banned for misbehavior. -) - -// PeerError is a peer error reported by a reactor via the Error channel. The -// severity may cause the peer to be disconnected or banned depending on policy. -type PeerError struct { - PeerID NodeID - Err error - Severity PeerErrorSeverity -} - -// PeerErrorSeverity determines the severity of a peer error. -type PeerErrorSeverity string - -const ( - PeerErrorSeverityLow PeerErrorSeverity = "low" // Mostly ignored. - PeerErrorSeverityHigh PeerErrorSeverity = "high" // May disconnect. - PeerErrorSeverityCritical PeerErrorSeverity = "critical" // Ban. -) - -// PeerUpdatesCh defines a wrapper around a PeerUpdate go channel that allows -// a reactor to listen for peer updates and safely close it when stopping. -type PeerUpdatesCh struct { - closeOnce sync.Once - - // updatesCh defines the go channel in which the router sends peer updates to - // reactors. Each reactor will have its own PeerUpdatesCh to listen for updates - // from. - updatesCh chan PeerUpdate - - // doneCh is used to signal that a PeerUpdatesCh is closed. It is the - // reactor's responsibility to invoke Close. - doneCh chan struct{} -} - -// NewPeerUpdates returns a reference to a new PeerUpdatesCh. -func NewPeerUpdates(updatesCh chan PeerUpdate) *PeerUpdatesCh { - return &PeerUpdatesCh{ - updatesCh: updatesCh, - doneCh: make(chan struct{}), - } -} - -// Updates returns a read-only go channel where a consuming reactor can listen -// for peer updates sent from the router. -func (puc *PeerUpdatesCh) Updates() <-chan PeerUpdate { - return puc.updatesCh -} - -// Close closes the PeerUpdatesCh channel. It should only be closed by the respective -// reactor when stopping and ensure nothing is listening for updates. -// -// NOTE: After a PeerUpdatesCh is closed, the router may safely assume it can no -// longer send on the internal updatesCh, however it should NEVER explicitly close -// it as that could result in panics by sending on a closed channel. -func (puc *PeerUpdatesCh) Close() { - puc.closeOnce.Do(func() { - close(puc.doneCh) - }) -} - -// Done returns a read-only version of the PeerUpdatesCh's internal doneCh go -// channel that should be used by a router to signal when it is safe to explicitly -// not send any peer updates. -func (puc *PeerUpdatesCh) Done() <-chan struct{} { - return puc.doneCh -} - -// PeerUpdate is a peer status update for reactors. -type PeerUpdate struct { - PeerID NodeID - Status PeerStatus -} - -// PeerScore is a numeric score assigned to a peer (higher is better). -type PeerScore uint16 - -const ( - // PeerScorePersistent is added for persistent peers. - PeerScorePersistent PeerScore = 100 -) - -// PeerManager manages peer lifecycle information, using a peerStore for -// underlying storage. Its primary purpose is to determine which peers to -// connect to next, make sure a peer only has a single active connection (either -// inbound or outbound), and evict peers to make room for higher-scored peers. -// It does not manage actual connections (this is handled by the Router), -// only the peer lifecycle state. -// -// We track dialing and connected states independently. This allows us to accept -// an inbound connection from a peer while the router is also dialing an -// outbound connection to that same peer, which will cause the dialer to -// eventually error when attempting to mark the peer as connected. This also -// avoids race conditions where multiple goroutines may end up dialing a peer if -// an incoming connection was briefly accepted and disconnected while we were -// also dialing. -// -// For an outbound connection, the flow is as follows: -// - DialNext: returns a peer address to dial, marking the peer as dialing. -// - DialFailed: reports a dial failure, unmarking the peer as dialing. -// - Dialed: successfully dialed, unmarking as dialing and marking as connected -// (or erroring if already connected). -// - Ready: routing is up, broadcasts a PeerStatusUp peer update to subscribers. -// - Disconnected: peer disconnects, unmarking as connected and broadcasts a -// PeerStatusDown peer update. -// -// For an inbound connection, the flow is as follows: -// - Accepted: successfully accepted connection, marking as connected (or erroring -// if already connected). -// - Ready: routing is up, broadcasts a PeerStatusUp peer update to subscribers. -// - Disconnected: peer disconnects, unmarking as connected and broadcasts a -// PeerStatusDown peer update. -// -// When evicting peers, either because peers are explicitly scheduled for -// eviction or we are connected to too many peers, the flow is as follows: -// - EvictNext: if marked evict and connected, unmark evict and mark evicting. -// If beyond MaxConnected, pick lowest-scored peer and mark evicting. -// - Disconnected: unmark connected, evicting, evict, and broadcast a -// PeerStatusDown peer update. -// -// If all connection slots are full (at MaxConnections), we can use up to -// MaxConnectionsUpgrade additional connections to probe any higher-scored -// unconnected peers, and if we reach them (or they reach us) we allow the -// connection and evict a lower-scored peer. We mark the lower-scored peer as -// upgrading[from]=to to make sure no other higher-scored peers can claim the -// same one for an upgrade. The flow is as follows: -// - Accepted: if upgrade is possible, mark connected and add lower-scored to evict. -// - DialNext: if upgrade is possible, mark upgrading[from]=to and dialing. -// - DialFailed: unmark upgrading[from]=to and dialing. -// - Dialed: unmark upgrading[from]=to and dialing, mark as connected, add -// lower-scored to evict. -// - EvictNext: pick peer from evict, mark as evicting. -// - Disconnected: unmark connected, upgrading[from]=to, evict, evicting. -// -// FIXME: The old stack supports ABCI-based peer ID filtering via -// /p2p/filter/id/ queries, we should implement this here as well by taking -// a peer ID filtering callback in PeerManagerOptions and configuring it during -// Node setup. -type PeerManager struct { - options PeerManagerOptions - wakeDialCh chan struct{} // wakes up DialNext() on relevant peer changes - wakeEvictCh chan struct{} // wakes up EvictNext() on relevant peer changes - closeCh chan struct{} // signal channel for Close() - closeOnce sync.Once - - mtx sync.Mutex - store *peerStore - dialing map[NodeID]bool // peers being dialed (DialNext -> Dialed/DialFail) - upgrading map[NodeID]NodeID // peers claimed for upgrade (DialNext -> Dialed/DialFail) - connected map[NodeID]bool // connected peers (Dialed/Accepted -> Disconnected) - evict map[NodeID]bool // peers scheduled for eviction (Connected -> EvictNext) - evicting map[NodeID]bool // peers being evicted (EvictNext -> Disconnected) - subscriptions map[*PeerUpdatesCh]*PeerUpdatesCh // keyed by struct identity (address) -} - -// PeerManagerOptions specifies options for a PeerManager. -type PeerManagerOptions struct { - // PersistentPeers are peers that we want to maintain persistent connections - // to. These will be scored higher than other peers, and if - // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if - // necessary to make room for these. - PersistentPeers []NodeID - - // MaxPeers is the maximum number of peers to track information about, i.e. - // store in the peer store. When exceeded, the lowest-scored unconnected peers - // will be deleted. 0 means no limit. - MaxPeers uint16 - - // MaxConnected is the maximum number of connected peers (inbound and - // outbound). 0 means no limit. - MaxConnected uint16 - - // MaxConnectedUpgrade is the maximum number of additional connections to - // use for probing any better-scored peers to upgrade to when all connection - // slots are full. 0 disables peer upgrading. - // - // For example, if we are already connected to MaxConnected peers, but we - // know or learn about better-scored peers (e.g. configured persistent - // peers) that we are not connected too, then we can probe these peers by - // using up to MaxConnectedUpgrade connections, and once connected evict the - // lowest-scored connected peers. This also works for inbound connections, - // i.e. if a higher-scored peer attempts to connect to us, we can accept - // the connection and evict a lower-scored peer. - MaxConnectedUpgrade uint16 - - // MinRetryTime is the minimum time to wait between retries. Retry times - // double for each retry, up to MaxRetryTime. 0 disables retries. - MinRetryTime time.Duration - - // MaxRetryTime is the maximum time to wait between retries. 0 means - // no maximum, in which case the retry time will keep doubling. - MaxRetryTime time.Duration - - // MaxRetryTimePersistent is the maximum time to wait between retries for - // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. - MaxRetryTimePersistent time.Duration - - // RetryTimeJitter is the upper bound of a random interval added to - // retry times, to avoid thundering herds. 0 disables jutter. - RetryTimeJitter time.Duration -} - -// NewPeerManager creates a new peer manager. -func NewPeerManager(peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { - store, err := newPeerStore(peerDB) - if err != nil { - return nil, err - } - peerManager := &PeerManager{ - options: options, - closeCh: make(chan struct{}), - - // We use a buffer of size 1 for these trigger channels, with - // non-blocking sends. This ensures that if e.g. wakeDial() is called - // multiple times before the initial trigger is picked up we only - // process the trigger once. - // - // FIXME: This should maybe be a libs/sync type. - wakeDialCh: make(chan struct{}, 1), - wakeEvictCh: make(chan struct{}, 1), - - store: store, - dialing: map[NodeID]bool{}, - upgrading: map[NodeID]NodeID{}, - connected: map[NodeID]bool{}, - evict: map[NodeID]bool{}, - evicting: map[NodeID]bool{}, - subscriptions: map[*PeerUpdatesCh]*PeerUpdatesCh{}, - } - if err = peerManager.configurePeers(); err != nil { - return nil, err - } - if err = peerManager.prunePeers(); err != nil { - return nil, err - } - return peerManager, nil -} - -// configurePeers configures peers in the peer store with ephemeral runtime -// configuration, e.g. setting peerInfo.Persistent based on -// PeerManagerOptions.PersistentPeers. The caller must hold the mutex lock. -func (m *PeerManager) configurePeers() error { - for _, peerID := range m.options.PersistentPeers { - if peer, ok := m.store.Get(peerID); ok { - peer.Persistent = true - if err := m.store.Set(peer); err != nil { - return err - } - } - } - return nil -} - -// prunePeers removes peers from the peer store if it contains more than -// MaxPeers peers. The lowest-scored non-connected peers are removed. -// The caller must hold the mutex lock. -func (m *PeerManager) prunePeers() error { - if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { - return nil - } - m.mtx.Lock() - defer m.mtx.Unlock() - - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peerID := ranked[i].ID - switch { - case m.store.Size() <= int(m.options.MaxPeers): - break - case m.dialing[peerID]: - case m.connected[peerID]: - case m.evicting[peerID]: - default: - if err := m.store.Delete(peerID); err != nil { - return err - } - } - } - return nil -} - -// Close closes the peer manager, releasing resources allocated with it -// (specifically any running goroutines). -func (m *PeerManager) Close() { - m.closeOnce.Do(func() { - close(m.closeCh) - }) -} - -// Add adds a peer to the manager, given as an address. If the peer already -// exists, the address is added to it. -func (m *PeerManager) Add(address NodeAddress) error { - if err := address.Validate(); err != nil { - return err - } - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(address.NodeID) - if !ok { - peer = m.makePeerInfo(address.NodeID) - } - if _, ok := peer.AddressInfo[address.String()]; !ok { - peer.AddressInfo[address.String()] = &peerAddressInfo{Address: address} - } - if err := m.store.Set(peer); err != nil { - return err - } - if err := m.prunePeers(); err != nil { - return err - } - m.wakeDial() - return nil -} - -// Advertise returns a list of peer addresses to advertise to a peer. -// -// FIXME: This is fairly naïve and only returns the addresses of the -// highest-ranked peers. -func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { - m.mtx.Lock() - defer m.mtx.Unlock() - - addresses := make([]NodeAddress, 0, limit) - for _, peer := range m.store.Ranked() { - if peer.ID == peerID { - continue - } - for _, addressInfo := range peer.AddressInfo { - if len(addresses) >= int(limit) { - return addresses - } - addresses = append(addresses, addressInfo.Address) - } - } - return addresses -} - -// makePeerInfo creates a peerInfo for a new peer. -func (m *PeerManager) makePeerInfo(id NodeID) peerInfo { - isPersistent := false - for _, p := range m.options.PersistentPeers { - if id == p { - isPersistent = true - break - } - } - return peerInfo{ - ID: id, - Persistent: isPersistent, - AddressInfo: map[string]*peerAddressInfo{}, - } -} - -// Subscribe subscribes to peer updates. The caller must consume the peer -// updates in a timely fashion and close the subscription when done, since -// delivery is guaranteed and will block peer connection/disconnection -// otherwise. -func (m *PeerManager) Subscribe() *PeerUpdatesCh { - // FIXME: We may want to use a size 1 buffer here. When the router - // broadcasts a peer update it has to loop over all of the - // subscriptions, and we want to avoid blocking and waiting for a - // context switch before continuing to the next subscription. This also - // prevents tail latencies from compounding across updates. We also want - // to make sure the subscribers are reasonably in sync, so it should be - // kept at 1. However, this should be benchmarked first. - peerUpdates := NewPeerUpdates(make(chan PeerUpdate)) - m.mtx.Lock() - m.subscriptions[peerUpdates] = peerUpdates - m.mtx.Unlock() - - go func() { - <-peerUpdates.Done() - m.mtx.Lock() - delete(m.subscriptions, peerUpdates) - m.mtx.Unlock() - }() - return peerUpdates -} - -// broadcast broadcasts a peer update to all subscriptions. The caller must -// already hold the mutex lock. This means the mutex is held for the duration -// of the broadcast, which we want to make sure all subscriptions receive all -// updates in the same order. -// -// FIXME: Consider using more fine-grained mutexes here, and/or a channel to -// enforce ordering of updates. -func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { - for _, sub := range m.subscriptions { - select { - case sub.updatesCh <- peerUpdate: - case <-sub.doneCh: - } - } -} - -// DialNext finds an appropriate peer address to dial, and marks it as dialing. -// If no peer is found, or all connection slots are full, it blocks until one -// becomes available. The caller must call Dialed() or DialFailed() for the -// returned peer. The context can be used to cancel the call. -func (m *PeerManager) DialNext(ctx context.Context) (NodeID, NodeAddress, error) { - for { - id, address, err := m.TryDialNext() - if err != nil || id != "" { - return id, address, err - } - select { - case <-m.wakeDialCh: - case <-ctx.Done(): - return "", NodeAddress{}, ctx.Err() - } - } -} - -// TryDialNext is equivalent to DialNext(), but immediately returns an empty -// peer ID if no peers or connection slots are available. -func (m *PeerManager) TryDialNext() (NodeID, NodeAddress, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including - // MaxConnectedUpgrade allows us to probe additional peers that have a - // higher score than any other peers, and if successful evict it. - if m.options.MaxConnected > 0 && - len(m.connected)+len(m.dialing) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return "", NodeAddress{}, nil - } - - for _, peer := range m.store.Ranked() { - if m.dialing[peer.ID] || m.connected[peer.ID] { - continue - } - - for _, addressInfo := range peer.AddressInfo { - if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { - continue - } - - // We now have an eligible address to dial. If we're full but have - // upgrade capacity (as checked above), we find a lower-scored peer - // we can replace and mark it as upgrading so noone else claims it. - // - // If we don't find one, there is no point in trying additional - // peers, since they will all have the same or lower score than this - // peer (since they're ordered by score via peerStore.Ranked). - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return "", NodeAddress{}, nil - } - m.upgrading[upgradeFromPeer] = peer.ID - } - - m.dialing[peer.ID] = true - return peer.ID, addressInfo.Address, nil - } - } - return "", NodeAddress{}, nil -} - -// wakeDial is used to notify DialNext about changes that *may* cause new -// peers to become eligible for dialing, such as peer disconnections and -// retry timeouts. -func (m *PeerManager) wakeDial() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each DialNext(). - select { - case m.wakeDialCh <- struct{}{}: - default: - } -} - -// wakeEvict is used to notify EvictNext about changes that *may* cause -// peers to become eligible for eviction, such as peer upgrades. -func (m *PeerManager) wakeEvict() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each EvictNext(). - select { - case m.wakeEvictCh <- struct{}{}: - default: - } -} - -// retryDelay calculates a dial retry delay using exponential backoff, based on -// retry settings in PeerManagerOptions. If MinRetryTime is 0, this returns -// MaxInt64 (i.e. an infinite retry delay, effectively disabling retries). -func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { - if failures == 0 { - return 0 - } - if m.options.MinRetryTime == 0 { - return time.Duration(math.MaxInt64) - } - maxDelay := m.options.MaxRetryTime - if persistent && m.options.MaxRetryTimePersistent > 0 { - maxDelay = m.options.MaxRetryTimePersistent - } - - delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures))) - if maxDelay > 0 && delay > maxDelay { - delay = maxDelay - } - // FIXME: This should use a PeerManager-scoped RNG. - delay += time.Duration(rand.Int63n(int64(m.options.RetryTimeJitter))) // nolint:gosec - return delay -} - -// DialFailed reports a failed dial attempt. This will make the peer available -// for dialing again when appropriate. -// -// FIXME: This should probably delete or mark bad addresses/peers after some time. -func (m *PeerManager) DialFailed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) // Unmark failed upgrade attempt. - } - } - - peer, ok := m.store.Get(peerID) - if !ok { // Peer may have been removed while dialing, ignore. - return nil - } - addressInfo, ok := peer.AddressInfo[address.String()] - if !ok { - return nil // Assume the address has been removed, ignore. - } - addressInfo.LastDialFailure = time.Now().UTC() - addressInfo.DialFailures++ - if err := m.store.Set(peer); err != nil { - return err - } - - // We spawn a goroutine that notifies DialNext() again when the retry - // timeout has elapsed, so that we can consider dialing it again. - go func() { - retryDelay := m.retryDelay(addressInfo.DialFailures, peer.Persistent) - if retryDelay == time.Duration(math.MaxInt64) { - return - } - // Use an explicit timer with deferred cleanup instead of - // time.After(), to avoid leaking goroutines on PeerManager.Close(). - timer := time.NewTimer(retryDelay) - defer timer.Stop() - select { - case <-timer.C: - m.wakeDial() - case <-m.closeCh: - } - }() - - m.wakeDial() - return nil -} - -// Dialed marks a peer as successfully dialed. Any further incoming connections -// will be rejected, and once disconnected the peer may be dialed again. -func (m *PeerManager) Dialed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - - var upgradeFromPeer NodeID - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) - upgradeFromPeer = from - // Don't break, just in case this peer was marked as upgrading for - // multiple lower-scored peers (shouldn't really happen). - } - } - - if m.connected[peerID] { - return fmt.Errorf("peer %v is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - return fmt.Errorf("peer %q was removed while dialing", peerID) - } - now := time.Now().UTC() - peer.LastConnected = now - if addressInfo, ok := peer.AddressInfo[address.String()]; ok { - addressInfo.DialFailures = 0 - addressInfo.LastDialSuccess = now - // If not found, assume address has been removed. - } - if err := m.store.Set(peer); err != nil { - return err - } - - if upgradeFromPeer != "" && m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected) { - // Look for an even lower-scored peer that may have appeared - // since we started the upgrade. - if p, ok := m.store.Get(upgradeFromPeer); ok { - if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { - upgradeFromPeer = u - } - } - m.evict[upgradeFromPeer] = true - } - m.connected[peerID] = true - m.wakeEvict() - - return nil -} - -// Accepted marks an incoming peer connection successfully accepted. If the peer -// is already connected or we don't allow additional connections then this will -// return an error. -// -// If full but MaxConnectedUpgrade is non-zero and the incoming peer is -// better-scored than any existing peers, then we accept it and evict a -// lower-scored peer. -// -// NOTE: We can't take an address here, since e.g. TCP uses a different port -// number for outbound traffic than inbound traffic, so the peer's endpoint -// wouldn't necessarily be an appropriate address to dial. -// -// FIXME: When we accept a connection from a peer, we should register that -// peer's address in the peer store so that we can dial it later. In order to do -// that, we'll need to get the remote address after all, but as noted above that -// can't be the remote endpoint since that will usually have the wrong port -// number. -func (m *PeerManager) Accepted(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - return fmt.Errorf("peer %q is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - - // If all connections slots are full, but we allow upgrades (and we checked - // above that we have upgrade capacity), then we can look for a lower-scored - // peer to replace and if found accept the connection anyway and evict it. - var upgradeFromPeer NodeID - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return fmt.Errorf("already connected to maximum number of peers") - } - } - - peer.LastConnected = time.Now().UTC() - if err := m.store.Set(peer); err != nil { - return err - } - - m.connected[peerID] = true - if upgradeFromPeer != "" { - m.evict[upgradeFromPeer] = true - } - m.wakeEvict() - return nil -} - -// Ready marks a peer as ready, broadcasting status updates to subscribers. The -// peer must already be marked as connected. This is separate from Dialed() and -// Accepted() to allow the router to set up its internal queues before reactors -// start sending messages. -func (m *PeerManager) Ready(peerID NodeID) { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusUp, - }) - } -} - -// Disconnected unmarks a peer as connected, allowing new connections to be -// established. -func (m *PeerManager) Disconnected(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.connected, peerID) - delete(m.upgrading, peerID) - delete(m.evict, peerID) - delete(m.evicting, peerID) - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusDown, - }) - m.wakeDial() - return nil -} - -// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable -// peers are found, the call will block until one becomes available or the -// context is cancelled. -func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { - for { - id, err := m.TryEvictNext() - if err != nil || id != "" { - return id, err - } - select { - case <-m.wakeEvictCh: - case <-ctx.Done(): - return "", ctx.Err() - } - } -} - -// TryEvictNext is equivalent to EvictNext, but immediately returns an empty -// node ID if no evictable peers are found. -func (m *PeerManager) TryEvictNext() (NodeID, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // If any connected peers are explicitly scheduled for eviction, we return a - // random one. - for peerID := range m.evict { - delete(m.evict, peerID) - if m.connected[peerID] && !m.evicting[peerID] { - m.evicting[peerID] = true - return peerID, nil - } - } - - // If we're below capacity, we don't need to evict anything. - if m.options.MaxConnected == 0 || - len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { - return "", nil - } - - // If we're above capacity, just pick the lowest-ranked peer to evict. - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peer := ranked[i] - if m.connected[peer.ID] && !m.evicting[peer.ID] { - m.evicting[peer.ID] = true - return peer.ID, nil - } - } - - return "", nil -} - -// findUpgradeCandidate looks for a lower-scored peer that we could evict -// to make room for the given peer. Returns an empty ID if none is found. -// The caller must hold the mutex lock. -func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - candidate := ranked[i] - switch { - case candidate.Score() >= score: - return "" // no further peers can be scored lower, due to sorting - case !m.connected[candidate.ID]: - case m.evict[candidate.ID]: - case m.evicting[candidate.ID]: - case m.upgrading[candidate.ID] != "": - default: - return candidate.ID - } - } - return "" -} - -// GetHeight returns a peer's height, as reported via SetHeight. If the peer -// or height is unknown, this returns 0. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. -func (m *PeerManager) GetHeight(peerID NodeID) int64 { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, _ := m.store.Get(peerID) - return peer.Height -} - -// SetHeight stores a peer's height, making it available via GetHeight. If the -// peer is unknown, it is created. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. -func (m *PeerManager) SetHeight(peerID NodeID, height int64) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - peer.Height = height - return m.store.Set(peer) -} - -// peerStore stores information about peers. It is not thread-safe, assuming -// it is used only by PeerManager which handles concurrency control, allowing -// it to execute multiple operations atomically via its own mutex. -// -// The entire set of peers is kept in memory, for performance. It is loaded -// from disk on initialization, and any changes are written back to disk -// (without fsync, since we can afford to lose recent writes). -type peerStore struct { - db dbm.DB - peers map[NodeID]*peerInfo - ranked []*peerInfo // cache for Ranked(), nil invalidates cache -} - -// newPeerStore creates a new peer store, loading all persisted peers from the -// database into memory. -func newPeerStore(db dbm.DB) (*peerStore, error) { - store := &peerStore{ - db: db, - } - if err := store.loadPeers(); err != nil { - return nil, err - } - return store, nil -} - -// loadPeers loads all peers from the database into memory. -func (s *peerStore) loadPeers() error { - peers := make(map[NodeID]*peerInfo) - - start, end := keyPeerInfoRange() - iter, err := s.db.Iterator(start, end) - if err != nil { - return err - } - defer iter.Close() - for ; iter.Valid(); iter.Next() { - // FIXME: We may want to tolerate failures here, by simply logging - // the errors and ignoring the faulty peer entries. - msg := new(p2pproto.PeerInfo) - if err := proto.Unmarshal(iter.Value(), msg); err != nil { - return fmt.Errorf("invalid peer Protobuf data: %w", err) - } - peer, err := peerInfoFromProto(msg) - if err != nil { - return fmt.Errorf("invalid peer data: %w", err) - } - peers[peer.ID] = peer - } - if iter.Error() != nil { - return iter.Error() - } - s.peers = peers - s.ranked = nil // invalidate cache if populated - return nil -} - -// Get fetches a peer. The boolean indicates whether the peer existed or not. -// The returned peer info is a copy, and can be mutated at will. -func (s *peerStore) Get(id NodeID) (peerInfo, bool) { - peer, ok := s.peers[id] - return peer.Copy(), ok -} - -// Set stores peer data. The input data will be copied, and can safely be reused -// by the caller. -func (s *peerStore) Set(peer peerInfo) error { - if err := peer.Validate(); err != nil { - return err - } - peer = peer.Copy() - - // FIXME: We may want to optimize this by avoiding saving to the database - // if there haven't been any changes to persisted fields. - bz, err := peer.ToProto().Marshal() - if err != nil { - return err - } - if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { - return err - } - - if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { - // If the peer is new, or its score changes, we invalidate the Ranked() cache. - s.peers[peer.ID] = &peer - s.ranked = nil - } else { - // Otherwise, since s.ranked contains pointers to the old data and we - // want those pointers to remain valid with the new data, we have to - // update the existing pointer address. - *current = peer - } - - return nil -} - -// Delete deletes a peer, or does nothing if it does not exist. -func (s *peerStore) Delete(id NodeID) error { - if _, ok := s.peers[id]; !ok { - return nil - } - if err := s.db.Delete(keyPeerInfo(id)); err != nil { - return err - } - delete(s.peers, id) - s.ranked = nil - return nil -} - -// List retrieves all peers in an arbitrary order. The returned data is a copy, -// and can be mutated at will. -func (s *peerStore) List() []peerInfo { - peers := make([]peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - peers = append(peers, peer.Copy()) - } - return peers -} - -// Ranked returns a list of peers ordered by score (better peers first). Peers -// with equal scores are returned in an arbitrary order. The returned list must -// not be mutated or accessed concurrently by the caller, since it returns -// pointers to internal peerStore data for performance. -// -// Ranked is used to determine both which peers to dial, which ones to evict, -// and which ones to delete completely. -// -// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated -// by setting it to nil, but if necessary we should use a better data structure -// for this (e.g. a heap or ordered map). -// -// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). -func (s *peerStore) Ranked() []*peerInfo { - if s.ranked != nil { - return s.ranked - } - s.ranked = make([]*peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - s.ranked = append(s.ranked, peer) - } - sort.Slice(s.ranked, func(i, j int) bool { - // FIXME: If necessary, consider precomputing scores before sorting, - // to reduce the number of Score() calls. - return s.ranked[i].Score() > s.ranked[j].Score() - }) - return s.ranked -} - -// Size returns the number of peers in the peer store. -func (s *peerStore) Size() int { - return len(s.peers) -} - -// peerInfo contains peer information stored in a peerStore. -type peerInfo struct { - ID NodeID - AddressInfo map[string]*peerAddressInfo - LastConnected time.Time - - // These fields are ephemeral, i.e. not persisted to the database. - Persistent bool - Height int64 -} - -// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, -// erroring if the data is invalid. -func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { - p := &peerInfo{ - ID: NodeID(msg.ID), - AddressInfo: map[string]*peerAddressInfo{}, - } - if msg.LastConnected != nil { - p.LastConnected = *msg.LastConnected - } - for _, addr := range msg.AddressInfo { - addressInfo, err := peerAddressInfoFromProto(addr) - if err != nil { - return nil, err - } - p.AddressInfo[addressInfo.Address.String()] = addressInfo - } - return p, p.Validate() -} - -// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The -// Protobuf type only contains persisted fields, while ephemeral fields are -// discarded. The returned message may contain pointers to original data, since -// it is expected to be serialized immediately. -func (p *peerInfo) ToProto() *p2pproto.PeerInfo { - msg := &p2pproto.PeerInfo{ - ID: string(p.ID), - LastConnected: &p.LastConnected, - } - for _, addressInfo := range p.AddressInfo { - msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) - } - if msg.LastConnected.IsZero() { - msg.LastConnected = nil - } - return msg -} - -// Copy returns a deep copy of the peer info. -func (p *peerInfo) Copy() peerInfo { - if p == nil { - return peerInfo{} - } - c := *p - for i, addressInfo := range c.AddressInfo { - addressInfoCopy := addressInfo.Copy() - c.AddressInfo[i] = &addressInfoCopy - } - return c -} - -// Score calculates a score for the peer. Higher-scored peers will be -// preferred over lower scores. -func (p *peerInfo) Score() PeerScore { - var score PeerScore - if p.Persistent { - score += PeerScorePersistent - } - return score -} - -// Validate validates the peer info. -func (p *peerInfo) Validate() error { - if p.ID == "" { - return errors.New("no peer ID") - } - return nil -} - -// peerAddressInfo contains information and statistics about a peer address. -type peerAddressInfo struct { - Address NodeAddress - LastDialSuccess time.Time - LastDialFailure time.Time - DialFailures uint32 // since last successful dial -} - -// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message -// to a peerAddressInfo. -func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { - address, err := ParseNodeAddress(msg.Address) - if err != nil { - return nil, fmt.Errorf("invalid address %q: %w", address, err) - } - addressInfo := &peerAddressInfo{ - Address: address, - DialFailures: msg.DialFailures, - } - if msg.LastDialSuccess != nil { - addressInfo.LastDialSuccess = *msg.LastDialSuccess - } - if msg.LastDialFailure != nil { - addressInfo.LastDialFailure = *msg.LastDialFailure - } - return addressInfo, addressInfo.Validate() -} - -// ToProto converts the address into to a Protobuf message for serialization. -func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { - msg := &p2pproto.PeerAddressInfo{ - Address: a.Address.String(), - LastDialSuccess: &a.LastDialSuccess, - LastDialFailure: &a.LastDialFailure, - DialFailures: a.DialFailures, - } - if msg.LastDialSuccess.IsZero() { - msg.LastDialSuccess = nil - } - if msg.LastDialFailure.IsZero() { - msg.LastDialFailure = nil - } - return msg -} - -// Copy returns a copy of the address info. -func (a *peerAddressInfo) Copy() peerAddressInfo { - return *a -} - -// Validate validates the address info. -func (a *peerAddressInfo) Validate() error { - return a.Address.Validate() -} - -// These are database key prefixes. -const ( - prefixPeerInfo int64 = 1 -) - -// keyPeerInfo generates a peerInfo database key. -func keyPeerInfo(id NodeID) []byte { - key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) - if err != nil { - panic(err) - } - return key -} - -// keyPeerInfoPrefix generates start/end keys for the entire peerInfo key range. -func keyPeerInfoRange() ([]byte, []byte) { - start, err := orderedcode.Append(nil, prefixPeerInfo, "") - if err != nil { - panic(err) - } - end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) - if err != nil { - panic(err) - } - return start, end -} - -// ============================================================================ -// Types and business logic below may be deprecated. -// -// TODO: Rename once legacy p2p types are removed. -// ref: https://github.com/tendermint/tendermint/issues/5670 -// ============================================================================ - //go:generate mockery --case underscore --name Peer const metricsTickerDuration = 10 * time.Second diff --git a/p2p/peermanager.go b/p2p/peermanager.go new file mode 100644 index 000000000..254f93a63 --- /dev/null +++ b/p2p/peermanager.go @@ -0,0 +1,1275 @@ +package p2p + +import ( + "context" + "errors" + "fmt" + "math" + "math/rand" + "sort" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/google/orderedcode" + dbm "github.com/tendermint/tm-db" + + tmsync "github.com/tendermint/tendermint/libs/sync" + p2pproto "github.com/tendermint/tendermint/proto/tendermint/p2p" +) + +const ( + // retryNever is returned by retryDelay() when retries are disabled. + retryNever time.Duration = math.MaxInt64 +) + +// PeerStatus is a peer status. +// +// The peer manager has many more internal states for a peer (e.g. dialing, +// connected, evicting, and so on), which are tracked separately. PeerStatus is +// for external use outside of the peer manager. +type PeerStatus string + +const ( + PeerStatusUp PeerStatus = "up" // connected and ready + PeerStatusDown PeerStatus = "down" // disconnected +) + +// PeerScore is a numeric score assigned to a peer (higher is better). +type PeerScore uint8 + +const ( + PeerScorePersistent PeerScore = 100 // persistent peers +) + +// PeerUpdate is a peer update event sent via PeerUpdates. +type PeerUpdate struct { + NodeID NodeID + Status PeerStatus +} + +// PeerUpdates is a peer update subscription with notifications about peer +// events (currently just status changes). +type PeerUpdates struct { + updatesCh chan PeerUpdate + closeCh chan struct{} + closeOnce sync.Once +} + +// NewPeerUpdates creates a new PeerUpdates subscription. It is primarily for +// internal use, callers should typically use PeerManager.Subscribe(). The +// subscriber must call Close() when done. +func NewPeerUpdates(updatesCh chan PeerUpdate) *PeerUpdates { + return &PeerUpdates{ + updatesCh: updatesCh, + closeCh: make(chan struct{}), + } +} + +// Updates returns a channel for consuming peer updates. +func (pu *PeerUpdates) Updates() <-chan PeerUpdate { + return pu.updatesCh +} + +// Close closes the peer updates subscription. +func (pu *PeerUpdates) Close() { + pu.closeOnce.Do(func() { + // NOTE: We don't close updatesCh since multiple goroutines may be + // sending on it. The PeerManager senders will select on closeCh as well + // to avoid blocking on a closed subscription. + close(pu.closeCh) + }) +} + +// Done returns a channel that is closed when the subscription is closed. +func (pu *PeerUpdates) Done() <-chan struct{} { + return pu.closeCh +} + +// PeerManagerOptions specifies options for a PeerManager. +type PeerManagerOptions struct { + // PersistentPeers are peers that we want to maintain persistent connections + // to. These will be scored higher than other peers, and if + // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if + // necessary to make room for these. + PersistentPeers []NodeID + + // MaxPeers is the maximum number of peers to track information about, i.e. + // store in the peer store. When exceeded, the lowest-scored unconnected peers + // will be deleted. 0 means no limit. + MaxPeers uint16 + + // MaxConnected is the maximum number of connected peers (inbound and + // outbound). 0 means no limit. + MaxConnected uint16 + + // MaxConnectedUpgrade is the maximum number of additional connections to + // use for probing any better-scored peers to upgrade to when all connection + // slots are full. 0 disables peer upgrading. + // + // For example, if we are already connected to MaxConnected peers, but we + // know or learn about better-scored peers (e.g. configured persistent + // peers) that we are not connected too, then we can probe these peers by + // using up to MaxConnectedUpgrade connections, and once connected evict the + // lowest-scored connected peers. This also works for inbound connections, + // i.e. if a higher-scored peer attempts to connect to us, we can accept + // the connection and evict a lower-scored peer. + MaxConnectedUpgrade uint16 + + // MinRetryTime is the minimum time to wait between retries. Retry times + // double for each retry, up to MaxRetryTime. 0 disables retries. + MinRetryTime time.Duration + + // MaxRetryTime is the maximum time to wait between retries. 0 means + // no maximum, in which case the retry time will keep doubling. + MaxRetryTime time.Duration + + // MaxRetryTimePersistent is the maximum time to wait between retries for + // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. + MaxRetryTimePersistent time.Duration + + // RetryTimeJitter is the upper bound of a random interval added to + // retry times, to avoid thundering herds. 0 disables jitter. + RetryTimeJitter time.Duration + + // PeerScores sets fixed scores for specific peers. It is mainly used + // for testing. A score of 0 is ignored. + PeerScores map[NodeID]PeerScore + + // persistentPeers provides fast PersistentPeers lookups. It is built + // by optimize(). + persistentPeers map[NodeID]bool +} + +// Validate validates the options. +func (o *PeerManagerOptions) Validate() error { + for _, id := range o.PersistentPeers { + if err := id.Validate(); err != nil { + return fmt.Errorf("invalid PersistentPeer ID %q: %w", id, err) + } + } + if o.MaxConnected > 0 && len(o.PersistentPeers) > int(o.MaxConnected) { + return fmt.Errorf("number of persistent peers %v can't exceed MaxConnected %v", + len(o.PersistentPeers), o.MaxConnected) + } + + if o.MaxPeers > 0 { + if o.MaxConnected == 0 || o.MaxConnected+o.MaxConnectedUpgrade > o.MaxPeers { + return fmt.Errorf("MaxConnected %v and MaxConnectedUpgrade %v can't exceed MaxPeers %v", // nolint + o.MaxConnected, o.MaxConnectedUpgrade, o.MaxPeers) + } + } + + if o.MaxRetryTime > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTime without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTime { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTime %v", // nolint + o.MinRetryTime, o.MaxRetryTime) + } + } + if o.MaxRetryTimePersistent > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTimePersistent without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTimePersistent { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTimePersistent %v", // nolint + o.MinRetryTime, o.MaxRetryTimePersistent) + } + } + + return nil +} + +// isPersistentPeer checks if a peer is in PersistentPeers. It will panic +// if called before optimize(). +func (o *PeerManagerOptions) isPersistent(id NodeID) bool { + if o.persistentPeers == nil { + panic("isPersistentPeer() called before optimize()") + } + return o.persistentPeers[id] +} + +// optimize optimizes operations by pregenerating lookup structures. It's a +// separate method instead of memoizing during calls to avoid dealing with +// concurrency and mutex overhead. +func (o *PeerManagerOptions) optimize() { + o.persistentPeers = make(map[NodeID]bool, len(o.PersistentPeers)) + for _, p := range o.PersistentPeers { + o.persistentPeers[p] = true + } +} + +// PeerManager manages peer lifecycle information, using a peerStore for +// underlying storage. Its primary purpose is to determine which peer to connect +// to next (including retry timers), make sure a peer only has a single active +// connection (either inbound or outbound), and evict peers to make room for +// higher-scored peers. It does not manage actual connections (this is handled +// by the Router), only the peer lifecycle state. +// +// For an outbound connection, the flow is as follows: +// - DialNext: return a peer address to dial, mark peer as dialing. +// - DialFailed: report a dial failure, unmark as dialing. +// - Dialed: report a dial success, unmark as dialing and mark as connected +// (errors if already connected, e.g. by Accepted). +// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp. +// - Disconnected: report peer disconnect, unmark as connected and broadcasts +// PeerStatusDown. +// +// For an inbound connection, the flow is as follows: +// - Accepted: report inbound connection success, mark as connected (errors if +// already connected, e.g. by Dialed). +// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp. +// - Disconnected: report peer disconnect, unmark as connected and broadcasts +// PeerStatusDown. +// +// When evicting peers, either because peers are explicitly scheduled for +// eviction or we are connected to too many peers, the flow is as follows: +// - EvictNext: if marked evict and connected, unmark evict and mark evicting. +// If beyond MaxConnected, pick lowest-scored peer and mark evicting. +// - Disconnected: unmark connected, evicting, evict, and broadcast a +// PeerStatusDown peer update. +// +// If all connection slots are full (at MaxConnections), we can use up to +// MaxConnectionsUpgrade additional connections to probe any higher-scored +// unconnected peers, and if we reach them (or they reach us) we allow the +// connection and evict a lower-scored peer. We mark the lower-scored peer as +// upgrading[from]=to to make sure no other higher-scored peers can claim the +// same one for an upgrade. The flow is as follows: +// - Accepted: if upgrade is possible, mark connected and add lower-scored to evict. +// - DialNext: if upgrade is possible, mark upgrading[from]=to and dialing. +// - DialFailed: unmark upgrading[from]=to and dialing. +// - Dialed: unmark upgrading[from]=to and dialing, mark as connected, add +// lower-scored to evict. +// - EvictNext: pick peer from evict, mark as evicting. +// - Disconnected: unmark connected, upgrading[from]=to, evict, evicting. +// +// FIXME: The old stack supports ABCI-based peer ID filtering via +// /p2p/filter/id/ queries, we should implement this here as well by taking +// a peer ID filtering callback in PeerManagerOptions and configuring it during +// Node setup. +type PeerManager struct { + options PeerManagerOptions + rand *rand.Rand + dialWaker *tmsync.Waker // wakes up DialNext() on relevant peer changes + evictWaker *tmsync.Waker // wakes up EvictNext() on relevant peer changes + closeCh chan struct{} // signal channel for Close() + closeOnce sync.Once + + mtx sync.Mutex + store *peerStore + subscriptions map[*PeerUpdates]*PeerUpdates // keyed by struct identity (address) + dialing map[NodeID]bool // peers being dialed (DialNext → Dialed/DialFail) + upgrading map[NodeID]NodeID // peers claimed for upgrade (DialNext → Dialed/DialFail) + connected map[NodeID]bool // connected peers (Dialed/Accepted → Disconnected) + ready map[NodeID]bool // ready peers (Ready → Disconnected) + evict map[NodeID]bool // peers scheduled for eviction (Connected → EvictNext) + evicting map[NodeID]bool // peers being evicted (EvictNext → Disconnected) +} + +// NewPeerManager creates a new peer manager. +func NewPeerManager(peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { + store, err := newPeerStore(peerDB) + if err != nil { + return nil, err + } + if err = options.Validate(); err != nil { + return nil, err + } + options.optimize() + + peerManager := &PeerManager{ + options: options, + rand: rand.New(rand.NewSource(time.Now().UnixNano())), // nolint:gosec + dialWaker: tmsync.NewWaker(), + evictWaker: tmsync.NewWaker(), + closeCh: make(chan struct{}), + + store: store, + dialing: map[NodeID]bool{}, + upgrading: map[NodeID]NodeID{}, + connected: map[NodeID]bool{}, + ready: map[NodeID]bool{}, + evict: map[NodeID]bool{}, + evicting: map[NodeID]bool{}, + subscriptions: map[*PeerUpdates]*PeerUpdates{}, + } + if err = peerManager.configurePeers(); err != nil { + return nil, err + } + if err = peerManager.prunePeers(); err != nil { + return nil, err + } + return peerManager, nil +} + +// configurePeers configures peers in the peer store with ephemeral runtime +// configuration, e.g. PersistentPeers. The caller must hold the mutex lock. +func (m *PeerManager) configurePeers() error { + configure := map[NodeID]bool{} + for _, id := range m.options.PersistentPeers { + configure[id] = true + } + for id := range m.options.PeerScores { + configure[id] = true + } + for id := range configure { + if peer, ok := m.store.Get(id); ok { + if err := m.store.Set(m.configurePeer(peer)); err != nil { + return err + } + } + } + return nil +} + +// configurePeer configures a peer with ephemeral runtime configuration. +func (m *PeerManager) configurePeer(peer peerInfo) peerInfo { + peer.Persistent = m.options.isPersistent(peer.ID) + peer.FixedScore = m.options.PeerScores[peer.ID] + return peer +} + +// newPeerInfo creates a peerInfo for a new peer. +func (m *PeerManager) newPeerInfo(id NodeID) peerInfo { + peerInfo := peerInfo{ + ID: id, + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + return m.configurePeer(peerInfo) +} + +// prunePeers removes low-scored peers from the peer store if it contains more +// than MaxPeers peers. The caller must hold the mutex lock. +func (m *PeerManager) prunePeers() error { + if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { + return nil + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peerID := ranked[i].ID + switch { + case m.store.Size() <= int(m.options.MaxPeers): + break + case m.dialing[peerID]: + case m.connected[peerID]: + default: + if err := m.store.Delete(peerID); err != nil { + return err + } + } + } + return nil +} + +// Add adds a peer to the manager, given as an address. If the peer already +// exists, the address is added to it if not already present. +func (m *PeerManager) Add(address NodeAddress) error { + if err := address.Validate(); err != nil { + return err + } + + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(address.NodeID) + if !ok { + peer = m.newPeerInfo(address.NodeID) + } + if _, ok := peer.AddressInfo[address]; !ok { + peer.AddressInfo[address] = &peerAddressInfo{Address: address} + } + if err := m.store.Set(peer); err != nil { + return err + } + if err := m.prunePeers(); err != nil { + return err + } + m.dialWaker.Wake() + return nil +} + +// DialNext finds an appropriate peer address to dial, and marks it as dialing. +// If no peer is found, or all connection slots are full, it blocks until one +// becomes available. The caller must call Dialed() or DialFailed() for the +// returned peer. +func (m *PeerManager) DialNext(ctx context.Context) (NodeAddress, error) { + for { + address, err := m.TryDialNext() + if err != nil || (address != NodeAddress{}) { + return address, err + } + select { + case <-m.dialWaker.Sleep(): + case <-ctx.Done(): + return NodeAddress{}, ctx.Err() + } + } +} + +// TryDialNext is equivalent to DialNext(), but immediately returns an empty +// address if no peers or connection slots are available. +func (m *PeerManager) TryDialNext() (NodeAddress, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including + // MaxConnectedUpgrade allows us to probe additional peers that have a + // higher score than any other peers, and if successful evict it. + if m.options.MaxConnected > 0 && len(m.connected)+len(m.dialing) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return NodeAddress{}, nil + } + + for _, peer := range m.store.Ranked() { + if m.dialing[peer.ID] || m.connected[peer.ID] { + continue + } + + for _, addressInfo := range peer.AddressInfo { + if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { + continue + } + + // We now have an eligible address to dial. If we're full but have + // upgrade capacity (as checked above), we find a lower-scored peer + // we can replace and mark it as upgrading so noone else claims it. + // + // If we don't find one, there is no point in trying additional + // peers, since they will all have the same or lower score than this + // peer (since they're ordered by score via peerStore.Ranked). + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return NodeAddress{}, nil + } + m.upgrading[upgradeFromPeer] = peer.ID + } + + m.dialing[peer.ID] = true + return addressInfo.Address, nil + } + } + return NodeAddress{}, nil +} + +// DialFailed reports a failed dial attempt. This will make the peer available +// for dialing again when appropriate (possibly after a retry timeout). +// +// FIXME: This should probably delete or mark bad addresses/peers after some time. +func (m *PeerManager) DialFailed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) // Unmark failed upgrade attempt. + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { // Peer may have been removed while dialing, ignore. + return nil + } + addressInfo, ok := peer.AddressInfo[address] + if !ok { + return nil // Assume the address has been removed, ignore. + } + addressInfo.LastDialFailure = time.Now().UTC() + addressInfo.DialFailures++ + if err := m.store.Set(peer); err != nil { + return err + } + + // We spawn a goroutine that notifies DialNext() again when the retry + // timeout has elapsed, so that we can consider dialing it again. We + // calculate the retry delay outside the goroutine, since it must hold + // the mutex lock. + if d := m.retryDelay(addressInfo.DialFailures, peer.Persistent); d != retryNever { + go func() { + // Use an explicit timer with deferred cleanup instead of + // time.After(), to avoid leaking goroutines on PeerManager.Close(). + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-timer.C: + m.dialWaker.Wake() + case <-m.closeCh: + } + }() + } + + m.dialWaker.Wake() + return nil +} + +// Dialed marks a peer as successfully dialed. Any further connections will be +// rejected, and once disconnected the peer may be dialed again. +func (m *PeerManager) Dialed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + + var upgradeFromPeer NodeID + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) + upgradeFromPeer = from + // Don't break, just in case this peer was marked as upgrading for + // multiple lower-scored peers (shouldn't really happen). + } + } + + if m.connected[address.NodeID] { + return fmt.Errorf("peer %v is already connected", address.NodeID) + } + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + if upgradeFromPeer == "" || len(m.connected) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { + return fmt.Errorf("peer %q was removed while dialing", address.NodeID) + } + now := time.Now().UTC() + peer.LastConnected = now + if addressInfo, ok := peer.AddressInfo[address]; ok { + addressInfo.DialFailures = 0 + addressInfo.LastDialSuccess = now + // If not found, assume address has been removed. + } + if err := m.store.Set(peer); err != nil { + return err + } + + if upgradeFromPeer != "" && m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected) { + // Look for an even lower-scored peer that may have appeared since we + // started the upgrade. + if p, ok := m.store.Get(upgradeFromPeer); ok { + if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { + upgradeFromPeer = u + } + } + m.evict[upgradeFromPeer] = true + } + m.connected[peer.ID] = true + m.evictWaker.Wake() + + return nil +} + +// Accepted marks an incoming peer connection successfully accepted. If the peer +// is already connected or we don't allow additional connections then this will +// return an error. +// +// If full but MaxConnectedUpgrade is non-zero and the incoming peer is +// better-scored than any existing peers, then we accept it and evict a +// lower-scored peer. +// +// NOTE: We can't take an address here, since e.g. TCP uses a different port +// number for outbound traffic than inbound traffic, so the peer's endpoint +// wouldn't necessarily be an appropriate address to dial. +// +// FIXME: When we accept a connection from a peer, we should register that +// peer's address in the peer store so that we can dial it later. In order to do +// that, we'll need to get the remote address after all, but as noted above that +// can't be the remote endpoint since that will usually have the wrong port +// number. +func (m *PeerManager) Accepted(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + return fmt.Errorf("peer %q is already connected", peerID) + } + if m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + + // If all connections slots are full, but we allow upgrades (and we checked + // above that we have upgrade capacity), then we can look for a lower-scored + // peer to replace and if found accept the connection anyway and evict it. + var upgradeFromPeer NodeID + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer.LastConnected = time.Now().UTC() + if err := m.store.Set(peer); err != nil { + return err + } + + m.connected[peerID] = true + if upgradeFromPeer != "" { + m.evict[upgradeFromPeer] = true + } + m.evictWaker.Wake() + return nil +} + +// Ready marks a peer as ready, broadcasting status updates to subscribers. The +// peer must already be marked as connected. This is separate from Dialed() and +// Accepted() to allow the router to set up its internal queues before reactors +// start sending messages. +func (m *PeerManager) Ready(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.ready[peerID] = true + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusUp, + }) + } + return nil +} + +// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable +// peers are found, the call will block until one becomes available. +func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { + for { + id, err := m.TryEvictNext() + if err != nil || id != "" { + return id, err + } + select { + case <-m.evictWaker.Sleep(): + case <-ctx.Done(): + return "", ctx.Err() + } + } +} + +// TryEvictNext is equivalent to EvictNext, but immediately returns an empty +// node ID if no evictable peers are found. +func (m *PeerManager) TryEvictNext() (NodeID, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // If any connected peers are explicitly scheduled for eviction, we return a + // random one. + for peerID := range m.evict { + delete(m.evict, peerID) + if m.connected[peerID] && !m.evicting[peerID] { + m.evicting[peerID] = true + return peerID, nil + } + } + + // If we're below capacity, we don't need to evict anything. + if m.options.MaxConnected == 0 || + len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { + return "", nil + } + + // If we're above capacity (shouldn't really happen), just pick the + // lowest-ranked peer to evict. + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peer := ranked[i] + if m.connected[peer.ID] && !m.evicting[peer.ID] { + m.evicting[peer.ID] = true + return peer.ID, nil + } + } + + return "", nil +} + +// Disconnected unmarks a peer as connected, allowing it to be dialed or +// accepted again as appropriate. +func (m *PeerManager) Disconnected(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + ready := m.ready[peerID] + + delete(m.connected, peerID) + delete(m.upgrading, peerID) + delete(m.evict, peerID) + delete(m.evicting, peerID) + delete(m.ready, peerID) + + if ready { + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusDown, + }) + } + + m.dialWaker.Wake() + return nil +} + +// Errored reports a peer error, causing the peer to be evicted if it's +// currently connected. +// +// FIXME: This should probably be replaced with a peer behavior API, see +// PeerError comments for more details. +// +// FIXME: This will cause the peer manager to immediately try to reconnect to +// the peer, which is probably not always what we want. +func (m *PeerManager) Errored(peerID NodeID, err error) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.evict[peerID] = true + } + + m.evictWaker.Wake() + return nil +} + +// Advertise returns a list of peer addresses to advertise to a peer. +// +// FIXME: This is fairly naïve and only returns the addresses of the +// highest-ranked peers. +func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := make([]NodeAddress, 0, limit) + for _, peer := range m.store.Ranked() { + if peer.ID == peerID { + continue + } + for _, addressInfo := range peer.AddressInfo { + if len(addresses) >= int(limit) { + return addresses + } + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Subscribe subscribes to peer updates. The caller must consume the peer +// updates in a timely fashion and close the subscription when done, otherwise +// the PeerManager will halt. +func (m *PeerManager) Subscribe() *PeerUpdates { + // FIXME: We use a size 1 buffer here. When we broadcast a peer update + // we have to loop over all of the subscriptions, and we want to avoid + // having to block and wait for a context switch before continuing on + // to the next subscriptions. This also prevents tail latencies from + // compounding. Limiting it to 1 means that the subscribers are still + // reasonably in sync. However, this should probably be benchmarked. + peerUpdates := NewPeerUpdates(make(chan PeerUpdate, 1)) + m.mtx.Lock() + m.subscriptions[peerUpdates] = peerUpdates + m.mtx.Unlock() + + go func() { + select { + case <-peerUpdates.Done(): + m.mtx.Lock() + delete(m.subscriptions, peerUpdates) + m.mtx.Unlock() + case <-m.closeCh: + } + }() + return peerUpdates +} + +// broadcast broadcasts a peer update to all subscriptions. The caller must +// already hold the mutex lock, to make sure updates are sent in the same order +// as the PeerManager processes them, but this means subscribers must be +// responsive at all times or the entire PeerManager will halt. +// +// FIXME: Consider using an internal channel to buffer updates while also +// maintaining order if this is a problem. +func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { + for _, sub := range m.subscriptions { + // We have to check closeCh separately first, otherwise there's a 50% + // chance the second select will send on a closed subscription. + select { + case <-sub.closeCh: + continue + default: + } + select { + case sub.updatesCh <- peerUpdate: + case <-sub.closeCh: + } + } +} + +// Close closes the peer manager, releasing resources (i.e. goroutines). +func (m *PeerManager) Close() { + m.closeOnce.Do(func() { + close(m.closeCh) + }) +} + +// Addresses returns all known addresses for a peer, primarily for testing. +// The order is arbitrary. +func (m *PeerManager) Addresses(peerID NodeID) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := []NodeAddress{} + if peer, ok := m.store.Get(peerID); ok { + for _, addressInfo := range peer.AddressInfo { + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Peers returns all known peers, primarily for testing. The order is arbitrary. +func (m *PeerManager) Peers() []NodeID { + m.mtx.Lock() + defer m.mtx.Unlock() + + peers := []NodeID{} + for _, peer := range m.store.Ranked() { + peers = append(peers, peer.ID) + } + return peers +} + +// Scores returns the peer scores for all known peers, primarily for testing. +func (m *PeerManager) Scores() map[NodeID]PeerScore { + m.mtx.Lock() + defer m.mtx.Unlock() + + scores := map[NodeID]PeerScore{} + for _, peer := range m.store.Ranked() { + scores[peer.ID] = peer.Score() + } + return scores +} + +// Status returns the status for a peer, primarily for testing. +func (m *PeerManager) Status(id NodeID) PeerStatus { + m.mtx.Lock() + defer m.mtx.Unlock() + switch { + case m.ready[id]: + return PeerStatusUp + default: + return PeerStatusDown + } +} + +// findUpgradeCandidate looks for a lower-scored peer that we could evict +// to make room for the given peer. Returns an empty ID if none is found. +// If the peer is already being upgraded to, we return that same upgrade. +// The caller must hold the mutex lock. +func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { + for from, to := range m.upgrading { + if to == id { + return from + } + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + candidate := ranked[i] + switch { + case candidate.Score() >= score: + return "" // no further peers can be scored lower, due to sorting + case !m.connected[candidate.ID]: + case m.evict[candidate.ID]: + case m.evicting[candidate.ID]: + case m.upgrading[candidate.ID] != "": + default: + return candidate.ID + } + } + return "" +} + +// retryDelay calculates a dial retry delay using exponential backoff, based on +// retry settings in PeerManagerOptions. If retries are disabled (i.e. +// MinRetryTime is 0), this returns retryNever (i.e. an infinite retry delay). +// The caller must hold the mutex lock (for m.rand which is not thread-safe). +func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { + if failures == 0 { + return 0 + } + if m.options.MinRetryTime == 0 { + return retryNever + } + maxDelay := m.options.MaxRetryTime + if persistent && m.options.MaxRetryTimePersistent > 0 { + maxDelay = m.options.MaxRetryTimePersistent + } + + delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures-1))) + if maxDelay > 0 && delay > maxDelay { + delay = maxDelay + } + if m.options.RetryTimeJitter > 0 { + delay += time.Duration(m.rand.Int63n(int64(m.options.RetryTimeJitter))) + } + return delay +} + +// GetHeight returns a peer's height, as reported via SetHeight, or 0 if the +// peer or height is unknown. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. +func (m *PeerManager) GetHeight(peerID NodeID) int64 { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, _ := m.store.Get(peerID) + return peer.Height +} + +// SetHeight stores a peer's height, making it available via GetHeight. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. +func (m *PeerManager) SetHeight(peerID NodeID, height int64) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + peer.Height = height + return m.store.Set(peer) +} + +// peerStore stores information about peers. It is not thread-safe, assuming it +// is only used by PeerManager which handles concurrency control. This allows +// the manager to execute multiple operations atomically via its own mutex. +// +// The entire set of peers is kept in memory, for performance. It is loaded +// from disk on initialization, and any changes are written back to disk +// (without fsync, since we can afford to lose recent writes). +type peerStore struct { + db dbm.DB + peers map[NodeID]*peerInfo + ranked []*peerInfo // cache for Ranked(), nil invalidates cache +} + +// newPeerStore creates a new peer store, loading all persisted peers from the +// database into memory. +func newPeerStore(db dbm.DB) (*peerStore, error) { + if db == nil { + return nil, errors.New("no database provided") + } + store := &peerStore{db: db} + if err := store.loadPeers(); err != nil { + return nil, err + } + return store, nil +} + +// loadPeers loads all peers from the database into memory. +func (s *peerStore) loadPeers() error { + peers := map[NodeID]*peerInfo{} + + start, end := keyPeerInfoRange() + iter, err := s.db.Iterator(start, end) + if err != nil { + return err + } + defer iter.Close() + for ; iter.Valid(); iter.Next() { + // FIXME: We may want to tolerate failures here, by simply logging + // the errors and ignoring the faulty peer entries. + msg := new(p2pproto.PeerInfo) + if err := proto.Unmarshal(iter.Value(), msg); err != nil { + return fmt.Errorf("invalid peer Protobuf data: %w", err) + } + peer, err := peerInfoFromProto(msg) + if err != nil { + return fmt.Errorf("invalid peer data: %w", err) + } + peers[peer.ID] = peer + } + if iter.Error() != nil { + return iter.Error() + } + s.peers = peers + s.ranked = nil // invalidate cache if populated + return nil +} + +// Get fetches a peer. The boolean indicates whether the peer existed or not. +// The returned peer info is a copy, and can be mutated at will. +func (s *peerStore) Get(id NodeID) (peerInfo, bool) { + peer, ok := s.peers[id] + return peer.Copy(), ok +} + +// Set stores peer data. The input data will be copied, and can safely be reused +// by the caller. +func (s *peerStore) Set(peer peerInfo) error { + if err := peer.Validate(); err != nil { + return err + } + peer = peer.Copy() + + // FIXME: We may want to optimize this by avoiding saving to the database + // if there haven't been any changes to persisted fields. + bz, err := peer.ToProto().Marshal() + if err != nil { + return err + } + if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { + return err + } + + if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { + // If the peer is new, or its score changes, we invalidate the Ranked() cache. + s.peers[peer.ID] = &peer + s.ranked = nil + } else { + // Otherwise, since s.ranked contains pointers to the old data and we + // want those pointers to remain valid with the new data, we have to + // update the existing pointer address. + *current = peer + } + + return nil +} + +// Delete deletes a peer, or does nothing if it does not exist. +func (s *peerStore) Delete(id NodeID) error { + if _, ok := s.peers[id]; !ok { + return nil + } + if err := s.db.Delete(keyPeerInfo(id)); err != nil { + return err + } + delete(s.peers, id) + s.ranked = nil + return nil +} + +// List retrieves all peers in an arbitrary order. The returned data is a copy, +// and can be mutated at will. +func (s *peerStore) List() []peerInfo { + peers := make([]peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + peers = append(peers, peer.Copy()) + } + return peers +} + +// Ranked returns a list of peers ordered by score (better peers first). Peers +// with equal scores are returned in an arbitrary order. The returned list must +// not be mutated or accessed concurrently by the caller, since it returns +// pointers to internal peerStore data for performance. +// +// Ranked is used to determine both which peers to dial, which ones to evict, +// and which ones to delete completely. +// +// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated +// by setting it to nil, but if necessary we should use a better data structure +// for this (e.g. a heap or ordered map). +// +// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). +func (s *peerStore) Ranked() []*peerInfo { + if s.ranked != nil { + return s.ranked + } + s.ranked = make([]*peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + s.ranked = append(s.ranked, peer) + } + sort.Slice(s.ranked, func(i, j int) bool { + // FIXME: If necessary, consider precomputing scores before sorting, + // to reduce the number of Score() calls. + return s.ranked[i].Score() > s.ranked[j].Score() + }) + return s.ranked +} + +// Size returns the number of peers in the peer store. +func (s *peerStore) Size() int { + return len(s.peers) +} + +// peerInfo contains peer information stored in a peerStore. +type peerInfo struct { + ID NodeID + AddressInfo map[NodeAddress]*peerAddressInfo + LastConnected time.Time + + // These fields are ephemeral, i.e. not persisted to the database. + Persistent bool + Height int64 + FixedScore PeerScore // mainly for tests +} + +// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, +// erroring if the data is invalid. +func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { + p := &peerInfo{ + ID: NodeID(msg.ID), + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + if msg.LastConnected != nil { + p.LastConnected = *msg.LastConnected + } + for _, a := range msg.AddressInfo { + addressInfo, err := peerAddressInfoFromProto(a) + if err != nil { + return nil, err + } + p.AddressInfo[addressInfo.Address] = addressInfo + } + return p, p.Validate() +} + +// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The +// Protobuf type only contains persisted fields, while ephemeral fields are +// discarded. The returned message may contain pointers to original data, since +// it is expected to be serialized immediately. +func (p *peerInfo) ToProto() *p2pproto.PeerInfo { + msg := &p2pproto.PeerInfo{ + ID: string(p.ID), + LastConnected: &p.LastConnected, + } + for _, addressInfo := range p.AddressInfo { + msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) + } + if msg.LastConnected.IsZero() { + msg.LastConnected = nil + } + return msg +} + +// Copy returns a deep copy of the peer info. +func (p *peerInfo) Copy() peerInfo { + if p == nil { + return peerInfo{} + } + c := *p + for i, addressInfo := range c.AddressInfo { + addressInfoCopy := addressInfo.Copy() + c.AddressInfo[i] = &addressInfoCopy + } + return c +} + +// Score calculates a score for the peer. Higher-scored peers will be +// preferred over lower scores. +func (p *peerInfo) Score() PeerScore { + var score PeerScore + if p.FixedScore > 0 { + return p.FixedScore + } + if p.Persistent { + score += PeerScorePersistent + } + return score +} + +// Validate validates the peer info. +func (p *peerInfo) Validate() error { + if p.ID == "" { + return errors.New("no peer ID") + } + return nil +} + +// peerAddressInfo contains information and statistics about a peer address. +type peerAddressInfo struct { + Address NodeAddress + LastDialSuccess time.Time + LastDialFailure time.Time + DialFailures uint32 // since last successful dial +} + +// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message +// to a peerAddressInfo. +func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { + address, err := ParseNodeAddress(msg.Address) + if err != nil { + return nil, fmt.Errorf("invalid address %q: %w", address, err) + } + addressInfo := &peerAddressInfo{ + Address: address, + DialFailures: msg.DialFailures, + } + if msg.LastDialSuccess != nil { + addressInfo.LastDialSuccess = *msg.LastDialSuccess + } + if msg.LastDialFailure != nil { + addressInfo.LastDialFailure = *msg.LastDialFailure + } + return addressInfo, addressInfo.Validate() +} + +// ToProto converts the address into to a Protobuf message for serialization. +func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { + msg := &p2pproto.PeerAddressInfo{ + Address: a.Address.String(), + LastDialSuccess: &a.LastDialSuccess, + LastDialFailure: &a.LastDialFailure, + DialFailures: a.DialFailures, + } + if msg.LastDialSuccess.IsZero() { + msg.LastDialSuccess = nil + } + if msg.LastDialFailure.IsZero() { + msg.LastDialFailure = nil + } + return msg +} + +// Copy returns a copy of the address info. +func (a *peerAddressInfo) Copy() peerAddressInfo { + return *a +} + +// Validate validates the address info. +func (a *peerAddressInfo) Validate() error { + return a.Address.Validate() +} + +// Database key prefixes. +const ( + prefixPeerInfo int64 = 1 +) + +// keyPeerInfo generates a peerInfo database key. +func keyPeerInfo(id NodeID) []byte { + key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) + if err != nil { + panic(err) + } + return key +} + +// keyPeerInfoRange generates start/end keys for the entire peerInfo key range. +func keyPeerInfoRange() ([]byte, []byte) { + start, err := orderedcode.Append(nil, prefixPeerInfo, "") + if err != nil { + panic(err) + } + end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) + if err != nil { + panic(err) + } + return start, end +} diff --git a/p2p/peermanager_test.go b/p2p/peermanager_test.go new file mode 100644 index 000000000..76d8c21a5 --- /dev/null +++ b/p2p/peermanager_test.go @@ -0,0 +1,1580 @@ +package p2p_test + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/fortytw2/leaktest" + "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/p2p" +) + +// FIXME: We should probably have some randomized property-based tests for the +// PeerManager too, which runs a bunch of random operations with random peers +// and ensures certain invariants always hold. The logic can be complex, with +// many interactions, and it's hard to cover all scenarios with handwritten +// tests. + +func TestPeerManagerOptions_Validate(t *testing.T) { + nodeID := p2p.NodeID("00112233445566778899aabbccddeeff00112233") + + testcases := map[string]struct { + options p2p.PeerManagerOptions + ok bool + }{ + "zero options is valid": {p2p.PeerManagerOptions{}, true}, + + // PersistentPeers + "valid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899aabbccddeeff00112233"}, + }, true}, + "invalid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }, false}, + "uppercase PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899AABBCCDDEEFF00112233"}, + }, false}, + "PersistentPeers at MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 3, + }, true}, + "PersistentPeers above MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + }, false}, + "PersistentPeers above MaxConnected below MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 2, + }, false}, + + // MaxPeers + "MaxPeers without MaxConnected": {p2p.PeerManagerOptions{ + MaxPeers: 3, + }, false}, + "MaxPeers below MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 2, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, false}, + "MaxPeers at MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 3, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, true}, + + // MaxRetryTime + "MaxRetryTime below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTime: 5 * time.Second, + }, false}, + "MaxRetryTime at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTime: 5 * time.Second, + }, true}, + "MaxRetryTime without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTime: 5 * time.Second, + }, false}, + + // MaxRetryTimePersistent + "MaxRetryTimePersistent below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + "MaxRetryTimePersistent at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, true}, + "MaxRetryTimePersistent without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + err := tc.options.Validate() + if tc.ok { + require.NoError(t, err) + } else { + require.Error(t, err) + } + }) + } +} + +func TestNewPeerManager(t *testing.T) { + + // Invalid options should error. + _, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }) + require.Error(t, err) + + // Invalid database should error. + _, err = p2p.NewPeerManager(nil, p2p.PeerManagerOptions{}) + require.Error(t, err) + + // Zero options should be valid. + _, err = p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) +} + +func TestNewPeerManager_Persistence(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: aID}, + } + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: bID}, + } + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80}, + {Protocol: "memory", NodeID: cID}, + } + + // Create an initial peer manager and add the peers. + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{bID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + for _, addr := range append(append(aAddresses, bAddresses...), cAddresses...) { + require.NoError(t, peerManager.Add(addr)) + } + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: p2p.PeerScorePersistent, + bID: 1, + cID: 0, + }, peerManager.Scores()) + + peerManager.Close() + + // Creating a new peer manager with the same database should retain the + // peers, but they should have updated scores from the new PersistentPeers + // configuration. + peerManager, err = p2p.NewPeerManager(db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{bID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: 0, + bID: p2p.PeerScorePersistent, + cID: 1, + }, peerManager.Scores()) +} + +func TestPeerManager_Add(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + bID := p2p.NodeID(strings.Repeat("b", 40)) + cID := p2p.NodeID(strings.Repeat("c", 40)) + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID, cID}, + MaxPeers: 2, + MaxConnected: 2, + }) + require.NoError(t, err) + + // Adding a couple of addresses should work. + aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "localhost"}, + {Protocol: "memory", NodeID: aID}, + } + for _, addr := range aAddresses { + err = peerManager.Add(addr) + require.NoError(t, err) + } + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a different peer should be fine. + bAddress := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "localhost"} + require.NoError(t, peerManager.Add(bAddress)) + require.Equal(t, []p2p.NodeAddress{bAddress}, peerManager.Addresses(bID)) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding an existing address again should be a noop. + require.NoError(t, peerManager.Add(aAddresses[0])) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a third peer with MaxPeers=2 should cause bID, which is + // the lowest-scored peer (not in PersistentPeers), to be removed. + require.NoError(t, peerManager.Add(p2p.NodeAddress{ + Protocol: "tcp", NodeID: cID, Hostname: "localhost"})) + require.ElementsMatch(t, []p2p.NodeID{aID, cID}, peerManager.Peers()) + + // Adding an invalid address should error. + require.Error(t, peerManager.Add(p2p.NodeAddress{Path: "foo"})) +} + +func TestPeerManager_DialNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Add an address. DialNext should return it. + require.NoError(t, peerManager.Add(a)) + address, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, address) + + // Since there are no more undialed peers, the next call should block + // until it times out. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.DialNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_DialNext_Retry(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + options := p2p.PeerManagerOptions{ + MinRetryTime: 100 * time.Millisecond, + MaxRetryTime: 500 * time.Millisecond, + } + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), options) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Do five dial retries (six dials total). The retry time should double for + // each failure. At the forth retry, MaxRetryTime should kick in. + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + for i := 0; i <= 5; i++ { + start := time.Now() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + elapsed := time.Since(start).Round(time.Millisecond) + + switch i { + case 0: + require.LessOrEqual(t, elapsed, options.MinRetryTime) + case 1: + require.GreaterOrEqual(t, elapsed, options.MinRetryTime) + case 2: + require.GreaterOrEqual(t, elapsed, 2*options.MinRetryTime) + case 3: + require.GreaterOrEqual(t, elapsed, 4*options.MinRetryTime) + case 4, 5: + require.GreaterOrEqual(t, elapsed, options.MaxRetryTime) + require.LessOrEqual(t, elapsed, 8*options.MinRetryTime) + default: + require.Fail(t, "unexpected retry") + } + + require.NoError(t, peerManager.DialFailed(a)) + } +} + +func TestPeerManager_DialNext_WakeOnAdd(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Spawn a goroutine to add a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(a)) + }() + + // This will block until peer is added above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailed(t *testing.T) { + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + // Add and dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Add b. We shouldn't be able to dial it, due to MaxConnected. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Spawn a goroutine to fail a's dial attempt. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.DialFailed(a)) + }() + + // This should make b available for dialing (not a, retries are disabled). + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, b, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailedRetry(t *testing.T) { + options := p2p.PeerManagerOptions{MinRetryTime: 200 * time.Millisecond} + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), options) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + // Add a, dial it, and mark it a failure. This will start a retry timer. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(dial)) + failed := time.Now() + + // The retry timer should unblock DialNext and make a available again after + // the retry time passes. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + require.GreaterOrEqual(t, time.Since(failed), options.MinRetryTime) +} + +func TestPeerManager_DialNext_WakeOnDisconnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + err = peerManager.Add(a) + require.NoError(t, err) + err = peerManager.Accepted(a.NodeID) + require.NoError(t, err) + + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + }() + + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_TryDialNext_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // At this point, adding c will not allow dialing it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + e := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("e", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 0, + b.NodeID: 1, + c.NodeID: 2, + d.NodeID: 3, + e.NodeID: 0, + }, + PersistentPeers: []p2p.NodeID{c.NodeID, d.NodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Even though we are at capacity, we should be allowed to dial c for an + // upgrade of a, since it's higher-scored. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // However, since we're using all upgrade slots now, we can't add and dial + // d, even though it's also higher-scored. + require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // We go through with c's upgrade. + require.NoError(t, peerManager.Dialed(c)) + + // Still can't dial d. + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Now, if we disconnect a, we should be allowed to dial d because we have a + // free upgrade slot. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + + // However, if we disconnect b (such that only c and d are connected), we + // should not be allowed to dial e even though there are upgrade slots, + // because there are no lower-scored nodes that can be upgraded. + require.NoError(t, peerManager.Disconnected(b.NodeID)) + require.NoError(t, peerManager.Add(e)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_UpgradeReservesPeer(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, because a is the only connected + // peer that can be upgraded, and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) +} + +func TestPeerManager_TryDialNext_DialingConnected(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + a := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "localhost"} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + b := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and dial it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Adding a's TCP address will not dispense a, since it's already dialing. + require.NoError(t, peerManager.Add(aTCP)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Marking a as dialed will still not dispense it. + require.NoError(t, peerManager.Dialed(a)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Adding b and accepting a connection from it will not dispense it either. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(bID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_Multiple(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + bID := p2p.NodeID(strings.Repeat("b", 40)) + addresses := []p2p.NodeAddress{ + {Protocol: "memory", NodeID: aID}, + {Protocol: "memory", NodeID: bID}, + {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1"}, + {Protocol: "tcp", NodeID: bID, Hostname: "::1"}, + } + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + for _, address := range addresses { + require.NoError(t, peerManager.Add(address)) + } + + // All addresses should be dispensed as long as dialing them has failed. + dial := []p2p.NodeAddress{} + for range addresses { + address, err := peerManager.TryDialNext() + require.NoError(t, err) + require.NotZero(t, address) + require.NoError(t, peerManager.DialFailed(address)) + dial = append(dial, address) + } + require.ElementsMatch(t, dial, addresses) + + address, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, address) +} + +func TestPeerManager_DialFailed(t *testing.T) { + // DialFailed is tested through other tests, we'll just check a few basic + // things here, e.g. reporting unknown addresses. + aID := p2p.NodeID(strings.Repeat("a", 40)) + a := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + bID := p2p.NodeID(strings.Repeat("b", 40)) + b := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Dialing and then calling DialFailed with a different address (same + // NodeID) should unmark as dialing and allow us to dial the other address + // again, but not register the failed address. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(p2p.NodeAddress{ + Protocol: "tcp", NodeID: aID, Hostname: "localhost"})) + require.Equal(t, []p2p.NodeAddress{a}, peerManager.Addresses(aID)) + + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Calling DialFailed on same address twice should be fine. + require.NoError(t, peerManager.DialFailed(a)) + require.NoError(t, peerManager.DialFailed(a)) + + // DialFailed on an unknown peer shouldn't error or add it. + require.NoError(t, peerManager.DialFailed(b)) + require.Equal(t, []p2p.NodeID{aID}, peerManager.Peers()) +} + +func TestPeerManager_DialFailed_UnreservePeer(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, even though it could upgrade a and we + // have free upgrade slots, because a is the only connected peer that can be + // upgraded and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // Failing b's dial will now make c available for dialing. + require.NoError(t, peerManager.DialFailed(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) +} + +func TestPeerManager_Dialed_Connected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking a as dialed twice should error. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Dialed(a)) + require.Error(t, peerManager.Dialed(a)) + + // Accepting a connection from b and then trying to mark it as dialed should fail. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + require.NoError(t, peerManager.Accepted(b.NodeID)) + require.Error(t, peerManager.Dialed(b)) +} + +func TestPeerManager_Dialed_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + // Start to dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Marking b as dialed in the meanwhile (even without TryDialNext) + // should be fine. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Completing the dial for a should now error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{c.NodeID: 1, d.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a and b is fine. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Starting an upgrade of c should be fine. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + require.NoError(t, peerManager.Dialed(c)) + + // Trying to mark d dialed should fail, since there are no more upgrade + // slots and a/b haven't been evicted yet. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Dialed(d)) +} + +func TestPeerManager_Dialed_Unknown(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking an unknown node as dialed should error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 2, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a is fine. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Upgrading it with b should work, since b has a higher score. + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + + // a hasn't been evicted yet, but c shouldn't be allowed to upgrade anyway + // since it's about to be evicted. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // a should now be evicted. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Dialed_UpgradeEvenLower(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 3, + b.NodeID: 2, + c.NodeID: 10, + d.NodeID: 1, + }, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Start an upgrade with c, which should pick b to upgrade (since it + // has score 2). + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // In the meanwhile, a disconnects and d connects. d is even lower-scored + // than b (1 vs 2), which is currently being upgraded. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NoError(t, peerManager.Add(d)) + require.NoError(t, peerManager.Accepted(d.NodeID)) + + // Once c completes the upgrade of b, it should instead evict d, + // since it has en even lower score. + require.NoError(t, peerManager.Dialed(c)) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, d.NodeID, evict) +} + +func TestPeerManager_Dialed_UpgradeNoEvict(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 1, + b.NodeID: 2, + c.NodeID: 3, + }, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Start an upgrade with c, which should pick a to upgrade. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // In the meanwhile, b disconnects. + require.NoError(t, peerManager.Disconnected(b.NodeID)) + + // Once c completes the upgrade of b, there is no longer a need to + // evict anything since we're at capacity. + // since it has en even lower score. + require.NoError(t, peerManager.Dialed(c)) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) +} + +func TestPeerManager_Accepted(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Accepting a connection from a known peer should work. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Accepting a connection from an already accepted peer should error. + require.Error(t, peerManager.Accepted(a.NodeID)) + + // Accepting a connection from an unknown peer should work and register it. + require.NoError(t, peerManager.Accepted(b.NodeID)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + + // Accepting a connection from a peer that's being dialed should work, and + // should cause the dial to fail. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + require.NoError(t, peerManager.Accepted(c.NodeID)) + require.Error(t, peerManager.Dialed(c)) + + // Accepting a connection from a peer that's been dialed should fail. + require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should now fail. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + c.NodeID: 1, + d.NodeID: 2, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Dial a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Accepting b should fail, since it's not an upgrade over a. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should work, since it upgrades a. + require.NoError(t, peerManager.Add(c)) + require.NoError(t, peerManager.Accepted(c.NodeID)) + + // a still hasn't been evicted, so accepting b should still fail. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Also, accepting d should fail, since all upgrade slots are full. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + b.NodeID: 1, + c.NodeID: 1, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Accept a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Accepting b should work, since it upgrades a. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // c cannot get accepted, since a has been upgraded by b. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) + + // This should cause a to get evicted. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + + // c still cannot get accepted, since it's not scored above b. + require.Error(t, peerManager.Accepted(c.NodeID)) +} + +func TestPeerManager_Accepted_UpgradeDialing(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + b.NodeID: 1, + c.NodeID: 1, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Accept a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Start dial upgrade from a to b. + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // a has already been claimed as an upgrade of a, so accepting + // c should fail since there's noone else to upgrade. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) + + // However, if b connects to us while we're also trying to upgrade to it via + // dialing, then we accept the incoming connection as an upgrade. + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // This should cause a to get evicted, and the dial upgrade to fail. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + require.Error(t, peerManager.Dialed(b)) +} + +func TestPeerManager_Ready(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + // Connecting to a should still have it as status down. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID)) + + // Marking a as ready should transition it to PeerStatusUp and send an update. + require.NoError(t, peerManager.Ready(a.NodeID)) + require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID)) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusUp, + }, <-sub.Updates()) + + // Marking an unconnected peer as ready should do nothing. + require.NoError(t, peerManager.Add(b)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID)) + require.NoError(t, peerManager.Ready(b.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID)) + require.Empty(t, sub.Updates()) +} + +// See TryEvictNext for most tests, this just tests blocking behavior. +func TestPeerManager_EvictNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Since there are no peers to evict, EvictNext should block until timeout. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) + + // Erroring the peer will return it from EvictNext(). + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err := peerManager.EvictNext(timeoutCtx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + // Since there are no more peers to evict, the next call should block. + timeoutCtx, cancel = context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_EvictNext_WakeOnError(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to error a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + }() + + // This will block until peer errors above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_EvictNext_WakeOnUpgradeDialed(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1}, + }) + require.NoError(t, err) + + // Connect a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to upgrade to b with a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + }() + + // This will block until peer is upgraded above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_EvictNext_WakeOnUpgradeAccepted(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1}, + }) + require.NoError(t, err) + + // Connect a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to upgrade b with a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Accepted(b.NodeID)) + }() + + // This will block until peer is upgraded above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} +func TestPeerManager_TryEvictNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Nothing is evicted with no peers connected. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // Connecting to a won't evict anything either. + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // But if a errors it should be evicted. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + // While a is being evicted (before disconnect), it shouldn't get evicted again. + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) +} + +func TestPeerManager_Disconnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + // Disconnecting an unknown peer does nothing. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, peerManager.Peers()) + require.Empty(t, sub.Updates()) + + // Disconnecting an accepted non-ready peer does not send a status update. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, sub.Updates()) + + // Disconnecting a ready peer sends a status update. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusUp, + }, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusDown, + }, <-sub.Updates()) + + // Disconnecting a dialing peer does not unmark it as dialing, to avoid + // dialing it multiple times in parallel. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_Errored(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Erroring an unknown peer does nothing. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + require.Empty(t, peerManager.Peers()) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // Erroring a known peer does nothing, and won't evict it later, + // even when it connects. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // However, erroring once connected will evict it. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Subscribe(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // This tests all subscription events for full peer lifecycles. + sub := peerManager.Subscribe() + defer sub.Close() + + require.NoError(t, peerManager.Add(a)) + require.Empty(t, sub.Updates()) + + // Inbound connection. + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates()) + + // Outbound connection with peer error and eviction. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Dialed(a)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + require.Empty(t, sub.Updates()) + + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates()) + + // Outbound connection with dial failure. + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.DialFailed(a)) + require.Empty(t, sub.Updates()) +} + +func TestPeerManager_Subscribe_Close(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + // Closing the subscription should not send us the disconnected update. + sub.Close() + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, sub.Updates()) +} + +func TestPeerManager_Subscribe_Broadcast(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + s1 := peerManager.Subscribe() + defer s1.Close() + s2 := peerManager.Subscribe() + defer s2.Close() + s3 := peerManager.Subscribe() + defer s3.Close() + + // Connecting to a peer should send updates on all subscriptions. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + expectUp := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp} + require.NotEmpty(t, s1) + require.Equal(t, expectUp, <-s1.Updates()) + require.NotEmpty(t, s2) + require.Equal(t, expectUp, <-s2.Updates()) + require.NotEmpty(t, s3) + require.Equal(t, expectUp, <-s3.Updates()) + + // We now close s2. Disconnecting the peer should only send updates + // on s1 and s3. + s2.Close() + require.NoError(t, peerManager.Disconnected(a.NodeID)) + + expectDown := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown} + require.NotEmpty(t, s1) + require.Equal(t, expectDown, <-s1.Updates()) + require.Empty(t, s2.Updates()) + require.NotEmpty(t, s3) + require.Equal(t, expectDown, <-s3.Updates()) +} + +func TestPeerManager_Close(t *testing.T) { + // leaktest will check that spawned goroutines are closed. + t.Cleanup(leaktest.CheckTimeout(t, 1*time.Second)) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MinRetryTime: 10 * time.Second, + }) + require.NoError(t, err) + + // This subscription isn't closed, but PeerManager.Close() + // should reap the spawned goroutine. + _ = peerManager.Subscribe() + + // This dial failure will start a retry timer for 10 seconds, which + // should be reaped. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(a)) + + // This should clean up the goroutines. + peerManager.Close() +} + +func TestPeerManager_Advertise(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"} + aMem := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"} + bMem := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80} + cMem := p2p.NodeAddress{Protocol: "memory", NodeID: cID} + + dID := p2p.NodeID(strings.Repeat("d", 40)) + + // Create an initial peer manager and add the peers. + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{aID: 3, bID: 2, cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.NoError(t, peerManager.Add(aTCP)) + require.NoError(t, peerManager.Add(aMem)) + require.NoError(t, peerManager.Add(bTCP)) + require.NoError(t, peerManager.Add(bMem)) + require.NoError(t, peerManager.Add(cTCP)) + require.NoError(t, peerManager.Add(cMem)) + + // d should get all addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(dID, 100)) + + // a should not get its own addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(aID, 100)) + + // Asking for 0 addresses should return, well, 0. + require.Empty(t, peerManager.Advertise(aID, 0)) + + // Asking for 2 addresses should get the highest-rated ones, i.e. a. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, + }, peerManager.Advertise(dID, 2)) +} + +func TestPeerManager_SetHeight_GetHeight(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Getting a height should default to 0, for unknown peers and + // for known peers without height. + require.NoError(t, peerManager.Add(a)) + require.EqualValues(t, 0, peerManager.GetHeight(a.NodeID)) + require.EqualValues(t, 0, peerManager.GetHeight(b.NodeID)) + + // Setting a height should work for a known node. + require.NoError(t, peerManager.SetHeight(a.NodeID, 3)) + require.EqualValues(t, 3, peerManager.GetHeight(a.NodeID)) + + // Setting a height should add an unknown node. + require.Equal(t, []p2p.NodeID{a.NodeID}, peerManager.Peers()) + require.NoError(t, peerManager.SetHeight(b.NodeID, 7)) + require.EqualValues(t, 7, peerManager.GetHeight(b.NodeID)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + + // The heights should not be persisted. + peerManager.Close() + peerManager, err = p2p.NewPeerManager(db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + require.Zero(t, peerManager.GetHeight(a.NodeID)) + require.Zero(t, peerManager.GetHeight(b.NodeID)) +} diff --git a/p2p/pex/reactor.go b/p2p/pex/reactor.go index 4762260b0..52eccbe68 100644 --- a/p2p/pex/reactor.go +++ b/p2p/pex/reactor.go @@ -30,7 +30,7 @@ type ReactorV2 struct { peerManager *p2p.PeerManager pexCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} } @@ -39,7 +39,7 @@ func NewReactorV2( logger log.Logger, peerManager *p2p.PeerManager, pexCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *ReactorV2 { r := &ReactorV2{ peerManager: peerManager, @@ -181,9 +181,8 @@ func (r *ReactorV2) processPexCh() { if err := r.handleMessage(r.pexCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID(), "envelope", envelope, "err", err) r.pexCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -197,11 +196,11 @@ func (r *ReactorV2) processPexCh() { // processPeerUpdate processes a PeerUpdate. For added peers, PeerStatusUp, we // send a request for addresses. func (r *ReactorV2) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) if peerUpdate.Status == p2p.PeerStatusUp { r.pexCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + To: peerUpdate.NodeID, Message: &protop2p.PexRequest{}, } } diff --git a/p2p/router.go b/p2p/router.go index 669c96265..379dfc9f6 100644 --- a/p2p/router.go +++ b/p2p/router.go @@ -255,13 +255,10 @@ func (r *Router) routeChannel(channel *Channel) { if !ok { return } - // FIXME: We just disconnect the peer for now - r.logger.Error("peer error, disconnecting", "peer", peerError.PeerID, "err", peerError.Err) - r.peerMtx.RLock() - peerQueue, ok := r.peerQueues[peerError.PeerID] - r.peerMtx.RUnlock() - if ok { - peerQueue.close() + // FIXME: We just evict the peer for now. + r.logger.Error("peer error, evicting", "peer", peerError.NodeID, "err", peerError.Err) + if err := r.peerManager.Errored(peerError.NodeID, peerError.Err); err != nil { + r.logger.Error("failed to report peer error", "peer", peerError.NodeID, "err", err) } case <-channel.Done(): @@ -338,7 +335,6 @@ func (r *Router) acceptPeers(transport Transport) { r.peerMtx.Lock() r.peerQueues[peerInfo.NodeID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerInfo.NodeID) defer func() { r.peerMtx.Lock() @@ -350,6 +346,11 @@ func (r *Router) acceptPeers(transport Transport) { } }() + if err := r.peerManager.Ready(peerInfo.NodeID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", peerInfo.NodeID, "err", err) + return + } + r.routePeer(peerInfo.NodeID, conn, queue) }() } @@ -359,7 +360,7 @@ func (r *Router) acceptPeers(transport Transport) { func (r *Router) dialPeers() { ctx := r.stopCtx() for { - peerID, address, err := r.peerManager.DialNext(ctx) + address, err := r.peerManager.DialNext(ctx) switch err { case nil: case context.Canceled: @@ -371,12 +372,13 @@ func (r *Router) dialPeers() { } go func() { + peerID := address.NodeID conn, err := r.dialPeer(ctx, address) if errors.Is(err, context.Canceled) { return } else if err != nil { r.logger.Error("failed to dial peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { + if err = r.peerManager.DialFailed(address); err != nil { r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) } return @@ -388,13 +390,13 @@ func (r *Router) dialPeers() { return } else if err != nil { r.logger.Error("failed to handshake with peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { + if err = r.peerManager.DialFailed(address); err != nil { r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) } return } - if err = r.peerManager.Dialed(peerID, address); err != nil { + if err = r.peerManager.Dialed(address); err != nil { r.logger.Error("failed to dial peer", "peer", peerID, "err", err) return } @@ -403,7 +405,6 @@ func (r *Router) dialPeers() { r.peerMtx.Lock() r.peerQueues[peerID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerID) defer func() { r.peerMtx.Lock() @@ -415,6 +416,11 @@ func (r *Router) dialPeers() { } }() + if err := r.peerManager.Ready(peerID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", peerID, "err", err) + return + } + r.routePeer(peerID, conn, queue) }() } diff --git a/p2p/router_test.go b/p2p/router_test.go index 8f5cb91b9..01ad949d0 100644 --- a/p2p/router_test.go +++ b/p2p/router_test.go @@ -115,9 +115,9 @@ func TestRouter(t *testing.T) { // Wait for peers to come online, and ping them as they do. for i := 0; i < len(peers); i++ { peerUpdate := <-peerUpdates.Updates() - peerID := peerUpdate.PeerID + peerID := peerUpdate.NodeID require.Equal(t, p2p.PeerUpdate{ - PeerID: peerID, + NodeID: peerID, Status: p2p.PeerStatusUp, }, peerUpdate) @@ -140,13 +140,12 @@ func TestRouter(t *testing.T) { // We then submit an error for a peer, and watch it get disconnected. channel.Error() <- p2p.PeerError{ - PeerID: peers[0].NodeID, - Err: errors.New("test error"), - Severity: p2p.PeerErrorSeverityCritical, + NodeID: peers[0].NodeID, + Err: errors.New("test error"), } peerUpdate := <-peerUpdates.Updates() require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, + NodeID: peers[0].NodeID, Status: p2p.PeerStatusDown, }, peerUpdate) @@ -154,7 +153,7 @@ func TestRouter(t *testing.T) { // for that to happen. peerUpdate = <-peerUpdates.Updates() require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, + NodeID: peers[0].NodeID, Status: p2p.PeerStatusUp, }, peerUpdate) } diff --git a/p2p/shim.go b/p2p/shim.go index 266fc0f1f..a349849c7 100644 --- a/p2p/shim.go +++ b/p2p/shim.go @@ -29,7 +29,7 @@ type ( BaseReactor Name string - PeerUpdates *PeerUpdatesCh + PeerUpdates *PeerUpdates Channels map[ChannelID]*ChannelShim } @@ -162,10 +162,10 @@ func (rs *ReactorShim) handlePeerErrors() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { for pErr := range cs.Channel.errCh { - if pErr.PeerID != "" { - peer := rs.Switch.peers.Get(pErr.PeerID) + if pErr.NodeID != "" { + peer := rs.Switch.peers.Get(pErr.NodeID) if peer == nil { - rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.PeerID) + rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.NodeID) continue } @@ -225,7 +225,7 @@ func (rs *ReactorShim) GetChannels() []*ChannelDescriptor { // handle adding a peer. func (rs *ReactorShim) AddPeer(peer Peer) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusUp}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusUp}: rs.Logger.Debug("sent peer update", "reactor", rs.Name, "peer", peer.ID(), "status", PeerStatusUp) case <-rs.PeerUpdates.Done(): @@ -244,7 +244,7 @@ func (rs *ReactorShim) AddPeer(peer Peer) { // handle removing a peer. func (rs *ReactorShim) RemovePeer(peer Peer, reason interface{}) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusDown}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusDown}: rs.Logger.Debug( "sent peer update", "reactor", rs.Name, diff --git a/p2p/shim_test.go b/p2p/shim_test.go index f5b84a490..a12a184cb 100644 --- a/p2p/shim_test.go +++ b/p2p/shim_test.go @@ -123,7 +123,7 @@ func TestReactorShim_AddPeer(t *testing.T) { rts.shim.AddPeer(peerA) wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusUp, peerUpdate.Status) } @@ -143,7 +143,7 @@ func TestReactorShim_RemovePeer(t *testing.T) { rts.shim.RemovePeer(peerA, "test reason") wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusDown, peerUpdate.Status) } diff --git a/statesync/reactor.go b/statesync/reactor.go index 30cf5c09e..f88e2d1e0 100644 --- a/statesync/reactor.go +++ b/statesync/reactor.go @@ -78,7 +78,7 @@ type Reactor struct { tempDir string snapshotCh *p2p.Channel chunkCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // This will only be set when a state sync is in progress. It is used to feed @@ -96,7 +96,7 @@ func NewReactor( conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery, snapshotCh, chunkCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, tempDir string, ) *Reactor { r := &Reactor{ @@ -347,9 +347,8 @@ func (r *Reactor) processSnapshotCh() { if err := r.handleMessage(r.snapshotCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID(), "envelope", envelope, "err", err) r.snapshotCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -374,9 +373,8 @@ func (r *Reactor) processChunkCh() { if err := r.handleMessage(r.chunkCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID(), "envelope", envelope, "err", err) r.chunkCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -390,18 +388,18 @@ func (r *Reactor) processChunkCh() { // processPeerUpdate processes a PeerUpdate, returning an error upon failing to // handle the PeerUpdate or if a panic is recovered. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.RLock() defer r.mtx.RUnlock() if r.syncer != nil { switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: - r.syncer.AddPeer(peerUpdate.PeerID) + case p2p.PeerStatusUp: + r.syncer.AddPeer(peerUpdate.NodeID) - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.syncer.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.syncer.RemovePeer(peerUpdate.NodeID) } } } diff --git a/statesync/reactor_test.go b/statesync/reactor_test.go index 6dfad5edb..0760c1e54 100644 --- a/statesync/reactor_test.go +++ b/statesync/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { chunkOutCh chan p2p.Envelope chunkPeerErrCh chan p2p.PeerError - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -127,7 +127,7 @@ func TestReactor_ChunkRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.chunkOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_ChunkRequest(t *testing.T) { @@ -198,7 +198,7 @@ func TestReactor_SnapshotsRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.snapshotOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_SnapshotsRequest(t *testing.T) { From 91e1f3548a435c51197ac6f591b0634aebe59896 Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Wed, 3 Feb 2021 16:26:46 +0100 Subject: [PATCH 11/15] .github/workflows: separate e2e workflows for 0.34.x and master (#6041) Co-authored-by: Erik Grinaker Co-authored-by: Marko --- .../{e2e-nightly.yml => e2e-nightly-34x.yml} | 27 ++++++++++++++++--- ...2e-nightly2.yml => e2e-nightly-master.yml} | 13 +++++---- 2 files changed, 29 insertions(+), 11 deletions(-) rename .github/workflows/{e2e-nightly.yml => e2e-nightly-34x.yml} (62%) rename .github/workflows/{e2e-nightly2.yml => e2e-nightly-master.yml} (84%) diff --git a/.github/workflows/e2e-nightly.yml b/.github/workflows/e2e-nightly-34x.yml similarity index 62% rename from .github/workflows/e2e-nightly.yml rename to .github/workflows/e2e-nightly-34x.yml index d32b54840..5ceb613cb 100644 --- a/.github/workflows/e2e-nightly.yml +++ b/.github/workflows/e2e-nightly-34x.yml @@ -1,7 +1,8 @@ -# Runs randomly generated E2E testnets nightly. -name: e2e-nightly +# Runs randomly generated E2E testnets nightly +# on the 0.34.x release branch +name: e2e-nightly-34x on: - workflow_dispatch: # allow running workflow manually + workflow_dispatch: # allow running workflow manually, in theory schedule: - cron: '0 2 * * *' @@ -21,6 +22,8 @@ jobs: go-version: '1.15' - uses: actions/checkout@v2 + with: + ref: 'v0.34.x' - name: Build working-directory: test/e2e @@ -49,5 +52,21 @@ jobs: SLACK_USERNAME: Nightly E2E Tests SLACK_ICON_EMOJI: ':skull:' SLACK_COLOR: danger - SLACK_MESSAGE: Nightly E2E tests failed + SLACK_MESSAGE: Nightly E2E tests failed on v0.34.x + SLACK_FOOTER: '' + + e2e-nightly-success: # may turn this off once they seem to pass consistently + needs: e2e-nightly-test-2 + if: ${{ success() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on success + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':white_check_mark:' + SLACK_COLOR: good + SLACK_MESSAGE: Nightly E2E tests passed on v0.34.x SLACK_FOOTER: '' diff --git a/.github/workflows/e2e-nightly2.yml b/.github/workflows/e2e-nightly-master.yml similarity index 84% rename from .github/workflows/e2e-nightly2.yml rename to .github/workflows/e2e-nightly-master.yml index d829efa72..de88499d9 100644 --- a/.github/workflows/e2e-nightly2.yml +++ b/.github/workflows/e2e-nightly-master.yml @@ -1,6 +1,5 @@ -# Runs randomly generated E2E testnets nightly. -# Copy of earlier nightly test file, for debugging -name: e2e-nightly2 +# Runs randomly generated E2E testnets nightly on master +name: e2e-nightly-master on: workflow_dispatch: # allow running workflow manually schedule: @@ -50,10 +49,10 @@ jobs: SLACK_USERNAME: Nightly E2E Tests SLACK_ICON_EMOJI: ':skull:' SLACK_COLOR: danger - SLACK_MESSAGE: Nightly E2E (copy) tests failed + SLACK_MESSAGE: Nightly E2E (copy) tests failed on master SLACK_FOOTER: '' - e2e-nightly-success: + e2e-nightly-success: # may turn this off once they seem to pass consistently needs: e2e-nightly-test-2 if: ${{ success() }} runs-on: ubuntu-latest @@ -64,7 +63,7 @@ jobs: SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} SLACK_CHANNEL: tendermint-internal SLACK_USERNAME: Nightly E2E Tests - SLACK_ICON_EMOJI: ':green_heart:' + SLACK_ICON_EMOJI: ':white_check_mark:' SLACK_COLOR: good - SLACK_MESSAGE: Nightly E2E (copy) tests failed + SLACK_MESSAGE: Nightly E2E (copy) tests passed on master SLACK_FOOTER: '' From 8ea2ac0db770e9bc846c110f850ebe68cf7dd987 Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Wed, 3 Feb 2021 17:08:32 +0100 Subject: [PATCH 12/15] .github/workflows: fix whitespace in e2e config file (#6043) --- .github/workflows/e2e-nightly-34x.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-nightly-34x.yml b/.github/workflows/e2e-nightly-34x.yml index 5ceb613cb..bd1a00358 100644 --- a/.github/workflows/e2e-nightly-34x.yml +++ b/.github/workflows/e2e-nightly-34x.yml @@ -22,7 +22,7 @@ jobs: go-version: '1.15' - uses: actions/checkout@v2 - with: + with: ref: 'v0.34.x' - name: Build @@ -55,7 +55,7 @@ jobs: SLACK_MESSAGE: Nightly E2E tests failed on v0.34.x SLACK_FOOTER: '' - e2e-nightly-success: # may turn this off once they seem to pass consistently + e2e-nightly-success: # may turn this off once they seem to pass consistently needs: e2e-nightly-test-2 if: ${{ success() }} runs-on: ubuntu-latest From 9b6d6a3ad0ad4a62caa50fb4939757f035f500ae Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 4 Feb 2021 00:03:02 +0100 Subject: [PATCH 13/15] p2p: tighten up Router and add tests (#6044) This cleans up the `Router` code and adds a bunch of tests. These sorts of systems are a real pain to test, since they have a bunch of asynchronous goroutines living their own lives, so the test coverage is decent but not fantastic. Luckily we've been able to move all of the complex peer management and transport logic outside of the router, as synchronous components that are much easier to test, so the core router logic is fairly small and simple. This also provides some initial test tooling in `p2p/p2ptest` that automatically sets up in-memory networks and channels for use in integration tests. It also includes channel-oriented test asserters in `p2p/p2ptest/require.go`, but these have primarily been written for router testing and should probably be adapted or extended for reactor testing. --- blockchain/v0/reactor.go | 26 +- evidence/reactor.go | 10 +- mempool/reactor.go | 10 +- node/node.go | 2 +- p2p/channel.go | 145 -------- p2p/mocks/connection.go | 206 +++++++++++ p2p/mocks/transport.go | 121 ++++++ p2p/p2p_test.go | 34 ++ p2p/p2ptest/network.go | 240 ++++++++++++ p2p/p2ptest/require.go | 155 ++++++++ p2p/p2ptest/util.go | 8 + p2p/peermanager.go | 33 +- p2p/peermanager_test.go | 133 ++++--- p2p/pex/reactor.go | 12 +- p2p/router.go | 514 +++++++++++++++----------- p2p/router_test.go | 728 +++++++++++++++++++++++++++++++------ p2p/shim.go | 23 +- p2p/shim_test.go | 8 +- p2p/transport.go | 2 + p2p/transport_test.go | 7 +- statesync/reactor.go | 24 +- test/maverick/node/node.go | 2 +- 22 files changed, 1860 insertions(+), 583 deletions(-) delete mode 100644 p2p/channel.go create mode 100644 p2p/mocks/connection.go create mode 100644 p2p/mocks/transport.go create mode 100644 p2p/p2p_test.go create mode 100644 p2p/p2ptest/network.go create mode 100644 p2p/p2ptest/require.go create mode 100644 p2p/p2ptest/util.go diff --git a/blockchain/v0/reactor.go b/blockchain/v0/reactor.go index 583bb6766..af79fe563 100644 --- a/blockchain/v0/reactor.go +++ b/blockchain/v0/reactor.go @@ -194,7 +194,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) { return } - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: peerID, Message: &bcproto.BlockResponse{Block: blockProto}, } @@ -203,7 +203,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) { } r.Logger.Info("peer requesting a block we do not have", "peer", peerID, "height", msg.Height) - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: peerID, Message: &bcproto.NoBlockResponse{Height: msg.Height}, } @@ -229,7 +229,7 @@ func (r *Reactor) handleBlockchainMessage(envelope p2p.Envelope) error { r.pool.AddBlock(envelope.From, block, block.Size()) case *bcproto.StatusRequest: - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: envelope.From, Message: &bcproto.StatusResponse{ Height: r.store.Height(), @@ -284,10 +284,10 @@ func (r *Reactor) processBlockchainCh() { for { select { - case envelope := <-r.blockchainCh.In(): - if err := r.handleMessage(r.blockchainCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID(), "envelope", envelope, "err", err) - r.blockchainCh.Error() <- p2p.PeerError{ + case envelope := <-r.blockchainCh.In: + if err := r.handleMessage(r.blockchainCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID, "envelope", envelope, "err", err) + r.blockchainCh.Error <- p2p.PeerError{ NodeID: envelope.From, Err: err, } @@ -312,7 +312,7 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { switch peerUpdate.Status { case p2p.PeerStatusUp: // send a status update the newly added peer - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: peerUpdate.NodeID, Message: &bcproto.StatusResponse{ Base: r.store.Base(), @@ -376,13 +376,13 @@ func (r *Reactor) requestRoutine() { return case request := <-r.requestsCh: - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: request.PeerID, Message: &bcproto.BlockRequest{Height: request.Height}, } case pErr := <-r.errorsCh: - r.blockchainCh.Error() <- p2p.PeerError{ + r.blockchainCh.Error <- p2p.PeerError{ NodeID: pErr.peerID, Err: pErr.err, } @@ -393,7 +393,7 @@ func (r *Reactor) requestRoutine() { go func() { defer r.poolWG.Done() - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ Broadcast: true, Message: &bcproto.StatusRequest{}, } @@ -522,14 +522,14 @@ FOR_LOOP: // NOTE: We've already removed the peer's request, but we still need // to clean up the rest. peerID := r.pool.RedoRequest(first.Height) - r.blockchainCh.Error() <- p2p.PeerError{ + r.blockchainCh.Error <- p2p.PeerError{ NodeID: peerID, Err: err, } peerID2 := r.pool.RedoRequest(second.Height) if peerID2 != peerID { - r.blockchainCh.Error() <- p2p.PeerError{ + r.blockchainCh.Error <- p2p.PeerError{ NodeID: peerID2, Err: err, } diff --git a/evidence/reactor.go b/evidence/reactor.go index cfa9b8989..aa3db318e 100644 --- a/evidence/reactor.go +++ b/evidence/reactor.go @@ -192,10 +192,10 @@ func (r *Reactor) processEvidenceCh() { for { select { - case envelope := <-r.evidenceCh.In(): - if err := r.handleMessage(r.evidenceCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID(), "envelope", envelope, "err", err) - r.evidenceCh.Error() <- p2p.PeerError{ + case envelope := <-r.evidenceCh.In: + if err := r.handleMessage(r.evidenceCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID, "envelope", envelope, "err", err) + r.evidenceCh.Error <- p2p.PeerError{ NodeID: envelope.From, Err: err, } @@ -337,7 +337,7 @@ func (r *Reactor) broadcastEvidenceLoop(peerID p2p.NodeID, closer *tmsync.Closer // and thus would not be able to process the evidence correctly. Also, the // peer may receive this piece of evidence multiple times if it added and // removed frequently from the broadcasting peer. - r.evidenceCh.Out() <- p2p.Envelope{ + r.evidenceCh.Out <- p2p.Envelope{ To: peerID, Message: &tmproto.EvidenceList{ Evidence: []tmproto.Evidence{*evProto}, diff --git a/mempool/reactor.go b/mempool/reactor.go index 8b2a1c063..2c8b1c2cd 100644 --- a/mempool/reactor.go +++ b/mempool/reactor.go @@ -221,10 +221,10 @@ func (r *Reactor) processMempoolCh() { for { select { - case envelope := <-r.mempoolCh.In(): - if err := r.handleMessage(r.mempoolCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID(), "envelope", envelope, "err", err) - r.mempoolCh.Error() <- p2p.PeerError{ + case envelope := <-r.mempoolCh.In: + if err := r.handleMessage(r.mempoolCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID, "envelope", envelope, "err", err) + r.mempoolCh.Error <- p2p.PeerError{ NodeID: envelope.From, Err: err, } @@ -370,7 +370,7 @@ func (r *Reactor) broadcastTxRoutine(peerID p2p.NodeID, closer *tmsync.Closer) { if _, ok := memTx.senders.Load(peerMempoolID); !ok { // Send the mempool tx to the corresponding peer. Note, the peer may be // behind and thus would not be able to process the mempool tx correctly. - r.mempoolCh.Out() <- p2p.Envelope{ + r.mempoolCh.Out <- p2p.Envelope{ To: peerID, Message: &protomem.Txs{ Txs: [][]byte{memTx.tx}, diff --git a/node/node.go b/node/node.go index 1086d40be..c82877422 100644 --- a/node/node.go +++ b/node/node.go @@ -758,7 +758,7 @@ func NewNode(config *cfg.Config, // TODO: Fetch and provide real options and do proper p2p bootstrapping. // TODO: Use a persistent peer database. - peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) if err != nil { return nil, err } diff --git a/p2p/channel.go b/p2p/channel.go deleted file mode 100644 index c13cd75d1..000000000 --- a/p2p/channel.go +++ /dev/null @@ -1,145 +0,0 @@ -package p2p - -import ( - "sync" - - "github.com/gogo/protobuf/proto" -) - -// ChannelID is an arbitrary channel ID. -type ChannelID uint16 - -// Envelope specifies the message receiver and sender. -type Envelope struct { - From NodeID // Message sender, or empty for outbound messages. - To NodeID // Message receiver, or empty for inbound messages. - Broadcast bool // Send message to all connected peers, ignoring To. - Message proto.Message // Payload. - - // For internal use in the Router. - channelID ChannelID -} - -// Strip strips internal information from the envelope. Primarily used for -// testing, such that returned envelopes can be compared with literals. -func (e Envelope) Strip() Envelope { - e.channelID = 0 - return e -} - -// PeerError is a peer error reported via the Error channel. -// -// FIXME: This currently just disconnects the peer, which is too simplistic. -// For example, some errors should be logged, some should cause disconnects, -// and some should ban the peer. -// -// FIXME: This should probably be replaced by a more general PeerBehavior -// concept that can mark good and bad behavior and contributes to peer scoring. -// It should possibly also allow reactors to request explicit actions, e.g. -// disconnection or banning, in addition to doing this based on aggregates. -type PeerError struct { - NodeID NodeID - Err error -} - -// Channel is a bidirectional channel for Protobuf message exchange with peers. -// A Channel is safe for concurrent use by multiple goroutines. -type Channel struct { - closeOnce sync.Once - - // id defines the unique channel ID. - id ChannelID - - // messageType specifies the type of messages exchanged via the channel, and - // is used e.g. for automatic unmarshaling. - messageType proto.Message - - // inCh is a channel for receiving inbound messages. Envelope.From is always - // set. - inCh chan Envelope - - // outCh is a channel for sending outbound messages. Envelope.To or Broadcast - // must be set, otherwise the message is discarded. - outCh chan Envelope - - // errCh is a channel for reporting peer errors to the router, typically used - // when peers send an invalid or malignant message. - errCh chan PeerError - - // doneCh is used to signal that a Channel is closed. A Channel is bi-directional - // and should be closed by the reactor, where as the router is responsible - // for explicitly closing the internal In channel. - doneCh chan struct{} -} - -// NewChannel returns a reference to a new p2p Channel. It is the reactor's -// responsibility to close the Channel. After a channel is closed, the router may -// safely and explicitly close the internal In channel. -func NewChannel(id ChannelID, mType proto.Message, in, out chan Envelope, errCh chan PeerError) *Channel { - return &Channel{ - id: id, - messageType: mType, - inCh: in, - outCh: out, - errCh: errCh, - doneCh: make(chan struct{}), - } -} - -// ID returns the Channel's ID. -func (c *Channel) ID() ChannelID { - return c.id -} - -// In returns a read-only inbound go channel. This go channel should be used by -// reactors to consume Envelopes sent from peers. -func (c *Channel) In() <-chan Envelope { - return c.inCh -} - -// Out returns a write-only outbound go channel. This go channel should be used -// by reactors to route Envelopes to other peers. -func (c *Channel) Out() chan<- Envelope { - return c.outCh -} - -// Error returns a write-only outbound go channel designated for peer errors only. -// This go channel should be used by reactors to send peer errors when consuming -// Envelopes sent from other peers. -func (c *Channel) Error() chan<- PeerError { - return c.errCh -} - -// Close closes the outbound channel and marks the Channel as done. Internally, -// the outbound outCh and peer error errCh channels are closed. It is the reactor's -// responsibility to invoke Close. Any send on the Out or Error channel will -// panic after the Channel is closed. -// -// NOTE: After a Channel is closed, the router may safely assume it can no longer -// send on the internal inCh, however it should NEVER explicitly close it as -// that could result in panics by sending on a closed channel. -func (c *Channel) Close() { - c.closeOnce.Do(func() { - close(c.doneCh) - close(c.outCh) - close(c.errCh) - }) -} - -// Done returns the Channel's internal channel that should be used by a router -// to signal when it is safe to send on the internal inCh go channel. -func (c *Channel) Done() <-chan struct{} { - return c.doneCh -} - -// Wrapper is a Protobuf message that can contain a variety of inner messages. -// If a Channel's message type implements Wrapper, the channel will -// automatically (un)wrap passed messages using the container type, such that -// the channel can transparently support multiple message types. -type Wrapper interface { - // Wrap will take a message and wrap it in this one. - Wrap(proto.Message) error - - // Unwrap will unwrap the inner message contained in this message. - Unwrap() (proto.Message, error) -} diff --git a/p2p/mocks/connection.go b/p2p/mocks/connection.go new file mode 100644 index 000000000..07c4540ed --- /dev/null +++ b/p2p/mocks/connection.go @@ -0,0 +1,206 @@ +// Code generated by mockery v2.5.1. DO NOT EDIT. + +package mocks + +import ( + context "context" + + conn "github.com/tendermint/tendermint/p2p/conn" + + crypto "github.com/tendermint/tendermint/crypto" + + mock "github.com/stretchr/testify/mock" + + p2p "github.com/tendermint/tendermint/p2p" +) + +// Connection is an autogenerated mock type for the Connection type +type Connection struct { + mock.Mock +} + +// Close provides a mock function with given fields: +func (_m *Connection) Close() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// FlushClose provides a mock function with given fields: +func (_m *Connection) FlushClose() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Handshake provides a mock function with given fields: _a0, _a1, _a2 +func (_m *Connection) Handshake(_a0 context.Context, _a1 p2p.NodeInfo, _a2 crypto.PrivKey) (p2p.NodeInfo, crypto.PubKey, error) { + ret := _m.Called(_a0, _a1, _a2) + + var r0 p2p.NodeInfo + if rf, ok := ret.Get(0).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) p2p.NodeInfo); ok { + r0 = rf(_a0, _a1, _a2) + } else { + r0 = ret.Get(0).(p2p.NodeInfo) + } + + var r1 crypto.PubKey + if rf, ok := ret.Get(1).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) crypto.PubKey); ok { + r1 = rf(_a0, _a1, _a2) + } else { + if ret.Get(1) != nil { + r1 = ret.Get(1).(crypto.PubKey) + } + } + + var r2 error + if rf, ok := ret.Get(2).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) error); ok { + r2 = rf(_a0, _a1, _a2) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// LocalEndpoint provides a mock function with given fields: +func (_m *Connection) LocalEndpoint() p2p.Endpoint { + ret := _m.Called() + + var r0 p2p.Endpoint + if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.Endpoint) + } + + return r0 +} + +// ReceiveMessage provides a mock function with given fields: +func (_m *Connection) ReceiveMessage() (p2p.ChannelID, []byte, error) { + ret := _m.Called() + + var r0 p2p.ChannelID + if rf, ok := ret.Get(0).(func() p2p.ChannelID); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.ChannelID) + } + + var r1 []byte + if rf, ok := ret.Get(1).(func() []byte); ok { + r1 = rf() + } else { + if ret.Get(1) != nil { + r1 = ret.Get(1).([]byte) + } + } + + var r2 error + if rf, ok := ret.Get(2).(func() error); ok { + r2 = rf() + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// RemoteEndpoint provides a mock function with given fields: +func (_m *Connection) RemoteEndpoint() p2p.Endpoint { + ret := _m.Called() + + var r0 p2p.Endpoint + if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.Endpoint) + } + + return r0 +} + +// SendMessage provides a mock function with given fields: _a0, _a1 +func (_m *Connection) SendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) { + ret := _m.Called(_a0, _a1) + + var r0 bool + if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok { + r0 = rf(_a0, _a1) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Status provides a mock function with given fields: +func (_m *Connection) Status() conn.ConnectionStatus { + ret := _m.Called() + + var r0 conn.ConnectionStatus + if rf, ok := ret.Get(0).(func() conn.ConnectionStatus); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(conn.ConnectionStatus) + } + + return r0 +} + +// String provides a mock function with given fields: +func (_m *Connection) String() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// TrySendMessage provides a mock function with given fields: _a0, _a1 +func (_m *Connection) TrySendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) { + ret := _m.Called(_a0, _a1) + + var r0 bool + if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok { + r0 = rf(_a0, _a1) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} diff --git a/p2p/mocks/transport.go b/p2p/mocks/transport.go new file mode 100644 index 000000000..980c4a606 --- /dev/null +++ b/p2p/mocks/transport.go @@ -0,0 +1,121 @@ +// Code generated by mockery v2.5.1. DO NOT EDIT. + +package mocks + +import ( + context "context" + + mock "github.com/stretchr/testify/mock" + p2p "github.com/tendermint/tendermint/p2p" +) + +// Transport is an autogenerated mock type for the Transport type +type Transport struct { + mock.Mock +} + +// Accept provides a mock function with given fields: +func (_m *Transport) Accept() (p2p.Connection, error) { + ret := _m.Called() + + var r0 p2p.Connection + if rf, ok := ret.Get(0).(func() p2p.Connection); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(p2p.Connection) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Close provides a mock function with given fields: +func (_m *Transport) Close() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Dial provides a mock function with given fields: _a0, _a1 +func (_m *Transport) Dial(_a0 context.Context, _a1 p2p.Endpoint) (p2p.Connection, error) { + ret := _m.Called(_a0, _a1) + + var r0 p2p.Connection + if rf, ok := ret.Get(0).(func(context.Context, p2p.Endpoint) p2p.Connection); ok { + r0 = rf(_a0, _a1) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(p2p.Connection) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(context.Context, p2p.Endpoint) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Endpoints provides a mock function with given fields: +func (_m *Transport) Endpoints() []p2p.Endpoint { + ret := _m.Called() + + var r0 []p2p.Endpoint + if rf, ok := ret.Get(0).(func() []p2p.Endpoint); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]p2p.Endpoint) + } + } + + return r0 +} + +// Protocols provides a mock function with given fields: +func (_m *Transport) Protocols() []p2p.Protocol { + ret := _m.Called() + + var r0 []p2p.Protocol + if rf, ok := ret.Get(0).(func() []p2p.Protocol); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]p2p.Protocol) + } + } + + return r0 +} + +// String provides a mock function with given fields: +func (_m *Transport) String() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} diff --git a/p2p/p2p_test.go b/p2p/p2p_test.go new file mode 100644 index 000000000..9ab57e614 --- /dev/null +++ b/p2p/p2p_test.go @@ -0,0 +1,34 @@ +package p2p_test + +import ( + "context" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/crypto/ed25519" + "github.com/tendermint/tendermint/p2p" +) + +// Common setup for P2P tests. + +var ( + ctx = context.Background() + chID = p2p.ChannelID(1) + + selfKey crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0xf9, 0x1b, 0x08, 0xaa, 0x38, 0xee, 0x34, 0xdd}) + selfID = p2p.NodeIDFromPubKey(selfKey.PubKey()) + selfInfo = p2p.NodeInfo{ + NodeID: selfID, + ListenAddr: "0.0.0.0:0", + Network: "test", + Moniker: string(selfID), + } + + peerKey crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0x84, 0xd7, 0x01, 0xbf, 0x83, 0x20, 0x1c, 0xfe}) + peerID = p2p.NodeIDFromPubKey(peerKey.PubKey()) + peerInfo = p2p.NodeInfo{ + NodeID: peerID, + ListenAddr: "0.0.0.0:0", + Network: "test", + Moniker: string(peerID), + } +) diff --git a/p2p/p2ptest/network.go b/p2p/p2ptest/network.go new file mode 100644 index 000000000..9ae2def8b --- /dev/null +++ b/p2p/p2ptest/network.go @@ -0,0 +1,240 @@ +package p2ptest + +import ( + "math/rand" + "testing" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/crypto/ed25519" + "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/p2p" +) + +// Network sets up an in-memory network that can be used for high-level P2P +// testing. It creates an arbitrary number of nodes that are connected to each +// other, and can open channels across all nodes with custom reactors. +type Network struct { + Nodes map[p2p.NodeID]*Node + + logger log.Logger + memoryNetwork *p2p.MemoryNetwork +} + +// MakeNetwork creates a test network with the given number of nodes and +// connects them to each other. +func MakeNetwork(t *testing.T, nodes int) *Network { + logger := log.TestingLogger() + network := &Network{ + Nodes: map[p2p.NodeID]*Node{}, + logger: logger, + memoryNetwork: p2p.NewMemoryNetwork(logger), + } + for i := 0; i < nodes; i++ { + node := MakeNode(t, network) + network.Nodes[node.NodeID] = node + } + + // Set up a list of node addresses to dial, and a peer update subscription + // for each node. + dialQueue := []p2p.NodeAddress{} + subs := map[p2p.NodeID]*p2p.PeerUpdates{} + for _, node := range network.Nodes { + dialQueue = append(dialQueue, node.NodeAddress) + subs[node.NodeID] = node.PeerManager.Subscribe() + defer subs[node.NodeID].Close() + } + + // For each node, dial the nodes that it still doesn't have a connection to + // (either inbound or outbound), and wait for both sides to confirm the + // connection via the subscriptions. + for i, sourceAddress := range dialQueue { + sourceNode := network.Nodes[sourceAddress.NodeID] + sourceSub := subs[sourceAddress.NodeID] + for _, targetAddress := range dialQueue[i+1:] { // nodes 0 { var cancel context.CancelFunc resolveCtx, cancel = context.WithTimeout(resolveCtx, r.options.ResolveTimeout) defer cancel() } + + r.logger.Debug("resolving peer address", "peer", address) endpoints, err := address.Resolve(resolveCtx) - if err != nil { + switch { + case err != nil: return nil, fmt.Errorf("failed to resolve address %q: %w", address, err) + case len(endpoints) == 0: + return nil, fmt.Errorf("address %q did not resolve to any endpoints", address) } for _, endpoint := range endpoints { - transport, ok := r.transports[endpoint.Protocol] + transport, ok := r.protocolTransports[endpoint.Protocol] if !ok { - r.logger.Error("no transport found for endpoint protocol", "endpoint", endpoint) + r.logger.Error("no transport found for protocol", "endpoint", endpoint) continue } @@ -463,17 +561,17 @@ func (r *Router) dialPeer(ctx context.Context, address NodeAddress) (Connection, // Internet can't and needs a different public address. conn, err := transport.Dial(dialCtx, endpoint) if err != nil { - r.logger.Error("failed to dial endpoint", "endpoint", endpoint, "err", err) + r.logger.Error("failed to dial endpoint", "peer", address.NodeID, "endpoint", endpoint, "err", err) } else { - r.logger.Info("connected to peer", "peer", address.NodeID, "endpoint", endpoint) + r.logger.Debug("dialed peer", "peer", address.NodeID, "endpoint", endpoint) return conn, nil } } - return nil, fmt.Errorf("failed to connect to peer via %q", address) + return nil, errors.New("all endpoints failed") } // handshakePeer handshakes with a peer, validating the peer's information. If -// expectID is given, we check that the peer's public key matches it. +// expectID is given, we check that the peer's info matches it. func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID NodeID) (NodeInfo, crypto.PubKey, error) { if r.options.HandshakeTimeout > 0 { var cancel context.CancelFunc @@ -484,51 +582,47 @@ func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID No if err != nil { return peerInfo, peerKey, err } + if err = peerInfo.Validate(); err != nil { return peerInfo, peerKey, fmt.Errorf("invalid handshake NodeInfo: %w", err) } - if expectID != "" && expectID != peerInfo.NodeID { - return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q", - expectID, peerInfo.NodeID) - } if NodeIDFromPubKey(peerKey) != peerInfo.NodeID { return peerInfo, peerKey, fmt.Errorf("peer's public key did not match its node ID %q (expected %q)", peerInfo.NodeID, NodeIDFromPubKey(peerKey)) } - if peerInfo.NodeID == r.nodeInfo.NodeID { - return peerInfo, peerKey, errors.New("rejecting handshake with self") + if expectID != "" && expectID != peerInfo.NodeID { + return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q", + expectID, peerInfo.NodeID) } return peerInfo, peerKey, nil } -// routePeer routes inbound messages from a peer to channels, and also sends -// outbound queued messages to the peer. It will close the connection and send -// queue, using this as a signal to coordinate the internal receivePeer() and -// sendPeer() goroutines. It blocks until the peer is done, e.g. when the -// connection or queue is closed. +// routePeer routes inbound and outbound messages between a peer and the reactor +// channels. It will close the given connection and send queue when done, or if +// they are closed elsewhere it will cause this method to shut down and return. func (r *Router) routePeer(peerID NodeID, conn Connection, sendQueue queue) { - r.logger.Info("routing peer", "peer", peerID) - resultsCh := make(chan error, 2) + r.logger.Info("peer connected", "peer", peerID, "endpoint", conn) + errCh := make(chan error, 2) go func() { - resultsCh <- r.receivePeer(peerID, conn) + errCh <- r.receivePeer(peerID, conn) }() go func() { - resultsCh <- r.sendPeer(peerID, conn, sendQueue) + errCh <- r.sendPeer(peerID, conn, sendQueue) }() - err := <-resultsCh + err := <-errCh _ = conn.Close() sendQueue.close() - if e := <-resultsCh; err == nil { - // The first err was nil, so we update it with the second result, - // which may or may not be nil. + if e := <-errCh; err == nil { + // The first err was nil, so we update it with the second err, which may + // or may not be nil. err = e } switch err { - case nil, io.EOF, ErrTransportClosed{}: - r.logger.Info("peer disconnected", "peer", peerID) + case nil, io.EOF: + r.logger.Info("peer disconnected", "peer", peerID, "endpoint", conn) default: - r.logger.Error("peer failure", "peer", peerID, "err", err) + r.logger.Error("peer failure", "peer", peerID, "endpoint", conn, "err", err) } } @@ -546,7 +640,7 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error { messageType := r.channelMessages[chID] r.channelMtx.RUnlock() if !ok { - r.logger.Error("dropping message for unknown channel", "peer", peerID, "channel", chID) + r.logger.Debug("dropping message for unknown channel", "peer", peerID, "channel", chID) continue } @@ -564,10 +658,10 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error { } select { - case queue.enqueue() <- Envelope{channelID: chID, From: peerID, Message: msg}: + case queue.enqueue() <- Envelope{From: peerID, Message: msg}: r.logger.Debug("received message", "peer", peerID, "message", msg) case <-queue.closed(): - r.logger.Error("channel closed, dropping message", "peer", peerID, "channel", chID) + r.logger.Debug("channel closed, dropping message", "peer", peerID, "channel", chID) case <-r.stopCh: return nil } @@ -579,6 +673,10 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error { for { select { case envelope := <-queue.dequeue(): + if envelope.Message == nil { + r.logger.Error("dropping nil message", "peer", peerID) + continue + } bz, err := proto.Marshal(envelope.Message) if err != nil { r.logger.Error("failed to marshal message", "peer", peerID, "err", err) @@ -602,43 +700,57 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error { // evictPeers evicts connected peers as requested by the peer manager. func (r *Router) evictPeers() { + r.logger.Debug("starting evict routine") ctx := r.stopCtx() for { peerID, err := r.peerManager.EvictNext(ctx) - switch err { - case nil: - case context.Canceled: + switch { + case errors.Is(err, context.Canceled): r.logger.Debug("stopping evict routine") return - default: + case err != nil: r.logger.Error("failed to find next peer to evict", "err", err) return } r.logger.Info("evicting peer", "peer", peerID) r.peerMtx.RLock() - if queue, ok := r.peerQueues[peerID]; ok { + queue, ok := r.peerQueues[peerID] + r.peerMtx.RUnlock() + if ok { queue.close() } - r.peerMtx.RUnlock() } } // OnStart implements service.Service. func (r *Router) OnStart() error { go r.dialPeers() + go r.evictPeers() for _, transport := range r.transports { go r.acceptPeers(transport) } - go r.evictPeers() return nil } // OnStop implements service.Service. // -// FIXME: This needs to close transports as well. +// All channels must be closed by OpenChannel() callers before stopping the +// router, to prevent blocked channel sends in reactors. Channels are not closed +// here, since that would cause any reactor senders to panic, so it is the +// sender's responsibility. func (r *Router) OnStop() { - // Collect all active queues, so we can wait for them to close. + // Signal router shutdown. + close(r.stopCh) + + // Close transport listeners (unblocks Accept calls). + for _, transport := range r.transports { + if err := transport.Close(); err != nil { + r.logger.Error("failed to close transport", "transport", transport, "err", err) + } + } + + // Collect all remaining queues, and wait for them to close. queues := []queue{} r.channelMtx.RLock() for _, q := range r.channelQueues { @@ -650,16 +762,12 @@ func (r *Router) OnStop() { queues = append(queues, q) } r.peerMtx.RUnlock() - - // Signal router shutdown, and wait for queues (and thus goroutines) - // to complete. - close(r.stopCh) for _, q := range queues { <-q.closed() } } -// stopCtx returns a context that is cancelled when the router stops. +// stopCtx returns a new context that is cancelled when the router stops. func (r *Router) stopCtx() context.Context { ctx, cancel := context.WithCancel(context.Background()) go func() { diff --git a/p2p/router_test.go b/p2p/router_test.go index 01ad949d0..6d1c59bc2 100644 --- a/p2p/router_test.go +++ b/p2p/router_test.go @@ -2,158 +2,644 @@ package p2p_test import ( "errors" + "fmt" + "io" + "strings" + "sync" "testing" + "time" "github.com/fortytw2/leaktest" + "github.com/gogo/protobuf/proto" gogotypes "github.com/gogo/protobuf/types" - "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" dbm "github.com/tendermint/tm-db" "github.com/tendermint/tendermint/crypto" - "github.com/tendermint/tendermint/crypto/ed25519" "github.com/tendermint/tendermint/libs/log" + tmsync "github.com/tendermint/tendermint/libs/sync" "github.com/tendermint/tendermint/p2p" + "github.com/tendermint/tendermint/p2p/mocks" + "github.com/tendermint/tendermint/p2p/p2ptest" ) -type TestMessage = gogotypes.StringValue - -func generateNode() (p2p.NodeInfo, crypto.PrivKey) { - privKey := ed25519.GenPrivKey() - nodeID := p2p.NodeIDFromPubKey(privKey.PubKey()) - nodeInfo := p2p.NodeInfo{ - NodeID: nodeID, - // FIXME: We have to fake a ListenAddr for now. - ListenAddr: "127.0.0.1:1234", - Moniker: "foo", - } - return nodeInfo, privKey -} - func echoReactor(channel *p2p.Channel) { for { select { - case envelope := <-channel.In(): - channel.Out() <- p2p.Envelope{ + case envelope := <-channel.In: + value := envelope.Message.(*p2ptest.Message).Value + channel.Out <- p2p.Envelope{ To: envelope.From, - Message: &TestMessage{Value: envelope.Message.(*TestMessage).Value}, + Message: &p2ptest.Message{Value: value}, } + case <-channel.Done(): return } } } -func TestRouter(t *testing.T) { - defer leaktest.Check(t)() +func TestRouter_Network(t *testing.T) { + t.Cleanup(leaktest.Check(t)) - logger := log.TestingLogger() - network := p2p.NewMemoryNetwork(logger) - nodeInfo, privKey := generateNode() - transport := network.CreateTransport(nodeInfo.NodeID) - defer transport.Close() - chID := p2p.ChannelID(1) + // Create a test network and open a channel where all peers run echoReactor. + network := p2ptest.MakeNetwork(t, 8) + local := network.RandomNode() + peers := network.Peers(local.NodeID) + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) - // Start some other in-memory network nodes to communicate with, running - // a simple echo reactor that returns received messages. - peers := []p2p.NodeAddress{} - for i := 0; i < 3; i++ { - peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) - require.NoError(t, err) - peerInfo, peerKey := generateNode() - peerTransport := network.CreateTransport(peerInfo.NodeID) - defer peerTransport.Close() - peerRouter, err := p2p.NewRouter( - logger.With("peerID", i), - peerInfo, - peerKey, - peerManager, - []p2p.Transport{peerTransport}, - p2p.RouterOptions{}, + channel := channels[local.NodeID] + for _, peer := range peers { + go echoReactor(channels[peer.NodeID]) + } + + // Sending a message to each peer should work. + for _, peer := range peers { + p2ptest.RequireSendReceive(t, channel, peer.NodeID, + &p2ptest.Message{Value: "foo"}, + &p2ptest.Message{Value: "foo"}, ) - require.NoError(t, err) - peers = append(peers, peerTransport.Endpoints()[0].NodeAddress(peerInfo.NodeID)) - - channel, err := peerRouter.OpenChannel(chID, &TestMessage{}) - require.NoError(t, err) - defer channel.Close() - go echoReactor(channel) - - err = peerRouter.Start() - require.NoError(t, err) - defer func() { require.NoError(t, peerRouter.Stop()) }() } - // Start the main router and connect it to the peers above. - peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) - require.NoError(t, err) - defer peerManager.Close() - for _, address := range peers { - err := peerManager.Add(address) - require.NoError(t, err) - } - peerUpdates := peerManager.Subscribe() - defer peerUpdates.Close() - - router, err := p2p.NewRouter(logger, nodeInfo, privKey, peerManager, []p2p.Transport{transport}, p2p.RouterOptions{}) - require.NoError(t, err) - channel, err := router.OpenChannel(chID, &TestMessage{}) - require.NoError(t, err) - defer channel.Close() - - err = router.Start() - require.NoError(t, err) - defer func() { - // Since earlier defers are closed after this, and we have to make sure - // we close channels and subscriptions before the router, we explicitly - // close them here to. - peerUpdates.Close() - channel.Close() - require.NoError(t, router.Stop()) - }() - - // Wait for peers to come online, and ping them as they do. - for i := 0; i < len(peers); i++ { - peerUpdate := <-peerUpdates.Updates() - peerID := peerUpdate.NodeID - require.Equal(t, p2p.PeerUpdate{ - NodeID: peerID, - Status: p2p.PeerStatusUp, - }, peerUpdate) - - channel.Out() <- p2p.Envelope{To: peerID, Message: &TestMessage{Value: "hi!"}} - assert.Equal(t, p2p.Envelope{ - From: peerID, - Message: &TestMessage{Value: "hi!"}, - }, (<-channel.In()).Strip()) - } - - // We now send a broadcast, which we should return back from all peers. - channel.Out() <- p2p.Envelope{ + // Sending a broadcast should return back a message from all peers. + p2ptest.RequireSend(t, channel, p2p.Envelope{ Broadcast: true, - Message: &TestMessage{Value: "broadcast"}, - } - for i := 0; i < len(peers); i++ { - envelope := <-channel.In() - require.Equal(t, &TestMessage{Value: "broadcast"}, envelope.Message) + Message: &p2ptest.Message{Value: "bar"}, + }) + expect := []p2p.Envelope{} + for _, peer := range peers { + expect = append(expect, p2p.Envelope{ + From: peer.NodeID, + Message: &p2ptest.Message{Value: "bar"}, + }) } + p2ptest.RequireReceiveUnordered(t, channel, expect) - // We then submit an error for a peer, and watch it get disconnected. - channel.Error() <- p2p.PeerError{ + // We then submit an error for a peer, and watch it get disconnected and + // then reconnected as the router retries it. + peerUpdates := local.MakePeerUpdates(t) + channel.Error <- p2p.PeerError{ NodeID: peers[0].NodeID, - Err: errors.New("test error"), + Err: errors.New("boom"), } - peerUpdate := <-peerUpdates.Updates() - require.Equal(t, p2p.PeerUpdate{ - NodeID: peers[0].NodeID, - Status: p2p.PeerStatusDown, - }, peerUpdate) - - // The peer manager will automatically reconnect the peer, so we wait - // for that to happen. - peerUpdate = <-peerUpdates.Updates() - require.Equal(t, p2p.PeerUpdate{ - NodeID: peers[0].NodeID, - Status: p2p.PeerStatusUp, - }, peerUpdate) + p2ptest.RequireUpdates(t, peerUpdates, []p2p.PeerUpdate{ + {NodeID: peers[0].NodeID, Status: p2p.PeerStatusDown}, + {NodeID: peers[0].NodeID, Status: p2p.PeerStatusUp}, + }) +} + +func TestRouter_Channel(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a router with no transports (so no peers). + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, nil, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + t.Cleanup(func() { + require.NoError(t, router.Stop()) + }) + + // Opening a channel should work. + channel, err := router.OpenChannel(chID, &p2ptest.Message{}) + require.NoError(t, err) + + // Opening the same channel again should fail. + _, err = router.OpenChannel(chID, &p2ptest.Message{}) + require.Error(t, err) + + // Opening a different channel should work. + _, err = router.OpenChannel(2, &p2ptest.Message{}) + require.NoError(t, err) + + // Closing the channel, then opening it again should be fine. + channel.Close() + time.Sleep(100 * time.Millisecond) // yes yes, but Close() is async... + + channel, err = router.OpenChannel(chID, &p2ptest.Message{}) + require.NoError(t, err) + + // We should be able to send on the channel, even though there are no peers. + p2ptest.RequireSend(t, channel, p2p.Envelope{ + To: p2p.NodeID(strings.Repeat("a", 40)), + Message: &p2ptest.Message{Value: "foo"}, + }) + + // A message to ourselves should be dropped. + p2ptest.RequireSend(t, channel, p2p.Envelope{ + To: selfID, + Message: &p2ptest.Message{Value: "self"}, + }) + p2ptest.RequireEmpty(t, channel) +} + +// Channel tests are hairy to mock, so we use an in-memory network instead. +func TestRouter_Channel_SendReceive(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 3) + ids := network.NodeIDs() + aID, bID, cID := ids[0], ids[1], ids[2] + channels := network.MakeChannels(t, chID, &p2ptest.Message{}) + a, b, c := channels[aID], channels[bID], channels[cID] + otherChannels := network.MakeChannels(t, 9, &p2ptest.Message{}) + + // Sending a message a->b should work, and not send anything + // further to a, b, or c. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending a nil message a->c should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: nil}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending a different message type should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending to an unknown peer should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{ + To: p2p.NodeID(strings.Repeat("a", 40)), + Message: &p2ptest.Message{Value: "a"}, + }) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending without a recipient should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{Message: &p2ptest.Message{Value: "noto"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending to self should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "self"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Removing b and sending to it should be dropped. + network.Remove(t, bID) + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "nob"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // After all this, sending a message c->a should work. + p2ptest.RequireSend(t, c, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, a, p2p.Envelope{From: cID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // None of these messages should have made it onto the other channels. + for _, other := range otherChannels { + p2ptest.RequireEmpty(t, other) + } +} + +func TestRouter_Channel_Broadcast(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 4) + ids := network.NodeIDs() + aID, bID, cID, dID := ids[0], ids[1], ids[2], ids[3] + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) + a, b, c, d := channels[aID], channels[bID], channels[cID], channels[dID] + + // Sending a broadcast from b should work. + p2ptest.RequireSend(t, b, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, a, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, c, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, d, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireEmpty(t, a, b, c, d) + + // Removing one node from the network shouldn't prevent broadcasts from working. + network.Remove(t, dID) + p2ptest.RequireSend(t, a, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, c, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireEmpty(t, a, b, c, d) +} + +func TestRouter_Channel_Wrapper(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 2) + ids := network.NodeIDs() + aID, bID := ids[0], ids[1] + channels := network.MakeChannels(t, 1, &wrapperMessage{}) + a, b := channels[aID], channels[bID] + + // Since wrapperMessage implements p2p.Wrapper and handles Message, it + // should automatically wrap and unwrap sent messages -- we prepend the + // wrapper actions to the message value to signal this. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "unwrap:wrap:foo"}}) + + // If we send a different message that can't be wrapped, it should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}}) + p2ptest.RequireEmpty(t, b) + + // If we send the wrapper message itself, it should also be passed through + // since WrapperMessage supports it, and should only be unwrapped at the receiver. + p2ptest.RequireSend(t, a, p2p.Envelope{ + To: bID, + Message: &wrapperMessage{Message: p2ptest.Message{Value: "foo"}}, + }) + p2ptest.RequireReceive(t, b, p2p.Envelope{ + From: aID, + Message: &p2ptest.Message{Value: "unwrap:foo"}, + }) + +} + +// WrapperMessage prepends the value with "wrap:" and "unwrap:" to test it. +type wrapperMessage struct { + p2ptest.Message +} + +var _ p2p.Wrapper = (*wrapperMessage)(nil) + +func (w *wrapperMessage) Wrap(inner proto.Message) error { + switch inner := inner.(type) { + case *p2ptest.Message: + w.Message.Value = fmt.Sprintf("wrap:%v", inner.Value) + case *wrapperMessage: + *w = *inner + default: + return fmt.Errorf("invalid message type %T", inner) + } + return nil +} + +func (w *wrapperMessage) Unwrap() (proto.Message, error) { + return &p2ptest.Message{Value: fmt.Sprintf("unwrap:%v", w.Message.Value)}, nil +} + +func TestRouter_Channel_Error(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 3) + ids := network.NodeIDs() + aID, bID := ids[0], ids[1] + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) + a := channels[aID] + + // Erroring b should cause it to be disconnected. It will reconnect shortly after. + sub := network.Nodes[aID].MakePeerUpdates(t) + p2ptest.RequireError(t, a, p2p.PeerError{NodeID: bID, Err: errors.New("boom")}) + p2ptest.RequireUpdates(t, sub, []p2p.PeerUpdate{ + {NodeID: bID, Status: p2p.PeerStatusDown}, + {NodeID: bID, Status: p2p.PeerStatusUp}, + }) +} + +func TestRouter_AcceptPeers(t *testing.T) { + testcases := map[string]struct { + peerInfo p2p.NodeInfo + peerKey crypto.PubKey + ok bool + }{ + "valid handshake": {peerInfo, peerKey.PubKey(), true}, + "empty handshake": {p2p.NodeInfo{}, nil, false}, + "invalid key": {peerInfo, selfKey.PubKey(), false}, + "self handshake": {selfInfo, selfKey.PubKey(), false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that handshakes. + closer := tmsync.NewCloser() + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(tc.peerInfo, tc.peerKey, nil) + mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil) + if tc.ok { + mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF) + } + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + if tc.ok { + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: tc.peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + sub.Close() + } else { + select { + case <-closer.Done(): + case <-time.After(100 * time.Millisecond): + require.Fail(t, "connection not closed") + } + } + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) + }) + } +} + +func TestRouter_AcceptPeers_Error(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns an error, which should prevent + // the router from calling Accept again. + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Accept").Once().Return(nil, errors.New("boom")) + mockTransport.On("Close").Return(nil) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + time.Sleep(time.Second) + require.NoError(t, router.Stop()) + + mockTransport.AssertExpectations(t) +} + +func TestRouter_AcceptPeers_ErrorEOF(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns io.EOF once, which should prevent + // the router from calling Accept again. + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + mockTransport.On("Close").Return(nil) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + time.Sleep(time.Second) + require.NoError(t, router.Stop()) + + mockTransport.AssertExpectations(t) +} + +func TestRouter_AcceptPeers_HeadOfLineBlocking(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns a connection that blocks during the + // handshake. It should be able to accept several of these in parallel, i.e. + // a single connection can't halt other connections being accepted. + acceptCh := make(chan bool, 3) + closeCh := make(chan time.Time) + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF) + mockConnection.On("Close").Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Times(3).Run(func(_ mock.Arguments) { + acceptCh <- true + }).Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + require.Eventually(t, func() bool { + return len(acceptCh) == 3 + }, time.Second, 10*time.Millisecond) + close(closeCh) + time.Sleep(100 * time.Millisecond) + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) +} + +func TestRouter_DialPeers(t *testing.T) { + testcases := map[string]struct { + dialID p2p.NodeID + peerInfo p2p.NodeInfo + peerKey crypto.PubKey + dialErr error + ok bool + }{ + "valid dial": {peerInfo.NodeID, peerInfo, peerKey.PubKey(), nil, true}, + "empty handshake": {peerInfo.NodeID, p2p.NodeInfo{}, nil, nil, false}, + "invalid key": {peerInfo.NodeID, peerInfo, selfKey.PubKey(), nil, false}, + "unexpected node ID": {peerInfo.NodeID, selfInfo, selfKey.PubKey(), nil, false}, + "dial error": {peerInfo.NodeID, peerInfo, peerKey.PubKey(), errors.New("boom"), false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + address := p2p.NodeAddress{Protocol: "mock", NodeID: tc.dialID} + endpoint := p2p.Endpoint{Protocol: "mock", Path: string(tc.dialID)} + + // Set up a mock transport that handshakes. + closer := tmsync.NewCloser() + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + if tc.dialErr == nil { + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(tc.peerInfo, tc.peerKey, nil) + mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil) + } + if tc.ok { + mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF) + } + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Maybe().Return(nil, io.EOF) + if tc.dialErr == nil { + mockTransport.On("Dial", mock.Anything, endpoint).Once().Return(mockConnection, nil) + // This handles the retry when a dialed connection gets closed after ReceiveMessage + // returns io.EOF above. + mockTransport.On("Dial", mock.Anything, endpoint).Maybe().Return(nil, io.EOF) + } else { + mockTransport.On("Dial", mock.Anything, endpoint).Once(). + Run(func(_ mock.Arguments) { closer.Close() }). + Return(nil, tc.dialErr) + } + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + require.NoError(t, peerManager.Add(address)) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + if tc.ok { + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: tc.peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + sub.Close() + } else { + select { + case <-closer.Done(): + case <-time.After(100 * time.Millisecond): + require.Fail(t, "connection not closed") + } + } + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) + }) + } +} + +func TestRouter_DialPeers_Parallel(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + a := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + // Set up a mock transport that returns a connection that blocks during the + // handshake. It should dial all peers in parallel. + dialCh := make(chan bool, 3) + closeCh := make(chan time.Time) + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF) + mockConnection.On("Close").Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + for _, address := range []p2p.NodeAddress{a, b, c} { + endpoint := p2p.Endpoint{Protocol: address.Protocol, Path: string(address.NodeID)} + mockTransport.On("Dial", mock.Anything, endpoint).Run(func(_ mock.Arguments) { + dialCh <- true + }).Return(mockConnection, nil) + } + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Add(c)) + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + require.Eventually(t, func() bool { + return len(dialCh) == 3 + }, time.Second, 10*time.Millisecond) + close(closeCh) + time.Sleep(100 * time.Millisecond) + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) +} + +func TestRouter_EvictPeers(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that we can evict. + closeCh := make(chan time.Time) + closeOnce := sync.Once{} + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(peerInfo, peerKey.PubKey(), nil) + mockConnection.On("ReceiveMessage").WaitUntil(closeCh).Return(chID, nil, io.EOF) + mockConnection.On("Close").Run(func(_ mock.Arguments) { + closeOnce.Do(func() { + close(closeCh) + }) + }).Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + // Wait for the mock peer to connect, then evict it by reporting an error. + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + + require.NoError(t, peerManager.Errored(peerInfo.NodeID, errors.New("boom"))) + + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: peerInfo.NodeID, + Status: p2p.PeerStatusDown, + }) + sub.Close() + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) } diff --git a/p2p/shim.go b/p2p/shim.go index a349849c7..a71a1df54 100644 --- a/p2p/shim.go +++ b/p2p/shim.go @@ -39,6 +39,9 @@ type ( ChannelShim struct { Descriptor *ChannelDescriptor Channel *Channel + inCh chan<- Envelope + outCh <-chan Envelope + errCh <-chan PeerError } // ChannelDescriptorShim defines a shim wrapper around a legacy p2p channel @@ -56,7 +59,7 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C for _, cds := range descriptors { chShim := NewChannelShim(cds, 0) - channels[chShim.Channel.id] = chShim + channels[chShim.Channel.ID] = chShim } rs := &ReactorShim{ @@ -72,15 +75,21 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C } func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim { + inCh := make(chan Envelope, buf) + outCh := make(chan Envelope, buf) + errCh := make(chan PeerError, buf) return &ChannelShim{ Descriptor: cds.Descriptor, Channel: NewChannel( ChannelID(cds.Descriptor.ID), cds.MsgType, - make(chan Envelope, buf), - make(chan Envelope, buf), - make(chan PeerError, buf), + inCh, + outCh, + errCh, ), + inCh: inCh, + outCh: outCh, + errCh: errCh, } } @@ -91,7 +100,7 @@ func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim { func (rs *ReactorShim) proxyPeerEnvelopes() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { - for e := range cs.Channel.outCh { + for e := range cs.outCh { msg := proto.Clone(cs.Channel.messageType) msg.Reset() @@ -161,7 +170,7 @@ func (rs *ReactorShim) proxyPeerEnvelopes() { func (rs *ReactorShim) handlePeerErrors() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { - for pErr := range cs.Channel.errCh { + for pErr := range cs.errCh { if pErr.NodeID != "" { peer := rs.Switch.peers.Get(pErr.NodeID) if peer == nil { @@ -311,7 +320,7 @@ func (rs *ReactorShim) Receive(chID byte, src Peer, msgBytes []byte) { } select { - case channelShim.Channel.inCh <- Envelope{From: src.ID(), Message: msg}: + case channelShim.inCh <- Envelope{From: src.ID(), Message: msg}: rs.Logger.Debug("proxied envelope", "reactor", rs.Name, "ch_id", cID, "peer", src.ID()) case <-channelShim.Channel.Done(): diff --git a/p2p/shim_test.go b/p2p/shim_test.go index a12a184cb..b91af366d 100644 --- a/p2p/shim_test.go +++ b/p2p/shim_test.go @@ -92,7 +92,7 @@ func TestReactorShim_GetChannel(t *testing.T) { p2pCh := rts.shim.GetChannel(p2p.ChannelID(channelID1)) require.NotNil(t, p2pCh) - require.Equal(t, p2pCh.ID(), p2p.ChannelID(channelID1)) + require.Equal(t, p2pCh.ID, p2p.ChannelID(channelID1)) p2pCh = rts.shim.GetChannel(p2p.ChannelID(byte(0x03))) require.Nil(t, p2pCh) @@ -178,11 +178,11 @@ func TestReactorShim_Receive(t *testing.T) { // Simulate receiving the envelope in some real reactor and replying back with // the same envelope and then closing the Channel. go func() { - e := <-p2pCh.Channel.In() + e := <-p2pCh.Channel.In require.Equal(t, peerIDA, e.From) require.NotNil(t, e.Message) - p2pCh.Channel.Out() <- p2p.Envelope{To: e.From, Message: e.Message} + p2pCh.Channel.Out <- p2p.Envelope{To: e.From, Message: e.Message} p2pCh.Channel.Close() wg.Done() }() @@ -200,7 +200,7 @@ func TestReactorShim_Receive(t *testing.T) { // Since p2pCh was closed in the simulated reactor above, calling Receive // should not block. rts.shim.Receive(channelID1, peerA, bz) - require.Empty(t, p2pCh.Channel.In()) + require.Empty(t, p2pCh.Channel.In) peerA.AssertExpectations(t) } diff --git a/p2p/transport.go b/p2p/transport.go index dc01e5efc..ff782f966 100644 --- a/p2p/transport.go +++ b/p2p/transport.go @@ -10,6 +10,8 @@ import ( "github.com/tendermint/tendermint/p2p/conn" ) +//go:generate mockery --case underscore --name Transport|Connection + const ( // defaultProtocol is the default protocol used for NodeAddress when // a protocol isn't explicitly given as a URL scheme. diff --git a/p2p/transport_test.go b/p2p/transport_test.go index 56aa3d92b..657cb4b35 100644 --- a/p2p/transport_test.go +++ b/p2p/transport_test.go @@ -19,11 +19,8 @@ import ( // transportFactory is used to set up transports for tests. type transportFactory func(t *testing.T) p2p.Transport -var ( - ctx = context.Background() // convenience context - chID = p2p.ChannelID(1) // channel ID for use in tests - testTransports = map[string]transportFactory{} // registry for withTransports -) +// testTransports is a registry of transport factories for withTransports(). +var testTransports = map[string]transportFactory{} // withTransports is a test helper that runs a test against all transports // registered in testTransports. diff --git a/statesync/reactor.go b/statesync/reactor.go index f88e2d1e0..a7e536b0f 100644 --- a/statesync/reactor.go +++ b/statesync/reactor.go @@ -170,7 +170,7 @@ func (r *Reactor) handleSnapshotMessage(envelope p2p.Envelope) error { "height", snapshot.Height, "format", snapshot.Format, ) - r.snapshotCh.Out() <- p2p.Envelope{ + r.snapshotCh.Out <- p2p.Envelope{ To: envelope.From, Message: &ssproto.SnapshotsResponse{ Height: snapshot.Height, @@ -254,7 +254,7 @@ func (r *Reactor) handleChunkMessage(envelope p2p.Envelope) error { "chunk", msg.Index, "peer", envelope.From, ) - r.chunkCh.Out() <- p2p.Envelope{ + r.chunkCh.Out <- p2p.Envelope{ To: envelope.From, Message: &ssproto.ChunkResponse{ Height: msg.Height, @@ -343,10 +343,10 @@ func (r *Reactor) processSnapshotCh() { for { select { - case envelope := <-r.snapshotCh.In(): - if err := r.handleMessage(r.snapshotCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID(), "envelope", envelope, "err", err) - r.snapshotCh.Error() <- p2p.PeerError{ + case envelope := <-r.snapshotCh.In: + if err := r.handleMessage(r.snapshotCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID, "envelope", envelope, "err", err) + r.snapshotCh.Error <- p2p.PeerError{ NodeID: envelope.From, Err: err, } @@ -369,10 +369,10 @@ func (r *Reactor) processChunkCh() { for { select { - case envelope := <-r.chunkCh.In(): - if err := r.handleMessage(r.chunkCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID(), "envelope", envelope, "err", err) - r.chunkCh.Error() <- p2p.PeerError{ + case envelope := <-r.chunkCh.In: + if err := r.handleMessage(r.chunkCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID, "envelope", envelope, "err", err) + r.chunkCh.Error <- p2p.PeerError{ NodeID: envelope.From, Err: err, } @@ -470,12 +470,12 @@ func (r *Reactor) Sync(stateProvider StateProvider, discoveryTime time.Duration) return sm.State{}, nil, errors.New("a state sync is already in progress") } - r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out(), r.chunkCh.Out(), r.tempDir) + r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out, r.chunkCh.Out, r.tempDir) r.mtx.Unlock() // request snapshots from all currently connected peers r.Logger.Debug("requesting snapshots from known peers") - r.snapshotCh.Out() <- p2p.Envelope{ + r.snapshotCh.Out <- p2p.Envelope{ Broadcast: true, Message: &ssproto.SnapshotsRequest{}, } diff --git a/test/maverick/node/node.go b/test/maverick/node/node.go index 4ad921e9e..d094e880a 100644 --- a/test/maverick/node/node.go +++ b/test/maverick/node/node.go @@ -791,7 +791,7 @@ func NewNode(config *cfg.Config, // TODO: Fetch and provide real options and do proper p2p bootstrapping. // TODO: Use a persistent peer database. - peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) if err != nil { return nil, err } From 17c511db67bdda242cf541151e12282fa7befce7 Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Thu, 4 Feb 2021 10:38:46 +0100 Subject: [PATCH 14/15] .github/workflows: cleanup yaml for e2e nightlies (#6049) --- .github/workflows/e2e-nightly-34x.yml | 6 +++++- .github/workflows/e2e-nightly-master.yml | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-nightly-34x.yml b/.github/workflows/e2e-nightly-34x.yml index bd1a00358..97be84eab 100644 --- a/.github/workflows/e2e-nightly-34x.yml +++ b/.github/workflows/e2e-nightly-34x.yml @@ -1,5 +1,9 @@ # Runs randomly generated E2E testnets nightly # on the 0.34.x release branch + +# !! If you change something in this file, you probably want +# to update the e2e-nightly-master workflow as well! + name: e2e-nightly-34x on: workflow_dispatch: # allow running workflow manually, in theory @@ -56,7 +60,7 @@ jobs: SLACK_FOOTER: '' e2e-nightly-success: # may turn this off once they seem to pass consistently - needs: e2e-nightly-test-2 + needs: e2e-nightly-test if: ${{ success() }} runs-on: ubuntu-latest steps: diff --git a/.github/workflows/e2e-nightly-master.yml b/.github/workflows/e2e-nightly-master.yml index de88499d9..3a4e4fdaa 100644 --- a/.github/workflows/e2e-nightly-master.yml +++ b/.github/workflows/e2e-nightly-master.yml @@ -1,4 +1,8 @@ # Runs randomly generated E2E testnets nightly on master + +# !! If you change something in this file, you probably want +# to update the e2e-nightly-34x workflow as well! + name: e2e-nightly-master on: workflow_dispatch: # allow running workflow manually @@ -49,7 +53,7 @@ jobs: SLACK_USERNAME: Nightly E2E Tests SLACK_ICON_EMOJI: ':skull:' SLACK_COLOR: danger - SLACK_MESSAGE: Nightly E2E (copy) tests failed on master + SLACK_MESSAGE: Nightly E2E tests failed on master SLACK_FOOTER: '' e2e-nightly-success: # may turn this off once they seem to pass consistently @@ -65,5 +69,5 @@ jobs: SLACK_USERNAME: Nightly E2E Tests SLACK_ICON_EMOJI: ':white_check_mark:' SLACK_COLOR: good - SLACK_MESSAGE: Nightly E2E (copy) tests passed on master + SLACK_MESSAGE: Nightly E2E tests passed on master SLACK_FOOTER: '' From 740008e32be6a916b55a44db2c58041bbb205955 Mon Sep 17 00:00:00 2001 From: Tess Rinearson Date: Thu, 4 Feb 2021 10:51:03 +0100 Subject: [PATCH 15/15] .github: clean up PR template (#6050) I got tired of seeing the literal phrase "Closes #XXX" left in PR bodies. Also, this template isn't usually viewed as rendered markdown, so I've removed the markdown formatting and the "Description" heading (which usually gets deleted anyways). --- .github/PULL_REQUEST_TEMPLATE.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 975ad1cf5..d13124fa9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,7 +1,7 @@ -## Description +Please add a description of the changes that this PR introduces and the files that +are the most critical to review. -_Please add a description of the changes that this PR introduces and the files that -are the most critical to review._ +If this PR fixes an open Issue, please include "Closes #XXX" (where "XXX" is the Issue number) +so that GitHub will automatically close the Issue when this PR is merged. -Closes: #XXX