diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 975ad1cf5..d13124fa9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,7 +1,7 @@ -## Description +Please add a description of the changes that this PR introduces and the files that +are the most critical to review. -_Please add a description of the changes that this PR introduces and the files that -are the most critical to review._ +If this PR fixes an open Issue, please include "Closes #XXX" (where "XXX" is the Issue number) +so that GitHub will automatically close the Issue when this PR is merged. -Closes: #XXX diff --git a/.github/workflows/e2e-nightly-34x.yml b/.github/workflows/e2e-nightly-34x.yml new file mode 100644 index 000000000..97be84eab --- /dev/null +++ b/.github/workflows/e2e-nightly-34x.yml @@ -0,0 +1,76 @@ +# Runs randomly generated E2E testnets nightly +# on the 0.34.x release branch + +# !! If you change something in this file, you probably want +# to update the e2e-nightly-master workflow as well! + +name: e2e-nightly-34x +on: + workflow_dispatch: # allow running workflow manually, in theory + schedule: + - cron: '0 2 * * *' + +jobs: + e2e-nightly-test: + # Run parallel jobs for the listed testnet groups (must match the + # ./build/generator -g flag) + strategy: + fail-fast: false + matrix: + group: ['00', '01', '02', '03'] + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/setup-go@v2 + with: + go-version: '1.15' + + - uses: actions/checkout@v2 + with: + ref: 'v0.34.x' + + - name: Build + working-directory: test/e2e + # Run make jobs in parallel, since we can't run steps in parallel. + run: make -j2 docker generator runner + + - name: Generate testnets + working-directory: test/e2e + # When changing -g, also change the matrix groups above + run: ./build/generator -g 4 -d networks/nightly + + - name: Run testnets in group ${{ matrix.group }} + working-directory: test/e2e + run: ./run-multiple.sh networks/nightly/*-group${{ matrix.group }}-*.toml + + e2e-nightly-fail: + needs: e2e-nightly-test + if: ${{ failure() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on failure + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':skull:' + SLACK_COLOR: danger + SLACK_MESSAGE: Nightly E2E tests failed on v0.34.x + SLACK_FOOTER: '' + + e2e-nightly-success: # may turn this off once they seem to pass consistently + needs: e2e-nightly-test + if: ${{ success() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on success + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':white_check_mark:' + SLACK_COLOR: good + SLACK_MESSAGE: Nightly E2E tests passed on v0.34.x + SLACK_FOOTER: '' diff --git a/.github/workflows/e2e-nightly.yml b/.github/workflows/e2e-nightly-master.yml similarity index 60% rename from .github/workflows/e2e-nightly.yml rename to .github/workflows/e2e-nightly-master.yml index b4be46954..3a4e4fdaa 100644 --- a/.github/workflows/e2e-nightly.yml +++ b/.github/workflows/e2e-nightly-master.yml @@ -1,20 +1,22 @@ -# Runs randomly generated E2E testnets nightly. -name: e2e-nightly +# Runs randomly generated E2E testnets nightly on master + +# !! If you change something in this file, you probably want +# to update the e2e-nightly-34x workflow as well! + +name: e2e-nightly-master on: workflow_dispatch: # allow running workflow manually schedule: - cron: '0 2 * * *' jobs: - e2e-nightly-test: + e2e-nightly-test-2: # Run parallel jobs for the listed testnet groups (must match the # ./build/generator -g flag) strategy: fail-fast: false matrix: group: ['00', '01', '02', '03'] - # todo: expand to multiple versions after 0.35 release - branch: ['master', 'v0.34.x'] runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -23,8 +25,6 @@ jobs: go-version: '1.15' - uses: actions/checkout@v2 - with: - ref: ${{ matrix.branch}} - name: Build working-directory: test/e2e @@ -40,8 +40,8 @@ jobs: working-directory: test/e2e run: ./run-multiple.sh networks/nightly/*-group${{ matrix.group }}-*.toml - e2e-nightly-fail: - needs: e2e-nightly-test + e2e-nightly-fail-2: + needs: e2e-nightly-test-2 if: ${{ failure() }} runs-on: ubuntu-latest steps: @@ -53,5 +53,21 @@ jobs: SLACK_USERNAME: Nightly E2E Tests SLACK_ICON_EMOJI: ':skull:' SLACK_COLOR: danger - SLACK_MESSAGE: Nightly E2E tests failed + SLACK_MESSAGE: Nightly E2E tests failed on master + SLACK_FOOTER: '' + + e2e-nightly-success: # may turn this off once they seem to pass consistently + needs: e2e-nightly-test-2 + if: ${{ success() }} + runs-on: ubuntu-latest + steps: + - name: Notify Slack on success + uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0 + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + SLACK_CHANNEL: tendermint-internal + SLACK_USERNAME: Nightly E2E Tests + SLACK_ICON_EMOJI: ':white_check_mark:' + SLACK_COLOR: good + SLACK_MESSAGE: Nightly E2E tests passed on master SLACK_FOOTER: '' diff --git a/.goreleaser.yml b/.goreleaser.yml index cf3a35f2b..267d4e4ac 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -1,4 +1,4 @@ -project_name: Tendermint +project_name: tendermint env: # Require use of Go modules. diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index c6f45faa5..184df82cc 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -28,8 +28,7 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi - [proto/p2p] Renamed `DefaultNodeInfo` and `DefaultNodeInfoOther` to `NodeInfo` and `NodeInfoOther` (@erikgrinaker) - [proto/p2p] Rename `NodeInfo.default_node_id` to `node_id` (@erikgrinaker) - [libs/os] Kill() and {Must,}{Read,Write}File() functions have been removed. (@alessio) - - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters) - - [state] \#5864 Use an iterator when pruning state (@cmwaters) + - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters) - [state] \#5864 Use an iterator when pruning state (@cmwaters) - [types] \#6023 Remove `tm2pb.Header`, `tm2pb.BlockID`, `tm2pb.PartSetHeader` and `tm2pb.NewValidatorUpdate`. - Each of the above types has a `ToProto` and `FromProto` method or function which replaced this logic. - [rpc/client/http] \#6022 Change `timeout` type to `time.Duration` in `NewWithTimeout` @@ -66,3 +65,4 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi - [blockchain/v1] [\#5701](https://github.com/tendermint/tendermint/pull/5701) Handle peers without blocks (@melekes) - [blockchain/v1] \#5711 Fix deadlock (@melekes) - [light] \#6022 Fix a bug when the number of validators equals 100 (@melekes) +- [light] \#6026 Fix a bug when height isn't provided for the rpc calls: `/commit` and `/validators` (@cmwaters) diff --git a/blockchain/v0/reactor.go b/blockchain/v0/reactor.go index 7dd4d4ea3..af79fe563 100644 --- a/blockchain/v0/reactor.go +++ b/blockchain/v0/reactor.go @@ -84,7 +84,7 @@ type Reactor struct { fastSync bool blockchainCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} requestsCh <-chan BlockRequest @@ -104,7 +104,7 @@ func NewReactor( store *store.BlockStore, consReactor consensusReactor, blockchainCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, fastSync bool, ) (*Reactor, error) { if state.LastBlockHeight != store.Height() { @@ -194,7 +194,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) { return } - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: peerID, Message: &bcproto.BlockResponse{Block: blockProto}, } @@ -203,7 +203,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) { } r.Logger.Info("peer requesting a block we do not have", "peer", peerID, "height", msg.Height) - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: peerID, Message: &bcproto.NoBlockResponse{Height: msg.Height}, } @@ -229,7 +229,7 @@ func (r *Reactor) handleBlockchainMessage(envelope p2p.Envelope) error { r.pool.AddBlock(envelope.From, block, block.Size()) case *bcproto.StatusRequest: - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: envelope.From, Message: &bcproto.StatusResponse{ Height: r.store.Height(), @@ -284,13 +284,12 @@ func (r *Reactor) processBlockchainCh() { for { select { - case envelope := <-r.blockchainCh.In(): - if err := r.handleMessage(r.blockchainCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID(), "envelope", envelope, "err", err) - r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.blockchainCh.In: + if err := r.handleMessage(r.blockchainCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID, "envelope", envelope, "err", err) + r.blockchainCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -303,26 +302,26 @@ func (r *Reactor) processBlockchainCh() { // processPeerUpdate processes a PeerUpdate. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) // XXX: Pool#RedoRequest can sometimes give us an empty peer. - if len(peerUpdate.PeerID) == 0 { + if len(peerUpdate.NodeID) == 0 { return } switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: + case p2p.PeerStatusUp: // send a status update the newly added peer - r.blockchainCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + r.blockchainCh.Out <- p2p.Envelope{ + To: peerUpdate.NodeID, Message: &bcproto.StatusResponse{ Base: r.store.Base(), Height: r.store.Height(), }, } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.pool.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.pool.RemovePeer(peerUpdate.NodeID) } } @@ -377,16 +376,15 @@ func (r *Reactor) requestRoutine() { return case request := <-r.requestsCh: - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ To: request.PeerID, Message: &bcproto.BlockRequest{Height: request.Height}, } case pErr := <-r.errorsCh: - r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: pErr.peerID, - Err: pErr.err, - Severity: p2p.PeerErrorSeverityLow, + r.blockchainCh.Error <- p2p.PeerError{ + NodeID: pErr.peerID, + Err: pErr.err, } case <-statusUpdateTicker.C: @@ -395,7 +393,7 @@ func (r *Reactor) requestRoutine() { go func() { defer r.poolWG.Done() - r.blockchainCh.Out() <- p2p.Envelope{ + r.blockchainCh.Out <- p2p.Envelope{ Broadcast: true, Message: &bcproto.StatusRequest{}, } @@ -524,18 +522,16 @@ FOR_LOOP: // NOTE: We've already removed the peer's request, but we still need // to clean up the rest. peerID := r.pool.RedoRequest(first.Height) - r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + r.blockchainCh.Error <- p2p.PeerError{ + NodeID: peerID, + Err: err, } peerID2 := r.pool.RedoRequest(second.Height) if peerID2 != peerID { - r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID2, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + r.blockchainCh.Error <- p2p.PeerError{ + NodeID: peerID2, + Err: err, } } diff --git a/blockchain/v0/reactor_test.go b/blockchain/v0/reactor_test.go index 7f396d535..2dbf6d743 100644 --- a/blockchain/v0/reactor_test.go +++ b/blockchain/v0/reactor_test.go @@ -36,7 +36,7 @@ type reactorTestSuite struct { blockchainPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -200,8 +200,8 @@ func simulateRouter(primary *reactorTestSuite, suites []*reactorTestSuite, dropC primary.reactor.Logger.Debug("dropped peer error", "err", pErr.Err) } else { primary.peerUpdatesCh <- p2p.PeerUpdate{ - PeerID: pErr.PeerID, - Status: p2p.PeerStatusRemoved, + NodeID: pErr.NodeID, + Status: p2p.PeerStatusDown, } } } @@ -229,7 +229,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -251,7 +251,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { // deadlocks or race conditions within the context of poolRoutine. testSuites[1].peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: testSuites[0].peerID, + NodeID: testSuites[0].peerID, } } @@ -276,7 +276,7 @@ func TestReactor_NoBlockResponse(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -341,7 +341,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -388,7 +388,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { for _, s := range testSuites[:len(testSuites)-1] { newSuite.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: s.peerID, + NodeID: s.peerID, } } diff --git a/docs/nodes/configuration.md b/docs/nodes/configuration.md index 7db0cae3d..30d22234f 100644 --- a/docs/nodes/configuration.md +++ b/docs/nodes/configuration.md @@ -385,7 +385,7 @@ peer-query-maj23-sleep-duration = "2s" # Options: # 1) "null" # 2) "kv" (default) - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). -# - When "kv" is chosen "tx.height" and "tx.hash" will always be indexed. +# - When "kv" is chosen "tx.height" and "tx.hash" will always be indexed. indexer = "kv" ####################################################### @@ -487,6 +487,7 @@ Here's a brief summary of the timeouts: This section will cover settings within the p2p section of the `config.toml`. - `external-address` = is the address that will be advertised for other nodes to use. We recommend setting this field with your public IP and p2p port. + - > We recommend setting an external address. When used in a private network, Tendermint Core currently doesn't advertise the node's public address. There is active and ongoing work to improve the P2P system, but this is a helpful workaround for now. - `seeds` = is a list of comma separated seed nodes that you will connect upon a start and ask for peers. A seed node is a node that does not participate in consensus but only helps propagate peers to nodes in the networks - `persistent-peers` = is a list of comma separated peers that you will always want to be connected to. If you're already connected to the maximum number of peers, persistent peers will not be added. - `max-num-inbound-peers` = is the maximum number of peers you will accept inbound connections from at one time (where they dial your address and initiate the connection). @@ -494,4 +495,4 @@ This section will cover settings within the p2p section of the `config.toml`. - `unconditional-peer-ids` = is similar to `persistent-peers` except that these peers will be connected to even if you are already connected to the maximum number of peers. This can be a validator node ID on your sentry node. - `pex` = turns the peer exchange reactor on or off. Validator node will want the `pex` turned off so it would not begin gossiping to unknown peers on the network. PeX can also be turned off for statically configured networks with fixed network connectivity. For full nodes on open, dynamic networks, it should be turned on. - `seed-mode` = is used for when node operators want to run their node as a seed node. Seed node's run a variation of the PeX protocol that disconnects from peers after sending them a list of peers to connect to. To minimize the servers usage, it is recommended to set the mempool's size to 0. -- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id. +- `private-peer-ids` = is a comma-separated list of node ids that will _not_ be exposed to other peers (i.e., you will not tell other peers about the ids in this list). This can be filled with a validator's node id. diff --git a/evidence/pool.go b/evidence/pool.go index c67d3fc2f..7b2d95ad5 100644 --- a/evidence/pool.go +++ b/evidence/pool.go @@ -23,8 +23,8 @@ import ( const ( // prefixes are unique across all tm db's - prefixCommitted = int64(8) - prefixPending = int64(9) + prefixCommitted = int64(9) + prefixPending = int64(10) ) // Pool maintains a pool of valid evidence to be broadcasted and committed @@ -132,7 +132,7 @@ func (evpool *Pool) Update(state sm.State, ev types.EvidenceList) { evpool.updateState(state) // move committed evidence out from the pending pool and into the committed pool - evpool.markEvidenceAsCommitted(ev) + evpool.markEvidenceAsCommitted(ev, state.LastBlockHeight) // Prune pending evidence when it has expired. This also updates when the next // evidence will expire. @@ -386,23 +386,18 @@ func (evpool *Pool) addPendingEvidence(ev types.Evidence) error { return nil } -func (evpool *Pool) removePendingEvidence(evidence types.Evidence) { - key := keyPending(evidence) - if err := evpool.evidenceStore.Delete(key); err != nil { - evpool.logger.Error("failed to delete pending evidence", "err", err) - } else { - atomic.AddUint32(&evpool.evidenceSize, ^uint32(0)) - evpool.logger.Debug("deleted pending evidence", "evidence", evidence) - } -} - // markEvidenceAsCommitted processes all the evidence in the block, marking it as // committed and removing it from the pending database. -func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { +func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList, height uint64) { blockEvidenceMap := make(map[string]struct{}, len(evidence)) + batch := evpool.evidenceStore.NewBatch() + defer batch.Close() + for _, ev := range evidence { if evpool.isPending(ev) { - evpool.removePendingEvidence(ev) + if err := batch.Delete(keyPending(ev)); err != nil { + evpool.logger.Error("failed to batch pending evidence", "err", err) + } blockEvidenceMap[evMapKey(ev)] = struct{}{} } @@ -410,7 +405,7 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { // we only need to record the height that it was saved at. key := keyCommitted(ev) - h := gogotypes.UInt64Value{Value: ev.Height()} + h := gogotypes.UInt64Value{Value: height} evBytes, err := proto.Marshal(&h) if err != nil { evpool.logger.Error("failed to marshal committed evidence", "key(height/hash)", key, "err", err) @@ -424,10 +419,22 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) { evpool.logger.Debug("marked evidence as committed", "evidence", ev) } - // remove committed evidence from the clist - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) + // check if we need to remove any pending evidence + if len(blockEvidenceMap) == 0 { + return } + + // remove committed evidence from pending bucket + if err := batch.WriteSync(); err != nil { + evpool.logger.Error("failed to batch delete pending evidence", "err", err) + return + } + + // remove committed evidence from the clist + evpool.removeEvidenceFromList(blockEvidenceMap) + + // update the evidence size + atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1)) } // listEvidence retrieves lists evidence from oldest to newest within maxBytes. @@ -481,44 +488,73 @@ func (evpool *Pool) listEvidence(prefixKey int64, maxBytes int64) ([]types.Evide } func (evpool *Pool) removeExpiredPendingEvidence() (uint64, time.Time) { - iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending)) - if err != nil { - evpool.logger.Error("failed to iterate over pending evidence", "err", err) + batch := evpool.evidenceStore.NewBatch() + defer batch.Close() + + height, time, blockEvidenceMap := evpool.batchExpiredPendingEvidence(batch) + + // if we haven't removed any evidence then return early + if len(blockEvidenceMap) == 0 { + return height, time + } + + evpool.logger.Debug("removing expired evidence", + "height", evpool.State().LastBlockHeight, + "time", evpool.State().LastBlockTime, + "expired evidence", len(blockEvidenceMap), + ) + + // remove expired evidence from pending bucket + if err := batch.WriteSync(); err != nil { + evpool.logger.Error("failed to batch delete pending evidence", "err", err) return evpool.State().LastBlockHeight, evpool.State().LastBlockTime } - defer iter.Close() + // remove evidence from the clist + evpool.removeEvidenceFromList(blockEvidenceMap) + // update the evidence size + atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1)) + + return height, time +} + +func (evpool *Pool) batchExpiredPendingEvidence(batch dbm.Batch) (uint64, time.Time, map[string]struct{}) { blockEvidenceMap := make(map[string]struct{}) + iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending)) + if err != nil { + evpool.logger.Error("failed to iterate over pending evidence", "err", err) + return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap + } + defer iter.Close() for ; iter.Valid(); iter.Next() { ev, err := bytesToEv(iter.Value()) if err != nil { - evpool.logger.Error("failed to transition evidence from protobuf", "err", err) + evpool.logger.Error("failed to transition evidence from protobuf", "err", err, "ev", ev) continue } + // if true, we have looped through all expired evidence if !evpool.isExpired(ev.Height(), ev.Time()) { - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) - } - // Return the height and time with which this evidence will have expired // so we know when to prune next. return ev.Height() + uint64(evpool.State().ConsensusParams.Evidence.MaxAgeNumBlocks+1), - ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second) + ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second), + blockEvidenceMap } - evpool.removePendingEvidence(ev) + // else add to the batch + if err := batch.Delete(iter.Key()); err != nil { + evpool.logger.Error("failed to batch evidence", "err", err, "ev", ev) + continue + } + + // and add to the map to remove the evidence from the clist blockEvidenceMap[evMapKey(ev)] = struct{}{} } - // we either have no pending evidence or all evidence has expired - if len(blockEvidenceMap) != 0 { - evpool.removeEvidenceFromList(blockEvidenceMap) - } - - return evpool.State().LastBlockHeight, evpool.State().LastBlockTime + return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap } func (evpool *Pool) removeEvidenceFromList( diff --git a/evidence/pool_test.go b/evidence/pool_test.go index 40f1122bd..5831a22f8 100644 --- a/evidence/pool_test.go +++ b/evidence/pool_test.go @@ -172,7 +172,7 @@ func TestEvidencePoolUpdate(t *testing.T) { pool, val := defaultTestPool(t, height) state := pool.State() - // create new block (no need to save it to blockStore) + // create two lots of old evidence that we expect to be pruned when we update prunedEv := types.NewMockDuplicateVoteEvidenceWithValidator( 1, defaultEvidenceTime.Add(1*time.Minute), @@ -180,7 +180,15 @@ func TestEvidencePoolUpdate(t *testing.T) { evidenceChainID, ) + notPrunedEv := types.NewMockDuplicateVoteEvidenceWithValidator( + 2, + defaultEvidenceTime.Add(2*time.Minute), + val, + evidenceChainID, + ) + require.NoError(t, pool.AddEvidence(prunedEv)) + require.NoError(t, pool.AddEvidence(notPrunedEv)) ev := types.NewMockDuplicateVoteEvidenceWithValidator( height, @@ -195,14 +203,23 @@ func TestEvidencePoolUpdate(t *testing.T) { state.LastBlockHeight = height + 1 state.LastBlockTime = defaultEvidenceTime.Add(22 * time.Minute) + evList, _ := pool.PendingEvidence(2 * defaultEvidenceMaxBytes) + require.Equal(t, 2, len(evList)) + + require.Equal(t, uint32(2), pool.Size()) + require.NoError(t, pool.CheckEvidence(types.EvidenceList{ev})) + evList, _ = pool.PendingEvidence(3 * defaultEvidenceMaxBytes) + require.Equal(t, 3, len(evList)) + + require.Equal(t, uint32(3), pool.Size()) + pool.Update(state, block.Evidence.Evidence) // a) Update marks evidence as committed so pending evidence should be empty - evList, evSize := pool.PendingEvidence(defaultEvidenceMaxBytes) - require.Empty(t, evList) - require.Zero(t, evSize) + evList, _ = pool.PendingEvidence(defaultEvidenceMaxBytes) + require.Equal(t, []types.Evidence{notPrunedEv}, evList) // b) If we try to check this evidence again it should fail because it has already been committed err := pool.CheckEvidence(types.EvidenceList{ev}) diff --git a/evidence/reactor.go b/evidence/reactor.go index 643d9915a..aa3db318e 100644 --- a/evidence/reactor.go +++ b/evidence/reactor.go @@ -55,7 +55,7 @@ type Reactor struct { evpool *Pool eventBus *types.EventBus evidenceCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} peerWG sync.WaitGroup @@ -70,7 +70,7 @@ type Reactor struct { func NewReactor( logger log.Logger, evidenceCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, evpool *Pool, ) *Reactor { r := &Reactor{ @@ -192,13 +192,12 @@ func (r *Reactor) processEvidenceCh() { for { select { - case envelope := <-r.evidenceCh.In(): - if err := r.handleMessage(r.evidenceCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID(), "envelope", envelope, "err", err) - r.evidenceCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.evidenceCh.In: + if err := r.handleMessage(r.evidenceCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID, "envelope", envelope, "err", err) + r.evidenceCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -221,7 +220,7 @@ func (r *Reactor) processEvidenceCh() { // // REF: https://github.com/tendermint/tendermint/issues/4727 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -240,21 +239,21 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast evidence to that peer. - _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - go r.broadcastEvidenceLoop(peerUpdate.PeerID, closer) + go r.broadcastEvidenceLoop(peerUpdate.NodeID, closer) } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: + case p2p.PeerStatusDown: // Check if we've started an evidence broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer evidence broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } @@ -338,7 +337,7 @@ func (r *Reactor) broadcastEvidenceLoop(peerID p2p.NodeID, closer *tmsync.Closer // and thus would not be able to process the evidence correctly. Also, the // peer may receive this piece of evidence multiple times if it added and // removed frequently from the broadcasting peer. - r.evidenceCh.Out() <- p2p.Envelope{ + r.evidenceCh.Out <- p2p.Envelope{ To: peerID, Message: &tmproto.EvidenceList{ Evidence: []tmproto.Evidence{*evProto}, diff --git a/evidence/reactor_test.go b/evidence/reactor_test.go index c20b4e980..5ed1c818e 100644 --- a/evidence/reactor_test.go +++ b/evidence/reactor_test.go @@ -42,7 +42,7 @@ type reactorTestSuite struct { evidencePeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, logger log.Logger, pool *evidence.Pool, chBuf uint) *reactorTestSuite { @@ -224,18 +224,18 @@ func TestReactorMultiDisconnect(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure "disconnecting" the secondary peer from the primary more than once // is handled gracefully. primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } @@ -276,7 +276,7 @@ func TestReactorBroadcastEvidence(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -327,7 +327,7 @@ func TestReactorBroadcastEvidence_Lagging(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -378,7 +378,7 @@ func TestReactorBroadcastEvidence_Pending(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -438,7 +438,7 @@ func TestReactorBroadcastEvidence_Committed(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -487,7 +487,7 @@ func TestReactorBroadcastEvidence_FullyConnected(t *testing.T) { if suiteI.peerID != suiteJ.peerID { suiteI.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suiteJ.peerID, + NodeID: suiteJ.peerID, } } } @@ -530,7 +530,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // have the secondary reactor receive only half the evidence @@ -539,7 +539,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // disconnect the peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure the secondary only received half of the evidence before being diff --git a/libs/sync/waker.go b/libs/sync/waker.go new file mode 100644 index 000000000..0aff3ddf8 --- /dev/null +++ b/libs/sync/waker.go @@ -0,0 +1,30 @@ +package sync + +// Waker is used to wake up a sleeper when some event occurs. It debounces +// multiple wakeup calls occurring between each sleep, and wakeups are +// non-blocking to avoid having to coordinate goroutines. +type Waker struct { + wakeCh chan struct{} +} + +// NewWaker creates a new Waker. +func NewWaker() *Waker { + return &Waker{ + wakeCh: make(chan struct{}, 1), // buffer used for debouncing + } +} + +// Sleep returns a channel that blocks until Wake() is called. +func (w *Waker) Sleep() <-chan struct{} { + return w.wakeCh +} + +// Wake wakes up the sleeper. +func (w *Waker) Wake() { + // A non-blocking send with a size 1 buffer ensures that we never block, and + // that we queue up at most a single wakeup call between each Sleep(). + select { + case w.wakeCh <- struct{}{}: + default: + } +} diff --git a/libs/sync/waker_test.go b/libs/sync/waker_test.go new file mode 100644 index 000000000..dfd337e0e --- /dev/null +++ b/libs/sync/waker_test.go @@ -0,0 +1,47 @@ +package sync_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + tmsync "github.com/tendermint/tendermint/libs/sync" +) + +func TestWaker(t *testing.T) { + + // A new waker should block when sleeping. + waker := tmsync.NewWaker() + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } + + // Wakeups should not block, and should cause the next sleeper to awaken. + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + // Multiple wakeups should only wake a single sleeper. + waker.Wake() + waker.Wake() + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } +} diff --git a/light/provider/http/http.go b/light/provider/http/http.go index 33172ce35..371f44549 100644 --- a/light/provider/http/http.go +++ b/light/provider/http/http.go @@ -70,7 +70,7 @@ func (p *http) LightBlock(ctx context.Context, height uint64) (*types.LightBlock return nil, err } - vs, err := p.validatorSet(ctx, h) + vs, err := p.validatorSet(ctx, &sh.Height) if err != nil { return nil, err } diff --git a/light/rpc/client.go b/light/rpc/client.go index 2debbb2bd..7e205082e 100644 --- a/light/rpc/client.go +++ b/light/rpc/client.go @@ -29,6 +29,7 @@ type KeyPathFunc func(path string, key []byte) (merkle.KeyPath, error) //go:generate mockery --case underscore --name LightClient type LightClient interface { ChainID() string + Update(ctx context.Context, now time.Time) (*types.LightBlock, error) VerifyLightBlockAtHeight(ctx context.Context, height uint64, now time.Time) (*types.LightBlock, error) TrustedLightBlock(height uint64) (*types.LightBlock, error) } @@ -131,7 +132,8 @@ func (c *Client) ABCIQueryWithOptions(ctx context.Context, path string, data tmb // Update the light client if we're behind. // NOTE: AppHash for height H is in header H+1. - l, err := c.updateLightClientIfNeededTo(ctx, resp.Height+1) + nextHeight := resp.Height + 1 + l, err := c.updateLightClientIfNeededTo(ctx, &nextHeight) if err != nil { return nil, err } @@ -214,7 +216,7 @@ func (c *Client) ConsensusParams(ctx context.Context, height *uint64) (*ctypes.R } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.BlockHeight) + l, err := c.updateLightClientIfNeededTo(ctx, &res.BlockHeight) if err != nil { return nil, err } @@ -253,7 +255,7 @@ func (c *Client) BlockchainInfo(ctx context.Context, minHeight, maxHeight uint64 // Update the light client if we're behind. if len(res.BlockMetas) > 0 { lastHeight := res.BlockMetas[len(res.BlockMetas)-1].Header.Height - if _, err := c.updateLightClientIfNeededTo(ctx, lastHeight); err != nil { + if _, err := c.updateLightClientIfNeededTo(ctx, &lastHeight); err != nil { return nil, err } } @@ -297,7 +299,7 @@ func (c *Client) Block(ctx context.Context, height *uint64) (*ctypes.ResultBlock } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height) if err != nil { return nil, err } @@ -331,7 +333,7 @@ func (c *Client) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBl } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height) if err != nil { return nil, err } @@ -372,7 +374,8 @@ func (c *Client) BlockResults(ctx context.Context, height *uint64) (*ctypes.Resu } // Update the light client if we're behind. - trustedBlock, err := c.updateLightClientIfNeededTo(ctx, h+1) + nextHeight := h + 1 + trustedBlock, err := c.updateLightClientIfNeededTo(ctx, &nextHeight) if err != nil { return nil, err } @@ -410,7 +413,8 @@ func (c *Client) BlockResults(ctx context.Context, height *uint64) (*ctypes.Resu func (c *Client) Commit(ctx context.Context, height *uint64) (*ctypes.ResultCommit, error) { // Update the light client if we're behind and retrieve the light block at the requested height - l, err := c.updateLightClientIfNeededTo(ctx, *height) + // or at the latest height if no height is provided. + l, err := c.updateLightClientIfNeededTo(ctx, height) if err != nil { return nil, err } @@ -435,7 +439,7 @@ func (c *Client) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.Resul } // Update the light client if we're behind. - l, err := c.updateLightClientIfNeededTo(ctx, res.Height) + l, err := c.updateLightClientIfNeededTo(ctx, &res.Height) if err != nil { return nil, err } @@ -452,8 +456,9 @@ func (c *Client) TxSearch(ctx context.Context, query string, prove bool, page, p // Validators fetches and verifies validators. func (c *Client) Validators(ctx context.Context, height *uint64, pagePtr, perPagePtr *int) (*ctypes.ResultValidators, error) { - // Update the light client if we're behind and retrieve the light block at the requested height. - l, err := c.updateLightClientIfNeededTo(ctx, *height) + // Update the light client if we're behind and retrieve the light block at the requested height + // or at the latest height if no height is provided. + l, err := c.updateLightClientIfNeededTo(ctx, height) if err != nil { return nil, err } @@ -470,7 +475,7 @@ func (c *Client) Validators(ctx context.Context, height *uint64, pagePtr, perPag v := l.ValidatorSet.Validators[skipCount : skipCount+tmmath.MinInt(perPage, totalCount-skipCount)] return &ctypes.ResultValidators{ - BlockHeight: *height, + BlockHeight: l.Height, Validators: v, Count: len(v), Total: totalCount}, nil @@ -493,8 +498,16 @@ func (c *Client) UnsubscribeAll(ctx context.Context, subscriber string) error { return c.next.UnsubscribeAll(ctx, subscriber) } -func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height uint64) (*types.LightBlock, error) { - l, err := c.lc.VerifyLightBlockAtHeight(ctx, height, time.Now()) +func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height *uint64) (*types.LightBlock, error) { + var ( + l *types.LightBlock + err error + ) + if height == nil { + l, err = c.lc.Update(ctx, time.Now()) + } else { + l, err = c.lc.VerifyLightBlockAtHeight(ctx, *height, time.Now()) + } if err != nil { return nil, fmt.Errorf("failed to update light client to %d: %w", height, err) } diff --git a/light/rpc/mocks/light_client.go b/light/rpc/mocks/light_client.go index 8ec167fd4..fa3807196 100644 --- a/light/rpc/mocks/light_client.go +++ b/light/rpc/mocks/light_client.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.5.1. DO NOT EDIT. +// Code generated by mockery v0.0.0-dev. DO NOT EDIT. package mocks @@ -54,6 +54,29 @@ func (_m *LightClient) TrustedLightBlock(height uint64) (*types.LightBlock, erro return r0, r1 } +// Update provides a mock function with given fields: ctx, now +func (_m *LightClient) Update(ctx context.Context, now time.Time) (*types.LightBlock, error) { + ret := _m.Called(ctx, now) + + var r0 *types.LightBlock + if rf, ok := ret.Get(0).(func(context.Context, time.Time) *types.LightBlock); ok { + r0 = rf(ctx, now) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*types.LightBlock) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(context.Context, time.Time) error); ok { + r1 = rf(ctx, now) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // VerifyLightBlockAtHeight provides a mock function with given fields: ctx, height, now func (_m *LightClient) VerifyLightBlockAtHeight(ctx context.Context, height uint64, now time.Time) (*types.LightBlock, error) { ret := _m.Called(ctx, height, now) diff --git a/light/store/db/db.go b/light/store/db/db.go index 4e622518b..e060be29b 100644 --- a/light/store/db/db.go +++ b/light/store/db/db.go @@ -14,8 +14,8 @@ import ( ) const ( - prefixLightBlock = int64(0x0a) - prefixSize = int64(0x0b) + prefixLightBlock = int64(11) + prefixSize = int64(12) ) type dbs struct { @@ -230,27 +230,11 @@ func (s *dbs) Prune(size uint16) error { } numToPrune := sSize - size - // 2) Iterate over headers and perform a batch operation. - itr, err := s.db.Iterator( - s.lbKey(1), - append(s.lbKey(1<<63-1), byte(0x00)), - ) - if err != nil { - panic(err) - } - defer itr.Close() - b := s.db.NewBatch() defer b.Close() - for itr.Valid() && numToPrune > 0 { - if err = b.Delete(itr.Key()); err != nil { - return err - } - itr.Next() - numToPrune-- - } - if err = itr.Error(); err != nil { + // 2) use an iterator to batch together all the blocks that need to be deleted + if err := s.batchDelete(b, numToPrune); err != nil { return err } @@ -261,12 +245,7 @@ func (s *dbs) Prune(size uint16) error { } // 4) write batch deletion to disk - err = b.WriteSync() - if err != nil { - return err - } - - return nil + return b.WriteSync() } // Size returns the number of header & validator set pairs. @@ -278,6 +257,27 @@ func (s *dbs) Size() uint16 { return s.size } +func (s *dbs) batchDelete(batch dbm.Batch, numToPrune uint16) error { + itr, err := s.db.Iterator( + s.lbKey(1), + append(s.lbKey(1<<63-1), byte(0x00)), + ) + if err != nil { + return err + } + defer itr.Close() + + for itr.Valid() && numToPrune > 0 { + if err = batch.Delete(itr.Key()); err != nil { + return err + } + itr.Next() + numToPrune-- + } + + return itr.Error() +} + func (s *dbs) sizeKey() []byte { key, err := orderedcode.Append(nil, prefixSize) if err != nil { diff --git a/mempool/reactor.go b/mempool/reactor.go index 78ba4d8a0..f67bc4fae 100644 --- a/mempool/reactor.go +++ b/mempool/reactor.go @@ -58,7 +58,7 @@ type Reactor struct { peerMgr PeerManager mempoolCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // peerWG is used to coordinate graceful termination of all peer broadcasting @@ -76,7 +76,7 @@ func NewReactor( peerMgr PeerManager, mempool *CListMempool, mempoolCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *Reactor { r := &Reactor{ @@ -221,13 +221,12 @@ func (r *Reactor) processMempoolCh() { for { select { - case envelope := <-r.mempoolCh.In(): - if err := r.handleMessage(r.mempoolCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID(), "envelope", envelope, "err", err) - r.mempoolCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.mempoolCh.In: + if err := r.handleMessage(r.mempoolCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID, "envelope", envelope, "err", err) + r.mempoolCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -244,7 +243,7 @@ func (r *Reactor) processMempoolCh() { // removed peers, we remove the peer from the mempool peer ID set and signal to // stop the tx broadcasting goroutine. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -264,28 +263,28 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast txs to that peer. - _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - r.ids.ReserveForPeer(peerUpdate.PeerID) + r.ids.ReserveForPeer(peerUpdate.NodeID) // start a broadcast routine ensuring all txs are forwarded to the peer - go r.broadcastTxRoutine(peerUpdate.PeerID, closer) + go r.broadcastTxRoutine(peerUpdate.NodeID, closer) } } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.ids.Reclaim(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.ids.Reclaim(peerUpdate.NodeID) // Check if we've started a tx broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer tx broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } @@ -371,7 +370,7 @@ func (r *Reactor) broadcastTxRoutine(peerID p2p.NodeID, closer *tmsync.Closer) { if _, ok := memTx.senders.Load(peerMempoolID); !ok { // Send the mempool tx to the corresponding peer. Note, the peer may be // behind and thus would not be able to process the mempool tx correctly. - r.mempoolCh.Out() <- p2p.Envelope{ + r.mempoolCh.Out <- p2p.Envelope{ To: peerID, Message: &protomem.Txs{ Txs: [][]byte{memTx.tx}, diff --git a/mempool/reactor_test.go b/mempool/reactor_test.go index d1163bf09..1f1b3be1e 100644 --- a/mempool/reactor_test.go +++ b/mempool/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { mempoolPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, cfg *cfg.MempoolConfig, logger log.Logger, chBuf uint) *reactorTestSuite { @@ -189,7 +189,7 @@ func TestReactorBroadcastTxs(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -295,7 +295,7 @@ func TestReactorNoBroadcastToSender(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } time.Sleep(100 * time.Millisecond) @@ -360,7 +360,7 @@ func TestReactor_MaxTxBytes(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Wait till all secondary suites (reactor) received all mempool txs from the @@ -406,7 +406,7 @@ func TestDontExhaustMaxActiveIDs(t *testing.T) { for i := 0; i < maxActiveIDs+1; i++ { reactor.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: peerID, + NodeID: peerID, } reactor.mempoolOutCh <- p2p.Envelope{ To: peerID, @@ -466,12 +466,12 @@ func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) { // connect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // disconnect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } diff --git a/node/node.go b/node/node.go index 1086d40be..c82877422 100644 --- a/node/node.go +++ b/node/node.go @@ -758,7 +758,7 @@ func NewNode(config *cfg.Config, // TODO: Fetch and provide real options and do proper p2p bootstrapping. // TODO: Use a persistent peer database. - peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) if err != nil { return nil, err } diff --git a/p2p/address.go b/p2p/address.go index 8ff44515a..1e964b449 100644 --- a/p2p/address.go +++ b/p2p/address.go @@ -106,10 +106,9 @@ func ParseNodeAddress(urlString string) (NodeAddress, error) { Protocol: Protocol(strings.ToLower(url.Scheme)), } - // Opaque URLs are expected to contain only a node ID, also used as path. + // Opaque URLs are expected to contain only a node ID. if url.Opaque != "" { address.NodeID = NodeID(url.Opaque) - address.Path = url.Opaque return address, address.Validate() } diff --git a/p2p/address_test.go b/p2p/address_test.go index 83bfecf24..41b89eaee 100644 --- a/p2p/address_test.go +++ b/p2p/address_test.go @@ -158,7 +158,7 @@ func TestParseNodeAddress(t *testing.T) { }, { "memory:" + user, - p2p.NodeAddress{Protocol: "memory", NodeID: id, Path: user}, + p2p.NodeAddress{Protocol: "memory", NodeID: id}, true, }, diff --git a/p2p/channel.go b/p2p/channel.go deleted file mode 100644 index b2bcd156e..000000000 --- a/p2p/channel.go +++ /dev/null @@ -1,130 +0,0 @@ -package p2p - -import ( - "sync" - - "github.com/gogo/protobuf/proto" -) - -// ChannelID is an arbitrary channel ID. -type ChannelID uint16 - -// Envelope specifies the message receiver and sender. -type Envelope struct { - From NodeID // Message sender, or empty for outbound messages. - To NodeID // Message receiver, or empty for inbound messages. - Broadcast bool // Send message to all connected peers, ignoring To. - Message proto.Message // Payload. - - // For internal use in the Router. - channelID ChannelID -} - -// Strip strips internal information from the envelope. Primarily used for -// testing, such that returned envelopes can be compared with literals. -func (e Envelope) Strip() Envelope { - e.channelID = 0 - return e -} - -// Channel is a bidirectional channel for Protobuf message exchange with peers. -// A Channel is safe for concurrent use by multiple goroutines. -type Channel struct { - closeOnce sync.Once - - // id defines the unique channel ID. - id ChannelID - - // messageType specifies the type of messages exchanged via the channel, and - // is used e.g. for automatic unmarshaling. - messageType proto.Message - - // inCh is a channel for receiving inbound messages. Envelope.From is always - // set. - inCh chan Envelope - - // outCh is a channel for sending outbound messages. Envelope.To or Broadcast - // must be set, otherwise the message is discarded. - outCh chan Envelope - - // errCh is a channel for reporting peer errors to the router, typically used - // when peers send an invalid or malignant message. - errCh chan PeerError - - // doneCh is used to signal that a Channel is closed. A Channel is bi-directional - // and should be closed by the reactor, where as the router is responsible - // for explicitly closing the internal In channel. - doneCh chan struct{} -} - -// NewChannel returns a reference to a new p2p Channel. It is the reactor's -// responsibility to close the Channel. After a channel is closed, the router may -// safely and explicitly close the internal In channel. -func NewChannel(id ChannelID, mType proto.Message, in, out chan Envelope, errCh chan PeerError) *Channel { - return &Channel{ - id: id, - messageType: mType, - inCh: in, - outCh: out, - errCh: errCh, - doneCh: make(chan struct{}), - } -} - -// ID returns the Channel's ID. -func (c *Channel) ID() ChannelID { - return c.id -} - -// In returns a read-only inbound go channel. This go channel should be used by -// reactors to consume Envelopes sent from peers. -func (c *Channel) In() <-chan Envelope { - return c.inCh -} - -// Out returns a write-only outbound go channel. This go channel should be used -// by reactors to route Envelopes to other peers. -func (c *Channel) Out() chan<- Envelope { - return c.outCh -} - -// Error returns a write-only outbound go channel designated for peer errors only. -// This go channel should be used by reactors to send peer errors when consuming -// Envelopes sent from other peers. -func (c *Channel) Error() chan<- PeerError { - return c.errCh -} - -// Close closes the outbound channel and marks the Channel as done. Internally, -// the outbound outCh and peer error errCh channels are closed. It is the reactor's -// responsibility to invoke Close. Any send on the Out or Error channel will -// panic after the Channel is closed. -// -// NOTE: After a Channel is closed, the router may safely assume it can no longer -// send on the internal inCh, however it should NEVER explicitly close it as -// that could result in panics by sending on a closed channel. -func (c *Channel) Close() { - c.closeOnce.Do(func() { - close(c.doneCh) - close(c.outCh) - close(c.errCh) - }) -} - -// Done returns the Channel's internal channel that should be used by a router -// to signal when it is safe to send on the internal inCh go channel. -func (c *Channel) Done() <-chan struct{} { - return c.doneCh -} - -// Wrapper is a Protobuf message that can contain a variety of inner messages. -// If a Channel's message type implements Wrapper, the channel will -// automatically (un)wrap passed messages using the container type, such that -// the channel can transparently support multiple message types. -type Wrapper interface { - // Wrap will take a message and wrap it in this one. - Wrap(proto.Message) error - - // Unwrap will unwrap the inner message contained in this message. - Unwrap() (proto.Message, error) -} diff --git a/p2p/mocks/connection.go b/p2p/mocks/connection.go new file mode 100644 index 000000000..07c4540ed --- /dev/null +++ b/p2p/mocks/connection.go @@ -0,0 +1,206 @@ +// Code generated by mockery v2.5.1. DO NOT EDIT. + +package mocks + +import ( + context "context" + + conn "github.com/tendermint/tendermint/p2p/conn" + + crypto "github.com/tendermint/tendermint/crypto" + + mock "github.com/stretchr/testify/mock" + + p2p "github.com/tendermint/tendermint/p2p" +) + +// Connection is an autogenerated mock type for the Connection type +type Connection struct { + mock.Mock +} + +// Close provides a mock function with given fields: +func (_m *Connection) Close() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// FlushClose provides a mock function with given fields: +func (_m *Connection) FlushClose() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Handshake provides a mock function with given fields: _a0, _a1, _a2 +func (_m *Connection) Handshake(_a0 context.Context, _a1 p2p.NodeInfo, _a2 crypto.PrivKey) (p2p.NodeInfo, crypto.PubKey, error) { + ret := _m.Called(_a0, _a1, _a2) + + var r0 p2p.NodeInfo + if rf, ok := ret.Get(0).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) p2p.NodeInfo); ok { + r0 = rf(_a0, _a1, _a2) + } else { + r0 = ret.Get(0).(p2p.NodeInfo) + } + + var r1 crypto.PubKey + if rf, ok := ret.Get(1).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) crypto.PubKey); ok { + r1 = rf(_a0, _a1, _a2) + } else { + if ret.Get(1) != nil { + r1 = ret.Get(1).(crypto.PubKey) + } + } + + var r2 error + if rf, ok := ret.Get(2).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) error); ok { + r2 = rf(_a0, _a1, _a2) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// LocalEndpoint provides a mock function with given fields: +func (_m *Connection) LocalEndpoint() p2p.Endpoint { + ret := _m.Called() + + var r0 p2p.Endpoint + if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.Endpoint) + } + + return r0 +} + +// ReceiveMessage provides a mock function with given fields: +func (_m *Connection) ReceiveMessage() (p2p.ChannelID, []byte, error) { + ret := _m.Called() + + var r0 p2p.ChannelID + if rf, ok := ret.Get(0).(func() p2p.ChannelID); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.ChannelID) + } + + var r1 []byte + if rf, ok := ret.Get(1).(func() []byte); ok { + r1 = rf() + } else { + if ret.Get(1) != nil { + r1 = ret.Get(1).([]byte) + } + } + + var r2 error + if rf, ok := ret.Get(2).(func() error); ok { + r2 = rf() + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// RemoteEndpoint provides a mock function with given fields: +func (_m *Connection) RemoteEndpoint() p2p.Endpoint { + ret := _m.Called() + + var r0 p2p.Endpoint + if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(p2p.Endpoint) + } + + return r0 +} + +// SendMessage provides a mock function with given fields: _a0, _a1 +func (_m *Connection) SendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) { + ret := _m.Called(_a0, _a1) + + var r0 bool + if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok { + r0 = rf(_a0, _a1) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Status provides a mock function with given fields: +func (_m *Connection) Status() conn.ConnectionStatus { + ret := _m.Called() + + var r0 conn.ConnectionStatus + if rf, ok := ret.Get(0).(func() conn.ConnectionStatus); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(conn.ConnectionStatus) + } + + return r0 +} + +// String provides a mock function with given fields: +func (_m *Connection) String() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// TrySendMessage provides a mock function with given fields: _a0, _a1 +func (_m *Connection) TrySendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) { + ret := _m.Called(_a0, _a1) + + var r0 bool + if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok { + r0 = rf(_a0, _a1) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} diff --git a/p2p/mocks/transport.go b/p2p/mocks/transport.go new file mode 100644 index 000000000..980c4a606 --- /dev/null +++ b/p2p/mocks/transport.go @@ -0,0 +1,121 @@ +// Code generated by mockery v2.5.1. DO NOT EDIT. + +package mocks + +import ( + context "context" + + mock "github.com/stretchr/testify/mock" + p2p "github.com/tendermint/tendermint/p2p" +) + +// Transport is an autogenerated mock type for the Transport type +type Transport struct { + mock.Mock +} + +// Accept provides a mock function with given fields: +func (_m *Transport) Accept() (p2p.Connection, error) { + ret := _m.Called() + + var r0 p2p.Connection + if rf, ok := ret.Get(0).(func() p2p.Connection); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(p2p.Connection) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Close provides a mock function with given fields: +func (_m *Transport) Close() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Dial provides a mock function with given fields: _a0, _a1 +func (_m *Transport) Dial(_a0 context.Context, _a1 p2p.Endpoint) (p2p.Connection, error) { + ret := _m.Called(_a0, _a1) + + var r0 p2p.Connection + if rf, ok := ret.Get(0).(func(context.Context, p2p.Endpoint) p2p.Connection); ok { + r0 = rf(_a0, _a1) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(p2p.Connection) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(context.Context, p2p.Endpoint) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Endpoints provides a mock function with given fields: +func (_m *Transport) Endpoints() []p2p.Endpoint { + ret := _m.Called() + + var r0 []p2p.Endpoint + if rf, ok := ret.Get(0).(func() []p2p.Endpoint); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]p2p.Endpoint) + } + } + + return r0 +} + +// Protocols provides a mock function with given fields: +func (_m *Transport) Protocols() []p2p.Protocol { + ret := _m.Called() + + var r0 []p2p.Protocol + if rf, ok := ret.Get(0).(func() []p2p.Protocol); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]p2p.Protocol) + } + } + + return r0 +} + +// String provides a mock function with given fields: +func (_m *Transport) String() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} diff --git a/p2p/p2p_test.go b/p2p/p2p_test.go new file mode 100644 index 000000000..9ab57e614 --- /dev/null +++ b/p2p/p2p_test.go @@ -0,0 +1,34 @@ +package p2p_test + +import ( + "context" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/crypto/ed25519" + "github.com/tendermint/tendermint/p2p" +) + +// Common setup for P2P tests. + +var ( + ctx = context.Background() + chID = p2p.ChannelID(1) + + selfKey crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0xf9, 0x1b, 0x08, 0xaa, 0x38, 0xee, 0x34, 0xdd}) + selfID = p2p.NodeIDFromPubKey(selfKey.PubKey()) + selfInfo = p2p.NodeInfo{ + NodeID: selfID, + ListenAddr: "0.0.0.0:0", + Network: "test", + Moniker: string(selfID), + } + + peerKey crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0x84, 0xd7, 0x01, 0xbf, 0x83, 0x20, 0x1c, 0xfe}) + peerID = p2p.NodeIDFromPubKey(peerKey.PubKey()) + peerInfo = p2p.NodeInfo{ + NodeID: peerID, + ListenAddr: "0.0.0.0:0", + Network: "test", + Moniker: string(peerID), + } +) diff --git a/p2p/p2ptest/network.go b/p2p/p2ptest/network.go new file mode 100644 index 000000000..9ae2def8b --- /dev/null +++ b/p2p/p2ptest/network.go @@ -0,0 +1,240 @@ +package p2ptest + +import ( + "math/rand" + "testing" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/crypto/ed25519" + "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/p2p" +) + +// Network sets up an in-memory network that can be used for high-level P2P +// testing. It creates an arbitrary number of nodes that are connected to each +// other, and can open channels across all nodes with custom reactors. +type Network struct { + Nodes map[p2p.NodeID]*Node + + logger log.Logger + memoryNetwork *p2p.MemoryNetwork +} + +// MakeNetwork creates a test network with the given number of nodes and +// connects them to each other. +func MakeNetwork(t *testing.T, nodes int) *Network { + logger := log.TestingLogger() + network := &Network{ + Nodes: map[p2p.NodeID]*Node{}, + logger: logger, + memoryNetwork: p2p.NewMemoryNetwork(logger), + } + for i := 0; i < nodes; i++ { + node := MakeNode(t, network) + network.Nodes[node.NodeID] = node + } + + // Set up a list of node addresses to dial, and a peer update subscription + // for each node. + dialQueue := []p2p.NodeAddress{} + subs := map[p2p.NodeID]*p2p.PeerUpdates{} + for _, node := range network.Nodes { + dialQueue = append(dialQueue, node.NodeAddress) + subs[node.NodeID] = node.PeerManager.Subscribe() + defer subs[node.NodeID].Close() + } + + // For each node, dial the nodes that it still doesn't have a connection to + // (either inbound or outbound), and wait for both sides to confirm the + // connection via the subscriptions. + for i, sourceAddress := range dialQueue { + sourceNode := network.Nodes[sourceAddress.NodeID] + sourceSub := subs[sourceAddress.NodeID] + for _, targetAddress := range dialQueue[i+1:] { // nodes queries, we should implement this here as well by taking -// a peer ID filtering callback in PeerManagerOptions and configuring it during -// Node setup. -type PeerManager struct { - options PeerManagerOptions - wakeDialCh chan struct{} // wakes up DialNext() on relevant peer changes - wakeEvictCh chan struct{} // wakes up EvictNext() on relevant peer changes - closeCh chan struct{} // signal channel for Close() - closeOnce sync.Once - - mtx sync.Mutex - store *peerStore - dialing map[NodeID]bool // peers being dialed (DialNext -> Dialed/DialFail) - upgrading map[NodeID]NodeID // peers claimed for upgrade (DialNext -> Dialed/DialFail) - connected map[NodeID]bool // connected peers (Dialed/Accepted -> Disconnected) - evict map[NodeID]bool // peers scheduled for eviction (Connected -> EvictNext) - evicting map[NodeID]bool // peers being evicted (EvictNext -> Disconnected) - subscriptions map[*PeerUpdatesCh]*PeerUpdatesCh // keyed by struct identity (address) -} - -// PeerManagerOptions specifies options for a PeerManager. -type PeerManagerOptions struct { - // PersistentPeers are peers that we want to maintain persistent connections - // to. These will be scored higher than other peers, and if - // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if - // necessary to make room for these. - PersistentPeers []NodeID - - // MaxPeers is the maximum number of peers to track information about, i.e. - // store in the peer store. When exceeded, the lowest-scored unconnected peers - // will be deleted. 0 means no limit. - MaxPeers uint16 - - // MaxConnected is the maximum number of connected peers (inbound and - // outbound). 0 means no limit. - MaxConnected uint16 - - // MaxConnectedUpgrade is the maximum number of additional connections to - // use for probing any better-scored peers to upgrade to when all connection - // slots are full. 0 disables peer upgrading. - // - // For example, if we are already connected to MaxConnected peers, but we - // know or learn about better-scored peers (e.g. configured persistent - // peers) that we are not connected too, then we can probe these peers by - // using up to MaxConnectedUpgrade connections, and once connected evict the - // lowest-scored connected peers. This also works for inbound connections, - // i.e. if a higher-scored peer attempts to connect to us, we can accept - // the connection and evict a lower-scored peer. - MaxConnectedUpgrade uint16 - - // MinRetryTime is the minimum time to wait between retries. Retry times - // double for each retry, up to MaxRetryTime. 0 disables retries. - MinRetryTime time.Duration - - // MaxRetryTime is the maximum time to wait between retries. 0 means - // no maximum, in which case the retry time will keep doubling. - MaxRetryTime time.Duration - - // MaxRetryTimePersistent is the maximum time to wait between retries for - // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. - MaxRetryTimePersistent time.Duration - - // RetryTimeJitter is the upper bound of a random interval added to - // retry times, to avoid thundering herds. 0 disables jutter. - RetryTimeJitter time.Duration -} - -// NewPeerManager creates a new peer manager. -func NewPeerManager(peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { - store, err := newPeerStore(peerDB) - if err != nil { - return nil, err - } - peerManager := &PeerManager{ - options: options, - closeCh: make(chan struct{}), - - // We use a buffer of size 1 for these trigger channels, with - // non-blocking sends. This ensures that if e.g. wakeDial() is called - // multiple times before the initial trigger is picked up we only - // process the trigger once. - // - // FIXME: This should maybe be a libs/sync type. - wakeDialCh: make(chan struct{}, 1), - wakeEvictCh: make(chan struct{}, 1), - - store: store, - dialing: map[NodeID]bool{}, - upgrading: map[NodeID]NodeID{}, - connected: map[NodeID]bool{}, - evict: map[NodeID]bool{}, - evicting: map[NodeID]bool{}, - subscriptions: map[*PeerUpdatesCh]*PeerUpdatesCh{}, - } - if err = peerManager.configurePeers(); err != nil { - return nil, err - } - if err = peerManager.prunePeers(); err != nil { - return nil, err - } - return peerManager, nil -} - -// configurePeers configures peers in the peer store with ephemeral runtime -// configuration, e.g. setting peerInfo.Persistent based on -// PeerManagerOptions.PersistentPeers. The caller must hold the mutex lock. -func (m *PeerManager) configurePeers() error { - for _, peerID := range m.options.PersistentPeers { - if peer, ok := m.store.Get(peerID); ok { - peer.Persistent = true - if err := m.store.Set(peer); err != nil { - return err - } - } - } - return nil -} - -// prunePeers removes peers from the peer store if it contains more than -// MaxPeers peers. The lowest-scored non-connected peers are removed. -// The caller must hold the mutex lock. -func (m *PeerManager) prunePeers() error { - if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { - return nil - } - m.mtx.Lock() - defer m.mtx.Unlock() - - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peerID := ranked[i].ID - switch { - case m.store.Size() <= int(m.options.MaxPeers): - break - case m.dialing[peerID]: - case m.connected[peerID]: - case m.evicting[peerID]: - default: - if err := m.store.Delete(peerID); err != nil { - return err - } - } - } - return nil -} - -// Close closes the peer manager, releasing resources allocated with it -// (specifically any running goroutines). -func (m *PeerManager) Close() { - m.closeOnce.Do(func() { - close(m.closeCh) - }) -} - -// Add adds a peer to the manager, given as an address. If the peer already -// exists, the address is added to it. -func (m *PeerManager) Add(address NodeAddress) error { - if err := address.Validate(); err != nil { - return err - } - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(address.NodeID) - if !ok { - peer = m.makePeerInfo(address.NodeID) - } - if _, ok := peer.AddressInfo[address.String()]; !ok { - peer.AddressInfo[address.String()] = &peerAddressInfo{Address: address} - } - if err := m.store.Set(peer); err != nil { - return err - } - if err := m.prunePeers(); err != nil { - return err - } - m.wakeDial() - return nil -} - -// Advertise returns a list of peer addresses to advertise to a peer. -// -// FIXME: This is fairly naïve and only returns the addresses of the -// highest-ranked peers. -func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { - m.mtx.Lock() - defer m.mtx.Unlock() - - addresses := make([]NodeAddress, 0, limit) - for _, peer := range m.store.Ranked() { - if peer.ID == peerID { - continue - } - for _, addressInfo := range peer.AddressInfo { - if len(addresses) >= int(limit) { - return addresses - } - addresses = append(addresses, addressInfo.Address) - } - } - return addresses -} - -// makePeerInfo creates a peerInfo for a new peer. -func (m *PeerManager) makePeerInfo(id NodeID) peerInfo { - isPersistent := false - for _, p := range m.options.PersistentPeers { - if id == p { - isPersistent = true - break - } - } - return peerInfo{ - ID: id, - Persistent: isPersistent, - AddressInfo: map[string]*peerAddressInfo{}, - } -} - -// Subscribe subscribes to peer updates. The caller must consume the peer -// updates in a timely fashion and close the subscription when done, since -// delivery is guaranteed and will block peer connection/disconnection -// otherwise. -func (m *PeerManager) Subscribe() *PeerUpdatesCh { - // FIXME: We may want to use a size 1 buffer here. When the router - // broadcasts a peer update it has to loop over all of the - // subscriptions, and we want to avoid blocking and waiting for a - // context switch before continuing to the next subscription. This also - // prevents tail latencies from compounding across updates. We also want - // to make sure the subscribers are reasonably in sync, so it should be - // kept at 1. However, this should be benchmarked first. - peerUpdates := NewPeerUpdates(make(chan PeerUpdate)) - m.mtx.Lock() - m.subscriptions[peerUpdates] = peerUpdates - m.mtx.Unlock() - - go func() { - <-peerUpdates.Done() - m.mtx.Lock() - delete(m.subscriptions, peerUpdates) - m.mtx.Unlock() - }() - return peerUpdates -} - -// broadcast broadcasts a peer update to all subscriptions. The caller must -// already hold the mutex lock. This means the mutex is held for the duration -// of the broadcast, which we want to make sure all subscriptions receive all -// updates in the same order. -// -// FIXME: Consider using more fine-grained mutexes here, and/or a channel to -// enforce ordering of updates. -func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { - for _, sub := range m.subscriptions { - select { - case sub.updatesCh <- peerUpdate: - case <-sub.doneCh: - } - } -} - -// DialNext finds an appropriate peer address to dial, and marks it as dialing. -// If no peer is found, or all connection slots are full, it blocks until one -// becomes available. The caller must call Dialed() or DialFailed() for the -// returned peer. The context can be used to cancel the call. -func (m *PeerManager) DialNext(ctx context.Context) (NodeID, NodeAddress, error) { - for { - id, address, err := m.TryDialNext() - if err != nil || id != "" { - return id, address, err - } - select { - case <-m.wakeDialCh: - case <-ctx.Done(): - return "", NodeAddress{}, ctx.Err() - } - } -} - -// TryDialNext is equivalent to DialNext(), but immediately returns an empty -// peer ID if no peers or connection slots are available. -func (m *PeerManager) TryDialNext() (NodeID, NodeAddress, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including - // MaxConnectedUpgrade allows us to probe additional peers that have a - // higher score than any other peers, and if successful evict it. - if m.options.MaxConnected > 0 && - len(m.connected)+len(m.dialing) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return "", NodeAddress{}, nil - } - - for _, peer := range m.store.Ranked() { - if m.dialing[peer.ID] || m.connected[peer.ID] { - continue - } - - for _, addressInfo := range peer.AddressInfo { - if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { - continue - } - - // We now have an eligible address to dial. If we're full but have - // upgrade capacity (as checked above), we find a lower-scored peer - // we can replace and mark it as upgrading so noone else claims it. - // - // If we don't find one, there is no point in trying additional - // peers, since they will all have the same or lower score than this - // peer (since they're ordered by score via peerStore.Ranked). - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return "", NodeAddress{}, nil - } - m.upgrading[upgradeFromPeer] = peer.ID - } - - m.dialing[peer.ID] = true - return peer.ID, addressInfo.Address, nil - } - } - return "", NodeAddress{}, nil -} - -// wakeDial is used to notify DialNext about changes that *may* cause new -// peers to become eligible for dialing, such as peer disconnections and -// retry timeouts. -func (m *PeerManager) wakeDial() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each DialNext(). - select { - case m.wakeDialCh <- struct{}{}: - default: - } -} - -// wakeEvict is used to notify EvictNext about changes that *may* cause -// peers to become eligible for eviction, such as peer upgrades. -func (m *PeerManager) wakeEvict() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each EvictNext(). - select { - case m.wakeEvictCh <- struct{}{}: - default: - } -} - -// retryDelay calculates a dial retry delay using exponential backoff, based on -// retry settings in PeerManagerOptions. If MinRetryTime is 0, this returns -// MaxInt64 (i.e. an infinite retry delay, effectively disabling retries). -func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { - if failures == 0 { - return 0 - } - if m.options.MinRetryTime == 0 { - return time.Duration(math.MaxInt64) - } - maxDelay := m.options.MaxRetryTime - if persistent && m.options.MaxRetryTimePersistent > 0 { - maxDelay = m.options.MaxRetryTimePersistent - } - - delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures))) - if maxDelay > 0 && delay > maxDelay { - delay = maxDelay - } - // FIXME: This should use a PeerManager-scoped RNG. - delay += time.Duration(rand.Int63n(int64(m.options.RetryTimeJitter))) // nolint:gosec - return delay -} - -// DialFailed reports a failed dial attempt. This will make the peer available -// for dialing again when appropriate. -// -// FIXME: This should probably delete or mark bad addresses/peers after some time. -func (m *PeerManager) DialFailed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) // Unmark failed upgrade attempt. - } - } - - peer, ok := m.store.Get(peerID) - if !ok { // Peer may have been removed while dialing, ignore. - return nil - } - addressInfo, ok := peer.AddressInfo[address.String()] - if !ok { - return nil // Assume the address has been removed, ignore. - } - addressInfo.LastDialFailure = time.Now().UTC() - addressInfo.DialFailures++ - if err := m.store.Set(peer); err != nil { - return err - } - - // We spawn a goroutine that notifies DialNext() again when the retry - // timeout has elapsed, so that we can consider dialing it again. - go func() { - retryDelay := m.retryDelay(addressInfo.DialFailures, peer.Persistent) - if retryDelay == time.Duration(math.MaxInt64) { - return - } - // Use an explicit timer with deferred cleanup instead of - // time.After(), to avoid leaking goroutines on PeerManager.Close(). - timer := time.NewTimer(retryDelay) - defer timer.Stop() - select { - case <-timer.C: - m.wakeDial() - case <-m.closeCh: - } - }() - - m.wakeDial() - return nil -} - -// Dialed marks a peer as successfully dialed. Any further incoming connections -// will be rejected, and once disconnected the peer may be dialed again. -func (m *PeerManager) Dialed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - - var upgradeFromPeer NodeID - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) - upgradeFromPeer = from - // Don't break, just in case this peer was marked as upgrading for - // multiple lower-scored peers (shouldn't really happen). - } - } - - if m.connected[peerID] { - return fmt.Errorf("peer %v is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - return fmt.Errorf("peer %q was removed while dialing", peerID) - } - now := time.Now().UTC() - peer.LastConnected = now - if addressInfo, ok := peer.AddressInfo[address.String()]; ok { - addressInfo.DialFailures = 0 - addressInfo.LastDialSuccess = now - // If not found, assume address has been removed. - } - if err := m.store.Set(peer); err != nil { - return err - } - - if upgradeFromPeer != "" && m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected) { - // Look for an even lower-scored peer that may have appeared - // since we started the upgrade. - if p, ok := m.store.Get(upgradeFromPeer); ok { - if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { - upgradeFromPeer = u - } - } - m.evict[upgradeFromPeer] = true - } - m.connected[peerID] = true - m.wakeEvict() - - return nil -} - -// Accepted marks an incoming peer connection successfully accepted. If the peer -// is already connected or we don't allow additional connections then this will -// return an error. -// -// If full but MaxConnectedUpgrade is non-zero and the incoming peer is -// better-scored than any existing peers, then we accept it and evict a -// lower-scored peer. -// -// NOTE: We can't take an address here, since e.g. TCP uses a different port -// number for outbound traffic than inbound traffic, so the peer's endpoint -// wouldn't necessarily be an appropriate address to dial. -// -// FIXME: When we accept a connection from a peer, we should register that -// peer's address in the peer store so that we can dial it later. In order to do -// that, we'll need to get the remote address after all, but as noted above that -// can't be the remote endpoint since that will usually have the wrong port -// number. -func (m *PeerManager) Accepted(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - return fmt.Errorf("peer %q is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - - // If all connections slots are full, but we allow upgrades (and we checked - // above that we have upgrade capacity), then we can look for a lower-scored - // peer to replace and if found accept the connection anyway and evict it. - var upgradeFromPeer NodeID - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return fmt.Errorf("already connected to maximum number of peers") - } - } - - peer.LastConnected = time.Now().UTC() - if err := m.store.Set(peer); err != nil { - return err - } - - m.connected[peerID] = true - if upgradeFromPeer != "" { - m.evict[upgradeFromPeer] = true - } - m.wakeEvict() - return nil -} - -// Ready marks a peer as ready, broadcasting status updates to subscribers. The -// peer must already be marked as connected. This is separate from Dialed() and -// Accepted() to allow the router to set up its internal queues before reactors -// start sending messages. -func (m *PeerManager) Ready(peerID NodeID) { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusUp, - }) - } -} - -// Disconnected unmarks a peer as connected, allowing new connections to be -// established. -func (m *PeerManager) Disconnected(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.connected, peerID) - delete(m.upgrading, peerID) - delete(m.evict, peerID) - delete(m.evicting, peerID) - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusDown, - }) - m.wakeDial() - return nil -} - -// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable -// peers are found, the call will block until one becomes available or the -// context is cancelled. -func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { - for { - id, err := m.TryEvictNext() - if err != nil || id != "" { - return id, err - } - select { - case <-m.wakeEvictCh: - case <-ctx.Done(): - return "", ctx.Err() - } - } -} - -// TryEvictNext is equivalent to EvictNext, but immediately returns an empty -// node ID if no evictable peers are found. -func (m *PeerManager) TryEvictNext() (NodeID, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // If any connected peers are explicitly scheduled for eviction, we return a - // random one. - for peerID := range m.evict { - delete(m.evict, peerID) - if m.connected[peerID] && !m.evicting[peerID] { - m.evicting[peerID] = true - return peerID, nil - } - } - - // If we're below capacity, we don't need to evict anything. - if m.options.MaxConnected == 0 || - len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { - return "", nil - } - - // If we're above capacity, just pick the lowest-ranked peer to evict. - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peer := ranked[i] - if m.connected[peer.ID] && !m.evicting[peer.ID] { - m.evicting[peer.ID] = true - return peer.ID, nil - } - } - - return "", nil -} - -// findUpgradeCandidate looks for a lower-scored peer that we could evict -// to make room for the given peer. Returns an empty ID if none is found. -// The caller must hold the mutex lock. -func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - candidate := ranked[i] - switch { - case candidate.Score() >= score: - return "" // no further peers can be scored lower, due to sorting - case !m.connected[candidate.ID]: - case m.evict[candidate.ID]: - case m.evicting[candidate.ID]: - case m.upgrading[candidate.ID] != "": - default: - return candidate.ID - } - } - return "" -} - -// GetHeight returns a peer's height, as reported via SetHeight. If the peer -// or height is unknown, this returns 0. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. -func (m *PeerManager) GetHeight(peerID NodeID) uint64 { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, _ := m.store.Get(peerID) - return peer.Height -} - -// SetHeight stores a peer's height, making it available via GetHeight. If the -// peer is unknown, it is created. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. -func (m *PeerManager) SetHeight(peerID NodeID, height uint64) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - peer.Height = height - return m.store.Set(peer) -} - -// peerStore stores information about peers. It is not thread-safe, assuming -// it is used only by PeerManager which handles concurrency control, allowing -// it to execute multiple operations atomically via its own mutex. -// -// The entire set of peers is kept in memory, for performance. It is loaded -// from disk on initialization, and any changes are written back to disk -// (without fsync, since we can afford to lose recent writes). -type peerStore struct { - db dbm.DB - peers map[NodeID]*peerInfo - ranked []*peerInfo // cache for Ranked(), nil invalidates cache -} - -// newPeerStore creates a new peer store, loading all persisted peers from the -// database into memory. -func newPeerStore(db dbm.DB) (*peerStore, error) { - store := &peerStore{ - db: db, - } - if err := store.loadPeers(); err != nil { - return nil, err - } - return store, nil -} - -// loadPeers loads all peers from the database into memory. -func (s *peerStore) loadPeers() error { - peers := make(map[NodeID]*peerInfo) - - start, end := keyPeerInfoRange() - iter, err := s.db.Iterator(start, end) - if err != nil { - return err - } - defer iter.Close() - for ; iter.Valid(); iter.Next() { - // FIXME: We may want to tolerate failures here, by simply logging - // the errors and ignoring the faulty peer entries. - msg := new(p2pproto.PeerInfo) - if err := proto.Unmarshal(iter.Value(), msg); err != nil { - return fmt.Errorf("invalid peer Protobuf data: %w", err) - } - peer, err := peerInfoFromProto(msg) - if err != nil { - return fmt.Errorf("invalid peer data: %w", err) - } - peers[peer.ID] = peer - } - if iter.Error() != nil { - return iter.Error() - } - s.peers = peers - s.ranked = nil // invalidate cache if populated - return nil -} - -// Get fetches a peer. The boolean indicates whether the peer existed or not. -// The returned peer info is a copy, and can be mutated at will. -func (s *peerStore) Get(id NodeID) (peerInfo, bool) { - peer, ok := s.peers[id] - return peer.Copy(), ok -} - -// Set stores peer data. The input data will be copied, and can safely be reused -// by the caller. -func (s *peerStore) Set(peer peerInfo) error { - if err := peer.Validate(); err != nil { - return err - } - peer = peer.Copy() - - // FIXME: We may want to optimize this by avoiding saving to the database - // if there haven't been any changes to persisted fields. - bz, err := peer.ToProto().Marshal() - if err != nil { - return err - } - if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { - return err - } - - if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { - // If the peer is new, or its score changes, we invalidate the Ranked() cache. - s.peers[peer.ID] = &peer - s.ranked = nil - } else { - // Otherwise, since s.ranked contains pointers to the old data and we - // want those pointers to remain valid with the new data, we have to - // update the existing pointer address. - *current = peer - } - - return nil -} - -// Delete deletes a peer, or does nothing if it does not exist. -func (s *peerStore) Delete(id NodeID) error { - if _, ok := s.peers[id]; !ok { - return nil - } - if err := s.db.Delete(keyPeerInfo(id)); err != nil { - return err - } - delete(s.peers, id) - s.ranked = nil - return nil -} - -// List retrieves all peers in an arbitrary order. The returned data is a copy, -// and can be mutated at will. -func (s *peerStore) List() []peerInfo { - peers := make([]peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - peers = append(peers, peer.Copy()) - } - return peers -} - -// Ranked returns a list of peers ordered by score (better peers first). Peers -// with equal scores are returned in an arbitrary order. The returned list must -// not be mutated or accessed concurrently by the caller, since it returns -// pointers to internal peerStore data for performance. -// -// Ranked is used to determine both which peers to dial, which ones to evict, -// and which ones to delete completely. -// -// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated -// by setting it to nil, but if necessary we should use a better data structure -// for this (e.g. a heap or ordered map). -// -// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). -func (s *peerStore) Ranked() []*peerInfo { - if s.ranked != nil { - return s.ranked - } - s.ranked = make([]*peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - s.ranked = append(s.ranked, peer) - } - sort.Slice(s.ranked, func(i, j int) bool { - // FIXME: If necessary, consider precomputing scores before sorting, - // to reduce the number of Score() calls. - return s.ranked[i].Score() > s.ranked[j].Score() - }) - return s.ranked -} - -// Size returns the number of peers in the peer store. -func (s *peerStore) Size() int { - return len(s.peers) -} - -// peerInfo contains peer information stored in a peerStore. -type peerInfo struct { - ID NodeID - AddressInfo map[string]*peerAddressInfo - LastConnected time.Time - - // These fields are ephemeral, i.e. not persisted to the database. - Persistent bool - Height uint64 -} - -// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, -// erroring if the data is invalid. -func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { - p := &peerInfo{ - ID: NodeID(msg.ID), - AddressInfo: map[string]*peerAddressInfo{}, - } - if msg.LastConnected != nil { - p.LastConnected = *msg.LastConnected - } - for _, addr := range msg.AddressInfo { - addressInfo, err := peerAddressInfoFromProto(addr) - if err != nil { - return nil, err - } - p.AddressInfo[addressInfo.Address.String()] = addressInfo - } - return p, p.Validate() -} - -// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The -// Protobuf type only contains persisted fields, while ephemeral fields are -// discarded. The returned message may contain pointers to original data, since -// it is expected to be serialized immediately. -func (p *peerInfo) ToProto() *p2pproto.PeerInfo { - msg := &p2pproto.PeerInfo{ - ID: string(p.ID), - LastConnected: &p.LastConnected, - } - for _, addressInfo := range p.AddressInfo { - msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) - } - if msg.LastConnected.IsZero() { - msg.LastConnected = nil - } - return msg -} - -// Copy returns a deep copy of the peer info. -func (p *peerInfo) Copy() peerInfo { - if p == nil { - return peerInfo{} - } - c := *p - for i, addressInfo := range c.AddressInfo { - addressInfoCopy := addressInfo.Copy() - c.AddressInfo[i] = &addressInfoCopy - } - return c -} - -// Score calculates a score for the peer. Higher-scored peers will be -// preferred over lower scores. -func (p *peerInfo) Score() PeerScore { - var score PeerScore - if p.Persistent { - score += PeerScorePersistent - } - return score -} - -// Validate validates the peer info. -func (p *peerInfo) Validate() error { - if p.ID == "" { - return errors.New("no peer ID") - } - return nil -} - -// peerAddressInfo contains information and statistics about a peer address. -type peerAddressInfo struct { - Address NodeAddress - LastDialSuccess time.Time - LastDialFailure time.Time - DialFailures uint32 // since last successful dial -} - -// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message -// to a peerAddressInfo. -func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { - address, err := ParseNodeAddress(msg.Address) - if err != nil { - return nil, fmt.Errorf("invalid address %q: %w", address, err) - } - addressInfo := &peerAddressInfo{ - Address: address, - DialFailures: msg.DialFailures, - } - if msg.LastDialSuccess != nil { - addressInfo.LastDialSuccess = *msg.LastDialSuccess - } - if msg.LastDialFailure != nil { - addressInfo.LastDialFailure = *msg.LastDialFailure - } - return addressInfo, addressInfo.Validate() -} - -// ToProto converts the address into to a Protobuf message for serialization. -func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { - msg := &p2pproto.PeerAddressInfo{ - Address: a.Address.String(), - LastDialSuccess: &a.LastDialSuccess, - LastDialFailure: &a.LastDialFailure, - DialFailures: a.DialFailures, - } - if msg.LastDialSuccess.IsZero() { - msg.LastDialSuccess = nil - } - if msg.LastDialFailure.IsZero() { - msg.LastDialFailure = nil - } - return msg -} - -// Copy returns a copy of the address info. -func (a *peerAddressInfo) Copy() peerAddressInfo { - return *a -} - -// Validate validates the address info. -func (a *peerAddressInfo) Validate() error { - return a.Address.Validate() -} - -// These are database key prefixes. -const ( - prefixPeerInfo int64 = 1 -) - -// keyPeerInfo generates a peerInfo database key. -func keyPeerInfo(id NodeID) []byte { - key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) - if err != nil { - panic(err) - } - return key -} - -// keyPeerInfoPrefix generates start/end keys for the entire peerInfo key range. -func keyPeerInfoRange() ([]byte, []byte) { - start, err := orderedcode.Append(nil, prefixPeerInfo, "") - if err != nil { - panic(err) - } - end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) - if err != nil { - panic(err) - } - return start, end -} - -// ============================================================================ -// Types and business logic below may be deprecated. -// -// TODO: Rename once legacy p2p types are removed. -// ref: https://github.com/tendermint/tendermint/issues/5670 -// ============================================================================ - //go:generate mockery --case underscore --name Peer const metricsTickerDuration = 10 * time.Second diff --git a/p2p/peermanager.go b/p2p/peermanager.go new file mode 100644 index 000000000..f330dd236 --- /dev/null +++ b/p2p/peermanager.go @@ -0,0 +1,1294 @@ +package p2p + +import ( + "context" + "errors" + "fmt" + "math" + "math/rand" + "sort" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/google/orderedcode" + dbm "github.com/tendermint/tm-db" + + tmsync "github.com/tendermint/tendermint/libs/sync" + p2pproto "github.com/tendermint/tendermint/proto/tendermint/p2p" +) + +const ( + // retryNever is returned by retryDelay() when retries are disabled. + retryNever time.Duration = math.MaxInt64 +) + +// PeerStatus is a peer status. +// +// The peer manager has many more internal states for a peer (e.g. dialing, +// connected, evicting, and so on), which are tracked separately. PeerStatus is +// for external use outside of the peer manager. +type PeerStatus string + +const ( + PeerStatusUp PeerStatus = "up" // connected and ready + PeerStatusDown PeerStatus = "down" // disconnected +) + +// PeerScore is a numeric score assigned to a peer (higher is better). +type PeerScore uint8 + +const ( + PeerScorePersistent PeerScore = 100 // persistent peers +) + +// PeerUpdate is a peer update event sent via PeerUpdates. +type PeerUpdate struct { + NodeID NodeID + Status PeerStatus +} + +// PeerUpdates is a peer update subscription with notifications about peer +// events (currently just status changes). +type PeerUpdates struct { + updatesCh chan PeerUpdate + closeCh chan struct{} + closeOnce sync.Once +} + +// NewPeerUpdates creates a new PeerUpdates subscription. It is primarily for +// internal use, callers should typically use PeerManager.Subscribe(). The +// subscriber must call Close() when done. +func NewPeerUpdates(updatesCh chan PeerUpdate) *PeerUpdates { + return &PeerUpdates{ + updatesCh: updatesCh, + closeCh: make(chan struct{}), + } +} + +// Updates returns a channel for consuming peer updates. +func (pu *PeerUpdates) Updates() <-chan PeerUpdate { + return pu.updatesCh +} + +// Close closes the peer updates subscription. +func (pu *PeerUpdates) Close() { + pu.closeOnce.Do(func() { + // NOTE: We don't close updatesCh since multiple goroutines may be + // sending on it. The PeerManager senders will select on closeCh as well + // to avoid blocking on a closed subscription. + close(pu.closeCh) + }) +} + +// Done returns a channel that is closed when the subscription is closed. +func (pu *PeerUpdates) Done() <-chan struct{} { + return pu.closeCh +} + +// PeerManagerOptions specifies options for a PeerManager. +type PeerManagerOptions struct { + // PersistentPeers are peers that we want to maintain persistent connections + // to. These will be scored higher than other peers, and if + // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if + // necessary to make room for these. + PersistentPeers []NodeID + + // MaxPeers is the maximum number of peers to track information about, i.e. + // store in the peer store. When exceeded, the lowest-scored unconnected peers + // will be deleted. 0 means no limit. + MaxPeers uint16 + + // MaxConnected is the maximum number of connected peers (inbound and + // outbound). 0 means no limit. + MaxConnected uint16 + + // MaxConnectedUpgrade is the maximum number of additional connections to + // use for probing any better-scored peers to upgrade to when all connection + // slots are full. 0 disables peer upgrading. + // + // For example, if we are already connected to MaxConnected peers, but we + // know or learn about better-scored peers (e.g. configured persistent + // peers) that we are not connected too, then we can probe these peers by + // using up to MaxConnectedUpgrade connections, and once connected evict the + // lowest-scored connected peers. This also works for inbound connections, + // i.e. if a higher-scored peer attempts to connect to us, we can accept + // the connection and evict a lower-scored peer. + MaxConnectedUpgrade uint16 + + // MinRetryTime is the minimum time to wait between retries. Retry times + // double for each retry, up to MaxRetryTime. 0 disables retries. + MinRetryTime time.Duration + + // MaxRetryTime is the maximum time to wait between retries. 0 means + // no maximum, in which case the retry time will keep doubling. + MaxRetryTime time.Duration + + // MaxRetryTimePersistent is the maximum time to wait between retries for + // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. + MaxRetryTimePersistent time.Duration + + // RetryTimeJitter is the upper bound of a random interval added to + // retry times, to avoid thundering herds. 0 disables jitter. + RetryTimeJitter time.Duration + + // PeerScores sets fixed scores for specific peers. It is mainly used + // for testing. A score of 0 is ignored. + PeerScores map[NodeID]PeerScore + + // persistentPeers provides fast PersistentPeers lookups. It is built + // by optimize(). + persistentPeers map[NodeID]bool +} + +// Validate validates the options. +func (o *PeerManagerOptions) Validate() error { + for _, id := range o.PersistentPeers { + if err := id.Validate(); err != nil { + return fmt.Errorf("invalid PersistentPeer ID %q: %w", id, err) + } + } + if o.MaxConnected > 0 && len(o.PersistentPeers) > int(o.MaxConnected) { + return fmt.Errorf("number of persistent peers %v can't exceed MaxConnected %v", + len(o.PersistentPeers), o.MaxConnected) + } + + if o.MaxPeers > 0 { + if o.MaxConnected == 0 || o.MaxConnected+o.MaxConnectedUpgrade > o.MaxPeers { + return fmt.Errorf("MaxConnected %v and MaxConnectedUpgrade %v can't exceed MaxPeers %v", // nolint + o.MaxConnected, o.MaxConnectedUpgrade, o.MaxPeers) + } + } + + if o.MaxRetryTime > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTime without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTime { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTime %v", // nolint + o.MinRetryTime, o.MaxRetryTime) + } + } + if o.MaxRetryTimePersistent > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTimePersistent without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTimePersistent { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTimePersistent %v", // nolint + o.MinRetryTime, o.MaxRetryTimePersistent) + } + } + + return nil +} + +// isPersistentPeer checks if a peer is in PersistentPeers. It will panic +// if called before optimize(). +func (o *PeerManagerOptions) isPersistent(id NodeID) bool { + if o.persistentPeers == nil { + panic("isPersistentPeer() called before optimize()") + } + return o.persistentPeers[id] +} + +// optimize optimizes operations by pregenerating lookup structures. It's a +// separate method instead of memoizing during calls to avoid dealing with +// concurrency and mutex overhead. +func (o *PeerManagerOptions) optimize() { + o.persistentPeers = make(map[NodeID]bool, len(o.PersistentPeers)) + for _, p := range o.PersistentPeers { + o.persistentPeers[p] = true + } +} + +// PeerManager manages peer lifecycle information, using a peerStore for +// underlying storage. Its primary purpose is to determine which peer to connect +// to next (including retry timers), make sure a peer only has a single active +// connection (either inbound or outbound), and evict peers to make room for +// higher-scored peers. It does not manage actual connections (this is handled +// by the Router), only the peer lifecycle state. +// +// For an outbound connection, the flow is as follows: +// - DialNext: return a peer address to dial, mark peer as dialing. +// - DialFailed: report a dial failure, unmark as dialing. +// - Dialed: report a dial success, unmark as dialing and mark as connected +// (errors if already connected, e.g. by Accepted). +// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp. +// - Disconnected: report peer disconnect, unmark as connected and broadcasts +// PeerStatusDown. +// +// For an inbound connection, the flow is as follows: +// - Accepted: report inbound connection success, mark as connected (errors if +// already connected, e.g. by Dialed). +// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp. +// - Disconnected: report peer disconnect, unmark as connected and broadcasts +// PeerStatusDown. +// +// When evicting peers, either because peers are explicitly scheduled for +// eviction or we are connected to too many peers, the flow is as follows: +// - EvictNext: if marked evict and connected, unmark evict and mark evicting. +// If beyond MaxConnected, pick lowest-scored peer and mark evicting. +// - Disconnected: unmark connected, evicting, evict, and broadcast a +// PeerStatusDown peer update. +// +// If all connection slots are full (at MaxConnections), we can use up to +// MaxConnectionsUpgrade additional connections to probe any higher-scored +// unconnected peers, and if we reach them (or they reach us) we allow the +// connection and evict a lower-scored peer. We mark the lower-scored peer as +// upgrading[from]=to to make sure no other higher-scored peers can claim the +// same one for an upgrade. The flow is as follows: +// - Accepted: if upgrade is possible, mark connected and add lower-scored to evict. +// - DialNext: if upgrade is possible, mark upgrading[from]=to and dialing. +// - DialFailed: unmark upgrading[from]=to and dialing. +// - Dialed: unmark upgrading[from]=to and dialing, mark as connected, add +// lower-scored to evict. +// - EvictNext: pick peer from evict, mark as evicting. +// - Disconnected: unmark connected, upgrading[from]=to, evict, evicting. +// +// FIXME: The old stack supports ABCI-based peer ID filtering via +// /p2p/filter/id/ queries, we should implement this here as well by taking +// a peer ID filtering callback in PeerManagerOptions and configuring it during +// Node setup. +type PeerManager struct { + selfID NodeID + options PeerManagerOptions + rand *rand.Rand + dialWaker *tmsync.Waker // wakes up DialNext() on relevant peer changes + evictWaker *tmsync.Waker // wakes up EvictNext() on relevant peer changes + closeCh chan struct{} // signal channel for Close() + closeOnce sync.Once + + mtx sync.Mutex + store *peerStore + subscriptions map[*PeerUpdates]*PeerUpdates // keyed by struct identity (address) + dialing map[NodeID]bool // peers being dialed (DialNext → Dialed/DialFail) + upgrading map[NodeID]NodeID // peers claimed for upgrade (DialNext → Dialed/DialFail) + connected map[NodeID]bool // connected peers (Dialed/Accepted → Disconnected) + ready map[NodeID]bool // ready peers (Ready → Disconnected) + evict map[NodeID]bool // peers scheduled for eviction (Connected → EvictNext) + evicting map[NodeID]bool // peers being evicted (EvictNext → Disconnected) +} + +// NewPeerManager creates a new peer manager. +func NewPeerManager(selfID NodeID, peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { + if selfID == "" { + return nil, errors.New("self ID not given") + } + if err := options.Validate(); err != nil { + return nil, err + } + options.optimize() + + store, err := newPeerStore(peerDB) + if err != nil { + return nil, err + } + + peerManager := &PeerManager{ + selfID: selfID, + options: options, + rand: rand.New(rand.NewSource(time.Now().UnixNano())), // nolint:gosec + dialWaker: tmsync.NewWaker(), + evictWaker: tmsync.NewWaker(), + closeCh: make(chan struct{}), + + store: store, + dialing: map[NodeID]bool{}, + upgrading: map[NodeID]NodeID{}, + connected: map[NodeID]bool{}, + ready: map[NodeID]bool{}, + evict: map[NodeID]bool{}, + evicting: map[NodeID]bool{}, + subscriptions: map[*PeerUpdates]*PeerUpdates{}, + } + if err = peerManager.configurePeers(); err != nil { + return nil, err + } + if err = peerManager.prunePeers(); err != nil { + return nil, err + } + return peerManager, nil +} + +// configurePeers configures peers in the peer store with ephemeral runtime +// configuration, e.g. PersistentPeers. It also removes ourself, if we're in the +// peer store. The caller must hold the mutex lock. +func (m *PeerManager) configurePeers() error { + if err := m.store.Delete(m.selfID); err != nil { + return err + } + + configure := map[NodeID]bool{} + for _, id := range m.options.PersistentPeers { + configure[id] = true + } + for id := range m.options.PeerScores { + configure[id] = true + } + for id := range configure { + if peer, ok := m.store.Get(id); ok { + if err := m.store.Set(m.configurePeer(peer)); err != nil { + return err + } + } + } + return nil +} + +// configurePeer configures a peer with ephemeral runtime configuration. +func (m *PeerManager) configurePeer(peer peerInfo) peerInfo { + peer.Persistent = m.options.isPersistent(peer.ID) + peer.FixedScore = m.options.PeerScores[peer.ID] + return peer +} + +// newPeerInfo creates a peerInfo for a new peer. +func (m *PeerManager) newPeerInfo(id NodeID) peerInfo { + peerInfo := peerInfo{ + ID: id, + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + return m.configurePeer(peerInfo) +} + +// prunePeers removes low-scored peers from the peer store if it contains more +// than MaxPeers peers. The caller must hold the mutex lock. +func (m *PeerManager) prunePeers() error { + if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { + return nil + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peerID := ranked[i].ID + switch { + case m.store.Size() <= int(m.options.MaxPeers): + break + case m.dialing[peerID]: + case m.connected[peerID]: + default: + if err := m.store.Delete(peerID); err != nil { + return err + } + } + } + return nil +} + +// Add adds a peer to the manager, given as an address. If the peer already +// exists, the address is added to it if not already present. +func (m *PeerManager) Add(address NodeAddress) error { + if err := address.Validate(); err != nil { + return err + } + if address.NodeID == m.selfID { + return fmt.Errorf("can't add self (%v) to peer store", m.selfID) + } + + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(address.NodeID) + if !ok { + peer = m.newPeerInfo(address.NodeID) + } + if _, ok := peer.AddressInfo[address]; !ok { + peer.AddressInfo[address] = &peerAddressInfo{Address: address} + } + if err := m.store.Set(peer); err != nil { + return err + } + if err := m.prunePeers(); err != nil { + return err + } + m.dialWaker.Wake() + return nil +} + +// DialNext finds an appropriate peer address to dial, and marks it as dialing. +// If no peer is found, or all connection slots are full, it blocks until one +// becomes available. The caller must call Dialed() or DialFailed() for the +// returned peer. +func (m *PeerManager) DialNext(ctx context.Context) (NodeAddress, error) { + for { + address, err := m.TryDialNext() + if err != nil || (address != NodeAddress{}) { + return address, err + } + select { + case <-m.dialWaker.Sleep(): + case <-ctx.Done(): + return NodeAddress{}, ctx.Err() + } + } +} + +// TryDialNext is equivalent to DialNext(), but immediately returns an empty +// address if no peers or connection slots are available. +func (m *PeerManager) TryDialNext() (NodeAddress, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including + // MaxConnectedUpgrade allows us to probe additional peers that have a + // higher score than any other peers, and if successful evict it. + if m.options.MaxConnected > 0 && len(m.connected)+len(m.dialing) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return NodeAddress{}, nil + } + + for _, peer := range m.store.Ranked() { + if m.dialing[peer.ID] || m.connected[peer.ID] { + continue + } + + for _, addressInfo := range peer.AddressInfo { + if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { + continue + } + + // We now have an eligible address to dial. If we're full but have + // upgrade capacity (as checked above), we find a lower-scored peer + // we can replace and mark it as upgrading so noone else claims it. + // + // If we don't find one, there is no point in trying additional + // peers, since they will all have the same or lower score than this + // peer (since they're ordered by score via peerStore.Ranked). + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return NodeAddress{}, nil + } + m.upgrading[upgradeFromPeer] = peer.ID + } + + m.dialing[peer.ID] = true + return addressInfo.Address, nil + } + } + return NodeAddress{}, nil +} + +// DialFailed reports a failed dial attempt. This will make the peer available +// for dialing again when appropriate (possibly after a retry timeout). +// +// FIXME: This should probably delete or mark bad addresses/peers after some time. +func (m *PeerManager) DialFailed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) // Unmark failed upgrade attempt. + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { // Peer may have been removed while dialing, ignore. + return nil + } + addressInfo, ok := peer.AddressInfo[address] + if !ok { + return nil // Assume the address has been removed, ignore. + } + addressInfo.LastDialFailure = time.Now().UTC() + addressInfo.DialFailures++ + if err := m.store.Set(peer); err != nil { + return err + } + + // We spawn a goroutine that notifies DialNext() again when the retry + // timeout has elapsed, so that we can consider dialing it again. We + // calculate the retry delay outside the goroutine, since it must hold + // the mutex lock. + if d := m.retryDelay(addressInfo.DialFailures, peer.Persistent); d != retryNever { + go func() { + // Use an explicit timer with deferred cleanup instead of + // time.After(), to avoid leaking goroutines on PeerManager.Close(). + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-timer.C: + m.dialWaker.Wake() + case <-m.closeCh: + } + }() + } + + m.dialWaker.Wake() + return nil +} + +// Dialed marks a peer as successfully dialed. Any further connections will be +// rejected, and once disconnected the peer may be dialed again. +func (m *PeerManager) Dialed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + + var upgradeFromPeer NodeID + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) + upgradeFromPeer = from + // Don't break, just in case this peer was marked as upgrading for + // multiple lower-scored peers (shouldn't really happen). + } + } + if address.NodeID == m.selfID { + return fmt.Errorf("rejecting connection to self (%v)", address.NodeID) + } + if m.connected[address.NodeID] { + return fmt.Errorf("peer %v is already connected", address.NodeID) + } + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + if upgradeFromPeer == "" || len(m.connected) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { + return fmt.Errorf("peer %q was removed while dialing", address.NodeID) + } + now := time.Now().UTC() + peer.LastConnected = now + if addressInfo, ok := peer.AddressInfo[address]; ok { + addressInfo.DialFailures = 0 + addressInfo.LastDialSuccess = now + // If not found, assume address has been removed. + } + if err := m.store.Set(peer); err != nil { + return err + } + + if upgradeFromPeer != "" && m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected) { + // Look for an even lower-scored peer that may have appeared since we + // started the upgrade. + if p, ok := m.store.Get(upgradeFromPeer); ok { + if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { + upgradeFromPeer = u + } + } + m.evict[upgradeFromPeer] = true + } + m.connected[peer.ID] = true + m.evictWaker.Wake() + + return nil +} + +// Accepted marks an incoming peer connection successfully accepted. If the peer +// is already connected or we don't allow additional connections then this will +// return an error. +// +// If full but MaxConnectedUpgrade is non-zero and the incoming peer is +// better-scored than any existing peers, then we accept it and evict a +// lower-scored peer. +// +// NOTE: We can't take an address here, since e.g. TCP uses a different port +// number for outbound traffic than inbound traffic, so the peer's endpoint +// wouldn't necessarily be an appropriate address to dial. +// +// FIXME: When we accept a connection from a peer, we should register that +// peer's address in the peer store so that we can dial it later. In order to do +// that, we'll need to get the remote address after all, but as noted above that +// can't be the remote endpoint since that will usually have the wrong port +// number. +func (m *PeerManager) Accepted(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if peerID == m.selfID { + return fmt.Errorf("rejecting connection from self (%v)", peerID) + } + if m.connected[peerID] { + return fmt.Errorf("peer %q is already connected", peerID) + } + if m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + + // If all connections slots are full, but we allow upgrades (and we checked + // above that we have upgrade capacity), then we can look for a lower-scored + // peer to replace and if found accept the connection anyway and evict it. + var upgradeFromPeer NodeID + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer.LastConnected = time.Now().UTC() + if err := m.store.Set(peer); err != nil { + return err + } + + m.connected[peerID] = true + if upgradeFromPeer != "" { + m.evict[upgradeFromPeer] = true + } + m.evictWaker.Wake() + return nil +} + +// Ready marks a peer as ready, broadcasting status updates to subscribers. The +// peer must already be marked as connected. This is separate from Dialed() and +// Accepted() to allow the router to set up its internal queues before reactors +// start sending messages. +func (m *PeerManager) Ready(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.ready[peerID] = true + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusUp, + }) + } + return nil +} + +// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable +// peers are found, the call will block until one becomes available. +func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { + for { + id, err := m.TryEvictNext() + if err != nil || id != "" { + return id, err + } + select { + case <-m.evictWaker.Sleep(): + case <-ctx.Done(): + return "", ctx.Err() + } + } +} + +// TryEvictNext is equivalent to EvictNext, but immediately returns an empty +// node ID if no evictable peers are found. +func (m *PeerManager) TryEvictNext() (NodeID, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // If any connected peers are explicitly scheduled for eviction, we return a + // random one. + for peerID := range m.evict { + delete(m.evict, peerID) + if m.connected[peerID] && !m.evicting[peerID] { + m.evicting[peerID] = true + return peerID, nil + } + } + + // If we're below capacity, we don't need to evict anything. + if m.options.MaxConnected == 0 || + len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { + return "", nil + } + + // If we're above capacity (shouldn't really happen), just pick the + // lowest-ranked peer to evict. + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peer := ranked[i] + if m.connected[peer.ID] && !m.evicting[peer.ID] { + m.evicting[peer.ID] = true + return peer.ID, nil + } + } + + return "", nil +} + +// Disconnected unmarks a peer as connected, allowing it to be dialed or +// accepted again as appropriate. +func (m *PeerManager) Disconnected(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + ready := m.ready[peerID] + + delete(m.connected, peerID) + delete(m.upgrading, peerID) + delete(m.evict, peerID) + delete(m.evicting, peerID) + delete(m.ready, peerID) + + if ready { + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusDown, + }) + } + + m.dialWaker.Wake() + return nil +} + +// Errored reports a peer error, causing the peer to be evicted if it's +// currently connected. +// +// FIXME: This should probably be replaced with a peer behavior API, see +// PeerError comments for more details. +// +// FIXME: This will cause the peer manager to immediately try to reconnect to +// the peer, which is probably not always what we want. +func (m *PeerManager) Errored(peerID NodeID, err error) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.evict[peerID] = true + } + + m.evictWaker.Wake() + return nil +} + +// Advertise returns a list of peer addresses to advertise to a peer. +// +// FIXME: This is fairly naïve and only returns the addresses of the +// highest-ranked peers. +func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := make([]NodeAddress, 0, limit) + for _, peer := range m.store.Ranked() { + if peer.ID == peerID { + continue + } + for _, addressInfo := range peer.AddressInfo { + if len(addresses) >= int(limit) { + return addresses + } + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Subscribe subscribes to peer updates. The caller must consume the peer +// updates in a timely fashion and close the subscription when done, otherwise +// the PeerManager will halt. +func (m *PeerManager) Subscribe() *PeerUpdates { + // FIXME: We use a size 1 buffer here. When we broadcast a peer update + // we have to loop over all of the subscriptions, and we want to avoid + // having to block and wait for a context switch before continuing on + // to the next subscriptions. This also prevents tail latencies from + // compounding. Limiting it to 1 means that the subscribers are still + // reasonably in sync. However, this should probably be benchmarked. + peerUpdates := NewPeerUpdates(make(chan PeerUpdate, 1)) + m.mtx.Lock() + m.subscriptions[peerUpdates] = peerUpdates + m.mtx.Unlock() + + go func() { + select { + case <-peerUpdates.Done(): + m.mtx.Lock() + delete(m.subscriptions, peerUpdates) + m.mtx.Unlock() + case <-m.closeCh: + } + }() + return peerUpdates +} + +// broadcast broadcasts a peer update to all subscriptions. The caller must +// already hold the mutex lock, to make sure updates are sent in the same order +// as the PeerManager processes them, but this means subscribers must be +// responsive at all times or the entire PeerManager will halt. +// +// FIXME: Consider using an internal channel to buffer updates while also +// maintaining order if this is a problem. +func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { + for _, sub := range m.subscriptions { + // We have to check closeCh separately first, otherwise there's a 50% + // chance the second select will send on a closed subscription. + select { + case <-sub.closeCh: + continue + default: + } + select { + case sub.updatesCh <- peerUpdate: + case <-sub.closeCh: + } + } +} + +// Close closes the peer manager, releasing resources (i.e. goroutines). +func (m *PeerManager) Close() { + m.closeOnce.Do(func() { + close(m.closeCh) + }) +} + +// Addresses returns all known addresses for a peer, primarily for testing. +// The order is arbitrary. +func (m *PeerManager) Addresses(peerID NodeID) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := []NodeAddress{} + if peer, ok := m.store.Get(peerID); ok { + for _, addressInfo := range peer.AddressInfo { + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Peers returns all known peers, primarily for testing. The order is arbitrary. +func (m *PeerManager) Peers() []NodeID { + m.mtx.Lock() + defer m.mtx.Unlock() + + peers := []NodeID{} + for _, peer := range m.store.Ranked() { + peers = append(peers, peer.ID) + } + return peers +} + +// Scores returns the peer scores for all known peers, primarily for testing. +func (m *PeerManager) Scores() map[NodeID]PeerScore { + m.mtx.Lock() + defer m.mtx.Unlock() + + scores := map[NodeID]PeerScore{} + for _, peer := range m.store.Ranked() { + scores[peer.ID] = peer.Score() + } + return scores +} + +// Status returns the status for a peer, primarily for testing. +func (m *PeerManager) Status(id NodeID) PeerStatus { + m.mtx.Lock() + defer m.mtx.Unlock() + switch { + case m.ready[id]: + return PeerStatusUp + default: + return PeerStatusDown + } +} + +// findUpgradeCandidate looks for a lower-scored peer that we could evict +// to make room for the given peer. Returns an empty ID if none is found. +// If the peer is already being upgraded to, we return that same upgrade. +// The caller must hold the mutex lock. +func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { + for from, to := range m.upgrading { + if to == id { + return from + } + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + candidate := ranked[i] + switch { + case candidate.Score() >= score: + return "" // no further peers can be scored lower, due to sorting + case !m.connected[candidate.ID]: + case m.evict[candidate.ID]: + case m.evicting[candidate.ID]: + case m.upgrading[candidate.ID] != "": + default: + return candidate.ID + } + } + return "" +} + +// retryDelay calculates a dial retry delay using exponential backoff, based on +// retry settings in PeerManagerOptions. If retries are disabled (i.e. +// MinRetryTime is 0), this returns retryNever (i.e. an infinite retry delay). +// The caller must hold the mutex lock (for m.rand which is not thread-safe). +func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { + if failures == 0 { + return 0 + } + if m.options.MinRetryTime == 0 { + return retryNever + } + maxDelay := m.options.MaxRetryTime + if persistent && m.options.MaxRetryTimePersistent > 0 { + maxDelay = m.options.MaxRetryTimePersistent + } + + delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures-1))) + if maxDelay > 0 && delay > maxDelay { + delay = maxDelay + } + if m.options.RetryTimeJitter > 0 { + delay += time.Duration(m.rand.Int63n(int64(m.options.RetryTimeJitter))) + } + return delay +} + +// GetHeight returns a peer's height, as reported via SetHeight, or 0 if the +// peer or height is unknown. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. +func (m *PeerManager) GetHeight(peerID NodeID) uint64 { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, _ := m.store.Get(peerID) + return peer.Height +} + +// SetHeight stores a peer's height, making it available via GetHeight. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. +func (m *PeerManager) SetHeight(peerID NodeID, height uint64) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + peer.Height = height + return m.store.Set(peer) +} + +// peerStore stores information about peers. It is not thread-safe, assuming it +// is only used by PeerManager which handles concurrency control. This allows +// the manager to execute multiple operations atomically via its own mutex. +// +// The entire set of peers is kept in memory, for performance. It is loaded +// from disk on initialization, and any changes are written back to disk +// (without fsync, since we can afford to lose recent writes). +type peerStore struct { + db dbm.DB + peers map[NodeID]*peerInfo + ranked []*peerInfo // cache for Ranked(), nil invalidates cache +} + +// newPeerStore creates a new peer store, loading all persisted peers from the +// database into memory. +func newPeerStore(db dbm.DB) (*peerStore, error) { + if db == nil { + return nil, errors.New("no database provided") + } + store := &peerStore{db: db} + if err := store.loadPeers(); err != nil { + return nil, err + } + return store, nil +} + +// loadPeers loads all peers from the database into memory. +func (s *peerStore) loadPeers() error { + peers := map[NodeID]*peerInfo{} + + start, end := keyPeerInfoRange() + iter, err := s.db.Iterator(start, end) + if err != nil { + return err + } + defer iter.Close() + for ; iter.Valid(); iter.Next() { + // FIXME: We may want to tolerate failures here, by simply logging + // the errors and ignoring the faulty peer entries. + msg := new(p2pproto.PeerInfo) + if err := proto.Unmarshal(iter.Value(), msg); err != nil { + return fmt.Errorf("invalid peer Protobuf data: %w", err) + } + peer, err := peerInfoFromProto(msg) + if err != nil { + return fmt.Errorf("invalid peer data: %w", err) + } + peers[peer.ID] = peer + } + if iter.Error() != nil { + return iter.Error() + } + s.peers = peers + s.ranked = nil // invalidate cache if populated + return nil +} + +// Get fetches a peer. The boolean indicates whether the peer existed or not. +// The returned peer info is a copy, and can be mutated at will. +func (s *peerStore) Get(id NodeID) (peerInfo, bool) { + peer, ok := s.peers[id] + return peer.Copy(), ok +} + +// Set stores peer data. The input data will be copied, and can safely be reused +// by the caller. +func (s *peerStore) Set(peer peerInfo) error { + if err := peer.Validate(); err != nil { + return err + } + peer = peer.Copy() + + // FIXME: We may want to optimize this by avoiding saving to the database + // if there haven't been any changes to persisted fields. + bz, err := peer.ToProto().Marshal() + if err != nil { + return err + } + if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { + return err + } + + if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { + // If the peer is new, or its score changes, we invalidate the Ranked() cache. + s.peers[peer.ID] = &peer + s.ranked = nil + } else { + // Otherwise, since s.ranked contains pointers to the old data and we + // want those pointers to remain valid with the new data, we have to + // update the existing pointer address. + *current = peer + } + + return nil +} + +// Delete deletes a peer, or does nothing if it does not exist. +func (s *peerStore) Delete(id NodeID) error { + if _, ok := s.peers[id]; !ok { + return nil + } + if err := s.db.Delete(keyPeerInfo(id)); err != nil { + return err + } + delete(s.peers, id) + s.ranked = nil + return nil +} + +// List retrieves all peers in an arbitrary order. The returned data is a copy, +// and can be mutated at will. +func (s *peerStore) List() []peerInfo { + peers := make([]peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + peers = append(peers, peer.Copy()) + } + return peers +} + +// Ranked returns a list of peers ordered by score (better peers first). Peers +// with equal scores are returned in an arbitrary order. The returned list must +// not be mutated or accessed concurrently by the caller, since it returns +// pointers to internal peerStore data for performance. +// +// Ranked is used to determine both which peers to dial, which ones to evict, +// and which ones to delete completely. +// +// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated +// by setting it to nil, but if necessary we should use a better data structure +// for this (e.g. a heap or ordered map). +// +// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). +func (s *peerStore) Ranked() []*peerInfo { + if s.ranked != nil { + return s.ranked + } + s.ranked = make([]*peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + s.ranked = append(s.ranked, peer) + } + sort.Slice(s.ranked, func(i, j int) bool { + // FIXME: If necessary, consider precomputing scores before sorting, + // to reduce the number of Score() calls. + return s.ranked[i].Score() > s.ranked[j].Score() + }) + return s.ranked +} + +// Size returns the number of peers in the peer store. +func (s *peerStore) Size() int { + return len(s.peers) +} + +// peerInfo contains peer information stored in a peerStore. +type peerInfo struct { + ID NodeID + AddressInfo map[NodeAddress]*peerAddressInfo + LastConnected time.Time + + // These fields are ephemeral, i.e. not persisted to the database. + Persistent bool + Height uint64 + FixedScore PeerScore // mainly for tests +} + +// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, +// erroring if the data is invalid. +func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { + p := &peerInfo{ + ID: NodeID(msg.ID), + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + if msg.LastConnected != nil { + p.LastConnected = *msg.LastConnected + } + for _, a := range msg.AddressInfo { + addressInfo, err := peerAddressInfoFromProto(a) + if err != nil { + return nil, err + } + p.AddressInfo[addressInfo.Address] = addressInfo + } + return p, p.Validate() +} + +// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The +// Protobuf type only contains persisted fields, while ephemeral fields are +// discarded. The returned message may contain pointers to original data, since +// it is expected to be serialized immediately. +func (p *peerInfo) ToProto() *p2pproto.PeerInfo { + msg := &p2pproto.PeerInfo{ + ID: string(p.ID), + LastConnected: &p.LastConnected, + } + for _, addressInfo := range p.AddressInfo { + msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) + } + if msg.LastConnected.IsZero() { + msg.LastConnected = nil + } + return msg +} + +// Copy returns a deep copy of the peer info. +func (p *peerInfo) Copy() peerInfo { + if p == nil { + return peerInfo{} + } + c := *p + for i, addressInfo := range c.AddressInfo { + addressInfoCopy := addressInfo.Copy() + c.AddressInfo[i] = &addressInfoCopy + } + return c +} + +// Score calculates a score for the peer. Higher-scored peers will be +// preferred over lower scores. +func (p *peerInfo) Score() PeerScore { + var score PeerScore + if p.FixedScore > 0 { + return p.FixedScore + } + if p.Persistent { + score += PeerScorePersistent + } + return score +} + +// Validate validates the peer info. +func (p *peerInfo) Validate() error { + if p.ID == "" { + return errors.New("no peer ID") + } + return nil +} + +// peerAddressInfo contains information and statistics about a peer address. +type peerAddressInfo struct { + Address NodeAddress + LastDialSuccess time.Time + LastDialFailure time.Time + DialFailures uint32 // since last successful dial +} + +// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message +// to a peerAddressInfo. +func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { + address, err := ParseNodeAddress(msg.Address) + if err != nil { + return nil, fmt.Errorf("invalid address %q: %w", address, err) + } + addressInfo := &peerAddressInfo{ + Address: address, + DialFailures: msg.DialFailures, + } + if msg.LastDialSuccess != nil { + addressInfo.LastDialSuccess = *msg.LastDialSuccess + } + if msg.LastDialFailure != nil { + addressInfo.LastDialFailure = *msg.LastDialFailure + } + return addressInfo, addressInfo.Validate() +} + +// ToProto converts the address into to a Protobuf message for serialization. +func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { + msg := &p2pproto.PeerAddressInfo{ + Address: a.Address.String(), + LastDialSuccess: &a.LastDialSuccess, + LastDialFailure: &a.LastDialFailure, + DialFailures: a.DialFailures, + } + if msg.LastDialSuccess.IsZero() { + msg.LastDialSuccess = nil + } + if msg.LastDialFailure.IsZero() { + msg.LastDialFailure = nil + } + return msg +} + +// Copy returns a copy of the address info. +func (a *peerAddressInfo) Copy() peerAddressInfo { + return *a +} + +// Validate validates the address info. +func (a *peerAddressInfo) Validate() error { + return a.Address.Validate() +} + +// Database key prefixes. +const ( + prefixPeerInfo int64 = 1 +) + +// keyPeerInfo generates a peerInfo database key. +func keyPeerInfo(id NodeID) []byte { + key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) + if err != nil { + panic(err) + } + return key +} + +// keyPeerInfoRange generates start/end keys for the entire peerInfo key range. +func keyPeerInfoRange() ([]byte, []byte) { + start, err := orderedcode.Append(nil, prefixPeerInfo, "") + if err != nil { + panic(err) + } + end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) + if err != nil { + panic(err) + } + return start, end +} diff --git a/p2p/peermanager_test.go b/p2p/peermanager_test.go new file mode 100644 index 000000000..d15d9f6d6 --- /dev/null +++ b/p2p/peermanager_test.go @@ -0,0 +1,1617 @@ +package p2p_test + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/fortytw2/leaktest" + "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/p2p" +) + +// FIXME: We should probably have some randomized property-based tests for the +// PeerManager too, which runs a bunch of random operations with random peers +// and ensures certain invariants always hold. The logic can be complex, with +// many interactions, and it's hard to cover all scenarios with handwritten +// tests. + +func TestPeerManagerOptions_Validate(t *testing.T) { + nodeID := p2p.NodeID("00112233445566778899aabbccddeeff00112233") + + testcases := map[string]struct { + options p2p.PeerManagerOptions + ok bool + }{ + "zero options is valid": {p2p.PeerManagerOptions{}, true}, + + // PersistentPeers + "valid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899aabbccddeeff00112233"}, + }, true}, + "invalid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }, false}, + "uppercase PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899AABBCCDDEEFF00112233"}, + }, false}, + "PersistentPeers at MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 3, + }, true}, + "PersistentPeers above MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + }, false}, + "PersistentPeers above MaxConnected below MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 2, + }, false}, + + // MaxPeers + "MaxPeers without MaxConnected": {p2p.PeerManagerOptions{ + MaxPeers: 3, + }, false}, + "MaxPeers below MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 2, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, false}, + "MaxPeers at MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 3, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, true}, + + // MaxRetryTime + "MaxRetryTime below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTime: 5 * time.Second, + }, false}, + "MaxRetryTime at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTime: 5 * time.Second, + }, true}, + "MaxRetryTime without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTime: 5 * time.Second, + }, false}, + + // MaxRetryTimePersistent + "MaxRetryTimePersistent below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + "MaxRetryTimePersistent at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, true}, + "MaxRetryTimePersistent without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + err := tc.options.Validate() + if tc.ok { + require.NoError(t, err) + } else { + require.Error(t, err) + } + }) + } +} + +func TestNewPeerManager(t *testing.T) { + // Zero options should be valid. + _, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Invalid options should error. + _, err = p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }) + require.Error(t, err) + + // Invalid database should error. + _, err = p2p.NewPeerManager(selfID, nil, p2p.PeerManagerOptions{}) + require.Error(t, err) + + // Empty self ID should error. + _, err = p2p.NewPeerManager("", nil, p2p.PeerManagerOptions{}) + require.Error(t, err) +} + +func TestNewPeerManager_Persistence(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: aID}, + } + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: bID}, + } + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80}, + {Protocol: "memory", NodeID: cID}, + } + + // Create an initial peer manager and add the peers. + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(selfID, db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{bID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + for _, addr := range append(append(aAddresses, bAddresses...), cAddresses...) { + require.NoError(t, peerManager.Add(addr)) + } + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: p2p.PeerScorePersistent, + bID: 1, + cID: 0, + }, peerManager.Scores()) + + peerManager.Close() + + // Creating a new peer manager with the same database should retain the + // peers, but they should have updated scores from the new PersistentPeers + // configuration. + peerManager, err = p2p.NewPeerManager(selfID, db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{bID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: 0, + bID: p2p.PeerScorePersistent, + cID: 1, + }, peerManager.Scores()) +} + +func TestNewPeerManager_SelfIDChange(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(selfID, db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Add(b)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + peerManager.Close() + + // If we change our selfID to one of the peers in the peer store, it + // should be removed from the store. + peerManager, err = p2p.NewPeerManager(a.NodeID, db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + require.Equal(t, []p2p.NodeID{b.NodeID}, peerManager.Peers()) +} + +func TestPeerManager_Add(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + bID := p2p.NodeID(strings.Repeat("b", 40)) + cID := p2p.NodeID(strings.Repeat("c", 40)) + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID, cID}, + MaxPeers: 2, + MaxConnected: 2, + }) + require.NoError(t, err) + + // Adding a couple of addresses should work. + aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "localhost"}, + {Protocol: "memory", NodeID: aID}, + } + for _, addr := range aAddresses { + err = peerManager.Add(addr) + require.NoError(t, err) + } + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a different peer should be fine. + bAddress := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "localhost"} + require.NoError(t, peerManager.Add(bAddress)) + require.Equal(t, []p2p.NodeAddress{bAddress}, peerManager.Addresses(bID)) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding an existing address again should be a noop. + require.NoError(t, peerManager.Add(aAddresses[0])) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a third peer with MaxPeers=2 should cause bID, which is + // the lowest-scored peer (not in PersistentPeers), to be removed. + require.NoError(t, peerManager.Add(p2p.NodeAddress{ + Protocol: "tcp", NodeID: cID, Hostname: "localhost"})) + require.ElementsMatch(t, []p2p.NodeID{aID, cID}, peerManager.Peers()) + + // Adding an invalid address should error. + require.Error(t, peerManager.Add(p2p.NodeAddress{Path: "foo"})) + + // Adding self should error + require.Error(t, peerManager.Add(p2p.NodeAddress{Protocol: "memory", NodeID: selfID})) +} + +func TestPeerManager_DialNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Add an address. DialNext should return it. + require.NoError(t, peerManager.Add(a)) + address, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, address) + + // Since there are no more undialed peers, the next call should block + // until it times out. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.DialNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_DialNext_Retry(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + options := p2p.PeerManagerOptions{ + MinRetryTime: 100 * time.Millisecond, + MaxRetryTime: 500 * time.Millisecond, + } + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), options) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Do five dial retries (six dials total). The retry time should double for + // each failure. At the forth retry, MaxRetryTime should kick in. + ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + for i := 0; i <= 5; i++ { + start := time.Now() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + elapsed := time.Since(start).Round(time.Millisecond) + + switch i { + case 0: + require.LessOrEqual(t, elapsed, options.MinRetryTime) + case 1: + require.GreaterOrEqual(t, elapsed, options.MinRetryTime) + case 2: + require.GreaterOrEqual(t, elapsed, 2*options.MinRetryTime) + case 3: + require.GreaterOrEqual(t, elapsed, 4*options.MinRetryTime) + case 4, 5: + require.GreaterOrEqual(t, elapsed, options.MaxRetryTime) + require.LessOrEqual(t, elapsed, 8*options.MinRetryTime) + default: + require.Fail(t, "unexpected retry") + } + + require.NoError(t, peerManager.DialFailed(a)) + } +} + +func TestPeerManager_DialNext_WakeOnAdd(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Spawn a goroutine to add a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(a)) + }() + + // This will block until peer is added above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailed(t *testing.T) { + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + // Add and dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Add b. We shouldn't be able to dial it, due to MaxConnected. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Spawn a goroutine to fail a's dial attempt. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.DialFailed(a)) + }() + + // This should make b available for dialing (not a, retries are disabled). + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, b, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailedRetry(t *testing.T) { + options := p2p.PeerManagerOptions{MinRetryTime: 200 * time.Millisecond} + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), options) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + // Add a, dial it, and mark it a failure. This will start a retry timer. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(dial)) + failed := time.Now() + + // The retry timer should unblock DialNext and make a available again after + // the retry time passes. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + require.GreaterOrEqual(t, time.Since(failed), options.MinRetryTime) +} + +func TestPeerManager_DialNext_WakeOnDisconnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + err = peerManager.Add(a) + require.NoError(t, err) + err = peerManager.Accepted(a.NodeID) + require.NoError(t, err) + + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + }() + + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_TryDialNext_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // At this point, adding c will not allow dialing it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + e := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("e", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 0, + b.NodeID: 1, + c.NodeID: 2, + d.NodeID: 3, + e.NodeID: 0, + }, + PersistentPeers: []p2p.NodeID{c.NodeID, d.NodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Even though we are at capacity, we should be allowed to dial c for an + // upgrade of a, since it's higher-scored. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // However, since we're using all upgrade slots now, we can't add and dial + // d, even though it's also higher-scored. + require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // We go through with c's upgrade. + require.NoError(t, peerManager.Dialed(c)) + + // Still can't dial d. + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Now, if we disconnect a, we should be allowed to dial d because we have a + // free upgrade slot. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + + // However, if we disconnect b (such that only c and d are connected), we + // should not be allowed to dial e even though there are upgrade slots, + // because there are no lower-scored nodes that can be upgraded. + require.NoError(t, peerManager.Disconnected(b.NodeID)) + require.NoError(t, peerManager.Add(e)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_UpgradeReservesPeer(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, because a is the only connected + // peer that can be upgraded, and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) +} + +func TestPeerManager_TryDialNext_DialingConnected(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + a := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "localhost"} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + b := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and dial it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Adding a's TCP address will not dispense a, since it's already dialing. + require.NoError(t, peerManager.Add(aTCP)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Marking a as dialed will still not dispense it. + require.NoError(t, peerManager.Dialed(a)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Adding b and accepting a connection from it will not dispense it either. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(bID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_Multiple(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + bID := p2p.NodeID(strings.Repeat("b", 40)) + addresses := []p2p.NodeAddress{ + {Protocol: "memory", NodeID: aID}, + {Protocol: "memory", NodeID: bID}, + {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1"}, + {Protocol: "tcp", NodeID: bID, Hostname: "::1"}, + } + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + for _, address := range addresses { + require.NoError(t, peerManager.Add(address)) + } + + // All addresses should be dispensed as long as dialing them has failed. + dial := []p2p.NodeAddress{} + for range addresses { + address, err := peerManager.TryDialNext() + require.NoError(t, err) + require.NotZero(t, address) + require.NoError(t, peerManager.DialFailed(address)) + dial = append(dial, address) + } + require.ElementsMatch(t, dial, addresses) + + address, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, address) +} + +func TestPeerManager_DialFailed(t *testing.T) { + // DialFailed is tested through other tests, we'll just check a few basic + // things here, e.g. reporting unknown addresses. + aID := p2p.NodeID(strings.Repeat("a", 40)) + a := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + bID := p2p.NodeID(strings.Repeat("b", 40)) + b := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Dialing and then calling DialFailed with a different address (same + // NodeID) should unmark as dialing and allow us to dial the other address + // again, but not register the failed address. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(p2p.NodeAddress{ + Protocol: "tcp", NodeID: aID, Hostname: "localhost"})) + require.Equal(t, []p2p.NodeAddress{a}, peerManager.Addresses(aID)) + + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Calling DialFailed on same address twice should be fine. + require.NoError(t, peerManager.DialFailed(a)) + require.NoError(t, peerManager.DialFailed(a)) + + // DialFailed on an unknown peer shouldn't error or add it. + require.NoError(t, peerManager.DialFailed(b)) + require.Equal(t, []p2p.NodeID{aID}, peerManager.Peers()) +} + +func TestPeerManager_DialFailed_UnreservePeer(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, even though it could upgrade a and we + // have free upgrade slots, because a is the only connected peer that can be + // upgraded and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // Failing b's dial will now make c available for dialing. + require.NoError(t, peerManager.DialFailed(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) +} + +func TestPeerManager_Dialed_Connected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking a as dialed twice should error. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Dialed(a)) + require.Error(t, peerManager.Dialed(a)) + + // Accepting a connection from b and then trying to mark it as dialed should fail. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + require.NoError(t, peerManager.Accepted(b.NodeID)) + require.Error(t, peerManager.Dialed(b)) +} + +func TestPeerManager_Dialed_Self(t *testing.T) { + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Dialing self should error. + require.Error(t, peerManager.Add(p2p.NodeAddress{Protocol: "memory", NodeID: selfID})) +} + +func TestPeerManager_Dialed_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + // Start to dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Marking b as dialed in the meanwhile (even without TryDialNext) + // should be fine. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Completing the dial for a should now error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{c.NodeID: 1, d.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a and b is fine. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Starting an upgrade of c should be fine. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + require.NoError(t, peerManager.Dialed(c)) + + // Trying to mark d dialed should fail, since there are no more upgrade + // slots and a/b haven't been evicted yet. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Dialed(d)) +} + +func TestPeerManager_Dialed_Unknown(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking an unknown node as dialed should error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 2, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a is fine. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Upgrading it with b should work, since b has a higher score. + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + + // a hasn't been evicted yet, but c shouldn't be allowed to upgrade anyway + // since it's about to be evicted. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // a should now be evicted. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Dialed_UpgradeEvenLower(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 3, + b.NodeID: 2, + c.NodeID: 10, + d.NodeID: 1, + }, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Start an upgrade with c, which should pick b to upgrade (since it + // has score 2). + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // In the meanwhile, a disconnects and d connects. d is even lower-scored + // than b (1 vs 2), which is currently being upgraded. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NoError(t, peerManager.Add(d)) + require.NoError(t, peerManager.Accepted(d.NodeID)) + + // Once c completes the upgrade of b, it should instead evict d, + // since it has en even lower score. + require.NoError(t, peerManager.Dialed(c)) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, d.NodeID, evict) +} + +func TestPeerManager_Dialed_UpgradeNoEvict(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 1, + b.NodeID: 2, + c.NodeID: 3, + }, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Start an upgrade with c, which should pick a to upgrade. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // In the meanwhile, b disconnects. + require.NoError(t, peerManager.Disconnected(b.NodeID)) + + // Once c completes the upgrade of b, there is no longer a need to + // evict anything since we're at capacity. + // since it has en even lower score. + require.NoError(t, peerManager.Dialed(c)) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) +} + +func TestPeerManager_Accepted(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Accepting a connection from self should error. + require.Error(t, peerManager.Accepted(selfID)) + + // Accepting a connection from a known peer should work. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Accepting a connection from an already accepted peer should error. + require.Error(t, peerManager.Accepted(a.NodeID)) + + // Accepting a connection from an unknown peer should work and register it. + require.NoError(t, peerManager.Accepted(b.NodeID)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + + // Accepting a connection from a peer that's being dialed should work, and + // should cause the dial to fail. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + require.NoError(t, peerManager.Accepted(c.NodeID)) + require.Error(t, peerManager.Dialed(c)) + + // Accepting a connection from a peer that's been dialed should fail. + require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should now fail. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + c.NodeID: 1, + d.NodeID: 2, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Dial a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Accepting b should fail, since it's not an upgrade over a. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should work, since it upgrades a. + require.NoError(t, peerManager.Add(c)) + require.NoError(t, peerManager.Accepted(c.NodeID)) + + // a still hasn't been evicted, so accepting b should still fail. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Also, accepting d should fail, since all upgrade slots are full. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + b.NodeID: 1, + c.NodeID: 1, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Accept a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Accepting b should work, since it upgrades a. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // c cannot get accepted, since a has been upgraded by b. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) + + // This should cause a to get evicted. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + + // c still cannot get accepted, since it's not scored above b. + require.Error(t, peerManager.Accepted(c.NodeID)) +} + +func TestPeerManager_Accepted_UpgradeDialing(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + b.NodeID: 1, + c.NodeID: 1, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Accept a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Start dial upgrade from a to b. + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // a has already been claimed as an upgrade of a, so accepting + // c should fail since there's noone else to upgrade. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) + + // However, if b connects to us while we're also trying to upgrade to it via + // dialing, then we accept the incoming connection as an upgrade. + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // This should cause a to get evicted, and the dial upgrade to fail. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + require.Error(t, peerManager.Dialed(b)) +} + +func TestPeerManager_Ready(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + // Connecting to a should still have it as status down. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID)) + + // Marking a as ready should transition it to PeerStatusUp and send an update. + require.NoError(t, peerManager.Ready(a.NodeID)) + require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID)) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusUp, + }, <-sub.Updates()) + + // Marking an unconnected peer as ready should do nothing. + require.NoError(t, peerManager.Add(b)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID)) + require.NoError(t, peerManager.Ready(b.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID)) + require.Empty(t, sub.Updates()) +} + +// See TryEvictNext for most tests, this just tests blocking behavior. +func TestPeerManager_EvictNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Since there are no peers to evict, EvictNext should block until timeout. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) + + // Erroring the peer will return it from EvictNext(). + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err := peerManager.EvictNext(timeoutCtx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + // Since there are no more peers to evict, the next call should block. + timeoutCtx, cancel = context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_EvictNext_WakeOnError(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to error a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + }() + + // This will block until peer errors above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_EvictNext_WakeOnUpgradeDialed(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1}, + }) + require.NoError(t, err) + + // Connect a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to upgrade to b with a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + }() + + // This will block until peer is upgraded above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_EvictNext_WakeOnUpgradeAccepted(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1}, + }) + require.NoError(t, err) + + // Connect a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to upgrade b with a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Accepted(b.NodeID)) + }() + + // This will block until peer is upgraded above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} +func TestPeerManager_TryEvictNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Nothing is evicted with no peers connected. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // Connecting to a won't evict anything either. + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // But if a errors it should be evicted. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + // While a is being evicted (before disconnect), it shouldn't get evicted again. + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) +} + +func TestPeerManager_Disconnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + // Disconnecting an unknown peer does nothing. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, peerManager.Peers()) + require.Empty(t, sub.Updates()) + + // Disconnecting an accepted non-ready peer does not send a status update. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, sub.Updates()) + + // Disconnecting a ready peer sends a status update. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusUp, + }, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusDown, + }, <-sub.Updates()) + + // Disconnecting a dialing peer does not unmark it as dialing, to avoid + // dialing it multiple times in parallel. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_Errored(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Erroring an unknown peer does nothing. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + require.Empty(t, peerManager.Peers()) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // Erroring a known peer does nothing, and won't evict it later, + // even when it connects. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // However, erroring once connected will evict it. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Subscribe(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // This tests all subscription events for full peer lifecycles. + sub := peerManager.Subscribe() + defer sub.Close() + + require.NoError(t, peerManager.Add(a)) + require.Empty(t, sub.Updates()) + + // Inbound connection. + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates()) + + // Outbound connection with peer error and eviction. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Dialed(a)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + require.Empty(t, sub.Updates()) + + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates()) + + // Outbound connection with dial failure. + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.DialFailed(a)) + require.Empty(t, sub.Updates()) +} + +func TestPeerManager_Subscribe_Close(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + sub := peerManager.Subscribe() + defer sub.Close() + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + // Closing the subscription should not send us the disconnected update. + sub.Close() + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Empty(t, sub.Updates()) +} + +func TestPeerManager_Subscribe_Broadcast(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + s1 := peerManager.Subscribe() + defer s1.Close() + s2 := peerManager.Subscribe() + defer s2.Close() + s3 := peerManager.Subscribe() + defer s3.Close() + + // Connecting to a peer should send updates on all subscriptions. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + expectUp := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp} + require.NotEmpty(t, s1) + require.Equal(t, expectUp, <-s1.Updates()) + require.NotEmpty(t, s2) + require.Equal(t, expectUp, <-s2.Updates()) + require.NotEmpty(t, s3) + require.Equal(t, expectUp, <-s3.Updates()) + + // We now close s2. Disconnecting the peer should only send updates + // on s1 and s3. + s2.Close() + require.NoError(t, peerManager.Disconnected(a.NodeID)) + + expectDown := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown} + require.NotEmpty(t, s1) + require.Equal(t, expectDown, <-s1.Updates()) + require.Empty(t, s2.Updates()) + require.NotEmpty(t, s3) + require.Equal(t, expectDown, <-s3.Updates()) +} + +func TestPeerManager_Close(t *testing.T) { + // leaktest will check that spawned goroutines are closed. + t.Cleanup(leaktest.CheckTimeout(t, 1*time.Second)) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + MinRetryTime: 10 * time.Second, + }) + require.NoError(t, err) + + // This subscription isn't closed, but PeerManager.Close() + // should reap the spawned goroutine. + _ = peerManager.Subscribe() + + // This dial failure will start a retry timer for 10 seconds, which + // should be reaped. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(a)) + + // This should clean up the goroutines. + peerManager.Close() +} + +func TestPeerManager_Advertise(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"} + aMem := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"} + bMem := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80} + cMem := p2p.NodeAddress{Protocol: "memory", NodeID: cID} + + dID := p2p.NodeID(strings.Repeat("d", 40)) + + // Create an initial peer manager and add the peers. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{aID: 3, bID: 2, cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.NoError(t, peerManager.Add(aTCP)) + require.NoError(t, peerManager.Add(aMem)) + require.NoError(t, peerManager.Add(bTCP)) + require.NoError(t, peerManager.Add(bMem)) + require.NoError(t, peerManager.Add(cTCP)) + require.NoError(t, peerManager.Add(cMem)) + + // d should get all addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(dID, 100)) + + // a should not get its own addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(aID, 100)) + + // Asking for 0 addresses should return, well, 0. + require.Empty(t, peerManager.Advertise(aID, 0)) + + // Asking for 2 addresses should get the highest-rated ones, i.e. a. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, + }, peerManager.Advertise(dID, 2)) +} + +func TestPeerManager_SetHeight_GetHeight(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(selfID, db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Getting a height should default to 0, for unknown peers and + // for known peers without height. + require.NoError(t, peerManager.Add(a)) + require.EqualValues(t, 0, peerManager.GetHeight(a.NodeID)) + require.EqualValues(t, 0, peerManager.GetHeight(b.NodeID)) + + // Setting a height should work for a known node. + require.NoError(t, peerManager.SetHeight(a.NodeID, 3)) + require.EqualValues(t, 3, peerManager.GetHeight(a.NodeID)) + + // Setting a height should add an unknown node. + require.Equal(t, []p2p.NodeID{a.NodeID}, peerManager.Peers()) + require.NoError(t, peerManager.SetHeight(b.NodeID, 7)) + require.EqualValues(t, 7, peerManager.GetHeight(b.NodeID)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + + // The heights should not be persisted. + peerManager.Close() + peerManager, err = p2p.NewPeerManager(selfID, db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + require.Zero(t, peerManager.GetHeight(a.NodeID)) + require.Zero(t, peerManager.GetHeight(b.NodeID)) +} diff --git a/p2p/pex/reactor.go b/p2p/pex/reactor.go index 4762260b0..b2786a360 100644 --- a/p2p/pex/reactor.go +++ b/p2p/pex/reactor.go @@ -30,7 +30,7 @@ type ReactorV2 struct { peerManager *p2p.PeerManager pexCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} } @@ -39,7 +39,7 @@ func NewReactorV2( logger log.Logger, peerManager *p2p.PeerManager, pexCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *ReactorV2 { r := &ReactorV2{ peerManager: peerManager, @@ -85,7 +85,7 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error { switch msg := envelope.Message.(type) { case *protop2p.PexRequest: pexAddresses := r.resolve(r.peerManager.Advertise(envelope.From, maxAddresses), maxAddresses) - r.pexCh.Out() <- p2p.Envelope{ + r.pexCh.Out <- p2p.Envelope{ To: envelope.From, Message: &protop2p.PexResponse{Addresses: pexAddresses}, } @@ -177,13 +177,12 @@ func (r *ReactorV2) processPexCh() { for { select { - case envelope := <-r.pexCh.In(): - if err := r.handleMessage(r.pexCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID(), "envelope", envelope, "err", err) - r.pexCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.pexCh.In: + if err := r.handleMessage(r.pexCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID, "envelope", envelope, "err", err) + r.pexCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -197,11 +196,11 @@ func (r *ReactorV2) processPexCh() { // processPeerUpdate processes a PeerUpdate. For added peers, PeerStatusUp, we // send a request for addresses. func (r *ReactorV2) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) if peerUpdate.Status == p2p.PeerStatusUp { - r.pexCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + r.pexCh.Out <- p2p.Envelope{ + To: peerUpdate.NodeID, Message: &protop2p.PexRequest{}, } } diff --git a/p2p/router.go b/p2p/router.go index 669c96265..ff2335fe9 100644 --- a/p2p/router.go +++ b/p2p/router.go @@ -9,11 +9,109 @@ import ( "time" "github.com/gogo/protobuf/proto" + "github.com/tendermint/tendermint/crypto" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/service" ) +// ChannelID is an arbitrary channel ID. +type ChannelID uint16 + +// Envelope contains a message with sender/receiver routing info. +type Envelope struct { + From NodeID // sender (empty if outbound) + To NodeID // receiver (empty if inbound) + Broadcast bool // send to all connected peers (ignores To) + Message proto.Message // message payload + + // channelID is for internal Router use, set on outbound messages to inform + // the sendPeer() goroutine which transport channel to use. + // + // FIXME: If we migrate the Transport API to a byte-oriented multi-stream + // API, this will no longer be necessary since each channel will be mapped + // onto a stream during channel/peer setup. See: + // https://github.com/tendermint/spec/pull/227 + channelID ChannelID +} + +// PeerError is a peer error reported via Channel.Error. +// +// FIXME: This currently just disconnects the peer, which is too simplistic. +// For example, some errors should be logged, some should cause disconnects, +// and some should ban the peer. +// +// FIXME: This should probably be replaced by a more general PeerBehavior +// concept that can mark good and bad behavior and contributes to peer scoring. +// It should possibly also allow reactors to request explicit actions, e.g. +// disconnection or banning, in addition to doing this based on aggregates. +type PeerError struct { + NodeID NodeID + Err error +} + +// Channel is a bidirectional channel to exchange Protobuf messages with peers, +// wrapped in Envelope to specify routing info (i.e. sender/receiver). +type Channel struct { + ID ChannelID + In <-chan Envelope // inbound messages (peers to reactors) + Out chan<- Envelope // outbound messages (reactors to peers) + Error chan<- PeerError // peer error reporting + + messageType proto.Message // the channel's message type, used for unmarshalling + closeCh chan struct{} + closeOnce sync.Once +} + +// NewChannel creates a new channel. It is primarily for internal and test +// use, reactors should use Router.OpenChannel(). +func NewChannel( + id ChannelID, + messageType proto.Message, + inCh <-chan Envelope, + outCh chan<- Envelope, + errCh chan<- PeerError, +) *Channel { + return &Channel{ + ID: id, + messageType: messageType, + In: inCh, + Out: outCh, + Error: errCh, + closeCh: make(chan struct{}), + } +} + +// Close closes the channel. Future sends on Out and Error will panic. The In +// channel remains open to avoid having to synchronize Router senders, which +// should use Done() to detect channel closure instead. +func (c *Channel) Close() { + c.closeOnce.Do(func() { + close(c.closeCh) + close(c.Out) + close(c.Error) + }) +} + +// Done returns a channel that's closed when Channel.Close() is called. +func (c *Channel) Done() <-chan struct{} { + return c.closeCh +} + +// Wrapper is a Protobuf message that can contain a variety of inner messages +// (e.g. via oneof fields). If a Channel's message type implements Wrapper, the +// Router will automatically wrap outbound messages and unwrap inbound messages, +// such that reactors do not have to do this themselves. +type Wrapper interface { + proto.Message + + // Wrap will take a message and wrap it in this one if possible. + Wrap(proto.Message) error + + // Unwrap will unwrap the inner message contained in this message. + Unwrap() (proto.Message, error) +} + // RouterOptions specifies options for a Router. type RouterOptions struct { // ResolveTimeout is the timeout for resolving NodeAddress URLs. @@ -28,83 +126,62 @@ type RouterOptions struct { HandshakeTimeout time.Duration } -// Validate validates the options. +// Validate validates router options. func (o *RouterOptions) Validate() error { return nil } // Router manages peer connections and routes messages between peers and reactor -// channels. This is an early prototype. +// channels. It takes a PeerManager for peer lifecycle management (e.g. which +// peers to dial and when) and a set of Transports for connecting and +// communicating with peers. // -// Channels are registered via OpenChannel(). When called, we register an input -// message queue for the channel in channelQueues and spawn off a goroutine for -// Router.routeChannel(). This goroutine reads off outbound messages and puts -// them in the appropriate peer message queue, and processes peer errors which -// will close (and thus disconnect) the appriate peer queue. It runs until -// either the channel is closed by the caller or the router is stopped, at which -// point the input message queue is closed and removed. +// On startup, three main goroutines are spawned to maintain peer connections: // -// On startup, the router spawns off three primary goroutines that maintain -// connections to peers and run for the lifetime of the router: +// dialPeers(): in a loop, calls PeerManager.DialNext() to get the next peer +// address to dial and spawns a goroutine that dials the peer, handshakes +// with it, and begins to route messages if successful. // -// Router.dialPeers(): in a loop, asks the PeerManager for the next peer -// address to contact, resolves it into endpoints, and attempts to dial -// each one. +// acceptPeers(): in a loop, waits for an inbound connection via +// Transport.Accept() and spawns a goroutine that handshakes with it and +// begins to route messages if successful. // -// Router.acceptPeers(): in a loop, waits for the next inbound connection -// from a peer, and checks with the PeerManager if it should be accepted. +// evictPeers(): in a loop, calls PeerManager.EvictNext() to get the next +// peer to evict, and disconnects it by closing its message queue. // -// Router.evictPeers(): in a loop, asks the PeerManager for any connected -// peers to evict, and disconnects them. +// When a peer is connected, an outbound peer message queue is registered in +// peerQueues, and routePeer() is called to spawn off two additional goroutines: // -// Once either an inbound or outbound connection has been made, an outbound -// message queue is registered in Router.peerQueues and a goroutine is spawned -// off for Router.routePeer() which will spawn off additional goroutines for -// Router.sendPeer() that sends outbound messages from the peer queue over the -// connection and for Router.receivePeer() that reads inbound messages from -// the connection and places them in the appropriate channel queue. When either -// goroutine exits, the connection and peer queue is closed, which will cause -// the other goroutines to close as well. +// sendPeer(): waits for an outbound message from the peerQueues queue, +// marshals it, and passes it to the peer transport which delivers it. // -// The peerStore is used to coordinate peer connections, by only allowing a peer -// to be claimed (owned) by a single caller at a time (both for outbound and -// inbound connections). This is done either via peerStore.Dispense() which -// dispenses and claims an eligible peer to dial, or via peerStore.Claim() which -// attempts to claim a given peer for an inbound connection. Peers must be -// returned to the peerStore with peerStore.Return() to release the claim. Over -// time, the peerStore will also do peer scheduling and prioritization, e.g. -// ensuring we do exponential backoff on dial failures and connecting to -// more important peers first (such as persistent peers and validators). +// receivePeer(): waits for an inbound message from the peer transport, +// unmarshals it, and passes it to the appropriate inbound channel queue +// in channelQueues. // -// An additional goroutine Router.broadcastPeerUpdates() is also spawned off -// on startup, which consumes peer updates from Router.peerUpdatesCh (currently -// only connections and disconnections), and broadcasts them to all peer update -// subscriptions registered via SubscribePeerUpdates(). +// When a reactor opens a channel via OpenChannel, an inbound channel message +// queue is registered in channelQueues, and a channel goroutine is spawned: // -// On router shutdown, we close Router.stopCh which will signal to all -// goroutines to terminate. This in turn will cause all pending channel/peer -// queues to close, and we wait for this as a signal that goroutines have ended. +// routeChannel(): waits for an outbound message from the channel, looks +// up the recipient peer's outbound message queue in peerQueues, and submits +// the message to it. // -// All message scheduling should be limited to the queue implementations used -// for channel queues and peer queues. All message sending throughout the router -// is blocking, and if any messages should be dropped or buffered this is the -// sole responsibility of the queue, such that we can limit this logic to a -// single place. There is currently only a FIFO queue implementation that always -// blocks and never drops messages, but this must be improved with other -// implementations. The only exception is that all message sending must also -// select on appropriate channel/queue/router closure signals, to avoid blocking -// forever on a channel that has no consumer. +// All channel sends in the router are blocking. It is the responsibility of the +// queue interface in peerQueues and channelQueues to prioritize and drop +// messages as appropriate during contention to prevent stalls and ensure good +// quality of service. type Router struct { *service.BaseService - logger log.Logger - nodeInfo NodeInfo - privKey crypto.PrivKey - transports map[Protocol]Transport - peerManager *PeerManager - options RouterOptions + logger log.Logger + options RouterOptions + nodeInfo NodeInfo + privKey crypto.PrivKey + peerManager *PeerManager + transports []Transport + protocolTransports map[Protocol]Transport + stopCh chan struct{} // signals Router shutdown - // FIXME: Consider using sync.Map. peerMtx sync.RWMutex peerQueues map[NodeID]queue @@ -114,12 +191,11 @@ type Router struct { channelMtx sync.RWMutex channelQueues map[ChannelID]queue channelMessages map[ChannelID]proto.Message - - // stopCh is used to signal router shutdown, by closing the channel. - stopCh chan struct{} } -// NewRouter creates a new Router. +// NewRouter creates a new Router. The given Transports must already be +// listening on appropriate interfaces, and will be closed by the Router when it +// stops. func NewRouter( logger log.Logger, nodeInfo NodeInfo, @@ -133,23 +209,24 @@ func NewRouter( } router := &Router{ - logger: logger, - nodeInfo: nodeInfo, - privKey: privKey, - transports: map[Protocol]Transport{}, - peerManager: peerManager, - options: options, - stopCh: make(chan struct{}), - channelQueues: map[ChannelID]queue{}, - channelMessages: map[ChannelID]proto.Message{}, - peerQueues: map[NodeID]queue{}, + logger: logger, + nodeInfo: nodeInfo, + privKey: privKey, + transports: transports, + protocolTransports: map[Protocol]Transport{}, + peerManager: peerManager, + options: options, + stopCh: make(chan struct{}), + channelQueues: map[ChannelID]queue{}, + channelMessages: map[ChannelID]proto.Message{}, + peerQueues: map[NodeID]queue{}, } router.BaseService = service.NewBaseService(logger, "router", router) for _, transport := range transports { for _, protocol := range transport.Protocols() { - if _, ok := router.transports[protocol]; !ok { - router.transports[protocol] = transport + if _, ok := router.protocolTransports[protocol]; !ok { + router.protocolTransports[protocol] = transport } } } @@ -158,12 +235,20 @@ func NewRouter( } // OpenChannel opens a new channel for the given message type. The caller must -// close the channel when done, and this must happen before the router stops. +// close the channel when done, before stopping the Router. messageType is the +// type of message passed through the channel (used for unmarshaling), which can +// implement Wrapper to automatically (un)wrap multiple message types in a +// wrapper message. func (r *Router) OpenChannel(id ChannelID, messageType proto.Message) (*Channel, error) { - // FIXME: NewChannel should take directional channels so we can pass - // queue.dequeue() instead of reaching inside for queue.queueCh. queue := newFIFOQueue() - channel := NewChannel(id, messageType, queue.queueCh, make(chan Envelope), make(chan PeerError)) + outCh := make(chan Envelope) + errCh := make(chan PeerError) + channel := NewChannel(id, messageType, queue.dequeue(), outCh, errCh) + + var wrapper Wrapper + if w, ok := messageType.(Wrapper); ok { + wrapper = w + } r.channelMtx.Lock() defer r.channelMtx.Unlock() @@ -182,98 +267,97 @@ func (r *Router) OpenChannel(id ChannelID, messageType proto.Message) (*Channel, r.channelMtx.Unlock() queue.close() }() - r.routeChannel(channel) + + r.routeChannel(id, outCh, errCh, wrapper) }() return channel, nil } -// routeChannel receives outbound messages and errors from a channel and routes -// them to the appropriate peer. It returns when either the channel is closed or -// the router is shutting down. -func (r *Router) routeChannel(channel *Channel) { +// routeChannel receives outbound channel messages and routes them to the +// appropriate peer. It also receives peer errors and reports them to the peer +// manager. It returns when either the outbound channel or error channel is +// closed, or the Router is stopped. wrapper is an optional message wrapper +// for messages, see Wrapper for details. +func (r *Router) routeChannel( + chID ChannelID, + outCh <-chan Envelope, + errCh <-chan PeerError, + wrapper Wrapper, +) { for { select { - case envelope, ok := <-channel.outCh: + case envelope, ok := <-outCh: if !ok { return } - // FIXME: This is a bit unergonomic, maybe it'd be better for Wrap() - // to return a wrapped copy. - if _, ok := channel.messageType.(Wrapper); ok { - wrapper := proto.Clone(channel.messageType) - if err := wrapper.(Wrapper).Wrap(envelope.Message); err != nil { - r.Logger.Error("failed to wrap message", "err", err) + // Mark the envelope with the channel ID to allow sendPeer() to pass + // it on to Transport.SendMessage(). + envelope.channelID = chID + + // Wrap the message in a wrapper message, if requested. + if wrapper != nil { + msg := proto.Clone(wrapper) + if err := msg.(Wrapper).Wrap(envelope.Message); err != nil { + r.Logger.Error("failed to wrap message", "channel", chID, "err", err) continue } - envelope.Message = wrapper + envelope.Message = msg } - envelope.channelID = channel.id + // Collect peer queues to pass the message via. + var queues []queue if envelope.Broadcast { r.peerMtx.RLock() - peerQueues := make(map[NodeID]queue, len(r.peerQueues)) - for peerID, peerQueue := range r.peerQueues { - peerQueues[peerID] = peerQueue + queues = make([]queue, 0, len(r.peerQueues)) + for _, q := range r.peerQueues { + queues = append(queues, q) } r.peerMtx.RUnlock() - - for peerID, peerQueue := range peerQueues { - e := envelope - e.Broadcast = false - e.To = peerID - select { - case peerQueue.enqueue() <- e: - case <-peerQueue.closed(): - case <-r.stopCh: - return - } - } - } else { r.peerMtx.RLock() - peerQueue, ok := r.peerQueues[envelope.To] + q, ok := r.peerQueues[envelope.To] r.peerMtx.RUnlock() if !ok { - r.logger.Error("dropping message for non-connected peer", - "peer", envelope.To, "channel", channel.id) + r.logger.Debug("dropping message for unconnected peer", + "peer", envelope.To, "channel", chID) continue } + queues = []queue{q} + } + // Send message to peers. + for _, q := range queues { select { - case peerQueue.enqueue() <- envelope: - case <-peerQueue.closed(): - r.logger.Error("dropping message for non-connected peer", - "peer", envelope.To, "channel", channel.id) + case q.enqueue() <- envelope: + case <-q.closed(): + r.logger.Debug("dropping message for unconnected peer", + "peer", envelope.To, "channel", chID) case <-r.stopCh: return } } - case peerError, ok := <-channel.errCh: + case peerError, ok := <-errCh: if !ok { return } - // FIXME: We just disconnect the peer for now - r.logger.Error("peer error, disconnecting", "peer", peerError.PeerID, "err", peerError.Err) - r.peerMtx.RLock() - peerQueue, ok := r.peerQueues[peerError.PeerID] - r.peerMtx.RUnlock() - if ok { - peerQueue.close() + r.logger.Error("peer error, evicting", "peer", peerError.NodeID, "err", peerError.Err) + if err := r.peerManager.Errored(peerError.NodeID, peerError.Err); err != nil { + r.logger.Error("failed to report peer error", "peer", peerError.NodeID, "err", err) } - case <-channel.Done(): - return case <-r.stopCh: return } } } -// acceptPeers accepts inbound connections from peers on the given transport. +// acceptPeers accepts inbound connections from peers on the given transport, +// and spawns goroutines that route messages to/from them. func (r *Router) acceptPeers(transport Transport) { + r.logger.Debug("starting accept routine", "transport", transport) ctx := r.stopCtx() for { // FIXME: We may need transports to enforce some sort of rate limiting @@ -301,34 +385,37 @@ func (r *Router) acceptPeers(transport Transport) { return default: r.logger.Error("failed to accept connection", "transport", transport, "err", err) - continue + return } + // Spawn a goroutine for the handshake, to avoid head-of-line blocking. go func() { - defer func() { - _ = conn.Close() - }() + defer conn.Close() - // FIXME: Because we do the handshake in each transport, rather than - // here in the Router, the remote peer will think they've - // successfully connected and start sending us messages, although we - // can end up rejecting the connection here. This can e.g. cause - // problems in tests, where because of race conditions a - // disconnection can cause the local node to immediately redial, - // while the remote node may not have completed the disconnection - // registration yet and reject the accept below. + // FIXME: The peer manager may reject the peer during Accepted() + // after we've handshaked with the peer (to find out which peer it + // is). However, because the handshake has no ack, the remote peer + // will think the handshake was successful and start sending us + // messages. // - // The Router should do the handshake, and we should check with the - // peer manager before completing the handshake -- this probably - // requires protocol changes to send an additional message when the - // handshake is accepted. + // This can cause problems in tests, where a disconnection can cause + // the local node to immediately redial, while the remote node may + // not have completed the disconnection yet and therefore reject the + // reconnection attempt (since it thinks we're still connected from + // before). + // + // The Router should do the handshake and have a final ack/fail + // message to make sure both ends have accepted the connection, such + // that it can be coordinated with the peer manager. peerInfo, _, err := r.handshakePeer(ctx, conn, "") - if err == context.Canceled { + switch { + case errors.Is(err, context.Canceled): return - } else if err != nil { - r.logger.Error("failed to handshake with peer", "err", err) + case err != nil: + r.logger.Error("peer handshake failed", "endpoint", conn, "err", err) return } + if err := r.peerManager.Accepted(peerInfo.NodeID); err != nil { r.logger.Error("failed to accept connection", "peer", peerInfo.NodeID, "err", err) return @@ -338,7 +425,6 @@ func (r *Router) acceptPeers(transport Transport) { r.peerMtx.Lock() r.peerQueues[peerInfo.NodeID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerInfo.NodeID) defer func() { r.peerMtx.Lock() @@ -350,52 +436,62 @@ func (r *Router) acceptPeers(transport Transport) { } }() + if err := r.peerManager.Ready(peerInfo.NodeID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", peerInfo.NodeID, "err", err) + return + } + r.routePeer(peerInfo.NodeID, conn, queue) }() } } -// dialPeers maintains outbound connections to peers. +// dialPeers maintains outbound connections to peers by dialing them. func (r *Router) dialPeers() { + r.logger.Debug("starting dial routine") ctx := r.stopCtx() for { - peerID, address, err := r.peerManager.DialNext(ctx) - switch err { - case nil: - case context.Canceled: + address, err := r.peerManager.DialNext(ctx) + switch { + case errors.Is(err, context.Canceled): r.logger.Debug("stopping dial routine") return - default: + case err != nil: r.logger.Error("failed to find next peer to dial", "err", err) return } + // Spawn off a goroutine to actually dial the peer, so that we can + // dial multiple peers in parallel. go func() { conn, err := r.dialPeer(ctx, address) - if errors.Is(err, context.Canceled) { + switch { + case errors.Is(err, context.Canceled): return - } else if err != nil { - r.logger.Error("failed to dial peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { - r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) + case err != nil: + r.logger.Error("failed to dial peer", "peer", address, "err", err) + if err = r.peerManager.DialFailed(address); err != nil { + r.logger.Error("failed to report dial failure", "peer", address, "err", err) } return } defer conn.Close() + peerID := address.NodeID _, _, err = r.handshakePeer(ctx, conn, peerID) - if errors.Is(err, context.Canceled) { + switch { + case errors.Is(err, context.Canceled): return - } else if err != nil { - r.logger.Error("failed to handshake with peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { - r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) + case err != nil: + r.logger.Error("failed to handshake with peer", "peer", address, "err", err) + if err = r.peerManager.DialFailed(address); err != nil { + r.logger.Error("failed to report dial failure", "peer", address, "err", err) } return } - if err = r.peerManager.Dialed(peerID, address); err != nil { - r.logger.Error("failed to dial peer", "peer", peerID, "err", err) + if err = r.peerManager.Dialed(address); err != nil { + r.logger.Error("failed to dial peer", "peer", address, "err", err) return } @@ -403,7 +499,6 @@ func (r *Router) dialPeers() { r.peerMtx.Lock() r.peerQueues[peerID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerID) defer func() { r.peerMtx.Lock() @@ -411,10 +506,15 @@ func (r *Router) dialPeers() { r.peerMtx.Unlock() queue.close() if err := r.peerManager.Disconnected(peerID); err != nil { - r.logger.Error("failed to disconnect peer", "peer", peerID, "err", err) + r.logger.Error("failed to disconnect peer", "peer", address, "err", err) } }() + if err := r.peerManager.Ready(peerID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", address, "err", err) + return + } + r.routePeer(peerID, conn, queue) }() } @@ -422,22 +522,26 @@ func (r *Router) dialPeers() { // dialPeer connects to a peer by dialing it. func (r *Router) dialPeer(ctx context.Context, address NodeAddress) (Connection, error) { - r.logger.Info("resolving peer address", "address", address) resolveCtx := ctx if r.options.ResolveTimeout > 0 { var cancel context.CancelFunc resolveCtx, cancel = context.WithTimeout(resolveCtx, r.options.ResolveTimeout) defer cancel() } + + r.logger.Debug("resolving peer address", "peer", address) endpoints, err := address.Resolve(resolveCtx) - if err != nil { + switch { + case err != nil: return nil, fmt.Errorf("failed to resolve address %q: %w", address, err) + case len(endpoints) == 0: + return nil, fmt.Errorf("address %q did not resolve to any endpoints", address) } for _, endpoint := range endpoints { - transport, ok := r.transports[endpoint.Protocol] + transport, ok := r.protocolTransports[endpoint.Protocol] if !ok { - r.logger.Error("no transport found for endpoint protocol", "endpoint", endpoint) + r.logger.Error("no transport found for protocol", "endpoint", endpoint) continue } @@ -457,17 +561,17 @@ func (r *Router) dialPeer(ctx context.Context, address NodeAddress) (Connection, // Internet can't and needs a different public address. conn, err := transport.Dial(dialCtx, endpoint) if err != nil { - r.logger.Error("failed to dial endpoint", "endpoint", endpoint, "err", err) + r.logger.Error("failed to dial endpoint", "peer", address.NodeID, "endpoint", endpoint, "err", err) } else { - r.logger.Info("connected to peer", "peer", address.NodeID, "endpoint", endpoint) + r.logger.Debug("dialed peer", "peer", address.NodeID, "endpoint", endpoint) return conn, nil } } - return nil, fmt.Errorf("failed to connect to peer via %q", address) + return nil, errors.New("all endpoints failed") } // handshakePeer handshakes with a peer, validating the peer's information. If -// expectID is given, we check that the peer's public key matches it. +// expectID is given, we check that the peer's info matches it. func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID NodeID) (NodeInfo, crypto.PubKey, error) { if r.options.HandshakeTimeout > 0 { var cancel context.CancelFunc @@ -478,51 +582,47 @@ func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID No if err != nil { return peerInfo, peerKey, err } + if err = peerInfo.Validate(); err != nil { return peerInfo, peerKey, fmt.Errorf("invalid handshake NodeInfo: %w", err) } - if expectID != "" && expectID != peerInfo.NodeID { - return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q", - expectID, peerInfo.NodeID) - } if NodeIDFromPubKey(peerKey) != peerInfo.NodeID { return peerInfo, peerKey, fmt.Errorf("peer's public key did not match its node ID %q (expected %q)", peerInfo.NodeID, NodeIDFromPubKey(peerKey)) } - if peerInfo.NodeID == r.nodeInfo.NodeID { - return peerInfo, peerKey, errors.New("rejecting handshake with self") + if expectID != "" && expectID != peerInfo.NodeID { + return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q", + expectID, peerInfo.NodeID) } return peerInfo, peerKey, nil } -// routePeer routes inbound messages from a peer to channels, and also sends -// outbound queued messages to the peer. It will close the connection and send -// queue, using this as a signal to coordinate the internal receivePeer() and -// sendPeer() goroutines. It blocks until the peer is done, e.g. when the -// connection or queue is closed. +// routePeer routes inbound and outbound messages between a peer and the reactor +// channels. It will close the given connection and send queue when done, or if +// they are closed elsewhere it will cause this method to shut down and return. func (r *Router) routePeer(peerID NodeID, conn Connection, sendQueue queue) { - r.logger.Info("routing peer", "peer", peerID) - resultsCh := make(chan error, 2) + r.logger.Info("peer connected", "peer", peerID, "endpoint", conn) + errCh := make(chan error, 2) go func() { - resultsCh <- r.receivePeer(peerID, conn) + errCh <- r.receivePeer(peerID, conn) }() go func() { - resultsCh <- r.sendPeer(peerID, conn, sendQueue) + errCh <- r.sendPeer(peerID, conn, sendQueue) }() - err := <-resultsCh + err := <-errCh _ = conn.Close() sendQueue.close() - if e := <-resultsCh; err == nil { - // The first err was nil, so we update it with the second result, - // which may or may not be nil. + if e := <-errCh; err == nil { + // The first err was nil, so we update it with the second err, which may + // or may not be nil. err = e } switch err { - case nil, io.EOF, ErrTransportClosed{}: - r.logger.Info("peer disconnected", "peer", peerID) + case nil, io.EOF: + r.logger.Info("peer disconnected", "peer", peerID, "endpoint", conn) default: - r.logger.Error("peer failure", "peer", peerID, "err", err) + r.logger.Error("peer failure", "peer", peerID, "endpoint", conn, "err", err) } } @@ -540,7 +640,7 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error { messageType := r.channelMessages[chID] r.channelMtx.RUnlock() if !ok { - r.logger.Error("dropping message for unknown channel", "peer", peerID, "channel", chID) + r.logger.Debug("dropping message for unknown channel", "peer", peerID, "channel", chID) continue } @@ -558,10 +658,10 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error { } select { - case queue.enqueue() <- Envelope{channelID: chID, From: peerID, Message: msg}: + case queue.enqueue() <- Envelope{From: peerID, Message: msg}: r.logger.Debug("received message", "peer", peerID, "message", msg) case <-queue.closed(): - r.logger.Error("channel closed, dropping message", "peer", peerID, "channel", chID) + r.logger.Debug("channel closed, dropping message", "peer", peerID, "channel", chID) case <-r.stopCh: return nil } @@ -573,6 +673,10 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error { for { select { case envelope := <-queue.dequeue(): + if envelope.Message == nil { + r.logger.Error("dropping nil message", "peer", peerID) + continue + } bz, err := proto.Marshal(envelope.Message) if err != nil { r.logger.Error("failed to marshal message", "peer", peerID, "err", err) @@ -596,43 +700,57 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error { // evictPeers evicts connected peers as requested by the peer manager. func (r *Router) evictPeers() { + r.logger.Debug("starting evict routine") ctx := r.stopCtx() for { peerID, err := r.peerManager.EvictNext(ctx) - switch err { - case nil: - case context.Canceled: + switch { + case errors.Is(err, context.Canceled): r.logger.Debug("stopping evict routine") return - default: + case err != nil: r.logger.Error("failed to find next peer to evict", "err", err) return } r.logger.Info("evicting peer", "peer", peerID) r.peerMtx.RLock() - if queue, ok := r.peerQueues[peerID]; ok { + queue, ok := r.peerQueues[peerID] + r.peerMtx.RUnlock() + if ok { queue.close() } - r.peerMtx.RUnlock() } } // OnStart implements service.Service. func (r *Router) OnStart() error { go r.dialPeers() + go r.evictPeers() for _, transport := range r.transports { go r.acceptPeers(transport) } - go r.evictPeers() return nil } // OnStop implements service.Service. // -// FIXME: This needs to close transports as well. +// All channels must be closed by OpenChannel() callers before stopping the +// router, to prevent blocked channel sends in reactors. Channels are not closed +// here, since that would cause any reactor senders to panic, so it is the +// sender's responsibility. func (r *Router) OnStop() { - // Collect all active queues, so we can wait for them to close. + // Signal router shutdown. + close(r.stopCh) + + // Close transport listeners (unblocks Accept calls). + for _, transport := range r.transports { + if err := transport.Close(); err != nil { + r.logger.Error("failed to close transport", "transport", transport, "err", err) + } + } + + // Collect all remaining queues, and wait for them to close. queues := []queue{} r.channelMtx.RLock() for _, q := range r.channelQueues { @@ -644,16 +762,12 @@ func (r *Router) OnStop() { queues = append(queues, q) } r.peerMtx.RUnlock() - - // Signal router shutdown, and wait for queues (and thus goroutines) - // to complete. - close(r.stopCh) for _, q := range queues { <-q.closed() } } -// stopCtx returns a context that is cancelled when the router stops. +// stopCtx returns a new context that is cancelled when the router stops. func (r *Router) stopCtx() context.Context { ctx, cancel := context.WithCancel(context.Background()) go func() { diff --git a/p2p/router_test.go b/p2p/router_test.go index 8f5cb91b9..6d1c59bc2 100644 --- a/p2p/router_test.go +++ b/p2p/router_test.go @@ -2,159 +2,644 @@ package p2p_test import ( "errors" + "fmt" + "io" + "strings" + "sync" "testing" + "time" "github.com/fortytw2/leaktest" + "github.com/gogo/protobuf/proto" gogotypes "github.com/gogo/protobuf/types" - "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" dbm "github.com/tendermint/tm-db" "github.com/tendermint/tendermint/crypto" - "github.com/tendermint/tendermint/crypto/ed25519" "github.com/tendermint/tendermint/libs/log" + tmsync "github.com/tendermint/tendermint/libs/sync" "github.com/tendermint/tendermint/p2p" + "github.com/tendermint/tendermint/p2p/mocks" + "github.com/tendermint/tendermint/p2p/p2ptest" ) -type TestMessage = gogotypes.StringValue - -func generateNode() (p2p.NodeInfo, crypto.PrivKey) { - privKey := ed25519.GenPrivKey() - nodeID := p2p.NodeIDFromPubKey(privKey.PubKey()) - nodeInfo := p2p.NodeInfo{ - NodeID: nodeID, - // FIXME: We have to fake a ListenAddr for now. - ListenAddr: "127.0.0.1:1234", - Moniker: "foo", - } - return nodeInfo, privKey -} - func echoReactor(channel *p2p.Channel) { for { select { - case envelope := <-channel.In(): - channel.Out() <- p2p.Envelope{ + case envelope := <-channel.In: + value := envelope.Message.(*p2ptest.Message).Value + channel.Out <- p2p.Envelope{ To: envelope.From, - Message: &TestMessage{Value: envelope.Message.(*TestMessage).Value}, + Message: &p2ptest.Message{Value: value}, } + case <-channel.Done(): return } } } -func TestRouter(t *testing.T) { - defer leaktest.Check(t)() +func TestRouter_Network(t *testing.T) { + t.Cleanup(leaktest.Check(t)) - logger := log.TestingLogger() - network := p2p.NewMemoryNetwork(logger) - nodeInfo, privKey := generateNode() - transport := network.CreateTransport(nodeInfo.NodeID) - defer transport.Close() - chID := p2p.ChannelID(1) + // Create a test network and open a channel where all peers run echoReactor. + network := p2ptest.MakeNetwork(t, 8) + local := network.RandomNode() + peers := network.Peers(local.NodeID) + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) - // Start some other in-memory network nodes to communicate with, running - // a simple echo reactor that returns received messages. - peers := []p2p.NodeAddress{} - for i := 0; i < 3; i++ { - peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) - require.NoError(t, err) - peerInfo, peerKey := generateNode() - peerTransport := network.CreateTransport(peerInfo.NodeID) - defer peerTransport.Close() - peerRouter, err := p2p.NewRouter( - logger.With("peerID", i), - peerInfo, - peerKey, - peerManager, - []p2p.Transport{peerTransport}, - p2p.RouterOptions{}, + channel := channels[local.NodeID] + for _, peer := range peers { + go echoReactor(channels[peer.NodeID]) + } + + // Sending a message to each peer should work. + for _, peer := range peers { + p2ptest.RequireSendReceive(t, channel, peer.NodeID, + &p2ptest.Message{Value: "foo"}, + &p2ptest.Message{Value: "foo"}, ) - require.NoError(t, err) - peers = append(peers, peerTransport.Endpoints()[0].NodeAddress(peerInfo.NodeID)) - - channel, err := peerRouter.OpenChannel(chID, &TestMessage{}) - require.NoError(t, err) - defer channel.Close() - go echoReactor(channel) - - err = peerRouter.Start() - require.NoError(t, err) - defer func() { require.NoError(t, peerRouter.Stop()) }() } - // Start the main router and connect it to the peers above. - peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) - require.NoError(t, err) - defer peerManager.Close() - for _, address := range peers { - err := peerManager.Add(address) - require.NoError(t, err) - } - peerUpdates := peerManager.Subscribe() - defer peerUpdates.Close() - - router, err := p2p.NewRouter(logger, nodeInfo, privKey, peerManager, []p2p.Transport{transport}, p2p.RouterOptions{}) - require.NoError(t, err) - channel, err := router.OpenChannel(chID, &TestMessage{}) - require.NoError(t, err) - defer channel.Close() - - err = router.Start() - require.NoError(t, err) - defer func() { - // Since earlier defers are closed after this, and we have to make sure - // we close channels and subscriptions before the router, we explicitly - // close them here to. - peerUpdates.Close() - channel.Close() - require.NoError(t, router.Stop()) - }() - - // Wait for peers to come online, and ping them as they do. - for i := 0; i < len(peers); i++ { - peerUpdate := <-peerUpdates.Updates() - peerID := peerUpdate.PeerID - require.Equal(t, p2p.PeerUpdate{ - PeerID: peerID, - Status: p2p.PeerStatusUp, - }, peerUpdate) - - channel.Out() <- p2p.Envelope{To: peerID, Message: &TestMessage{Value: "hi!"}} - assert.Equal(t, p2p.Envelope{ - From: peerID, - Message: &TestMessage{Value: "hi!"}, - }, (<-channel.In()).Strip()) - } - - // We now send a broadcast, which we should return back from all peers. - channel.Out() <- p2p.Envelope{ + // Sending a broadcast should return back a message from all peers. + p2ptest.RequireSend(t, channel, p2p.Envelope{ Broadcast: true, - Message: &TestMessage{Value: "broadcast"}, - } - for i := 0; i < len(peers); i++ { - envelope := <-channel.In() - require.Equal(t, &TestMessage{Value: "broadcast"}, envelope.Message) + Message: &p2ptest.Message{Value: "bar"}, + }) + expect := []p2p.Envelope{} + for _, peer := range peers { + expect = append(expect, p2p.Envelope{ + From: peer.NodeID, + Message: &p2ptest.Message{Value: "bar"}, + }) } + p2ptest.RequireReceiveUnordered(t, channel, expect) - // We then submit an error for a peer, and watch it get disconnected. - channel.Error() <- p2p.PeerError{ - PeerID: peers[0].NodeID, - Err: errors.New("test error"), - Severity: p2p.PeerErrorSeverityCritical, + // We then submit an error for a peer, and watch it get disconnected and + // then reconnected as the router retries it. + peerUpdates := local.MakePeerUpdates(t) + channel.Error <- p2p.PeerError{ + NodeID: peers[0].NodeID, + Err: errors.New("boom"), } - peerUpdate := <-peerUpdates.Updates() - require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, - Status: p2p.PeerStatusDown, - }, peerUpdate) - - // The peer manager will automatically reconnect the peer, so we wait - // for that to happen. - peerUpdate = <-peerUpdates.Updates() - require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, - Status: p2p.PeerStatusUp, - }, peerUpdate) + p2ptest.RequireUpdates(t, peerUpdates, []p2p.PeerUpdate{ + {NodeID: peers[0].NodeID, Status: p2p.PeerStatusDown}, + {NodeID: peers[0].NodeID, Status: p2p.PeerStatusUp}, + }) +} + +func TestRouter_Channel(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a router with no transports (so no peers). + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, nil, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + t.Cleanup(func() { + require.NoError(t, router.Stop()) + }) + + // Opening a channel should work. + channel, err := router.OpenChannel(chID, &p2ptest.Message{}) + require.NoError(t, err) + + // Opening the same channel again should fail. + _, err = router.OpenChannel(chID, &p2ptest.Message{}) + require.Error(t, err) + + // Opening a different channel should work. + _, err = router.OpenChannel(2, &p2ptest.Message{}) + require.NoError(t, err) + + // Closing the channel, then opening it again should be fine. + channel.Close() + time.Sleep(100 * time.Millisecond) // yes yes, but Close() is async... + + channel, err = router.OpenChannel(chID, &p2ptest.Message{}) + require.NoError(t, err) + + // We should be able to send on the channel, even though there are no peers. + p2ptest.RequireSend(t, channel, p2p.Envelope{ + To: p2p.NodeID(strings.Repeat("a", 40)), + Message: &p2ptest.Message{Value: "foo"}, + }) + + // A message to ourselves should be dropped. + p2ptest.RequireSend(t, channel, p2p.Envelope{ + To: selfID, + Message: &p2ptest.Message{Value: "self"}, + }) + p2ptest.RequireEmpty(t, channel) +} + +// Channel tests are hairy to mock, so we use an in-memory network instead. +func TestRouter_Channel_SendReceive(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 3) + ids := network.NodeIDs() + aID, bID, cID := ids[0], ids[1], ids[2] + channels := network.MakeChannels(t, chID, &p2ptest.Message{}) + a, b, c := channels[aID], channels[bID], channels[cID] + otherChannels := network.MakeChannels(t, 9, &p2ptest.Message{}) + + // Sending a message a->b should work, and not send anything + // further to a, b, or c. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending a nil message a->c should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: nil}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending a different message type should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending to an unknown peer should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{ + To: p2p.NodeID(strings.Repeat("a", 40)), + Message: &p2ptest.Message{Value: "a"}, + }) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending without a recipient should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{Message: &p2ptest.Message{Value: "noto"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Sending to self should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "self"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // Removing b and sending to it should be dropped. + network.Remove(t, bID) + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "nob"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // After all this, sending a message c->a should work. + p2ptest.RequireSend(t, c, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, a, p2p.Envelope{From: cID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireEmpty(t, a, b, c) + + // None of these messages should have made it onto the other channels. + for _, other := range otherChannels { + p2ptest.RequireEmpty(t, other) + } +} + +func TestRouter_Channel_Broadcast(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 4) + ids := network.NodeIDs() + aID, bID, cID, dID := ids[0], ids[1], ids[2], ids[3] + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) + a, b, c, d := channels[aID], channels[bID], channels[cID], channels[dID] + + // Sending a broadcast from b should work. + p2ptest.RequireSend(t, b, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, a, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, c, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, d, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireEmpty(t, a, b, c, d) + + // Removing one node from the network shouldn't prevent broadcasts from working. + network.Remove(t, dID) + p2ptest.RequireSend(t, a, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireReceive(t, c, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}}) + p2ptest.RequireEmpty(t, a, b, c, d) +} + +func TestRouter_Channel_Wrapper(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 2) + ids := network.NodeIDs() + aID, bID := ids[0], ids[1] + channels := network.MakeChannels(t, 1, &wrapperMessage{}) + a, b := channels[aID], channels[bID] + + // Since wrapperMessage implements p2p.Wrapper and handles Message, it + // should automatically wrap and unwrap sent messages -- we prepend the + // wrapper actions to the message value to signal this. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}}) + p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "unwrap:wrap:foo"}}) + + // If we send a different message that can't be wrapped, it should be dropped. + p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}}) + p2ptest.RequireEmpty(t, b) + + // If we send the wrapper message itself, it should also be passed through + // since WrapperMessage supports it, and should only be unwrapped at the receiver. + p2ptest.RequireSend(t, a, p2p.Envelope{ + To: bID, + Message: &wrapperMessage{Message: p2ptest.Message{Value: "foo"}}, + }) + p2ptest.RequireReceive(t, b, p2p.Envelope{ + From: aID, + Message: &p2ptest.Message{Value: "unwrap:foo"}, + }) + +} + +// WrapperMessage prepends the value with "wrap:" and "unwrap:" to test it. +type wrapperMessage struct { + p2ptest.Message +} + +var _ p2p.Wrapper = (*wrapperMessage)(nil) + +func (w *wrapperMessage) Wrap(inner proto.Message) error { + switch inner := inner.(type) { + case *p2ptest.Message: + w.Message.Value = fmt.Sprintf("wrap:%v", inner.Value) + case *wrapperMessage: + *w = *inner + default: + return fmt.Errorf("invalid message type %T", inner) + } + return nil +} + +func (w *wrapperMessage) Unwrap() (proto.Message, error) { + return &p2ptest.Message{Value: fmt.Sprintf("unwrap:%v", w.Message.Value)}, nil +} + +func TestRouter_Channel_Error(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Create a test network and open a channel on all nodes. + network := p2ptest.MakeNetwork(t, 3) + ids := network.NodeIDs() + aID, bID := ids[0], ids[1] + channels := network.MakeChannels(t, 1, &p2ptest.Message{}) + a := channels[aID] + + // Erroring b should cause it to be disconnected. It will reconnect shortly after. + sub := network.Nodes[aID].MakePeerUpdates(t) + p2ptest.RequireError(t, a, p2p.PeerError{NodeID: bID, Err: errors.New("boom")}) + p2ptest.RequireUpdates(t, sub, []p2p.PeerUpdate{ + {NodeID: bID, Status: p2p.PeerStatusDown}, + {NodeID: bID, Status: p2p.PeerStatusUp}, + }) +} + +func TestRouter_AcceptPeers(t *testing.T) { + testcases := map[string]struct { + peerInfo p2p.NodeInfo + peerKey crypto.PubKey + ok bool + }{ + "valid handshake": {peerInfo, peerKey.PubKey(), true}, + "empty handshake": {p2p.NodeInfo{}, nil, false}, + "invalid key": {peerInfo, selfKey.PubKey(), false}, + "self handshake": {selfInfo, selfKey.PubKey(), false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that handshakes. + closer := tmsync.NewCloser() + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(tc.peerInfo, tc.peerKey, nil) + mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil) + if tc.ok { + mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF) + } + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + if tc.ok { + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: tc.peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + sub.Close() + } else { + select { + case <-closer.Done(): + case <-time.After(100 * time.Millisecond): + require.Fail(t, "connection not closed") + } + } + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) + }) + } +} + +func TestRouter_AcceptPeers_Error(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns an error, which should prevent + // the router from calling Accept again. + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Accept").Once().Return(nil, errors.New("boom")) + mockTransport.On("Close").Return(nil) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + time.Sleep(time.Second) + require.NoError(t, router.Stop()) + + mockTransport.AssertExpectations(t) +} + +func TestRouter_AcceptPeers_ErrorEOF(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns io.EOF once, which should prevent + // the router from calling Accept again. + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + mockTransport.On("Close").Return(nil) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + + require.NoError(t, router.Start()) + time.Sleep(time.Second) + require.NoError(t, router.Stop()) + + mockTransport.AssertExpectations(t) +} + +func TestRouter_AcceptPeers_HeadOfLineBlocking(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that returns a connection that blocks during the + // handshake. It should be able to accept several of these in parallel, i.e. + // a single connection can't halt other connections being accepted. + acceptCh := make(chan bool, 3) + closeCh := make(chan time.Time) + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF) + mockConnection.On("Close").Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Times(3).Run(func(_ mock.Arguments) { + acceptCh <- true + }).Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + require.Eventually(t, func() bool { + return len(acceptCh) == 3 + }, time.Second, 10*time.Millisecond) + close(closeCh) + time.Sleep(100 * time.Millisecond) + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) +} + +func TestRouter_DialPeers(t *testing.T) { + testcases := map[string]struct { + dialID p2p.NodeID + peerInfo p2p.NodeInfo + peerKey crypto.PubKey + dialErr error + ok bool + }{ + "valid dial": {peerInfo.NodeID, peerInfo, peerKey.PubKey(), nil, true}, + "empty handshake": {peerInfo.NodeID, p2p.NodeInfo{}, nil, nil, false}, + "invalid key": {peerInfo.NodeID, peerInfo, selfKey.PubKey(), nil, false}, + "unexpected node ID": {peerInfo.NodeID, selfInfo, selfKey.PubKey(), nil, false}, + "dial error": {peerInfo.NodeID, peerInfo, peerKey.PubKey(), errors.New("boom"), false}, + } + for name, tc := range testcases { + tc := tc + t.Run(name, func(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + address := p2p.NodeAddress{Protocol: "mock", NodeID: tc.dialID} + endpoint := p2p.Endpoint{Protocol: "mock", Path: string(tc.dialID)} + + // Set up a mock transport that handshakes. + closer := tmsync.NewCloser() + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + if tc.dialErr == nil { + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(tc.peerInfo, tc.peerKey, nil) + mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil) + } + if tc.ok { + mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF) + } + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Maybe().Return(nil, io.EOF) + if tc.dialErr == nil { + mockTransport.On("Dial", mock.Anything, endpoint).Once().Return(mockConnection, nil) + // This handles the retry when a dialed connection gets closed after ReceiveMessage + // returns io.EOF above. + mockTransport.On("Dial", mock.Anything, endpoint).Maybe().Return(nil, io.EOF) + } else { + mockTransport.On("Dial", mock.Anything, endpoint).Once(). + Run(func(_ mock.Arguments) { closer.Close() }). + Return(nil, tc.dialErr) + } + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + require.NoError(t, peerManager.Add(address)) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + if tc.ok { + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: tc.peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + sub.Close() + } else { + select { + case <-closer.Done(): + case <-time.After(100 * time.Millisecond): + require.Fail(t, "connection not closed") + } + } + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) + }) + } +} + +func TestRouter_DialPeers_Parallel(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + a := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + // Set up a mock transport that returns a connection that blocks during the + // handshake. It should dial all peers in parallel. + dialCh := make(chan bool, 3) + closeCh := make(chan time.Time) + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF) + mockConnection.On("Close").Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + for _, address := range []p2p.NodeAddress{a, b, c} { + endpoint := p2p.Endpoint{Protocol: address.Protocol, Path: string(address.NodeID)} + mockTransport.On("Dial", mock.Anything, endpoint).Run(func(_ mock.Arguments) { + dialCh <- true + }).Return(mockConnection, nil) + } + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Add(c)) + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + require.Eventually(t, func() bool { + return len(dialCh) == 3 + }, time.Second, 10*time.Millisecond) + close(closeCh) + time.Sleep(100 * time.Millisecond) + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) +} + +func TestRouter_EvictPeers(t *testing.T) { + t.Cleanup(leaktest.Check(t)) + + // Set up a mock transport that we can evict. + closeCh := make(chan time.Time) + closeOnce := sync.Once{} + + mockConnection := &mocks.Connection{} + mockConnection.On("String").Maybe().Return("mock") + mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey). + Return(peerInfo, peerKey.PubKey(), nil) + mockConnection.On("ReceiveMessage").WaitUntil(closeCh).Return(chID, nil, io.EOF) + mockConnection.On("Close").Run(func(_ mock.Arguments) { + closeOnce.Do(func() { + close(closeCh) + }) + }).Return(nil) + + mockTransport := &mocks.Transport{} + mockTransport.On("String").Maybe().Return("mock") + mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"}) + mockTransport.On("Close").Return(nil) + mockTransport.On("Accept").Once().Return(mockConnection, nil) + mockTransport.On("Accept").Once().Return(nil, io.EOF) + + // Set up and start the router. + peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + sub := peerManager.Subscribe() + defer sub.Close() + + router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, + []p2p.Transport{mockTransport}, p2p.RouterOptions{}) + require.NoError(t, err) + require.NoError(t, router.Start()) + + // Wait for the mock peer to connect, then evict it by reporting an error. + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: peerInfo.NodeID, + Status: p2p.PeerStatusUp, + }) + + require.NoError(t, peerManager.Errored(peerInfo.NodeID, errors.New("boom"))) + + p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{ + NodeID: peerInfo.NodeID, + Status: p2p.PeerStatusDown, + }) + sub.Close() + + require.NoError(t, router.Stop()) + mockTransport.AssertExpectations(t) + mockConnection.AssertExpectations(t) } diff --git a/p2p/shim.go b/p2p/shim.go index 266fc0f1f..a71a1df54 100644 --- a/p2p/shim.go +++ b/p2p/shim.go @@ -29,7 +29,7 @@ type ( BaseReactor Name string - PeerUpdates *PeerUpdatesCh + PeerUpdates *PeerUpdates Channels map[ChannelID]*ChannelShim } @@ -39,6 +39,9 @@ type ( ChannelShim struct { Descriptor *ChannelDescriptor Channel *Channel + inCh chan<- Envelope + outCh <-chan Envelope + errCh <-chan PeerError } // ChannelDescriptorShim defines a shim wrapper around a legacy p2p channel @@ -56,7 +59,7 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C for _, cds := range descriptors { chShim := NewChannelShim(cds, 0) - channels[chShim.Channel.id] = chShim + channels[chShim.Channel.ID] = chShim } rs := &ReactorShim{ @@ -72,15 +75,21 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C } func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim { + inCh := make(chan Envelope, buf) + outCh := make(chan Envelope, buf) + errCh := make(chan PeerError, buf) return &ChannelShim{ Descriptor: cds.Descriptor, Channel: NewChannel( ChannelID(cds.Descriptor.ID), cds.MsgType, - make(chan Envelope, buf), - make(chan Envelope, buf), - make(chan PeerError, buf), + inCh, + outCh, + errCh, ), + inCh: inCh, + outCh: outCh, + errCh: errCh, } } @@ -91,7 +100,7 @@ func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim { func (rs *ReactorShim) proxyPeerEnvelopes() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { - for e := range cs.Channel.outCh { + for e := range cs.outCh { msg := proto.Clone(cs.Channel.messageType) msg.Reset() @@ -161,11 +170,11 @@ func (rs *ReactorShim) proxyPeerEnvelopes() { func (rs *ReactorShim) handlePeerErrors() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { - for pErr := range cs.Channel.errCh { - if pErr.PeerID != "" { - peer := rs.Switch.peers.Get(pErr.PeerID) + for pErr := range cs.errCh { + if pErr.NodeID != "" { + peer := rs.Switch.peers.Get(pErr.NodeID) if peer == nil { - rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.PeerID) + rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.NodeID) continue } @@ -225,7 +234,7 @@ func (rs *ReactorShim) GetChannels() []*ChannelDescriptor { // handle adding a peer. func (rs *ReactorShim) AddPeer(peer Peer) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusUp}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusUp}: rs.Logger.Debug("sent peer update", "reactor", rs.Name, "peer", peer.ID(), "status", PeerStatusUp) case <-rs.PeerUpdates.Done(): @@ -244,7 +253,7 @@ func (rs *ReactorShim) AddPeer(peer Peer) { // handle removing a peer. func (rs *ReactorShim) RemovePeer(peer Peer, reason interface{}) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusDown}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusDown}: rs.Logger.Debug( "sent peer update", "reactor", rs.Name, @@ -311,7 +320,7 @@ func (rs *ReactorShim) Receive(chID byte, src Peer, msgBytes []byte) { } select { - case channelShim.Channel.inCh <- Envelope{From: src.ID(), Message: msg}: + case channelShim.inCh <- Envelope{From: src.ID(), Message: msg}: rs.Logger.Debug("proxied envelope", "reactor", rs.Name, "ch_id", cID, "peer", src.ID()) case <-channelShim.Channel.Done(): diff --git a/p2p/shim_test.go b/p2p/shim_test.go index f5b84a490..b91af366d 100644 --- a/p2p/shim_test.go +++ b/p2p/shim_test.go @@ -92,7 +92,7 @@ func TestReactorShim_GetChannel(t *testing.T) { p2pCh := rts.shim.GetChannel(p2p.ChannelID(channelID1)) require.NotNil(t, p2pCh) - require.Equal(t, p2pCh.ID(), p2p.ChannelID(channelID1)) + require.Equal(t, p2pCh.ID, p2p.ChannelID(channelID1)) p2pCh = rts.shim.GetChannel(p2p.ChannelID(byte(0x03))) require.Nil(t, p2pCh) @@ -123,7 +123,7 @@ func TestReactorShim_AddPeer(t *testing.T) { rts.shim.AddPeer(peerA) wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusUp, peerUpdate.Status) } @@ -143,7 +143,7 @@ func TestReactorShim_RemovePeer(t *testing.T) { rts.shim.RemovePeer(peerA, "test reason") wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusDown, peerUpdate.Status) } @@ -178,11 +178,11 @@ func TestReactorShim_Receive(t *testing.T) { // Simulate receiving the envelope in some real reactor and replying back with // the same envelope and then closing the Channel. go func() { - e := <-p2pCh.Channel.In() + e := <-p2pCh.Channel.In require.Equal(t, peerIDA, e.From) require.NotNil(t, e.Message) - p2pCh.Channel.Out() <- p2p.Envelope{To: e.From, Message: e.Message} + p2pCh.Channel.Out <- p2p.Envelope{To: e.From, Message: e.Message} p2pCh.Channel.Close() wg.Done() }() @@ -200,7 +200,7 @@ func TestReactorShim_Receive(t *testing.T) { // Since p2pCh was closed in the simulated reactor above, calling Receive // should not block. rts.shim.Receive(channelID1, peerA, bz) - require.Empty(t, p2pCh.Channel.In()) + require.Empty(t, p2pCh.Channel.In) peerA.AssertExpectations(t) } diff --git a/p2p/transport.go b/p2p/transport.go index dc01e5efc..ff782f966 100644 --- a/p2p/transport.go +++ b/p2p/transport.go @@ -10,6 +10,8 @@ import ( "github.com/tendermint/tendermint/p2p/conn" ) +//go:generate mockery --case underscore --name Transport|Connection + const ( // defaultProtocol is the default protocol used for NodeAddress when // a protocol isn't explicitly given as a URL scheme. diff --git a/p2p/transport_test.go b/p2p/transport_test.go index 56aa3d92b..657cb4b35 100644 --- a/p2p/transport_test.go +++ b/p2p/transport_test.go @@ -19,11 +19,8 @@ import ( // transportFactory is used to set up transports for tests. type transportFactory func(t *testing.T) p2p.Transport -var ( - ctx = context.Background() // convenience context - chID = p2p.ChannelID(1) // channel ID for use in tests - testTransports = map[string]transportFactory{} // registry for withTransports -) +// testTransports is a registry of transport factories for withTransports(). +var testTransports = map[string]transportFactory{} // withTransports is a test helper that runs a test against all transports // registered in testTransports. diff --git a/state/state.go b/state/state.go index f84ab84f4..20e78e420 100644 --- a/state/state.go +++ b/state/state.go @@ -16,11 +16,6 @@ import ( "github.com/tendermint/tendermint/version" ) -// database keys -var ( - stateKey = []byte("stateKey") -) - //----------------------------------------------------------------------------- type Version struct { diff --git a/state/store.go b/state/store.go index 58b722231..5278585a7 100644 --- a/state/store.go +++ b/state/store.go @@ -32,6 +32,7 @@ const ( prefixValidators = int64(5) prefixConsensusParams = int64(6) prefixABCIResponses = int64(7) + prefixState = int64(8) ) func encodeKey(prefix int64, height uint64) []byte { @@ -54,6 +55,17 @@ func abciResponsesKey(height uint64) []byte { return encodeKey(prefixABCIResponses, height) } +// stateKey should never change after being set in init() +var stateKey []byte + +func init() { + var err error + stateKey, err = orderedcode.Append(nil, prefixState) + if err != nil { + panic(err) + } +} + //---------------------- //go:generate mockery --case underscore --name Store @@ -239,11 +251,16 @@ func (store dbStore) PruneStates(retainHeight uint64) error { return fmt.Errorf("height %v must be greater than 0", retainHeight) } - if err := store.pruneValidatorSets(retainHeight); err != nil { + // NOTE: We need to prune consensus params first because the validator + // sets have always one extra height. If validator sets were pruned first + // we could get a situation where we prune up to the last validator set + // yet don't have the respective consensus params at that height and thus + // return an error + if err := store.pruneConsensusParams(retainHeight); err != nil { return err } - if err := store.pruneConsensusParams(retainHeight); err != nil { + if err := store.pruneValidatorSets(retainHeight); err != nil { return err } @@ -257,37 +274,48 @@ func (store dbStore) PruneStates(retainHeight uint64) error { // pruneValidatorSets calls a reverse iterator from base height to retain height (exclusive), deleting // all validator sets in between. Due to the fact that most validator sets stored reference an earlier // validator set, it is likely that there will remain one validator set left after pruning. -func (store dbStore) pruneValidatorSets(height uint64) error { - valInfo, err := loadValidatorsInfo(store.db, height) +func (store dbStore) pruneValidatorSets(retainHeight uint64) error { + valInfo, err := loadValidatorsInfo(store.db, retainHeight) if err != nil { - return fmt.Errorf("validators at height %v not found: %w", height, err) + return fmt.Errorf("validators at height %v not found: %w", retainHeight, err) } // We will prune up to the validator set at the given "height". As we don't save validator sets every // height but only when they change or at a check point, it is likely that the validator set at the height // we prune to is empty and thus dependent on the validator set saved at a previous height. We must find // that validator set and make sure it is not pruned. - lastRecordedValSetHeight := lastStoredHeightFor(height, valInfo.LastHeightChanged) + lastRecordedValSetHeight := lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged) lastRecordedValSet, err := loadValidatorsInfo(store.db, lastRecordedValSetHeight) if err != nil || lastRecordedValSet.ValidatorSet == nil { return fmt.Errorf("couldn't find validators at height %d (height %d was originally requested): %w", - lastStoredHeightFor(height, valInfo.LastHeightChanged), - height, + lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged), + retainHeight, err, ) } - // batch delete all the validators sets up to height - return store.batchDelete( + // if this is not equal to the retain height, prune from the retain height to the height above + // the last saved validator set. This way we can skip over the dependent validator set. + if lastRecordedValSetHeight < retainHeight { + err := store.pruneRange( + validatorsKey(lastRecordedValSetHeight+1), + validatorsKey(retainHeight), + ) + if err != nil { + return err + } + } + + // prune all the validators sets up to last saved validator set + return store.pruneRange( validatorsKey(1), - validatorsKey(height), validatorsKey(lastRecordedValSetHeight), ) } // pruneConsensusParams calls a reverse iterator from base height to retain height batch deleting // all consensus params in between. If the consensus params at the new base height is dependent -// on a prior height then this will keep that lower height to. +// on a prior height then this will keep that lower height too. func (store dbStore) pruneConsensusParams(retainHeight uint64) error { paramsInfo, err := store.loadConsensusParamsInfo(retainHeight) if err != nil { @@ -298,21 +326,31 @@ func (store dbStore) pruneConsensusParams(retainHeight uint64) error { // we must not prune (or save) the last consensus params that the consensus params info at height // is dependent on. if paramsInfo.ConsensusParams.Equal(&tmproto.ConsensusParams{}) { + // sanity check that the consensus params at the last height it was changed is there lastRecordedConsensusParams, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged) if err != nil || lastRecordedConsensusParams.ConsensusParams.Equal(&tmproto.ConsensusParams{}) { return fmt.Errorf( - "couldn't find consensus params at height %d as last changed from height %d: %w", + "couldn't find consensus params at height %d (height %d was originally requested): %w", paramsInfo.LastHeightChanged, retainHeight, err, ) } + + // prune the params above the height with which it last changed and below the retain height. + err = store.pruneRange( + consensusParamsKey(paramsInfo.LastHeightChanged+1), + consensusParamsKey(retainHeight), + ) + if err != nil { + return err + } } - // batch delete all the consensus params up to the retain height - return store.batchDelete( + // prune all the consensus params up to either the last height the params changed or if the params + // last changed at the retain height, then up to the retain height. + return store.pruneRange( consensusParamsKey(1), - consensusParamsKey(retainHeight), consensusParamsKey(paramsInfo.LastHeightChanged), ) } @@ -320,72 +358,69 @@ func (store dbStore) pruneConsensusParams(retainHeight uint64) error { // pruneABCIResponses calls a reverse iterator from base height to retain height batch deleting // all abci responses in between func (store dbStore) pruneABCIResponses(height uint64) error { - return store.batchDelete(abciResponsesKey(1), abciResponsesKey(height), nil) + return store.pruneRange(abciResponsesKey(1), abciResponsesKey(height)) } -// batchDelete is a generic function for deleting a range of keys in reverse order. It will -// skip keys that have been -func (store dbStore) batchDelete(start []byte, end []byte, exception []byte) error { - iter, err := store.db.ReverseIterator(start, end) - if err != nil { - return fmt.Errorf("iterator error: %w", err) - } - defer iter.Close() - +// pruneRange is a generic function for deleting a range of keys in reverse order. +// we keep filling up batches of at most 1000 keys, perform a deletion and continue until +// we have gone through all of keys in the range. This avoids doing any writes whilst +// iterating. +func (store dbStore) pruneRange(start []byte, end []byte) error { + var err error batch := store.db.NewBatch() defer batch.Close() - pruned := 0 - for iter.Valid() { - key := iter.Key() - if bytes.Equal(key, exception) { - iter.Next() - continue - } - - if err := batch.Delete(key); err != nil { - return fmt.Errorf("pruning error at key %X: %w", key, err) - } - - pruned++ - // avoid batches growing too large by flushing to disk regularly - if pruned%1000 == 0 { - if err := iter.Error(); err != nil { - return err - } - if err := iter.Close(); err != nil { - return err - } - - if err := batch.Write(); err != nil { - return fmt.Errorf("pruning error at key %X: %w", key, err) - } - if err := batch.Close(); err != nil { - return err - } - - iter, err = store.db.ReverseIterator(start, end) - if err != nil { - return fmt.Errorf("iterator error: %w", err) - } - defer iter.Close() - - batch = store.db.NewBatch() - defer batch.Close() - } else { - iter.Next() - } - } - - if err := iter.Error(); err != nil { - return fmt.Errorf("iterator error: %w", err) - } - - if err := batch.WriteSync(); err != nil { + end, err = store.reverseBatchDelete(batch, start, end) + if err != nil { return err } - return nil + // iterate until the last batch of the pruning range in which case we will perform a + // write sync + for !bytes.Equal(start, end) { + if err := batch.Write(); err != nil { + return err + } + + if err := batch.Close(); err != nil { + return err + } + + batch = store.db.NewBatch() + + // fill a new batch of keys for deletion over the remainding range + end, err = store.reverseBatchDelete(batch, start, end) + if err != nil { + return err + } + } + + return batch.WriteSync() +} + +// reverseBatchDelete runs a reverse iterator (from end to start) filling up a batch until either +// (a) the iterator reaches the start or (b) the iterator has added a 1000 keys (this avoids the +// batch from growing too large) +func (store dbStore) reverseBatchDelete(batch dbm.Batch, start, end []byte) ([]byte, error) { + iter, err := store.db.ReverseIterator(start, end) + if err != nil { + return end, fmt.Errorf("iterator error: %w", err) + } + defer iter.Close() + + size := 0 + for ; iter.Valid(); iter.Next() { + if err := batch.Delete(iter.Key()); err != nil { + return end, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) + } + + // avoid batches growing too large by capping them + size++ + if size == 1000 { + return iter.Key(), iter.Error() + } + } + return start, iter.Error() } //------------------------------------------------------------------------ @@ -584,7 +619,7 @@ func (store dbStore) LoadConsensusParams(height uint64) (types.ConsensusParams, paramsInfo2, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged) if err != nil { return empty, fmt.Errorf( - "couldn't find consensus params at height %d as last changed from height %d: %w", + "couldn't find consensus params at height %d (height %d was originally requested): %w", paramsInfo.LastHeightChanged, height, err, diff --git a/state/store_test.go b/state/store_test.go index b29bf8be9..a8a5f249b 100644 --- a/state/store_test.go +++ b/state/store_test.go @@ -159,25 +159,27 @@ func TestStoreLoadConsensusParams(t *testing.T) { func TestPruneStates(t *testing.T) { testcases := map[string]struct { - makeHeights uint64 - pruneHeight uint64 - expectErr bool - expectVals []int64 - expectParams []int64 - expectABCI []int64 + startHeight uint64 + endHeight uint64 + pruneHeight uint64 + expectErr bool + remainingValSetHeight uint64 + remainingParamsHeight uint64 }{ - "error when prune height is 0": {100, 0, true, nil, nil, nil}, - "error when prune height does not exist": {100, 101, true, nil, nil, nil}, - "prune all": {100, 100, false, []int64{93, 100}, []int64{95, 100}, []int64{100}}, - "prune some": {10, 8, false, []int64{3, 8, 9, 10}, - []int64{5, 8, 9, 10}, []int64{8, 9, 10}}, - "prune across checkpoint": {100002, 100002, false, []int64{100000, 100002}, - []int64{99995, 100002}, []int64{100002}}, + "error when prune height is 0": {1, 100, 0, true, 0, 0}, + "error when prune height does not exist": {1, 100, 101, true, 0, 0}, + "prune all": {1, 100, 100, false, 93, 95}, + "prune from non 1 height": {10, 50, 40, false, 33, 35}, + "prune some": {1, 10, 8, false, 3, 5}, + // we test this because we flush to disk every 1000 "states" + "prune more than 1000 state": {1, 1010, 1010, false, 1003, 1005}, + "prune across checkpoint": {99900, 100002, 100002, false, 100000, 99995}, } for name, tc := range testcases { tc := tc t.Run(name, func(t *testing.T) { db := dbm.NewMemDB() + stateStore := sm.NewStore(db) pk := ed25519.GenPrivKey().PubKey() @@ -191,7 +193,7 @@ func TestPruneStates(t *testing.T) { valsChanged := uint64(0) paramsChanged := uint64(0) - for h := uint64(1); h <= tc.makeHeights; h++ { + for h := tc.startHeight; h <= tc.endHeight; h++ { if valsChanged == 0 || h%10 == 2 { valsChanged = h + 1 // Have to add 1, since NextValidators is what's stored } @@ -236,36 +238,44 @@ func TestPruneStates(t *testing.T) { } require.NoError(t, err) - expectVals := sliceToMap(tc.expectVals) - expectParams := sliceToMap(tc.expectParams) - expectABCI := sliceToMap(tc.expectABCI) - - for h := uint64(1); h <= tc.makeHeights; h++ { + for h := tc.pruneHeight; h <= tc.endHeight; h++ { vals, err := stateStore.LoadValidators(h) - if expectVals[int64(h)] { - require.NoError(t, err, "validators height %v", h) - require.NotNil(t, vals) + require.NoError(t, err, h) + require.NotNil(t, vals, h) + + params, err := stateStore.LoadConsensusParams(h) + require.NoError(t, err, h) + require.NotNil(t, params, h) + + abci, err := stateStore.LoadABCIResponses(h) + require.NoError(t, err, h) + require.NotNil(t, abci, h) + } + + emptyParams := types.ConsensusParams{} + + for h := tc.startHeight; h < tc.pruneHeight; h++ { + vals, err := stateStore.LoadValidators(h) + if h == tc.remainingValSetHeight { + require.NoError(t, err, h) + require.NotNil(t, vals, h) } else { - require.Error(t, err, "validators height %v", h) - require.Equal(t, sm.ErrNoValSetForHeight{Height: h}, err) + require.Error(t, err, h) + require.Nil(t, vals, h) } params, err := stateStore.LoadConsensusParams(h) - if expectParams[int64(h)] { - require.NoError(t, err, "params height %v", h) - require.False(t, params.Equals(&types.ConsensusParams{}), "params should not be empty") + if h == tc.remainingParamsHeight { + require.NoError(t, err, h) + require.NotEqual(t, emptyParams, params, h) } else { - require.Error(t, err, "params height %v", h) + require.Error(t, err, h) + require.Equal(t, emptyParams, params, h) } abci, err := stateStore.LoadABCIResponses(h) - if expectABCI[int64(h)] { - require.NoError(t, err, "abci height %v", h) - require.NotNil(t, abci) - } else { - require.Error(t, err, "abci height %v", h) - require.Equal(t, sm.ErrNoABCIResponsesForHeight{Height: h}, err) - } + require.Error(t, err, h) + require.Nil(t, abci, h) } }) } @@ -292,11 +302,3 @@ func TestABCIResponsesResultsHash(t *testing.T) { require.NoError(t, err) assert.NoError(t, proof.Verify(root, bz)) } - -func sliceToMap(s []int64) map[int64]bool { - m := make(map[int64]bool, len(s)) - for _, i := range s { - m[i] = true - } - return m -} diff --git a/statesync/reactor.go b/statesync/reactor.go index 30cf5c09e..a7e536b0f 100644 --- a/statesync/reactor.go +++ b/statesync/reactor.go @@ -78,7 +78,7 @@ type Reactor struct { tempDir string snapshotCh *p2p.Channel chunkCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // This will only be set when a state sync is in progress. It is used to feed @@ -96,7 +96,7 @@ func NewReactor( conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery, snapshotCh, chunkCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, tempDir string, ) *Reactor { r := &Reactor{ @@ -170,7 +170,7 @@ func (r *Reactor) handleSnapshotMessage(envelope p2p.Envelope) error { "height", snapshot.Height, "format", snapshot.Format, ) - r.snapshotCh.Out() <- p2p.Envelope{ + r.snapshotCh.Out <- p2p.Envelope{ To: envelope.From, Message: &ssproto.SnapshotsResponse{ Height: snapshot.Height, @@ -254,7 +254,7 @@ func (r *Reactor) handleChunkMessage(envelope p2p.Envelope) error { "chunk", msg.Index, "peer", envelope.From, ) - r.chunkCh.Out() <- p2p.Envelope{ + r.chunkCh.Out <- p2p.Envelope{ To: envelope.From, Message: &ssproto.ChunkResponse{ Height: msg.Height, @@ -343,13 +343,12 @@ func (r *Reactor) processSnapshotCh() { for { select { - case envelope := <-r.snapshotCh.In(): - if err := r.handleMessage(r.snapshotCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID(), "envelope", envelope, "err", err) - r.snapshotCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.snapshotCh.In: + if err := r.handleMessage(r.snapshotCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID, "envelope", envelope, "err", err) + r.snapshotCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -370,13 +369,12 @@ func (r *Reactor) processChunkCh() { for { select { - case envelope := <-r.chunkCh.In(): - if err := r.handleMessage(r.chunkCh.ID(), envelope); err != nil { - r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID(), "envelope", envelope, "err", err) - r.chunkCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + case envelope := <-r.chunkCh.In: + if err := r.handleMessage(r.chunkCh.ID, envelope); err != nil { + r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID, "envelope", envelope, "err", err) + r.chunkCh.Error <- p2p.PeerError{ + NodeID: envelope.From, + Err: err, } } @@ -390,18 +388,18 @@ func (r *Reactor) processChunkCh() { // processPeerUpdate processes a PeerUpdate, returning an error upon failing to // handle the PeerUpdate or if a panic is recovered. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.RLock() defer r.mtx.RUnlock() if r.syncer != nil { switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: - r.syncer.AddPeer(peerUpdate.PeerID) + case p2p.PeerStatusUp: + r.syncer.AddPeer(peerUpdate.NodeID) - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.syncer.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.syncer.RemovePeer(peerUpdate.NodeID) } } } @@ -472,12 +470,12 @@ func (r *Reactor) Sync(stateProvider StateProvider, discoveryTime time.Duration) return sm.State{}, nil, errors.New("a state sync is already in progress") } - r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out(), r.chunkCh.Out(), r.tempDir) + r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out, r.chunkCh.Out, r.tempDir) r.mtx.Unlock() // request snapshots from all currently connected peers r.Logger.Debug("requesting snapshots from known peers") - r.snapshotCh.Out() <- p2p.Envelope{ + r.snapshotCh.Out <- p2p.Envelope{ Broadcast: true, Message: &ssproto.SnapshotsRequest{}, } diff --git a/statesync/reactor_test.go b/statesync/reactor_test.go index 6dfad5edb..0760c1e54 100644 --- a/statesync/reactor_test.go +++ b/statesync/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { chunkOutCh chan p2p.Envelope chunkPeerErrCh chan p2p.PeerError - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -127,7 +127,7 @@ func TestReactor_ChunkRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.chunkOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_ChunkRequest(t *testing.T) { @@ -198,7 +198,7 @@ func TestReactor_SnapshotsRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.snapshotOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_SnapshotsRequest(t *testing.T) { diff --git a/store/store.go b/store/store.go index b75365646..d2b4a85fc 100644 --- a/store/store.go +++ b/store/store.go @@ -1,6 +1,7 @@ package store import ( + "bytes" "fmt" "strconv" @@ -315,99 +316,111 @@ func (bs *BlockStore) PruneBlocks(height uint64) (uint64, error) { // remove block meta first as this is used to indicate whether the block exists. // For this reason, we also use ony block meta as a measure of the amount of blocks pruned - pruned, err := bs.batchDelete(blockMetaKey(0), blockMetaKey(height), removeBlockHash) + pruned, err := bs.pruneRange(blockMetaKey(0), blockMetaKey(height), removeBlockHash) if err != nil { return pruned, err } - if _, err := bs.batchDelete(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil { + if _, err := bs.pruneRange(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil { return pruned, err } - if _, err := bs.batchDelete(blockCommitKey(0), blockCommitKey(height), nil); err != nil { + if _, err := bs.pruneRange(blockCommitKey(0), blockCommitKey(height), nil); err != nil { return pruned, err } - if _, err := bs.batchDelete(seenCommitKey(0), seenCommitKey(height), nil); err != nil { + if _, err := bs.pruneRange(seenCommitKey(0), seenCommitKey(height), nil); err != nil { return pruned, err } return pruned, nil } -// batchDelete is a generic function for deleting a range of values based on the lowest +// pruneRange is a generic function for deleting a range of values based on the lowest // height up to but excluding retainHeight. For each key/value pair, an optional hook can be -// executed before the deletion itself is made -func (bs *BlockStore) batchDelete( +// executed before the deletion itself is made. pruneRange will use batch delete to delete +// keys in batches of at most 1000 keys. +func (bs *BlockStore) pruneRange( start []byte, end []byte, preDeletionHook func(key, value []byte, batch dbm.Batch) error, ) (uint64, error) { - iter, err := bs.db.Iterator(start, end) - if err != nil { - panic(err) - } - defer iter.Close() + var ( + err error + pruned uint64 + totalPruned uint64 = 0 + ) batch := bs.db.NewBatch() defer batch.Close() - pruned := uint64(0) - flushed := pruned - for iter.Valid() { + pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook) + if err != nil { + return totalPruned, err + } + + // loop until we have finished iterating over all the keys by writing, opening a new batch + // and incrementing through the next range of keys. + for !bytes.Equal(start, end) { + if err := batch.Write(); err != nil { + return totalPruned, err + } + + totalPruned += pruned + + if err := batch.Close(); err != nil { + return totalPruned, err + } + + batch = bs.db.NewBatch() + + pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook) + if err != nil { + return totalPruned, err + } + } + + // once we looped over all keys we do a final flush to disk + if err := batch.WriteSync(); err != nil { + return totalPruned, err + } + totalPruned += pruned + return totalPruned, nil +} + +// batchDelete runs an iterator over a set of keys, first preforming a pre deletion hook before adding it to the batch. +// The function ends when either 1000 keys have been added to the batch or the iterator has reached the end. +func (bs *BlockStore) batchDelete( + batch dbm.Batch, + start, end []byte, + preDeletionHook func(key, value []byte, batch dbm.Batch) error, +) (uint64, []byte, error) { + var pruned uint64 = 0 + iter, err := bs.db.Iterator(start, end) + if err != nil { + return pruned, start, err + } + defer iter.Close() + + for ; iter.Valid(); iter.Next() { key := iter.Key() if preDeletionHook != nil { if err := preDeletionHook(key, iter.Value(), batch); err != nil { - return flushed, err + return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) } } if err := batch.Delete(key); err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) + return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) } pruned++ - // avoid batches growing too large by flushing to database regularly - if pruned%1000 == 0 { - if err := iter.Error(); err != nil { - return flushed, err - } - if err := iter.Close(); err != nil { - return flushed, err - } - - err := batch.Write() - if err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) - } - if err := batch.Close(); err != nil { - return flushed, err - } - flushed = pruned - - iter, err = bs.db.Iterator(start, end) - if err != nil { - panic(err) - } - defer iter.Close() - - batch = bs.db.NewBatch() - defer batch.Close() - } else { - iter.Next() + if pruned == 1000 { + return pruned, iter.Key(), iter.Error() } } - flushed = pruned - if err := iter.Error(); err != nil { - return flushed, err - } - err = batch.WriteSync() - if err != nil { - return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err) - } - - return flushed, nil + return pruned, end, iter.Error() } // SaveBlock persists the given block, blockParts, and seenCommit to the underlying db. diff --git a/test/e2e/tests/validator_test.go b/test/e2e/tests/validator_test.go index f358296f2..847a8d388 100644 --- a/test/e2e/tests/validator_test.go +++ b/test/e2e/tests/validator_test.go @@ -77,8 +77,9 @@ func TestValidator_Propose(t *testing.T) { require.False(t, proposeCount == 0 && expectCount > 0, "node did not propose any blocks (expected %v)", expectCount) - require.Less(t, expectCount-proposeCount, 5, - "validator missed proposing too many blocks (proposed %v out of %v)", proposeCount, expectCount) + if expectCount > 5 { + require.GreaterOrEqual(t, proposeCount, 3, "validator didn't propose even 3 blocks") + } }) } @@ -116,8 +117,9 @@ func TestValidator_Sign(t *testing.T) { require.False(t, signCount == 0 && expectCount > 0, "validator did not sign any blocks (expected %v)", expectCount) - require.Less(t, float64(expectCount-signCount)/float64(expectCount), 0.33, - "validator missed signing too many blocks (signed %v out of %v)", signCount, expectCount) + if expectCount > 7 { + require.GreaterOrEqual(t, signCount, 3, "validator didn't sign even 3 blocks (expected %v)", expectCount) + } }) } diff --git a/test/maverick/consensus/reactor.go b/test/maverick/consensus/reactor.go index 22f822895..b5e5f78c2 100644 --- a/test/maverick/consensus/reactor.go +++ b/test/maverick/consensus/reactor.go @@ -981,7 +981,7 @@ func (ps *PeerState) ToJSON() ([]byte, error) { // GetHeight returns an atomic snapshot of the PeerRoundState's height // used by the mempool to ensure peers are caught up before broadcasting new txs -func (ps *PeerState) GetHeight() int64 { +func (ps *PeerState) GetHeight() uint64 { ps.mtx.Lock() defer ps.mtx.Unlock() return ps.PRS.Height diff --git a/test/maverick/node/node.go b/test/maverick/node/node.go index 4ad921e9e..d094e880a 100644 --- a/test/maverick/node/node.go +++ b/test/maverick/node/node.go @@ -791,7 +791,7 @@ func NewNode(config *cfg.Config, // TODO: Fetch and provide real options and do proper p2p bootstrapping. // TODO: Use a persistent peer database. - peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{}) if err != nil { return nil, err }