diff --git a/blockchain/v0/reactor.go b/blockchain/v0/reactor.go index 7dd4d4ea3..583bb6766 100644 --- a/blockchain/v0/reactor.go +++ b/blockchain/v0/reactor.go @@ -84,7 +84,7 @@ type Reactor struct { fastSync bool blockchainCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} requestsCh <-chan BlockRequest @@ -104,7 +104,7 @@ func NewReactor( store *store.BlockStore, consReactor consensusReactor, blockchainCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, fastSync bool, ) (*Reactor, error) { if state.LastBlockHeight != store.Height() { @@ -288,9 +288,8 @@ func (r *Reactor) processBlockchainCh() { if err := r.handleMessage(r.blockchainCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID(), "envelope", envelope, "err", err) r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -303,26 +302,26 @@ func (r *Reactor) processBlockchainCh() { // processPeerUpdate processes a PeerUpdate. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) // XXX: Pool#RedoRequest can sometimes give us an empty peer. - if len(peerUpdate.PeerID) == 0 { + if len(peerUpdate.NodeID) == 0 { return } switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: + case p2p.PeerStatusUp: // send a status update the newly added peer r.blockchainCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + To: peerUpdate.NodeID, Message: &bcproto.StatusResponse{ Base: r.store.Base(), Height: r.store.Height(), }, } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.pool.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.pool.RemovePeer(peerUpdate.NodeID) } } @@ -384,9 +383,8 @@ func (r *Reactor) requestRoutine() { case pErr := <-r.errorsCh: r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: pErr.peerID, - Err: pErr.err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: pErr.peerID, + Err: pErr.err, } case <-statusUpdateTicker.C: @@ -525,17 +523,15 @@ FOR_LOOP: // to clean up the rest. 
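A minimal sketch of how a reactor uses the reshaped types after this change: PeerError now carries only the node ID and the error (no Severity), and peer updates are consumed from a PeerUpdates subscription with just the Up and Down statuses. The helper names below are hypothetical; only identifiers visible in this diff are assumed.

```go
package main

import (
	"log"

	"github.com/tendermint/tendermint/p2p"
)

// reportPeerError is a hypothetical helper showing the new error shape: no
// Severity field, just the misbehaving node ID and the error itself.
func reportPeerError(ch *p2p.Channel, nodeID p2p.NodeID, err error) {
	ch.Error() <- p2p.PeerError{
		NodeID: nodeID,
		Err:    err,
	}
}

// watchPeers is a hypothetical consumer of a PeerUpdates subscription; only
// PeerStatusUp and PeerStatusDown remain after this change.
func watchPeers(peerUpdates *p2p.PeerUpdates) {
	for {
		select {
		case peerUpdate := <-peerUpdates.Updates():
			switch peerUpdate.Status {
			case p2p.PeerStatusUp:
				log.Printf("peer %v is up", peerUpdate.NodeID)
			case p2p.PeerStatusDown:
				log.Printf("peer %v is down", peerUpdate.NodeID)
			}

		case <-peerUpdates.Done():
			return
		}
	}
}

func main() {
	peerUpdates := p2p.NewPeerUpdates(make(chan p2p.PeerUpdate, 1))
	go watchPeers(peerUpdates)
	peerUpdates.Close() // terminates watchPeers via Done()
}
```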
peerID := r.pool.RedoRequest(first.Height) r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: peerID, + Err: err, } peerID2 := r.pool.RedoRequest(second.Height) if peerID2 != peerID { r.blockchainCh.Error() <- p2p.PeerError{ - PeerID: peerID2, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: peerID2, + Err: err, } } diff --git a/blockchain/v0/reactor_test.go b/blockchain/v0/reactor_test.go index 372c689fb..9d955319a 100644 --- a/blockchain/v0/reactor_test.go +++ b/blockchain/v0/reactor_test.go @@ -36,7 +36,7 @@ type reactorTestSuite struct { blockchainPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -200,8 +200,8 @@ func simulateRouter(primary *reactorTestSuite, suites []*reactorTestSuite, dropC primary.reactor.Logger.Debug("dropped peer error", "err", pErr.Err) } else { primary.peerUpdatesCh <- p2p.PeerUpdate{ - PeerID: pErr.PeerID, - Status: p2p.PeerStatusRemoved, + NodeID: pErr.NodeID, + Status: p2p.PeerStatusDown, } } } @@ -229,7 +229,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -251,7 +251,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) { // deadlocks or race conditions within the context of poolRoutine. testSuites[1].peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: testSuites[0].peerID, + NodeID: testSuites[0].peerID, } } @@ -276,7 +276,7 @@ func TestReactor_NoBlockResponse(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -341,7 +341,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { if s.peerID != ss.peerID { s.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: ss.peerID, + NodeID: ss.peerID, } } } @@ -388,7 +388,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) { for _, s := range testSuites[:len(testSuites)-1] { newSuite.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: s.peerID, + NodeID: s.peerID, } } diff --git a/evidence/reactor.go b/evidence/reactor.go index 643d9915a..cfa9b8989 100644 --- a/evidence/reactor.go +++ b/evidence/reactor.go @@ -55,7 +55,7 @@ type Reactor struct { evpool *Pool eventBus *types.EventBus evidenceCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} peerWG sync.WaitGroup @@ -70,7 +70,7 @@ type Reactor struct { func NewReactor( logger log.Logger, evidenceCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, evpool *Pool, ) *Reactor { r := &Reactor{ @@ -196,9 +196,8 @@ func (r *Reactor) processEvidenceCh() { if err := r.handleMessage(r.evidenceCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID(), "envelope", envelope, "err", err) r.evidenceCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -221,7 +220,7 @@ func (r *Reactor) processEvidenceCh() { // // REF: https://github.com/tendermint/tendermint/issues/4727 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, 
"status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -240,21 +239,21 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast evidence to that peer. - _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - go r.broadcastEvidenceLoop(peerUpdate.PeerID, closer) + go r.broadcastEvidenceLoop(peerUpdate.NodeID, closer) } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: + case p2p.PeerStatusDown: // Check if we've started an evidence broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer evidence broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } diff --git a/evidence/reactor_test.go b/evidence/reactor_test.go index 0958b260f..4462b851a 100644 --- a/evidence/reactor_test.go +++ b/evidence/reactor_test.go @@ -42,7 +42,7 @@ type reactorTestSuite struct { evidencePeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, logger log.Logger, pool *evidence.Pool, chBuf uint) *reactorTestSuite { @@ -224,18 +224,18 @@ func TestReactorMultiDisconnect(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure "disconnecting" the secondary peer from the primary more than once // is handled gracefully. 
primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } @@ -276,7 +276,7 @@ func TestReactorBroadcastEvidence(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -327,7 +327,7 @@ func TestReactorBroadcastEvidence_Lagging(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -378,7 +378,7 @@ func TestReactorBroadcastEvidence_Pending(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -438,7 +438,7 @@ func TestReactorBroadcastEvidence_Committed(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // The secondary reactor should have received all the evidence ignoring the @@ -487,7 +487,7 @@ func TestReactorBroadcastEvidence_FullyConnected(t *testing.T) { if suiteI.peerID != suiteJ.peerID { suiteI.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suiteJ.peerID, + NodeID: suiteJ.peerID, } } } @@ -530,7 +530,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // add the secondary reactor as a peer to the primary reactor primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // have the secondary reactor receive only half the evidence @@ -539,7 +539,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) { // disconnect the peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Ensure the secondary only received half of the evidence before being diff --git a/libs/sync/waker.go b/libs/sync/waker.go new file mode 100644 index 000000000..0aff3ddf8 --- /dev/null +++ b/libs/sync/waker.go @@ -0,0 +1,30 @@ +package sync + +// Waker is used to wake up a sleeper when some event occurs. It debounces +// multiple wakeup calls occurring between each sleep, and wakeups are +// non-blocking to avoid having to coordinate goroutines. +type Waker struct { + wakeCh chan struct{} +} + +// NewWaker creates a new Waker. +func NewWaker() *Waker { + return &Waker{ + wakeCh: make(chan struct{}, 1), // buffer used for debouncing + } +} + +// Sleep returns a channel that blocks until Wake() is called. +func (w *Waker) Sleep() <-chan struct{} { + return w.wakeCh +} + +// Wake wakes up the sleeper. +func (w *Waker) Wake() { + // A non-blocking send with a size 1 buffer ensures that we never block, and + // that we queue up at most a single wakeup call between each Sleep(). 
+ select { + case w.wakeCh <- struct{}{}: + default: + } +} diff --git a/libs/sync/waker_test.go b/libs/sync/waker_test.go new file mode 100644 index 000000000..dfd337e0e --- /dev/null +++ b/libs/sync/waker_test.go @@ -0,0 +1,47 @@ +package sync_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + tmsync "github.com/tendermint/tendermint/libs/sync" +) + +func TestWaker(t *testing.T) { + + // A new waker should block when sleeping. + waker := tmsync.NewWaker() + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } + + // Wakeups should not block, and should cause the next sleeper to awaken. + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + // Multiple wakeups should only wake a single sleeper. + waker.Wake() + waker.Wake() + waker.Wake() + + select { + case <-waker.Sleep(): + default: + require.Fail(t, "expected wakeup, but sleeping instead") + } + + select { + case <-waker.Sleep(): + require.Fail(t, "unexpected wakeup") + default: + } +} diff --git a/mempool/reactor.go b/mempool/reactor.go index f6ae9dc9c..8b2a1c063 100644 --- a/mempool/reactor.go +++ b/mempool/reactor.go @@ -58,7 +58,7 @@ type Reactor struct { peerMgr PeerManager mempoolCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // peerWG is used to coordinate graceful termination of all peer broadcasting @@ -76,7 +76,7 @@ func NewReactor( peerMgr PeerManager, mempool *CListMempool, mempoolCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *Reactor { r := &Reactor{ @@ -225,9 +225,8 @@ func (r *Reactor) processMempoolCh() { if err := r.handleMessage(r.mempoolCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID(), "envelope", envelope, "err", err) r.mempoolCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -244,7 +243,7 @@ func (r *Reactor) processMempoolCh() { // removed peers, we remove the peer from the mempool peer ID set and signal to // stop the tx broadcasting goroutine. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() @@ -264,28 +263,28 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { // a new done channel so we can explicitly close the goroutine if the peer // is later removed, we increment the waitgroup so the reactor can stop // safely, and finally start the goroutine to broadcast txs to that peer. 
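The mempool reactor follows the same per-peer goroutine lifecycle as the evidence reactor above: on PeerStatusUp it registers a closer and starts a broadcast goroutine, and on PeerStatusDown it signals that goroutine to stop. Below is a simplified sketch of that pattern, substituting a plain channel for tmsync.Closer (whose full API is not shown in this diff); the type and method names are hypothetical.

```go
package main

import (
	"strings"
	"sync"

	"github.com/tendermint/tendermint/p2p"
)

// peerRoutines sketches the per-peer goroutine pattern: start one broadcast
// goroutine per PeerStatusUp, signal it to stop on PeerStatusDown.
type peerRoutines struct {
	mtx      sync.Mutex
	wg       sync.WaitGroup
	routines map[p2p.NodeID]chan struct{}
}

func newPeerRoutines() *peerRoutines {
	return &peerRoutines{routines: map[p2p.NodeID]chan struct{}{}}
}

func (p *peerRoutines) handle(peerUpdate p2p.PeerUpdate, broadcast func(p2p.NodeID, <-chan struct{})) {
	p.mtx.Lock()
	defer p.mtx.Unlock()

	switch peerUpdate.Status {
	case p2p.PeerStatusUp:
		if _, ok := p.routines[peerUpdate.NodeID]; !ok {
			done := make(chan struct{})
			p.routines[peerUpdate.NodeID] = done
			p.wg.Add(1)
			go func() {
				defer p.wg.Done()
				broadcast(peerUpdate.NodeID, done)
			}()
		}

	case p2p.PeerStatusDown:
		// Delete the entry when closing, so duplicate Down updates for the
		// same peer are no-ops rather than double closes.
		if done, ok := p.routines[peerUpdate.NodeID]; ok {
			close(done)
			delete(p.routines, peerUpdate.NodeID)
		}
	}
}

// wait blocks until all broadcast goroutines have returned, mirroring the
// reactor's use of peerWG during shutdown.
func (p *peerRoutines) wait() { p.wg.Wait() }

func main() {
	pr := newPeerRoutines()
	nodeID := p2p.NodeID(strings.Repeat("aa", 20)) // placeholder ID

	pr.handle(p2p.PeerUpdate{NodeID: nodeID, Status: p2p.PeerStatusUp}, func(id p2p.NodeID, done <-chan struct{}) {
		<-done // a real reactor would broadcast txs/evidence until stopped
	})

	// Duplicate Down updates are tolerated: the second one is a no-op.
	pr.handle(p2p.PeerUpdate{NodeID: nodeID, Status: p2p.PeerStatusDown}, nil)
	pr.handle(p2p.PeerUpdate{NodeID: nodeID, Status: p2p.PeerStatusDown}, nil)
	pr.wait()
}
```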
- _, ok := r.peerRoutines[peerUpdate.PeerID] + _, ok := r.peerRoutines[peerUpdate.NodeID] if !ok { closer := tmsync.NewCloser() - r.peerRoutines[peerUpdate.PeerID] = closer + r.peerRoutines[peerUpdate.NodeID] = closer r.peerWG.Add(1) - r.ids.ReserveForPeer(peerUpdate.PeerID) + r.ids.ReserveForPeer(peerUpdate.NodeID) // start a broadcast routine ensuring all txs are forwarded to the peer - go r.broadcastTxRoutine(peerUpdate.PeerID, closer) + go r.broadcastTxRoutine(peerUpdate.NodeID, closer) } } - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.ids.Reclaim(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.ids.Reclaim(peerUpdate.NodeID) // Check if we've started a tx broadcasting goroutine for this peer. // If we have, we signal to terminate the goroutine via the channel's closure. // This will internally decrement the peer waitgroup and remove the peer // from the map of peer tx broadcasting goroutines. - closer, ok := r.peerRoutines[peerUpdate.PeerID] + closer, ok := r.peerRoutines[peerUpdate.NodeID] if ok { closer.Close() } diff --git a/mempool/reactor_test.go b/mempool/reactor_test.go index d1163bf09..1f1b3be1e 100644 --- a/mempool/reactor_test.go +++ b/mempool/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { mempoolPeerErrCh chan p2p.PeerError peerUpdatesCh chan p2p.PeerUpdate - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup(t *testing.T, cfg *cfg.MempoolConfig, logger log.Logger, chBuf uint) *reactorTestSuite { @@ -189,7 +189,7 @@ func TestReactorBroadcastTxs(t *testing.T) { for _, suite := range secondaries { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: suite.peerID, + NodeID: suite.peerID, } } @@ -295,7 +295,7 @@ func TestReactorNoBroadcastToSender(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } time.Sleep(100 * time.Millisecond) @@ -360,7 +360,7 @@ func TestReactor_MaxTxBytes(t *testing.T) { primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // Wait till all secondary suites (reactor) received all mempool txs from the @@ -406,7 +406,7 @@ func TestDontExhaustMaxActiveIDs(t *testing.T) { for i := 0; i < maxActiveIDs+1; i++ { reactor.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: peerID, + NodeID: peerID, } reactor.mempoolOutCh <- p2p.Envelope{ To: peerID, @@ -466,12 +466,12 @@ func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) { // connect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusUp, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } // disconnect peer primary.peerUpdatesCh <- p2p.PeerUpdate{ Status: p2p.PeerStatusDown, - PeerID: secondary.peerID, + NodeID: secondary.peerID, } } diff --git a/p2p/address.go b/p2p/address.go index 8ff44515a..1e964b449 100644 --- a/p2p/address.go +++ b/p2p/address.go @@ -106,10 +106,9 @@ func ParseNodeAddress(urlString string) (NodeAddress, error) { Protocol: Protocol(strings.ToLower(url.Scheme)), } - // Opaque URLs are expected to contain only a node ID, also used as path. + // Opaque URLs are expected to contain only a node ID. 
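To illustrate the ParseNodeAddress change above: an opaque URL now yields only the protocol and node ID, and Path stays empty. A sketch assuming node IDs are 40-character hex strings; the literal ID is a placeholder.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/tendermint/tendermint/p2p"
)

func main() {
	// Placeholder node ID: 20 bytes, hex-encoded (40 characters).
	id := strings.Repeat("aa", 20)

	// After this change, an opaque URL such as "memory:<nodeID>" produces
	// only the protocol and node ID; the Path field is no longer populated.
	address, err := p2p.ParseNodeAddress("memory:" + id)
	if err != nil {
		panic(err)
	}
	fmt.Println(address.Protocol, address.NodeID, address.Path == "")
}
```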
if url.Opaque != "" { address.NodeID = NodeID(url.Opaque) - address.Path = url.Opaque return address, address.Validate() } diff --git a/p2p/address_test.go b/p2p/address_test.go index 83bfecf24..41b89eaee 100644 --- a/p2p/address_test.go +++ b/p2p/address_test.go @@ -158,7 +158,7 @@ func TestParseNodeAddress(t *testing.T) { }, { "memory:" + user, - p2p.NodeAddress{Protocol: "memory", NodeID: id, Path: user}, + p2p.NodeAddress{Protocol: "memory", NodeID: id}, true, }, diff --git a/p2p/channel.go b/p2p/channel.go index b2bcd156e..c13cd75d1 100644 --- a/p2p/channel.go +++ b/p2p/channel.go @@ -27,6 +27,21 @@ func (e Envelope) Strip() Envelope { return e } +// PeerError is a peer error reported via the Error channel. +// +// FIXME: This currently just disconnects the peer, which is too simplistic. +// For example, some errors should be logged, some should cause disconnects, +// and some should ban the peer. +// +// FIXME: This should probably be replaced by a more general PeerBehavior +// concept that can mark good and bad behavior and contributes to peer scoring. +// It should possibly also allow reactors to request explicit actions, e.g. +// disconnection or banning, in addition to doing this based on aggregates. +type PeerError struct { + NodeID NodeID + Err error +} + // Channel is a bidirectional channel for Protobuf message exchange with peers. // A Channel is safe for concurrent use by multiple goroutines. type Channel struct { diff --git a/p2p/peer.go b/p2p/peer.go index 07ffaa330..1aa923533 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -1,1175 +1,18 @@ package p2p import ( - "context" - "errors" "fmt" "io" - "math" - "math/rand" "net" "runtime/debug" - "sort" - "sync" "time" - "github.com/gogo/protobuf/proto" - "github.com/google/orderedcode" - dbm "github.com/tendermint/tm-db" - "github.com/tendermint/tendermint/libs/cmap" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/service" tmconn "github.com/tendermint/tendermint/p2p/conn" - p2pproto "github.com/tendermint/tendermint/proto/tendermint/p2p" ) -// PeerStatus specifies peer statuses. -type PeerStatus string - -const ( - PeerStatusNew = PeerStatus("new") // New peer which we haven't tried to contact yet. - PeerStatusUp = PeerStatus("up") // Peer which we have an active connection to. - PeerStatusDown = PeerStatus("down") // Peer which we're temporarily disconnected from. - PeerStatusRemoved = PeerStatus("removed") // Peer which has been removed. - PeerStatusBanned = PeerStatus("banned") // Peer which is banned for misbehavior. -) - -// PeerError is a peer error reported by a reactor via the Error channel. The -// severity may cause the peer to be disconnected or banned depending on policy. -type PeerError struct { - PeerID NodeID - Err error - Severity PeerErrorSeverity -} - -// PeerErrorSeverity determines the severity of a peer error. -type PeerErrorSeverity string - -const ( - PeerErrorSeverityLow PeerErrorSeverity = "low" // Mostly ignored. - PeerErrorSeverityHigh PeerErrorSeverity = "high" // May disconnect. - PeerErrorSeverityCritical PeerErrorSeverity = "critical" // Ban. -) - -// PeerUpdatesCh defines a wrapper around a PeerUpdate go channel that allows -// a reactor to listen for peer updates and safely close it when stopping. -type PeerUpdatesCh struct { - closeOnce sync.Once - - // updatesCh defines the go channel in which the router sends peer updates to - // reactors. Each reactor will have its own PeerUpdatesCh to listen for updates - // from. 
- updatesCh chan PeerUpdate - - // doneCh is used to signal that a PeerUpdatesCh is closed. It is the - // reactor's responsibility to invoke Close. - doneCh chan struct{} -} - -// NewPeerUpdates returns a reference to a new PeerUpdatesCh. -func NewPeerUpdates(updatesCh chan PeerUpdate) *PeerUpdatesCh { - return &PeerUpdatesCh{ - updatesCh: updatesCh, - doneCh: make(chan struct{}), - } -} - -// Updates returns a read-only go channel where a consuming reactor can listen -// for peer updates sent from the router. -func (puc *PeerUpdatesCh) Updates() <-chan PeerUpdate { - return puc.updatesCh -} - -// Close closes the PeerUpdatesCh channel. It should only be closed by the respective -// reactor when stopping and ensure nothing is listening for updates. -// -// NOTE: After a PeerUpdatesCh is closed, the router may safely assume it can no -// longer send on the internal updatesCh, however it should NEVER explicitly close -// it as that could result in panics by sending on a closed channel. -func (puc *PeerUpdatesCh) Close() { - puc.closeOnce.Do(func() { - close(puc.doneCh) - }) -} - -// Done returns a read-only version of the PeerUpdatesCh's internal doneCh go -// channel that should be used by a router to signal when it is safe to explicitly -// not send any peer updates. -func (puc *PeerUpdatesCh) Done() <-chan struct{} { - return puc.doneCh -} - -// PeerUpdate is a peer status update for reactors. -type PeerUpdate struct { - PeerID NodeID - Status PeerStatus -} - -// PeerScore is a numeric score assigned to a peer (higher is better). -type PeerScore uint16 - -const ( - // PeerScorePersistent is added for persistent peers. - PeerScorePersistent PeerScore = 100 -) - -// PeerManager manages peer lifecycle information, using a peerStore for -// underlying storage. Its primary purpose is to determine which peers to -// connect to next, make sure a peer only has a single active connection (either -// inbound or outbound), and evict peers to make room for higher-scored peers. -// It does not manage actual connections (this is handled by the Router), -// only the peer lifecycle state. -// -// We track dialing and connected states independently. This allows us to accept -// an inbound connection from a peer while the router is also dialing an -// outbound connection to that same peer, which will cause the dialer to -// eventually error when attempting to mark the peer as connected. This also -// avoids race conditions where multiple goroutines may end up dialing a peer if -// an incoming connection was briefly accepted and disconnected while we were -// also dialing. -// -// For an outbound connection, the flow is as follows: -// - DialNext: returns a peer address to dial, marking the peer as dialing. -// - DialFailed: reports a dial failure, unmarking the peer as dialing. -// - Dialed: successfully dialed, unmarking as dialing and marking as connected -// (or erroring if already connected). -// - Ready: routing is up, broadcasts a PeerStatusUp peer update to subscribers. -// - Disconnected: peer disconnects, unmarking as connected and broadcasts a -// PeerStatusDown peer update. -// -// For an inbound connection, the flow is as follows: -// - Accepted: successfully accepted connection, marking as connected (or erroring -// if already connected). -// - Ready: routing is up, broadcasts a PeerStatusUp peer update to subscribers. -// - Disconnected: peer disconnects, unmarking as connected and broadcasts a -// PeerStatusDown peer update. 
-// -// When evicting peers, either because peers are explicitly scheduled for -// eviction or we are connected to too many peers, the flow is as follows: -// - EvictNext: if marked evict and connected, unmark evict and mark evicting. -// If beyond MaxConnected, pick lowest-scored peer and mark evicting. -// - Disconnected: unmark connected, evicting, evict, and broadcast a -// PeerStatusDown peer update. -// -// If all connection slots are full (at MaxConnections), we can use up to -// MaxConnectionsUpgrade additional connections to probe any higher-scored -// unconnected peers, and if we reach them (or they reach us) we allow the -// connection and evict a lower-scored peer. We mark the lower-scored peer as -// upgrading[from]=to to make sure no other higher-scored peers can claim the -// same one for an upgrade. The flow is as follows: -// - Accepted: if upgrade is possible, mark connected and add lower-scored to evict. -// - DialNext: if upgrade is possible, mark upgrading[from]=to and dialing. -// - DialFailed: unmark upgrading[from]=to and dialing. -// - Dialed: unmark upgrading[from]=to and dialing, mark as connected, add -// lower-scored to evict. -// - EvictNext: pick peer from evict, mark as evicting. -// - Disconnected: unmark connected, upgrading[from]=to, evict, evicting. -// -// FIXME: The old stack supports ABCI-based peer ID filtering via -// /p2p/filter/id/ queries, we should implement this here as well by taking -// a peer ID filtering callback in PeerManagerOptions and configuring it during -// Node setup. -type PeerManager struct { - options PeerManagerOptions - wakeDialCh chan struct{} // wakes up DialNext() on relevant peer changes - wakeEvictCh chan struct{} // wakes up EvictNext() on relevant peer changes - closeCh chan struct{} // signal channel for Close() - closeOnce sync.Once - - mtx sync.Mutex - store *peerStore - dialing map[NodeID]bool // peers being dialed (DialNext -> Dialed/DialFail) - upgrading map[NodeID]NodeID // peers claimed for upgrade (DialNext -> Dialed/DialFail) - connected map[NodeID]bool // connected peers (Dialed/Accepted -> Disconnected) - evict map[NodeID]bool // peers scheduled for eviction (Connected -> EvictNext) - evicting map[NodeID]bool // peers being evicted (EvictNext -> Disconnected) - subscriptions map[*PeerUpdatesCh]*PeerUpdatesCh // keyed by struct identity (address) -} - -// PeerManagerOptions specifies options for a PeerManager. -type PeerManagerOptions struct { - // PersistentPeers are peers that we want to maintain persistent connections - // to. These will be scored higher than other peers, and if - // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if - // necessary to make room for these. - PersistentPeers []NodeID - - // MaxPeers is the maximum number of peers to track information about, i.e. - // store in the peer store. When exceeded, the lowest-scored unconnected peers - // will be deleted. 0 means no limit. - MaxPeers uint16 - - // MaxConnected is the maximum number of connected peers (inbound and - // outbound). 0 means no limit. - MaxConnected uint16 - - // MaxConnectedUpgrade is the maximum number of additional connections to - // use for probing any better-scored peers to upgrade to when all connection - // slots are full. 0 disables peer upgrading. - // - // For example, if we are already connected to MaxConnected peers, but we - // know or learn about better-scored peers (e.g. 
configured persistent - // peers) that we are not connected too, then we can probe these peers by - // using up to MaxConnectedUpgrade connections, and once connected evict the - // lowest-scored connected peers. This also works for inbound connections, - // i.e. if a higher-scored peer attempts to connect to us, we can accept - // the connection and evict a lower-scored peer. - MaxConnectedUpgrade uint16 - - // MinRetryTime is the minimum time to wait between retries. Retry times - // double for each retry, up to MaxRetryTime. 0 disables retries. - MinRetryTime time.Duration - - // MaxRetryTime is the maximum time to wait between retries. 0 means - // no maximum, in which case the retry time will keep doubling. - MaxRetryTime time.Duration - - // MaxRetryTimePersistent is the maximum time to wait between retries for - // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. - MaxRetryTimePersistent time.Duration - - // RetryTimeJitter is the upper bound of a random interval added to - // retry times, to avoid thundering herds. 0 disables jutter. - RetryTimeJitter time.Duration -} - -// NewPeerManager creates a new peer manager. -func NewPeerManager(peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { - store, err := newPeerStore(peerDB) - if err != nil { - return nil, err - } - peerManager := &PeerManager{ - options: options, - closeCh: make(chan struct{}), - - // We use a buffer of size 1 for these trigger channels, with - // non-blocking sends. This ensures that if e.g. wakeDial() is called - // multiple times before the initial trigger is picked up we only - // process the trigger once. - // - // FIXME: This should maybe be a libs/sync type. - wakeDialCh: make(chan struct{}, 1), - wakeEvictCh: make(chan struct{}, 1), - - store: store, - dialing: map[NodeID]bool{}, - upgrading: map[NodeID]NodeID{}, - connected: map[NodeID]bool{}, - evict: map[NodeID]bool{}, - evicting: map[NodeID]bool{}, - subscriptions: map[*PeerUpdatesCh]*PeerUpdatesCh{}, - } - if err = peerManager.configurePeers(); err != nil { - return nil, err - } - if err = peerManager.prunePeers(); err != nil { - return nil, err - } - return peerManager, nil -} - -// configurePeers configures peers in the peer store with ephemeral runtime -// configuration, e.g. setting peerInfo.Persistent based on -// PeerManagerOptions.PersistentPeers. The caller must hold the mutex lock. -func (m *PeerManager) configurePeers() error { - for _, peerID := range m.options.PersistentPeers { - if peer, ok := m.store.Get(peerID); ok { - peer.Persistent = true - if err := m.store.Set(peer); err != nil { - return err - } - } - } - return nil -} - -// prunePeers removes peers from the peer store if it contains more than -// MaxPeers peers. The lowest-scored non-connected peers are removed. -// The caller must hold the mutex lock. -func (m *PeerManager) prunePeers() error { - if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { - return nil - } - m.mtx.Lock() - defer m.mtx.Unlock() - - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peerID := ranked[i].ID - switch { - case m.store.Size() <= int(m.options.MaxPeers): - break - case m.dialing[peerID]: - case m.connected[peerID]: - case m.evicting[peerID]: - default: - if err := m.store.Delete(peerID); err != nil { - return err - } - } - } - return nil -} - -// Close closes the peer manager, releasing resources allocated with it -// (specifically any running goroutines). 
-func (m *PeerManager) Close() { - m.closeOnce.Do(func() { - close(m.closeCh) - }) -} - -// Add adds a peer to the manager, given as an address. If the peer already -// exists, the address is added to it. -func (m *PeerManager) Add(address NodeAddress) error { - if err := address.Validate(); err != nil { - return err - } - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(address.NodeID) - if !ok { - peer = m.makePeerInfo(address.NodeID) - } - if _, ok := peer.AddressInfo[address.String()]; !ok { - peer.AddressInfo[address.String()] = &peerAddressInfo{Address: address} - } - if err := m.store.Set(peer); err != nil { - return err - } - if err := m.prunePeers(); err != nil { - return err - } - m.wakeDial() - return nil -} - -// Advertise returns a list of peer addresses to advertise to a peer. -// -// FIXME: This is fairly naïve and only returns the addresses of the -// highest-ranked peers. -func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { - m.mtx.Lock() - defer m.mtx.Unlock() - - addresses := make([]NodeAddress, 0, limit) - for _, peer := range m.store.Ranked() { - if peer.ID == peerID { - continue - } - for _, addressInfo := range peer.AddressInfo { - if len(addresses) >= int(limit) { - return addresses - } - addresses = append(addresses, addressInfo.Address) - } - } - return addresses -} - -// makePeerInfo creates a peerInfo for a new peer. -func (m *PeerManager) makePeerInfo(id NodeID) peerInfo { - isPersistent := false - for _, p := range m.options.PersistentPeers { - if id == p { - isPersistent = true - break - } - } - return peerInfo{ - ID: id, - Persistent: isPersistent, - AddressInfo: map[string]*peerAddressInfo{}, - } -} - -// Subscribe subscribes to peer updates. The caller must consume the peer -// updates in a timely fashion and close the subscription when done, since -// delivery is guaranteed and will block peer connection/disconnection -// otherwise. -func (m *PeerManager) Subscribe() *PeerUpdatesCh { - // FIXME: We may want to use a size 1 buffer here. When the router - // broadcasts a peer update it has to loop over all of the - // subscriptions, and we want to avoid blocking and waiting for a - // context switch before continuing to the next subscription. This also - // prevents tail latencies from compounding across updates. We also want - // to make sure the subscribers are reasonably in sync, so it should be - // kept at 1. However, this should be benchmarked first. - peerUpdates := NewPeerUpdates(make(chan PeerUpdate)) - m.mtx.Lock() - m.subscriptions[peerUpdates] = peerUpdates - m.mtx.Unlock() - - go func() { - <-peerUpdates.Done() - m.mtx.Lock() - delete(m.subscriptions, peerUpdates) - m.mtx.Unlock() - }() - return peerUpdates -} - -// broadcast broadcasts a peer update to all subscriptions. The caller must -// already hold the mutex lock. This means the mutex is held for the duration -// of the broadcast, which we want to make sure all subscriptions receive all -// updates in the same order. -// -// FIXME: Consider using more fine-grained mutexes here, and/or a channel to -// enforce ordering of updates. -func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { - for _, sub := range m.subscriptions { - select { - case sub.updatesCh <- peerUpdate: - case <-sub.doneCh: - } - } -} - -// DialNext finds an appropriate peer address to dial, and marks it as dialing. -// If no peer is found, or all connection slots are full, it blocks until one -// becomes available. 
The caller must call Dialed() or DialFailed() for the -// returned peer. The context can be used to cancel the call. -func (m *PeerManager) DialNext(ctx context.Context) (NodeID, NodeAddress, error) { - for { - id, address, err := m.TryDialNext() - if err != nil || id != "" { - return id, address, err - } - select { - case <-m.wakeDialCh: - case <-ctx.Done(): - return "", NodeAddress{}, ctx.Err() - } - } -} - -// TryDialNext is equivalent to DialNext(), but immediately returns an empty -// peer ID if no peers or connection slots are available. -func (m *PeerManager) TryDialNext() (NodeID, NodeAddress, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including - // MaxConnectedUpgrade allows us to probe additional peers that have a - // higher score than any other peers, and if successful evict it. - if m.options.MaxConnected > 0 && - len(m.connected)+len(m.dialing) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return "", NodeAddress{}, nil - } - - for _, peer := range m.store.Ranked() { - if m.dialing[peer.ID] || m.connected[peer.ID] { - continue - } - - for _, addressInfo := range peer.AddressInfo { - if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { - continue - } - - // We now have an eligible address to dial. If we're full but have - // upgrade capacity (as checked above), we find a lower-scored peer - // we can replace and mark it as upgrading so noone else claims it. - // - // If we don't find one, there is no point in trying additional - // peers, since they will all have the same or lower score than this - // peer (since they're ordered by score via peerStore.Ranked). - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return "", NodeAddress{}, nil - } - m.upgrading[upgradeFromPeer] = peer.ID - } - - m.dialing[peer.ID] = true - return peer.ID, addressInfo.Address, nil - } - } - return "", NodeAddress{}, nil -} - -// wakeDial is used to notify DialNext about changes that *may* cause new -// peers to become eligible for dialing, such as peer disconnections and -// retry timeouts. -func (m *PeerManager) wakeDial() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each DialNext(). - select { - case m.wakeDialCh <- struct{}{}: - default: - } -} - -// wakeEvict is used to notify EvictNext about changes that *may* cause -// peers to become eligible for eviction, such as peer upgrades. -func (m *PeerManager) wakeEvict() { - // The channel has a 1-size buffer. A non-blocking send ensures - // we only queue up at most 1 trigger between each EvictNext(). - select { - case m.wakeEvictCh <- struct{}{}: - default: - } -} - -// retryDelay calculates a dial retry delay using exponential backoff, based on -// retry settings in PeerManagerOptions. If MinRetryTime is 0, this returns -// MaxInt64 (i.e. an infinite retry delay, effectively disabling retries). 
-func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { - if failures == 0 { - return 0 - } - if m.options.MinRetryTime == 0 { - return time.Duration(math.MaxInt64) - } - maxDelay := m.options.MaxRetryTime - if persistent && m.options.MaxRetryTimePersistent > 0 { - maxDelay = m.options.MaxRetryTimePersistent - } - - delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures))) - if maxDelay > 0 && delay > maxDelay { - delay = maxDelay - } - // FIXME: This should use a PeerManager-scoped RNG. - delay += time.Duration(rand.Int63n(int64(m.options.RetryTimeJitter))) // nolint:gosec - return delay -} - -// DialFailed reports a failed dial attempt. This will make the peer available -// for dialing again when appropriate. -// -// FIXME: This should probably delete or mark bad addresses/peers after some time. -func (m *PeerManager) DialFailed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) // Unmark failed upgrade attempt. - } - } - - peer, ok := m.store.Get(peerID) - if !ok { // Peer may have been removed while dialing, ignore. - return nil - } - addressInfo, ok := peer.AddressInfo[address.String()] - if !ok { - return nil // Assume the address has been removed, ignore. - } - addressInfo.LastDialFailure = time.Now().UTC() - addressInfo.DialFailures++ - if err := m.store.Set(peer); err != nil { - return err - } - - // We spawn a goroutine that notifies DialNext() again when the retry - // timeout has elapsed, so that we can consider dialing it again. - go func() { - retryDelay := m.retryDelay(addressInfo.DialFailures, peer.Persistent) - if retryDelay == time.Duration(math.MaxInt64) { - return - } - // Use an explicit timer with deferred cleanup instead of - // time.After(), to avoid leaking goroutines on PeerManager.Close(). - timer := time.NewTimer(retryDelay) - defer timer.Stop() - select { - case <-timer.C: - m.wakeDial() - case <-m.closeCh: - } - }() - - m.wakeDial() - return nil -} - -// Dialed marks a peer as successfully dialed. Any further incoming connections -// will be rejected, and once disconnected the peer may be dialed again. -func (m *PeerManager) Dialed(peerID NodeID, address NodeAddress) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.dialing, peerID) - - var upgradeFromPeer NodeID - for from, to := range m.upgrading { - if to == peerID { - delete(m.upgrading, from) - upgradeFromPeer = from - // Don't break, just in case this peer was marked as upgrading for - // multiple lower-scored peers (shouldn't really happen). - } - } - - if m.connected[peerID] { - return fmt.Errorf("peer %v is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - return fmt.Errorf("peer %q was removed while dialing", peerID) - } - now := time.Now().UTC() - peer.LastConnected = now - if addressInfo, ok := peer.AddressInfo[address.String()]; ok { - addressInfo.DialFailures = 0 - addressInfo.LastDialSuccess = now - // If not found, assume address has been removed. 
- } - if err := m.store.Set(peer); err != nil { - return err - } - - if upgradeFromPeer != "" && m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected) { - // Look for an even lower-scored peer that may have appeared - // since we started the upgrade. - if p, ok := m.store.Get(upgradeFromPeer); ok { - if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { - upgradeFromPeer = u - } - } - m.evict[upgradeFromPeer] = true - } - m.connected[peerID] = true - m.wakeEvict() - - return nil -} - -// Accepted marks an incoming peer connection successfully accepted. If the peer -// is already connected or we don't allow additional connections then this will -// return an error. -// -// If full but MaxConnectedUpgrade is non-zero and the incoming peer is -// better-scored than any existing peers, then we accept it and evict a -// lower-scored peer. -// -// NOTE: We can't take an address here, since e.g. TCP uses a different port -// number for outbound traffic than inbound traffic, so the peer's endpoint -// wouldn't necessarily be an appropriate address to dial. -// -// FIXME: When we accept a connection from a peer, we should register that -// peer's address in the peer store so that we can dial it later. In order to do -// that, we'll need to get the remote address after all, but as noted above that -// can't be the remote endpoint since that will usually have the wrong port -// number. -func (m *PeerManager) Accepted(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - return fmt.Errorf("peer %q is already connected", peerID) - } - if m.options.MaxConnected > 0 && - len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { - return fmt.Errorf("already connected to maximum number of peers") - } - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - - // If all connections slots are full, but we allow upgrades (and we checked - // above that we have upgrade capacity), then we can look for a lower-scored - // peer to replace and if found accept the connection anyway and evict it. - var upgradeFromPeer NodeID - if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { - upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) - if upgradeFromPeer == "" { - return fmt.Errorf("already connected to maximum number of peers") - } - } - - peer.LastConnected = time.Now().UTC() - if err := m.store.Set(peer); err != nil { - return err - } - - m.connected[peerID] = true - if upgradeFromPeer != "" { - m.evict[upgradeFromPeer] = true - } - m.wakeEvict() - return nil -} - -// Ready marks a peer as ready, broadcasting status updates to subscribers. The -// peer must already be marked as connected. This is separate from Dialed() and -// Accepted() to allow the router to set up its internal queues before reactors -// start sending messages. -func (m *PeerManager) Ready(peerID NodeID) { - m.mtx.Lock() - defer m.mtx.Unlock() - - if m.connected[peerID] { - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusUp, - }) - } -} - -// Disconnected unmarks a peer as connected, allowing new connections to be -// established. 
-func (m *PeerManager) Disconnected(peerID NodeID) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - delete(m.connected, peerID) - delete(m.upgrading, peerID) - delete(m.evict, peerID) - delete(m.evicting, peerID) - m.broadcast(PeerUpdate{ - PeerID: peerID, - Status: PeerStatusDown, - }) - m.wakeDial() - return nil -} - -// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable -// peers are found, the call will block until one becomes available or the -// context is cancelled. -func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { - for { - id, err := m.TryEvictNext() - if err != nil || id != "" { - return id, err - } - select { - case <-m.wakeEvictCh: - case <-ctx.Done(): - return "", ctx.Err() - } - } -} - -// TryEvictNext is equivalent to EvictNext, but immediately returns an empty -// node ID if no evictable peers are found. -func (m *PeerManager) TryEvictNext() (NodeID, error) { - m.mtx.Lock() - defer m.mtx.Unlock() - - // If any connected peers are explicitly scheduled for eviction, we return a - // random one. - for peerID := range m.evict { - delete(m.evict, peerID) - if m.connected[peerID] && !m.evicting[peerID] { - m.evicting[peerID] = true - return peerID, nil - } - } - - // If we're below capacity, we don't need to evict anything. - if m.options.MaxConnected == 0 || - len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { - return "", nil - } - - // If we're above capacity, just pick the lowest-ranked peer to evict. - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - peer := ranked[i] - if m.connected[peer.ID] && !m.evicting[peer.ID] { - m.evicting[peer.ID] = true - return peer.ID, nil - } - } - - return "", nil -} - -// findUpgradeCandidate looks for a lower-scored peer that we could evict -// to make room for the given peer. Returns an empty ID if none is found. -// The caller must hold the mutex lock. -func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { - ranked := m.store.Ranked() - for i := len(ranked) - 1; i >= 0; i-- { - candidate := ranked[i] - switch { - case candidate.Score() >= score: - return "" // no further peers can be scored lower, due to sorting - case !m.connected[candidate.ID]: - case m.evict[candidate.ID]: - case m.evicting[candidate.ID]: - case m.upgrading[candidate.ID] != "": - default: - return candidate.ID - } - } - return "" -} - -// GetHeight returns a peer's height, as reported via SetHeight. If the peer -// or height is unknown, this returns 0. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. -func (m *PeerManager) GetHeight(peerID NodeID) int64 { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, _ := m.store.Get(peerID) - return peer.Height -} - -// SetHeight stores a peer's height, making it available via GetHeight. If the -// peer is unknown, it is created. -// -// FIXME: This is a temporary workaround for the peer state stored via the -// legacy Peer.Set() and Peer.Get() APIs, used to share height state between the -// consensus and mempool reactors. These dependencies should be removed from the -// reactors, and instead query this information independently via new P2P -// protocol additions. 
-func (m *PeerManager) SetHeight(peerID NodeID, height int64) error { - m.mtx.Lock() - defer m.mtx.Unlock() - - peer, ok := m.store.Get(peerID) - if !ok { - peer = m.makePeerInfo(peerID) - } - peer.Height = height - return m.store.Set(peer) -} - -// peerStore stores information about peers. It is not thread-safe, assuming -// it is used only by PeerManager which handles concurrency control, allowing -// it to execute multiple operations atomically via its own mutex. -// -// The entire set of peers is kept in memory, for performance. It is loaded -// from disk on initialization, and any changes are written back to disk -// (without fsync, since we can afford to lose recent writes). -type peerStore struct { - db dbm.DB - peers map[NodeID]*peerInfo - ranked []*peerInfo // cache for Ranked(), nil invalidates cache -} - -// newPeerStore creates a new peer store, loading all persisted peers from the -// database into memory. -func newPeerStore(db dbm.DB) (*peerStore, error) { - store := &peerStore{ - db: db, - } - if err := store.loadPeers(); err != nil { - return nil, err - } - return store, nil -} - -// loadPeers loads all peers from the database into memory. -func (s *peerStore) loadPeers() error { - peers := make(map[NodeID]*peerInfo) - - start, end := keyPeerInfoRange() - iter, err := s.db.Iterator(start, end) - if err != nil { - return err - } - defer iter.Close() - for ; iter.Valid(); iter.Next() { - // FIXME: We may want to tolerate failures here, by simply logging - // the errors and ignoring the faulty peer entries. - msg := new(p2pproto.PeerInfo) - if err := proto.Unmarshal(iter.Value(), msg); err != nil { - return fmt.Errorf("invalid peer Protobuf data: %w", err) - } - peer, err := peerInfoFromProto(msg) - if err != nil { - return fmt.Errorf("invalid peer data: %w", err) - } - peers[peer.ID] = peer - } - if iter.Error() != nil { - return iter.Error() - } - s.peers = peers - s.ranked = nil // invalidate cache if populated - return nil -} - -// Get fetches a peer. The boolean indicates whether the peer existed or not. -// The returned peer info is a copy, and can be mutated at will. -func (s *peerStore) Get(id NodeID) (peerInfo, bool) { - peer, ok := s.peers[id] - return peer.Copy(), ok -} - -// Set stores peer data. The input data will be copied, and can safely be reused -// by the caller. -func (s *peerStore) Set(peer peerInfo) error { - if err := peer.Validate(); err != nil { - return err - } - peer = peer.Copy() - - // FIXME: We may want to optimize this by avoiding saving to the database - // if there haven't been any changes to persisted fields. - bz, err := peer.ToProto().Marshal() - if err != nil { - return err - } - if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { - return err - } - - if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { - // If the peer is new, or its score changes, we invalidate the Ranked() cache. - s.peers[peer.ID] = &peer - s.ranked = nil - } else { - // Otherwise, since s.ranked contains pointers to the old data and we - // want those pointers to remain valid with the new data, we have to - // update the existing pointer address. - *current = peer - } - - return nil -} - -// Delete deletes a peer, or does nothing if it does not exist. -func (s *peerStore) Delete(id NodeID) error { - if _, ok := s.peers[id]; !ok { - return nil - } - if err := s.db.Delete(keyPeerInfo(id)); err != nil { - return err - } - delete(s.peers, id) - s.ranked = nil - return nil -} - -// List retrieves all peers in an arbitrary order. 
The returned data is a copy, -// and can be mutated at will. -func (s *peerStore) List() []peerInfo { - peers := make([]peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - peers = append(peers, peer.Copy()) - } - return peers -} - -// Ranked returns a list of peers ordered by score (better peers first). Peers -// with equal scores are returned in an arbitrary order. The returned list must -// not be mutated or accessed concurrently by the caller, since it returns -// pointers to internal peerStore data for performance. -// -// Ranked is used to determine both which peers to dial, which ones to evict, -// and which ones to delete completely. -// -// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated -// by setting it to nil, but if necessary we should use a better data structure -// for this (e.g. a heap or ordered map). -// -// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). -func (s *peerStore) Ranked() []*peerInfo { - if s.ranked != nil { - return s.ranked - } - s.ranked = make([]*peerInfo, 0, len(s.peers)) - for _, peer := range s.peers { - s.ranked = append(s.ranked, peer) - } - sort.Slice(s.ranked, func(i, j int) bool { - // FIXME: If necessary, consider precomputing scores before sorting, - // to reduce the number of Score() calls. - return s.ranked[i].Score() > s.ranked[j].Score() - }) - return s.ranked -} - -// Size returns the number of peers in the peer store. -func (s *peerStore) Size() int { - return len(s.peers) -} - -// peerInfo contains peer information stored in a peerStore. -type peerInfo struct { - ID NodeID - AddressInfo map[string]*peerAddressInfo - LastConnected time.Time - - // These fields are ephemeral, i.e. not persisted to the database. - Persistent bool - Height int64 -} - -// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, -// erroring if the data is invalid. -func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { - p := &peerInfo{ - ID: NodeID(msg.ID), - AddressInfo: map[string]*peerAddressInfo{}, - } - if msg.LastConnected != nil { - p.LastConnected = *msg.LastConnected - } - for _, addr := range msg.AddressInfo { - addressInfo, err := peerAddressInfoFromProto(addr) - if err != nil { - return nil, err - } - p.AddressInfo[addressInfo.Address.String()] = addressInfo - } - return p, p.Validate() -} - -// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The -// Protobuf type only contains persisted fields, while ephemeral fields are -// discarded. The returned message may contain pointers to original data, since -// it is expected to be serialized immediately. -func (p *peerInfo) ToProto() *p2pproto.PeerInfo { - msg := &p2pproto.PeerInfo{ - ID: string(p.ID), - LastConnected: &p.LastConnected, - } - for _, addressInfo := range p.AddressInfo { - msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) - } - if msg.LastConnected.IsZero() { - msg.LastConnected = nil - } - return msg -} - -// Copy returns a deep copy of the peer info. -func (p *peerInfo) Copy() peerInfo { - if p == nil { - return peerInfo{} - } - c := *p - for i, addressInfo := range c.AddressInfo { - addressInfoCopy := addressInfo.Copy() - c.AddressInfo[i] = &addressInfoCopy - } - return c -} - -// Score calculates a score for the peer. Higher-scored peers will be -// preferred over lower scores. -func (p *peerInfo) Score() PeerScore { - var score PeerScore - if p.Persistent { - score += PeerScorePersistent - } - return score -} - -// Validate validates the peer info. 
-func (p *peerInfo) Validate() error { - if p.ID == "" { - return errors.New("no peer ID") - } - return nil -} - -// peerAddressInfo contains information and statistics about a peer address. -type peerAddressInfo struct { - Address NodeAddress - LastDialSuccess time.Time - LastDialFailure time.Time - DialFailures uint32 // since last successful dial -} - -// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message -// to a peerAddressInfo. -func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { - address, err := ParseNodeAddress(msg.Address) - if err != nil { - return nil, fmt.Errorf("invalid address %q: %w", address, err) - } - addressInfo := &peerAddressInfo{ - Address: address, - DialFailures: msg.DialFailures, - } - if msg.LastDialSuccess != nil { - addressInfo.LastDialSuccess = *msg.LastDialSuccess - } - if msg.LastDialFailure != nil { - addressInfo.LastDialFailure = *msg.LastDialFailure - } - return addressInfo, addressInfo.Validate() -} - -// ToProto converts the address into to a Protobuf message for serialization. -func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { - msg := &p2pproto.PeerAddressInfo{ - Address: a.Address.String(), - LastDialSuccess: &a.LastDialSuccess, - LastDialFailure: &a.LastDialFailure, - DialFailures: a.DialFailures, - } - if msg.LastDialSuccess.IsZero() { - msg.LastDialSuccess = nil - } - if msg.LastDialFailure.IsZero() { - msg.LastDialFailure = nil - } - return msg -} - -// Copy returns a copy of the address info. -func (a *peerAddressInfo) Copy() peerAddressInfo { - return *a -} - -// Validate validates the address info. -func (a *peerAddressInfo) Validate() error { - return a.Address.Validate() -} - -// These are database key prefixes. -const ( - prefixPeerInfo int64 = 1 -) - -// keyPeerInfo generates a peerInfo database key. -func keyPeerInfo(id NodeID) []byte { - key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) - if err != nil { - panic(err) - } - return key -} - -// keyPeerInfoPrefix generates start/end keys for the entire peerInfo key range. -func keyPeerInfoRange() ([]byte, []byte) { - start, err := orderedcode.Append(nil, prefixPeerInfo, "") - if err != nil { - panic(err) - } - end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) - if err != nil { - panic(err) - } - return start, end -} - -// ============================================================================ -// Types and business logic below may be deprecated. -// -// TODO: Rename once legacy p2p types are removed. -// ref: https://github.com/tendermint/tendermint/issues/5670 -// ============================================================================ - //go:generate mockery --case underscore --name Peer const metricsTickerDuration = 10 * time.Second diff --git a/p2p/peermanager.go b/p2p/peermanager.go new file mode 100644 index 000000000..254f93a63 --- /dev/null +++ b/p2p/peermanager.go @@ -0,0 +1,1275 @@ +package p2p + +import ( + "context" + "errors" + "fmt" + "math" + "math/rand" + "sort" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/google/orderedcode" + dbm "github.com/tendermint/tm-db" + + tmsync "github.com/tendermint/tendermint/libs/sync" + p2pproto "github.com/tendermint/tendermint/proto/tendermint/p2p" +) + +const ( + // retryNever is returned by retryDelay() when retries are disabled. + retryNever time.Duration = math.MaxInt64 +) + +// PeerStatus is a peer status. +// +// The peer manager has many more internal states for a peer (e.g. 
dialing, +// connected, evicting, and so on), which are tracked separately. PeerStatus is +// for external use outside of the peer manager. +type PeerStatus string + +const ( + PeerStatusUp PeerStatus = "up" // connected and ready + PeerStatusDown PeerStatus = "down" // disconnected +) + +// PeerScore is a numeric score assigned to a peer (higher is better). +type PeerScore uint8 + +const ( + PeerScorePersistent PeerScore = 100 // persistent peers +) + +// PeerUpdate is a peer update event sent via PeerUpdates. +type PeerUpdate struct { + NodeID NodeID + Status PeerStatus +} + +// PeerUpdates is a peer update subscription with notifications about peer +// events (currently just status changes). +type PeerUpdates struct { + updatesCh chan PeerUpdate + closeCh chan struct{} + closeOnce sync.Once +} + +// NewPeerUpdates creates a new PeerUpdates subscription. It is primarily for +// internal use, callers should typically use PeerManager.Subscribe(). The +// subscriber must call Close() when done. +func NewPeerUpdates(updatesCh chan PeerUpdate) *PeerUpdates { + return &PeerUpdates{ + updatesCh: updatesCh, + closeCh: make(chan struct{}), + } +} + +// Updates returns a channel for consuming peer updates. +func (pu *PeerUpdates) Updates() <-chan PeerUpdate { + return pu.updatesCh +} + +// Close closes the peer updates subscription. +func (pu *PeerUpdates) Close() { + pu.closeOnce.Do(func() { + // NOTE: We don't close updatesCh since multiple goroutines may be + // sending on it. The PeerManager senders will select on closeCh as well + // to avoid blocking on a closed subscription. + close(pu.closeCh) + }) +} + +// Done returns a channel that is closed when the subscription is closed. +func (pu *PeerUpdates) Done() <-chan struct{} { + return pu.closeCh +} + +// PeerManagerOptions specifies options for a PeerManager. +type PeerManagerOptions struct { + // PersistentPeers are peers that we want to maintain persistent connections + // to. These will be scored higher than other peers, and if + // MaxConnectedUpgrade is non-zero any lower-scored peers will be evicted if + // necessary to make room for these. + PersistentPeers []NodeID + + // MaxPeers is the maximum number of peers to track information about, i.e. + // store in the peer store. When exceeded, the lowest-scored unconnected peers + // will be deleted. 0 means no limit. + MaxPeers uint16 + + // MaxConnected is the maximum number of connected peers (inbound and + // outbound). 0 means no limit. + MaxConnected uint16 + + // MaxConnectedUpgrade is the maximum number of additional connections to + // use for probing any better-scored peers to upgrade to when all connection + // slots are full. 0 disables peer upgrading. + // + // For example, if we are already connected to MaxConnected peers, but we + // know or learn about better-scored peers (e.g. configured persistent + // peers) that we are not connected too, then we can probe these peers by + // using up to MaxConnectedUpgrade connections, and once connected evict the + // lowest-scored connected peers. This also works for inbound connections, + // i.e. if a higher-scored peer attempts to connect to us, we can accept + // the connection and evict a lower-scored peer. + MaxConnectedUpgrade uint16 + + // MinRetryTime is the minimum time to wait between retries. Retry times + // double for each retry, up to MaxRetryTime. 0 disables retries. + MinRetryTime time.Duration + + // MaxRetryTime is the maximum time to wait between retries. 
0 means + // no maximum, in which case the retry time will keep doubling. + MaxRetryTime time.Duration + + // MaxRetryTimePersistent is the maximum time to wait between retries for + // peers listed in PersistentPeers. 0 uses MaxRetryTime instead. + MaxRetryTimePersistent time.Duration + + // RetryTimeJitter is the upper bound of a random interval added to + // retry times, to avoid thundering herds. 0 disables jitter. + RetryTimeJitter time.Duration + + // PeerScores sets fixed scores for specific peers. It is mainly used + // for testing. A score of 0 is ignored. + PeerScores map[NodeID]PeerScore + + // persistentPeers provides fast PersistentPeers lookups. It is built + // by optimize(). + persistentPeers map[NodeID]bool +} + +// Validate validates the options. +func (o *PeerManagerOptions) Validate() error { + for _, id := range o.PersistentPeers { + if err := id.Validate(); err != nil { + return fmt.Errorf("invalid PersistentPeer ID %q: %w", id, err) + } + } + if o.MaxConnected > 0 && len(o.PersistentPeers) > int(o.MaxConnected) { + return fmt.Errorf("number of persistent peers %v can't exceed MaxConnected %v", + len(o.PersistentPeers), o.MaxConnected) + } + + if o.MaxPeers > 0 { + if o.MaxConnected == 0 || o.MaxConnected+o.MaxConnectedUpgrade > o.MaxPeers { + return fmt.Errorf("MaxConnected %v and MaxConnectedUpgrade %v can't exceed MaxPeers %v", // nolint + o.MaxConnected, o.MaxConnectedUpgrade, o.MaxPeers) + } + } + + if o.MaxRetryTime > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTime without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTime { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTime %v", // nolint + o.MinRetryTime, o.MaxRetryTime) + } + } + if o.MaxRetryTimePersistent > 0 { + if o.MinRetryTime == 0 { + return errors.New("can't set MaxRetryTimePersistent without MinRetryTime") + } + if o.MinRetryTime > o.MaxRetryTimePersistent { + return fmt.Errorf("MinRetryTime %v is greater than MaxRetryTimePersistent %v", // nolint + o.MinRetryTime, o.MaxRetryTimePersistent) + } + } + + return nil +} + +// isPersistentPeer checks if a peer is in PersistentPeers. It will panic +// if called before optimize(). +func (o *PeerManagerOptions) isPersistent(id NodeID) bool { + if o.persistentPeers == nil { + panic("isPersistentPeer() called before optimize()") + } + return o.persistentPeers[id] +} + +// optimize optimizes operations by pregenerating lookup structures. It's a +// separate method instead of memoizing during calls to avoid dealing with +// concurrency and mutex overhead. +func (o *PeerManagerOptions) optimize() { + o.persistentPeers = make(map[NodeID]bool, len(o.PersistentPeers)) + for _, p := range o.PersistentPeers { + o.persistentPeers[p] = true + } +} + +// PeerManager manages peer lifecycle information, using a peerStore for +// underlying storage. Its primary purpose is to determine which peer to connect +// to next (including retry timers), make sure a peer only has a single active +// connection (either inbound or outbound), and evict peers to make room for +// higher-scored peers. It does not manage actual connections (this is handled +// by the Router), only the peer lifecycle state. +// +// For an outbound connection, the flow is as follows: +// - DialNext: return a peer address to dial, mark peer as dialing. +// - DialFailed: report a dial failure, unmark as dialing. +// - Dialed: report a dial success, unmark as dialing and mark as connected +// (errors if already connected, e.g. by Accepted). 
+// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp.
+// - Disconnected: report peer disconnect, unmark as connected and broadcast
+//   PeerStatusDown.
+//
+// For an inbound connection, the flow is as follows:
+// - Accepted: report inbound connection success, mark as connected (errors if
+//   already connected, e.g. by Dialed).
+// - Ready: report routing is ready, mark as ready and broadcast PeerStatusUp.
+// - Disconnected: report peer disconnect, unmark as connected and broadcast
+//   PeerStatusDown.
+//
+// When evicting peers, either because peers are explicitly scheduled for
+// eviction or we are connected to too many peers, the flow is as follows:
+// - EvictNext: if marked evict and connected, unmark evict and mark evicting.
+//   If beyond MaxConnected, pick lowest-scored peer and mark evicting.
+// - Disconnected: unmark connected, evicting, evict, and broadcast a
+//   PeerStatusDown peer update.
+//
+// If all connection slots are full (at MaxConnected), we can use up to
+// MaxConnectedUpgrade additional connections to probe any higher-scored
+// unconnected peers, and if we reach them (or they reach us) we allow the
+// connection and evict a lower-scored peer. We mark the lower-scored peer as
+// upgrading[from]=to to make sure no other higher-scored peers can claim the
+// same one for an upgrade. The flow is as follows:
+// - Accepted: if upgrade is possible, mark connected and add lower-scored to evict.
+// - DialNext: if upgrade is possible, mark upgrading[from]=to and dialing.
+// - DialFailed: unmark upgrading[from]=to and dialing.
+// - Dialed: unmark upgrading[from]=to and dialing, mark as connected, add
+//   lower-scored to evict.
+// - EvictNext: pick peer from evict, mark as evicting.
+// - Disconnected: unmark connected, upgrading[from]=to, evict, evicting.
+//
+// FIXME: The old stack supports ABCI-based peer ID filtering via
+// /p2p/filter/id/ queries; we should implement this here as well by taking
+// a peer ID filtering callback in PeerManagerOptions and configuring it during
+// Node setup.
+type PeerManager struct {
+	options    PeerManagerOptions
+	rand       *rand.Rand
+	dialWaker  *tmsync.Waker // wakes up DialNext() on relevant peer changes
+	evictWaker *tmsync.Waker // wakes up EvictNext() on relevant peer changes
+	closeCh    chan struct{} // signal channel for Close()
+	closeOnce  sync.Once
+
+	mtx           sync.Mutex
+	store         *peerStore
+	subscriptions map[*PeerUpdates]*PeerUpdates // keyed by struct identity (address)
+	dialing       map[NodeID]bool               // peers being dialed (DialNext → Dialed/DialFailed)
+	upgrading     map[NodeID]NodeID             // peers claimed for upgrade (DialNext → Dialed/DialFailed)
+	connected     map[NodeID]bool               // connected peers (Dialed/Accepted → Disconnected)
+	ready         map[NodeID]bool               // ready peers (Ready → Disconnected)
+	evict         map[NodeID]bool               // peers scheduled for eviction (Connected → EvictNext)
+	evicting      map[NodeID]bool               // peers being evicted (EvictNext → Disconnected)
+}
+
+// NewPeerManager creates a new peer manager.
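+//
+// A minimal construction sketch (the in-memory database and option values are
+// illustrative only):
+//
+//	peerManager, err := NewPeerManager(dbm.NewMemDB(), PeerManagerOptions{
+//		MaxPeers:     100,
+//		MaxConnected: 16,
+//		MinRetryTime: 100 * time.Millisecond,
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	defer peerManager.Close()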
+func NewPeerManager(peerDB dbm.DB, options PeerManagerOptions) (*PeerManager, error) { + store, err := newPeerStore(peerDB) + if err != nil { + return nil, err + } + if err = options.Validate(); err != nil { + return nil, err + } + options.optimize() + + peerManager := &PeerManager{ + options: options, + rand: rand.New(rand.NewSource(time.Now().UnixNano())), // nolint:gosec + dialWaker: tmsync.NewWaker(), + evictWaker: tmsync.NewWaker(), + closeCh: make(chan struct{}), + + store: store, + dialing: map[NodeID]bool{}, + upgrading: map[NodeID]NodeID{}, + connected: map[NodeID]bool{}, + ready: map[NodeID]bool{}, + evict: map[NodeID]bool{}, + evicting: map[NodeID]bool{}, + subscriptions: map[*PeerUpdates]*PeerUpdates{}, + } + if err = peerManager.configurePeers(); err != nil { + return nil, err + } + if err = peerManager.prunePeers(); err != nil { + return nil, err + } + return peerManager, nil +} + +// configurePeers configures peers in the peer store with ephemeral runtime +// configuration, e.g. PersistentPeers. The caller must hold the mutex lock. +func (m *PeerManager) configurePeers() error { + configure := map[NodeID]bool{} + for _, id := range m.options.PersistentPeers { + configure[id] = true + } + for id := range m.options.PeerScores { + configure[id] = true + } + for id := range configure { + if peer, ok := m.store.Get(id); ok { + if err := m.store.Set(m.configurePeer(peer)); err != nil { + return err + } + } + } + return nil +} + +// configurePeer configures a peer with ephemeral runtime configuration. +func (m *PeerManager) configurePeer(peer peerInfo) peerInfo { + peer.Persistent = m.options.isPersistent(peer.ID) + peer.FixedScore = m.options.PeerScores[peer.ID] + return peer +} + +// newPeerInfo creates a peerInfo for a new peer. +func (m *PeerManager) newPeerInfo(id NodeID) peerInfo { + peerInfo := peerInfo{ + ID: id, + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + return m.configurePeer(peerInfo) +} + +// prunePeers removes low-scored peers from the peer store if it contains more +// than MaxPeers peers. The caller must hold the mutex lock. +func (m *PeerManager) prunePeers() error { + if m.options.MaxPeers == 0 || m.store.Size() <= int(m.options.MaxPeers) { + return nil + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peerID := ranked[i].ID + switch { + case m.store.Size() <= int(m.options.MaxPeers): + break + case m.dialing[peerID]: + case m.connected[peerID]: + default: + if err := m.store.Delete(peerID); err != nil { + return err + } + } + } + return nil +} + +// Add adds a peer to the manager, given as an address. If the peer already +// exists, the address is added to it if not already present. +func (m *PeerManager) Add(address NodeAddress) error { + if err := address.Validate(); err != nil { + return err + } + + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(address.NodeID) + if !ok { + peer = m.newPeerInfo(address.NodeID) + } + if _, ok := peer.AddressInfo[address]; !ok { + peer.AddressInfo[address] = &peerAddressInfo{Address: address} + } + if err := m.store.Set(peer); err != nil { + return err + } + if err := m.prunePeers(); err != nil { + return err + } + m.dialWaker.Wake() + return nil +} + +// DialNext finds an appropriate peer address to dial, and marks it as dialing. +// If no peer is found, or all connection slots are full, it blocks until one +// becomes available. The caller must call Dialed() or DialFailed() for the +// returned peer. 
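+//
+// A rough sketch of the dial loop a caller (e.g. the router) is expected to
+// run; dialPeer stands in for the actual transport dialing, which is not part
+// of this package:
+//
+//	for {
+//		address, err := peerManager.DialNext(ctx)
+//		if err != nil {
+//			return err
+//		}
+//		if err := dialPeer(ctx, address); err != nil {
+//			_ = peerManager.DialFailed(address)
+//			continue
+//		}
+//		if err := peerManager.Dialed(address); err != nil {
+//			// e.g. the peer connected to us first; drop the connection
+//			continue
+//		}
+//		_ = peerManager.Ready(address.NodeID)
+//	}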
+func (m *PeerManager) DialNext(ctx context.Context) (NodeAddress, error) { + for { + address, err := m.TryDialNext() + if err != nil || (address != NodeAddress{}) { + return address, err + } + select { + case <-m.dialWaker.Sleep(): + case <-ctx.Done(): + return NodeAddress{}, ctx.Err() + } + } +} + +// TryDialNext is equivalent to DialNext(), but immediately returns an empty +// address if no peers or connection slots are available. +func (m *PeerManager) TryDialNext() (NodeAddress, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // We allow dialing MaxConnected+MaxConnectedUpgrade peers. Including + // MaxConnectedUpgrade allows us to probe additional peers that have a + // higher score than any other peers, and if successful evict it. + if m.options.MaxConnected > 0 && len(m.connected)+len(m.dialing) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return NodeAddress{}, nil + } + + for _, peer := range m.store.Ranked() { + if m.dialing[peer.ID] || m.connected[peer.ID] { + continue + } + + for _, addressInfo := range peer.AddressInfo { + if time.Since(addressInfo.LastDialFailure) < m.retryDelay(addressInfo.DialFailures, peer.Persistent) { + continue + } + + // We now have an eligible address to dial. If we're full but have + // upgrade capacity (as checked above), we find a lower-scored peer + // we can replace and mark it as upgrading so noone else claims it. + // + // If we don't find one, there is no point in trying additional + // peers, since they will all have the same or lower score than this + // peer (since they're ordered by score via peerStore.Ranked). + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer := m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return NodeAddress{}, nil + } + m.upgrading[upgradeFromPeer] = peer.ID + } + + m.dialing[peer.ID] = true + return addressInfo.Address, nil + } + } + return NodeAddress{}, nil +} + +// DialFailed reports a failed dial attempt. This will make the peer available +// for dialing again when appropriate (possibly after a retry timeout). +// +// FIXME: This should probably delete or mark bad addresses/peers after some time. +func (m *PeerManager) DialFailed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) // Unmark failed upgrade attempt. + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { // Peer may have been removed while dialing, ignore. + return nil + } + addressInfo, ok := peer.AddressInfo[address] + if !ok { + return nil // Assume the address has been removed, ignore. + } + addressInfo.LastDialFailure = time.Now().UTC() + addressInfo.DialFailures++ + if err := m.store.Set(peer); err != nil { + return err + } + + // We spawn a goroutine that notifies DialNext() again when the retry + // timeout has elapsed, so that we can consider dialing it again. We + // calculate the retry delay outside the goroutine, since it must hold + // the mutex lock. + if d := m.retryDelay(addressInfo.DialFailures, peer.Persistent); d != retryNever { + go func() { + // Use an explicit timer with deferred cleanup instead of + // time.After(), to avoid leaking goroutines on PeerManager.Close(). 
+ timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-timer.C: + m.dialWaker.Wake() + case <-m.closeCh: + } + }() + } + + m.dialWaker.Wake() + return nil +} + +// Dialed marks a peer as successfully dialed. Any further connections will be +// rejected, and once disconnected the peer may be dialed again. +func (m *PeerManager) Dialed(address NodeAddress) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + delete(m.dialing, address.NodeID) + + var upgradeFromPeer NodeID + for from, to := range m.upgrading { + if to == address.NodeID { + delete(m.upgrading, from) + upgradeFromPeer = from + // Don't break, just in case this peer was marked as upgrading for + // multiple lower-scored peers (shouldn't really happen). + } + } + + if m.connected[address.NodeID] { + return fmt.Errorf("peer %v is already connected", address.NodeID) + } + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + if upgradeFromPeer == "" || len(m.connected) >= + int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer, ok := m.store.Get(address.NodeID) + if !ok { + return fmt.Errorf("peer %q was removed while dialing", address.NodeID) + } + now := time.Now().UTC() + peer.LastConnected = now + if addressInfo, ok := peer.AddressInfo[address]; ok { + addressInfo.DialFailures = 0 + addressInfo.LastDialSuccess = now + // If not found, assume address has been removed. + } + if err := m.store.Set(peer); err != nil { + return err + } + + if upgradeFromPeer != "" && m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected) { + // Look for an even lower-scored peer that may have appeared since we + // started the upgrade. + if p, ok := m.store.Get(upgradeFromPeer); ok { + if u := m.findUpgradeCandidate(p.ID, p.Score()); u != "" { + upgradeFromPeer = u + } + } + m.evict[upgradeFromPeer] = true + } + m.connected[peer.ID] = true + m.evictWaker.Wake() + + return nil +} + +// Accepted marks an incoming peer connection successfully accepted. If the peer +// is already connected or we don't allow additional connections then this will +// return an error. +// +// If full but MaxConnectedUpgrade is non-zero and the incoming peer is +// better-scored than any existing peers, then we accept it and evict a +// lower-scored peer. +// +// NOTE: We can't take an address here, since e.g. TCP uses a different port +// number for outbound traffic than inbound traffic, so the peer's endpoint +// wouldn't necessarily be an appropriate address to dial. +// +// FIXME: When we accept a connection from a peer, we should register that +// peer's address in the peer store so that we can dial it later. In order to do +// that, we'll need to get the remote address after all, but as noted above that +// can't be the remote endpoint since that will usually have the wrong port +// number. 
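+//
+// A sketch of the expected inbound flow; conn and the handshake that yields
+// peerID are illustrative and handled by the caller (e.g. the router):
+//
+//	if err := peerManager.Accepted(peerID); err != nil {
+//		// already connected, or at capacity with no upgrade slot available
+//		_ = conn.Close()
+//		return err
+//	}
+//	_ = peerManager.Ready(peerID)
+//	// ... route messages until the connection closes or the peer is evicted ...
+//	_ = peerManager.Disconnected(peerID)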
+func (m *PeerManager) Accepted(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + return fmt.Errorf("peer %q is already connected", peerID) + } + if m.options.MaxConnected > 0 && + len(m.connected) >= int(m.options.MaxConnected)+int(m.options.MaxConnectedUpgrade) { + return fmt.Errorf("already connected to maximum number of peers") + } + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + + // If all connections slots are full, but we allow upgrades (and we checked + // above that we have upgrade capacity), then we can look for a lower-scored + // peer to replace and if found accept the connection anyway and evict it. + var upgradeFromPeer NodeID + if m.options.MaxConnected > 0 && len(m.connected) >= int(m.options.MaxConnected) { + upgradeFromPeer = m.findUpgradeCandidate(peer.ID, peer.Score()) + if upgradeFromPeer == "" { + return fmt.Errorf("already connected to maximum number of peers") + } + } + + peer.LastConnected = time.Now().UTC() + if err := m.store.Set(peer); err != nil { + return err + } + + m.connected[peerID] = true + if upgradeFromPeer != "" { + m.evict[upgradeFromPeer] = true + } + m.evictWaker.Wake() + return nil +} + +// Ready marks a peer as ready, broadcasting status updates to subscribers. The +// peer must already be marked as connected. This is separate from Dialed() and +// Accepted() to allow the router to set up its internal queues before reactors +// start sending messages. +func (m *PeerManager) Ready(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.ready[peerID] = true + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusUp, + }) + } + return nil +} + +// EvictNext returns the next peer to evict (i.e. disconnect). If no evictable +// peers are found, the call will block until one becomes available. +func (m *PeerManager) EvictNext(ctx context.Context) (NodeID, error) { + for { + id, err := m.TryEvictNext() + if err != nil || id != "" { + return id, err + } + select { + case <-m.evictWaker.Sleep(): + case <-ctx.Done(): + return "", ctx.Err() + } + } +} + +// TryEvictNext is equivalent to EvictNext, but immediately returns an empty +// node ID if no evictable peers are found. +func (m *PeerManager) TryEvictNext() (NodeID, error) { + m.mtx.Lock() + defer m.mtx.Unlock() + + // If any connected peers are explicitly scheduled for eviction, we return a + // random one. + for peerID := range m.evict { + delete(m.evict, peerID) + if m.connected[peerID] && !m.evicting[peerID] { + m.evicting[peerID] = true + return peerID, nil + } + } + + // If we're below capacity, we don't need to evict anything. + if m.options.MaxConnected == 0 || + len(m.connected)-len(m.evicting) <= int(m.options.MaxConnected) { + return "", nil + } + + // If we're above capacity (shouldn't really happen), just pick the + // lowest-ranked peer to evict. + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + peer := ranked[i] + if m.connected[peer.ID] && !m.evicting[peer.ID] { + m.evicting[peer.ID] = true + return peer.ID, nil + } + } + + return "", nil +} + +// Disconnected unmarks a peer as connected, allowing it to be dialed or +// accepted again as appropriate. 
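+//
+// Disconnected is also the final step of eviction. An eviction loop run by the
+// caller (e.g. the router) would look roughly as follows, where closing the
+// actual connection is illustrative and outside this package:
+//
+//	for {
+//		peerID, err := peerManager.EvictNext(ctx)
+//		if err != nil {
+//			return err
+//		}
+//		closeConnection(peerID) // illustrative
+//		// Once the connection is torn down, report it so the peer can be
+//		// dialed or accepted again.
+//		_ = peerManager.Disconnected(peerID)
+//	}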
+func (m *PeerManager) Disconnected(peerID NodeID) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + ready := m.ready[peerID] + + delete(m.connected, peerID) + delete(m.upgrading, peerID) + delete(m.evict, peerID) + delete(m.evicting, peerID) + delete(m.ready, peerID) + + if ready { + m.broadcast(PeerUpdate{ + NodeID: peerID, + Status: PeerStatusDown, + }) + } + + m.dialWaker.Wake() + return nil +} + +// Errored reports a peer error, causing the peer to be evicted if it's +// currently connected. +// +// FIXME: This should probably be replaced with a peer behavior API, see +// PeerError comments for more details. +// +// FIXME: This will cause the peer manager to immediately try to reconnect to +// the peer, which is probably not always what we want. +func (m *PeerManager) Errored(peerID NodeID, err error) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + if m.connected[peerID] { + m.evict[peerID] = true + } + + m.evictWaker.Wake() + return nil +} + +// Advertise returns a list of peer addresses to advertise to a peer. +// +// FIXME: This is fairly naïve and only returns the addresses of the +// highest-ranked peers. +func (m *PeerManager) Advertise(peerID NodeID, limit uint16) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := make([]NodeAddress, 0, limit) + for _, peer := range m.store.Ranked() { + if peer.ID == peerID { + continue + } + for _, addressInfo := range peer.AddressInfo { + if len(addresses) >= int(limit) { + return addresses + } + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Subscribe subscribes to peer updates. The caller must consume the peer +// updates in a timely fashion and close the subscription when done, otherwise +// the PeerManager will halt. +func (m *PeerManager) Subscribe() *PeerUpdates { + // FIXME: We use a size 1 buffer here. When we broadcast a peer update + // we have to loop over all of the subscriptions, and we want to avoid + // having to block and wait for a context switch before continuing on + // to the next subscriptions. This also prevents tail latencies from + // compounding. Limiting it to 1 means that the subscribers are still + // reasonably in sync. However, this should probably be benchmarked. + peerUpdates := NewPeerUpdates(make(chan PeerUpdate, 1)) + m.mtx.Lock() + m.subscriptions[peerUpdates] = peerUpdates + m.mtx.Unlock() + + go func() { + select { + case <-peerUpdates.Done(): + m.mtx.Lock() + delete(m.subscriptions, peerUpdates) + m.mtx.Unlock() + case <-m.closeCh: + } + }() + return peerUpdates +} + +// broadcast broadcasts a peer update to all subscriptions. The caller must +// already hold the mutex lock, to make sure updates are sent in the same order +// as the PeerManager processes them, but this means subscribers must be +// responsive at all times or the entire PeerManager will halt. +// +// FIXME: Consider using an internal channel to buffer updates while also +// maintaining order if this is a problem. +func (m *PeerManager) broadcast(peerUpdate PeerUpdate) { + for _, sub := range m.subscriptions { + // We have to check closeCh separately first, otherwise there's a 50% + // chance the second select will send on a closed subscription. + select { + case <-sub.closeCh: + continue + default: + } + select { + case sub.updatesCh <- peerUpdate: + case <-sub.closeCh: + } + } +} + +// Close closes the peer manager, releasing resources (i.e. goroutines). 
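+//
+// Close does not close subscriber update channels (see PeerUpdates.Close), so
+// a typical subscriber selects on the subscription's Done() channel or its own
+// quit signal; a rough sketch, where handlePeerUpdate is illustrative:
+//
+//	peerUpdates := peerManager.Subscribe()
+//	defer peerUpdates.Close()
+//	for {
+//		select {
+//		case peerUpdate := <-peerUpdates.Updates():
+//			handlePeerUpdate(peerUpdate) // PeerStatusUp or PeerStatusDown
+//		case <-peerUpdates.Done():
+//			return
+//		}
+//	}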
+func (m *PeerManager) Close() { + m.closeOnce.Do(func() { + close(m.closeCh) + }) +} + +// Addresses returns all known addresses for a peer, primarily for testing. +// The order is arbitrary. +func (m *PeerManager) Addresses(peerID NodeID) []NodeAddress { + m.mtx.Lock() + defer m.mtx.Unlock() + + addresses := []NodeAddress{} + if peer, ok := m.store.Get(peerID); ok { + for _, addressInfo := range peer.AddressInfo { + addresses = append(addresses, addressInfo.Address) + } + } + return addresses +} + +// Peers returns all known peers, primarily for testing. The order is arbitrary. +func (m *PeerManager) Peers() []NodeID { + m.mtx.Lock() + defer m.mtx.Unlock() + + peers := []NodeID{} + for _, peer := range m.store.Ranked() { + peers = append(peers, peer.ID) + } + return peers +} + +// Scores returns the peer scores for all known peers, primarily for testing. +func (m *PeerManager) Scores() map[NodeID]PeerScore { + m.mtx.Lock() + defer m.mtx.Unlock() + + scores := map[NodeID]PeerScore{} + for _, peer := range m.store.Ranked() { + scores[peer.ID] = peer.Score() + } + return scores +} + +// Status returns the status for a peer, primarily for testing. +func (m *PeerManager) Status(id NodeID) PeerStatus { + m.mtx.Lock() + defer m.mtx.Unlock() + switch { + case m.ready[id]: + return PeerStatusUp + default: + return PeerStatusDown + } +} + +// findUpgradeCandidate looks for a lower-scored peer that we could evict +// to make room for the given peer. Returns an empty ID if none is found. +// If the peer is already being upgraded to, we return that same upgrade. +// The caller must hold the mutex lock. +func (m *PeerManager) findUpgradeCandidate(id NodeID, score PeerScore) NodeID { + for from, to := range m.upgrading { + if to == id { + return from + } + } + + ranked := m.store.Ranked() + for i := len(ranked) - 1; i >= 0; i-- { + candidate := ranked[i] + switch { + case candidate.Score() >= score: + return "" // no further peers can be scored lower, due to sorting + case !m.connected[candidate.ID]: + case m.evict[candidate.ID]: + case m.evicting[candidate.ID]: + case m.upgrading[candidate.ID] != "": + default: + return candidate.ID + } + } + return "" +} + +// retryDelay calculates a dial retry delay using exponential backoff, based on +// retry settings in PeerManagerOptions. If retries are disabled (i.e. +// MinRetryTime is 0), this returns retryNever (i.e. an infinite retry delay). +// The caller must hold the mutex lock (for m.rand which is not thread-safe). +func (m *PeerManager) retryDelay(failures uint32, persistent bool) time.Duration { + if failures == 0 { + return 0 + } + if m.options.MinRetryTime == 0 { + return retryNever + } + maxDelay := m.options.MaxRetryTime + if persistent && m.options.MaxRetryTimePersistent > 0 { + maxDelay = m.options.MaxRetryTimePersistent + } + + delay := m.options.MinRetryTime * time.Duration(math.Pow(2, float64(failures-1))) + if maxDelay > 0 && delay > maxDelay { + delay = maxDelay + } + if m.options.RetryTimeJitter > 0 { + delay += time.Duration(m.rand.Int63n(int64(m.options.RetryTimeJitter))) + } + return delay +} + +// GetHeight returns a peer's height, as reported via SetHeight, or 0 if the +// peer or height is unknown. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. 
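+//
+// A minimal usage sketch from a reactor, where peerID and height come from,
+// e.g., a received status message (illustrative):
+//
+//	if err := peerManager.SetHeight(peerID, height); err != nil {
+//		return err
+//	}
+//	// ... later, possibly from a different reactor ...
+//	peerHeight := peerManager.GetHeight(peerID)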
+func (m *PeerManager) GetHeight(peerID NodeID) int64 { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, _ := m.store.Get(peerID) + return peer.Height +} + +// SetHeight stores a peer's height, making it available via GetHeight. +// +// FIXME: This is a temporary workaround to share state between the consensus +// and mempool reactors, carried over from the legacy P2P stack. Reactors should +// not have dependencies on each other, instead tracking this themselves. +func (m *PeerManager) SetHeight(peerID NodeID, height int64) error { + m.mtx.Lock() + defer m.mtx.Unlock() + + peer, ok := m.store.Get(peerID) + if !ok { + peer = m.newPeerInfo(peerID) + } + peer.Height = height + return m.store.Set(peer) +} + +// peerStore stores information about peers. It is not thread-safe, assuming it +// is only used by PeerManager which handles concurrency control. This allows +// the manager to execute multiple operations atomically via its own mutex. +// +// The entire set of peers is kept in memory, for performance. It is loaded +// from disk on initialization, and any changes are written back to disk +// (without fsync, since we can afford to lose recent writes). +type peerStore struct { + db dbm.DB + peers map[NodeID]*peerInfo + ranked []*peerInfo // cache for Ranked(), nil invalidates cache +} + +// newPeerStore creates a new peer store, loading all persisted peers from the +// database into memory. +func newPeerStore(db dbm.DB) (*peerStore, error) { + if db == nil { + return nil, errors.New("no database provided") + } + store := &peerStore{db: db} + if err := store.loadPeers(); err != nil { + return nil, err + } + return store, nil +} + +// loadPeers loads all peers from the database into memory. +func (s *peerStore) loadPeers() error { + peers := map[NodeID]*peerInfo{} + + start, end := keyPeerInfoRange() + iter, err := s.db.Iterator(start, end) + if err != nil { + return err + } + defer iter.Close() + for ; iter.Valid(); iter.Next() { + // FIXME: We may want to tolerate failures here, by simply logging + // the errors and ignoring the faulty peer entries. + msg := new(p2pproto.PeerInfo) + if err := proto.Unmarshal(iter.Value(), msg); err != nil { + return fmt.Errorf("invalid peer Protobuf data: %w", err) + } + peer, err := peerInfoFromProto(msg) + if err != nil { + return fmt.Errorf("invalid peer data: %w", err) + } + peers[peer.ID] = peer + } + if iter.Error() != nil { + return iter.Error() + } + s.peers = peers + s.ranked = nil // invalidate cache if populated + return nil +} + +// Get fetches a peer. The boolean indicates whether the peer existed or not. +// The returned peer info is a copy, and can be mutated at will. +func (s *peerStore) Get(id NodeID) (peerInfo, bool) { + peer, ok := s.peers[id] + return peer.Copy(), ok +} + +// Set stores peer data. The input data will be copied, and can safely be reused +// by the caller. +func (s *peerStore) Set(peer peerInfo) error { + if err := peer.Validate(); err != nil { + return err + } + peer = peer.Copy() + + // FIXME: We may want to optimize this by avoiding saving to the database + // if there haven't been any changes to persisted fields. + bz, err := peer.ToProto().Marshal() + if err != nil { + return err + } + if err = s.db.Set(keyPeerInfo(peer.ID), bz); err != nil { + return err + } + + if current, ok := s.peers[peer.ID]; !ok || current.Score() != peer.Score() { + // If the peer is new, or its score changes, we invalidate the Ranked() cache. 
+ s.peers[peer.ID] = &peer + s.ranked = nil + } else { + // Otherwise, since s.ranked contains pointers to the old data and we + // want those pointers to remain valid with the new data, we have to + // update the existing pointer address. + *current = peer + } + + return nil +} + +// Delete deletes a peer, or does nothing if it does not exist. +func (s *peerStore) Delete(id NodeID) error { + if _, ok := s.peers[id]; !ok { + return nil + } + if err := s.db.Delete(keyPeerInfo(id)); err != nil { + return err + } + delete(s.peers, id) + s.ranked = nil + return nil +} + +// List retrieves all peers in an arbitrary order. The returned data is a copy, +// and can be mutated at will. +func (s *peerStore) List() []peerInfo { + peers := make([]peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + peers = append(peers, peer.Copy()) + } + return peers +} + +// Ranked returns a list of peers ordered by score (better peers first). Peers +// with equal scores are returned in an arbitrary order. The returned list must +// not be mutated or accessed concurrently by the caller, since it returns +// pointers to internal peerStore data for performance. +// +// Ranked is used to determine both which peers to dial, which ones to evict, +// and which ones to delete completely. +// +// FIXME: For now, we simply maintain a cache in s.ranked which is invalidated +// by setting it to nil, but if necessary we should use a better data structure +// for this (e.g. a heap or ordered map). +// +// FIXME: The scoring logic is currently very naïve, see peerInfo.Score(). +func (s *peerStore) Ranked() []*peerInfo { + if s.ranked != nil { + return s.ranked + } + s.ranked = make([]*peerInfo, 0, len(s.peers)) + for _, peer := range s.peers { + s.ranked = append(s.ranked, peer) + } + sort.Slice(s.ranked, func(i, j int) bool { + // FIXME: If necessary, consider precomputing scores before sorting, + // to reduce the number of Score() calls. + return s.ranked[i].Score() > s.ranked[j].Score() + }) + return s.ranked +} + +// Size returns the number of peers in the peer store. +func (s *peerStore) Size() int { + return len(s.peers) +} + +// peerInfo contains peer information stored in a peerStore. +type peerInfo struct { + ID NodeID + AddressInfo map[NodeAddress]*peerAddressInfo + LastConnected time.Time + + // These fields are ephemeral, i.e. not persisted to the database. + Persistent bool + Height int64 + FixedScore PeerScore // mainly for tests +} + +// peerInfoFromProto converts a Protobuf PeerInfo message to a peerInfo, +// erroring if the data is invalid. +func peerInfoFromProto(msg *p2pproto.PeerInfo) (*peerInfo, error) { + p := &peerInfo{ + ID: NodeID(msg.ID), + AddressInfo: map[NodeAddress]*peerAddressInfo{}, + } + if msg.LastConnected != nil { + p.LastConnected = *msg.LastConnected + } + for _, a := range msg.AddressInfo { + addressInfo, err := peerAddressInfoFromProto(a) + if err != nil { + return nil, err + } + p.AddressInfo[addressInfo.Address] = addressInfo + } + return p, p.Validate() +} + +// ToProto converts the peerInfo to p2pproto.PeerInfo for database storage. The +// Protobuf type only contains persisted fields, while ephemeral fields are +// discarded. The returned message may contain pointers to original data, since +// it is expected to be serialized immediately. 
+func (p *peerInfo) ToProto() *p2pproto.PeerInfo { + msg := &p2pproto.PeerInfo{ + ID: string(p.ID), + LastConnected: &p.LastConnected, + } + for _, addressInfo := range p.AddressInfo { + msg.AddressInfo = append(msg.AddressInfo, addressInfo.ToProto()) + } + if msg.LastConnected.IsZero() { + msg.LastConnected = nil + } + return msg +} + +// Copy returns a deep copy of the peer info. +func (p *peerInfo) Copy() peerInfo { + if p == nil { + return peerInfo{} + } + c := *p + for i, addressInfo := range c.AddressInfo { + addressInfoCopy := addressInfo.Copy() + c.AddressInfo[i] = &addressInfoCopy + } + return c +} + +// Score calculates a score for the peer. Higher-scored peers will be +// preferred over lower scores. +func (p *peerInfo) Score() PeerScore { + var score PeerScore + if p.FixedScore > 0 { + return p.FixedScore + } + if p.Persistent { + score += PeerScorePersistent + } + return score +} + +// Validate validates the peer info. +func (p *peerInfo) Validate() error { + if p.ID == "" { + return errors.New("no peer ID") + } + return nil +} + +// peerAddressInfo contains information and statistics about a peer address. +type peerAddressInfo struct { + Address NodeAddress + LastDialSuccess time.Time + LastDialFailure time.Time + DialFailures uint32 // since last successful dial +} + +// peerAddressInfoFromProto converts a Protobuf PeerAddressInfo message +// to a peerAddressInfo. +func peerAddressInfoFromProto(msg *p2pproto.PeerAddressInfo) (*peerAddressInfo, error) { + address, err := ParseNodeAddress(msg.Address) + if err != nil { + return nil, fmt.Errorf("invalid address %q: %w", address, err) + } + addressInfo := &peerAddressInfo{ + Address: address, + DialFailures: msg.DialFailures, + } + if msg.LastDialSuccess != nil { + addressInfo.LastDialSuccess = *msg.LastDialSuccess + } + if msg.LastDialFailure != nil { + addressInfo.LastDialFailure = *msg.LastDialFailure + } + return addressInfo, addressInfo.Validate() +} + +// ToProto converts the address into to a Protobuf message for serialization. +func (a *peerAddressInfo) ToProto() *p2pproto.PeerAddressInfo { + msg := &p2pproto.PeerAddressInfo{ + Address: a.Address.String(), + LastDialSuccess: &a.LastDialSuccess, + LastDialFailure: &a.LastDialFailure, + DialFailures: a.DialFailures, + } + if msg.LastDialSuccess.IsZero() { + msg.LastDialSuccess = nil + } + if msg.LastDialFailure.IsZero() { + msg.LastDialFailure = nil + } + return msg +} + +// Copy returns a copy of the address info. +func (a *peerAddressInfo) Copy() peerAddressInfo { + return *a +} + +// Validate validates the address info. +func (a *peerAddressInfo) Validate() error { + return a.Address.Validate() +} + +// Database key prefixes. +const ( + prefixPeerInfo int64 = 1 +) + +// keyPeerInfo generates a peerInfo database key. +func keyPeerInfo(id NodeID) []byte { + key, err := orderedcode.Append(nil, prefixPeerInfo, string(id)) + if err != nil { + panic(err) + } + return key +} + +// keyPeerInfoRange generates start/end keys for the entire peerInfo key range. 
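+//
+// The range is consumed with a database iterator, as in loadPeers():
+//
+//	start, end := keyPeerInfoRange()
+//	iter, err := db.Iterator(start, end)
+//	if err != nil {
+//		return err
+//	}
+//	defer iter.Close()
+//	for ; iter.Valid(); iter.Next() {
+//		// unmarshal iter.Value() into a p2pproto.PeerInfo
+//	}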
+func keyPeerInfoRange() ([]byte, []byte) { + start, err := orderedcode.Append(nil, prefixPeerInfo, "") + if err != nil { + panic(err) + } + end, err := orderedcode.Append(nil, prefixPeerInfo, orderedcode.Infinity) + if err != nil { + panic(err) + } + return start, end +} diff --git a/p2p/peermanager_test.go b/p2p/peermanager_test.go new file mode 100644 index 000000000..76d8c21a5 --- /dev/null +++ b/p2p/peermanager_test.go @@ -0,0 +1,1580 @@ +package p2p_test + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/fortytw2/leaktest" + "github.com/stretchr/testify/require" + dbm "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/p2p" +) + +// FIXME: We should probably have some randomized property-based tests for the +// PeerManager too, which runs a bunch of random operations with random peers +// and ensures certain invariants always hold. The logic can be complex, with +// many interactions, and it's hard to cover all scenarios with handwritten +// tests. + +func TestPeerManagerOptions_Validate(t *testing.T) { + nodeID := p2p.NodeID("00112233445566778899aabbccddeeff00112233") + + testcases := map[string]struct { + options p2p.PeerManagerOptions + ok bool + }{ + "zero options is valid": {p2p.PeerManagerOptions{}, true}, + + // PersistentPeers + "valid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899aabbccddeeff00112233"}, + }, true}, + "invalid PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }, false}, + "uppercase PersistentPeers NodeID": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"00112233445566778899AABBCCDDEEFF00112233"}, + }, false}, + "PersistentPeers at MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 3, + }, true}, + "PersistentPeers above MaxConnected": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + }, false}, + "PersistentPeers above MaxConnected below MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{nodeID, nodeID, nodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 2, + }, false}, + + // MaxPeers + "MaxPeers without MaxConnected": {p2p.PeerManagerOptions{ + MaxPeers: 3, + }, false}, + "MaxPeers below MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 2, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, false}, + "MaxPeers at MaxConnected+MaxConnectedUpgrade": {p2p.PeerManagerOptions{ + MaxPeers: 3, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }, true}, + + // MaxRetryTime + "MaxRetryTime below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTime: 5 * time.Second, + }, false}, + "MaxRetryTime at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTime: 5 * time.Second, + }, true}, + "MaxRetryTime without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTime: 5 * time.Second, + }, false}, + + // MaxRetryTimePersistent + "MaxRetryTimePersistent below MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 7 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + "MaxRetryTimePersistent at MinRetryTime": {p2p.PeerManagerOptions{ + MinRetryTime: 5 * time.Second, + MaxRetryTimePersistent: 5 * time.Second, + }, true}, + "MaxRetryTimePersistent without MinRetryTime": {p2p.PeerManagerOptions{ + MaxRetryTimePersistent: 5 * time.Second, + }, false}, + } + for name, tc := range 
testcases { + tc := tc + t.Run(name, func(t *testing.T) { + err := tc.options.Validate() + if tc.ok { + require.NoError(t, err) + } else { + require.Error(t, err) + } + }) + } +} + +func TestNewPeerManager(t *testing.T) { + + // Invalid options should error. + _, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{"foo"}, + }) + require.Error(t, err) + + // Invalid database should error. + _, err = p2p.NewPeerManager(nil, p2p.PeerManagerOptions{}) + require.Error(t, err) + + // Zero options should be valid. + _, err = p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) +} + +func TestNewPeerManager_Persistence(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: aID}, + } + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"}, + {Protocol: "memory", NodeID: bID}, + } + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80}, + {Protocol: "memory", NodeID: cID}, + } + + // Create an initial peer manager and add the peers. + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{bID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + for _, addr := range append(append(aAddresses, bAddresses...), cAddresses...) { + require.NoError(t, peerManager.Add(addr)) + } + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: p2p.PeerScorePersistent, + bID: 1, + cID: 0, + }, peerManager.Scores()) + + peerManager.Close() + + // Creating a new peer manager with the same database should retain the + // peers, but they should have updated scores from the new PersistentPeers + // configuration. + peerManager, err = p2p.NewPeerManager(db, p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{bID}, + PeerScores: map[p2p.NodeID]p2p.PeerScore{cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + require.ElementsMatch(t, bAddresses, peerManager.Addresses(bID)) + require.ElementsMatch(t, cAddresses, peerManager.Addresses(cID)) + require.Equal(t, map[p2p.NodeID]p2p.PeerScore{ + aID: 0, + bID: p2p.PeerScorePersistent, + cID: 1, + }, peerManager.Scores()) +} + +func TestPeerManager_Add(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + bID := p2p.NodeID(strings.Repeat("b", 40)) + cID := p2p.NodeID(strings.Repeat("c", 40)) + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PersistentPeers: []p2p.NodeID{aID, cID}, + MaxPeers: 2, + MaxConnected: 2, + }) + require.NoError(t, err) + + // Adding a couple of addresses should work. 
+ aAddresses := []p2p.NodeAddress{ + {Protocol: "tcp", NodeID: aID, Hostname: "localhost"}, + {Protocol: "memory", NodeID: aID}, + } + for _, addr := range aAddresses { + err = peerManager.Add(addr) + require.NoError(t, err) + } + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a different peer should be fine. + bAddress := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "localhost"} + require.NoError(t, peerManager.Add(bAddress)) + require.Equal(t, []p2p.NodeAddress{bAddress}, peerManager.Addresses(bID)) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding an existing address again should be a noop. + require.NoError(t, peerManager.Add(aAddresses[0])) + require.ElementsMatch(t, aAddresses, peerManager.Addresses(aID)) + + // Adding a third peer with MaxPeers=2 should cause bID, which is + // the lowest-scored peer (not in PersistentPeers), to be removed. + require.NoError(t, peerManager.Add(p2p.NodeAddress{ + Protocol: "tcp", NodeID: cID, Hostname: "localhost"})) + require.ElementsMatch(t, []p2p.NodeID{aID, cID}, peerManager.Peers()) + + // Adding an invalid address should error. + require.Error(t, peerManager.Add(p2p.NodeAddress{Path: "foo"})) +} + +func TestPeerManager_DialNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Add an address. DialNext should return it. + require.NoError(t, peerManager.Add(a)) + address, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, address) + + // Since there are no more undialed peers, the next call should block + // until it times out. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.DialNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_DialNext_Retry(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + options := p2p.PeerManagerOptions{ + MinRetryTime: 100 * time.Millisecond, + MaxRetryTime: 500 * time.Millisecond, + } + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), options) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + + // Do five dial retries (six dials total). The retry time should double for + // each failure. At the forth retry, MaxRetryTime should kick in. 
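+	// With MinRetryTime=100ms and MaxRetryTime=500ms (and no jitter), the
+	// expected wait before each dial is roughly:
+	//
+	//	dial:  1  2      3      4      5      6
+	//	wait:  0  100ms  200ms  400ms  500ms  500ms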
+ ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + for i := 0; i <= 5; i++ { + start := time.Now() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + elapsed := time.Since(start).Round(time.Millisecond) + + switch i { + case 0: + require.LessOrEqual(t, elapsed, options.MinRetryTime) + case 1: + require.GreaterOrEqual(t, elapsed, options.MinRetryTime) + case 2: + require.GreaterOrEqual(t, elapsed, 2*options.MinRetryTime) + case 3: + require.GreaterOrEqual(t, elapsed, 4*options.MinRetryTime) + case 4, 5: + require.GreaterOrEqual(t, elapsed, options.MaxRetryTime) + require.LessOrEqual(t, elapsed, 8*options.MinRetryTime) + default: + require.Fail(t, "unexpected retry") + } + + require.NoError(t, peerManager.DialFailed(a)) + } +} + +func TestPeerManager_DialNext_WakeOnAdd(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Spawn a goroutine to add a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(a)) + }() + + // This will block until peer is added above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err := peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailed(t *testing.T) { + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + // Add and dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Add b. We shouldn't be able to dial it, due to MaxConnected. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Spawn a goroutine to fail a's dial attempt. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.DialFailed(a)) + }() + + // This should make b available for dialing (not a, retries are disabled). + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, b, dial) +} + +func TestPeerManager_DialNext_WakeOnDialFailedRetry(t *testing.T) { + options := p2p.PeerManagerOptions{MinRetryTime: 200 * time.Millisecond} + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), options) + require.NoError(t, err) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + // Add a, dial it, and mark it a failure. This will start a retry timer. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(dial)) + failed := time.Now() + + // The retry timer should unblock DialNext and make a available again after + // the retry time passes. 
+ ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) + require.GreaterOrEqual(t, time.Since(failed), options.MinRetryTime) +} + +func TestPeerManager_DialNext_WakeOnDisconnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + err = peerManager.Add(a) + require.NoError(t, err) + err = peerManager.Accepted(a.NodeID) + require.NoError(t, err) + + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Disconnected(a.NodeID)) + }() + + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + dial, err = peerManager.DialNext(ctx) + require.NoError(t, err) + require.Equal(t, a, dial) +} + +func TestPeerManager_TryDialNext_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // At this point, adding c will not allow dialing it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + e := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("e", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 0, + b.NodeID: 1, + c.NodeID: 2, + d.NodeID: 3, + e.NodeID: 0, + }, + PersistentPeers: []p2p.NodeID{c.NodeID, d.NodeID}, + MaxConnected: 2, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Even though we are at capacity, we should be allowed to dial c for an + // upgrade of a, since it's higher-scored. 
+ require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // However, since we're using all upgrade slots now, we can't add and dial + // d, even though it's also higher-scored. + require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // We go through with c's upgrade. + require.NoError(t, peerManager.Dialed(c)) + + // Still can't dial d. + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Now, if we disconnect a, we should be allowed to dial d because we have a + // free upgrade slot. + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + + // However, if we disconnect b (such that only c and d are connected), we + // should not be allowed to dial e even though there are upgrade slots, + // because there are no lower-scored nodes that can be upgraded. + require.NoError(t, peerManager.Disconnected(b.NodeID)) + require.NoError(t, peerManager.Add(e)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_TryDialNext_UpgradeReservesPeer(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Add a and connect to it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, because a is the only connected + // peer that can be upgraded, and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) +} + +func TestPeerManager_TryDialNext_DialingConnected(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + a := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "localhost"} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + b := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Add a and dial it. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Adding a's TCP address will not dispense a, since it's already dialing. + require.NoError(t, peerManager.Add(aTCP)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) + + // Marking a as dialed will still not dispense it. 
+ require.NoError(t, peerManager.Dialed(a))
+ dial, err = peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Zero(t, dial)
+
+ // Adding b and accepting a connection from it will not dispense it either.
+ require.NoError(t, peerManager.Add(b))
+ require.NoError(t, peerManager.Accepted(bID))
+ dial, err = peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Zero(t, dial)
+}
+
+func TestPeerManager_TryDialNext_Multiple(t *testing.T) {
+ aID := p2p.NodeID(strings.Repeat("a", 40))
+ bID := p2p.NodeID(strings.Repeat("b", 40))
+ addresses := []p2p.NodeAddress{
+ {Protocol: "memory", NodeID: aID},
+ {Protocol: "memory", NodeID: bID},
+ {Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1"},
+ {Protocol: "tcp", NodeID: bID, Hostname: "::1"},
+ }
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ for _, address := range addresses {
+ require.NoError(t, peerManager.Add(address))
+ }
+
+ // All addresses should be dispensed as long as dialing them has failed.
+ dial := []p2p.NodeAddress{}
+ for range addresses {
+ address, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.NotZero(t, address)
+ require.NoError(t, peerManager.DialFailed(address))
+ dial = append(dial, address)
+ }
+ require.ElementsMatch(t, dial, addresses)
+
+ address, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Zero(t, address)
+}
+
+func TestPeerManager_DialFailed(t *testing.T) {
+ // DialFailed is tested through other tests; here we just check a few basic
+ // things, e.g. reporting unknown addresses.
+ aID := p2p.NodeID(strings.Repeat("a", 40))
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: aID}
+ bID := p2p.NodeID(strings.Repeat("b", 40))
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: bID}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ require.NoError(t, peerManager.Add(a))
+
+ // Dialing and then calling DialFailed with a different address (same
+ // NodeID) should unmark the peer as dialing and allow us to dial the
+ // other address again, but not register the failed address.
+ dial, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, a, dial)
+ require.NoError(t, peerManager.DialFailed(p2p.NodeAddress{
+ Protocol: "tcp", NodeID: aID, Hostname: "localhost"}))
+ require.Equal(t, []p2p.NodeAddress{a}, peerManager.Addresses(aID))
+
+ dial, err = peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, a, dial)
+
+ // Calling DialFailed on the same address twice should be fine.
+ require.NoError(t, peerManager.DialFailed(a))
+ require.NoError(t, peerManager.DialFailed(a))
+
+ // DialFailed on an unknown peer shouldn't error or add it.
+ require.NoError(t, peerManager.DialFailed(b))
+ require.Equal(t, []p2p.NodeID{aID}, peerManager.Peers())
+}
+
+func TestPeerManager_DialFailed_UnreservePeer(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+ c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{
+ PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1},
+ MaxConnected: 1,
+ MaxConnectedUpgrade: 2,
+ })
+ require.NoError(t, err)
+
+ // Add a and connect to it.
+ require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.Dialed(a)) + + // Add b and start dialing it. This will claim a for upgrading. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + // Adding c and dialing it will fail, even though it could upgrade a and we + // have free upgrade slots, because a is the only connected peer that can be + // upgraded and b is already trying to upgrade it. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // Failing b's dial will now make c available for dialing. + require.NoError(t, peerManager.DialFailed(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) +} + +func TestPeerManager_Dialed_Connected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking a as dialed twice should error. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Dialed(a)) + require.Error(t, peerManager.Dialed(a)) + + // Accepting a connection from b and then trying to mark it as dialed should fail. + require.NoError(t, peerManager.Add(b)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + + require.NoError(t, peerManager.Accepted(b.NodeID)) + require.Error(t, peerManager.Dialed(b)) +} + +func TestPeerManager_Dialed_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + }) + require.NoError(t, err) + + // Start to dial a. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + // Marking b as dialed in the meanwhile (even without TryDialNext) + // should be fine. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Completing the dial for a should now error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{c.NodeID: 1, d.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a and b is fine. 
+ require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Starting an upgrade of c should be fine. + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + require.NoError(t, peerManager.Dialed(c)) + + // Trying to mark d dialed should fail, since there are no more upgrade + // slots and a/b haven't been evicted yet. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Dialed(d)) +} + +func TestPeerManager_Dialed_Unknown(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Marking an unknown node as dialed should error. + require.Error(t, peerManager.Dialed(a)) +} + +func TestPeerManager_Dialed_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 2, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1, c.NodeID: 1}, + }) + require.NoError(t, err) + + // Dialing a is fine. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Upgrading it with b should work, since b has a higher score. + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + + // a hasn't been evicted yet, but c shouldn't be allowed to upgrade anyway + // since it's about to be evicted. + require.NoError(t, peerManager.Add(c)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Empty(t, dial) + + // a should now be evicted. + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Dialed_UpgradeEvenLower(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + a.NodeID: 3, + b.NodeID: 2, + c.NodeID: 10, + d.NodeID: 1, + }, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Dialed(b)) + + // Start an upgrade with c, which should pick b to upgrade (since it + // has score 2). + require.NoError(t, peerManager.Add(c)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, c, dial) + + // In the meanwhile, a disconnects and d connects. d is even lower-scored + // than b (1 vs 2), which is currently being upgraded. 
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+ require.NoError(t, peerManager.Add(d))
+ require.NoError(t, peerManager.Accepted(d.NodeID))
+
+ // Once c completes the upgrade of b, it should instead evict d,
+ // since it has an even lower score.
+ require.NoError(t, peerManager.Dialed(c))
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Equal(t, d.NodeID, evict)
+}
+
+func TestPeerManager_Dialed_UpgradeNoEvict(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+ c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{
+ MaxConnected: 2,
+ MaxConnectedUpgrade: 1,
+ PeerScores: map[p2p.NodeID]p2p.PeerScore{
+ a.NodeID: 1,
+ b.NodeID: 2,
+ c.NodeID: 3,
+ },
+ })
+ require.NoError(t, err)
+
+ // Connect to a and b.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Dialed(a))
+
+ require.NoError(t, peerManager.Add(b))
+ require.NoError(t, peerManager.Dialed(b))
+
+ // Start an upgrade with c, which should pick a to upgrade.
+ require.NoError(t, peerManager.Add(c))
+ dial, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, c, dial)
+
+ // In the meantime, b disconnects.
+ require.NoError(t, peerManager.Disconnected(b.NodeID))
+
+ // Once c completes the upgrade of a, there is no longer a need to
+ // evict anything since we're at capacity.
+ require.NoError(t, peerManager.Dialed(c))
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Zero(t, evict)
+}
+
+func TestPeerManager_Accepted(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+ c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))}
+ d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ // Accepting a connection from a known peer should work.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+
+ // Accepting a connection from an already accepted peer should error.
+ require.Error(t, peerManager.Accepted(a.NodeID))
+
+ // Accepting a connection from an unknown peer should work and register it.
+ require.NoError(t, peerManager.Accepted(b.NodeID))
+ require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers())
+
+ // Accepting a connection from a peer that's being dialed should work, and
+ // should cause the dial to fail.
+ require.NoError(t, peerManager.Add(c))
+ dial, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, c, dial)
+ require.NoError(t, peerManager.Accepted(c.NodeID))
+ require.Error(t, peerManager.Dialed(c))
+
+ // Accepting a connection from a peer that's been dialed should fail.
+ require.NoError(t, peerManager.Add(d)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, d, dial) + require.NoError(t, peerManager.Dialed(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnected(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 2, + }) + require.NoError(t, err) + + // Connect to a and b. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should now fail. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) +} + +func TestPeerManager_Accepted_MaxConnectedUpgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + d := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("d", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + c.NodeID: 1, + d.NodeID: 2, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 1, + }) + require.NoError(t, err) + + // Dial a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Dialed(a)) + + // Accepting b should fail, since it's not an upgrade over a. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Accepting c should work, since it upgrades a. + require.NoError(t, peerManager.Add(c)) + require.NoError(t, peerManager.Accepted(c.NodeID)) + + // a still hasn't been evicted, so accepting b should still fail. + require.NoError(t, peerManager.Add(b)) + require.Error(t, peerManager.Accepted(b.NodeID)) + + // Also, accepting d should fail, since all upgrade slots are full. + require.NoError(t, peerManager.Add(d)) + require.Error(t, peerManager.Accepted(d.NodeID)) +} + +func TestPeerManager_Accepted_Upgrade(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{ + b.NodeID: 1, + c.NodeID: 1, + }, + MaxConnected: 1, + MaxConnectedUpgrade: 2, + }) + require.NoError(t, err) + + // Accept a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + + // Accepting b should work, since it upgrades a. + require.NoError(t, peerManager.Add(b)) + require.NoError(t, peerManager.Accepted(b.NodeID)) + + // c cannot get accepted, since a has been upgraded by b. + require.NoError(t, peerManager.Add(c)) + require.Error(t, peerManager.Accepted(c.NodeID)) + + // This should cause a to get evicted. 
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+
+ // c still cannot get accepted, since it's not scored above b.
+ require.Error(t, peerManager.Accepted(c.NodeID))
+}
+
+func TestPeerManager_Accepted_UpgradeDialing(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+ c := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("c", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{
+ PeerScores: map[p2p.NodeID]p2p.PeerScore{
+ b.NodeID: 1,
+ c.NodeID: 1,
+ },
+ MaxConnected: 1,
+ MaxConnectedUpgrade: 2,
+ })
+ require.NoError(t, err)
+
+ // Accept a.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+
+ // Start a dial upgrade from a to b.
+ require.NoError(t, peerManager.Add(b))
+ dial, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, b, dial)
+
+ // a has already been claimed for upgrading by b, so accepting
+ // c should fail since there's no one else to upgrade.
+ require.NoError(t, peerManager.Add(c))
+ require.Error(t, peerManager.Accepted(c.NodeID))
+
+ // However, if b connects to us while we're also trying to upgrade to it via
+ // dialing, then we accept the incoming connection as an upgrade.
+ require.NoError(t, peerManager.Accepted(b.NodeID))
+
+ // This should cause a to get evicted, and the dial upgrade to fail.
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+ require.Error(t, peerManager.Dialed(b))
+}
+
+func TestPeerManager_Ready(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ sub := peerManager.Subscribe()
+ defer sub.Close()
+
+ // Accepting a connection from a should still leave it with status down.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID))
+
+ // Marking a as ready should transition it to PeerStatusUp and send an update.
+ require.NoError(t, peerManager.Ready(a.NodeID))
+ require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID))
+ require.Equal(t, p2p.PeerUpdate{
+ NodeID: a.NodeID,
+ Status: p2p.PeerStatusUp,
+ }, <-sub.Updates())
+
+ // Marking an unconnected peer as ready should do nothing.
+ require.NoError(t, peerManager.Add(b))
+ require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID))
+ require.NoError(t, peerManager.Ready(b.NodeID))
+ require.Equal(t, p2p.PeerStatusDown, peerManager.Status(b.NodeID))
+ require.Empty(t, sub.Updates())
+}
+
+// See TryEvictNext for most tests; this just tests blocking behavior.
+func TestPeerManager_EvictNext(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Since there are no peers to evict, EvictNext should block until timeout. + timeoutCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) + + // Erroring the peer will return it from EvictNext(). + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err := peerManager.EvictNext(timeoutCtx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) + + // Since there are no more peers to evict, the next call should block. + timeoutCtx, cancel = context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + _, err = peerManager.EvictNext(timeoutCtx) + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) +} + +func TestPeerManager_EvictNext_WakeOnError(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to error a peer after a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + }() + + // This will block until peer errors above. + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + evict, err := peerManager.EvictNext(ctx) + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_EvictNext_WakeOnUpgradeDialed(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MaxConnected: 1, + MaxConnectedUpgrade: 1, + PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1}, + }) + require.NoError(t, err) + + // Connect a. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + + // Spawn a goroutine to upgrade to b with a delay. + go func() { + time.Sleep(200 * time.Millisecond) + require.NoError(t, peerManager.Add(b)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, b, dial) + require.NoError(t, peerManager.Dialed(b)) + }() + + // This will block until peer is upgraded above. 
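+ // (With MaxConnected=1, dialing the higher-scored b upgrades the slot held
+ // by a, which schedules a for eviction and wakes the EvictNext call.)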
+ ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+ defer cancel()
+ evict, err := peerManager.EvictNext(ctx)
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+}
+
+func TestPeerManager_EvictNext_WakeOnUpgradeAccepted(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+ b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{
+ MaxConnected: 1,
+ MaxConnectedUpgrade: 1,
+ PeerScores: map[p2p.NodeID]p2p.PeerScore{b.NodeID: 1},
+ })
+ require.NoError(t, err)
+
+ // Connect a.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.NoError(t, peerManager.Ready(a.NodeID))
+
+ // Spawn a goroutine to upgrade to b with a delay.
+ go func() {
+ time.Sleep(200 * time.Millisecond)
+ require.NoError(t, peerManager.Accepted(b.NodeID))
+ }()
+
+ // This will block until peer is upgraded above.
+ ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+ defer cancel()
+ evict, err := peerManager.EvictNext(ctx)
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+}
+
+func TestPeerManager_TryEvictNext(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ require.NoError(t, peerManager.Add(a))
+
+ // Nothing is evicted with no peers connected.
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Zero(t, evict)
+
+ // Connecting to a won't evict anything either.
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.NoError(t, peerManager.Ready(a.NodeID))
+
+ // But if a errors, it should be evicted.
+ require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo")))
+ evict, err = peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+
+ // While a is being evicted (before disconnect), it shouldn't get evicted again.
+ evict, err = peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Zero(t, evict)
+
+ require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo")))
+ evict, err = peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Zero(t, evict)
+}
+
+func TestPeerManager_Disconnected(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ sub := peerManager.Subscribe()
+ defer sub.Close()
+
+ // Disconnecting an unknown peer does nothing.
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+ require.Empty(t, peerManager.Peers())
+ require.Empty(t, sub.Updates())
+
+ // Disconnecting an accepted non-ready peer does not send a status update.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+ require.Empty(t, sub.Updates())
+
+ // Disconnecting a ready peer sends a status update.
+ require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + require.Equal(t, p2p.PeerStatusUp, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusUp, + }, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.Equal(t, p2p.PeerStatusDown, peerManager.Status(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{ + NodeID: a.NodeID, + Status: p2p.PeerStatusDown, + }, <-sub.Updates()) + + // Disconnecting a dialing peer does not unmark it as dialing, to avoid + // dialing it multiple times in parallel. + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + dial, err = peerManager.TryDialNext() + require.NoError(t, err) + require.Zero(t, dial) +} + +func TestPeerManager_Errored(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Erroring an unknown peer does nothing. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + require.Empty(t, peerManager.Peers()) + evict, err := peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // Erroring a known peer does nothing, and won't evict it later, + // even when it connects. + require.NoError(t, peerManager.Add(a)) + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.NoError(t, peerManager.Ready(a.NodeID)) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Zero(t, evict) + + // However, erroring once connected will evict it. + require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo"))) + evict, err = peerManager.TryEvictNext() + require.NoError(t, err) + require.Equal(t, a.NodeID, evict) +} + +func TestPeerManager_Subscribe(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // This tests all subscription events for full peer lifecycles. + sub := peerManager.Subscribe() + defer sub.Close() + + require.NoError(t, peerManager.Add(a)) + require.Empty(t, sub.Updates()) + + // Inbound connection. + require.NoError(t, peerManager.Accepted(a.NodeID)) + require.Empty(t, sub.Updates()) + + require.NoError(t, peerManager.Ready(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates()) + + require.NoError(t, peerManager.Disconnected(a.NodeID)) + require.NotEmpty(t, sub.Updates()) + require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates()) + + // Outbound connection with peer error and eviction. 
+ dial, err := peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, a, dial)
+ require.Empty(t, sub.Updates())
+
+ require.NoError(t, peerManager.Dialed(a))
+ require.Empty(t, sub.Updates())
+
+ require.NoError(t, peerManager.Ready(a.NodeID))
+ require.NotEmpty(t, sub.Updates())
+ require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates())
+
+ require.NoError(t, peerManager.Errored(a.NodeID, errors.New("foo")))
+ require.Empty(t, sub.Updates())
+
+ evict, err := peerManager.TryEvictNext()
+ require.NoError(t, err)
+ require.Equal(t, a.NodeID, evict)
+
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+ require.NotEmpty(t, sub.Updates())
+ require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}, <-sub.Updates())
+
+ // Outbound connection with dial failure.
+ dial, err = peerManager.TryDialNext()
+ require.NoError(t, err)
+ require.Equal(t, a, dial)
+ require.Empty(t, sub.Updates())
+
+ require.NoError(t, peerManager.DialFailed(a))
+ require.Empty(t, sub.Updates())
+}
+
+func TestPeerManager_Subscribe_Close(t *testing.T) {
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ sub := peerManager.Subscribe()
+ defer sub.Close()
+
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.Empty(t, sub.Updates())
+
+ require.NoError(t, peerManager.Ready(a.NodeID))
+ require.NotEmpty(t, sub.Updates())
+ require.Equal(t, p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}, <-sub.Updates())
+
+ // Closing the subscription should not send us the disconnected update.
+ sub.Close()
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+ require.Empty(t, sub.Updates())
+}
+
+func TestPeerManager_Subscribe_Broadcast(t *testing.T) {
+ t.Cleanup(leaktest.Check(t))
+
+ a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+
+ peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+ require.NoError(t, err)
+
+ s1 := peerManager.Subscribe()
+ defer s1.Close()
+ s2 := peerManager.Subscribe()
+ defer s2.Close()
+ s3 := peerManager.Subscribe()
+ defer s3.Close()
+
+ // Connecting to a peer should send updates on all subscriptions.
+ require.NoError(t, peerManager.Add(a))
+ require.NoError(t, peerManager.Accepted(a.NodeID))
+ require.NoError(t, peerManager.Ready(a.NodeID))
+
+ expectUp := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusUp}
+ require.NotEmpty(t, s1.Updates())
+ require.Equal(t, expectUp, <-s1.Updates())
+ require.NotEmpty(t, s2.Updates())
+ require.Equal(t, expectUp, <-s2.Updates())
+ require.NotEmpty(t, s3.Updates())
+ require.Equal(t, expectUp, <-s3.Updates())
+
+ // We now close s2. Disconnecting the peer should only send updates
+ // on s1 and s3.
+ s2.Close()
+ require.NoError(t, peerManager.Disconnected(a.NodeID))
+
+ expectDown := p2p.PeerUpdate{NodeID: a.NodeID, Status: p2p.PeerStatusDown}
+ require.NotEmpty(t, s1.Updates())
+ require.Equal(t, expectDown, <-s1.Updates())
+ require.Empty(t, s2.Updates())
+ require.NotEmpty(t, s3.Updates())
+ require.Equal(t, expectDown, <-s3.Updates())
+}
+
+func TestPeerManager_Close(t *testing.T) {
+ // leaktest will check that spawned goroutines are closed.
+ t.Cleanup(leaktest.CheckTimeout(t, 1*time.Second)) + + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + MinRetryTime: 10 * time.Second, + }) + require.NoError(t, err) + + // This subscription isn't closed, but PeerManager.Close() + // should reap the spawned goroutine. + _ = peerManager.Subscribe() + + // This dial failure will start a retry timer for 10 seconds, which + // should be reaped. + require.NoError(t, peerManager.Add(a)) + dial, err := peerManager.TryDialNext() + require.NoError(t, err) + require.Equal(t, a, dial) + require.NoError(t, peerManager.DialFailed(a)) + + // This should clean up the goroutines. + peerManager.Close() +} + +func TestPeerManager_Advertise(t *testing.T) { + aID := p2p.NodeID(strings.Repeat("a", 40)) + aTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: aID, Hostname: "127.0.0.1", Port: 26657, Path: "/path"} + aMem := p2p.NodeAddress{Protocol: "memory", NodeID: aID} + + bID := p2p.NodeID(strings.Repeat("b", 40)) + bTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: bID, Hostname: "b10c::1", Port: 26657, Path: "/path"} + bMem := p2p.NodeAddress{Protocol: "memory", NodeID: bID} + + cID := p2p.NodeID(strings.Repeat("c", 40)) + cTCP := p2p.NodeAddress{Protocol: "tcp", NodeID: cID, Hostname: "host.domain", Port: 80} + cMem := p2p.NodeAddress{Protocol: "memory", NodeID: cID} + + dID := p2p.NodeID(strings.Repeat("d", 40)) + + // Create an initial peer manager and add the peers. + peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{ + PeerScores: map[p2p.NodeID]p2p.PeerScore{aID: 3, bID: 2, cID: 1}, + }) + require.NoError(t, err) + defer peerManager.Close() + + require.NoError(t, peerManager.Add(aTCP)) + require.NoError(t, peerManager.Add(aMem)) + require.NoError(t, peerManager.Add(bTCP)) + require.NoError(t, peerManager.Add(bMem)) + require.NoError(t, peerManager.Add(cTCP)) + require.NoError(t, peerManager.Add(cMem)) + + // d should get all addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(dID, 100)) + + // a should not get its own addresses. + require.ElementsMatch(t, []p2p.NodeAddress{ + bTCP, bMem, cTCP, cMem, + }, peerManager.Advertise(aID, 100)) + + // Asking for 0 addresses should return, well, 0. + require.Empty(t, peerManager.Advertise(aID, 0)) + + // Asking for 2 addresses should get the highest-rated ones, i.e. a. + require.ElementsMatch(t, []p2p.NodeAddress{ + aTCP, aMem, + }, peerManager.Advertise(dID, 2)) +} + +func TestPeerManager_SetHeight_GetHeight(t *testing.T) { + a := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("a", 40))} + b := p2p.NodeAddress{Protocol: "memory", NodeID: p2p.NodeID(strings.Repeat("b", 40))} + + db := dbm.NewMemDB() + peerManager, err := p2p.NewPeerManager(db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + // Getting a height should default to 0, for unknown peers and + // for known peers without height. + require.NoError(t, peerManager.Add(a)) + require.EqualValues(t, 0, peerManager.GetHeight(a.NodeID)) + require.EqualValues(t, 0, peerManager.GetHeight(b.NodeID)) + + // Setting a height should work for a known node. + require.NoError(t, peerManager.SetHeight(a.NodeID, 3)) + require.EqualValues(t, 3, peerManager.GetHeight(a.NodeID)) + + // Setting a height should add an unknown node. 
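+ // (only a is known before b's height is set)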
+ require.Equal(t, []p2p.NodeID{a.NodeID}, peerManager.Peers()) + require.NoError(t, peerManager.SetHeight(b.NodeID, 7)) + require.EqualValues(t, 7, peerManager.GetHeight(b.NodeID)) + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + + // The heights should not be persisted. + peerManager.Close() + peerManager, err = p2p.NewPeerManager(db, p2p.PeerManagerOptions{}) + require.NoError(t, err) + + require.ElementsMatch(t, []p2p.NodeID{a.NodeID, b.NodeID}, peerManager.Peers()) + require.Zero(t, peerManager.GetHeight(a.NodeID)) + require.Zero(t, peerManager.GetHeight(b.NodeID)) +} diff --git a/p2p/pex/reactor.go b/p2p/pex/reactor.go index 4762260b0..52eccbe68 100644 --- a/p2p/pex/reactor.go +++ b/p2p/pex/reactor.go @@ -30,7 +30,7 @@ type ReactorV2 struct { peerManager *p2p.PeerManager pexCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} } @@ -39,7 +39,7 @@ func NewReactorV2( logger log.Logger, peerManager *p2p.PeerManager, pexCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, ) *ReactorV2 { r := &ReactorV2{ peerManager: peerManager, @@ -181,9 +181,8 @@ func (r *ReactorV2) processPexCh() { if err := r.handleMessage(r.pexCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID(), "envelope", envelope, "err", err) r.pexCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -197,11 +196,11 @@ func (r *ReactorV2) processPexCh() { // processPeerUpdate processes a PeerUpdate. For added peers, PeerStatusUp, we // send a request for addresses. func (r *ReactorV2) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) if peerUpdate.Status == p2p.PeerStatusUp { r.pexCh.Out() <- p2p.Envelope{ - To: peerUpdate.PeerID, + To: peerUpdate.NodeID, Message: &protop2p.PexRequest{}, } } diff --git a/p2p/router.go b/p2p/router.go index 669c96265..379dfc9f6 100644 --- a/p2p/router.go +++ b/p2p/router.go @@ -255,13 +255,10 @@ func (r *Router) routeChannel(channel *Channel) { if !ok { return } - // FIXME: We just disconnect the peer for now - r.logger.Error("peer error, disconnecting", "peer", peerError.PeerID, "err", peerError.Err) - r.peerMtx.RLock() - peerQueue, ok := r.peerQueues[peerError.PeerID] - r.peerMtx.RUnlock() - if ok { - peerQueue.close() + // FIXME: We just evict the peer for now. 
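+ // Reporting the error via Errored marks the peer for eviction in the
+ // peer manager (surfaced through EvictNext/TryEvictNext).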
+ r.logger.Error("peer error, evicting", "peer", peerError.NodeID, "err", peerError.Err) + if err := r.peerManager.Errored(peerError.NodeID, peerError.Err); err != nil { + r.logger.Error("failed to report peer error", "peer", peerError.NodeID, "err", err) } case <-channel.Done(): @@ -338,7 +335,6 @@ func (r *Router) acceptPeers(transport Transport) { r.peerMtx.Lock() r.peerQueues[peerInfo.NodeID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerInfo.NodeID) defer func() { r.peerMtx.Lock() @@ -350,6 +346,11 @@ func (r *Router) acceptPeers(transport Transport) { } }() + if err := r.peerManager.Ready(peerInfo.NodeID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", peerInfo.NodeID, "err", err) + return + } + r.routePeer(peerInfo.NodeID, conn, queue) }() } @@ -359,7 +360,7 @@ func (r *Router) acceptPeers(transport Transport) { func (r *Router) dialPeers() { ctx := r.stopCtx() for { - peerID, address, err := r.peerManager.DialNext(ctx) + address, err := r.peerManager.DialNext(ctx) switch err { case nil: case context.Canceled: @@ -371,12 +372,13 @@ func (r *Router) dialPeers() { } go func() { + peerID := address.NodeID conn, err := r.dialPeer(ctx, address) if errors.Is(err, context.Canceled) { return } else if err != nil { r.logger.Error("failed to dial peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { + if err = r.peerManager.DialFailed(address); err != nil { r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) } return @@ -388,13 +390,13 @@ func (r *Router) dialPeers() { return } else if err != nil { r.logger.Error("failed to handshake with peer", "peer", peerID, "err", err) - if err = r.peerManager.DialFailed(peerID, address); err != nil { + if err = r.peerManager.DialFailed(address); err != nil { r.logger.Error("failed to report dial failure", "peer", peerID, "err", err) } return } - if err = r.peerManager.Dialed(peerID, address); err != nil { + if err = r.peerManager.Dialed(address); err != nil { r.logger.Error("failed to dial peer", "peer", peerID, "err", err) return } @@ -403,7 +405,6 @@ func (r *Router) dialPeers() { r.peerMtx.Lock() r.peerQueues[peerID] = queue r.peerMtx.Unlock() - r.peerManager.Ready(peerID) defer func() { r.peerMtx.Lock() @@ -415,6 +416,11 @@ func (r *Router) dialPeers() { } }() + if err := r.peerManager.Ready(peerID); err != nil { + r.logger.Error("failed to mark peer as ready", "peer", peerID, "err", err) + return + } + r.routePeer(peerID, conn, queue) }() } diff --git a/p2p/router_test.go b/p2p/router_test.go index 8f5cb91b9..01ad949d0 100644 --- a/p2p/router_test.go +++ b/p2p/router_test.go @@ -115,9 +115,9 @@ func TestRouter(t *testing.T) { // Wait for peers to come online, and ping them as they do. for i := 0; i < len(peers); i++ { peerUpdate := <-peerUpdates.Updates() - peerID := peerUpdate.PeerID + peerID := peerUpdate.NodeID require.Equal(t, p2p.PeerUpdate{ - PeerID: peerID, + NodeID: peerID, Status: p2p.PeerStatusUp, }, peerUpdate) @@ -140,13 +140,12 @@ func TestRouter(t *testing.T) { // We then submit an error for a peer, and watch it get disconnected. 
channel.Error() <- p2p.PeerError{ - PeerID: peers[0].NodeID, - Err: errors.New("test error"), - Severity: p2p.PeerErrorSeverityCritical, + NodeID: peers[0].NodeID, + Err: errors.New("test error"), } peerUpdate := <-peerUpdates.Updates() require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, + NodeID: peers[0].NodeID, Status: p2p.PeerStatusDown, }, peerUpdate) @@ -154,7 +153,7 @@ func TestRouter(t *testing.T) { // for that to happen. peerUpdate = <-peerUpdates.Updates() require.Equal(t, p2p.PeerUpdate{ - PeerID: peers[0].NodeID, + NodeID: peers[0].NodeID, Status: p2p.PeerStatusUp, }, peerUpdate) } diff --git a/p2p/shim.go b/p2p/shim.go index 266fc0f1f..a349849c7 100644 --- a/p2p/shim.go +++ b/p2p/shim.go @@ -29,7 +29,7 @@ type ( BaseReactor Name string - PeerUpdates *PeerUpdatesCh + PeerUpdates *PeerUpdates Channels map[ChannelID]*ChannelShim } @@ -162,10 +162,10 @@ func (rs *ReactorShim) handlePeerErrors() { for _, cs := range rs.Channels { go func(cs *ChannelShim) { for pErr := range cs.Channel.errCh { - if pErr.PeerID != "" { - peer := rs.Switch.peers.Get(pErr.PeerID) + if pErr.NodeID != "" { + peer := rs.Switch.peers.Get(pErr.NodeID) if peer == nil { - rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.PeerID) + rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.NodeID) continue } @@ -225,7 +225,7 @@ func (rs *ReactorShim) GetChannels() []*ChannelDescriptor { // handle adding a peer. func (rs *ReactorShim) AddPeer(peer Peer) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusUp}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusUp}: rs.Logger.Debug("sent peer update", "reactor", rs.Name, "peer", peer.ID(), "status", PeerStatusUp) case <-rs.PeerUpdates.Done(): @@ -244,7 +244,7 @@ func (rs *ReactorShim) AddPeer(peer Peer) { // handle removing a peer. func (rs *ReactorShim) RemovePeer(peer Peer, reason interface{}) { select { - case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusDown}: + case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusDown}: rs.Logger.Debug( "sent peer update", "reactor", rs.Name, diff --git a/p2p/shim_test.go b/p2p/shim_test.go index f5b84a490..a12a184cb 100644 --- a/p2p/shim_test.go +++ b/p2p/shim_test.go @@ -123,7 +123,7 @@ func TestReactorShim_AddPeer(t *testing.T) { rts.shim.AddPeer(peerA) wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusUp, peerUpdate.Status) } @@ -143,7 +143,7 @@ func TestReactorShim_RemovePeer(t *testing.T) { rts.shim.RemovePeer(peerA, "test reason") wg.Wait() - require.Equal(t, peerIDA, peerUpdate.PeerID) + require.Equal(t, peerIDA, peerUpdate.NodeID) require.Equal(t, p2p.PeerStatusDown, peerUpdate.Status) } diff --git a/statesync/reactor.go b/statesync/reactor.go index 30cf5c09e..f88e2d1e0 100644 --- a/statesync/reactor.go +++ b/statesync/reactor.go @@ -78,7 +78,7 @@ type Reactor struct { tempDir string snapshotCh *p2p.Channel chunkCh *p2p.Channel - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates closeCh chan struct{} // This will only be set when a state sync is in progress. 
It is used to feed @@ -96,7 +96,7 @@ func NewReactor( conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery, snapshotCh, chunkCh *p2p.Channel, - peerUpdates *p2p.PeerUpdatesCh, + peerUpdates *p2p.PeerUpdates, tempDir string, ) *Reactor { r := &Reactor{ @@ -347,9 +347,8 @@ func (r *Reactor) processSnapshotCh() { if err := r.handleMessage(r.snapshotCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID(), "envelope", envelope, "err", err) r.snapshotCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -374,9 +373,8 @@ func (r *Reactor) processChunkCh() { if err := r.handleMessage(r.chunkCh.ID(), envelope); err != nil { r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID(), "envelope", envelope, "err", err) r.chunkCh.Error() <- p2p.PeerError{ - PeerID: envelope.From, - Err: err, - Severity: p2p.PeerErrorSeverityLow, + NodeID: envelope.From, + Err: err, } } @@ -390,18 +388,18 @@ func (r *Reactor) processChunkCh() { // processPeerUpdate processes a PeerUpdate, returning an error upon failing to // handle the PeerUpdate or if a panic is recovered. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status) + r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.RLock() defer r.mtx.RUnlock() if r.syncer != nil { switch peerUpdate.Status { - case p2p.PeerStatusNew, p2p.PeerStatusUp: - r.syncer.AddPeer(peerUpdate.PeerID) + case p2p.PeerStatusUp: + r.syncer.AddPeer(peerUpdate.NodeID) - case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned: - r.syncer.RemovePeer(peerUpdate.PeerID) + case p2p.PeerStatusDown: + r.syncer.RemovePeer(peerUpdate.NodeID) } } } diff --git a/statesync/reactor_test.go b/statesync/reactor_test.go index 6dfad5edb..0760c1e54 100644 --- a/statesync/reactor_test.go +++ b/statesync/reactor_test.go @@ -33,7 +33,7 @@ type reactorTestSuite struct { chunkOutCh chan p2p.Envelope chunkPeerErrCh chan p2p.PeerError - peerUpdates *p2p.PeerUpdatesCh + peerUpdates *p2p.PeerUpdates } func setup( @@ -127,7 +127,7 @@ func TestReactor_ChunkRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.chunkOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_ChunkRequest(t *testing.T) { @@ -198,7 +198,7 @@ func TestReactor_SnapshotsRequest_InvalidRequest(t *testing.T) { require.Error(t, response.Err) require.Empty(t, rts.snapshotOutCh) require.Contains(t, response.Err.Error(), "received unknown message") - require.Equal(t, p2p.NodeID("aa"), response.PeerID) + require.Equal(t, p2p.NodeID("aa"), response.NodeID) } func TestReactor_SnapshotsRequest(t *testing.T) {