Merge branch 'master' into marko/int64-

2026-07-27 10:32:44 +00:00 · 2021-02-04 14:29:06 +01:00
parent e6d95c4ef8 740008e32b
commit fe6a8f72c6
49 changed files with 5358 additions and 2067 deletions
@@ -1,7 +1,7 @@
-## Description
+Please add a description of the changes that this PR introduces and the files that
+are the most critical to review.

-_Please add a description of the changes that this PR introduces and the files that
-are the most critical to review._ 
+If this PR fixes an open Issue, please include "Closes #XXX" (where "XXX" is the Issue number) 
+so that GitHub will automatically close the Issue when this PR is merged.

-Closes: #XXX

@@ -0,0 +1,76 @@
+# Runs randomly generated E2E testnets nightly
+# on the 0.34.x release branch
+
+# !! If you change something in this file, you probably want
+# to update the e2e-nightly-master workflow as well!
+
+name: e2e-nightly-34x
+on:
+  workflow_dispatch: # allow running workflow manually, in theory
+  schedule:
+    - cron: '0 2 * * *'
+
+jobs:
+  e2e-nightly-test:
+    # Run parallel jobs for the listed testnet groups (must match the
+    # ./build/generator -g flag)
+    strategy:
+      fail-fast: false
+      matrix:
+        group: ['00', '01', '02', '03']
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - uses: actions/setup-go@v2
+        with:
+          go-version: '1.15'
+
+      - uses: actions/checkout@v2
+        with:
+          ref: 'v0.34.x'
+
+      - name: Build
+        working-directory: test/e2e
+        # Run make jobs in parallel, since we can't run steps in parallel.
+        run: make -j2 docker generator runner
+
+      - name: Generate testnets
+        working-directory: test/e2e
+        # When changing -g, also change the matrix groups above
+        run: ./build/generator -g 4 -d networks/nightly
+
+      - name: Run testnets in group ${{ matrix.group }}
+        working-directory: test/e2e
+        run: ./run-multiple.sh networks/nightly/*-group${{ matrix.group }}-*.toml
+
+  e2e-nightly-fail:
+    needs: e2e-nightly-test
+    if: ${{ failure() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Notify Slack on failure
+        uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_CHANNEL: tendermint-internal
+          SLACK_USERNAME: Nightly E2E Tests
+          SLACK_ICON_EMOJI: ':skull:'
+          SLACK_COLOR: danger
+          SLACK_MESSAGE: Nightly E2E tests failed on v0.34.x
+          SLACK_FOOTER: ''
+
+  e2e-nightly-success: # may turn this off once they seem to pass consistently
+    needs: e2e-nightly-test
+    if: ${{ success() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Notify Slack on success
+        uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_CHANNEL: tendermint-internal
+          SLACK_USERNAME: Nightly E2E Tests
+          SLACK_ICON_EMOJI: ':white_check_mark:'
+          SLACK_COLOR: good
+          SLACK_MESSAGE: Nightly E2E tests passed on v0.34.x
+          SLACK_FOOTER: ''
@@ -1,20 +1,22 @@
-# Runs randomly generated E2E testnets nightly.
-name: e2e-nightly
+# Runs randomly generated E2E testnets nightly on master
+
+# !! If you change something in this file, you probably want
+# to update the e2e-nightly-34x workflow as well!
+
+name: e2e-nightly-master
 on:
  workflow_dispatch: # allow running workflow manually
  schedule:
    - cron: '0 2 * * *'

 jobs:
-  e2e-nightly-test:
+  e2e-nightly-test-2:
    # Run parallel jobs for the listed testnet groups (must match the
    # ./build/generator -g flag)
    strategy:
      fail-fast: false
      matrix:
        group: ['00', '01', '02', '03']
-          # todo: expand to multiple versions after 0.35 release
-        branch: ['master', 'v0.34.x']
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
@@ -23,8 +25,6 @@ jobs:
          go-version: '1.15'

      - uses: actions/checkout@v2
-        with:
-          ref: ${{ matrix.branch}}

      - name: Build
        working-directory: test/e2e
@@ -40,8 +40,8 @@ jobs:
        working-directory: test/e2e
        run: ./run-multiple.sh networks/nightly/*-group${{ matrix.group }}-*.toml

-  e2e-nightly-fail:
-    needs: e2e-nightly-test
+  e2e-nightly-fail-2:
+    needs: e2e-nightly-test-2
    if: ${{ failure() }}
    runs-on: ubuntu-latest
    steps:
@@ -53,5 +53,21 @@ jobs:
          SLACK_USERNAME: Nightly E2E Tests
          SLACK_ICON_EMOJI: ':skull:'
          SLACK_COLOR: danger
-          SLACK_MESSAGE: Nightly E2E tests failed
+          SLACK_MESSAGE: Nightly E2E tests failed on master
+          SLACK_FOOTER: ''
+
+  e2e-nightly-success: # may turn this off once they seem to pass consistently
+    needs: e2e-nightly-test-2
+    if: ${{ success() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Notify Slack on success
+        uses: rtCamp/action-slack-notify@ae4223259071871559b6e9d08b24a63d71b3f0c0
+        env:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_CHANNEL: tendermint-internal
+          SLACK_USERNAME: Nightly E2E Tests
+          SLACK_ICON_EMOJI: ':white_check_mark:'
+          SLACK_COLOR: good
+          SLACK_MESSAGE: Nightly E2E tests passed on master
          SLACK_FOOTER: ''
@@ -1,4 +1,4 @@
-project_name: Tendermint
+project_name: tendermint

 env:
  # Require use of Go modules.
@@ -28,8 +28,7 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
  - [proto/p2p] Renamed `DefaultNodeInfo` and `DefaultNodeInfoOther` to `NodeInfo` and `NodeInfoOther` (@erikgrinaker)
  - [proto/p2p] Rename `NodeInfo.default_node_id` to `node_id` (@erikgrinaker)
  - [libs/os] Kill() and {Must,}{Read,Write}File() functions have been removed. (@alessio)
-  - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters)
-  - [state] \#5864 Use an iterator when pruning state (@cmwaters)
+  - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters)  - [state] \#5864 Use an iterator when pruning state (@cmwaters)
  - [types] \#6023 Remove `tm2pb.Header`, `tm2pb.BlockID`, `tm2pb.PartSetHeader` and `tm2pb.NewValidatorUpdate`.
    - Each of the above types has a `ToProto` and `FromProto` method or function which replaced this logic.
  - [rpc/client/http] \#6022 Change `timeout` type to `time.Duration` in `NewWithTimeout`
@@ -66,3 +65,4 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
 - [blockchain/v1] [\#5701](https://github.com/tendermint/tendermint/pull/5701) Handle peers without blocks (@melekes)
 - [blockchain/v1] \#5711 Fix deadlock (@melekes)
 - [light] \#6022 Fix a bug when the number of validators equals 100 (@melekes)
+- [light] \#6026 Fix a bug when height isn't provided for the rpc calls: `/commit` and `/validators` (@cmwaters)
@@ -84,7 +84,7 @@ type Reactor struct {
 	fastSync    bool

 	blockchainCh *p2p.Channel
-	peerUpdates  *p2p.PeerUpdatesCh
+	peerUpdates  *p2p.PeerUpdates
 	closeCh      chan struct{}

 	requestsCh <-chan BlockRequest
@@ -104,7 +104,7 @@ func NewReactor(
 	store *store.BlockStore,
 	consReactor consensusReactor,
 	blockchainCh *p2p.Channel,
-	peerUpdates *p2p.PeerUpdatesCh,
+	peerUpdates *p2p.PeerUpdates,
 	fastSync bool,
 ) (*Reactor, error) {
 	if state.LastBlockHeight != store.Height() {
@@ -194,7 +194,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) {
 			return
 		}

-		r.blockchainCh.Out() <- p2p.Envelope{
+		r.blockchainCh.Out <- p2p.Envelope{
 			To:      peerID,
 			Message: &bcproto.BlockResponse{Block: blockProto},
 		}
@@ -203,7 +203,7 @@ func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID p2p.NodeID) {
 	}

 	r.Logger.Info("peer requesting a block we do not have", "peer", peerID, "height", msg.Height)
-	r.blockchainCh.Out() <- p2p.Envelope{
+	r.blockchainCh.Out <- p2p.Envelope{
 		To:      peerID,
 		Message: &bcproto.NoBlockResponse{Height: msg.Height},
 	}
@@ -229,7 +229,7 @@ func (r *Reactor) handleBlockchainMessage(envelope p2p.Envelope) error {
 		r.pool.AddBlock(envelope.From, block, block.Size())

 	case *bcproto.StatusRequest:
-		r.blockchainCh.Out() <- p2p.Envelope{
+		r.blockchainCh.Out <- p2p.Envelope{
 			To: envelope.From,
 			Message: &bcproto.StatusResponse{
 				Height: r.store.Height(),
@@ -284,13 +284,12 @@ func (r *Reactor) processBlockchainCh() {

 	for {
 		select {
-		case envelope := <-r.blockchainCh.In():
-			if err := r.handleMessage(r.blockchainCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID(), "envelope", envelope, "err", err)
-				r.blockchainCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.blockchainCh.In:
+			if err := r.handleMessage(r.blockchainCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID, "envelope", envelope, "err", err)
+				r.blockchainCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -303,26 +302,26 @@ func (r *Reactor) processBlockchainCh() {

 // processPeerUpdate processes a PeerUpdate.
 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
-	r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status)
+	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

 	// XXX: Pool#RedoRequest can sometimes give us an empty peer.
-	if len(peerUpdate.PeerID) == 0 {
+	if len(peerUpdate.NodeID) == 0 {
 		return
 	}

 	switch peerUpdate.Status {
-	case p2p.PeerStatusNew, p2p.PeerStatusUp:
+	case p2p.PeerStatusUp:
 		// send a status update to the newly added peer
-		r.blockchainCh.Out() <- p2p.Envelope{
-			To: peerUpdate.PeerID,
+		r.blockchainCh.Out <- p2p.Envelope{
+			To: peerUpdate.NodeID,
 			Message: &bcproto.StatusResponse{
 				Base:   r.store.Base(),
 				Height: r.store.Height(),
 			},
 		}

-	case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned:
-		r.pool.RemovePeer(peerUpdate.PeerID)
+	case p2p.PeerStatusDown:
+		r.pool.RemovePeer(peerUpdate.NodeID)
 	}
 }

@@ -377,16 +376,15 @@ func (r *Reactor) requestRoutine() {
 			return

 		case request := <-r.requestsCh:
-			r.blockchainCh.Out() <- p2p.Envelope{
+			r.blockchainCh.Out <- p2p.Envelope{
 				To:      request.PeerID,
 				Message: &bcproto.BlockRequest{Height: request.Height},
 			}

 		case pErr := <-r.errorsCh:
-			r.blockchainCh.Error() <- p2p.PeerError{
-				PeerID:   pErr.peerID,
-				Err:      pErr.err,
-				Severity: p2p.PeerErrorSeverityLow,
+			r.blockchainCh.Error <- p2p.PeerError{
+				NodeID: pErr.peerID,
+				Err:    pErr.err,
 			}

 		case <-statusUpdateTicker.C:
@@ -395,7 +393,7 @@ func (r *Reactor) requestRoutine() {
 			go func() {
 				defer r.poolWG.Done()

-				r.blockchainCh.Out() <- p2p.Envelope{
+				r.blockchainCh.Out <- p2p.Envelope{
 					Broadcast: true,
 					Message:   &bcproto.StatusRequest{},
 				}
@@ -524,18 +522,16 @@ FOR_LOOP:
 				// NOTE: We've already removed the peer's request, but we still need
 				// to clean up the rest.
 				peerID := r.pool.RedoRequest(first.Height)
-				r.blockchainCh.Error() <- p2p.PeerError{
-					PeerID:   peerID,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+				r.blockchainCh.Error <- p2p.PeerError{
+					NodeID: peerID,
+					Err:    err,
 				}

 				peerID2 := r.pool.RedoRequest(second.Height)
 				if peerID2 != peerID {
-					r.blockchainCh.Error() <- p2p.PeerError{
-						PeerID:   peerID2,
-						Err:      err,
-						Severity: p2p.PeerErrorSeverityLow,
+					r.blockchainCh.Error <- p2p.PeerError{
+						NodeID: peerID2,
+						Err:    err,
 					}
 				}

@@ -36,7 +36,7 @@ type reactorTestSuite struct {
 	blockchainPeerErrCh chan p2p.PeerError

 	peerUpdatesCh chan p2p.PeerUpdate
-	peerUpdates   *p2p.PeerUpdatesCh
+	peerUpdates   *p2p.PeerUpdates
 }

 func setup(
@@ -200,8 +200,8 @@ func simulateRouter(primary *reactorTestSuite, suites []*reactorTestSuite, dropC
 				primary.reactor.Logger.Debug("dropped peer error", "err", pErr.Err)
 			} else {
 				primary.peerUpdatesCh <- p2p.PeerUpdate{
-					PeerID: pErr.PeerID,
-					Status: p2p.PeerStatusRemoved,
+					NodeID: pErr.NodeID,
+					Status: p2p.PeerStatusDown,
 				}
 			}
 		}
@@ -229,7 +229,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) {
 			if s.peerID != ss.peerID {
 				s.peerUpdatesCh <- p2p.PeerUpdate{
 					Status: p2p.PeerStatusUp,
-					PeerID: ss.peerID,
+					NodeID: ss.peerID,
 				}
 			}
 		}
@@ -251,7 +251,7 @@ func TestReactor_AbruptDisconnect(t *testing.T) {
 	// deadlocks or race conditions within the context of poolRoutine.
 	testSuites[1].peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusDown,
-		PeerID: testSuites[0].peerID,
+		NodeID: testSuites[0].peerID,
 	}
 }

@@ -276,7 +276,7 @@ func TestReactor_NoBlockResponse(t *testing.T) {
 			if s.peerID != ss.peerID {
 				s.peerUpdatesCh <- p2p.PeerUpdate{
 					Status: p2p.PeerStatusUp,
-					PeerID: ss.peerID,
+					NodeID: ss.peerID,
 				}
 			}
 		}
@@ -341,7 +341,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) {
 			if s.peerID != ss.peerID {
 				s.peerUpdatesCh <- p2p.PeerUpdate{
 					Status: p2p.PeerStatusUp,
-					PeerID: ss.peerID,
+					NodeID: ss.peerID,
 				}
 			}
 		}
@@ -388,7 +388,7 @@ func TestReactor_BadBlockStopsPeer(t *testing.T) {
 	for _, s := range testSuites[:len(testSuites)-1] {
 		newSuite.peerUpdatesCh <- p2p.PeerUpdate{
 			Status: p2p.PeerStatusUp,
-			PeerID: s.peerID,
+			NodeID: s.peerID,
 		}
 	}

@@ -385,7 +385,7 @@ peer-query-maj23-sleep-duration = "2s"
 # Options:
 #   1) "null"
 #   2) "kv" (default) - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend).
-# 		- When "kv" is chosen "tx.height" and "tx.hash" will always be indexed.
+#   - When "kv" is chosen "tx.height" and "tx.hash" will always be indexed.
 indexer = "kv"

 #######################################################
@@ -487,6 +487,7 @@ Here's a brief summary of the timeouts:
 This section will cover settings within the p2p section of the `config.toml`.

 - `external-address` = is the address that will be advertised for other nodes to use. We recommend setting this field with your public IP and p2p port.
+  - > We recommend setting an external address. When used in a private network, Tendermint Core currently doesn't advertise the node's public address. There is active and ongoing work to improve the P2P system, but this is a helpful workaround for now.
 - `seeds` = is a list of comma-separated seed nodes that you will connect to upon startup and ask for peers. A seed node is a node that does not participate in consensus but only helps propagate peers to nodes in the network.
 - `persistent-peers` = is a list of comma separated peers that you will always want to be connected to. If you're already connected to the maximum number of peers, persistent peers will not be added.
 - `max-num-inbound-peers` = is the maximum number of peers you will accept inbound connections from at one time (where they dial your address and initiate the connection).
@@ -494,4 +495,4 @@ This section will cover settings within the p2p section of the `config.toml`.
 - `unconditional-peer-ids` = is similar to `persistent-peers` except that these peers will be connected to even if you are already connected to the maximum number of peers. This can be a validator node ID on your sentry node.
 - `pex` = turns the peer exchange reactor on or off. Validator node will want the `pex` turned off so it would not begin gossiping to unknown peers on the network. PeX can also be turned off for statically configured networks with fixed network connectivity. For full nodes on open, dynamic networks, it should be turned on.
 - `seed-mode` = is used for when node operators want to run their node as a seed node. Seed nodes run a variation of the PeX protocol that disconnects from peers after sending them a list of peers to connect to. To minimize the server's usage, it is recommended to set the mempool's size to 0.
-  `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id.
+- `private-peer-ids` = is a comma-separated list of node ids that will _not_ be exposed to other peers (i.e., you will not tell other peers about the ids in this list). This can be filled with a validator's node id.
@@ -23,8 +23,8 @@ import (

 const (
 	// prefixes are unique across all tm db's
-	prefixCommitted = int64(8)
-	prefixPending   = int64(9)
+	prefixCommitted = int64(9)
+	prefixPending   = int64(10)
 )

 // Pool maintains a pool of valid evidence to be broadcasted and committed
@@ -132,7 +132,7 @@ func (evpool *Pool) Update(state sm.State, ev types.EvidenceList) {
 	evpool.updateState(state)

 	// move committed evidence out from the pending pool and into the committed pool
-	evpool.markEvidenceAsCommitted(ev)
+	evpool.markEvidenceAsCommitted(ev, state.LastBlockHeight)

 	// Prune pending evidence when it has expired. This also updates when the next
 	// evidence will expire.
@@ -386,23 +386,18 @@ func (evpool *Pool) addPendingEvidence(ev types.Evidence) error {
 	return nil
 }

-func (evpool *Pool) removePendingEvidence(evidence types.Evidence) {
-	key := keyPending(evidence)
-	if err := evpool.evidenceStore.Delete(key); err != nil {
-		evpool.logger.Error("failed to delete pending evidence", "err", err)
-	} else {
-		atomic.AddUint32(&evpool.evidenceSize, ^uint32(0))
-		evpool.logger.Debug("deleted pending evidence", "evidence", evidence)
-	}
-}
-
 // markEvidenceAsCommitted processes all the evidence in the block, marking it as
 // committed and removing it from the pending database.
-func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) {
+func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList, height uint64) {
 	blockEvidenceMap := make(map[string]struct{}, len(evidence))
+	batch := evpool.evidenceStore.NewBatch()
+	defer batch.Close()
+
 	for _, ev := range evidence {
 		if evpool.isPending(ev) {
-			evpool.removePendingEvidence(ev)
+			if err := batch.Delete(keyPending(ev)); err != nil {
+				evpool.logger.Error("failed to batch pending evidence", "err", err)
+			}
 			blockEvidenceMap[evMapKey(ev)] = struct{}{}
 		}

@@ -410,7 +405,7 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) {
 		// we only need to record the height that it was saved at.
 		key := keyCommitted(ev)

-		h := gogotypes.UInt64Value{Value: ev.Height()}
+		h := gogotypes.UInt64Value{Value: height}
 		evBytes, err := proto.Marshal(&h)
 		if err != nil {
 			evpool.logger.Error("failed to marshal committed evidence", "key(height/hash)", key, "err", err)
@@ -424,10 +419,22 @@ func (evpool *Pool) markEvidenceAsCommitted(evidence types.EvidenceList) {
 		evpool.logger.Debug("marked evidence as committed", "evidence", ev)
 	}

-	// remove committed evidence from the clist
-	if len(blockEvidenceMap) != 0 {
-		evpool.removeEvidenceFromList(blockEvidenceMap)
+	// check if we need to remove any pending evidence
+	if len(blockEvidenceMap) == 0 {
+		return
 	}
+
+	// remove committed evidence from pending bucket
+	if err := batch.WriteSync(); err != nil {
+		evpool.logger.Error("failed to batch delete pending evidence", "err", err)
+		return
+	}
+
+	// remove committed evidence from the clist
+	evpool.removeEvidenceFromList(blockEvidenceMap)
+
+	// update the evidence size
+	atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1))
 }

 // listEvidence retrieves a list of evidence from oldest to newest within maxBytes.
@@ -481,44 +488,73 @@ func (evpool *Pool) listEvidence(prefixKey int64, maxBytes int64) ([]types.Evide
 }

 func (evpool *Pool) removeExpiredPendingEvidence() (uint64, time.Time) {
-	iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending))
-	if err != nil {
-		evpool.logger.Error("failed to iterate over pending evidence", "err", err)
+	batch := evpool.evidenceStore.NewBatch()
+	defer batch.Close()
+
+	height, time, blockEvidenceMap := evpool.batchExpiredPendingEvidence(batch)
+
+	// if we haven't removed any evidence then return early
+	if len(blockEvidenceMap) == 0 {
+		return height, time
+	}
+
+	evpool.logger.Debug("removing expired evidence",
+		"height", evpool.State().LastBlockHeight,
+		"time", evpool.State().LastBlockTime,
+		"expired evidence", len(blockEvidenceMap),
+	)
+
+	// remove expired evidence from pending bucket
+	if err := batch.WriteSync(); err != nil {
+		evpool.logger.Error("failed to batch delete pending evidence", "err", err)
 		return evpool.State().LastBlockHeight, evpool.State().LastBlockTime
 	}

-	defer iter.Close()
+	// remove evidence from the clist
+	evpool.removeEvidenceFromList(blockEvidenceMap)

+	// update the evidence size
+	atomic.AddUint32(&evpool.evidenceSize, ^uint32(len(blockEvidenceMap)-1))
+
+	return height, time
+}
+
+func (evpool *Pool) batchExpiredPendingEvidence(batch dbm.Batch) (uint64, time.Time, map[string]struct{}) {
 	blockEvidenceMap := make(map[string]struct{})
+	iter, err := dbm.IteratePrefix(evpool.evidenceStore, prefixToBytes(prefixPending))
+	if err != nil {
+		evpool.logger.Error("failed to iterate over pending evidence", "err", err)
+		return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap
+	}
+	defer iter.Close()

 	for ; iter.Valid(); iter.Next() {
 		ev, err := bytesToEv(iter.Value())
 		if err != nil {
-			evpool.logger.Error("failed to transition evidence from protobuf", "err", err)
+			evpool.logger.Error("failed to transition evidence from protobuf", "err", err, "ev", ev)
 			continue
 		}

+		// if true, we have looped through all expired evidence
 		if !evpool.isExpired(ev.Height(), ev.Time()) {
-			if len(blockEvidenceMap) != 0 {
-				evpool.removeEvidenceFromList(blockEvidenceMap)
-			}
-
 			// Return the height and time with which this evidence will have expired
 			// so we know when to prune next.
 			return ev.Height() + uint64(evpool.State().ConsensusParams.Evidence.MaxAgeNumBlocks+1),
-				ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second)
+				ev.Time().Add(evpool.State().ConsensusParams.Evidence.MaxAgeDuration).Add(time.Second),
+				blockEvidenceMap
 		}

-		evpool.removePendingEvidence(ev)
+		// else add to the batch
+		if err := batch.Delete(iter.Key()); err != nil {
+			evpool.logger.Error("failed to batch evidence", "err", err, "ev", ev)
+			continue
+		}
+
+		// and add to the map to remove the evidence from the clist
 		blockEvidenceMap[evMapKey(ev)] = struct{}{}
 	}

-	// we either have no pending evidence or all evidence has expired
-	if len(blockEvidenceMap) != 0 {
-		evpool.removeEvidenceFromList(blockEvidenceMap)
-	}
-
-	return evpool.State().LastBlockHeight, evpool.State().LastBlockTime
+	return evpool.State().LastBlockHeight, evpool.State().LastBlockTime, blockEvidenceMap
 }

 func (evpool *Pool) removeEvidenceFromList(
@@ -172,7 +172,7 @@ func TestEvidencePoolUpdate(t *testing.T) {
 	pool, val := defaultTestPool(t, height)
 	state := pool.State()

-	// create new block (no need to save it to blockStore)
+	// create two pieces of old evidence: one we expect to be pruned when we update and one we expect to be kept
 	prunedEv := types.NewMockDuplicateVoteEvidenceWithValidator(
 		1,
 		defaultEvidenceTime.Add(1*time.Minute),
@@ -180,7 +180,15 @@ func TestEvidencePoolUpdate(t *testing.T) {
 		evidenceChainID,
 	)

+	notPrunedEv := types.NewMockDuplicateVoteEvidenceWithValidator(
+		2,
+		defaultEvidenceTime.Add(2*time.Minute),
+		val,
+		evidenceChainID,
+	)
+
 	require.NoError(t, pool.AddEvidence(prunedEv))
+	require.NoError(t, pool.AddEvidence(notPrunedEv))

 	ev := types.NewMockDuplicateVoteEvidenceWithValidator(
 		height,
@@ -195,14 +203,23 @@ func TestEvidencePoolUpdate(t *testing.T) {
 	state.LastBlockHeight = height + 1
 	state.LastBlockTime = defaultEvidenceTime.Add(22 * time.Minute)

+	evList, _ := pool.PendingEvidence(2 * defaultEvidenceMaxBytes)
+	require.Equal(t, 2, len(evList))
+
+	require.Equal(t, uint32(2), pool.Size())
+
 	require.NoError(t, pool.CheckEvidence(types.EvidenceList{ev}))

+	evList, _ = pool.PendingEvidence(3 * defaultEvidenceMaxBytes)
+	require.Equal(t, 3, len(evList))
+
+	require.Equal(t, uint32(3), pool.Size())
+
 	pool.Update(state, block.Evidence.Evidence)

 	// a) Update marks evidence as committed and prunes expired evidence, so only notPrunedEv should remain pending
-	evList, evSize := pool.PendingEvidence(defaultEvidenceMaxBytes)
-	require.Empty(t, evList)
-	require.Zero(t, evSize)
+	evList, _ = pool.PendingEvidence(defaultEvidenceMaxBytes)
+	require.Equal(t, []types.Evidence{notPrunedEv}, evList)

 	// b) If we try to check this evidence again it should fail because it has already been committed
 	err := pool.CheckEvidence(types.EvidenceList{ev})
@@ -55,7 +55,7 @@ type Reactor struct {
 	evpool      *Pool
 	eventBus    *types.EventBus
 	evidenceCh  *p2p.Channel
-	peerUpdates *p2p.PeerUpdatesCh
+	peerUpdates *p2p.PeerUpdates
 	closeCh     chan struct{}

 	peerWG sync.WaitGroup
@@ -70,7 +70,7 @@ type Reactor struct {
 func NewReactor(
 	logger log.Logger,
 	evidenceCh *p2p.Channel,
-	peerUpdates *p2p.PeerUpdatesCh,
+	peerUpdates *p2p.PeerUpdates,
 	evpool *Pool,
 ) *Reactor {
 	r := &Reactor{
@@ -192,13 +192,12 @@ func (r *Reactor) processEvidenceCh() {

 	for {
 		select {
-		case envelope := <-r.evidenceCh.In():
-			if err := r.handleMessage(r.evidenceCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID(), "envelope", envelope, "err", err)
-				r.evidenceCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.evidenceCh.In:
+			if err := r.handleMessage(r.evidenceCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.evidenceCh.ID, "envelope", envelope, "err", err)
+				r.evidenceCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -221,7 +220,7 @@ func (r *Reactor) processEvidenceCh() {
 //
 // REF: https://github.com/tendermint/tendermint/issues/4727
 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
-	r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status)
+	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

 	r.mtx.Lock()
 	defer r.mtx.Unlock()
@@ -240,21 +239,21 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
 		// a new done channel so we can explicitly close the goroutine if the peer
 		// is later removed, we increment the waitgroup so the reactor can stop
 		// safely, and finally start the goroutine to broadcast evidence to that peer.
-		_, ok := r.peerRoutines[peerUpdate.PeerID]
+		_, ok := r.peerRoutines[peerUpdate.NodeID]
 		if !ok {
 			closer := tmsync.NewCloser()

-			r.peerRoutines[peerUpdate.PeerID] = closer
+			r.peerRoutines[peerUpdate.NodeID] = closer
 			r.peerWG.Add(1)
-			go r.broadcastEvidenceLoop(peerUpdate.PeerID, closer)
+			go r.broadcastEvidenceLoop(peerUpdate.NodeID, closer)
 		}

-	case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned:
+	case p2p.PeerStatusDown:
 		// Check if we've started an evidence broadcasting goroutine for this peer.
 		// If we have, we signal to terminate the goroutine via the channel's closure.
 		// This will internally decrement the peer waitgroup and remove the peer
 		// from the map of peer evidence broadcasting goroutines.
-		closer, ok := r.peerRoutines[peerUpdate.PeerID]
+		closer, ok := r.peerRoutines[peerUpdate.NodeID]
 		if ok {
 			closer.Close()
 		}
@@ -338,7 +337,7 @@ func (r *Reactor) broadcastEvidenceLoop(peerID p2p.NodeID, closer *tmsync.Closer
 		// and thus would not be able to process the evidence correctly. Also, the
 		// peer may receive this piece of evidence multiple times if it is added and
 		// removed frequently from the broadcasting peer.
-		r.evidenceCh.Out() <- p2p.Envelope{
+		r.evidenceCh.Out <- p2p.Envelope{
 			To: peerID,
 			Message: &tmproto.EvidenceList{
 				Evidence: []tmproto.Evidence{*evProto},
@@ -42,7 +42,7 @@ type reactorTestSuite struct {
 	evidencePeerErrCh chan p2p.PeerError

 	peerUpdatesCh chan p2p.PeerUpdate
-	peerUpdates   *p2p.PeerUpdatesCh
+	peerUpdates   *p2p.PeerUpdates
 }

 func setup(t *testing.T, logger log.Logger, pool *evidence.Pool, chBuf uint) *reactorTestSuite {
@@ -224,18 +224,18 @@ func TestReactorMultiDisconnect(t *testing.T) {

 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// Ensure "disconnecting" the secondary peer from the primary more than once
 	// is handled gracefully.
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusDown,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusDown,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}
 }

@@ -276,7 +276,7 @@ func TestReactorBroadcastEvidence(t *testing.T) {
 	for _, suite := range secondaries {
 		primary.peerUpdatesCh <- p2p.PeerUpdate{
 			Status: p2p.PeerStatusUp,
-			PeerID: suite.peerID,
+			NodeID: suite.peerID,
 		}
 	}

@@ -327,7 +327,7 @@ func TestReactorBroadcastEvidence_Lagging(t *testing.T) {
 	for _, suite := range secondaries {
 		primary.peerUpdatesCh <- p2p.PeerUpdate{
 			Status: p2p.PeerStatusUp,
-			PeerID: suite.peerID,
+			NodeID: suite.peerID,
 		}
 	}

@@ -378,7 +378,7 @@ func TestReactorBroadcastEvidence_Pending(t *testing.T) {
 	// add the secondary reactor as a peer to the primary reactor
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// The secondary reactor should have received all the evidence ignoring the
@@ -438,7 +438,7 @@ func TestReactorBroadcastEvidence_Committed(t *testing.T) {
 	// add the secondary reactor as a peer to the primary reactor
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// The secondary reactor should have received all the evidence ignoring the
@@ -487,7 +487,7 @@ func TestReactorBroadcastEvidence_FullyConnected(t *testing.T) {
 			if suiteI.peerID != suiteJ.peerID {
 				suiteI.peerUpdatesCh <- p2p.PeerUpdate{
 					Status: p2p.PeerStatusUp,
-					PeerID: suiteJ.peerID,
+					NodeID: suiteJ.peerID,
 				}
 			}
 		}
@@ -530,7 +530,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) {
 	// add the secondary reactor as a peer to the primary reactor
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// have the secondary reactor receive only half the evidence
@@ -539,7 +539,7 @@ func TestReactorBroadcastEvidence_RemovePeer(t *testing.T) {
 	// disconnect the peer
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusDown,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// Ensure the secondary only received half of the evidence before being
@@ -0,0 +1,30 @@
+package sync
+
+// Waker is used to wake up a sleeper when some event occurs. It debounces
+// multiple wakeup calls occurring between each sleep, and wakeups are
+// non-blocking to avoid having to coordinate goroutines.
+type Waker struct {
+	wakeCh chan struct{} // carries one empty value per (debounced) wakeup
+}
+
+// NewWaker creates a new Waker with no wakeup pending.
+func NewWaker() *Waker {
+	return &Waker{
+		wakeCh: make(chan struct{}, 1), // single-slot buffer: holds at most one pending wakeup (debouncing)
+	}
+}
+
+// Sleep returns a channel on which a receive blocks until Wake() is called.
+func (w *Waker) Sleep() <-chan struct{} {
+	return w.wakeCh // same channel on every call, so a wakeup queued before Sleep() is still delivered
+}
+
+// Wake wakes up the sleeper. It never blocks, and is a no-op if a wakeup is already pending.
+func (w *Waker) Wake() {
+	// A non-blocking send with a size 1 buffer ensures that we never block, and
+	// that we queue up at most a single wakeup call between each Sleep().
+	select {
+	case w.wakeCh <- struct{}{}:
+	default: // buffer already holds a wakeup; drop this one
+	}
+}
@@ -0,0 +1,47 @@
+package sync_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	tmsync "github.com/tendermint/tendermint/libs/sync"
+)
+
+func TestWaker(t *testing.T) {
+
+	// A new waker should block when sleeping (select+default probes this without hanging).
+	waker := tmsync.NewWaker()
+
+	select {
+	case <-waker.Sleep():
+		require.Fail(t, "unexpected wakeup")
+	default:
+	}
+
+	// Wakeups should not block, and should cause the next sleeper to awaken.
+	waker.Wake()
+
+	select {
+	case <-waker.Sleep():
+	default:
+		require.Fail(t, "expected wakeup, but sleeping instead")
+	}
+
+	// Multiple wakeups before a sleep are debounced and should only wake a single sleeper.
+	waker.Wake()
+	waker.Wake()
+	waker.Wake()
+
+	select {
+	case <-waker.Sleep():
+	default:
+		require.Fail(t, "expected wakeup, but sleeping instead")
+	}
+
+	select { // the extra wakeups were dropped, so sleeping again must block
+	case <-waker.Sleep():
+		require.Fail(t, "unexpected wakeup")
+	default:
+	}
+}
@@ -70,7 +70,7 @@ func (p *http) LightBlock(ctx context.Context, height uint64) (*types.LightBlock
 		return nil, err
 	}

-	vs, err := p.validatorSet(ctx, h)
+	vs, err := p.validatorSet(ctx, &sh.Height)
 	if err != nil {
 		return nil, err
 	}
@@ -29,6 +29,7 @@ type KeyPathFunc func(path string, key []byte) (merkle.KeyPath, error)
 //go:generate mockery --case underscore --name LightClient
 type LightClient interface {
 	ChainID() string
+	Update(ctx context.Context, now time.Time) (*types.LightBlock, error)
 	VerifyLightBlockAtHeight(ctx context.Context, height uint64, now time.Time) (*types.LightBlock, error)
 	TrustedLightBlock(height uint64) (*types.LightBlock, error)
 }
@@ -131,7 +132,8 @@ func (c *Client) ABCIQueryWithOptions(ctx context.Context, path string, data tmb

 	// Update the light client if we're behind.
 	// NOTE: AppHash for height H is in header H+1.
-	l, err := c.updateLightClientIfNeededTo(ctx, resp.Height+1)
+	nextHeight := resp.Height + 1
+	l, err := c.updateLightClientIfNeededTo(ctx, &nextHeight)
 	if err != nil {
 		return nil, err
 	}
@@ -214,7 +216,7 @@ func (c *Client) ConsensusParams(ctx context.Context, height *uint64) (*ctypes.R
 	}

 	// Update the light client if we're behind.
-	l, err := c.updateLightClientIfNeededTo(ctx, res.BlockHeight)
+	l, err := c.updateLightClientIfNeededTo(ctx, &res.BlockHeight)
 	if err != nil {
 		return nil, err
 	}
@@ -253,7 +255,7 @@ func (c *Client) BlockchainInfo(ctx context.Context, minHeight, maxHeight uint64
 	// Update the light client if we're behind.
 	if len(res.BlockMetas) > 0 {
 		lastHeight := res.BlockMetas[len(res.BlockMetas)-1].Header.Height
-		if _, err := c.updateLightClientIfNeededTo(ctx, lastHeight); err != nil {
+		if _, err := c.updateLightClientIfNeededTo(ctx, &lastHeight); err != nil {
 			return nil, err
 		}
 	}
@@ -297,7 +299,7 @@ func (c *Client) Block(ctx context.Context, height *uint64) (*ctypes.ResultBlock
 	}

 	// Update the light client if we're behind.
-	l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height)
+	l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height)
 	if err != nil {
 		return nil, err
 	}
@@ -331,7 +333,7 @@ func (c *Client) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBl
 	}

 	// Update the light client if we're behind.
-	l, err := c.updateLightClientIfNeededTo(ctx, res.Block.Height)
+	l, err := c.updateLightClientIfNeededTo(ctx, &res.Block.Height)
 	if err != nil {
 		return nil, err
 	}
@@ -372,7 +374,8 @@ func (c *Client) BlockResults(ctx context.Context, height *uint64) (*ctypes.Resu
 	}

 	// Update the light client if we're behind.
-	trustedBlock, err := c.updateLightClientIfNeededTo(ctx, h+1)
+	nextHeight := h + 1
+	trustedBlock, err := c.updateLightClientIfNeededTo(ctx, &nextHeight)
 	if err != nil {
 		return nil, err
 	}
@@ -410,7 +413,8 @@ func (c *Client) BlockResults(ctx context.Context, height *uint64) (*ctypes.Resu

 func (c *Client) Commit(ctx context.Context, height *uint64) (*ctypes.ResultCommit, error) {
 	// Update the light client if we're behind and retrieve the light block at the requested height
-	l, err := c.updateLightClientIfNeededTo(ctx, *height)
+	// or at the latest height if no height is provided.
+	l, err := c.updateLightClientIfNeededTo(ctx, height)
 	if err != nil {
 		return nil, err
 	}
@@ -435,7 +439,7 @@ func (c *Client) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.Resul
 	}

 	// Update the light client if we're behind.
-	l, err := c.updateLightClientIfNeededTo(ctx, res.Height)
+	l, err := c.updateLightClientIfNeededTo(ctx, &res.Height)
 	if err != nil {
 		return nil, err
 	}
@@ -452,8 +456,9 @@ func (c *Client) TxSearch(ctx context.Context, query string, prove bool, page, p
 // Validators fetches and verifies validators.
 func (c *Client) Validators(ctx context.Context, height *uint64, pagePtr, perPagePtr *int) (*ctypes.ResultValidators,
 	error) {
-	// Update the light client if we're behind and retrieve the light block at the requested height.
-	l, err := c.updateLightClientIfNeededTo(ctx, *height)
+	// Update the light client if we're behind and retrieve the light block at the requested height
+	// or at the latest height if no height is provided.
+	l, err := c.updateLightClientIfNeededTo(ctx, height)
 	if err != nil {
 		return nil, err
 	}
@@ -470,7 +475,7 @@ func (c *Client) Validators(ctx context.Context, height *uint64, pagePtr, perPag
 	v := l.ValidatorSet.Validators[skipCount : skipCount+tmmath.MinInt(perPage, totalCount-skipCount)]

 	return &ctypes.ResultValidators{
-		BlockHeight: *height,
+		BlockHeight: l.Height,
 		Validators:  v,
 		Count:       len(v),
 		Total:       totalCount}, nil
@@ -493,8 +498,16 @@ func (c *Client) UnsubscribeAll(ctx context.Context, subscriber string) error {
 	return c.next.UnsubscribeAll(ctx, subscriber)
 }

-func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height uint64) (*types.LightBlock, error) {
-	l, err := c.lc.VerifyLightBlockAtHeight(ctx, height, time.Now())
+func (c *Client) updateLightClientIfNeededTo(ctx context.Context, height *uint64) (*types.LightBlock, error) {
+	var (
+		l   *types.LightBlock
+		err error
+	)
+	if height == nil {
+		l, err = c.lc.Update(ctx, time.Now())
+	} else {
+		l, err = c.lc.VerifyLightBlockAtHeight(ctx, *height, time.Now())
+	}
 	if err != nil {
 		return nil, fmt.Errorf("failed to update light client to %d: %w", height, err)
 	}
@@ -1,4 +1,4 @@
-// Code generated by mockery v2.5.1. DO NOT EDIT.
+// Code generated by mockery v0.0.0-dev. DO NOT EDIT.

 package mocks

@@ -54,6 +54,29 @@ func (_m *LightClient) TrustedLightBlock(height uint64) (*types.LightBlock, erro
 	return r0, r1
 }

+// Update provides a mock function with given fields: ctx, now
+func (_m *LightClient) Update(ctx context.Context, now time.Time) (*types.LightBlock, error) {
+	ret := _m.Called(ctx, now)
+
+	var r0 *types.LightBlock
+	if rf, ok := ret.Get(0).(func(context.Context, time.Time) *types.LightBlock); ok {
+		r0 = rf(ctx, now)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(*types.LightBlock)
+		}
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(context.Context, time.Time) error); ok {
+		r1 = rf(ctx, now)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
+
 // VerifyLightBlockAtHeight provides a mock function with given fields: ctx, height, now
 func (_m *LightClient) VerifyLightBlockAtHeight(ctx context.Context, height uint64, now time.Time) (*types.LightBlock, error) {
 	ret := _m.Called(ctx, height, now)
@@ -14,8 +14,8 @@ import (
 )

 const (
-	prefixLightBlock = int64(0x0a)
-	prefixSize       = int64(0x0b)
+	prefixLightBlock = int64(11)
+	prefixSize       = int64(12)
 )

 type dbs struct {
@@ -230,27 +230,11 @@ func (s *dbs) Prune(size uint16) error {
 	}
 	numToPrune := sSize - size

-	// 2) Iterate over headers and perform a batch operation.
-	itr, err := s.db.Iterator(
-		s.lbKey(1),
-		append(s.lbKey(1<<63-1), byte(0x00)),
-	)
-	if err != nil {
-		panic(err)
-	}
-	defer itr.Close()
-
 	b := s.db.NewBatch()
 	defer b.Close()

-	for itr.Valid() && numToPrune > 0 {
-		if err = b.Delete(itr.Key()); err != nil {
-			return err
-		}
-		itr.Next()
-		numToPrune--
-	}
-	if err = itr.Error(); err != nil {
+	// 2) use an iterator to batch together all the blocks that need to be deleted
+	if err := s.batchDelete(b, numToPrune); err != nil {
 		return err
 	}

@@ -261,12 +245,7 @@ func (s *dbs) Prune(size uint16) error {
 	}

 	// 4) write batch deletion to disk
-	err = b.WriteSync()
-	if err != nil {
-		return err
-	}
-
-	return nil
+	return b.WriteSync()
 }

 // Size returns the number of header & validator set pairs.
@@ -278,6 +257,27 @@ func (s *dbs) Size() uint16 {
 	return s.size
 }

+func (s *dbs) batchDelete(batch dbm.Batch, numToPrune uint16) error { // queues up to numToPrune deletions on batch
+	itr, err := s.db.Iterator(
+		s.lbKey(1), // start bound; presumably the light block key for height 1 (confirm in lbKey)
+		append(s.lbKey(1<<63-1), byte(0x00)), // lbKey(max int64) + 0x00: the next key in byte order
+	)
+	if err != nil {
+		return err
+	}
+	defer itr.Close()
+
+	for itr.Valid() && numToPrune > 0 { // stop once numToPrune keys are queued or the range ends
+		if err = batch.Delete(itr.Key()); err != nil {
+			return err
+		}
+		itr.Next()
+		numToPrune--
+	}
+
+	return itr.Error() // hand back whatever error the iterator reports, nil included
+}
+
 func (s *dbs) sizeKey() []byte {
 	key, err := orderedcode.Append(nil, prefixSize)
 	if err != nil {
@@ -58,7 +58,7 @@ type Reactor struct {
 	peerMgr PeerManager

 	mempoolCh   *p2p.Channel
-	peerUpdates *p2p.PeerUpdatesCh
+	peerUpdates *p2p.PeerUpdates
 	closeCh     chan struct{}

 	// peerWG is used to coordinate graceful termination of all peer broadcasting
@@ -76,7 +76,7 @@ func NewReactor(
 	peerMgr PeerManager,
 	mempool *CListMempool,
 	mempoolCh *p2p.Channel,
-	peerUpdates *p2p.PeerUpdatesCh,
+	peerUpdates *p2p.PeerUpdates,
 ) *Reactor {

 	r := &Reactor{
@@ -221,13 +221,12 @@ func (r *Reactor) processMempoolCh() {

 	for {
 		select {
-		case envelope := <-r.mempoolCh.In():
-			if err := r.handleMessage(r.mempoolCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID(), "envelope", envelope, "err", err)
-				r.mempoolCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.mempoolCh.In:
+			if err := r.handleMessage(r.mempoolCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.mempoolCh.ID, "envelope", envelope, "err", err)
+				r.mempoolCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -244,7 +243,7 @@ func (r *Reactor) processMempoolCh() {
 // removed peers, we remove the peer from the mempool peer ID set and signal to
 // stop the tx broadcasting goroutine.
 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
-	r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status)
+	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

 	r.mtx.Lock()
 	defer r.mtx.Unlock()
@@ -264,28 +263,28 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
 			// a new done channel so we can explicitly close the goroutine if the peer
 			// is later removed, we increment the waitgroup so the reactor can stop
 			// safely, and finally start the goroutine to broadcast txs to that peer.
-			_, ok := r.peerRoutines[peerUpdate.PeerID]
+			_, ok := r.peerRoutines[peerUpdate.NodeID]
 			if !ok {
 				closer := tmsync.NewCloser()

-				r.peerRoutines[peerUpdate.PeerID] = closer
+				r.peerRoutines[peerUpdate.NodeID] = closer
 				r.peerWG.Add(1)

-				r.ids.ReserveForPeer(peerUpdate.PeerID)
+				r.ids.ReserveForPeer(peerUpdate.NodeID)

 				// start a broadcast routine ensuring all txs are forwarded to the peer
-				go r.broadcastTxRoutine(peerUpdate.PeerID, closer)
+				go r.broadcastTxRoutine(peerUpdate.NodeID, closer)
 			}
 		}

-	case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned:
-		r.ids.Reclaim(peerUpdate.PeerID)
+	case p2p.PeerStatusDown:
+		r.ids.Reclaim(peerUpdate.NodeID)

 		// Check if we've started a tx broadcasting goroutine for this peer.
 		// If we have, we signal to terminate the goroutine via the channel's closure.
 		// This will internally decrement the peer waitgroup and remove the peer
 		// from the map of peer tx broadcasting goroutines.
-		closer, ok := r.peerRoutines[peerUpdate.PeerID]
+		closer, ok := r.peerRoutines[peerUpdate.NodeID]
 		if ok {
 			closer.Close()
 		}
@@ -371,7 +370,7 @@ func (r *Reactor) broadcastTxRoutine(peerID p2p.NodeID, closer *tmsync.Closer) {
 		if _, ok := memTx.senders.Load(peerMempoolID); !ok {
 			// Send the mempool tx to the corresponding peer. Note, the peer may be
 			// behind and thus would not be able to process the mempool tx correctly.
-			r.mempoolCh.Out() <- p2p.Envelope{
+			r.mempoolCh.Out <- p2p.Envelope{
 				To: peerID,
 				Message: &protomem.Txs{
 					Txs: [][]byte{memTx.tx},
@@ -33,7 +33,7 @@ type reactorTestSuite struct {
 	mempoolPeerErrCh chan p2p.PeerError

 	peerUpdatesCh chan p2p.PeerUpdate
-	peerUpdates   *p2p.PeerUpdatesCh
+	peerUpdates   *p2p.PeerUpdates
 }

 func setup(t *testing.T, cfg *cfg.MempoolConfig, logger log.Logger, chBuf uint) *reactorTestSuite {
@@ -189,7 +189,7 @@ func TestReactorBroadcastTxs(t *testing.T) {
 	for _, suite := range secondaries {
 		primary.peerUpdatesCh <- p2p.PeerUpdate{
 			Status: p2p.PeerStatusUp,
-			PeerID: suite.peerID,
+			NodeID: suite.peerID,
 		}
 	}

@@ -295,7 +295,7 @@ func TestReactorNoBroadcastToSender(t *testing.T) {

 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	time.Sleep(100 * time.Millisecond)
@@ -360,7 +360,7 @@ func TestReactor_MaxTxBytes(t *testing.T) {

 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// Wait till all secondary suites (reactor) received all mempool txs from the
@@ -406,7 +406,7 @@ func TestDontExhaustMaxActiveIDs(t *testing.T) {
 	for i := 0; i < maxActiveIDs+1; i++ {
 		reactor.peerUpdatesCh <- p2p.PeerUpdate{
 			Status: p2p.PeerStatusUp,
-			PeerID: peerID,
+			NodeID: peerID,
 		}
 		reactor.mempoolOutCh <- p2p.Envelope{
 			To: peerID,
@@ -466,12 +466,12 @@ func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) {
 	// connect peer
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusUp,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}

 	// disconnect peer
 	primary.peerUpdatesCh <- p2p.PeerUpdate{
 		Status: p2p.PeerStatusDown,
-		PeerID: secondary.peerID,
+		NodeID: secondary.peerID,
 	}
 }
@@ -758,7 +758,7 @@ func NewNode(config *cfg.Config,

 	// TODO: Fetch and provide real options and do proper p2p bootstrapping.
 	// TODO: Use a persistent peer database.
-	peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
 	if err != nil {
 		return nil, err
 	}
@@ -106,10 +106,9 @@ func ParseNodeAddress(urlString string) (NodeAddress, error) {
 		Protocol: Protocol(strings.ToLower(url.Scheme)),
 	}

-	// Opaque URLs are expected to contain only a node ID, also used as path.
+	// Opaque URLs are expected to contain only a node ID.
 	if url.Opaque != "" {
 		address.NodeID = NodeID(url.Opaque)
-		address.Path = url.Opaque
 		return address, address.Validate()
 	}

@@ -158,7 +158,7 @@ func TestParseNodeAddress(t *testing.T) {
 		},
 		{
 			"memory:" + user,
-			p2p.NodeAddress{Protocol: "memory", NodeID: id, Path: user},
+			p2p.NodeAddress{Protocol: "memory", NodeID: id},
 			true,
 		},

@@ -1,130 +0,0 @@
-package p2p
-
-import (
-	"sync"
-
-	"github.com/gogo/protobuf/proto"
-)
-
-// ChannelID is an arbitrary channel ID.
-type ChannelID uint16
-
-// Envelope specifies the message receiver and sender.
-type Envelope struct {
-	From      NodeID        // Message sender, or empty for outbound messages.
-	To        NodeID        // Message receiver, or empty for inbound messages.
-	Broadcast bool          // Send message to all connected peers, ignoring To.
-	Message   proto.Message // Payload.
-
-	// For internal use in the Router.
-	channelID ChannelID
-}
-
-// Strip strips internal information from the envelope. Primarily used for
-// testing, such that returned envelopes can be compared with literals.
-func (e Envelope) Strip() Envelope {
-	e.channelID = 0
-	return e
-}
-
-// Channel is a bidirectional channel for Protobuf message exchange with peers.
-// A Channel is safe for concurrent use by multiple goroutines.
-type Channel struct {
-	closeOnce sync.Once
-
-	// id defines the unique channel ID.
-	id ChannelID
-
-	// messageType specifies the type of messages exchanged via the channel, and
-	// is used e.g. for automatic unmarshaling.
-	messageType proto.Message
-
-	// inCh is a channel for receiving inbound messages. Envelope.From is always
-	// set.
-	inCh chan Envelope
-
-	// outCh is a channel for sending outbound messages. Envelope.To or Broadcast
-	// must be set, otherwise the message is discarded.
-	outCh chan Envelope
-
-	// errCh is a channel for reporting peer errors to the router, typically used
-	// when peers send an invalid or malignant message.
-	errCh chan PeerError
-
-	// doneCh is used to signal that a Channel is closed. A Channel is bi-directional
-	// and should be closed by the reactor, where as the router is responsible
-	// for explicitly closing the internal In channel.
-	doneCh chan struct{}
-}
-
-// NewChannel returns a reference to a new p2p Channel. It is the reactor's
-// responsibility to close the Channel. After a channel is closed, the router may
-// safely and explicitly close the internal In channel.
-func NewChannel(id ChannelID, mType proto.Message, in, out chan Envelope, errCh chan PeerError) *Channel {
-	return &Channel{
-		id:          id,
-		messageType: mType,
-		inCh:        in,
-		outCh:       out,
-		errCh:       errCh,
-		doneCh:      make(chan struct{}),
-	}
-}
-
-// ID returns the Channel's ID.
-func (c *Channel) ID() ChannelID {
-	return c.id
-}
-
-// In returns a read-only inbound go channel. This go channel should be used by
-// reactors to consume Envelopes sent from peers.
-func (c *Channel) In() <-chan Envelope {
-	return c.inCh
-}
-
-// Out returns a write-only outbound go channel. This go channel should be used
-// by reactors to route Envelopes to other peers.
-func (c *Channel) Out() chan<- Envelope {
-	return c.outCh
-}
-
-// Error returns a write-only outbound go channel designated for peer errors only.
-// This go channel should be used by reactors to send peer errors when consuming
-// Envelopes sent from other peers.
-func (c *Channel) Error() chan<- PeerError {
-	return c.errCh
-}
-
-// Close closes the outbound channel and marks the Channel as done. Internally,
-// the outbound outCh and peer error errCh channels are closed. It is the reactor's
-// responsibility to invoke Close. Any send on the Out or Error channel will
-// panic after the Channel is closed.
-//
-// NOTE: After a Channel is closed, the router may safely assume it can no longer
-// send on the internal inCh, however it should NEVER explicitly close it as
-// that could result in panics by sending on a closed channel.
-func (c *Channel) Close() {
-	c.closeOnce.Do(func() {
-		close(c.doneCh)
-		close(c.outCh)
-		close(c.errCh)
-	})
-}
-
-// Done returns the Channel's internal channel that should be used by a router
-// to signal when it is safe to send on the internal inCh go channel.
-func (c *Channel) Done() <-chan struct{} {
-	return c.doneCh
-}
-
-// Wrapper is a Protobuf message that can contain a variety of inner messages.
-// If a Channel's message type implements Wrapper, the channel will
-// automatically (un)wrap passed messages using the container type, such that
-// the channel can transparently support multiple message types.
-type Wrapper interface {
-	// Wrap will take a message and wrap it in this one.
-	Wrap(proto.Message) error
-
-	// Unwrap will unwrap the inner message contained in this message.
-	Unwrap() (proto.Message, error)
-}
@@ -0,0 +1,206 @@
+// Code generated by mockery v2.5.1. DO NOT EDIT.
+
+package mocks
+
+import (
+	context "context"
+
+	conn "github.com/tendermint/tendermint/p2p/conn"
+
+	crypto "github.com/tendermint/tendermint/crypto"
+
+	mock "github.com/stretchr/testify/mock"
+
+	p2p "github.com/tendermint/tendermint/p2p"
+)
+
+// Connection is an autogenerated mock type for the Connection type
+type Connection struct {
+	mock.Mock
+}
+
+// Close provides a mock function with given fields:
+func (_m *Connection) Close() error {
+	ret := _m.Called()
+
+	var r0 error
+	if rf, ok := ret.Get(0).(func() error); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Error(0)
+	}
+
+	return r0
+}
+
+// FlushClose provides a mock function with given fields:
+func (_m *Connection) FlushClose() error {
+	ret := _m.Called()
+
+	var r0 error
+	if rf, ok := ret.Get(0).(func() error); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Error(0)
+	}
+
+	return r0
+}
+
+// Handshake provides a mock function with given fields: _a0, _a1, _a2
+func (_m *Connection) Handshake(_a0 context.Context, _a1 p2p.NodeInfo, _a2 crypto.PrivKey) (p2p.NodeInfo, crypto.PubKey, error) {
+	ret := _m.Called(_a0, _a1, _a2)
+
+	var r0 p2p.NodeInfo
+	if rf, ok := ret.Get(0).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) p2p.NodeInfo); ok {
+		r0 = rf(_a0, _a1, _a2)
+	} else {
+		r0 = ret.Get(0).(p2p.NodeInfo)
+	}
+
+	var r1 crypto.PubKey
+	if rf, ok := ret.Get(1).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) crypto.PubKey); ok {
+		r1 = rf(_a0, _a1, _a2)
+	} else {
+		if ret.Get(1) != nil {
+			r1 = ret.Get(1).(crypto.PubKey)
+		}
+	}
+
+	var r2 error
+	if rf, ok := ret.Get(2).(func(context.Context, p2p.NodeInfo, crypto.PrivKey) error); ok {
+		r2 = rf(_a0, _a1, _a2)
+	} else {
+		r2 = ret.Error(2)
+	}
+
+	return r0, r1, r2
+}
+
+// LocalEndpoint provides a mock function with given fields:
+func (_m *Connection) LocalEndpoint() p2p.Endpoint {
+	ret := _m.Called()
+
+	var r0 p2p.Endpoint
+	if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(p2p.Endpoint)
+	}
+
+	return r0
+}
+
+// ReceiveMessage provides a mock function with given fields:
+func (_m *Connection) ReceiveMessage() (p2p.ChannelID, []byte, error) {
+	ret := _m.Called()
+
+	var r0 p2p.ChannelID
+	if rf, ok := ret.Get(0).(func() p2p.ChannelID); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(p2p.ChannelID)
+	}
+
+	var r1 []byte
+	if rf, ok := ret.Get(1).(func() []byte); ok {
+		r1 = rf()
+	} else {
+		if ret.Get(1) != nil {
+			r1 = ret.Get(1).([]byte)
+		}
+	}
+
+	var r2 error
+	if rf, ok := ret.Get(2).(func() error); ok {
+		r2 = rf()
+	} else {
+		r2 = ret.Error(2)
+	}
+
+	return r0, r1, r2
+}
+
+// RemoteEndpoint provides a mock function with given fields:
+func (_m *Connection) RemoteEndpoint() p2p.Endpoint {
+	ret := _m.Called()
+
+	var r0 p2p.Endpoint
+	if rf, ok := ret.Get(0).(func() p2p.Endpoint); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(p2p.Endpoint)
+	}
+
+	return r0
+}
+
+// SendMessage provides a mock function with given fields: _a0, _a1
+func (_m *Connection) SendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) {
+	ret := _m.Called(_a0, _a1)
+
+	var r0 bool
+	if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok {
+		r0 = rf(_a0, _a1)
+	} else {
+		r0 = ret.Get(0).(bool)
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok {
+		r1 = rf(_a0, _a1)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
+
+// Status provides a mock function with given fields:
+func (_m *Connection) Status() conn.ConnectionStatus {
+	ret := _m.Called()
+
+	var r0 conn.ConnectionStatus
+	if rf, ok := ret.Get(0).(func() conn.ConnectionStatus); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(conn.ConnectionStatus)
+	}
+
+	return r0
+}
+
+// String provides a mock function with given fields:
+func (_m *Connection) String() string {
+	ret := _m.Called()
+
+	var r0 string
+	if rf, ok := ret.Get(0).(func() string); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(string)
+	}
+
+	return r0
+}
+
+// TrySendMessage provides a mock function with given fields: _a0, _a1
+func (_m *Connection) TrySendMessage(_a0 p2p.ChannelID, _a1 []byte) (bool, error) {
+	ret := _m.Called(_a0, _a1)
+
+	var r0 bool
+	if rf, ok := ret.Get(0).(func(p2p.ChannelID, []byte) bool); ok {
+		r0 = rf(_a0, _a1)
+	} else {
+		r0 = ret.Get(0).(bool)
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(p2p.ChannelID, []byte) error); ok {
+		r1 = rf(_a0, _a1)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
@@ -0,0 +1,121 @@
+// Code generated by mockery v2.5.1. DO NOT EDIT.
+
+package mocks
+
+import (
+	context "context"
+
+	mock "github.com/stretchr/testify/mock"
+	p2p "github.com/tendermint/tendermint/p2p"
+)
+
+// Transport is an autogenerated mock type for the Transport type
+type Transport struct {
+	mock.Mock
+}
+
+// Accept provides a mock function with given fields:
+func (_m *Transport) Accept() (p2p.Connection, error) {
+	ret := _m.Called()
+
+	var r0 p2p.Connection
+	if rf, ok := ret.Get(0).(func() p2p.Connection); ok {
+		r0 = rf()
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(p2p.Connection)
+		}
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func() error); ok {
+		r1 = rf()
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
+
+// Close provides a mock function with given fields:
+func (_m *Transport) Close() error {
+	ret := _m.Called()
+
+	var r0 error
+	if rf, ok := ret.Get(0).(func() error); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Error(0)
+	}
+
+	return r0
+}
+
+// Dial provides a mock function with given fields: _a0, _a1
+func (_m *Transport) Dial(_a0 context.Context, _a1 p2p.Endpoint) (p2p.Connection, error) {
+	ret := _m.Called(_a0, _a1)
+
+	var r0 p2p.Connection
+	if rf, ok := ret.Get(0).(func(context.Context, p2p.Endpoint) p2p.Connection); ok {
+		r0 = rf(_a0, _a1)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(p2p.Connection)
+		}
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(context.Context, p2p.Endpoint) error); ok {
+		r1 = rf(_a0, _a1)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
+
+// Endpoints provides a mock function with given fields:
+func (_m *Transport) Endpoints() []p2p.Endpoint {
+	ret := _m.Called()
+
+	var r0 []p2p.Endpoint
+	if rf, ok := ret.Get(0).(func() []p2p.Endpoint); ok {
+		r0 = rf()
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).([]p2p.Endpoint)
+		}
+	}
+
+	return r0
+}
+
+// Protocols provides a mock function with given fields:
+func (_m *Transport) Protocols() []p2p.Protocol {
+	ret := _m.Called()
+
+	var r0 []p2p.Protocol
+	if rf, ok := ret.Get(0).(func() []p2p.Protocol); ok {
+		r0 = rf()
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).([]p2p.Protocol)
+		}
+	}
+
+	return r0
+}
+
+// String provides a mock function with given fields:
+func (_m *Transport) String() string {
+	ret := _m.Called()
+
+	var r0 string
+	if rf, ok := ret.Get(0).(func() string); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(string)
+	}
+
+	return r0
+}
@@ -0,0 +1,34 @@
+package p2p_test
+
+import (
+	"context"
+
+	"github.com/tendermint/tendermint/crypto"
+	"github.com/tendermint/tendermint/crypto/ed25519"
+	"github.com/tendermint/tendermint/p2p"
+)
+
+// Common setup for P2P tests: a shared context and channel ID, plus fixed identities for the local node and one peer.
+
+var (
+	ctx  = context.Background()
+	chID = p2p.ChannelID(1)
+
+	selfKey  crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0xf9, 0x1b, 0x08, 0xaa, 0x38, 0xee, 0x34, 0xdd})
+	selfID                  = p2p.NodeIDFromPubKey(selfKey.PubKey())
+	selfInfo                = p2p.NodeInfo{
+		NodeID:     selfID,
+		ListenAddr: "0.0.0.0:0",
+		Network:    "test",
+		Moniker:    string(selfID),
+	}
+
+	peerKey  crypto.PrivKey = ed25519.GenPrivKeyFromSecret([]byte{0x84, 0xd7, 0x01, 0xbf, 0x83, 0x20, 0x1c, 0xfe})
+	peerID                  = p2p.NodeIDFromPubKey(peerKey.PubKey())
+	peerInfo                = p2p.NodeInfo{
+		NodeID:     peerID,
+		ListenAddr: "0.0.0.0:0",
+		Network:    "test",
+		Moniker:    string(peerID),
+	}
+)
@@ -0,0 +1,240 @@
+package p2ptest
+
+import (
+	"math/rand"
+	"testing"
+	"time"
+
+	"github.com/gogo/protobuf/proto"
+	"github.com/stretchr/testify/require"
+	dbm "github.com/tendermint/tm-db"
+
+	"github.com/tendermint/tendermint/crypto"
+	"github.com/tendermint/tendermint/crypto/ed25519"
+	"github.com/tendermint/tendermint/libs/log"
+	"github.com/tendermint/tendermint/p2p"
+)
+
+// Network sets up an in-memory network that can be used for high-level P2P
+// testing. It creates an arbitrary number of nodes that are connected to each
+// other, and can open channels across all nodes with custom reactors.
+type Network struct {
+	Nodes map[p2p.NodeID]*Node // all nodes currently in the network, keyed by node ID
+
+	logger        log.Logger         // passed to the memory network and to each node's router
+	memoryNetwork *p2p.MemoryNetwork // creates the transport for each node
+}
+
+// MakeNetwork creates a test network with the given number of nodes and
+// connects every pair of them, failing the test if a connection isn't confirmed.
+func MakeNetwork(t *testing.T, nodes int) *Network {
+	logger := log.TestingLogger()
+	network := &Network{
+		Nodes:         map[p2p.NodeID]*Node{},
+		logger:        logger,
+		memoryNetwork: p2p.NewMemoryNetwork(logger),
+	}
+	for i := 0; i < nodes; i++ {
+		node := MakeNode(t, network)
+		network.Nodes[node.NodeID] = node
+	}
+
+	// Set up a list of node addresses to dial, and a peer update subscription
+	// for each node.
+	dialQueue := []p2p.NodeAddress{}
+	subs := map[p2p.NodeID]*p2p.PeerUpdates{}
+	for _, node := range network.Nodes {
+		dialQueue = append(dialQueue, node.NodeAddress)
+		subs[node.NodeID] = node.PeerManager.Subscribe()
+		defer subs[node.NodeID].Close() // runs when MakeNetwork returns, i.e. after all dialing below
+	}
+
+	// For each node, dial the nodes that it still doesn't have a connection to
+	// (either inbound or outbound), and wait for both sides to confirm the
+	// connection via the subscriptions.
+	for i, sourceAddress := range dialQueue {
+		sourceNode := network.Nodes[sourceAddress.NodeID]
+		sourceSub := subs[sourceAddress.NodeID]
+		for _, targetAddress := range dialQueue[i+1:] { // nodes <i already connected
+			targetNode := network.Nodes[targetAddress.NodeID]
+			targetSub := subs[targetAddress.NodeID]
+			require.NoError(t, sourceNode.PeerManager.Add(targetAddress))
+
+			select {
+			case peerUpdate := <-sourceSub.Updates():
+				require.Equal(t, p2p.PeerUpdate{
+					NodeID: targetNode.NodeID,
+					Status: p2p.PeerStatusUp,
+				}, peerUpdate)
+			case <-time.After(time.Second):
+				require.Fail(t, "timed out waiting for peer", "%v dialing %v",
+					sourceNode.NodeID, targetNode.NodeID)
+			}
+
+			select {
+			case peerUpdate := <-targetSub.Updates():
+				require.Equal(t, p2p.PeerUpdate{
+					NodeID: sourceNode.NodeID,
+					Status: p2p.PeerStatusUp,
+				}, peerUpdate)
+			case <-time.After(time.Second):
+				require.Fail(t, "timed out waiting for peer", "%v accepting %v",
+					targetNode.NodeID, sourceNode.NodeID)
+			}
+
+			// Add the address to the target as well, so it's able to dial the
+			// source back if that's even necessary.
+			require.NoError(t, targetNode.PeerManager.Add(sourceAddress))
+		}
+	}
+
+	return network
+}
+
+// NodeIDs returns the IDs of all nodes in the network, in no particular order.
+func (n *Network) NodeIDs() []p2p.NodeID {
+	ids := make([]p2p.NodeID, 0, len(n.Nodes)) // final length is known up front
+	for id := range n.Nodes {
+		ids = append(ids, id)
+	}
+	return ids
+}
+
+// MakeChannels opens the given channel on every node in the network, returning
+// the channels by node ID. Error checks and cleanup are done by Node.MakeChannel.
+func (n *Network) MakeChannels(
+	t *testing.T,
+	chID p2p.ChannelID,
+	messageType proto.Message,
+) map[p2p.NodeID]*p2p.Channel {
+	byNode := make(map[p2p.NodeID]*p2p.Channel)
+	for _, member := range n.Nodes {
+		byNode[member.NodeID] = member.MakeChannel(t, chID, messageType)
+	}
+	return byNode
+}
+
+// RandomNode returns a random node. It panics if the network has no nodes, since rand.Intn(0) panics.
+func (n *Network) RandomNode() *Node {
+	nodes := make([]*Node, 0, len(n.Nodes))
+	for _, node := range n.Nodes {
+		nodes = append(nodes, node)
+	}
+	return nodes[rand.Intn(len(nodes))] // nolint:gosec
+}
+
+// Peers returns a node's peers (i.e. every node in the network except the one with the given ID).
+func (n *Network) Peers(id p2p.NodeID) []*Node {
+	peers := make([]*Node, 0, len(n.Nodes)) // not len-1: a negative capacity panics on an empty network
+	for _, peer := range n.Nodes {
+		if peer.NodeID != id {
+			peers = append(peers, peer)
+		}
+	}
+	return peers
+}
+
+// Remove removes a node from the network, stopping it and waiting for all other
+// nodes to pick up the disconnection.
+func (n *Network) Remove(t *testing.T, id p2p.NodeID) {
+	require.Contains(t, n.Nodes, id)
+	node := n.Nodes[id]
+	delete(n.Nodes, id) // from here on, n.Nodes only holds the remaining peers
+
+	subs := []*p2p.PeerUpdates{}
+	for _, peer := range n.Nodes {
+		sub := peer.PeerManager.Subscribe() // subscribe before the node is shut down below
+		defer sub.Close()
+		subs = append(subs, sub)
+	}
+
+	require.NoError(t, node.Transport.Close())
+	if node.Router.IsRunning() {
+		require.NoError(t, node.Router.Stop())
+	}
+	node.PeerManager.Close()
+
+	for _, sub := range subs {
+		RequireUpdate(t, sub, p2p.PeerUpdate{ // each remaining peer must report the node as down
+			NodeID: node.NodeID,
+			Status: p2p.PeerStatusDown,
+		})
+	}
+}
+
+// Node is a node in a Network, with a Router and a PeerManager. Field comments describe the values MakeNode sets.
+type Node struct {
+	NodeID      p2p.NodeID           // derived from PrivKey's public key
+	NodeInfo    p2p.NodeInfo         // node info given to the Router
+	NodeAddress p2p.NodeAddress      // address of the Transport's first endpoint
+	PrivKey     crypto.PrivKey       // the node's private key
+	Router      *p2p.Router          // started by MakeNode
+	PeerManager *p2p.PeerManager     // peer manager given to the Router
+	Transport   *p2p.MemoryTransport // created from the Network's memory network
+}
+
+// MakeNode creates a new Node with a freshly generated key and a started Router. It does not add the node to network.Nodes.
+func MakeNode(t *testing.T, network *Network) *Node {
+	privKey := ed25519.GenPrivKey()
+	nodeID := p2p.NodeIDFromPubKey(privKey.PubKey())
+	nodeInfo := p2p.NodeInfo{
+		NodeID:     nodeID,
+		ListenAddr: "0.0.0.0:0", // FIXME: We have to fake this for now.
+		Moniker:    string(nodeID),
+	}
+
+	transport := network.memoryNetwork.CreateTransport(nodeID)
+	require.Len(t, transport.Endpoints(), 1, "transport not listening on 1 endpoint")
+
+	peerManager, err := p2p.NewPeerManager(nodeID, dbm.NewMemDB(), p2p.PeerManagerOptions{
+		MinRetryTime: 10 * time.Millisecond,
+	})
+	require.NoError(t, err)
+
+	router, err := p2p.NewRouter(network.logger, nodeInfo, privKey, peerManager,
+		[]p2p.Transport{transport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+	require.NoError(t, router.Start())
+
+	t.Cleanup(func() { // shut down router, peer manager and transport when the test ends
+		if router.IsRunning() {
+			require.NoError(t, router.Stop())
+		}
+		peerManager.Close()
+		require.NoError(t, transport.Close())
+	})
+
+	return &Node{
+		NodeID:      nodeID,
+		NodeInfo:    nodeInfo,
+		NodeAddress: transport.Endpoints()[0].NodeAddress(nodeID), // exactly one endpoint, checked above
+		PrivKey:     privKey,
+		Router:      router,
+		PeerManager: peerManager,
+		Transport:   transport,
+	}
+}
+
+// MakeChannel opens a channel on the node's router, failing the test on error.
+// A test cleanup asserts that the channel is empty (i.e. that every expected
+// message has been consumed) and then closes it.
+func (n *Node) MakeChannel(t *testing.T, chID p2p.ChannelID, messageType proto.Message) *p2p.Channel {
+	ch, err := n.Router.OpenChannel(chID, messageType)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		RequireEmpty(t, ch)
+		ch.Close()
+	})
+	return ch
+}
+
+// MakePeerUpdates subscribes to the node's peer updates. A test cleanup asserts
+// that no updates are left unconsumed and then closes the subscription.
+func (n *Node) MakePeerUpdates(t *testing.T) *p2p.PeerUpdates {
+	updates := n.PeerManager.Subscribe()
+	t.Cleanup(func() {
+		RequireNoUpdates(t, updates)
+		updates.Close()
+	})
+	return updates
+}
@@ -0,0 +1,155 @@
+package p2ptest
+
+import (
+	"testing"
+	"time"
+
+	"github.com/gogo/protobuf/proto"
+	"github.com/stretchr/testify/require"
+
+	"github.com/tendermint/tendermint/p2p"
+)
+
+// RequireEmpty requires that none of the given channels yields a message; each channel is given 10ms.
+func RequireEmpty(t *testing.T, channels ...*p2p.Channel) {
+	for _, ch := range channels {
+		select {
+		case <-time.After(10 * time.Millisecond):
+		case envelope := <-ch.In:
+			require.Fail(t, "unexpected message", "channel %v should be empty, got %v", ch.ID, envelope)
+		}
+	}
+}
+
+// RequireReceive requires that the given envelope is the next one received on the channel, within one second.
+func RequireReceive(t *testing.T, channel *p2p.Channel, expect p2p.Envelope) {
+	timer := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer timer.Stop()
+
+	select {
+	case e, ok := <-channel.In:
+		require.True(t, ok, "channel %v is closed", channel.ID)
+		require.Equal(t, expect, e)
+
+	case <-channel.Done():
+		require.Fail(t, "channel is closed", "channel %v", channel.ID) // Fail's first string is literal, not a format
+
+	case <-timer.C:
+		require.Fail(t, "timed out waiting for message", "%v on channel %v", expect, channel.ID)
+	}
+}
+
+// RequireReceiveUnordered requires that the given envelopes are all received on
+// the channel within one second, ignoring order.
+func RequireReceiveUnordered(t *testing.T, channel *p2p.Channel, expect []p2p.Envelope) {
+	timer := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer timer.Stop()
+
+	actual := []p2p.Envelope{}
+	for {
+		select {
+		case e, ok := <-channel.In:
+			require.True(t, ok, "channel %v is closed", channel.ID)
+			actual = append(actual, e)
+			if len(actual) == len(expect) { // stop as soon as enough envelopes have arrived
+				require.ElementsMatch(t, expect, actual)
+				return
+			}
+
+		case <-channel.Done():
+			require.Fail(t, "channel is closed", "channel %v", channel.ID) // Fail's first string is literal, not a format
+
+		case <-timer.C:
+			require.ElementsMatch(t, expect, actual) // reports whatever was received before the timeout
+			return
+		}
+	}
+
+}
+
+// RequireSend requires that the given envelope is accepted on the channel's Out side within one second.
+func RequireSend(t *testing.T, channel *p2p.Channel, envelope p2p.Envelope) {
+	deadline := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer deadline.Stop()
+	select {
+	case <-deadline.C:
+		require.Fail(t, "timed out sending message", "%v on channel %v", envelope, channel.ID)
+	case channel.Out <- envelope:
+	}
+}
+
+// RequireSendReceive requires that a given Protobuf message is sent to the
+// given peer, and then that the given response is received back from that peer.
+func RequireSendReceive(
+	t *testing.T,
+	channel *p2p.Channel,
+	peerID p2p.NodeID,
+	send proto.Message,
+	receive proto.Message,
+) {
+	RequireSend(t, channel, p2p.Envelope{To: peerID, Message: send})
+	RequireReceive(t, channel, p2p.Envelope{From: peerID, Message: receive}) // expect the response, not an echo of send
+}
+
+// RequireNoUpdates requires that no update is immediately pending on the PeerUpdates subscription (non-blocking).
+func RequireNoUpdates(t *testing.T, peerUpdates *p2p.PeerUpdates) {
+	select {
+	case pending := <-peerUpdates.Updates():
+		require.Fail(t, "unexpected peer updates", "got %v", pending)
+	default:
+	}
+}
+
+// RequireError requires that the given peer error is accepted on the channel's Error side within one second.
+func RequireError(t *testing.T, channel *p2p.Channel, peerError p2p.PeerError) {
+	deadline := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer deadline.Stop()
+	select {
+	case <-deadline.C:
+		require.Fail(t, "timed out reporting error", "%v on %v", peerError, channel.ID)
+	case channel.Error <- peerError:
+	}
+}
+
+// RequireUpdate requires that the next update on a PeerUpdates subscription, within one second, equals expect.
+func RequireUpdate(t *testing.T, peerUpdates *p2p.PeerUpdates, expect p2p.PeerUpdate) {
+	deadline := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer deadline.Stop()
+
+	select {
+	case <-deadline.C:
+		require.Fail(t, "timed out waiting for peer update", "expected %v", expect)
+
+	case <-peerUpdates.Done():
+		require.Fail(t, "peer updates subscription is closed")
+
+	case got := <-peerUpdates.Updates():
+		require.Equal(t, expect, got, "peer update did not match")
+	}
+}
+
+// RequireUpdates requires that the next updates on a PeerUpdates subscription
+// are the given updates, in the given order, all arriving within one second.
+func RequireUpdates(t *testing.T, peerUpdates *p2p.PeerUpdates, expect []p2p.PeerUpdate) {
+	deadline := time.NewTimer(time.Second) // not time.After due to goroutine leaks
+	defer deadline.Stop()
+
+	received := []p2p.PeerUpdate{}
+	for {
+		select {
+		case <-deadline.C:
+			require.Equal(t, expect, received, "did not receive expected peer updates")
+			return
+
+		case <-peerUpdates.Done():
+			require.Fail(t, "peer updates subscription is closed")
+
+		case next := <-peerUpdates.Updates():
+			received = append(received, next)
+			if len(received) == len(expect) {
+				require.Equal(t, expect, received)
+				return
+			}
+		}
+	}
+}
@@ -0,0 +1,8 @@
+package p2ptest
+
+import (
+	gogotypes "github.com/gogo/protobuf/types"
+)
+
+// Message is an alias for gogotypes.StringValue, a simple message containing a string-typed Value field.
+type Message = gogotypes.StringValue
@@ -30,7 +30,7 @@ type ReactorV2 struct {

 	peerManager *p2p.PeerManager
 	pexCh       *p2p.Channel
-	peerUpdates *p2p.PeerUpdatesCh
+	peerUpdates *p2p.PeerUpdates
 	closeCh     chan struct{}
 }

@@ -39,7 +39,7 @@ func NewReactorV2(
 	logger log.Logger,
 	peerManager *p2p.PeerManager,
 	pexCh *p2p.Channel,
-	peerUpdates *p2p.PeerUpdatesCh,
+	peerUpdates *p2p.PeerUpdates,
 ) *ReactorV2 {
 	r := &ReactorV2{
 		peerManager: peerManager,
@@ -85,7 +85,7 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error {
 	switch msg := envelope.Message.(type) {
 	case *protop2p.PexRequest:
 		pexAddresses := r.resolve(r.peerManager.Advertise(envelope.From, maxAddresses), maxAddresses)
-		r.pexCh.Out() <- p2p.Envelope{
+		r.pexCh.Out <- p2p.Envelope{
 			To:      envelope.From,
 			Message: &protop2p.PexResponse{Addresses: pexAddresses},
 		}
@@ -177,13 +177,12 @@ func (r *ReactorV2) processPexCh() {

 	for {
 		select {
-		case envelope := <-r.pexCh.In():
-			if err := r.handleMessage(r.pexCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID(), "envelope", envelope, "err", err)
-				r.pexCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.pexCh.In:
+			if err := r.handleMessage(r.pexCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.pexCh.ID, "envelope", envelope, "err", err)
+				r.pexCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -197,11 +196,11 @@ func (r *ReactorV2) processPexCh() {
 // processPeerUpdate processes a PeerUpdate. For added peers, PeerStatusUp, we
 // send a request for addresses.
 func (r *ReactorV2) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
-	r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status)
+	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

 	if peerUpdate.Status == p2p.PeerStatusUp {
-		r.pexCh.Out() <- p2p.Envelope{
-			To:      peerUpdate.PeerID,
+		r.pexCh.Out <- p2p.Envelope{
+			To:      peerUpdate.NodeID,
 			Message: &protop2p.PexRequest{},
 		}
 	}
@@ -9,11 +9,109 @@ import (
 	"time"

 	"github.com/gogo/protobuf/proto"
+
 	"github.com/tendermint/tendermint/crypto"
 	"github.com/tendermint/tendermint/libs/log"
 	"github.com/tendermint/tendermint/libs/service"
 )

+// ChannelID is an arbitrary channel ID.
+type ChannelID uint16
+
+// Envelope contains a message with sender/receiver routing info.
+type Envelope struct {
+	From      NodeID        // sender (empty if outbound)
+	To        NodeID        // receiver (empty if inbound)
+	Broadcast bool          // send to all connected peers (ignores To)
+	Message   proto.Message // message payload
+
+	// channelID is for internal Router use, set on outbound messages to inform
+	// the sendPeer() goroutine which transport channel to use.
+	//
+	// FIXME: If we migrate the Transport API to a byte-oriented multi-stream
+	// API, this will no longer be necessary since each channel will be mapped
+	// onto a stream during channel/peer setup. See:
+	// https://github.com/tendermint/spec/pull/227
+	channelID ChannelID
+}
+
+// PeerError is a peer error reported via Channel.Error.
+//
+// FIXME: This currently just disconnects the peer, which is too simplistic.
+// For example, some errors should be logged, some should cause disconnects,
+// and some should ban the peer.
+//
+// FIXME: This should probably be replaced by a more general PeerBehavior
+// concept that can mark good and bad behavior and contributes to peer scoring.
+// It should possibly also allow reactors to request explicit actions, e.g.
+// disconnection or banning, in addition to doing this based on aggregates.
+type PeerError struct {
+	NodeID NodeID
+	Err    error
+}
+
+// Channel is a bidirectional channel to exchange Protobuf messages with peers,
+// wrapped in Envelope to specify routing info (i.e. sender/receiver).
+type Channel struct {
+	ID    ChannelID
+	In    <-chan Envelope  // inbound messages (peers to reactors)
+	Out   chan<- Envelope  // outbound messages (reactors to peers)
+	Error chan<- PeerError // peer error reporting
+
+	messageType proto.Message // the channel's message type, used for unmarshalling
+	closeCh     chan struct{} // closed by Close(), returned by Done()
+	closeOnce   sync.Once     // makes repeated Close() calls a no-op
+}
+
+// NewChannel builds a Channel from an ID, a message type and the given Go channels. It is
+// primarily for internal and test use; reactors should use Router.OpenChannel().
+func NewChannel(
+	id ChannelID,
+	messageType proto.Message,
+	inCh <-chan Envelope,
+	outCh chan<- Envelope,
+	errCh chan<- PeerError,
+) *Channel {
+	return &Channel{
+		ID:          id,
+		In:          inCh,
+		Out:         outCh,
+		Error:       errCh,
+		messageType: messageType,
+		closeCh:     make(chan struct{}),
+	}
+}
+
+// Close closes the channel, and is safe to call repeatedly. Future sends on Out
+// and Error will panic. In remains open to avoid having to synchronize Router
+// senders, which should use Done() to detect channel closure instead.
+func (c *Channel) Close() {
+	c.closeOnce.Do(func() {
+		close(c.closeCh) // signals closure via Done()
+		close(c.Out)
+		close(c.Error)
+	})
+}
+
+// Done returns a channel that's closed when Channel.Close() is called.
+func (c *Channel) Done() <-chan struct{} {
+	return c.closeCh
+}
+
+// Wrapper is a Protobuf message that can contain a variety of inner messages
+// (e.g. via oneof fields). If a Channel's message type implements Wrapper, the
+// Router will automatically wrap outbound messages and unwrap inbound messages,
+// such that reactors do not have to do this themselves.
+type Wrapper interface {
+	proto.Message
+
+	// Wrap will take a message and wrap it in this one if possible.
+	Wrap(proto.Message) error
+
+	// Unwrap will unwrap the inner message contained in this message.
+	Unwrap() (proto.Message, error)
+}
+
 // RouterOptions specifies options for a Router.
 type RouterOptions struct {
 	// ResolveTimeout is the timeout for resolving NodeAddress URLs.
@@ -28,83 +126,62 @@ type RouterOptions struct {
 	HandshakeTimeout time.Duration
 }

-// Validate validates the options.
+// Validate validates router options.
 func (o *RouterOptions) Validate() error {
 	return nil
 }

 // Router manages peer connections and routes messages between peers and reactor
-// channels. This is an early prototype.
+// channels. It takes a PeerManager for peer lifecycle management (e.g. which
+// peers to dial and when) and a set of Transports for connecting and
+// communicating with peers.
 //
-// Channels are registered via OpenChannel(). When called, we register an input
-// message queue for the channel in channelQueues and spawn off a goroutine for
-// Router.routeChannel(). This goroutine reads off outbound messages and puts
-// them in the appropriate peer message queue, and processes peer errors which
-// will close (and thus disconnect) the appriate peer queue. It runs until
-// either the channel is closed by the caller or the router is stopped, at which
-// point the input message queue is closed and removed.
+// On startup, three main goroutines are spawned to maintain peer connections:
 //
-// On startup, the router spawns off three primary goroutines that maintain
-// connections to peers and run for the lifetime of the router:
+//   dialPeers(): in a loop, calls PeerManager.DialNext() to get the next peer
+//   address to dial and spawns a goroutine that dials the peer, handshakes
+//   with it, and begins to route messages if successful.
 //
-//   Router.dialPeers(): in a loop, asks the PeerManager for the next peer
-//   address to contact, resolves it into endpoints, and attempts to dial
-//   each one.
+//   acceptPeers(): in a loop, waits for an inbound connection via
+//   Transport.Accept() and spawns a goroutine that handshakes with it and
+//   begins to route messages if successful.
 //
-//   Router.acceptPeers(): in a loop, waits for the next inbound connection
-//   from a peer, and checks with the PeerManager if it should be accepted.
+//   evictPeers(): in a loop, calls PeerManager.EvictNext() to get the next
+//   peer to evict, and disconnects it by closing its message queue.
 //
-//   Router.evictPeers(): in a loop, asks the PeerManager for any connected
-//   peers to evict, and disconnects them.
+// When a peer is connected, an outbound peer message queue is registered in
+// peerQueues, and routePeer() is called to spawn off two additional goroutines:
 //
-// Once either an inbound or outbound connection has been made, an outbound
-// message queue is registered in Router.peerQueues and a goroutine is spawned
-// off for Router.routePeer() which will spawn off additional goroutines for
-// Router.sendPeer() that sends outbound messages from the peer queue over the
-// connection and for Router.receivePeer() that reads inbound messages from
-// the connection and places them in the appropriate channel queue. When either
-// goroutine exits, the connection and peer queue is closed, which will cause
-// the other goroutines to close as well.
+//   sendPeer(): waits for an outbound message from the peerQueues queue,
+//   marshals it, and passes it to the peer transport which delivers it.
 //
-// The peerStore is used to coordinate peer connections, by only allowing a peer
-// to be claimed (owned) by a single caller at a time (both for outbound and
-// inbound connections). This is done either via peerStore.Dispense() which
-// dispenses and claims an eligible peer to dial, or via peerStore.Claim() which
-// attempts to claim a given peer for an inbound connection. Peers must be
-// returned to the peerStore with peerStore.Return() to release the claim. Over
-// time, the peerStore will also do peer scheduling and prioritization, e.g.
-// ensuring we do exponential backoff on dial failures and connecting to
-// more important peers first (such as persistent peers and validators).
+//   receivePeer(): waits for an inbound message from the peer transport,
+//   unmarshals it, and passes it to the appropriate inbound channel queue
+//   in channelQueues.
 //
-// An additional goroutine Router.broadcastPeerUpdates() is also spawned off
-// on startup, which consumes peer updates from Router.peerUpdatesCh (currently
-// only connections and disconnections), and broadcasts them to all peer update
-// subscriptions registered via SubscribePeerUpdates().
+// When a reactor opens a channel via OpenChannel, an inbound channel message
+// queue is registered in channelQueues, and a channel goroutine is spawned:
 //
-// On router shutdown, we close Router.stopCh which will signal to all
-// goroutines to terminate. This in turn will cause all pending channel/peer
-// queues to close, and we wait for this as a signal that goroutines have ended.
+//   routeChannel(): waits for an outbound message from the channel, looks
+//   up the recipient peer's outbound message queue in peerQueues, and submits
+//   the message to it.
 //
-// All message scheduling should be limited to the queue implementations used
-// for channel queues and peer queues. All message sending throughout the router
-// is blocking, and if any messages should be dropped or buffered this is the
-// sole responsibility of the queue, such that we can limit this logic to a
-// single place. There is currently only a FIFO queue implementation that always
-// blocks and never drops messages, but this must be improved with other
-// implementations. The only exception is that all message sending must also
-// select on appropriate channel/queue/router closure signals, to avoid blocking
-// forever on a channel that has no consumer.
+// All channel sends in the router are blocking. It is the responsibility of the
+// queue interface in peerQueues and channelQueues to prioritize and drop
+// messages as appropriate during contention to prevent stalls and ensure good
+// quality of service.
 type Router struct {
 	*service.BaseService

-	logger      log.Logger
-	nodeInfo    NodeInfo
-	privKey     crypto.PrivKey
-	transports  map[Protocol]Transport
-	peerManager *PeerManager
-	options     RouterOptions
+	logger             log.Logger
+	options            RouterOptions
+	nodeInfo           NodeInfo
+	privKey            crypto.PrivKey
+	peerManager        *PeerManager
+	transports         []Transport
+	protocolTransports map[Protocol]Transport
+	stopCh             chan struct{} // signals Router shutdown

-	// FIXME: Consider using sync.Map.
 	peerMtx    sync.RWMutex
 	peerQueues map[NodeID]queue

@@ -114,12 +191,11 @@ type Router struct {
 	channelMtx      sync.RWMutex
 	channelQueues   map[ChannelID]queue
 	channelMessages map[ChannelID]proto.Message
-
-	// stopCh is used to signal router shutdown, by closing the channel.
-	stopCh chan struct{}
 }

-// NewRouter creates a new Router.
+// NewRouter creates a new Router. The given Transports must already be
+// listening on appropriate interfaces, and will be closed by the Router when it
+// stops.
 func NewRouter(
 	logger log.Logger,
 	nodeInfo NodeInfo,
@@ -133,23 +209,24 @@ func NewRouter(
 	}

 	router := &Router{
-		logger:          logger,
-		nodeInfo:        nodeInfo,
-		privKey:         privKey,
-		transports:      map[Protocol]Transport{},
-		peerManager:     peerManager,
-		options:         options,
-		stopCh:          make(chan struct{}),
-		channelQueues:   map[ChannelID]queue{},
-		channelMessages: map[ChannelID]proto.Message{},
-		peerQueues:      map[NodeID]queue{},
+		logger:             logger,
+		nodeInfo:           nodeInfo,
+		privKey:            privKey,
+		transports:         transports,
+		protocolTransports: map[Protocol]Transport{},
+		peerManager:        peerManager,
+		options:            options,
+		stopCh:             make(chan struct{}),
+		channelQueues:      map[ChannelID]queue{},
+		channelMessages:    map[ChannelID]proto.Message{},
+		peerQueues:         map[NodeID]queue{},
 	}
 	router.BaseService = service.NewBaseService(logger, "router", router)

 	for _, transport := range transports {
 		for _, protocol := range transport.Protocols() {
-			if _, ok := router.transports[protocol]; !ok {
-				router.transports[protocol] = transport
+			if _, ok := router.protocolTransports[protocol]; !ok {
+				router.protocolTransports[protocol] = transport
 			}
 		}
 	}
@@ -158,12 +235,20 @@ func NewRouter(
 }

 // OpenChannel opens a new channel for the given message type. The caller must
-// close the channel when done, and this must happen before the router stops.
+// close the channel when done, before stopping the Router. messageType is the
+// type of message passed through the channel (used for unmarshaling), which can
+// implement Wrapper to automatically (un)wrap multiple message types in a
+// wrapper message.
 func (r *Router) OpenChannel(id ChannelID, messageType proto.Message) (*Channel, error) {
-	// FIXME: NewChannel should take directional channels so we can pass
-	// queue.dequeue() instead of reaching inside for queue.queueCh.
 	queue := newFIFOQueue()
-	channel := NewChannel(id, messageType, queue.queueCh, make(chan Envelope), make(chan PeerError))
+	outCh := make(chan Envelope)
+	errCh := make(chan PeerError)
+	channel := NewChannel(id, messageType, queue.dequeue(), outCh, errCh)
+
+	var wrapper Wrapper
+	if w, ok := messageType.(Wrapper); ok {
+		wrapper = w
+	}

 	r.channelMtx.Lock()
 	defer r.channelMtx.Unlock()
@@ -182,98 +267,97 @@ func (r *Router) OpenChannel(id ChannelID, messageType proto.Message) (*Channel,
 			r.channelMtx.Unlock()
 			queue.close()
 		}()
-		r.routeChannel(channel)
+
+		r.routeChannel(id, outCh, errCh, wrapper)
 	}()

 	return channel, nil
 }

-// routeChannel receives outbound messages and errors from a channel and routes
-// them to the appropriate peer. It returns when either the channel is closed or
-// the router is shutting down.
-func (r *Router) routeChannel(channel *Channel) {
+// routeChannel receives outbound channel messages and routes them to the
+// appropriate peer. It also receives peer errors and reports them to the peer
+// manager. It returns when either the outbound channel or error channel is
+// closed, or the Router is stopped. wrapper is an optional message wrapper
+// for messages, see Wrapper for details.
+func (r *Router) routeChannel(
+	chID ChannelID,
+	outCh <-chan Envelope,
+	errCh <-chan PeerError,
+	wrapper Wrapper,
+) {
 	for {
 		select {
-		case envelope, ok := <-channel.outCh:
+		case envelope, ok := <-outCh:
 			if !ok {
 				return
 			}

-			// FIXME: This is a bit unergonomic, maybe it'd be better for Wrap()
-			// to return a wrapped copy.
-			if _, ok := channel.messageType.(Wrapper); ok {
-				wrapper := proto.Clone(channel.messageType)
-				if err := wrapper.(Wrapper).Wrap(envelope.Message); err != nil {
-					r.Logger.Error("failed to wrap message", "err", err)
+			// Mark the envelope with the channel ID to allow sendPeer() to pass
+			// it on to Transport.SendMessage().
+			envelope.channelID = chID
+
+			// Wrap the message in a wrapper message, if requested.
+			if wrapper != nil {
+				msg := proto.Clone(wrapper)
+				if err := msg.(Wrapper).Wrap(envelope.Message); err != nil {
+					r.Logger.Error("failed to wrap message", "channel", chID, "err", err)
 					continue
 				}
-				envelope.Message = wrapper
+				envelope.Message = msg
 			}
-			envelope.channelID = channel.id

+			// Collect peer queues to pass the message via.
+			var queues []queue
 			if envelope.Broadcast {
 				r.peerMtx.RLock()
-				peerQueues := make(map[NodeID]queue, len(r.peerQueues))
-				for peerID, peerQueue := range r.peerQueues {
-					peerQueues[peerID] = peerQueue
+				queues = make([]queue, 0, len(r.peerQueues))
+				for _, q := range r.peerQueues {
+					queues = append(queues, q)
 				}
 				r.peerMtx.RUnlock()
-
-				for peerID, peerQueue := range peerQueues {
-					e := envelope
-					e.Broadcast = false
-					e.To = peerID
-					select {
-					case peerQueue.enqueue() <- e:
-					case <-peerQueue.closed():
-					case <-r.stopCh:
-						return
-					}
-				}
-
 			} else {
 				r.peerMtx.RLock()
-				peerQueue, ok := r.peerQueues[envelope.To]
+				q, ok := r.peerQueues[envelope.To]
 				r.peerMtx.RUnlock()
 				if !ok {
-					r.logger.Error("dropping message for non-connected peer",
-						"peer", envelope.To, "channel", channel.id)
+					r.logger.Debug("dropping message for unconnected peer",
+						"peer", envelope.To, "channel", chID)
 					continue
 				}
+				queues = []queue{q}
+			}

+			// Send message to peers.
+			for _, q := range queues {
 				select {
-				case peerQueue.enqueue() <- envelope:
-				case <-peerQueue.closed():
-					r.logger.Error("dropping message for non-connected peer",
-						"peer", envelope.To, "channel", channel.id)
+				case q.enqueue() <- envelope:
+				case <-q.closed():
+					r.logger.Debug("dropping message for unconnected peer",
+						"peer", envelope.To, "channel", chID)
 				case <-r.stopCh:
 					return
 				}
 			}

-		case peerError, ok := <-channel.errCh:
+		case peerError, ok := <-errCh:
 			if !ok {
 				return
 			}
-			// FIXME: We just disconnect the peer for now
-			r.logger.Error("peer error, disconnecting", "peer", peerError.PeerID, "err", peerError.Err)
-			r.peerMtx.RLock()
-			peerQueue, ok := r.peerQueues[peerError.PeerID]
-			r.peerMtx.RUnlock()
-			if ok {
-				peerQueue.close()
+			r.logger.Error("peer error, evicting", "peer", peerError.NodeID, "err", peerError.Err)
+			if err := r.peerManager.Errored(peerError.NodeID, peerError.Err); err != nil {
+				r.logger.Error("failed to report peer error", "peer", peerError.NodeID, "err", err)
 			}

-		case <-channel.Done():
-			return
 		case <-r.stopCh:
 			return
 		}
 	}
 }

-// acceptPeers accepts inbound connections from peers on the given transport.
+// acceptPeers accepts inbound connections from peers on the given transport,
+// and spawns goroutines that route messages to/from them.
 func (r *Router) acceptPeers(transport Transport) {
+	r.logger.Debug("starting accept routine", "transport", transport)
 	ctx := r.stopCtx()
 	for {
 		// FIXME: We may need transports to enforce some sort of rate limiting
@@ -301,34 +385,37 @@ func (r *Router) acceptPeers(transport Transport) {
 			return
 		default:
 			r.logger.Error("failed to accept connection", "transport", transport, "err", err)
-			continue
+			return
 		}

+		// Spawn a goroutine for the handshake, to avoid head-of-line blocking.
 		go func() {
-			defer func() {
-				_ = conn.Close()
-			}()
+			defer conn.Close()

-			// FIXME: Because we do the handshake in each transport, rather than
-			// here in the Router, the remote peer will think they've
-			// successfully connected and start sending us messages, although we
-			// can end up rejecting the connection here. This can e.g. cause
-			// problems in tests, where because of race conditions a
-			// disconnection can cause the local node to immediately redial,
-			// while the remote node may not have completed the disconnection
-			// registration yet and reject the accept below.
+			// FIXME: The peer manager may reject the peer during Accepted()
+			// after we've handshaked with the peer (to find out which peer it
+			// is). However, because the handshake has no ack, the remote peer
+			// will think the handshake was successful and start sending us
+			// messages.
 			//
-			// The Router should do the handshake, and we should check with the
-			// peer manager before completing the handshake -- this probably
-			// requires protocol changes to send an additional message when the
-			// handshake is accepted.
+			// This can cause problems in tests, where a disconnection can cause
+			// the local node to immediately redial, while the remote node may
+			// not have completed the disconnection yet and therefore reject the
+			// reconnection attempt (since it thinks we're still connected from
+			// before).
+			//
+			// The Router should do the handshake and have a final ack/fail
+			// message to make sure both ends have accepted the connection, such
+			// that it can be coordinated with the peer manager.
 			peerInfo, _, err := r.handshakePeer(ctx, conn, "")
-			if err == context.Canceled {
+			switch {
+			case errors.Is(err, context.Canceled):
 				return
-			} else if err != nil {
-				r.logger.Error("failed to handshake with peer", "err", err)
+			case err != nil:
+				r.logger.Error("peer handshake failed", "endpoint", conn, "err", err)
 				return
 			}
+
 			if err := r.peerManager.Accepted(peerInfo.NodeID); err != nil {
 				r.logger.Error("failed to accept connection", "peer", peerInfo.NodeID, "err", err)
 				return
@@ -338,7 +425,6 @@ func (r *Router) acceptPeers(transport Transport) {
 			r.peerMtx.Lock()
 			r.peerQueues[peerInfo.NodeID] = queue
 			r.peerMtx.Unlock()
-			r.peerManager.Ready(peerInfo.NodeID)

 			defer func() {
 				r.peerMtx.Lock()
@@ -350,52 +436,62 @@ func (r *Router) acceptPeers(transport Transport) {
 				}
 			}()

+			if err := r.peerManager.Ready(peerInfo.NodeID); err != nil {
+				r.logger.Error("failed to mark peer as ready", "peer", peerInfo.NodeID, "err", err)
+				return
+			}
+
 			r.routePeer(peerInfo.NodeID, conn, queue)
 		}()
 	}
 }

-// dialPeers maintains outbound connections to peers.
+// dialPeers maintains outbound connections to peers by dialing them.
 func (r *Router) dialPeers() {
+	r.logger.Debug("starting dial routine")
 	ctx := r.stopCtx()
 	for {
-		peerID, address, err := r.peerManager.DialNext(ctx)
-		switch err {
-		case nil:
-		case context.Canceled:
+		address, err := r.peerManager.DialNext(ctx)
+		switch {
+		case errors.Is(err, context.Canceled):
 			r.logger.Debug("stopping dial routine")
 			return
-		default:
+		case err != nil:
 			r.logger.Error("failed to find next peer to dial", "err", err)
 			return
 		}

+		// Spawn off a goroutine to actually dial the peer, so that we can
+		// dial multiple peers in parallel.
 		go func() {
 			conn, err := r.dialPeer(ctx, address)
-			if errors.Is(err, context.Canceled) {
+			switch {
+			case errors.Is(err, context.Canceled):
 				return
-			} else if err != nil {
-				r.logger.Error("failed to dial peer", "peer", peerID, "err", err)
-				if err = r.peerManager.DialFailed(peerID, address); err != nil {
-					r.logger.Error("failed to report dial failure", "peer", peerID, "err", err)
+			case err != nil:
+				r.logger.Error("failed to dial peer", "peer", address, "err", err)
+				if err = r.peerManager.DialFailed(address); err != nil {
+					r.logger.Error("failed to report dial failure", "peer", address, "err", err)
 				}
 				return
 			}
 			defer conn.Close()

+			peerID := address.NodeID
 			_, _, err = r.handshakePeer(ctx, conn, peerID)
-			if errors.Is(err, context.Canceled) {
+			switch {
+			case errors.Is(err, context.Canceled):
 				return
-			} else if err != nil {
-				r.logger.Error("failed to handshake with peer", "peer", peerID, "err", err)
-				if err = r.peerManager.DialFailed(peerID, address); err != nil {
-					r.logger.Error("failed to report dial failure", "peer", peerID, "err", err)
+			case err != nil:
+				r.logger.Error("failed to handshake with peer", "peer", address, "err", err)
+				if err = r.peerManager.DialFailed(address); err != nil {
+					r.logger.Error("failed to report dial failure", "peer", address, "err", err)
 				}
 				return
 			}

-			if err = r.peerManager.Dialed(peerID, address); err != nil {
-				r.logger.Error("failed to dial peer", "peer", peerID, "err", err)
+			if err = r.peerManager.Dialed(address); err != nil {
+				r.logger.Error("failed to dial peer", "peer", address, "err", err)
 				return
 			}

@@ -403,7 +499,6 @@ func (r *Router) dialPeers() {
 			r.peerMtx.Lock()
 			r.peerQueues[peerID] = queue
 			r.peerMtx.Unlock()
-			r.peerManager.Ready(peerID)

 			defer func() {
 				r.peerMtx.Lock()
@@ -411,10 +506,15 @@ func (r *Router) dialPeers() {
 				r.peerMtx.Unlock()
 				queue.close()
 				if err := r.peerManager.Disconnected(peerID); err != nil {
-					r.logger.Error("failed to disconnect peer", "peer", peerID, "err", err)
+					r.logger.Error("failed to disconnect peer", "peer", address, "err", err)
 				}
 			}()

+			if err := r.peerManager.Ready(peerID); err != nil {
+				r.logger.Error("failed to mark peer as ready", "peer", address, "err", err)
+				return
+			}
+
 			r.routePeer(peerID, conn, queue)
 		}()
 	}
@@ -422,22 +522,26 @@ func (r *Router) dialPeers() {

 // dialPeer connects to a peer by dialing it.
 func (r *Router) dialPeer(ctx context.Context, address NodeAddress) (Connection, error) {
-	r.logger.Info("resolving peer address", "address", address)
 	resolveCtx := ctx
 	if r.options.ResolveTimeout > 0 {
 		var cancel context.CancelFunc
 		resolveCtx, cancel = context.WithTimeout(resolveCtx, r.options.ResolveTimeout)
 		defer cancel()
 	}
+
+	r.logger.Debug("resolving peer address", "peer", address)
 	endpoints, err := address.Resolve(resolveCtx)
-	if err != nil {
+	switch {
+	case err != nil:
 		return nil, fmt.Errorf("failed to resolve address %q: %w", address, err)
+	case len(endpoints) == 0:
+		return nil, fmt.Errorf("address %q did not resolve to any endpoints", address)
 	}

 	for _, endpoint := range endpoints {
-		transport, ok := r.transports[endpoint.Protocol]
+		transport, ok := r.protocolTransports[endpoint.Protocol]
 		if !ok {
-			r.logger.Error("no transport found for endpoint protocol", "endpoint", endpoint)
+			r.logger.Error("no transport found for protocol", "endpoint", endpoint)
 			continue
 		}

@@ -457,17 +561,17 @@ func (r *Router) dialPeer(ctx context.Context, address NodeAddress) (Connection,
 		// Internet can't and needs a different public address.
 		conn, err := transport.Dial(dialCtx, endpoint)
 		if err != nil {
-			r.logger.Error("failed to dial endpoint", "endpoint", endpoint, "err", err)
+			r.logger.Error("failed to dial endpoint", "peer", address.NodeID, "endpoint", endpoint, "err", err)
 		} else {
-			r.logger.Info("connected to peer", "peer", address.NodeID, "endpoint", endpoint)
+			r.logger.Debug("dialed peer", "peer", address.NodeID, "endpoint", endpoint)
 			return conn, nil
 		}
 	}
-	return nil, fmt.Errorf("failed to connect to peer via %q", address)
+	return nil, errors.New("all endpoints failed")
 }

 // handshakePeer handshakes with a peer, validating the peer's information. If
-// expectID is given, we check that the peer's public key matches it.
+// expectID is given, we check that the peer's info matches it.
 func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID NodeID) (NodeInfo, crypto.PubKey, error) {
 	if r.options.HandshakeTimeout > 0 {
 		var cancel context.CancelFunc
@@ -478,51 +582,47 @@ func (r *Router) handshakePeer(ctx context.Context, conn Connection, expectID No
 	if err != nil {
 		return peerInfo, peerKey, err
 	}
+
 	if err = peerInfo.Validate(); err != nil {
 		return peerInfo, peerKey, fmt.Errorf("invalid handshake NodeInfo: %w", err)
 	}
-	if expectID != "" && expectID != peerInfo.NodeID {
-		return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q",
-			expectID, peerInfo.NodeID)
-	}
 	if NodeIDFromPubKey(peerKey) != peerInfo.NodeID {
 		return peerInfo, peerKey, fmt.Errorf("peer's public key did not match its node ID %q (expected %q)",
 			peerInfo.NodeID, NodeIDFromPubKey(peerKey))
 	}
-	if peerInfo.NodeID == r.nodeInfo.NodeID {
-		return peerInfo, peerKey, errors.New("rejecting handshake with self")
+	if expectID != "" && expectID != peerInfo.NodeID {
+		return peerInfo, peerKey, fmt.Errorf("expected to connect with peer %q, got %q",
+			expectID, peerInfo.NodeID)
 	}
 	return peerInfo, peerKey, nil
 }

-// routePeer routes inbound messages from a peer to channels, and also sends
-// outbound queued messages to the peer. It will close the connection and send
-// queue, using this as a signal to coordinate the internal receivePeer() and
-// sendPeer() goroutines. It blocks until the peer is done, e.g. when the
-// connection or queue is closed.
+// routePeer routes inbound and outbound messages between a peer and the reactor
+// channels. It closes the given connection and send queue when done; if either
+// is closed elsewhere, that causes this method to shut down and return.
 func (r *Router) routePeer(peerID NodeID, conn Connection, sendQueue queue) {
-	r.logger.Info("routing peer", "peer", peerID)
-	resultsCh := make(chan error, 2)
+	r.logger.Info("peer connected", "peer", peerID, "endpoint", conn)
+	errCh := make(chan error, 2)
 	go func() {
-		resultsCh <- r.receivePeer(peerID, conn)
+		errCh <- r.receivePeer(peerID, conn)
 	}()
 	go func() {
-		resultsCh <- r.sendPeer(peerID, conn, sendQueue)
+		errCh <- r.sendPeer(peerID, conn, sendQueue)
 	}()

-	err := <-resultsCh
+	err := <-errCh
 	_ = conn.Close()
 	sendQueue.close()
-	if e := <-resultsCh; err == nil {
-		// The first err was nil, so we update it with the second result,
-		// which may or may not be nil.
+	if e := <-errCh; err == nil {
+		// The first err was nil, so we update it with the second err, which may
+		// or may not be nil.
 		err = e
 	}
 	switch err {
-	case nil, io.EOF, ErrTransportClosed{}:
-		r.logger.Info("peer disconnected", "peer", peerID)
+	case nil, io.EOF:
+		r.logger.Info("peer disconnected", "peer", peerID, "endpoint", conn)
 	default:
-		r.logger.Error("peer failure", "peer", peerID, "err", err)
+		r.logger.Error("peer failure", "peer", peerID, "endpoint", conn, "err", err)
 	}
 }

@@ -540,7 +640,7 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error {
 		messageType := r.channelMessages[chID]
 		r.channelMtx.RUnlock()
 		if !ok {
-			r.logger.Error("dropping message for unknown channel", "peer", peerID, "channel", chID)
+			r.logger.Debug("dropping message for unknown channel", "peer", peerID, "channel", chID)
 			continue
 		}

@@ -558,10 +658,10 @@ func (r *Router) receivePeer(peerID NodeID, conn Connection) error {
 		}

 		select {
-		case queue.enqueue() <- Envelope{channelID: chID, From: peerID, Message: msg}:
+		case queue.enqueue() <- Envelope{From: peerID, Message: msg}:
 			r.logger.Debug("received message", "peer", peerID, "message", msg)
 		case <-queue.closed():
-			r.logger.Error("channel closed, dropping message", "peer", peerID, "channel", chID)
+			r.logger.Debug("channel closed, dropping message", "peer", peerID, "channel", chID)
 		case <-r.stopCh:
 			return nil
 		}
@@ -573,6 +673,10 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error {
 	for {
 		select {
 		case envelope := <-queue.dequeue():
+			if envelope.Message == nil {
+				r.logger.Error("dropping nil message", "peer", peerID)
+				continue
+			}
 			bz, err := proto.Marshal(envelope.Message)
 			if err != nil {
 				r.logger.Error("failed to marshal message", "peer", peerID, "err", err)
@@ -596,43 +700,57 @@ func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error {

 // evictPeers evicts connected peers as requested by the peer manager.
 func (r *Router) evictPeers() {
+	r.logger.Debug("starting evict routine")
 	ctx := r.stopCtx()
 	for {
 		peerID, err := r.peerManager.EvictNext(ctx)
-		switch err {
-		case nil:
-		case context.Canceled:
+		switch {
+		case errors.Is(err, context.Canceled):
 			r.logger.Debug("stopping evict routine")
 			return
-		default:
+		case err != nil:
 			r.logger.Error("failed to find next peer to evict", "err", err)
 			return
 		}

 		r.logger.Info("evicting peer", "peer", peerID)
 		r.peerMtx.RLock()
-		if queue, ok := r.peerQueues[peerID]; ok {
+		queue, ok := r.peerQueues[peerID]
+		r.peerMtx.RUnlock()
+		if ok {
 			queue.close()
 		}
-		r.peerMtx.RUnlock()
 	}
 }

 // OnStart implements service.Service.
 func (r *Router) OnStart() error {
 	go r.dialPeers()
+	go r.evictPeers()
 	for _, transport := range r.transports {
 		go r.acceptPeers(transport)
 	}
-	go r.evictPeers()
 	return nil
 }

 // OnStop implements service.Service.
 //
-// FIXME: This needs to close transports as well.
+// All channels must be closed by OpenChannel() callers before stopping the
+// router, to prevent blocked channel sends in reactors. Channels are not closed
+// here, since that would cause any reactor senders to panic, so it is the
+// sender's responsibility.
 func (r *Router) OnStop() {
-	// Collect all active queues, so we can wait for them to close.
+	// Signal router shutdown.
+	close(r.stopCh)
+
+	// Close transport listeners (unblocks Accept calls).
+	for _, transport := range r.transports {
+		if err := transport.Close(); err != nil {
+			r.logger.Error("failed to close transport", "transport", transport, "err", err)
+		}
+	}
+
+	// Collect all remaining queues, and wait for them to close.
 	queues := []queue{}
 	r.channelMtx.RLock()
 	for _, q := range r.channelQueues {
@@ -644,16 +762,12 @@ func (r *Router) OnStop() {
 		queues = append(queues, q)
 	}
 	r.peerMtx.RUnlock()
-
-	// Signal router shutdown, and wait for queues (and thus goroutines)
-	// to complete.
-	close(r.stopCh)
 	for _, q := range queues {
 		<-q.closed()
 	}
 }

-// stopCtx returns a context that is cancelled when the router stops.
+// stopCtx returns a new context that is cancelled when the router stops.
 func (r *Router) stopCtx() context.Context {
 	ctx, cancel := context.WithCancel(context.Background())
 	go func() {
@@ -2,159 +2,644 @@ package p2p_test

 import (
 	"errors"
+	"fmt"
+	"io"
+	"strings"
+	"sync"
 	"testing"
+	"time"

 	"github.com/fortytw2/leaktest"
+	"github.com/gogo/protobuf/proto"
 	gogotypes "github.com/gogo/protobuf/types"
-	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 	dbm "github.com/tendermint/tm-db"

 	"github.com/tendermint/tendermint/crypto"
-	"github.com/tendermint/tendermint/crypto/ed25519"
 	"github.com/tendermint/tendermint/libs/log"
+	tmsync "github.com/tendermint/tendermint/libs/sync"
 	"github.com/tendermint/tendermint/p2p"
+	"github.com/tendermint/tendermint/p2p/mocks"
+	"github.com/tendermint/tendermint/p2p/p2ptest"
 )

-type TestMessage = gogotypes.StringValue
-
-func generateNode() (p2p.NodeInfo, crypto.PrivKey) {
-	privKey := ed25519.GenPrivKey()
-	nodeID := p2p.NodeIDFromPubKey(privKey.PubKey())
-	nodeInfo := p2p.NodeInfo{
-		NodeID: nodeID,
-		// FIXME: We have to fake a ListenAddr for now.
-		ListenAddr: "127.0.0.1:1234",
-		Moniker:    "foo",
-	}
-	return nodeInfo, privKey
-}
-
 func echoReactor(channel *p2p.Channel) {
 	for {
 		select {
-		case envelope := <-channel.In():
-			channel.Out() <- p2p.Envelope{
+		case envelope := <-channel.In:
+			value := envelope.Message.(*p2ptest.Message).Value
+			channel.Out <- p2p.Envelope{
 				To:      envelope.From,
-				Message: &TestMessage{Value: envelope.Message.(*TestMessage).Value},
+				Message: &p2ptest.Message{Value: value},
 			}
+
 		case <-channel.Done():
 			return
 		}
 	}
 }

-func TestRouter(t *testing.T) {
-	defer leaktest.Check(t)()
+func TestRouter_Network(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))

-	logger := log.TestingLogger()
-	network := p2p.NewMemoryNetwork(logger)
-	nodeInfo, privKey := generateNode()
-	transport := network.CreateTransport(nodeInfo.NodeID)
-	defer transport.Close()
-	chID := p2p.ChannelID(1)
+	// Create a test network and open a channel where all peers run echoReactor.
+	network := p2ptest.MakeNetwork(t, 8)
+	local := network.RandomNode()
+	peers := network.Peers(local.NodeID)
+	channels := network.MakeChannels(t, 1, &p2ptest.Message{})

-	// Start some other in-memory network nodes to communicate with, running
-	// a simple echo reactor that returns received messages.
-	peers := []p2p.NodeAddress{}
-	for i := 0; i < 3; i++ {
-		peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
-		require.NoError(t, err)
-		peerInfo, peerKey := generateNode()
-		peerTransport := network.CreateTransport(peerInfo.NodeID)
-		defer peerTransport.Close()
-		peerRouter, err := p2p.NewRouter(
-			logger.With("peerID", i),
-			peerInfo,
-			peerKey,
-			peerManager,
-			[]p2p.Transport{peerTransport},
-			p2p.RouterOptions{},
+	channel := channels[local.NodeID]
+	for _, peer := range peers {
+		go echoReactor(channels[peer.NodeID])
+	}
+
+	// Sending a message to each peer should work.
+	for _, peer := range peers {
+		p2ptest.RequireSendReceive(t, channel, peer.NodeID,
+			&p2ptest.Message{Value: "foo"},
+			&p2ptest.Message{Value: "foo"},
 		)
-		require.NoError(t, err)
-		peers = append(peers, peerTransport.Endpoints()[0].NodeAddress(peerInfo.NodeID))
-
-		channel, err := peerRouter.OpenChannel(chID, &TestMessage{})
-		require.NoError(t, err)
-		defer channel.Close()
-		go echoReactor(channel)
-
-		err = peerRouter.Start()
-		require.NoError(t, err)
-		defer func() { require.NoError(t, peerRouter.Stop()) }()
 	}

-	// Start the main router and connect it to the peers above.
-	peerManager, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
-	require.NoError(t, err)
-	defer peerManager.Close()
-	for _, address := range peers {
-		err := peerManager.Add(address)
-		require.NoError(t, err)
-	}
-	peerUpdates := peerManager.Subscribe()
-	defer peerUpdates.Close()
-
-	router, err := p2p.NewRouter(logger, nodeInfo, privKey, peerManager, []p2p.Transport{transport}, p2p.RouterOptions{})
-	require.NoError(t, err)
-	channel, err := router.OpenChannel(chID, &TestMessage{})
-	require.NoError(t, err)
-	defer channel.Close()
-
-	err = router.Start()
-	require.NoError(t, err)
-	defer func() {
-		// Since earlier defers are closed after this, and we have to make sure
-		// we close channels and subscriptions before the router, we explicitly
-		// close them here to.
-		peerUpdates.Close()
-		channel.Close()
-		require.NoError(t, router.Stop())
-	}()
-
-	// Wait for peers to come online, and ping them as they do.
-	for i := 0; i < len(peers); i++ {
-		peerUpdate := <-peerUpdates.Updates()
-		peerID := peerUpdate.PeerID
-		require.Equal(t, p2p.PeerUpdate{
-			PeerID: peerID,
-			Status: p2p.PeerStatusUp,
-		}, peerUpdate)
-
-		channel.Out() <- p2p.Envelope{To: peerID, Message: &TestMessage{Value: "hi!"}}
-		assert.Equal(t, p2p.Envelope{
-			From:    peerID,
-			Message: &TestMessage{Value: "hi!"},
-		}, (<-channel.In()).Strip())
-	}
-
-	// We now send a broadcast, which we should return back from all peers.
-	channel.Out() <- p2p.Envelope{
+	// Sending a broadcast should return back a message from all peers.
+	p2ptest.RequireSend(t, channel, p2p.Envelope{
 		Broadcast: true,
-		Message:   &TestMessage{Value: "broadcast"},
-	}
-	for i := 0; i < len(peers); i++ {
-		envelope := <-channel.In()
-		require.Equal(t, &TestMessage{Value: "broadcast"}, envelope.Message)
+		Message:   &p2ptest.Message{Value: "bar"},
+	})
+	expect := []p2p.Envelope{}
+	for _, peer := range peers {
+		expect = append(expect, p2p.Envelope{
+			From:    peer.NodeID,
+			Message: &p2ptest.Message{Value: "bar"},
+		})
 	}
+	p2ptest.RequireReceiveUnordered(t, channel, expect)

-	// We then submit an error for a peer, and watch it get disconnected.
-	channel.Error() <- p2p.PeerError{
-		PeerID:   peers[0].NodeID,
-		Err:      errors.New("test error"),
-		Severity: p2p.PeerErrorSeverityCritical,
+	// We then submit an error for a peer, and watch it get disconnected and
+	// then reconnected as the router retries it.
+	peerUpdates := local.MakePeerUpdates(t)
+	channel.Error <- p2p.PeerError{
+		NodeID: peers[0].NodeID,
+		Err:    errors.New("boom"),
 	}
-	peerUpdate := <-peerUpdates.Updates()
-	require.Equal(t, p2p.PeerUpdate{
-		PeerID: peers[0].NodeID,
+	p2ptest.RequireUpdates(t, peerUpdates, []p2p.PeerUpdate{
+		{NodeID: peers[0].NodeID, Status: p2p.PeerStatusDown},
+		{NodeID: peers[0].NodeID, Status: p2p.PeerStatusUp},
+	})
+}
+
+func TestRouter_Channel(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Set up a router with no transports (so no peers).
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager, nil, p2p.RouterOptions{})
+	require.NoError(t, err)
+
+	require.NoError(t, router.Start())
+	t.Cleanup(func() {
+		require.NoError(t, router.Stop())
+	})
+
+	// Opening a channel should work.
+	channel, err := router.OpenChannel(chID, &p2ptest.Message{})
+	require.NoError(t, err)
+
+	// Opening the same channel again should fail.
+	_, err = router.OpenChannel(chID, &p2ptest.Message{})
+	require.Error(t, err)
+
+	// Opening a different channel should work.
+	_, err = router.OpenChannel(2, &p2ptest.Message{})
+	require.NoError(t, err)
+
+	// Closing the channel, then opening it again should be fine.
+	channel.Close()
+	time.Sleep(100 * time.Millisecond) // yes yes, but Close() is async...
+
+	channel, err = router.OpenChannel(chID, &p2ptest.Message{})
+	require.NoError(t, err)
+
+	// We should be able to send on the channel, even though there are no peers.
+	p2ptest.RequireSend(t, channel, p2p.Envelope{
+		To:      p2p.NodeID(strings.Repeat("a", 40)),
+		Message: &p2ptest.Message{Value: "foo"},
+	})
+
+	// A message to ourselves should be dropped.
+	p2ptest.RequireSend(t, channel, p2p.Envelope{
+		To:      selfID,
+		Message: &p2ptest.Message{Value: "self"},
+	})
+	p2ptest.RequireEmpty(t, channel)
+}
+
+// Channel tests are hairy to mock, so we use an in-memory network instead.
+func TestRouter_Channel_SendReceive(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Create a test network and open a channel on all nodes.
+	network := p2ptest.MakeNetwork(t, 3)
+	ids := network.NodeIDs()
+	aID, bID, cID := ids[0], ids[1], ids[2]
+	channels := network.MakeChannels(t, chID, &p2ptest.Message{})
+	a, b, c := channels[aID], channels[bID], channels[cID]
+	otherChannels := network.MakeChannels(t, 9, &p2ptest.Message{})
+
+	// Sending a message a->b should work, and not send anything
+	// further to a, b, or c.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Sending a nil message a->b should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: nil})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Sending a different message type should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Sending to an unknown peer should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{
+		To:      p2p.NodeID(strings.Repeat("a", 40)),
+		Message: &p2ptest.Message{Value: "a"},
+	})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Sending without a recipient should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{Message: &p2ptest.Message{Value: "noto"}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Sending to self should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "self"}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// Removing b and sending to it should be dropped.
+	network.Remove(t, bID)
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "nob"}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// After all this, sending a message c->a should work.
+	p2ptest.RequireSend(t, c, p2p.Envelope{To: aID, Message: &p2ptest.Message{Value: "bar"}})
+	p2ptest.RequireReceive(t, a, p2p.Envelope{From: cID, Message: &p2ptest.Message{Value: "bar"}})
+	p2ptest.RequireEmpty(t, a, b, c)
+
+	// None of these messages should have made it onto the other channels.
+	for _, other := range otherChannels {
+		p2ptest.RequireEmpty(t, other)
+	}
+}
+
+func TestRouter_Channel_Broadcast(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Create a test network and open a channel on all nodes.
+	network := p2ptest.MakeNetwork(t, 4)
+	ids := network.NodeIDs()
+	aID, bID, cID, dID := ids[0], ids[1], ids[2], ids[3]
+	channels := network.MakeChannels(t, 1, &p2ptest.Message{})
+	a, b, c, d := channels[aID], channels[bID], channels[cID], channels[dID]
+
+	// Sending a broadcast from b should work.
+	p2ptest.RequireSend(t, b, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireReceive(t, a, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireReceive(t, c, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireReceive(t, d, p2p.Envelope{From: bID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireEmpty(t, a, b, c, d)
+
+	// Removing one node from the network shouldn't prevent broadcasts from working.
+	network.Remove(t, dID)
+	p2ptest.RequireSend(t, a, p2p.Envelope{Broadcast: true, Message: &p2ptest.Message{Value: "bar"}})
+	p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}})
+	p2ptest.RequireReceive(t, c, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "bar"}})
+	p2ptest.RequireEmpty(t, a, b, c, d)
+}
+
+func TestRouter_Channel_Wrapper(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Create a test network and open a channel on all nodes.
+	network := p2ptest.MakeNetwork(t, 2)
+	ids := network.NodeIDs()
+	aID, bID := ids[0], ids[1]
+	channels := network.MakeChannels(t, 1, &wrapperMessage{})
+	a, b := channels[aID], channels[bID]
+
+	// Since wrapperMessage implements p2p.Wrapper and handles Message, it
+	// should automatically wrap and unwrap sent messages -- we prepend the
+	// wrapper actions to the message value to signal this.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &p2ptest.Message{Value: "foo"}})
+	p2ptest.RequireReceive(t, b, p2p.Envelope{From: aID, Message: &p2ptest.Message{Value: "unwrap:wrap:foo"}})
+
+	// If we send a different message that can't be wrapped, it should be dropped.
+	p2ptest.RequireSend(t, a, p2p.Envelope{To: bID, Message: &gogotypes.BoolValue{Value: true}})
+	p2ptest.RequireEmpty(t, b)
+
+	// If we send the wrapper message itself, it should also be passed through
+	// since wrapperMessage supports it, and should only be unwrapped at the receiver.
+	p2ptest.RequireSend(t, a, p2p.Envelope{
+		To:      bID,
+		Message: &wrapperMessage{Message: p2ptest.Message{Value: "foo"}},
+	})
+	p2ptest.RequireReceive(t, b, p2p.Envelope{
+		From:    aID,
+		Message: &p2ptest.Message{Value: "unwrap:foo"},
+	})
+
+}
+
+// wrapperMessage prepends the value with "wrap:" and "unwrap:" to test it.
+type wrapperMessage struct {
+	p2ptest.Message
+}
+
+var _ p2p.Wrapper = (*wrapperMessage)(nil)
+
+func (w *wrapperMessage) Wrap(inner proto.Message) error {
+	switch inner := inner.(type) {
+	case *p2ptest.Message:
+		w.Message.Value = fmt.Sprintf("wrap:%v", inner.Value)
+	case *wrapperMessage:
+		*w = *inner
+	default:
+		return fmt.Errorf("invalid message type %T", inner)
+	}
+	return nil
+}
+
+func (w *wrapperMessage) Unwrap() (proto.Message, error) {
+	return &p2ptest.Message{Value: fmt.Sprintf("unwrap:%v", w.Message.Value)}, nil
+}
+
+func TestRouter_Channel_Error(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Create a test network and open a channel on all nodes.
+	network := p2ptest.MakeNetwork(t, 3)
+	ids := network.NodeIDs()
+	aID, bID := ids[0], ids[1]
+	channels := network.MakeChannels(t, 1, &p2ptest.Message{})
+	a := channels[aID]
+
+	// Erroring b should cause it to be disconnected. It will reconnect shortly after.
+	sub := network.Nodes[aID].MakePeerUpdates(t)
+	p2ptest.RequireError(t, a, p2p.PeerError{NodeID: bID, Err: errors.New("boom")})
+	p2ptest.RequireUpdates(t, sub, []p2p.PeerUpdate{
+		{NodeID: bID, Status: p2p.PeerStatusDown},
+		{NodeID: bID, Status: p2p.PeerStatusUp},
+	})
+}
+
+func TestRouter_AcceptPeers(t *testing.T) {
+	testcases := map[string]struct {
+		peerInfo p2p.NodeInfo
+		peerKey  crypto.PubKey
+		ok       bool
+	}{
+		"valid handshake": {peerInfo, peerKey.PubKey(), true},
+		"empty handshake": {p2p.NodeInfo{}, nil, false},
+		"invalid key":     {peerInfo, selfKey.PubKey(), false},
+		"self handshake":  {selfInfo, selfKey.PubKey(), false},
+	}
+	for name, tc := range testcases {
+		tc := tc
+		t.Run(name, func(t *testing.T) {
+			t.Cleanup(leaktest.Check(t))
+
+			// Set up a mock transport that handshakes.
+			closer := tmsync.NewCloser()
+			mockConnection := &mocks.Connection{}
+			mockConnection.On("String").Maybe().Return("mock")
+			mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey).
+				Return(tc.peerInfo, tc.peerKey, nil)
+			mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil)
+			if tc.ok {
+				mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF)
+			}
+
+			mockTransport := &mocks.Transport{}
+			mockTransport.On("String").Maybe().Return("mock")
+			mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+			mockTransport.On("Close").Return(nil)
+			mockTransport.On("Accept").Once().Return(mockConnection, nil)
+			mockTransport.On("Accept").Once().Return(nil, io.EOF)
+
+			// Set up and start the router.
+			peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+			require.NoError(t, err)
+			sub := peerManager.Subscribe()
+			defer sub.Close()
+
+			router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+				[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+			require.NoError(t, err)
+			require.NoError(t, router.Start())
+
+			if tc.ok {
+				p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{
+					NodeID: tc.peerInfo.NodeID,
+					Status: p2p.PeerStatusUp,
+				})
+				sub.Close()
+			} else {
+				select {
+				case <-closer.Done():
+				case <-time.After(100 * time.Millisecond):
+					require.Fail(t, "connection not closed")
+				}
+			}
+
+			require.NoError(t, router.Stop())
+			mockTransport.AssertExpectations(t)
+			mockConnection.AssertExpectations(t)
+		})
+	}
+}
+
+func TestRouter_AcceptPeers_Error(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Set up a mock transport whose only expected Accept call fails with a
+	// non-EOF error, which should prevent the router from calling Accept again.
+	mockTransport := &mocks.Transport{}
+	mockTransport.On("String").Maybe().Return("mock")
+	mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+	mockTransport.On("Accept").Once().Return(nil, errors.New("boom"))
+	mockTransport.On("Close").Return(nil)
+
+	// Set up the router, let it run for a second, and stop it again.
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+		[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+
+	require.NoError(t, router.Start())
+	time.Sleep(time.Second)
+	require.NoError(t, router.Stop())
+
+	mockTransport.AssertExpectations(t)
+}
+
+func TestRouter_AcceptPeers_ErrorEOF(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Set up a mock transport whose only expected Accept call returns io.EOF,
+	// which should prevent the router from calling Accept again.
+	mockTransport := &mocks.Transport{}
+	mockTransport.On("String").Maybe().Return("mock")
+	mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+	mockTransport.On("Accept").Once().Return(nil, io.EOF)
+	mockTransport.On("Close").Return(nil)
+
+	// Set up the router, let it run for a second, and stop it again.
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+		[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+
+	require.NoError(t, router.Start())
+	time.Sleep(time.Second)
+	require.NoError(t, router.Stop())
+
+	mockTransport.AssertExpectations(t)
+}
+
+func TestRouter_AcceptPeers_HeadOfLineBlocking(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Set up a mock transport that returns a connection whose handshake blocks
+	// until closeCh is closed. The router should accept several of these in
+	// parallel, i.e. a single connection can't halt other connections being accepted.
+	acceptCh := make(chan bool, 3)
+	closeCh := make(chan time.Time)
+
+	mockConnection := &mocks.Connection{}
+	mockConnection.On("String").Maybe().Return("mock")
+	mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey).
+		WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF)
+	mockConnection.On("Close").Return(nil)
+
+	mockTransport := &mocks.Transport{}
+	mockTransport.On("String").Maybe().Return("mock")
+	mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+	mockTransport.On("Close").Return(nil)
+	mockTransport.On("Accept").Times(3).Run(func(_ mock.Arguments) {
+		acceptCh <- true
+	}).Return(mockConnection, nil)
+	mockTransport.On("Accept").Once().Return(nil, io.EOF)
+
+	// Set up and start the router.
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+		[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+	require.NoError(t, router.Start())
+
+	require.Eventually(t, func() bool {
+		return len(acceptCh) == 3
+	}, time.Second, 10*time.Millisecond)
+	close(closeCh)
+	time.Sleep(100 * time.Millisecond)
+
+	require.NoError(t, router.Stop())
+	mockTransport.AssertExpectations(t)
+	mockConnection.AssertExpectations(t)
+}
+
+func TestRouter_DialPeers(t *testing.T) {
+	testcases := map[string]struct {
+		dialID   p2p.NodeID
+		peerInfo p2p.NodeInfo
+		peerKey  crypto.PubKey
+		dialErr  error
+		ok       bool
+	}{
+		"valid dial":         {peerInfo.NodeID, peerInfo, peerKey.PubKey(), nil, true},
+		"empty handshake":    {peerInfo.NodeID, p2p.NodeInfo{}, nil, nil, false},
+		"invalid key":        {peerInfo.NodeID, peerInfo, selfKey.PubKey(), nil, false},
+		"unexpected node ID": {peerInfo.NodeID, selfInfo, selfKey.PubKey(), nil, false},
+		"dial error":         {peerInfo.NodeID, peerInfo, peerKey.PubKey(), errors.New("boom"), false},
+	}
+	for name, tc := range testcases {
+		tc := tc
+		t.Run(name, func(t *testing.T) {
+			t.Cleanup(leaktest.Check(t))
+
+			address := p2p.NodeAddress{Protocol: "mock", NodeID: tc.dialID}
+			endpoint := p2p.Endpoint{Protocol: "mock", Path: string(tc.dialID)}
+
+			// Set up the mocks; closer fires when the connection is closed or the dial fails.
+			closer := tmsync.NewCloser()
+			mockConnection := &mocks.Connection{}
+			mockConnection.On("String").Maybe().Return("mock")
+			if tc.dialErr == nil {
+				mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey).
+					Return(tc.peerInfo, tc.peerKey, nil)
+				mockConnection.On("Close").Run(func(_ mock.Arguments) { closer.Close() }).Return(nil)
+			}
+			if tc.ok {
+				mockConnection.On("ReceiveMessage").Return(chID, nil, io.EOF)
+			}
+
+			mockTransport := &mocks.Transport{}
+			mockTransport.On("String").Maybe().Return("mock")
+			mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+			mockTransport.On("Close").Return(nil)
+			mockTransport.On("Accept").Maybe().Return(nil, io.EOF)
+			if tc.dialErr == nil {
+				mockTransport.On("Dial", mock.Anything, endpoint).Once().Return(mockConnection, nil)
+				// This handles the retry when a dialed connection gets closed after ReceiveMessage
+				// returns io.EOF above.
+				mockTransport.On("Dial", mock.Anything, endpoint).Maybe().Return(nil, io.EOF)
+			} else {
+				mockTransport.On("Dial", mock.Anything, endpoint).Once().
+					Run(func(_ mock.Arguments) { closer.Close() }).
+					Return(nil, tc.dialErr)
+			}
+
+			// Set up and start the router, with the peer address registered for dialing.
+			peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+			require.NoError(t, err)
+			require.NoError(t, peerManager.Add(address))
+			sub := peerManager.Subscribe()
+			defer sub.Close()
+
+			router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+				[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+			require.NoError(t, err)
+			require.NoError(t, router.Start())
+
+			if tc.ok {
+				p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{
+					NodeID: tc.peerInfo.NodeID,
+					Status: p2p.PeerStatusUp,
+				})
+				sub.Close()
+			} else {
+				select {
+				case <-closer.Done():
+				case <-time.After(100 * time.Millisecond):
+					require.Fail(t, "connection not closed")
+				}
+			}
+
+			require.NoError(t, router.Stop())
+			mockTransport.AssertExpectations(t)
+			mockConnection.AssertExpectations(t)
+		})
+	}
+}
+
+func TestRouter_DialPeers_Parallel(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	a := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("a", 40))}
+	b := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("b", 40))}
+	c := p2p.NodeAddress{Protocol: "mock", NodeID: p2p.NodeID(strings.Repeat("c", 40))}
+
+	// Set up a mock transport that returns a connection whose handshake blocks
+	// until closeCh is closed. The router should dial all three peers in parallel.
+	dialCh := make(chan bool, 3)
+	closeCh := make(chan time.Time)
+
+	mockConnection := &mocks.Connection{}
+	mockConnection.On("String").Maybe().Return("mock")
+	mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey).
+		WaitUntil(closeCh).Return(p2p.NodeInfo{}, nil, io.EOF)
+	mockConnection.On("Close").Return(nil)
+
+	mockTransport := &mocks.Transport{}
+	mockTransport.On("String").Maybe().Return("mock")
+	mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+	mockTransport.On("Close").Return(nil)
+	mockTransport.On("Accept").Once().Return(nil, io.EOF)
+	for _, address := range []p2p.NodeAddress{a, b, c} {
+		endpoint := p2p.Endpoint{Protocol: address.Protocol, Path: string(address.NodeID)}
+		mockTransport.On("Dial", mock.Anything, endpoint).Run(func(_ mock.Arguments) {
+			dialCh <- true
+		}).Return(mockConnection, nil)
+	}
+
+	// Set up and start the router, with all three peer addresses registered.
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	require.NoError(t, peerManager.Add(a))
+	require.NoError(t, peerManager.Add(b))
+	require.NoError(t, peerManager.Add(c))
+
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+		[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+	require.NoError(t, router.Start())
+
+	require.Eventually(t, func() bool {
+		return len(dialCh) == 3
+	}, time.Second, 10*time.Millisecond)
+	close(closeCh)
+	time.Sleep(100 * time.Millisecond)
+
+	require.NoError(t, router.Stop())
+	mockTransport.AssertExpectations(t)
+	mockConnection.AssertExpectations(t)
+}
+
+func TestRouter_EvictPeers(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
+
+	// Set up a mock connection for a peer that we can evict; closing it unblocks ReceiveMessage.
+	closeCh := make(chan time.Time)
+	closeOnce := sync.Once{}
+
+	mockConnection := &mocks.Connection{}
+	mockConnection.On("String").Maybe().Return("mock")
+	mockConnection.On("Handshake", mock.Anything, selfInfo, selfKey).
+		Return(peerInfo, peerKey.PubKey(), nil)
+	mockConnection.On("ReceiveMessage").WaitUntil(closeCh).Return(chID, nil, io.EOF)
+	mockConnection.On("Close").Run(func(_ mock.Arguments) {
+		closeOnce.Do(func() {
+			close(closeCh)
+		})
+	}).Return(nil)
+
+	mockTransport := &mocks.Transport{}
+	mockTransport.On("String").Maybe().Return("mock")
+	mockTransport.On("Protocols").Return([]p2p.Protocol{"mock"})
+	mockTransport.On("Close").Return(nil)
+	mockTransport.On("Accept").Once().Return(mockConnection, nil)
+	mockTransport.On("Accept").Once().Return(nil, io.EOF)
+
+	// Set up and start the router.
+	peerManager, err := p2p.NewPeerManager(selfID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	require.NoError(t, err)
+	sub := peerManager.Subscribe()
+	defer sub.Close()
+
+	router, err := p2p.NewRouter(log.TestingLogger(), selfInfo, selfKey, peerManager,
+		[]p2p.Transport{mockTransport}, p2p.RouterOptions{})
+	require.NoError(t, err)
+	require.NoError(t, router.Start())
+
+	// Wait for the mock peer to connect, then evict it by reporting an error.
+	p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{
+		NodeID: peerInfo.NodeID,
+		Status: p2p.PeerStatusUp,
+	})
+
+	require.NoError(t, peerManager.Errored(peerInfo.NodeID, errors.New("boom")))
+
+	p2ptest.RequireUpdate(t, sub, p2p.PeerUpdate{
+		NodeID: peerInfo.NodeID,
 		Status: p2p.PeerStatusDown,
-	}, peerUpdate)
+	})
+	sub.Close()

-	// The peer manager will automatically reconnect the peer, so we wait
-	// for that to happen.
-	peerUpdate = <-peerUpdates.Updates()
-	require.Equal(t, p2p.PeerUpdate{
-		PeerID: peers[0].NodeID,
-		Status: p2p.PeerStatusUp,
-	}, peerUpdate)
+	require.NoError(t, router.Stop())
+	mockTransport.AssertExpectations(t)
+	mockConnection.AssertExpectations(t)
 }
@@ -29,7 +29,7 @@ type (
 		BaseReactor

 		Name        string
-		PeerUpdates *PeerUpdatesCh
+		PeerUpdates *PeerUpdates
 		Channels    map[ChannelID]*ChannelShim
 	}

@@ -39,6 +39,9 @@ type (
 	ChannelShim struct {
 		Descriptor *ChannelDescriptor
 		Channel    *Channel
+		inCh       chan<- Envelope
+		outCh      <-chan Envelope
+		errCh      <-chan PeerError
 	}

 	// ChannelDescriptorShim defines a shim wrapper around a legacy p2p channel
@@ -56,7 +59,7 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C

 	for _, cds := range descriptors {
 		chShim := NewChannelShim(cds, 0)
-		channels[chShim.Channel.id] = chShim
+		channels[chShim.Channel.ID] = chShim
 	}

 	rs := &ReactorShim{
@@ -72,15 +75,21 @@ func NewReactorShim(logger log.Logger, name string, descriptors map[ChannelID]*C
 }

 func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim {
+	inCh := make(chan Envelope, buf)
+	outCh := make(chan Envelope, buf)
+	errCh := make(chan PeerError, buf)
 	return &ChannelShim{
 		Descriptor: cds.Descriptor,
 		Channel: NewChannel(
 			ChannelID(cds.Descriptor.ID),
 			cds.MsgType,
-			make(chan Envelope, buf),
-			make(chan Envelope, buf),
-			make(chan PeerError, buf),
+			inCh,
+			outCh,
+			errCh,
 		),
+		inCh:  inCh,
+		outCh: outCh,
+		errCh: errCh,
 	}
 }

@@ -91,7 +100,7 @@ func NewChannelShim(cds *ChannelDescriptorShim, buf uint) *ChannelShim {
 func (rs *ReactorShim) proxyPeerEnvelopes() {
 	for _, cs := range rs.Channels {
 		go func(cs *ChannelShim) {
-			for e := range cs.Channel.outCh {
+			for e := range cs.outCh {
 				msg := proto.Clone(cs.Channel.messageType)
 				msg.Reset()

@@ -161,11 +170,11 @@ func (rs *ReactorShim) proxyPeerEnvelopes() {
 func (rs *ReactorShim) handlePeerErrors() {
 	for _, cs := range rs.Channels {
 		go func(cs *ChannelShim) {
-			for pErr := range cs.Channel.errCh {
-				if pErr.PeerID != "" {
-					peer := rs.Switch.peers.Get(pErr.PeerID)
+			for pErr := range cs.errCh {
+				if pErr.NodeID != "" {
+					peer := rs.Switch.peers.Get(pErr.NodeID)
 					if peer == nil {
-						rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.PeerID)
+						rs.Logger.Error("failed to handle peer error; failed to find peer", "peer", pErr.NodeID)
 						continue
 					}

@@ -225,7 +234,7 @@ func (rs *ReactorShim) GetChannels() []*ChannelDescriptor {
 // handle adding a peer.
 func (rs *ReactorShim) AddPeer(peer Peer) {
 	select {
-	case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusUp}:
+	case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusUp}:
 		rs.Logger.Debug("sent peer update", "reactor", rs.Name, "peer", peer.ID(), "status", PeerStatusUp)

 	case <-rs.PeerUpdates.Done():
@@ -244,7 +253,7 @@ func (rs *ReactorShim) AddPeer(peer Peer) {
 // handle removing a peer.
 func (rs *ReactorShim) RemovePeer(peer Peer, reason interface{}) {
 	select {
-	case rs.PeerUpdates.updatesCh <- PeerUpdate{PeerID: peer.ID(), Status: PeerStatusDown}:
+	case rs.PeerUpdates.updatesCh <- PeerUpdate{NodeID: peer.ID(), Status: PeerStatusDown}:
 		rs.Logger.Debug(
 			"sent peer update",
 			"reactor", rs.Name,
@@ -311,7 +320,7 @@ func (rs *ReactorShim) Receive(chID byte, src Peer, msgBytes []byte) {
 	}

 	select {
-	case channelShim.Channel.inCh <- Envelope{From: src.ID(), Message: msg}:
+	case channelShim.inCh <- Envelope{From: src.ID(), Message: msg}:
 		rs.Logger.Debug("proxied envelope", "reactor", rs.Name, "ch_id", cID, "peer", src.ID())

 	case <-channelShim.Channel.Done():
@@ -92,7 +92,7 @@ func TestReactorShim_GetChannel(t *testing.T) {

 	p2pCh := rts.shim.GetChannel(p2p.ChannelID(channelID1))
 	require.NotNil(t, p2pCh)
-	require.Equal(t, p2pCh.ID(), p2p.ChannelID(channelID1))
+	require.Equal(t, p2pCh.ID, p2p.ChannelID(channelID1))

 	p2pCh = rts.shim.GetChannel(p2p.ChannelID(byte(0x03)))
 	require.Nil(t, p2pCh)
@@ -123,7 +123,7 @@ func TestReactorShim_AddPeer(t *testing.T) {
 	rts.shim.AddPeer(peerA)
 	wg.Wait()

-	require.Equal(t, peerIDA, peerUpdate.PeerID)
+	require.Equal(t, peerIDA, peerUpdate.NodeID)
 	require.Equal(t, p2p.PeerStatusUp, peerUpdate.Status)
 }

@@ -143,7 +143,7 @@ func TestReactorShim_RemovePeer(t *testing.T) {
 	rts.shim.RemovePeer(peerA, "test reason")
 	wg.Wait()

-	require.Equal(t, peerIDA, peerUpdate.PeerID)
+	require.Equal(t, peerIDA, peerUpdate.NodeID)
 	require.Equal(t, p2p.PeerStatusDown, peerUpdate.Status)
 }

@@ -178,11 +178,11 @@ func TestReactorShim_Receive(t *testing.T) {
 	// Simulate receiving the envelope in some real reactor and replying back with
 	// the same envelope and then closing the Channel.
 	go func() {
-		e := <-p2pCh.Channel.In()
+		e := <-p2pCh.Channel.In
 		require.Equal(t, peerIDA, e.From)
 		require.NotNil(t, e.Message)

-		p2pCh.Channel.Out() <- p2p.Envelope{To: e.From, Message: e.Message}
+		p2pCh.Channel.Out <- p2p.Envelope{To: e.From, Message: e.Message}
 		p2pCh.Channel.Close()
 		wg.Done()
 	}()
@@ -200,7 +200,7 @@ func TestReactorShim_Receive(t *testing.T) {
 	// Since p2pCh was closed in the simulated reactor above, calling Receive
 	// should not block.
 	rts.shim.Receive(channelID1, peerA, bz)
-	require.Empty(t, p2pCh.Channel.In())
+	require.Empty(t, p2pCh.Channel.In)

 	peerA.AssertExpectations(t)
 }
@@ -10,6 +10,8 @@ import (
 	"github.com/tendermint/tendermint/p2p/conn"
 )

+//go:generate mockery --case underscore --name Transport|Connection
+
 const (
 	// defaultProtocol is the default protocol used for NodeAddress when
 	// a protocol isn't explicitly given as a URL scheme.
@@ -19,11 +19,8 @@ import (
 // transportFactory is used to set up transports for tests.
 type transportFactory func(t *testing.T) p2p.Transport

-var (
-	ctx            = context.Background()          // convenience context
-	chID           = p2p.ChannelID(1)              // channel ID for use in tests
-	testTransports = map[string]transportFactory{} // registry for withTransports
-)
+// testTransports is a registry of transport factories for withTransports().
+var testTransports = map[string]transportFactory{}

 // withTransports is a test helper that runs a test against all transports
 // registered in testTransports.
@@ -16,11 +16,6 @@ import (
 	"github.com/tendermint/tendermint/version"
 )

-// database keys
-var (
-	stateKey = []byte("stateKey")
-)
-
 //-----------------------------------------------------------------------------

 type Version struct {
@@ -32,6 +32,7 @@ const (
 	prefixValidators      = int64(5)
 	prefixConsensusParams = int64(6)
 	prefixABCIResponses   = int64(7)
+	prefixState           = int64(8)
 )

 func encodeKey(prefix int64, height uint64) []byte {
@@ -54,6 +55,17 @@ func abciResponsesKey(height uint64) []byte {
 	return encodeKey(prefixABCIResponses, height)
 }

+// stateKey is the orderedcode encoding of prefixState. It is set once in init() and must never change afterwards.
+var stateKey []byte
+
+func init() {
+	var err error
+	stateKey, err = orderedcode.Append(nil, prefixState)
+	if err != nil {
+		panic(err)
+	}
+}
+
 //----------------------

 //go:generate mockery --case underscore --name Store
@@ -239,11 +251,16 @@ func (store dbStore) PruneStates(retainHeight uint64) error {
 		return fmt.Errorf("height %v must be greater than 0", retainHeight)
 	}

-	if err := store.pruneValidatorSets(retainHeight); err != nil {
+	// NOTE: We need to prune consensus params first because the validator
+	// sets always have one extra height. If validator sets were pruned first
+	// we could get a situation where we prune up to the last validator set
+	// yet don't have the respective consensus params at that height and thus
+	// return an error.
+	if err := store.pruneConsensusParams(retainHeight); err != nil {
 		return err
 	}

-	if err := store.pruneConsensusParams(retainHeight); err != nil {
+	if err := store.pruneValidatorSets(retainHeight); err != nil {
 		return err
 	}

@@ -257,37 +274,48 @@ func (store dbStore) PruneStates(retainHeight uint64) error {
 // pruneValidatorSets calls a reverse iterator from base height to retain height (exclusive), deleting
 // all validator sets in between. Due to the fact that most validator sets stored reference an earlier
 // validator set, it is likely that there will remain one validator set left after pruning.
-func (store dbStore) pruneValidatorSets(height uint64) error {
-	valInfo, err := loadValidatorsInfo(store.db, height)
+func (store dbStore) pruneValidatorSets(retainHeight uint64) error {
+	valInfo, err := loadValidatorsInfo(store.db, retainHeight)
 	if err != nil {
-		return fmt.Errorf("validators at height %v not found: %w", height, err)
+		return fmt.Errorf("validators at height %v not found: %w", retainHeight, err)
 	}

 	// We will prune up to the validator set at the given "height". As we don't save validator sets every
 	// height but only when they change or at a check point, it is likely that the validator set at the height
 	// we prune to is empty and thus dependent on the validator set saved at a previous height. We must find
 	// that validator set and make sure it is not pruned.
-	lastRecordedValSetHeight := lastStoredHeightFor(height, valInfo.LastHeightChanged)
+	lastRecordedValSetHeight := lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged)
 	lastRecordedValSet, err := loadValidatorsInfo(store.db, lastRecordedValSetHeight)
 	if err != nil || lastRecordedValSet.ValidatorSet == nil {
 		return fmt.Errorf("couldn't find validators at height %d (height %d was originally requested): %w",
-			lastStoredHeightFor(height, valInfo.LastHeightChanged),
-			height,
+			lastStoredHeightFor(retainHeight, valInfo.LastHeightChanged),
+			retainHeight,
 			err,
 		)
 	}

-	// batch delete all the validators sets up to height
-	return store.batchDelete(
+	// if the last saved validator set is below the retain height, prune the heights above it and
+	// below the retain height. This way we can skip over the dependent validator set.
+	if lastRecordedValSetHeight < retainHeight {
+		err := store.pruneRange(
+			validatorsKey(lastRecordedValSetHeight+1),
+			validatorsKey(retainHeight),
+		)
+		if err != nil {
+			return err
+		}
+	}
+
+	// prune all the validator sets below the last saved validator set
+	return store.pruneRange(
 		validatorsKey(1),
-		validatorsKey(height),
 		validatorsKey(lastRecordedValSetHeight),
 	)
 }

 // pruneConsensusParams calls a reverse iterator from base height to retain height batch deleting
 // all consensus params in between. If the consensus params at the new base height is dependent
-// on a prior height then this will keep that lower height to.
+// on a prior height then this will keep that lower height too.
 func (store dbStore) pruneConsensusParams(retainHeight uint64) error {
 	paramsInfo, err := store.loadConsensusParamsInfo(retainHeight)
 	if err != nil {
@@ -298,21 +326,31 @@ func (store dbStore) pruneConsensusParams(retainHeight uint64) error {
 	// we must not prune (or save) the last consensus params that the consensus params info at height
 	// is dependent on.
 	if paramsInfo.ConsensusParams.Equal(&tmproto.ConsensusParams{}) {
+		// sanity check that the consensus params at the last height it was changed is there
 		lastRecordedConsensusParams, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged)
 		if err != nil || lastRecordedConsensusParams.ConsensusParams.Equal(&tmproto.ConsensusParams{}) {
 			return fmt.Errorf(
-				"couldn't find consensus params at height %d as last changed from height %d: %w",
+				"couldn't find consensus params at height %d (height %d was originally requested): %w",
 				paramsInfo.LastHeightChanged,
 				retainHeight,
 				err,
 			)
 		}
+
+		// prune the params above the height at which they last changed and below the retain height.
+		err = store.pruneRange(
+			consensusParamsKey(paramsInfo.LastHeightChanged+1),
+			consensusParamsKey(retainHeight),
+		)
+		if err != nil {
+			return err
+		}
 	}

-	// batch delete all the consensus params up to the retain height
-	return store.batchDelete(
+	// prune all the consensus params up to the height at which the params last changed (which is
+	// the retain height itself if the params changed at the retain height).
+	return store.pruneRange(
 		consensusParamsKey(1),
-		consensusParamsKey(retainHeight),
 		consensusParamsKey(paramsInfo.LastHeightChanged),
 	)
 }
@@ -320,72 +358,69 @@ func (store dbStore) pruneConsensusParams(retainHeight uint64) error {
 // pruneABCIResponses calls a reverse iterator from base height to retain height batch deleting
 // all abci responses in between
 func (store dbStore) pruneABCIResponses(height uint64) error {
-	return store.batchDelete(abciResponsesKey(1), abciResponsesKey(height), nil)
+	return store.pruneRange(abciResponsesKey(1), abciResponsesKey(height))
 }

-// batchDelete is a generic function for deleting a range of keys in reverse order. It will
-// skip keys that have been
-func (store dbStore) batchDelete(start []byte, end []byte, exception []byte) error {
-	iter, err := store.db.ReverseIterator(start, end)
-	if err != nil {
-		return fmt.Errorf("iterator error: %w", err)
-	}
-	defer iter.Close()
-
+// pruneRange deletes the keys in the range from start to end in reverse order. It fills batches
+// of at most 1000 keys, writing each before filling the next, and syncs the last one to disk;
+// this avoids doing any writes whilst iterating. The deferred close is a closure because batch
+// is reassigned in the loop and the batch that is current on return is the one to release.
+func (store dbStore) pruneRange(start []byte, end []byte) error {
+	var err error
 	batch := store.db.NewBatch()
-	defer batch.Close()
+	defer func() { _ = batch.Close() }()

-	pruned := 0
-	for iter.Valid() {
-		key := iter.Key()
-		if bytes.Equal(key, exception) {
-			iter.Next()
-			continue
-		}
-
-		if err := batch.Delete(key); err != nil {
-			return fmt.Errorf("pruning error at key %X: %w", key, err)
-		}
-
-		pruned++
-		// avoid batches growing too large by flushing to disk regularly
-		if pruned%1000 == 0 {
-			if err := iter.Error(); err != nil {
-				return err
-			}
-			if err := iter.Close(); err != nil {
-				return err
-			}
-
-			if err := batch.Write(); err != nil {
-				return fmt.Errorf("pruning error at key %X: %w", key, err)
-			}
-			if err := batch.Close(); err != nil {
-				return err
-			}
-
-			iter, err = store.db.ReverseIterator(start, end)
-			if err != nil {
-				return fmt.Errorf("iterator error: %w", err)
-			}
-			defer iter.Close()
-
-			batch = store.db.NewBatch()
-			defer batch.Close()
-		} else {
-			iter.Next()
-		}
-	}
-
-	if err := iter.Error(); err != nil {
-		return fmt.Errorf("iterator error: %w", err)
-	}
-
-	if err := batch.WriteSync(); err != nil {
+	end, err = store.reverseBatchDelete(batch, start, end)
+	if err != nil {
 		return err
 	}

-	return nil
+	// keep writing full batches until reverseBatchDelete reports that it reached start; the
+	// batch left over at that point is the last one and is written with a sync below
+	for !bytes.Equal(start, end) {
+		if err := batch.Write(); err != nil {
+			return err
+		}
+
+		if err := batch.Close(); err != nil {
+			return err
+		}
+
+		batch = store.db.NewBatch()
+
+		// fill a new batch of keys for deletion over the remaining range
+		end, err = store.reverseBatchDelete(batch, start, end)
+		if err != nil {
+			return err
+		}
+	}
+
+	return batch.WriteSync()
+}
+
+// reverseBatchDelete runs a reverse iterator (from end to start), queueing deletions in batch until
+// either (a) the iterator is exhausted, in which case start is returned, or (b) 1000 keys have been
+// queued, in which case the last queued key is returned. The cap keeps batches from growing too large.
+func (store dbStore) reverseBatchDelete(batch dbm.Batch, start, end []byte) ([]byte, error) {
+	iter, err := store.db.ReverseIterator(start, end)
+	if err != nil {
+		return end, fmt.Errorf("iterator error: %w", err)
+	}
+	defer iter.Close()
+
+	size := 0
+	for ; iter.Valid(); iter.Next() {
+		if err := batch.Delete(iter.Key()); err != nil {
+			return end, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
+		}
+
+		// cap the batch at 1000 keys; pruneRange uses the returned key as its next end bound
+		size++
+		if size == 1000 {
+			return iter.Key(), iter.Error()
+		}
+	}
+	return start, iter.Error()
 }

 //------------------------------------------------------------------------
@@ -584,7 +619,7 @@ func (store dbStore) LoadConsensusParams(height uint64) (types.ConsensusParams,
 		paramsInfo2, err := store.loadConsensusParamsInfo(paramsInfo.LastHeightChanged)
 		if err != nil {
 			return empty, fmt.Errorf(
-				"couldn't find consensus params at height %d as last changed from height %d: %w",
+				"couldn't find consensus params at height %d (height %d was originally requested): %w",
 				paramsInfo.LastHeightChanged,
 				height,
 				err,
@@ -159,25 +159,27 @@ func TestStoreLoadConsensusParams(t *testing.T) {

 func TestPruneStates(t *testing.T) {
 	testcases := map[string]struct {
-		makeHeights  uint64
-		pruneHeight  uint64
-		expectErr    bool
-		expectVals   []int64
-		expectParams []int64
-		expectABCI   []int64
+		startHeight           uint64
+		endHeight             uint64
+		pruneHeight           uint64
+		expectErr             bool
+		remainingValSetHeight uint64
+		remainingParamsHeight uint64
 	}{
-		"error when prune height is 0":           {100, 0, true, nil, nil, nil},
-		"error when prune height does not exist": {100, 101, true, nil, nil, nil},
-		"prune all":                              {100, 100, false, []int64{93, 100}, []int64{95, 100}, []int64{100}},
-		"prune some": {10, 8, false, []int64{3, 8, 9, 10},
-			[]int64{5, 8, 9, 10}, []int64{8, 9, 10}},
-		"prune across checkpoint": {100002, 100002, false, []int64{100000, 100002},
-			[]int64{99995, 100002}, []int64{100002}},
+		"error when prune height is 0":           {1, 100, 0, true, 0, 0},
+		"error when prune height does not exist": {1, 100, 101, true, 0, 0},
+		"prune all":                              {1, 100, 100, false, 93, 95},
+		"prune from non 1 height":                {10, 50, 40, false, 33, 35},
+		"prune some":                             {1, 10, 8, false, 3, 5},
+		// we test this because pruning writes a batch to disk every 1000 keys
+		"prune more than 1000 state": {1, 1010, 1010, false, 1003, 1005},
+		"prune across checkpoint":    {99900, 100002, 100002, false, 100000, 99995},
 	}
 	for name, tc := range testcases {
 		tc := tc
 		t.Run(name, func(t *testing.T) {
 			db := dbm.NewMemDB()
+
 			stateStore := sm.NewStore(db)
 			pk := ed25519.GenPrivKey().PubKey()

@@ -191,7 +193,7 @@ func TestPruneStates(t *testing.T) {
 			valsChanged := uint64(0)
 			paramsChanged := uint64(0)

-			for h := uint64(1); h <= tc.makeHeights; h++ {
+			for h := tc.startHeight; h <= tc.endHeight; h++ {
 				if valsChanged == 0 || h%10 == 2 {
 					valsChanged = h + 1 // Have to add 1, since NextValidators is what's stored
 				}
@@ -236,36 +238,44 @@ func TestPruneStates(t *testing.T) {
 			}
 			require.NoError(t, err)

-			expectVals := sliceToMap(tc.expectVals)
-			expectParams := sliceToMap(tc.expectParams)
-			expectABCI := sliceToMap(tc.expectABCI)
-
-			for h := uint64(1); h <= tc.makeHeights; h++ {
+			for h := tc.pruneHeight; h <= tc.endHeight; h++ {
 				vals, err := stateStore.LoadValidators(h)
-				if expectVals[int64(h)] {
-					require.NoError(t, err, "validators height %v", h)
-					require.NotNil(t, vals)
+				require.NoError(t, err, h)
+				require.NotNil(t, vals, h)
+
+				params, err := stateStore.LoadConsensusParams(h)
+				require.NoError(t, err, h)
+				require.NotNil(t, params, h)
+
+				abci, err := stateStore.LoadABCIResponses(h)
+				require.NoError(t, err, h)
+				require.NotNil(t, abci, h)
+			}
+
+			emptyParams := types.ConsensusParams{}
+
+			for h := tc.startHeight; h < tc.pruneHeight; h++ {
+				vals, err := stateStore.LoadValidators(h)
+				if h == tc.remainingValSetHeight {
+					require.NoError(t, err, h)
+					require.NotNil(t, vals, h)
 				} else {
-					require.Error(t, err, "validators height %v", h)
-					require.Equal(t, sm.ErrNoValSetForHeight{Height: h}, err)
+					require.Error(t, err, h)
+					require.Nil(t, vals, h)
 				}

 				params, err := stateStore.LoadConsensusParams(h)
-				if expectParams[int64(h)] {
-					require.NoError(t, err, "params height %v", h)
-					require.False(t, params.Equals(&types.ConsensusParams{}), "params should not be empty")
+				if h == tc.remainingParamsHeight {
+					require.NoError(t, err, h)
+					require.NotEqual(t, emptyParams, params, h)
 				} else {
-					require.Error(t, err, "params height %v", h)
+					require.Error(t, err, h)
+					require.Equal(t, emptyParams, params, h)
 				}

 				abci, err := stateStore.LoadABCIResponses(h)
-				if expectABCI[int64(h)] {
-					require.NoError(t, err, "abci height %v", h)
-					require.NotNil(t, abci)
-				} else {
-					require.Error(t, err, "abci height %v", h)
-					require.Equal(t, sm.ErrNoABCIResponsesForHeight{Height: h}, err)
-				}
+				require.Error(t, err, h)
+				require.Nil(t, abci, h)
 			}
 		})
 	}
@@ -292,11 +302,3 @@ func TestABCIResponsesResultsHash(t *testing.T) {
 	require.NoError(t, err)
 	assert.NoError(t, proof.Verify(root, bz))
 }
-
-func sliceToMap(s []int64) map[int64]bool {
-	m := make(map[int64]bool, len(s))
-	for _, i := range s {
-		m[i] = true
-	}
-	return m
-}
@@ -78,7 +78,7 @@ type Reactor struct {
 	tempDir     string
 	snapshotCh  *p2p.Channel
 	chunkCh     *p2p.Channel
-	peerUpdates *p2p.PeerUpdatesCh
+	peerUpdates *p2p.PeerUpdates
 	closeCh     chan struct{}

 	// This will only be set when a state sync is in progress. It is used to feed
@@ -96,7 +96,7 @@ func NewReactor(
 	conn proxy.AppConnSnapshot,
 	connQuery proxy.AppConnQuery,
 	snapshotCh, chunkCh *p2p.Channel,
-	peerUpdates *p2p.PeerUpdatesCh,
+	peerUpdates *p2p.PeerUpdates,
 	tempDir string,
 ) *Reactor {
 	r := &Reactor{
@@ -170,7 +170,7 @@ func (r *Reactor) handleSnapshotMessage(envelope p2p.Envelope) error {
 				"height", snapshot.Height,
 				"format", snapshot.Format,
 			)
-			r.snapshotCh.Out() <- p2p.Envelope{
+			r.snapshotCh.Out <- p2p.Envelope{
 				To: envelope.From,
 				Message: &ssproto.SnapshotsResponse{
 					Height:   snapshot.Height,
@@ -254,7 +254,7 @@ func (r *Reactor) handleChunkMessage(envelope p2p.Envelope) error {
 			"chunk", msg.Index,
 			"peer", envelope.From,
 		)
-		r.chunkCh.Out() <- p2p.Envelope{
+		r.chunkCh.Out <- p2p.Envelope{
 			To: envelope.From,
 			Message: &ssproto.ChunkResponse{
 				Height:  msg.Height,
@@ -343,13 +343,12 @@ func (r *Reactor) processSnapshotCh() {

 	for {
 		select {
-		case envelope := <-r.snapshotCh.In():
-			if err := r.handleMessage(r.snapshotCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID(), "envelope", envelope, "err", err)
-				r.snapshotCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.snapshotCh.In:
+			if err := r.handleMessage(r.snapshotCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.snapshotCh.ID, "envelope", envelope, "err", err)
+				r.snapshotCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -370,13 +369,12 @@ func (r *Reactor) processChunkCh() {

 	for {
 		select {
-		case envelope := <-r.chunkCh.In():
-			if err := r.handleMessage(r.chunkCh.ID(), envelope); err != nil {
-				r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID(), "envelope", envelope, "err", err)
-				r.chunkCh.Error() <- p2p.PeerError{
-					PeerID:   envelope.From,
-					Err:      err,
-					Severity: p2p.PeerErrorSeverityLow,
+		case envelope := <-r.chunkCh.In:
+			if err := r.handleMessage(r.chunkCh.ID, envelope); err != nil {
+				r.Logger.Error("failed to process message", "ch_id", r.chunkCh.ID, "envelope", envelope, "err", err)
+				r.chunkCh.Error <- p2p.PeerError{
+					NodeID: envelope.From,
+					Err:    err,
 				}
 			}

@@ -390,18 +388,18 @@ func (r *Reactor) processChunkCh() {
 // processPeerUpdate logs a PeerUpdate and, if r.syncer is non-nil, adds the peer
 // to the syncer when it comes up or removes it when it goes down.
 func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
-	r.Logger.Debug("received peer update", "peer", peerUpdate.PeerID, "status", peerUpdate.Status)
+	r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

 	r.mtx.RLock()
 	defer r.mtx.RUnlock()

 	if r.syncer != nil {
 		switch peerUpdate.Status {
-		case p2p.PeerStatusNew, p2p.PeerStatusUp:
-			r.syncer.AddPeer(peerUpdate.PeerID)
+		case p2p.PeerStatusUp:
+			r.syncer.AddPeer(peerUpdate.NodeID)

-		case p2p.PeerStatusDown, p2p.PeerStatusRemoved, p2p.PeerStatusBanned:
-			r.syncer.RemovePeer(peerUpdate.PeerID)
+		case p2p.PeerStatusDown:
+			r.syncer.RemovePeer(peerUpdate.NodeID)
 		}
 	}
 }
@@ -472,12 +470,12 @@ func (r *Reactor) Sync(stateProvider StateProvider, discoveryTime time.Duration)
 		return sm.State{}, nil, errors.New("a state sync is already in progress")
 	}

-	r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out(), r.chunkCh.Out(), r.tempDir)
+	r.syncer = newSyncer(r.Logger, r.conn, r.connQuery, stateProvider, r.snapshotCh.Out, r.chunkCh.Out, r.tempDir)
 	r.mtx.Unlock()

 	// request snapshots from all currently connected peers
 	r.Logger.Debug("requesting snapshots from known peers")
-	r.snapshotCh.Out() <- p2p.Envelope{
+	r.snapshotCh.Out <- p2p.Envelope{
 		Broadcast: true,
 		Message:   &ssproto.SnapshotsRequest{},
 	}
@@ -33,7 +33,7 @@ type reactorTestSuite struct {
 	chunkOutCh     chan p2p.Envelope
 	chunkPeerErrCh chan p2p.PeerError

-	peerUpdates *p2p.PeerUpdatesCh
+	peerUpdates *p2p.PeerUpdates
 }

 func setup(
@@ -127,7 +127,7 @@ func TestReactor_ChunkRequest_InvalidRequest(t *testing.T) {
 	require.Error(t, response.Err)
 	require.Empty(t, rts.chunkOutCh)
 	require.Contains(t, response.Err.Error(), "received unknown message")
-	require.Equal(t, p2p.NodeID("aa"), response.PeerID)
+	require.Equal(t, p2p.NodeID("aa"), response.NodeID)
 }

 func TestReactor_ChunkRequest(t *testing.T) {
@@ -198,7 +198,7 @@ func TestReactor_SnapshotsRequest_InvalidRequest(t *testing.T) {
 	require.Error(t, response.Err)
 	require.Empty(t, rts.snapshotOutCh)
 	require.Contains(t, response.Err.Error(), "received unknown message")
-	require.Equal(t, p2p.NodeID("aa"), response.PeerID)
+	require.Equal(t, p2p.NodeID("aa"), response.NodeID)
 }

 func TestReactor_SnapshotsRequest(t *testing.T) {
@@ -1,6 +1,7 @@
 package store

 import (
+	"bytes"
 	"fmt"
 	"strconv"

@@ -315,99 +316,111 @@ func (bs *BlockStore) PruneBlocks(height uint64) (uint64, error) {

 	// remove block meta first as this is used to indicate whether the block exists.
 	// For this reason, we also use only block meta as a measure of the amount of blocks pruned
-	pruned, err := bs.batchDelete(blockMetaKey(0), blockMetaKey(height), removeBlockHash)
+	pruned, err := bs.pruneRange(blockMetaKey(0), blockMetaKey(height), removeBlockHash)
 	if err != nil {
 		return pruned, err
 	}

-	if _, err := bs.batchDelete(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil {
+	if _, err := bs.pruneRange(blockPartKey(0, 0), blockPartKey(height, 0), nil); err != nil {
 		return pruned, err
 	}

-	if _, err := bs.batchDelete(blockCommitKey(0), blockCommitKey(height), nil); err != nil {
+	if _, err := bs.pruneRange(blockCommitKey(0), blockCommitKey(height), nil); err != nil {
 		return pruned, err
 	}

-	if _, err := bs.batchDelete(seenCommitKey(0), seenCommitKey(height), nil); err != nil {
+	if _, err := bs.pruneRange(seenCommitKey(0), seenCommitKey(height), nil); err != nil {
 		return pruned, err
 	}

 	return pruned, nil
 }

-// batchDelete is a generic function for deleting a range of values based on the lowest
+// pruneRange is a generic function for deleting a range of values based on the lowest
 // height up to but excluding retainHeight. For each key/value pair, an optional hook can be
-// executed before the deletion itself is made
-func (bs *BlockStore) batchDelete(
+// executed before the deletion itself is made. pruneRange will use batch delete to delete
+// keys in batches of at most 1000 keys.
+func (bs *BlockStore) pruneRange(
 	start []byte,
 	end []byte,
 	preDeletionHook func(key, value []byte, batch dbm.Batch) error,
 ) (uint64, error) {
-	iter, err := bs.db.Iterator(start, end)
-	if err != nil {
-		panic(err)
-	}
-	defer iter.Close()
+	var (
+		err         error
+		pruned      uint64
+		totalPruned uint64 = 0
+	)

 	batch := bs.db.NewBatch()
 	defer batch.Close()

-	pruned := uint64(0)
-	flushed := pruned
-	for iter.Valid() {
+	pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook)
+	if err != nil {
+		return totalPruned, err
+	}
+
+	// loop until we have finished iterating over all the keys by writing, opening a new batch
+	// and incrementing through the next range of keys.
+	for !bytes.Equal(start, end) {
+		if err := batch.Write(); err != nil {
+			return totalPruned, err
+		}
+
+		totalPruned += pruned
+
+		if err := batch.Close(); err != nil {
+			return totalPruned, err
+		}
+
+		batch = bs.db.NewBatch()
+
+		pruned, start, err = bs.batchDelete(batch, start, end, preDeletionHook)
+		if err != nil {
+			return totalPruned, err
+		}
+	}
+
+	// once we looped over all keys we do a final flush to disk
+	if err := batch.WriteSync(); err != nil {
+		return totalPruned, err
+	}
+	totalPruned += pruned
+	return totalPruned, nil
+}
+
+// batchDelete runs an iterator over a set of keys, first performing a pre-deletion hook before adding each key to the batch.
+// The function ends when either 1000 keys have been added to the batch or the iterator has reached the end.
+func (bs *BlockStore) batchDelete(
+	batch dbm.Batch,
+	start, end []byte,
+	preDeletionHook func(key, value []byte, batch dbm.Batch) error,
+) (uint64, []byte, error) {
+	var pruned uint64 = 0
+	iter, err := bs.db.Iterator(start, end)
+	if err != nil {
+		return pruned, start, err
+	}
+	defer iter.Close()
+
+	for ; iter.Valid(); iter.Next() {
 		key := iter.Key()
 		if preDeletionHook != nil {
 			if err := preDeletionHook(key, iter.Value(), batch); err != nil {
-				return flushed, err
+				return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
 			}
 		}

 		if err := batch.Delete(key); err != nil {
-			return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
+			return 0, start, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
 		}

 		pruned++
-		// avoid batches growing too large by flushing to database regularly
-		if pruned%1000 == 0 {
-			if err := iter.Error(); err != nil {
-				return flushed, err
-			}
-			if err := iter.Close(); err != nil {
-				return flushed, err
-			}
-
-			err := batch.Write()
-			if err != nil {
-				return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
-			}
-			if err := batch.Close(); err != nil {
-				return flushed, err
-			}
-			flushed = pruned
-
-			iter, err = bs.db.Iterator(start, end)
-			if err != nil {
-				panic(err)
-			}
-			defer iter.Close()
-
-			batch = bs.db.NewBatch()
-			defer batch.Close()
-		} else {
-			iter.Next()
+		if pruned == 1000 {
+			return pruned, iter.Key(), iter.Error()
 		}
 	}
-	flushed = pruned
-	if err := iter.Error(); err != nil {
-		return flushed, err
-	}

-	err = batch.WriteSync()
-	if err != nil {
-		return flushed, fmt.Errorf("pruning error at key %X: %w", iter.Key(), err)
-	}
-
-	return flushed, nil
+	return pruned, end, iter.Error()
 }

 // SaveBlock persists the given block, blockParts, and seenCommit to the underlying db.
@@ -77,8 +77,9 @@ func TestValidator_Propose(t *testing.T) {

 		require.False(t, proposeCount == 0 && expectCount > 0,
 			"node did not propose any blocks (expected %v)", expectCount)
-		require.Less(t, expectCount-proposeCount, 5,
-			"validator missed proposing too many blocks (proposed %v out of %v)", proposeCount, expectCount)
+		if expectCount > 5 {
+			require.GreaterOrEqual(t, proposeCount, 3, "validator didn't propose even 3 blocks")
+		}
 	})
 }

@@ -116,8 +117,9 @@ func TestValidator_Sign(t *testing.T) {

 		require.False(t, signCount == 0 && expectCount > 0,
 			"validator did not sign any blocks (expected %v)", expectCount)
-		require.Less(t, float64(expectCount-signCount)/float64(expectCount), 0.33,
-			"validator missed signing too many blocks (signed %v out of %v)", signCount, expectCount)
+		if expectCount > 7 {
+			require.GreaterOrEqual(t, signCount, 3, "validator didn't sign even 3 blocks (expected %v)", expectCount)
+		}
 	})
 }

@@ -981,7 +981,7 @@ func (ps *PeerState) ToJSON() ([]byte, error) {

 // GetHeight returns an atomic snapshot of the PeerRoundState's height
 // used by the mempool to ensure peers are caught up before broadcasting new txs
-func (ps *PeerState) GetHeight() int64 {
+func (ps *PeerState) GetHeight() uint64 {
 	ps.mtx.Lock()
 	defer ps.mtx.Unlock()
 	return ps.PRS.Height
@@ -791,7 +791,7 @@ func NewNode(config *cfg.Config,

 	// TODO: Fetch and provide real options and do proper p2p bootstrapping.
 	// TODO: Use a persistent peer database.
-	peerMgr, err := p2p.NewPeerManager(dbm.NewMemDB(), p2p.PeerManagerOptions{})
+	peerMgr, err := p2p.NewPeerManager(nodeKey.ID, dbm.NewMemDB(), p2p.PeerManagerOptions{})
 	if err != nil {
 		return nil, err
 	}